├── .github └── workflows │ ├── codeql-analysis.yml │ ├── linux.yml │ ├── macos.yml │ ├── pypi-test.yml │ ├── release.yml │ └── windows.yml ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── CONTRIBUTORS ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── complete tutorial.ipynb │ ├── conf.py │ ├── images │ ├── scorecard.png │ └── stepwise.png │ ├── index.md │ ├── modules.md │ ├── reference.md │ ├── toad.detector.md │ ├── toad.md │ ├── toad.merge.md │ ├── toad.metrics.md │ ├── toad.nn.functional.md │ ├── toad.nn.md │ ├── toad.nn.module.md │ ├── toad.nn.trainer.md │ ├── toad.plot.md │ ├── toad.preprocessing.md │ ├── toad.preprocessing.partition.md │ ├── toad.preprocessing.process.md │ ├── toad.scorecard.md │ ├── toad.selection.md │ ├── toad.stats.md │ ├── toad.transform.md │ ├── toad.utils.decorator.md │ ├── toad.utils.func.md │ ├── toad.utils.md │ ├── toad.utils.mixin.md │ ├── tutorial.ipynb │ └── tutorial_chinese.ipynb ├── images └── toad_logo.png ├── pyproject.toml ├── requirements-dist.txt ├── requirements-nn.txt ├── requirements-test.txt ├── requirements-tools.txt ├── requirements.txt ├── scripts └── build_wheels.sh ├── setup.cfg ├── setup.py ├── tests └── test_data.csv └── toad ├── __init__.py ├── c_utils.pxd ├── c_utils.pyx ├── cli.py ├── cli_test.py ├── commands ├── __init__.py ├── detect │ └── __init__.py ├── evaluate │ ├── __init__.py │ └── evaluate.py └── tree │ ├── __init__.py │ └── tree.py ├── detector.py ├── impute.py ├── impute_test.py ├── merge.pyx ├── merge_test.py ├── metrics.py ├── metrics_test.py ├── nn ├── __init__.py ├── functional.py ├── functional_test.py ├── loss.py ├── loss_test.py ├── module.py ├── module_test.py ├── trainer │ ├── __init__.py │ ├── callback.py │ ├── callback_test.py │ ├── earlystop.py │ ├── earlystop_test.py │ ├── event.py │ ├── event_test.py │ ├── history.py │ ├── history_test.py │ ├── metrics.py │ ├── trainer.py │ └── trainer_test.py └── zoo │ ├── __init__.py │ ├── autoencoder.py │ └── autoencoder_test.py ├── plot.py ├── plot_test.py ├── preprocessing ├── __init__.py ├── partition.py ├── partition_test.py ├── process.py └── process_test.py ├── scorecard.py ├── scorecard_test.py ├── selection.py ├── selection_test.py ├── stats.py ├── stats_test.py ├── tadpole ├── __init__.py ├── base.py ├── fonts │ └── NotoSansCJKsc-Regular.otf ├── func.py └── utils.py ├── transform.py ├── transform_test.py ├── utils ├── __init__.py ├── decorator.py ├── decorator_test.py ├── func.py ├── func_test.py ├── mixin.py ├── mixin_test.py ├── pickletracer.py ├── pickletracer_test.py └── progress │ ├── __init__.py │ ├── pandas.py │ ├── pandas_test.py │ ├── progress.py │ └── progress_test.py └── version.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master, dev] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 3 * * 4' 11 | 12 | jobs: 13 | analyse: 14 | name: Analyse 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 
23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Initializes the CodeQL tools for scanning. 31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v1 33 | # Override language selection by uncommenting this and choosing your languages 34 | # with: 35 | # languages: go, javascript, csharp, python, cpp, java 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v1 41 | 42 | # ℹ️ Command-line programs to run using the OS shell. 43 | # 📚 https://git.io/JvXDl 44 | 45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 46 | # and modify them (or add more) to build your code if your project 47 | # uses a compiled language 48 | 49 | #- run: | 50 | # make bootstrap 51 | # make release 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@v1 55 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Test on Linux 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | matrix: 9 | python-version: ['3.9', '3.10', '3.11', '3.12'] 10 | experimental: [false] 11 | include: 12 | - python-version: '3.13' 13 | experimental: true 14 | fail-fast: false 15 | runs-on: ubuntu-latest 16 | continue-on-error: ${{ matrix.experimental }} 17 | name: Test py ${{ matrix.python-version }} 18 | steps: 19 | - uses: actions/checkout@master 20 | - name: Setup Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - run: make build_deps 25 | - run: pip install -r requirements-nn.txt 26 | - run: pip install .[all] 27 | - run: make test 28 | release: 29 | needs: [test] 30 | # release when using `tags` or `release` branch 31 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }} 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/checkout@master 35 | - name: Setup Python 36 | uses: actions/setup-python@v5 37 | with: 38 | python-version: '3.10' 39 | architecture: x64 40 | - run: make dist 41 | - uses: RalfG/python-wheels-manylinux-build@v0.7.1 42 | with: 43 | build-requirements: 'cython numpy' 44 | - run: rm dist/*-linux_x86_64.whl 45 | - uses: pypa/gh-action-pypi-publish@release/v1 46 | name: publish pypi 47 | with: 48 | user: __token__ 49 | password: ${{ secrets.PYPI }} 50 | skip-existing: true 51 | verbose: true 52 | -------------------------------------------------------------------------------- /.github/workflows/macos.yml: -------------------------------------------------------------------------------- 1 | name: Test on MacOS 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | matrix: 9 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 10 | macos-version: ['macos-13', 'macos-latest'] 11 | include: 12 | - experimental: false 13 | - macos-version: 'macos-latest' 14 | experimental: true 15 | - python-version: '3.9' 16 | experimental: true 17 | - python-version: '3.13' 18 | experimental: true 19 | fail-fast: false 20 | runs-on: ${{ matrix.macos-version }} 21 | continue-on-error: 
${{ matrix.experimental }} 22 | name: Test py ${{ matrix.python-version }} ${{ matrix.macos-version }} 23 | steps: 24 | - uses: actions/checkout@master 25 | - name: Setup Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - run: make build_deps 30 | - run: pip install -r requirements-nn.txt 31 | - run: pip install .[all] 32 | - run: make test 33 | - run: make dist_wheel 34 | - uses: actions/upload-artifact@v4 35 | with: 36 | name: wheel-${{ matrix.python-version }}-${{ matrix.macos-version }} 37 | path: dist/*.whl 38 | release: 39 | needs: [test] 40 | # release when using `tags` or `release` branch 41 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }} 42 | runs-on: ubuntu-latest 43 | steps: 44 | - uses: actions/download-artifact@v4 45 | with: 46 | pattern: wheel-* 47 | path: dist/ 48 | merge-multiple: true 49 | - uses: pypa/gh-action-pypi-publish@release/v1 50 | name: publish pypi 51 | with: 52 | user: __token__ 53 | password: ${{ secrets.PYPI }} 54 | -------------------------------------------------------------------------------- /.github/workflows/pypi-test.yml: -------------------------------------------------------------------------------- 1 | name: Pypi test 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'pypi/**' 7 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@master 14 | 15 | - name: Release 16 | uses: docker://antonyurchenko/git-release:latest 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Test on Windows 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | strategy: 8 | matrix: 9 | python-version: ['3.9', '3.10', '3.11', '3.12'] 10 | experimental: [false] 11 | include: 12 | - python-version: '3.13' 13 | experimental: true 14 | fail-fast: false 15 | runs-on: windows-latest 16 | continue-on-error: ${{ matrix.experimental }} 17 | name: Test py ${{ matrix.python-version }} 18 | steps: 19 | - uses: actions/checkout@master 20 | - name: Setup Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - run: make build_deps 25 | - run: pip install -r requirements-nn.txt 26 | - run: pip install .[all] 27 | - run: make test 28 | - run: make dist_wheel 29 | - uses: actions/upload-artifact@v4 30 | with: 31 | name: wheel-${{ matrix.python-version }} 32 | path: dist/*.whl 33 | release: 34 | needs: [test] 35 | # release when using `tags` or `release` branch 36 | if: ${{ startsWith(github.ref, 'refs/tags') || github.ref == 'refs/heads/release' }} 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/download-artifact@v4 40 | with: 41 | pattern: wheel-* 42 | path: dist/ 43 | merge-multiple: true 44 | - uses: pypa/gh-action-pypi-publish@release/v1 45 | name: publish pypi 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI }} 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 
__pycache__/ 2 | build/ 3 | *.egg-info/ 4 | dist/ 5 | .tox/ 6 | .vscode/ 7 | .DS_Store 8 | .python-version 9 | *.csv 10 | *.xlsx 11 | *.c 12 | *.so 13 | *.pyc 14 | .idea/ 15 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | formats: all 12 | 13 | python: 14 | install: 15 | - requirements: requirements.txt 16 | - requirements: requirements-nn.txt 17 | - requirements: docs/requirements.txt 18 | - method: setuptools 19 | path: . 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | matrix: 2 | include: 3 | - name: "Python 3.6 on Linux" 4 | os: linux 5 | language: python 6 | python: "3.6" 7 | sudo: required 8 | services: 9 | - docker 10 | env: 11 | - DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 12 | - PLAT=manylinux1_x86_64 13 | before_install: 14 | - sudo apt-get install -y graphviz 15 | dist: trusty 16 | before_deploy: 17 | - make dist_manylinux 18 | 19 | - name: "Python 3.7 on macOS" 20 | os: osx 21 | osx_image: xcode11.3 22 | language: shell 23 | env: 24 | - SUDO=sudo 25 | - HOMEBREW_NO_INSTALL_CLEANUP=TRUE 26 | before_install: 27 | - brew update 28 | # - brew install graphviz 29 | before_deploy: 30 | - make dist_wheel 31 | 32 | - name: "Python 3.7 on Windows" 33 | os: windows 34 | language: shell 35 | python: "3.7" 36 | env: 37 | - PATH=/c/Python37:/c/Python37/Scripts:$PATH 38 | before_install: 39 | - choco install python --version=3.7.2 40 | - choco install graphviz 41 | - choco install make 42 | before_deploy: 43 | - make dist_wheel 44 | 45 | - name: "Python 3.6 on Windows" 46 | os: windows 47 | language: shell 48 | python: "3.6" 49 | env: 50 | - PATH=/c/Python36:/c/Python36/Scripts:$PATH 51 | before_install: 52 | - choco install python --version=3.6.8 53 | - choco install graphviz 54 | - choco install make 55 | - pip install -U patsy 56 | before_deploy: 57 | - make dist_wheel 58 | 59 | 60 | install: 61 | - make install 62 | script: 63 | - make test 64 | 65 | deploy: 66 | - skip_cleanup: true 67 | provider: script 68 | script: make upload 69 | on: 70 | tags: true 71 | 72 | - skip_cleanup: true 73 | provider: script 74 | script: make upload 75 | on: 76 | branch: release 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [Unreleased] 9 | 10 | ## [0.1.5] - 2025-02-18 11 | 12 | ### Added 13 | - Added `ax` support for `plot` 14 | - Added `Apple M1` support 15 | 16 | ## [0.1.4] - 2024-11-03 17 | 18 | ### Added 19 | - Added wheel package support for `py3.12` 20 | - Added `figsize` param in `toad.plot.bin_plot` function 21 | 22 | ### Changed 23 | - Updated `pandas` version to `>=1.5` 24 | - Python `3.7` and `3.8` are no longer supported 25 | 26 | ## [0.1.3] - 2023-12-10 27 | 28 | ### Added 29 | - Added `performance` in `toad.utils` for testing code performance 30 | - Added `pickletracer` in `toad.utils` for inferring requirements from pickle objects 31 | 32 | ### Fixed 33 | - Fixed `ValueError` in `select` and `drop_corr` methods when using `pandas >= 2.0.x` 34 | 35 | ## [0.1.2] - 2023-04-09 36 | 37 | ### Added 38 | - Added `ks_plot` for KS plot, [#102](https://github.com/amphibian-dev/toad/issues/102) thanks @kevin-meng 39 | - Added `xgb_loss` decorator for converting a normal loss function into an xgb-supported loss function 40 | - Added `binary_focal_loss` function in `nn.functional` 41 | - Added `event` module in `nn.trainer`, and changed `trainer` mode to event-based 42 | - Added wheel package support for `py3.9`, `py3.10` and `py3.11` 43 | 44 | ### Changed 45 | - Now you can pass arguments to `DecisionTreeClassifier` in `merge` or `Combiner` when using `method = dt` 46 | 47 | ### Fixed 48 | - Fixed `groupby` being rewritten in `preprocessing` 49 | - Fixed the expired deprecations of numpy types in `1.24.0` 50 | 51 | ## [0.1.1] - 2022-08-14 52 | 53 | ### Added 54 | - Added `Progress` for `pandas.apply` by using `pandas_enable` and `pandas_disable` 55 | - Added `feature_bin_stats` for feature bins, [#91](https://github.com/amphibian-dev/toad/issues/91) thanks @kevin-meng 56 | 57 | ### Changed 58 | - `countBlank` can use customized missing values, [#101](https://github.com/amphibian-dev/toad/issues/101) thanks @kevin-meng 59 | - Removed ref of `merge` in `__init__` file 60 | 61 | 62 | 63 | ## [0.1.0] - 2021-10-08 64 | 65 | ### Added 66 | 67 | - Added `backward_rounds` for `nn.Trainer.train` 68 | - Added `evalute` func in `nn.Module` 69 | - Added `get_reason` func in `ScoreCard`, [#79](https://github.com/amphibian-dev/toad/issues/79) thanks @qianweishuo 70 | - Added dict type input support for `ScoreCard.predict` and `Combiner.transform`, [#79](https://github.com/amphibian-dev/toad/issues/79) thanks @qianweishuo 71 | - Added iterator support for `Progress` 72 | 73 | ### Changed 74 | 75 | - Changed `callback` and `earlystopping` to Python decorators 76 | 77 | 78 | ## [0.0.65] - 2021-06-30 79 | 80 | ### Breaking Changes 81 | 82 | - Added a new `lift` value and renamed the old `lift` value to `cum_lift` in `KS_bucket` 83 | - Moved `nn.autoencoder` to `nn.zoo.autoencoder` 84 | 85 | ### Added 86 | 87 | - Added `label_smoothing` and `focal_loss` functions in the `nn` module 88 | - Added some features in `nn.trainer` 89 | - Added default `early_stopping` for `nn.Trainer` 90 | 91 | ### Changed 92 | 93 | - Updated `numpy` version to `>=1.20` 94 | - Python `3.6` is no longer supported 95 | 96 | ### Fixed 97 | 98 | - Fixed combiner error after `ScoreCard` reload.
[#67](https://github.com/amphibian-dev/toad/issues/67) 99 | 100 | 101 | ## [0.0.64] - 2021-03-22 102 | 103 | ### Added 104 | 105 | - Added `callback` param in `fit` method for `nn` 106 | - Added `Trainer` and `EarlyStopping` in `nn.trainer` module 107 | 108 | ### Changed 109 | 110 | - Use mean of loss in `nn.Module.fit` instead of the latest loss value 111 | - Set default rotation for x tick labels 112 | 113 | ### Fixed 114 | 115 | - Fixed dependency version of `numpy` 116 | - Fixed `DistModule` module 117 | - Fixed `ScoreCard` representation error 118 | 119 | ## [0.0.62] - 2021-02-19 120 | 121 | ### Added 122 | 123 | - Added `save` and `load` methods for the nn module 124 | - Added `lift` value in `KS_bucket` function 125 | - Added duplicate key checking in `Transformer` 126 | 127 | ### Changed 128 | 129 | - `quality` method supports `indicators` 130 | 131 | ### Fixed 132 | 133 | - Fixed tadpole legend warning. [#52](https://github.com/amphibian-dev/toad/issues/52) 134 | - Fixed tadpole `title` and `x/y label` display for `UTF8` 135 | - Fixed default rule in RuleMixin. 136 | - Fixed loss function of VAE model. 137 | - Fixed `decimal` argument in `ScoreCard.export` function 138 | 139 | ### Enhancements 140 | 141 | - Reduced memory usage when using the `select` function 142 | 143 | ## [0.0.61] - 2020-06-24 144 | 145 | ### Added 146 | 147 | - Support for calculating IV for each group in a feature. [#25](https://github.com/amphibian-dev/toad/issues/25) 148 | - Added `cpu_cores` for `quality` function 149 | - Added `predict_proba` for `ScoreCard` 150 | - Impute module 151 | - NN module 152 | 153 | ### Changed 154 | 155 | - The y axis of `badrate_plot` now starts at `0`. [#23](https://github.com/amphibian-dev/toad/issues/23) 156 | - `KS` is implemented using `ks2samp` instead 157 | 158 | ### Fixed 159 | 160 | - Fixed `Preprocess` bugs 161 | 162 | ### Docs 163 | 164 | - Added references for `Chi-Merge`, `Stepwise Regression`, `Scorecard Transformation` 165 | 166 | ## [0.0.60] - 2020-04-20 167 | 168 | ### Added 169 | 170 | - Preprocess module. 171 | - Annotation format for bin plot. 172 | - `KS_bucket` supports split points as buckets. [#22](https://github.com/amphibian-dev/toad/issues/22) 173 | 174 | ### Changed 175 | 176 | - `format_bins` supports ellipsis. 177 | - Reversed cumulative columns in `KS_bucket` 178 | - Use correct score order for AUC and ROC plots. [#21](https://github.com/amphibian-dev/toad/issues/21) 179 | 180 | ### Fixed 181 | 182 | - Fixed number type of x axis of badrate plot. [#20](https://github.com/amphibian-dev/toad/issues/20) 183 | - Fixed negative ks value in `KS_bucket`. 184 | 185 | ## [0.0.59] - 2020-02-07 186 | 187 | ### Added 188 | 189 | - Combiner supports separating empty values. 190 | - Confusion matrix function in metrics. 191 | - Support Python 3.8. 192 | 193 | ### Changed 194 | 195 | - Transform supports y as string type. 196 | - VIF is computed independently of statsmodels.
197 | 198 | [Unreleased]: https://github.com/amphibian-dev/toad/compare/0.1.5...HEAD 199 | [0.1.5]: https://github.com/amphibian-dev/toad/compare/0.1.4...0.1.5 200 | [0.1.4]: https://github.com/amphibian-dev/toad/compare/0.1.3...0.1.4 201 | [0.1.3]: https://github.com/amphibian-dev/toad/compare/0.1.2...0.1.3 202 | [0.1.2]: https://github.com/amphibian-dev/toad/compare/0.1.1...0.1.2 203 | [0.1.1]: https://github.com/amphibian-dev/toad/compare/0.1.0...0.1.1 204 | [0.1.0]: https://github.com/amphibian-dev/toad/compare/0.0.65...0.1.0 205 | [0.0.65]: https://github.com/amphibian-dev/toad/compare/0.0.64...0.0.65 206 | [0.0.64]: https://github.com/amphibian-dev/toad/compare/0.0.62...0.0.64 207 | [0.0.62]: https://github.com/amphibian-dev/toad/compare/0.0.61...0.0.62 208 | [0.0.61]: https://github.com/amphibian-dev/toad/compare/0.0.60...0.0.61 209 | [0.0.60]: https://github.com/amphibian-dev/toad/compare/0.0.59...0.0.60 210 | [0.0.59]: https://github.com/amphibian-dev/toad/compare/0.0.58...0.0.59 211 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to Toad contributing guide 2 | 3 | We're so glad you're thinking about contributing to the toad project. If you're unsure about anything, just ask @Secbone, or submit an issue or pull request anyway. The worst that can happen is you'll be politely asked to change something. We love all friendly contributions. 4 | 5 | 我们非常开心你乐意为 toad 项目贡献代码。如果你有任何疑问,可以联系 @Secbone 或者提交 issue 和 pull request 都可以。最糟不过是被礼貌地要求你修改一些东西。我们非常愿意看到所有善意的问题。 6 | 7 | ## Getting Started · 开始吧 8 | 9 | ### Setup Environment · 设置环境 10 | 11 | Setting up the environment is very simple; you just need to run the following command 12 | 13 | 设置环境非常简单,你只需要执行以下代码 14 | 15 | ```bash 16 | make install 17 | ``` 18 | 19 | All done! Now you can enjoy your coding~ 20 | 21 | 完成!开始享受你的编码吧~ 22 | 23 | ### About Cython · 关于 Cython 24 | 25 | The `toad.merge` module is compiled with `cython`, so if you want to change something in `toad.merge`, you need to run 26 | 27 | `toad.merge` 模块是使用 `cython` 编译的,所以如果你想要对 `toad.merge` 模块进行改动时,你需要运行 28 | 29 | ```bash 30 | make build 31 | ``` 32 | after you update the code. 33 | 34 | 之后来使你的代码生效。 35 | 36 | ### Testing · 测试 37 | 38 | You can run 39 | 40 | 你可以执行 41 | 42 | ```bash 43 | make test 44 | ``` 45 | 46 | to test the whole package. We recommend that you do this before every commit to avoid new code impacting old functionality. 47 | 48 | 来测试整个包的代码。我们建议你在每次提交前这么做,以防止新代码对老的功能产生影响。 49 | 50 | You can also run 51 | 52 | 你也可以运行 53 | 54 | ```bash 55 | make test toad/xxxx_test.py 56 | ``` 57 | 58 | to test only a single module. 59 | 60 | 来只测试某一个模块。 61 | 62 | ### Pull Request 63 | 64 | When you're finished with the changes, create a pull request and wait for it to be merged.
65 | 66 | 当你完成所有的改动后,就可以创建一个 pull request 并且等它被合并啦~ 67 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | Lei Cui 2 | Secbone 3 | Shaoqian Dong 4 | Xiyu Zhou 5 | Yanping He 6 | Yutong Jiang 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ESC Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include requirements-*.txt 4 | include setup.py 5 | include toad/*.pyd 6 | include toad/*.pyx 7 | include toad/tadpole/fonts/* 8 | 9 | include CONTRIBUTORS 10 | include LICENSE 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build test 2 | 3 | SHELL = /bin/bash 4 | 5 | PYTHON = python3 6 | PIP = pip3 7 | SUDO ?= 8 | 9 | SPHINXOPTS = 10 | SPHINXBUILD = sphinx-build 11 | SPHINXPROJ = toad 12 | DOCSDIR = docs 13 | SOURCEDIR := $(DOCSDIR)/source 14 | BUILDDIR := $(DOCSDIR)/build 15 | 16 | 17 | ifeq ('$(shell type -P python3)','') 18 | PYTHON = python 19 | endif 20 | 21 | ifeq ('$(shell type -P pip3)','') 22 | PIP = pip 23 | endif 24 | 25 | 26 | install: build 27 | $(SUDO) $(PIP) install -e . 
28 | 29 | uninstall: 30 | cat files.txt | xargs rm -rf 31 | 32 | test_deps: 33 | $(SUDO) $(PIP) install -r requirements-test.txt 34 | 35 | test: test_deps 36 | $(eval TARGET := $(filter-out $@, $(MAKECMDGOALS))) 37 | @if [ -z $(TARGET) ]; then \ 38 | $(PYTHON) -m pytest -x toad; \ 39 | else \ 40 | $(PYTHON) -m pytest -s $(TARGET); \ 41 | fi 42 | 43 | build_deps: 44 | $(SUDO) $(PIP) install -r requirements.txt 45 | 46 | build: build_deps 47 | $(PYTHON) setup.py build_ext --inplace 48 | 49 | dist_deps: 50 | $(SUDO) $(PIP) install -U -r requirements-dist.txt 51 | 52 | dist: build dist_deps 53 | $(SUDO) $(PYTHON) setup.py sdist 54 | 55 | dist_wheel: build dist_deps 56 | $(SUDO) $(PYTHON) setup.py bdist_wheel --universal 57 | 58 | upload: 59 | twine check dist/* 60 | @twine upload dist/* -u $(TWINE_USER) -p $(TWINE_PASS) 61 | 62 | clean: 63 | @rm -rf build/ dist/ *.egg-info/ **/__pycache__/ 64 | @rm -rf toad/*.c toad/*.so 65 | 66 | docs: build 67 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 68 | 69 | %: 70 | @: 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | # TOAD 6 | 7 | 8 | [![PyPi version][pypi-image]][pypi-url] 9 | [![Python version][python-image]][docs-url] 10 | [![Build Status][actions-image]][actions-url] 11 | [![Downloads Status][downloads-image]][docs-url] 12 | 13 | 14 | Toad is dedicated to facilitating the model development process, especially for scorecards. It provides intuitive functions covering the entire workflow, from EDA, feature engineering, and feature selection to result validation and scorecard transformation. Its key functionality streamlines the most critical and time-consuming steps, such as feature selection and fine binning. 15 | 16 | Toad 是专为工业界模型开发设计的Python工具包,特别针对评分卡的开发。Toad 的功能覆盖了建模全流程,从 EDA、特征工程、特征筛选 到 模型验证和评分卡转化。Toad 的主要功能极大简化了建模中最重要最费时的流程,即特征筛选和分箱。 17 | 18 | ## Install and Upgrade · 安装与升级 19 | 20 | Pip 21 | 22 | ```bash 23 | pip install toad # to install 24 | pip install -U toad # to upgrade 25 | ``` 26 | 27 | Conda 28 | 29 | ```bash 30 | conda install toad --channel conda-forge # to install 31 | conda update toad --channel conda-forge # to upgrade 32 | ``` 33 | 34 | Source code 35 | 36 | ```bash 37 | python setup.py install 38 | ``` 39 | 40 | ## Key features · 主要功能 41 | 42 | The following showcases some of toad's most popular features; for more detailed demonstrations and user guidance, please refer to the tutorials. 43 | 44 | 以下部分简单介绍了toad最受欢迎的一些功能,具体的使用方法和使用教程,请详见文档部分。 45 | 46 | - Simple IV calculation for all features · 一键算IV: 47 | 48 | ```python 49 | toad.quality(data, 'target', indicators = ['iv']) 50 | ``` 51 | 52 | - Preliminary selection based on criteria · 根据特定条件的初步变量筛选; 53 | - and stepwise feature selection (with optimised algorithm) · 优化过的逐步回归: 54 | 55 | ```python 56 | selected_data = toad.selection.select(data, target = 'target', empty = 0.5, iv = 0.02, corr = 0.7, return_drop=True, exclude=['ID','month']) 57 | 58 | final_data = toad.selection.stepwise(data_woe, target = 'target', estimator='ols', direction = 'both', criterion = 'aic', exclude = to_drop) 59 | ``` 60 | 61 | - Reliable fine binning with visualisation · 分箱及可视化: 62 | 63 | ```python 64 | # Chi-squared fine binning 65 | c = toad.transform.Combiner() 66 | c.fit(data_selected.drop(to_drop, axis=1), y = 'target', method = 'chi', min_samples = 0.05) 67 | print(c.export()) 68 | 69 | # Visualisation to check binning results 70 | col = 'feature_name' 71 | toad.plot.bin_plot(c.transform(data_selected[[col, 'target']], labels=True), x=col, target='target') 72 | ``` 73 | 74 | - Intuitive model results presentation · 模型结果展示: 75 | 76 | ```python 77 | toad.metrics.KS_bucket(pred_proba, final_data['target'], bucket=10, method = 'quantile') 78 | ``` 79 | 80 | - One-click scorecard transformation · 评分卡转化: 81 | 82 | ```python 83 | card = toad.ScoreCard( 84 | combiner = c, 85 | transer = transer, 86 | class_weight = 'balanced', 87 | C=0.1, 88 | base_score = 600, 89 | base_odds = 35, 90 | pdo = 60, 91 | rate = 2 92 | ) 93 | 94 | card.fit(final_data[col], final_data['target']) 95 | print(card.export()) 96 | ``` 97 | 98 | ## Documents · 文档 99 | 100 | - [Tutorial](https://toad.readthedocs.io/en/latest/tutorial.html) 101 | 102 | - [中文指引](https://toad.readthedocs.io/en/latest/tutorial_chinese.html) 103 | 104 | - [docs][docs-url] 105 | 106 | - [Contributing](CONTRIBUTING.md) 107 | 108 | ## Community · 社区 109 | We welcome public feedback and new PRs. We hold a WeChat group for questions and suggestions.
110 | 111 | 欢迎各位提PR,同时我们有toad使用交流的微信群,欢迎询问加群。 112 | 113 | ## Contributors 114 | 115 | [![Contributors][contributor-image]][contributor-url] 116 | 117 | ------------ 118 | 119 | ## Dedicated by **The ESC Team** 120 | 121 | [pypi-image]: https://img.shields.io/pypi/v/toad?style=flat-square 122 | [pypi-url]: https://pypi.org/project/toad/ 123 | [python-image]: https://img.shields.io/pypi/pyversions/toad?style=flat-square 124 | [actions-image]: https://img.shields.io/github/actions/workflow/status/amphibian-dev/toad/release.yml?style=flat-square 125 | [actions-url]: https://github.com/amphibian-dev/toad/actions 126 | [downloads-image]: https://img.shields.io/pypi/dm/toad?style=flat-square 127 | [docs-url]: https://toad.readthedocs.io/ 128 | [contributor-image]: https://contrib.rocks/image?repo=amphibian-dev/toad 129 | [contributor-url]: https://github.com/amphibian-dev/toad/graphs/contributors 130 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | docutils==0.16 2 | recommonmark 3 | sphinx-readable-theme 4 | ipykernel 5 | nbsphinx 6 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import os 14 | import sys 15 | import inspect 16 | 17 | sys.path.insert(0, os.path.abspath('../..')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'toad' 23 | copyright = '2020, ESC Team' 24 | author = 'ESC Team' 25 | 26 | 27 | import toad 28 | version = toad.VERSION 29 | # The full version, including alpha/beta/rc tags 30 | release = version 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | import recommonmark 35 | import sphinx_readable_theme 36 | from recommonmark.transform import AutoStructify 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.doctest', 43 | 'sphinx.ext.intersphinx', 44 | 'sphinx.ext.todo', 45 | 'sphinx.ext.autodoc', 46 | "sphinx.ext.autosummary", 47 | 'sphinx.ext.linkcode', 48 | 'sphinx.ext.napoleon', 49 | 'nbsphinx', 50 | 'recommonmark', 51 | 'sphinx_readable_theme', 52 | ] 53 | 54 | 55 | 56 | autodoc_member_order = 'bysource' 57 | 58 | # Add any paths that contain templates here, relative to this directory. 59 | templates_path = ['_templates'] 60 | 61 | # List of patterns, relative to source directory, that match files and 62 | # directories to ignore when looking for source files. 63 | # This pattern also affects html_static_path and html_extra_path. 64 | exclude_patterns = [ 65 | 'toad/commands', 66 | '_build', 67 | '**.ipynb_checkpoints', 68 | ] 69 | 70 | master_doc = 'index' 71 | 72 | 73 | def linkcode_resolve(domain, info): 74 | """linkcode extension config function 75 | """ 76 | if domain != "py": 77 | return None 78 | 79 | modname = info["module"] 80 | fullname = info["fullname"] 81 | 82 | submod = sys.modules.get(modname) 83 | if submod is None: 84 | return None 85 | 86 | obj = submod 87 | for part in fullname.split("."): 88 | try: 89 | obj = getattr(obj, part) 90 | except AttributeError: 91 | return None 92 | 93 | try: 94 | # inspect.unwrap() was added in Python version 3.4 95 | if sys.version_info >= (3, 5): 96 | fn = inspect.getsourcefile(inspect.unwrap(obj)) 97 | else: 98 | fn = inspect.getsourcefile(obj) 99 | except TypeError: 100 | fn = None 101 | if not fn: 102 | return None 103 | 104 | try: 105 | source, lineno = inspect.getsourcelines(obj) 106 | except OSError: 107 | lineno = None 108 | 109 | if lineno: 110 | linespec = "#L{:d}-L{:d}".format(lineno, lineno + len(source) - 1) 111 | else: 112 | linespec = "" 113 | 114 | fn = os.path.relpath(fn, start = os.path.dirname(toad.__file__)) 115 | 116 | return "http://github.com/amphibian-dev/toad/blob/master/toad/{}{}".format( 117 | fn, linespec 118 | ) 119 | 120 | 121 | # -- Options for HTML output ------------------------------------------------- 122 | 123 | # The theme to use for HTML and HTML Help pages. See the documentation for 124 | # a list of builtin themes. 
125 | # 126 | html_theme_path = [sphinx_readable_theme.get_html_theme_path()] 127 | html_theme = 'readable' 128 | 129 | # Add any paths that contain custom static files (such as style sheets) here, 130 | # relative to this directory. They are copied after the builtin static files, 131 | # so a file named "default.css" will overwrite the builtin "default.css". 132 | html_static_path = ['_static'] 133 | 134 | 135 | 136 | def setup(app): 137 | app.add_config_value( 138 | 'recommonmark_config', 139 | { 140 | 'enable_eval_rst': True, 141 | 'enable_auto_toc_tree': True, 142 | 'auto_toc_tree_section': 'Contents', 143 | }, 144 | True, 145 | ) 146 | 147 | app.add_transform(AutoStructify) 148 | -------------------------------------------------------------------------------- /docs/source/images/scorecard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/docs/source/images/scorecard.png -------------------------------------------------------------------------------- /docs/source/images/stepwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/docs/source/images/stepwise.png -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to toad's documentation! 2 | 3 | 4 | ## Installation 5 | 6 | via pip 7 | 8 | ```bash 9 | pip install toad 10 | ``` 11 | 12 | via anaconda 13 | ```bash 14 | conda install toad --channel conda-forge 15 | ``` 16 | 17 | via source code 18 | ```bash 19 | python setup.py install 20 | ``` 21 | 22 | ## Tutorial 23 | 24 | A [basic tutorial](tutorial) is provided. 25 | 26 | [中文指引](tutorial_chinese) 27 | 28 | ## Contents 29 | 30 | ```eval_rst 31 | .. toctree:: 32 | :maxdepth: 1 33 | 34 | toad 35 | ``` 36 | 37 | 38 | ## Indices and tables 39 | 40 | 41 | ```eval_rst 42 | * :ref:`genindex` 43 | * :ref:`modindex` 44 | * :ref:`search` 45 | ``` 46 | 47 | 48 | ## Links 49 | 50 | [FiboRule](http://open.fibo.cn/) 51 | -------------------------------------------------------------------------------- /docs/source/modules.md: -------------------------------------------------------------------------------- 1 | ## toad 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | :maxdepth: 4 7 | 8 | toad 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/reference.md: -------------------------------------------------------------------------------- 1 | # ChiMerge 2 | 3 | [https://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf](https://sci2s.ugr.es/keel/pdf/algorithm/congreso/1992-Kerber-ChimErge-AAAI92.pdf) 4 | 5 | The ChiMerge algorithm uses the Chi-squared statistic to discretize numeric attributes. In toad, Char/Object attributes are first transformed to numeric values with the WOE function. The algorithm itself is clearly described in the paper (i.e. the ChiMerge Algorithm part).
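
The snippet below is a minimal sketch of how ChiMerge-style binning is typically invoked through toad's `Combiner` (the DataFrame here is toy data made up purely for illustration; the fit arguments mirror the README usage):

```python
import pandas as pd
import toad

# toy data: one numeric feature and a binary target
df = pd.DataFrame({
    'A': list(range(1, 11)) * 10,
    'target': ([0] * 7 + [1] * 3) * 10,
})

c = toad.transform.Combiner()
c.fit(df, y = 'target', method = 'chi', min_samples = 0.05)  # chi-square (ChiMerge) binning

print(c.export())         # split points found for each feature
binned = c.transform(df)  # map raw values into bin indices
```
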
6 | 7 | # Stepwise Regression 8 | 9 | [https://link.springer.com/article/10.1007%2FBF02576123](https://link.springer.com/article/10.1007%2FBF02576123) [1] 10 | 11 | [https://www.sciencedirect.com/science/article/pii/S0950584917305153?via%3Dihub](https://www.sciencedirect.com/science/article/pii/S0950584917305153?via%3Dihub) [2] 12 | 13 | [http://www.jstor.org/stable/1434071](http://www.jstor.org/stable/1434071) [3] 14 | 15 | Stepwise Regression (Forward/Backward/Stepwise, i.e. [2] 3.6. Stepwise Linear Regression) is used to remove low-information-gain attributes and simplify the final model. 16 | 17 | The Stepwise Regression process [2]: 18 | 19 | ```eval_rst 20 | .. image:: images/stepwise.png 21 | :width: 80% 22 | :align: center 23 | ``` 24 | 25 | # Scorecard Transformation 26 | 27 | John Wiley & Sons, Inc., *Credit Risk Scorecards: Developing and Implementing Intelligent Credit Scoring* (Final Scorecard Production Part) 28 | 29 | 30 | 31 | Formula: 32 | 33 | Score = Offset + Factor * ln(odds) # odds = good : bad 34 | 35 | Score + pdo = Offset + Factor * ln(2 * odds) # pdo: points to double the odds 36 | 37 | ==> 38 | 39 | pdo = Factor * ln(2) 40 | 41 | Factor = pdo / ln(2) 42 | 43 | Offset = Score - Factor * ln(odds) 44 | 45 | For example, if a scorecard were being scaled where the user wanted odds of 50:1 at 600 points and wanted the odds to double every 20 points (i.e., pdo = 20), the factor and offset would be: 46 | 47 | Factor = 20 / ln(2) = 28.8539 48 | 49 | Offset = 600 – 28.8539 * ln(50) = 487.123 50 | 51 | ==> 52 | 53 | Each score then corresponds to a set of odds: 54 | 55 | Score = 487.123 + 28.8539 * ln(odds) 56 | 57 | Since the scorecard is developed with WOE as input, the formula can be rewritten as: 58 | 59 | ```eval_rst 60 | .. image:: images/scorecard.png 61 | :width: 80% 62 | :align: center 63 | ``` 64 | 65 | WOE = weight of evidence for each grouped attribute 66 | 67 | β = regression coefficient for each characteristic 68 | 69 | a = intercept term from logistic regression 70 | 71 | n = number of characteristics 72 | 73 | k = number of groups (of attributes) in each characteristic 74 | -------------------------------------------------------------------------------- /docs/source/toad.detector.md: -------------------------------------------------------------------------------- 1 | ## toad.detector module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.detector 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.md: -------------------------------------------------------------------------------- 1 | ## toad package 2 | 3 | 4 | 5 | ## Submodules 6 | 7 | 8 | ```eval_rst 9 | .. toctree:: 10 | 11 | toad.detector 12 | toad.merge 13 | toad.metrics 14 | toad.plot 15 | toad.scorecard 16 | toad.selection 17 | toad.stats 18 | toad.transform 19 | toad.preprocessing 20 | toad.nn 21 | toad.utils 22 | ``` 23 | 24 | ## Module contents 25 | 26 | ```eval_rst 27 | .. automodule:: toad 28 | :members: 29 | :special-members: __init__ 30 | :show-inheritance: 31 | ``` 32 | -------------------------------------------------------------------------------- /docs/source/toad.merge.md: -------------------------------------------------------------------------------- 1 | ## toad.merge module 2 | 3 | 4 | ```eval_rst 5 | .. 
automodule:: toad.merge 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.metrics.md: -------------------------------------------------------------------------------- 1 | ## toad.metrics module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.metrics 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.nn.functional.md: -------------------------------------------------------------------------------- 1 | ## toad.nn.functional module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.nn.functional 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.nn.md: -------------------------------------------------------------------------------- 1 | ## toad.nn module 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | toad.nn.module 7 | toad.nn.functional 8 | toad.nn.trainer 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.nn.module.md: -------------------------------------------------------------------------------- 1 | ## toad.nn.module module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.nn.module 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.nn.trainer.md: -------------------------------------------------------------------------------- 1 | ## toad.nn.trainer module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.nn.trainer 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.plot.md: -------------------------------------------------------------------------------- 1 | ## toad.plot module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.plot 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.preprocessing.md: -------------------------------------------------------------------------------- 1 | ## toad.preprocessing module 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | toad.preprocessing.process 7 | toad.preprocessing.partition 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.preprocessing.partition.md: -------------------------------------------------------------------------------- 1 | ## toad.preprocessing.partition module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.preprocessing.partition 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.preprocessing.process.md: -------------------------------------------------------------------------------- 1 | ## toad.preprocessing.process module 2 | 3 | ```eval_rst 4 | .. 
automodule:: toad.preprocessing.process 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.scorecard.md: -------------------------------------------------------------------------------- 1 | ## toad.scorecard module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.scorecard 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.selection.md: -------------------------------------------------------------------------------- 1 | ## toad.selection module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.selection 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.stats.md: -------------------------------------------------------------------------------- 1 | ## toad.stats module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.stats 6 | :members: 7 | :special-members: __init__ 8 | :show-inheritance: 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.transform.md: -------------------------------------------------------------------------------- 1 | ## toad.transform module 2 | 3 | 4 | ```eval_rst 5 | .. automodule:: toad.transform 6 | :members: 7 | :special-members: __init__ 8 | :inherited-members: 9 | :show-inheritance: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/source/toad.utils.decorator.md: -------------------------------------------------------------------------------- 1 | ## toad.utils.decorator module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.utils.decorator 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.utils.func.md: -------------------------------------------------------------------------------- 1 | ## toad.utils.func module 2 | 3 | ```eval_rst 4 | .. automodule:: toad.utils.func 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/toad.utils.md: -------------------------------------------------------------------------------- 1 | ## toad.utils module 2 | 3 | 4 | ```eval_rst 5 | .. toctree:: 6 | toad.utils.func 7 | toad.utils.decorator 8 | toad.utils.mixin 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/source/toad.utils.mixin.md: -------------------------------------------------------------------------------- 1 | ## toad.utils.mixin module 2 | 3 | ```eval_rst 4 | .. 
automodule:: toad.utils.mixin 5 | :members: 6 | :special-members: __init__ 7 | :show-inheritance: 8 | ``` 9 | -------------------------------------------------------------------------------- /images/toad_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/images/toad_logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "toad" 3 | description = "Toad is dedicated to facilitating model development process, especially for a scorecard." 4 | requires-python = ">=3.9" 5 | license = {file = "LICENSE"} 6 | authors = [{name = "Secbone", email = "secbone@gmail.com"}] 7 | classifiers = [ 8 | 'Operating System :: POSIX', 9 | 'Operating System :: Microsoft :: Windows', 10 | 'Operating System :: MacOS :: MacOS X', 11 | 'Programming Language :: Python :: 3.9', 12 | 'Programming Language :: Python :: 3.10', 13 | 'Programming Language :: Python :: 3.11', 14 | 'Programming Language :: Python :: 3.12', 15 | ] 16 | dynamic = [ 17 | "version", 18 | "readme", 19 | "dependencies", 20 | "optional-dependencies", 21 | "entry-points", 22 | ] 23 | 24 | [tool.setuptools.dynamic] 25 | readme = {file = ["README.md"], content-type = "text/markdown"} 26 | dependencies = {file = ["requirements.txt"]} 27 | optional-dependencies = {nn = {file = ["requirements-nn.txt"]}, tools = {file = ["requirements-tools.txt"]}, all = {file = ["requirements-nn.txt", "requirements-tools.txt"]} } 28 | 29 | [build-system] 30 | requires = [ 31 | "setuptools", 32 | "Cython >= 0.29.15", 33 | "numpy >= 1.20", 34 | "wheel", 35 | "twine", 36 | ] 37 | build-backend = "setuptools.build_meta" 38 | 39 | [console_scripts] 40 | toad = "toad.cli:main" 41 | 42 | [tool.setuptools.packages.find] 43 | exclude = ["tests"] 44 | 45 | [project.urls] 46 | Homepage = "https://github.com/amphibian-dev/toad" 47 | Documentation = "https://toad.readthedocs.io/en/stable/" 48 | Repository = "https://github.com/amphibian-dev/toad.git" 49 | Issues = "https://github.com/amphibian-dev/toad/issues" 50 | Changelog = "https://github.com/amphibian-dev/toad/blob/master/CHANGELOG.md" 51 | -------------------------------------------------------------------------------- /requirements-dist.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | twine 3 | -------------------------------------------------------------------------------- /requirements-nn.txt: -------------------------------------------------------------------------------- 1 | torch >= 1.8.1 2 | torchvision >= 0.9.1 3 | numpy < 2.0 ; sys_platform == "darwin" 4 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-timeout 3 | -------------------------------------------------------------------------------- /requirements-tools.txt: -------------------------------------------------------------------------------- 1 | cloudpickle 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython <= 0.29.15 ; python_version < "3.10" 2 | Cython >= 0.29.15 ; python_version >= "3.10" 3 | numpy <= 1.24 ; python_version < 
"3.10" 4 | numpy >= 1.24 ; python_version >= "3.10" 5 | pandas >= 1.5 6 | scipy 7 | joblib >= 0.12 8 | scikit-learn >= 0.21 9 | seaborn >= 0.10.0 10 | setuptools 11 | -------------------------------------------------------------------------------- /scripts/build_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | 5 | # Compile wheels 6 | for PYBIN in /opt/python/cp3[5678]*/bin; do 7 | "${PYBIN}/pip" install -r /io/dev-requirements.txt 8 | "${PYBIN}/pip" wheel --no-deps /io/ -w /dist/ 9 | done 10 | 11 | # Bundle external shared libraries into the wheels 12 | for whl in /dist/toad*.whl; do 13 | auditwheel repair "$whl" --plat $PLAT -w /io/dist/ 14 | done -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_ext] 2 | inplace = 1 3 | 4 | [bdist_wheel] 5 | universal=1 6 | 7 | [aliases] 8 | test=pytest 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from setuptools import setup, find_packages, Extension 4 | 5 | 6 | NAME = 'toad' 7 | 8 | 9 | CURRENT_PATH = os.path.abspath(os.path.dirname(__file__)) 10 | VERSION_FILE = os.path.join(CURRENT_PATH, NAME, 'version.py') 11 | 12 | def get_version(): 13 | ns = {} 14 | with open(VERSION_FILE) as f: 15 | exec(f.read(), ns) 16 | return ns['__version__'] 17 | 18 | 19 | def get_ext_modules(): 20 | from Cython.Build import cythonize 21 | 22 | extensions = [ 23 | Extension('toad.c_utils', sources = ['toad/c_utils.pyx'], include_dirs = [np.get_include()]), 24 | Extension('toad.merge', sources = ['toad/merge.pyx'], include_dirs = [np.get_include()]), 25 | ] 26 | 27 | return cythonize(extensions) 28 | 29 | 30 | def get_requirements(stage = None): 31 | file_name = 'requirements' 32 | 33 | if stage is not None: 34 | file_name = f"{file_name}-{stage}" 35 | 36 | requirements = [] 37 | with open(f"{file_name}.txt", 'r') as f: 38 | for line in f: 39 | line = line.strip() 40 | if not line or line.startswith('-'): 41 | continue 42 | 43 | requirements.append(line) 44 | 45 | return requirements 46 | 47 | 48 | setup( 49 | name = NAME, 50 | version = get_version(), 51 | description = 'Toad is dedicated to facilitating model development process, especially for a scorecard.', 52 | long_description = open('README.md', encoding = 'utf-8').read(), 53 | long_description_content_type = 'text/markdown', 54 | url = 'https://github.com/amphibian-dev/toad', 55 | author = 'ESC Team', 56 | author_email = 'secbone@gmail.com', 57 | packages = find_packages(exclude = ['tests']), 58 | include_dirs = [np.get_include()], 59 | ext_modules = get_ext_modules(), 60 | include_package_data = True, 61 | python_requires = '>=3.7', 62 | setup_requires = ['numpy'], 63 | tests_require = get_requirements('test'), 64 | license = 'MIT', 65 | classifiers = [ 66 | 'Operating System :: POSIX', 67 | 'Operating System :: Microsoft :: Windows', 68 | 'Operating System :: MacOS :: MacOS X', 69 | 'Programming Language :: Python :: 3.8', 70 | 'Programming Language :: Python :: 3.9', 71 | 'Programming Language :: Python :: 3.10', 72 | 'Programming Language :: Python :: 3.11', 73 | 'Programming Language :: Python :: 3.12', 74 | ], 75 | entry_points = { 76 | 'console_scripts': [ 77 | 'toad = toad.cli:main', 78 | ], 79 | }, 80 | ) 81 | 
-------------------------------------------------------------------------------- /toad/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .merge import merge, DTMerge, ChiMerge, StepMerge, QuantileMerge, KMeansMerge 3 | except ImportError: 4 | import warnings 5 | warnings.warn( 6 | """`merge` module needs to be built""", 7 | ImportWarning, 8 | ) 9 | 10 | from .detector import detect 11 | from .metrics import KS, KS_bucket, F1 12 | from .stats import quality, IV, VIF, WOE, entropy, entropy_cond, gini, gini_cond 13 | from .transform import Combiner, WOETransformer 14 | from .selection import select 15 | from .scorecard import ScoreCard 16 | from .utils import Progress, performance 17 | from .version import __version__ 18 | 19 | VERSION = __version__ 20 | -------------------------------------------------------------------------------- /toad/c_utils.pxd: -------------------------------------------------------------------------------- 1 | ctypedef fused number: 2 | int 3 | double 4 | long 5 | 6 | 7 | cdef number c_min(number[:] arr) 8 | 9 | cdef number c_sum(number[:,:] arr) 10 | 11 | cdef number[:] c_sum_axis_0(number[:,:] arr) 12 | 13 | cdef number[:] c_sum_axis_1(number[:,:] arr) 14 | -------------------------------------------------------------------------------- /toad/c_utils.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level = 3, infer_types = True, boundscheck = False 2 | 3 | import numpy as np 4 | cimport numpy as np 5 | cimport cython 6 | 7 | 8 | 9 | cdef number c_min(number[:] arr): 10 | cdef number res = np.inf 11 | 12 | for i in range(arr.shape[0]): 13 | if res > arr[i]: 14 | res = arr[i] 15 | return res 16 | 17 | 18 | cdef number c_sum(number[:,:] arr): 19 | cdef number res = 0 20 | 21 | cdef Py_ssize_t i,j 22 | for i in range(arr.shape[0]): 23 | for j in range(arr.shape[1]): 24 | res += arr[i, j] 25 | 26 | return res 27 | 28 | 29 | cdef number[:] c_sum_axis_0(number[:,:] arr): 30 | cdef number[:] res = np.zeros(arr.shape[1], dtype=float) 31 | 32 | for i in range(arr.shape[0]): 33 | for j in range(arr.shape[1]): 34 | res[j] += arr[i, j] 35 | 36 | return res 37 | 38 | 39 | cdef number[:] c_sum_axis_1(number[:,:] arr): 40 | cdef number[:] res = np.zeros(arr.shape[0], dtype=float) 41 | 42 | for i in range(arr.shape[0]): 43 | for j in range(arr.shape[1]): 44 | res[i] += arr[i, j] 45 | 46 | return res 47 | -------------------------------------------------------------------------------- /toad/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | toad command line application 3 | """ 4 | import argparse 5 | from .commands import get_plugins 6 | 7 | 8 | def add_sub(parsers, config): 9 | """add sub parser by config 10 | """ 11 | info = config.get('info', {}) 12 | args = config.get('args', []) 13 | defaults = config.get('defaults', None) 14 | 15 | sub_parser = parsers.add_parser(**info) 16 | 17 | for detail in args: 18 | flag = detail.pop('flag') 19 | sub_parser.add_argument(*flag, **detail) 20 | 21 | if defaults: 22 | sub_parser.set_defaults(**defaults) 23 | 24 | 25 | def get_parser(): 26 | """get parser 27 | """ 28 | parser = argparse.ArgumentParser( 29 | prog = 'toad', 30 | description = 'toad command line application', 31 | ) 32 | 33 | subparsers = parser.add_subparsers() 34 | 35 | plugins = get_plugins() 36 | for plug in plugins: 37 | add_sub(subparsers, plug.ARGS) 38 | 39 | return parser 40 | 41 | 42 | def main(): 43 | 
| """ 44 | """ 45 | parser = get_parser() 46 | 47 | args = parser.parse_args() 48 | if hasattr(args, 'func'): 49 | args.func(args) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /toad/cli_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pyximport 6 | 7 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 8 | 9 | from toad.cli import get_parser 10 | 11 | def disable_stdout(fn): 12 | 13 | def wrapper(*args): 14 | import os 15 | import sys 16 | 17 | with open(os.devnull, 'w') as f: 18 | so = sys.stdout 19 | sys.stdout = f 20 | 21 | fn(*args) 22 | 23 | sys.stdout = so 24 | 25 | return wrapper 26 | 27 | 28 | parser = get_parser() 29 | 30 | 31 | 32 | @disable_stdout 33 | def test_detect(): 34 | args = parser.parse_args(['detect', '-i', 'tests/test_data.csv']) 35 | rep = args.func(args) 36 | assert rep.loc['E', 'unique'] == 20 37 | 38 | @pytest.mark.skip("tree command will generate a pic in travis-ci log") 39 | @disable_stdout 40 | def test_tree(): 41 | args = parser.parse_args(['tree', '-i', 'tests/test_data.csv']) 42 | args.func(args) 43 | pass 44 | -------------------------------------------------------------------------------- /toad/commands/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pkgutil 3 | from importlib import import_module 4 | 5 | COMMAND_DIR = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | def get_plugins(): 8 | plugins = [] 9 | 10 | for _, name, ispkg in pkgutil.iter_modules([COMMAND_DIR]): 11 | if ispkg: 12 | module = import_module('toad.commands.{}'.format(name)) 13 | plugins.append(module) 14 | 15 | return plugins 16 | -------------------------------------------------------------------------------- /toad/commands/detect/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import pandas as pd 4 | 5 | def func(args): 6 | """detect csv data 7 | 8 | Examples: 9 | 10 | toad detect -i xxx.csv -o report.csv 11 | """ 12 | from toad.detector import detect 13 | 14 | sys.stdout.write('reading data....\n') 15 | with args.input as input: 16 | data = pd.read_csv(input) 17 | 18 | sys.stdout.write('detecting...\n') 19 | report = detect(data) 20 | 21 | if args.output: 22 | sys.stdout.write('saving report...\n') 23 | report.to_csv(args.output) 24 | sys.stdout.write('report saved!\n') 25 | else: 26 | sys.stdout.write(str(report)) 27 | sys.stdout.write('\n') 28 | 29 | return report 30 | 31 | ARGS = { 32 | 'info': { 33 | 'name': 'detect', 34 | 'description': 'detect data from a csv file', 35 | }, 36 | 'defaults': { 37 | 'func': func, 38 | }, 39 | 'args': [ 40 | { 41 | 'flag': ('-i', '--input'), 42 | 'type': argparse.FileType(), 43 | 'help': 'the csv file which will be detected', 44 | 'required': True, 45 | }, 46 | { 47 | 'flag': ('-o', '--output'), 48 | 'type': argparse.FileType('w'), 49 | 'help': 'path of the csv report will be saved', 50 | }, 51 | ] 52 | } 53 | -------------------------------------------------------------------------------- /toad/commands/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import pandas as pd 4 | 5 | def func(args): 6 | """detect csv data 7 | 8 | Examples: 9 | 10 | toad evaluate -i xxx.csv 11 | 
""" 12 | from .evaluate import evaluate 13 | 14 | 15 | sys.stdout.write('reading data....\n') 16 | 17 | test_data = pd.read_csv(args.input) 18 | if args.base is not None: 19 | self_data = pd.read_csv(args.base) 20 | else: 21 | self_data = None 22 | 23 | arguments = { 24 | 'excel_name': args.name, 25 | 'num': args.top, 26 | 'iv_threshold_value': args.iv, 27 | 'unique_num': args.unique, 28 | 'self_data': self_data, 29 | 'overdue_days': args.overdue, 30 | } 31 | 32 | evaluate(test_data, **arguments) 33 | 34 | 35 | ARGS = { 36 | 'info': { 37 | 'name': 'evaluate', 38 | 'description': '第三方数据评估', 39 | }, 40 | 'defaults': { 41 | 'func': func, 42 | }, 43 | 'args': [ 44 | { 45 | 'flag': ('-i', '--input'), 46 | 'type': argparse.FileType('r', encoding='utf-8'), 47 | 'help': '需要评估的 csv 文件', 48 | 'required': True, 49 | }, 50 | { 51 | 'flag': ('--base',), 52 | 'type': argparse.FileType('r', encoding='utf-8'), 53 | 'help': '用于测试提升效果的基准 csv 数据文件', 54 | 'default': None, 55 | }, 56 | { 57 | 'flag': ('--overdue',), 58 | 'help': '是否启用逾期天数分析', 59 | 'action': 'store_true', 60 | }, 61 | { 62 | 'flag': ('--top',), 63 | 'type': int, 64 | 'help': '选择 IV 最高的 n 个变量分析', 65 | 'default': 10, 66 | }, 67 | { 68 | 'flag': ('--iv',), 69 | 'type': float, 70 | 'help': '选择 IV 大于阈值的变量进行分析', 71 | 'default': 0.02, 72 | }, 73 | { 74 | 'flag': ('--unique',), 75 | 'type': int, 76 | 'help': '将连续变量合并成 n 组进行分析', 77 | 'default': 10, 78 | }, 79 | { 80 | 'flag': ('--name',), 81 | 'type': str, 82 | 'help': '生成报告的文件名', 83 | 'default': 'report.xlsx', 84 | }, 85 | ] 86 | } 87 | -------------------------------------------------------------------------------- /toad/commands/tree/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import pandas as pd 4 | 5 | def func(args): 6 | """detect csv data 7 | 8 | Examples: 9 | 10 | toad tree -i xxx.csv 11 | """ 12 | import toad 13 | from .tree import split_data, dtree 14 | args = vars(args) 15 | 16 | # remove func attribute 17 | args.pop('func') 18 | 19 | input = args.pop('input') 20 | target = args.pop('target') 21 | include = args.pop('include') 22 | exclude = args.pop('exclude') 23 | 24 | sys.stdout.write('reading data....\n') 25 | data = pd.read_csv(input) 26 | 27 | X, *tars = split_data(data, target = target) 28 | 29 | if include is not None: 30 | X = X[include] 31 | 32 | if exclude is not None: 33 | X = X.drop(columns = exclude) 34 | 35 | X = toad.utils.get_dummies(X) 36 | 37 | 38 | for t in tars: 39 | sys.stdout.write('analyse '+ t.name +' ...\n') 40 | dtree(X, t, **args) 41 | 42 | 43 | ARGS = { 44 | 'info': { 45 | 'name': 'tree', 46 | 'description': 'analyse bad rate from a csv file', 47 | }, 48 | 'defaults': { 49 | 'func': func, 50 | }, 51 | 'args': [ 52 | { 53 | 'flag': ('-i', '--input'), 54 | 'type': argparse.FileType('r', encoding='utf-8'), 55 | 'help': 'the csv file which will be analysed', 56 | 'required': True, 57 | }, 58 | { 59 | 'flag': ('-t', '--target'), 60 | 'nargs': '+', 61 | 'help': 'the target(s) will be analysed', 62 | 'default': 'target', 63 | }, 64 | { 65 | 'flag': ('-c', '--criterion'), 66 | 'type': str, 67 | 'help': 'criterion to measure the quality of a split. 
Support "gini" (default), "entropy"', 68 | 'default': 'gini', 69 | }, 70 | { 71 | 'flag': ('-d', '--depth'), 72 | 'type': int, 73 | 'help': 'the maximum depth of the tree', 74 | 'default': None, 75 | }, 76 | { 77 | 'flag': ('-s', '--sample'), 78 | 'type': float, 79 | 'help': 'minimum number of sample in each node', 80 | 'default': 0.01, 81 | }, 82 | { 83 | 'flag': ('-r', '--ratio'), 84 | 'type': float, 85 | 'help': 'threshold of ratio that will be highlighted', 86 | 'default': 0.15, 87 | }, 88 | { 89 | 'flag': ('--exclude',), 90 | 'nargs': '+', 91 | 'help': 'feature names that will not use to analyse', 92 | 'default': None, 93 | }, 94 | { 95 | 'flag': ('--include',), 96 | 'nargs': '+', 97 | 'help': 'feature names that will be used to analyse', 98 | 'default': None, 99 | }, 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /toad/commands/tree/tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | Windows: 3 | conda install python-graphviz 4 | Mac: 5 | brew install graphviz 6 | pip install graphviz 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | import graphviz 13 | 14 | import sklearn 15 | from sklearn.tree import DecisionTreeClassifier 16 | 17 | 18 | def tree_to_dot(tree, features, high_light = 0.15): 19 | from io import StringIO 20 | from sklearn.tree import _tree 21 | 22 | out = StringIO() 23 | tree_ = tree.tree_ 24 | 25 | features = np.array([ 26 | features[i] if i != _tree.TREE_UNDEFINED else "undefined!" 27 | for i in tree_.feature 28 | ]) 29 | 30 | out.write('digraph Tree {\n') 31 | out.write('edge [fontname="FangSong"];\n') 32 | out.write('node [shape=box];\n') 33 | 34 | def recurse(node, parent = None, label = None): 35 | sample = tree_.n_node_samples[node] 36 | bad_rate = tree_.value[node][0,1] / sample 37 | 38 | out.write('{} [label="'.format(node)) 39 | 40 | out.write('bad rate: {:.2%}\n'.format(bad_rate)) 41 | out.write('sample: {:.2%}\n'.format(sample / tree_.n_node_samples[0])) 42 | 43 | # end of label 44 | out.write('"') 45 | 46 | if bad_rate > high_light: 47 | out.write(', color="red"') 48 | 49 | # end of node 50 | out.write('];\n') 51 | 52 | if tree_.feature[node] != _tree.TREE_UNDEFINED: 53 | name = features[node] 54 | threshold = tree_.threshold[node] 55 | recurse(tree_.children_left[node], node, '{} <= {:.2f}'.format(name, threshold)) 56 | recurse(tree_.children_right[node], node, '{} > {:.2f}'.format(name, threshold)) 57 | 58 | if parent is not None: 59 | out.write('{} -> {} [label="{}"];\n'.format(parent, node, label)) 60 | 61 | recurse(0, None) 62 | 63 | out.write('}') 64 | s = out.getvalue() 65 | out.close() 66 | return s 67 | 68 | 69 | def dot_to_img(dot, file = 'report.png'): 70 | import os 71 | 72 | name, ext = os.path.splitext(file) 73 | 74 | graph = graphviz.Source(dot) 75 | graph.format = ext[1:] 76 | graph.view(name, cleanup = True) 77 | 78 | 79 | def split_data(frame, target = 'target'): 80 | X = frame.drop(columns = target) 81 | 82 | res = (X,) 83 | if isinstance(target, str): 84 | target = [target] 85 | 86 | for col in target: 87 | res += (frame[col],) 88 | 89 | return res 90 | 91 | 92 | def dtree(frame, target, criterion = 'gini', depth = None, sample = 0.01, ratio = 0.15): 93 | tree = DecisionTreeClassifier( 94 | criterion = criterion, 95 | min_samples_leaf = sample, 96 | max_depth = depth, 97 | ) 98 | 99 | tree.fit(frame.fillna(-1), target) 100 | 101 | dot_string = tree_to_dot(tree, frame.columns.values, high_light = ratio) 102 | 103 
-------------------------------------------------------------------------------- /toad/detector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """Command line tools for detecting csv data 4 | 5 | Team: ESC 6 | 7 | Examples: 8 | 9 | python detector.py -i xxx.csv -o report.csv 10 | 11 | """ 12 | 13 | import pandas as pd 14 | 15 | def getTopValues(series, top = 5, reverse = False): 16 | """Get top/bottom n values 17 | 18 | Args: 19 | series (Series): data series 20 | top (number): number of top/bottom n values 21 | reverse (bool): it will return bottom n values if True is given 22 | 23 | Returns: 24 | Series: Series of top/bottom n values and percentage. ['value:percent', None] 25 | """ 26 | itype = 'top' 27 | counts = series.value_counts() 28 | counts = list(zip(counts.index, counts, counts.divide(series.size))) 29 | 30 | if reverse: 31 | counts.reverse() 32 | itype = 'bottom' 33 | 34 | template = "{0[0]}:{0[2]:.2%}" 35 | indexs = [itype + str(i + 1) for i in range(top)] 36 | values = [template.format(counts[i]) if i < len(counts) else None for i in range(top)] 37 | 38 | return pd.Series(values, index = indexs) 39 | 40 | 41 | def getDescribe(series, percentiles = [.25, .5, .75]): 42 | """Get describe of series 43 | 44 | Args: 45 | series (Series): data series 46 | percentiles: the percentiles to include in the output 47 | 48 | Returns: 49 | Series: the description of the data, including mean, std, min, max and percentiles 50 | """ 51 | d = series.describe(percentiles) 52 | return d.drop('count') 53 | 54 | 55 | def countBlank(series, blanks = []): 56 | """Count number and percentage of blank values in series 57 | 58 | Args: 59 | series (Series): data series 60 | blanks (list): list of blank values 61 | 62 | Returns: 63 | number: number of blanks 64 | str: the percentage of blank values 65 | """ 66 | if len(blanks)>0: 67 | isnull = series.replace(blanks, None).isnull() 68 | else: 69 | isnull = series.isnull() 70 | n = isnull.sum() 71 | ratio = isnull.mean() 72 | 73 | return (n, "{0:.2%}".format(ratio)) 74 | 75 | 76 | def isNumeric(series): 77 | """Check if the series's type is numeric 78 | 79 | Args: 80 | series (Series): data series 81 | 82 | Returns: 83 | bool 84 | """ 85 | return series.dtype.kind in 'ifc' 86 | 87 | 88 | def detect(dataframe): 89 | """ Detect data 90 | 91 | Args: 92 | dataframe (DataFrame): data that will be detected 93 | 94 | Returns: 95 | DataFrame: report of detecting 96 | """ 97 | 98 | rows = [] 99 | for name, series in dataframe.items(): 100 | numeric_index = ['mean', 'std', 'min', '1%', '10%', '50%', '75%', '90%', '99%', 'max'] 101 | discrete_index = ['top1', 'top2', 'top3', 'top4', 'top5', 'bottom5', 'bottom4', 'bottom3', 'bottom2', 'bottom1'] 102 | 103 | details_index = [numeric_index[i] + '_or_' + discrete_index[i] for i in range(len(numeric_index))] 104 | details = [] 105 | 106 | if isNumeric(series): 107 | desc = getDescribe( 108 | series, 109 | percentiles = [.01, .1, .5, .75, .9, .99] 110 | ) 111 | details = desc.tolist() 112 | else: 113 | top5 = getTopValues(series) 114 | bottom5 = getTopValues(series, reverse = True) 115 | details = top5.tolist() + bottom5[::-1].tolist() 116 | 117 | # print(details_index) 118 | nblank, pblank = countBlank(series) 119 | 120 | row = pd.Series( 121 | index = ['type', 'size', 'missing', 'unique'] + details_index, 122 | data = [series.dtype, series.size, pblank, series.nunique()] + details 123 | ) 124 | 125 | row.name = name 126 | rows.append(row) 127 | 128 | return pd.DataFrame(rows) 129 |
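# --- Example: a minimal sketch of detect() on a toy frame (column names and
# values are made up). Numeric columns get describe-style stats, object
# columns get top/bottom value counts.
import numpy as np
import pandas as pd
from toad.detector import detect

df = pd.DataFrame({
    'age': [20, 25, 30, np.nan],
    'city': ['a', 'b', 'a', 'c'],
})
report = detect(df)
print(report[['type', 'size', 'missing', 'unique']])  # one row per column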
-------------------------------------------------------------------------------- /toad/impute.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pandas.api.types import is_numeric_dtype 4 | from sklearn.experimental import enable_iterative_imputer 5 | from sklearn.impute import IterativeImputer 6 | from sklearn.ensemble import RandomForestRegressor 7 | from sklearn.preprocessing import LabelEncoder 8 | 9 | 10 | 11 | def impute(df): 12 | imputer = Imputer( 13 | estimator = RandomForestRegressor(), 14 | random_state = 1, 15 | ) 16 | 17 | return imputer.fit_transform(df) 18 | 19 | 20 | class Imputer(IterativeImputer): 21 | def __init__(self, missing_values = np.nan, **kwargs): 22 | super().__init__(missing_values = np.nan, **kwargs) 23 | 24 | if not isinstance(missing_values, list): 25 | missing_values = [missing_values] 26 | 27 | self.missing_values_list = missing_values 28 | self.encoder_dict = dict() 29 | 30 | def _impute_one_feature(self, X_filled, mask_missing_values, feat_idx, 31 | neighbor_feat_idx, **kwargs): 32 | 33 | return super()._impute_one_feature(X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, **kwargs) 34 | 35 | def fit_transform(self, X, **kwargs): 36 | X, mask = self._replace_empty(X) 37 | X = self._fit_encode(X, mask) 38 | 39 | res = super().fit_transform(X, **kwargs) 40 | res = pd.DataFrame(res, columns = X.columns) 41 | return self._decode(res) 42 | 43 | 44 | def transform(self, X, **kwargs): 45 | X, mask = self._replace_empty(X) 46 | X = self._encode(X, mask) 47 | 48 | res = super().transform(X, **kwargs) 49 | res = pd.DataFrame(res, columns = X.columns) 50 | return self._decode(res) 51 | 52 | 53 | def _replace_empty(self, X): 54 | mask = X.isin(self.missing_values_list) 55 | X = X.where(~mask, np.nan) 56 | return X, mask 57 | 58 | def _fit_encode(self, X, mask): 59 | """fit encoder for object data 60 | 61 | Args: 62 | X (DataFrame) 63 | mask (Mask): empty mask for X 64 | """ 65 | category_data = X.select_dtypes(exclude = np.number).columns 66 | 67 | for col in category_data: 68 | unique, X[col].loc[~mask[col]] = np.unique(X[col][~mask[col]], return_inverse = True) 69 | 70 | self.encoder_dict[col] = unique 71 | 72 | return X 73 | 74 | def _encode(self, X, mask): 75 | """encode object data to number 76 | 77 | Args: 78 | X (DataFrame) 79 | mask (Mask): empty mask for X 80 | """ 81 | for col, unique in self.encoder_dict.items(): 82 | table = dict(zip(unique, np.arange(len(unique)))) 83 | X[col].loc[~mask[col]] = np.array([table[v] for v in X[col][~mask[col]]]) 84 | 85 | return X 86 | 87 | def _decode(self, X): 88 | """decode object data from number back to the original data 89 | 90 | Args: 91 | X (DataFrame) 92 | 93 | """ 94 | for col, unique in self.encoder_dict.items(): 95 | ix = X[col].values.astype(int) 96 | X[col] = unique[ix] 97 | 98 | return X 99 | 100 | 101 | 102 | 103 |
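# --- Example: a minimal sketch of impute() on toy mixed-type data; the
# categorical column is label-encoded, iteratively imputed with a random
# forest, then decoded back (values are made up).
import numpy as np
import pandas as pd
from toad.impute import impute

df = pd.DataFrame({
    'x': [1.0, np.nan, 3.0, 4.0, 2.0],
    'k': ['a', 'b', np.nan, 'a', 'b'],
})
filled = impute(df)
assert filled.isna().sum().sum() == 0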
-------------------------------------------------------------------------------- /toad/impute_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from .impute import impute 5 | 6 | 7 | ab = np.array(list('ABCDEFG')) 8 | int_feat = np.random.randint(10, size = 500) 9 | float_feat = np.random.rand(500) 10 | str_feat = ab[np.random.choice(7, 500)] 11 | uni_feat = np.ones(500) 12 | # empty_feat = np.full(500, np.nan) 13 | 14 | target = np.random.randint(2, size = 500) 15 | 16 | df = pd.DataFrame({ 17 | 'A': int_feat, 18 | 'B': str_feat, 19 | 'C': uni_feat, 20 | 'D': float_feat, 21 | # 'E': empty_feat, 22 | }) 23 | 24 | mask = np.random.choice([True, False], size = 500 * 4, p = [0.95, 0.05]).reshape(500, 4) 25 | df = df.where(mask, np.nan) 26 | 27 | 28 | def test_impute_with_number(): 29 | res = impute(df.drop(columns = 'B')) 30 | 31 | assert res.isna().sum().sum() == 0 32 | 33 | 34 | def test_impute_with_str(): 35 | res = impute(df) 36 | 37 | assert res.isna().sum().sum() == 0 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /toad/merge_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pyximport 6 | 7 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 8 | 9 | from .merge import merge, ChiMerge, DTMerge, QuantileMerge, StepMerge, KMeansMerge 10 | 11 | np.random.seed(1) 12 | feature = np.random.rand(500) 13 | target = np.random.randint(2, size = 500) 14 | A = np.random.randint(100, size = 500) 15 | B = np.random.randint(3, size = 500) 16 | 17 | df = pd.DataFrame({ 18 | 'feature': feature, 19 | 'target': target, 20 | 'A': A, 21 | }) 22 | 23 | 24 | 25 | def test_chimerge(): 26 | splits = ChiMerge(feature, target, n_bins = 10) 27 | assert len(splits) == 9 28 | 29 | def test_chimerge_bins_not_enough(): 30 | splits = ChiMerge(B, target, n_bins = 10) 31 | assert len(splits) == 2 32 | 33 | def test_chimerge_bins_with_min_samples(): 34 | splits = ChiMerge(feature, target, min_samples = 0.02) 35 | assert len(splits) == 10 36 | 37 | def test_dtmerge(): 38 | splits = DTMerge(feature, target, n_bins = 10) 39 | assert len(splits) == 9 40 | 41 | def test_quantilemerge(): 42 | splits = QuantileMerge(feature, n_bins = 10) 43 | assert len(splits) == 9 44 | 45 | def test_quantilemerge_not_enough(): 46 | splits = QuantileMerge(B, n_bins = 10) 47 | assert len(splits) == 2 48 | 49 | def test_stepmerge(): 50 | splits = StepMerge(feature, n_bins = 10) 51 | assert len(splits) == 9 52 | 53 | def test_kmeansmerge(): 54 | splits = KMeansMerge(feature, n_bins = 10) 55 | assert len(splits) == 9 56 | 57 | def test_merge(): 58 | res = merge(feature, target = target, method = 'chi', n_bins = 10) 59 | assert len(np.unique(res)) == 10 60 | 61 | def test_merge_frame(): 62 | res = merge(df, target = 'target', method = 'chi', n_bins = 10) 63 | assert len(np.unique(res['A'])) == 10 64 | -------------------------------------------------------------------------------- /toad/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import ks_2samp 4 | 5 | from sklearn.metrics import f1_score, roc_auc_score, roc_curve 6 | 7 | from .utils import ( 8 | feature_splits, 9 | iter_df, 10 | unpack_tuple, 11 | bin_by_splits, 12 | ) 13 | 14 | 15 | def KS(score, target): 16 | """calculate ks value 17 | 18 | Args: 19 | score (array-like): list of scores or probabilities that the model predicts 20 | target (array-like): list of real targets 21 | 22 | Returns: 23 | float: the max KS value 24 | """ 25 | mask = target == 1 26 | res = ks_2samp(score[mask], score[~mask]) 27 | return res[0] 28 | 29 |
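# --- Example: a minimal sketch of KS on random data, shown as a standalone
# snippet (the statistic on pure noise is close to zero; numbers are
# illustrative only).
import numpy as np
from toad.metrics import KS

score = np.random.rand(1000)
target = np.random.randint(2, size = 1000)
print(KS(score, target))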
30 | def KS_bucket(score, target, bucket = 10, method = 'quantile', return_splits = False, **kwargs): 31 | """calculate ks value by bucket 32 | 33 | Args: 34 | score (array-like): list of scores or probabilities that the model predicts 35 | target (array-like): list of real targets 36 | bucket (int): number of groups to bin into 37 | method (str): method to bin score. `quantile` (default), `step` 38 | return_splits (bool): whether to return the split points of the buckets 39 | 40 | Returns: 41 | DataFrame 42 | """ 43 | df = pd.DataFrame({ 44 | 'score': score, 45 | 'bad': target, 46 | }) 47 | 48 | df['good'] = 1 - df['bad'] 49 | 50 | bad_total = df['bad'].sum() 51 | good_total = df['good'].sum() 52 | all_total = bad_total + good_total 53 | 54 | splits = None 55 | df['bucket'] = 0 56 | 57 | if bucket is False: 58 | df['bucket'] = score 59 | elif isinstance(bucket, (list, np.ndarray, pd.Series)): 60 | # list of split pointers 61 | if len(bucket) < len(score): 62 | bucket = bin_by_splits(score, bucket) 63 | 64 | df['bucket'] = bucket 65 | elif isinstance(bucket, int): 66 | from .merge import merge 67 | df['bucket'], splits = merge(score, n_bins = bucket, method = method, return_splits = True, **kwargs) 68 | 69 | grouped = df.groupby('bucket', as_index = False) 70 | 71 | agg1 = pd.DataFrame() 72 | agg1['min'] = grouped.min()['score'] 73 | agg1['max'] = grouped.max()['score'] 74 | agg1['bads'] = grouped.sum()['bad'] 75 | agg1['goods'] = grouped.sum()['good'] 76 | agg1['total'] = agg1['bads'] + agg1['goods'] 77 | 78 | agg2 = (agg1.sort_values(by = 'min')).reset_index(drop = True) 79 | 80 | agg2['bad_rate'] = agg2['bads'] / agg2['total'] 81 | agg2['good_rate'] = agg2['goods'] / agg2['total'] 82 | 83 | agg2['odds'] = agg2['bads'] / agg2['goods'] 84 | 85 | agg2['bad_prop'] = agg2['bads'] / bad_total 86 | agg2['good_prop'] = agg2['goods'] / good_total 87 | agg2['total_prop'] = agg2['total'] / all_total 88 | 89 | 90 | cum_bads = agg2['bads'].cumsum() 91 | cum_goods = agg2['goods'].cumsum() 92 | cum_total = agg2['total'].cumsum() 93 | 94 | cum_bads_rev = agg2.loc[::-1, 'bads'].cumsum()[::-1] 95 | cum_goods_rev = agg2.loc[::-1, 'goods'].cumsum()[::-1] 96 | cum_total_rev = agg2.loc[::-1, 'total'].cumsum()[::-1] 97 | 98 | agg2['cum_bad_rate'] = cum_bads / cum_total 99 | agg2['cum_bad_rate_rev'] = cum_bads_rev / cum_total_rev 100 | 101 | agg2['cum_bads_prop'] = cum_bads / bad_total 102 | agg2['cum_bads_prop_rev'] = cum_bads_rev / bad_total 103 | agg2['cum_goods_prop'] = cum_goods / good_total 104 | agg2['cum_goods_prop_rev'] = cum_goods_rev / good_total 105 | agg2['cum_total_prop'] = cum_total / all_total 106 | agg2['cum_total_prop_rev'] = cum_total_rev / all_total 107 | 108 | 109 | agg2['ks'] = agg2['cum_bads_prop'] - agg2['cum_goods_prop'] 110 | 111 | reverse_suffix = '' 112 | # fix negative ks value 113 | if agg2['ks'].sum() < 0: 114 | agg2['ks'] = -agg2['ks'] 115 | reverse_suffix = '_rev' 116 | 117 | agg2['lift'] = agg2['bad_prop'] / agg2['total_prop'] 118 | agg2['cum_lift'] = agg2['cum_bads_prop' + reverse_suffix] / agg2['cum_total_prop' + reverse_suffix] 119 | 120 | if return_splits and splits is not None: 121 | return agg2, splits 122 | 123 | return agg2 124 |
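# --- Example: a minimal sketch of KS_bucket, shown as a standalone snippet;
# it returns one row per bucket with the columns computed above.
import numpy as np
from toad.metrics import KS_bucket

score = np.random.rand(1000)
target = np.random.randint(2, size = 1000)
tab = KS_bucket(score, target, bucket = 10)
print(tab[['min', 'max', 'bads', 'goods', 'bad_rate', 'ks', 'lift']])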
125 | def KS_by_col(df, by='feature', score='score', target='target'): 126 | """ 127 | """ 128 | 129 | pass 130 | 131 | 132 | def SSE(y_pred, y): 133 | """sum of squares due to error 134 | """ 135 | return np.sum((y_pred - y) ** 2) 136 | 137 | 138 | def MSE(y_pred, y): 139 | """mean of squares due to error 140 | """ 141 | return np.mean((y_pred - y) ** 2) 142 | 143 | 144 | def AIC(y_pred, y, k, llf = None): 145 | """Akaike Information Criterion 146 | 147 | Args: 148 | y_pred (array-like) 149 | y (array-like) 150 | k (int): number of features 151 | llf (float): result of log-likelihood function 152 | """ 153 | if llf is None: 154 | llf = np.log(SSE(y_pred, y)) 155 | 156 | return 2 * k - 2 * llf 157 | 158 | 159 | def BIC(y_pred, y, k, llf = None): 160 | """Bayesian Information Criterion 161 | 162 | Args: 163 | y_pred (array-like) 164 | y (array-like) 165 | k (int): number of features 166 | llf (float): result of log-likelihood function 167 | """ 168 | n = len(y) 169 | if llf is None: 170 | llf = np.log(SSE(y_pred, y)) 171 | 172 | return np.log(n) * k - 2 * llf 173 | 174 | 175 | def F1(score, target, split = 'best', return_split = False): 176 | """calculate f1 value 177 | 178 | Args: 179 | score (array-like) 180 | target (array-like) 181 | 182 | Returns: 183 | float: best f1 score 184 | float: best split point 185 | """ 186 | dataframe = pd.DataFrame({ 187 | 'score': score, 188 | 'target': target, 189 | }) 190 | 191 | if split == 'best': 192 | # find best split for score 193 | splits = feature_splits(dataframe['score'], dataframe['target']) 194 | else: 195 | splits = [split] 196 | 197 | best = 0 198 | sp = None 199 | for df, pointer in iter_df(dataframe, 'score', 'target', splits): 200 | v = f1_score(df['target'], df['score']) 201 | 202 | if v > best: 203 | best = v 204 | sp = pointer 205 | 206 | if return_split: 207 | return best, sp 208 | 209 | return best 210 | 211 | 212 | def AUC(score, target, return_curve = False): 213 | """AUC Score 214 | 215 | Args: 216 | score (array-like): list of scores or probabilities that the model predicts 217 | target (array-like): list of real targets 218 | return_curve (bool): whether to return curve data for a ROC plot 219 | 220 | Returns: 221 | float: auc score 222 | """ 223 | # fix score order 224 | if np.nanmax(score) > 1: 225 | score = -score 226 | 227 | auc = roc_auc_score(target, score) 228 | 229 | if not return_curve: 230 | return auc 231 | 232 | return (auc,) + roc_curve(target, score) 233 | 234 | 235 | def _PSI(test, base): 236 | test_prop = pd.Series(test).value_counts(normalize = True, dropna = False) 237 | base_prop = pd.Series(base).value_counts(normalize = True, dropna = False) 238 | 239 | psi = np.sum((test_prop - base_prop) * np.log(test_prop / base_prop)) 240 | 241 | frame = pd.DataFrame({ 242 | 'test': test_prop, 243 | 'base': base_prop, 244 | }) 245 | frame.index.name = 'value' 246 | 247 | return psi, frame.reset_index() 248 | 249 | 250 | 251 | def PSI(test, base, combiner = None, return_frame = False): 252 | """calculate PSI 253 | 254 | Args: 255 | test (array-like): data to test PSI 256 | base (array-like): base data for calculating PSI 257 | combiner (Combiner|list|dict): combiner to combine data 258 | return_frame (bool): whether to return the frame of proportions 259 | 260 | Returns: 261 | float|Series 262 | """ 263 | 264 | if combiner is not None: 265 | if isinstance(combiner, (dict, list)): 266 | from .transform import Combiner 267 | combiner = Combiner().load(combiner) 268 | 269 | test = combiner.transform(test, labels = True) 270 | base = combiner.transform(base, labels = True) 271 | 272 | psi = list() 273 | frame = list() 274 | 275 | if isinstance(test, pd.DataFrame): 276 | for col in test: 277 | p, f = _PSI(test[col], base[col]) 278 | psi.append(p) 279 | frame.append(f) 280 | 281 | psi = pd.Series(psi, index = test.columns) 282 | 283 | frame = pd.concat( 284 | frame, 285 | keys = test.columns, 286 | names = ['columns', 'id'], 287 | ).reset_index() 288 | frame = frame.drop(columns = 'id') 289 | else: 290 | psi, frame = _PSI(test, base) 291 | 292 | 293 | res = (psi,) 294 | 295 | if return_frame: 296 | res += (frame,) 297 | 298 | return unpack_tuple(res) 299 | 300 |
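# --- Example: a minimal sketch of PSI with an explicit split list; the list
# is loaded into a Combiner, both arrays are binned, and the binned
# proportions are compared (values are random, so the PSI will be small).
import numpy as np
from toad.metrics import PSI

test = np.random.rand(1000)
base = np.random.rand(1000)
print(PSI(test, base, combiner = [0.3, 0.5, 0.7]))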
301 | def matrix(y_pred, y, splits = None): 302 | """confusion matrix of target 303 | 304 | Args: 305 | y_pred (array-like) 306 | y (array-like) 307 | splits (float|list): split points of y_pred 308 | 309 | Returns: 310 | DataFrame: confusion matrix with true labels in rows and predicted labels in columns 311 | 312 | """ 313 | if splits is not None: 314 | y_pred = bin_by_splits(y_pred, splits) 315 | 316 | labels = np.unique(y) 317 | from sklearn.metrics import confusion_matrix 318 | m = confusion_matrix(y, y_pred, labels = labels) 319 | 320 | return pd.DataFrame( 321 | m, 322 | index = pd.Index(labels, name = 'Actual'), 323 | columns = pd.Index(labels, name = 'Predicted'), 324 | ) 325 |
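# --- Example: a minimal sketch of matrix(); continuous predictions are
# binned at the split point, then cross-tabulated against the true labels
# (mirrors the usage in metrics_test.py below).
import numpy as np
from toad.metrics import matrix

y_pred = np.random.rand(500)
y = np.random.randint(2, size = 500)
print(matrix(y_pred, y, splits = 0.5))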
-------------------------------------------------------------------------------- /toad/metrics_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .metrics import KS, KS_bucket, F1, PSI, AUC, matrix 6 | 7 | np.random.seed(1) 8 | 9 | feature = np.random.rand(500) 10 | target = np.random.randint(2, size = 500) 11 | base_feature = np.random.rand(500) 12 | 13 | test_df = pd.DataFrame({ 14 | 'A': np.random.rand(500), 15 | 'B': np.random.rand(500), 16 | }) 17 | base_df = pd.DataFrame({ 18 | 'A': np.random.rand(500), 19 | 'B': np.random.rand(500), 20 | }) 21 | 22 | FUZZ_THRESHOLD = 1e-10 23 | 24 | def test_KS(): 25 | result = KS(feature, target) 26 | assert result == 0.05536775661256989 27 | 28 | def test_KS_bucket(): 29 | result = KS_bucket(feature, target) 30 | assert result.loc[4, 'ks'] == -0.028036335090276976 31 | 32 | def test_KS_bucket_use_step(): 33 | result = KS_bucket(feature, target, method = 'step', clip_q = 0.01) 34 | assert result.loc[4, 'ks'] == -0.0422147102645028 35 | 36 | def test_KS_bucket_for_all_score(): 37 | result = KS_bucket(feature, target, bucket = False) 38 | assert len(result) == 500 39 | 40 | def test_KS_bucket_return_splits(): 41 | result, splits = KS_bucket(feature, target, return_splits = True) 42 | assert len(splits) == 9 43 | 44 | def test_KS_bucket_use_split_pointers(): 45 | result = KS_bucket(feature, target, bucket = [0.2, 0.6]) 46 | assert len(result) == 3 47 | 48 | def test_KS_bucket_with_lift(): 49 | result = KS_bucket(feature, target) 50 | assert result.loc[3, 'lift'] == 1.0038610038610036 51 | 52 | def test_KS_bucket_with_cum_lift(): 53 | result = KS_bucket(feature, target) 54 | assert result.loc[3, 'cum_lift'] == 1.003861003861004 55 | 56 | 57 | def test_F1(): 58 | result, split = F1(feature, target, return_split = True) 59 | assert result == pytest.approx(0.6844207723035951, FUZZ_THRESHOLD) 60 | 61 | def test_F1_split(): 62 | result = F1(feature, target, split = 0.5) 63 | assert result == pytest.approx(0.51417004048583, FUZZ_THRESHOLD) 64 | 65 | def test_AUC(): 66 | result = AUC(feature, target) 67 | assert result == 0.5038690142424582 68 | 69 | def test_AUC_with_curve(): 70 | auc, fpr, tpr, thresholds = AUC(feature, target, return_curve = True) 71 | assert thresholds[200] == 0.15773006987053328 72 | 73 | def test_PSI(): 74 | result = PSI(feature, base_feature, combiner = [0.3, 0.5, 0.7]) 75 | assert result == 0.018630024627491467 76 | 77 | def test_PSI_frame(): 78 | result = PSI( 79 | test_df, 80 | base_df, 81 | combiner = { 82 | 'A': [0.3, 0.5, 0.7], 83 | 'B': [0.4, 0.8], 84 | }, 85 | ) 86 | 87 | assert result['B'] == pytest.approx(0.014528279995858708, FUZZ_THRESHOLD) 88 | 89 | def test_PSI_return_frame(): 90 | result, frame = PSI( 91 | test_df, 92 | base_df, 93 | combiner = { 94 | 'A': [0.3, 0.5, 0.7], 95 | 'B': [0.4, 0.8], 96 | }, 97 | return_frame = True, 98 | ) 99 | 100 | assert frame.loc[4, 'test'] == 0.38 101 | 102 | def test_matrix(): 103 | df = matrix(feature, target, splits = 0.5) 104 | assert df.iloc[0,1] == 133 105 | -------------------------------------------------------------------------------- /toad/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | from .trainer import * 3 | -------------------------------------------------------------------------------- /toad/nn/functional.py: -------------------------------------------------------------------------------- 1 | from toad.utils.decorator import support_numpy 2 | 3 | def flooding(loss, b): 4 | """flooding loss 5 | """ 6 | return (loss - b).abs() + b 7 | 8 | 9 | @support_numpy 10 | def focal_loss(input, target, alpha = 1., gamma = 2., reduction = 'mean'): 11 | """focal loss 12 | 13 | Args: 14 | input (Tensor): N x C, C is the number of classes 15 | target (Tensor): N, each value is the index of classes 16 | alpha (Variable): balanced variant of focal loss, range is in [0, 1] 17 | gamma (float): focal loss parameter 18 | reduction (str): `mean`, `sum`, `none`, how to reduce the loss over classes 19 | """ 20 | import numpy as np 21 | import torch 22 | import torch.nn.functional as F 23 | 24 | prob = torch.sigmoid(input) 25 | weight = torch.pow(1. - prob, gamma) 26 | focal = -alpha * weight * torch.log(prob) 27 | loss = F.nll_loss(focal, target, reduction = reduction) 28 | 29 | return loss 30 | 31 | 32 | @support_numpy 33 | def binary_focal_loss(input, target, **kwargs): 34 | """binary focal loss 35 | """ 36 | # convert 1d tensor to 2d 37 | if input.ndim == 1: 38 | import torch 39 | input = input.view(-1, 1) 40 | input = torch.hstack([1 - input, input]) 41 | 42 | return focal_loss(input, target, **kwargs) 43 | 44 | 45 | def focal_loss_for_numpy(input, target, alpha = 1., gamma = 2., reduction = 'mean'): 46 | """focal loss for numpy array 47 | """ 48 | import numpy as np 49 | 50 | prob = 1 / (1 + np.exp(-input)) 51 | weight = np.power(1. - prob, gamma) 52 | focal = -alpha * weight * np.log(prob) 53 | loss = -focal[np.arange(len(focal)), target] 54 | 55 | if reduction == 'mean': 56 | loss = loss.mean() 57 | elif reduction == 'sum': 58 | loss = loss.sum() 59 | elif reduction == 'none': 60 | pass 61 | 62 | return loss 63 | 64 | 65 | def label_smoothing(labels, smoothing = 0.1): 66 | """label smoothing 67 | """ 68 | assert len(labels.shape) == 2, "labels must be 2 dim where shape should be (N, C)" 69 | 70 | return (1.
- smoothing) * labels + smoothing / labels.shape[1] 71 | -------------------------------------------------------------------------------- /toad/nn/functional_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | from torch.utils.data import Dataset, DataLoader 7 | 8 | from .functional import focal_loss, binary_focal_loss 9 | 10 | DATASET_SIZE = 20000 11 | NUM_CLASSES = 4 12 | 13 | 14 | @pytest.fixture(autouse=True) 15 | def seed(): 16 | torch.manual_seed(0) 17 | yield 18 | 19 | 20 | def test_focal_loss(seed): 21 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float) 22 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long) 23 | loss = focal_loss(y_pred, y) 24 | assert loss.item() == pytest.approx(-0.07764504849910736, 1e-6) 25 | 26 | 27 | def test_loss_with_grad(seed): 28 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float, requires_grad=True) 29 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long) 30 | loss = focal_loss(y_pred, y) 31 | loss.backward() 32 | assert y_pred.grad is not None 33 | 34 | 35 | def test_binary_focal_loss(seed): 36 | y_pred = torch.rand(DATASET_SIZE, dtype=torch.float) 37 | y = torch.randint(2, size=(DATASET_SIZE,), dtype=torch.long) 38 | loss = binary_focal_loss(y_pred, y) 39 | assert loss.item() == pytest.approx(-0.07776755839586258, 1e-6) 40 | 41 | 42 | def test_numpy_support_focal_loss(seed): 43 | y_pred = torch.rand(DATASET_SIZE, NUM_CLASSES, dtype=torch.float).numpy() 44 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long).numpy() 45 | loss = focal_loss(y_pred, y) 46 | assert loss.item() == pytest.approx(-0.07764504849910736, 1e-6) 47 | 48 | 49 | def test_binary_focal_loss_for_xgb(seed): 50 | from toad.utils.decorator import xgb_loss 51 | 52 | y_pred = torch.rand(DATASET_SIZE, dtype=torch.float).numpy() 53 | y = torch.randint(2, size=(DATASET_SIZE,), dtype=torch.long).numpy() 54 | loss_func = xgb_loss(gamma=5.0, alpha=0.5)(binary_focal_loss) 55 | grad, hess = loss_func(y_pred, y) 56 | 57 | assert grad == pytest.approx(-0.00023283064365386963) 58 | assert hess == pytest.approx(465.66128730773926) 59 | 60 | 61 | # TODO 62 | # focal loss sum/none 63 | # label_smoothing 64 | -------------------------------------------------------------------------------- /toad/nn/loss.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import torch 4 | from torch.nn import Module 5 | 6 | from .functional import focal_loss 7 | 8 | 9 | class FocalLoss(Module): 10 | def __init__(self, alpha = 1., gamma = 2., reduction = 'mean'): 11 | super(FocalLoss, self).__init__() 12 | 13 | self.alpha = alpha 14 | self.gamma = gamma 15 | self.reduction = reduction 16 | 17 | def forward(self, input, target): 18 | return focal_loss( 19 | input, 20 | target, 21 | alpha = self.alpha, 22 | gamma = self.gamma, 23 | reduction = self.reduction, 24 | ) 25 | 26 | 27 | class DictLoss(Module): 28 | def __init__(self, torch_loss, weights: Dict[str, float] = None): 29 | super(DictLoss, self).__init__() 30 | self.torch_loss = torch_loss 31 | self.weights = weights or {} 32 | 33 | def forward(self, input: Dict[str, torch.Tensor], target: Dict[str, torch.Tensor]): 34 | loss = 0 35 | weight_sum = 0 36 | for key, _target in target.items(): 37 | if key not in input: 38 | continue 39 | weight = self.weights.get(key, 1) 40 
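# NaN entries in `_target` mark samples that have no label for this key;
# the mask below drops them so they contribute nothing to the loss
# (ListLoss below uses the same trick).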
| mask = torch.bitwise_not(torch.isnan(_target)) 41 | _target = _target.to(input[key].device) 42 | loss += weight * self.torch_loss(input[key][mask], _target[mask]) 43 | weight_sum += weight 44 | 45 | return loss / weight_sum 46 | 47 | 48 | class ListLoss(Module): 49 | def __init__(self, torch_loss, weights: List[float] = None): 50 | super(ListLoss, self).__init__() 51 | self.torch_loss = torch_loss 52 | self.weights = weights 53 | 54 | def forward(self, input: List[torch.Tensor], target: List[torch.Tensor]): 55 | loss = 0 56 | weight_sum = 0 57 | for i, (_input, _target) in enumerate(zip(input, target)): 58 | if self.weights: 59 | weight = self.weights[i] 60 | else: 61 | weight = 1 62 | _target = _target.to(_input.device) 63 | mask = torch.bitwise_not(torch.isnan(_target)) 64 | loss += weight * self.torch_loss(_input[mask], _target[mask]) 65 | weight_sum += weight 66 | 67 | return loss / weight_sum 68 | -------------------------------------------------------------------------------- /toad/nn/loss_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.utils.data import Dataset, DataLoader 5 | 6 | from .module import Module 7 | from .loss import DictLoss, ListLoss 8 | 9 | DATASET_SIZE = 20000 10 | NUM_FEATS = 784 11 | NUM_CLASSES = 2 12 | 13 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype=torch.float) 14 | y = torch.randint(NUM_CLASSES, size=(DATASET_SIZE,), dtype=torch.long) 15 | 16 | 17 | class DictDataset(Dataset): 18 | def __init__(self, x, y): 19 | super().__init__() 20 | self.x = x 21 | self.y = y 22 | 23 | def __len__(self): 24 | return self.x.shape[0] 25 | 26 | def __getitem__(self, item): 27 | return self.x[item], {'y': self.y[item]} 28 | 29 | 30 | class ListDataset(Dataset): 31 | def __init__(self, x, y): 32 | super().__init__() 33 | self.x = x 34 | self.y = y 35 | 36 | def __len__(self): 37 | return self.x.shape[0] 38 | 39 | def __getitem__(self, item): 40 | return self.x[item], [self.y[item]] 41 | 42 | 43 | class TestDictModel(Module): 44 | def __init__(self, in_feats, out_feats): 45 | super().__init__() 46 | 47 | self.linear = nn.Linear(in_feats, out_feats) 48 | 49 | def forward(self, x): 50 | x = self.linear(x) 51 | return {'y': F.relu(x)} 52 | 53 | def fit_step(self, batch, loss=None): 54 | x, y = batch 55 | y_hat = self(x) 56 | return loss(y_hat, y) 57 | 58 | 59 | class TestListModel(Module): 60 | def __init__(self, in_feats, out_feats): 61 | super().__init__() 62 | 63 | self.linear = nn.Linear(in_feats, out_feats) 64 | 65 | def forward(self, x): 66 | x = self.linear(x) 67 | return [F.relu(x)] 68 | 69 | def fit_step(self, batch, loss=None): 70 | x, y = batch 71 | y_hat = self(x) 72 | return loss(y_hat, y) 73 | 74 | 75 | def test_dict_loss(): 76 | model = TestDictModel(NUM_FEATS, NUM_CLASSES) 77 | loader = DataLoader( 78 | DictDataset(X, y), 79 | batch_size=128, 80 | shuffle=True, 81 | ) 82 | model.fit(loader, epoch=1, loss=DictLoss(F.cross_entropy)) 83 | 84 | 85 | def test_list_loss(): 86 | model = TestListModel(NUM_FEATS, NUM_CLASSES) 87 | loader = DataLoader( 88 | ListDataset(X, y), 89 | batch_size=128, 90 | shuffle=True, 91 | ) 92 | model.fit(loader, epoch=1, loss=ListLoss(F.cross_entropy)) 93 | -------------------------------------------------------------------------------- /toad/nn/module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch import nn, optim 4 
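# --- Example: a minimal usage sketch for DictLoss from loss.py above
# (toy tensors; any torch loss with the `loss(input, target)` shape works,
# and the weights dict here is an arbitrary choice).
import torch
import torch.nn.functional as F
from toad.nn.loss import DictLoss

loss_fn = DictLoss(F.mse_loss, weights = {'y': 1.0})
pred = {'y': torch.rand(8)}
target = {'y': torch.rand(8)}
print(loss_fn(pred, target))  # single weighted scalar loss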
| from torch.nn.parallel import DistributedDataParallel 5 | 6 | from .trainer.history import get_current_history 7 | from ..utils.progress import Progress 8 | 9 | 10 | 11 | class Module(nn.Module): 12 | """base module for every model 13 | 14 | Examples: 15 | >>> from toad.nn import Module 16 | ... from torch import nn 17 | ... 18 | ... class Net(Module): 19 | ... def __init__(self, inputs, hidden, outputs): 20 | ... super().__init__() 21 | ... self.model = nn.Sequential( 22 | ... nn.Linear(inputs, hidden), 23 | ... nn.ReLU(), 24 | ... nn.Linear(hidden, outputs), 25 | ... nn.Sigmoid(), 26 | ... ) 27 | ... 28 | ... def forward(self, x): 29 | ... return self.model(x) 30 | ... 31 | ... def fit_step(self, batch): 32 | ... x, y = batch 33 | ... y_hat = self(x) 34 | ... 35 | ... # log into history 36 | ... self.log('y', y) 37 | ... self.log('y_hat', y_hat) 38 | ... 39 | ... return nn.functional.mse_loss(y_hat, y) 40 | ... 41 | ... model = Net(10, 4, 1) 42 | ... 43 | ... model.fit(train_loader) 44 | 45 | """ 46 | def __init__(self): 47 | """define model structure 48 | """ 49 | super().__init__() 50 | 51 | 52 | @property 53 | def device(self): 54 | """device of model 55 | """ 56 | return next(self.parameters()).device 57 | 58 | 59 | def fit(self, loader, trainer = None, optimizer = None, loss = None, early_stopping = None, **kwargs): 60 | """train model 61 | 62 | Args: 63 | loader (DataLoader): loader for training model 64 | trainer (Trainer): trainer for training model 65 | optimizer (torch.Optimizer): the default optimizer is `Adam(lr = 1e-3)` 66 | loss (Callable): could be called as 'loss(y_hat, y)' 67 | early_stopping (earlystopping): the default value is `loss_stopping`, 68 | you can set it to `False` to disable early stopping 69 | epoch (int): number of epochs for the training loop 70 | callback (callable): callable function will be called every epoch 71 | """ 72 | if trainer is None: 73 | from .trainer import Trainer 74 | trainer = Trainer(self, loader, optimizer = optimizer, loss = loss, early_stopping = early_stopping) 75 | 76 | trainer.train(**kwargs) 77 | 78 | 79 | def evaluate(self, loader, trainer = None): 80 | """evaluate model 81 | 82 | Args: 83 | loader (DataLoader): loader for evaluating model 84 | trainer (Trainer): trainer for evaluating model 85 | """ 86 | if trainer is None: 87 | from .trainer import Trainer 88 | trainer = Trainer(self) 89 | 90 | return trainer.evaluate(loader) 91 | 92 | 93 | 94 | def fit_step(self, batch, loss = None, *args, **kwargs): 95 | """step for fitting 96 | 97 | Args: 98 | batch (Any): batch data from dataloader 99 | loss (Callable): could be called as 'loss(y_hat, y)' 100 | 101 | Returns: 102 | Tensor: loss of this step 103 | """ 104 | x, y = batch 105 | y_hat = self.__call__(x) 106 | if loss is None: 107 | loss = nn.functional.mse_loss 108 | return loss(y_hat, y) 109 | 110 | 111 | def save(self, path): 112 | """save model 113 | """ 114 | torch.save(self.state_dict(), path) 115 | 116 | 117 | def load(self, path): 118 | """load model 119 | """ 120 | state = torch.load(path) 121 | self.load_state_dict(state) 122 | 123 | 124 | def log(self, key, value): 125 | """log values to history 126 | 127 | Args: 128 | key (str): name of message 129 | value (Tensor): tensor of values 130 | """ 131 | history = get_current_history() 132 | if history is None: 133 | return 134 | 135 | return history.log(key, value) 136 | 137 | 138 | def distributed(self, backend = None, **kwargs): 139 | """get distributed model 140 | """ 141 | if not torch.distributed.is_initialized(): 142 | if
backend is None: 143 | # choose a backend 144 | backend = 'nccl' if torch.distributed.is_nccl_available() else 'gloo' 145 | 146 | torch.distributed.init_process_group(backend, **kwargs) 147 | 148 | return DistModule(self) 149 | 150 | 151 | 152 | class DistModule(DistributedDataParallel): 153 | """distributed module class 154 | """ 155 | def fit(self, *args, **kwargs): 156 | return self.module.fit(*args, **kwargs) 157 | 158 | def save(self, *args, **kwargs): 159 | return self.module.save(*args, **kwargs) 160 | 161 | def load(self, *args, **kwargs): 162 | return self.module.load(*args, **kwargs) 163 | 164 | def log(self, *args, **kwargs): 165 | return self.module.log(*args, **kwargs) 166 | 167 | 168 | -------------------------------------------------------------------------------- /toad/nn/module_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.utils.data import TensorDataset, DataLoader 5 | 6 | from .module import Module 7 | 8 | DATASET_SIZE = 20000 9 | NUM_FEATS = 784 10 | NUM_CLASSES = 2 11 | 12 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype = torch.float) 13 | y = torch.randint(NUM_CLASSES, size = (DATASET_SIZE,), dtype = torch.long) 14 | 15 | loader = DataLoader( 16 | TensorDataset(X, y), 17 | batch_size = 128, 18 | shuffle = True, 19 | ) 20 | 21 | class TestModel(Module): 22 | def __init__(self, in_feats, out_feats): 23 | super().__init__() 24 | 25 | self.linear = nn.Linear(in_feats, out_feats) 26 | 27 | def forward(self, x): 28 | x = self.linear(x) 29 | return F.relu(x) 30 | 31 | def fit_step(self, batch): 32 | x, y = batch 33 | y_hat = self(x) 34 | return F.cross_entropy(y_hat, y) 35 | 36 | def test_model(): 37 | model = TestModel(NUM_FEATS, NUM_CLASSES) 38 | model.fit(loader, epoch = 1) 39 | 40 | 41 | def test_fit_callback(): 42 | h_list = [] 43 | 44 | def func(history, epoch): 45 | h_list.append(history) 46 | 47 | model = TestModel(NUM_FEATS, NUM_CLASSES) 48 | model.fit(loader, epoch = 2, callback = func) 49 | assert len(h_list) == 2 50 | 51 | 52 | class TestModel2(TestModel): 53 | def fit_step(self, batch, loss=None): 54 | x, y = batch 55 | y_hat = self(x) 56 | return loss(y_hat, y) 57 | 58 | 59 | def test_model_loss(): 60 | model = TestModel2(NUM_FEATS, NUM_CLASSES) 61 | model.fit(loader, epoch=1, loss=F.cross_entropy) 62 | -------------------------------------------------------------------------------- /toad/nn/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .history import History, get_current_history 2 | from .callback import callback 3 | from .earlystop import earlystopping 4 | from .trainer import Trainer 5 | 6 | __all__ = [ 7 | 'History', 8 | 'get_current_history', 9 | 'callback', 10 | 'earlystopping', 11 | 'Trainer', 12 | ] 13 | -------------------------------------------------------------------------------- /toad/nn/trainer/callback.py: -------------------------------------------------------------------------------- 1 | from ...utils.decorator import Decorator 2 | 3 | class callback(Decorator): 4 | """callback for trainer 5 | 6 | Examples: 7 | >>> @callback 8 | ... def savemodel(model): 9 | ... model.save("path_to_file") 10 | ... 11 | ... 
trainer.train(model, callback = savemodel) 12 | 13 | """ 14 | def __init__(self, *args, **kwargs): 15 | if hasattr(self, 'wrapped'): 16 | # use `wrapped` func as core func 17 | super().__init__(getattr(self, 'wrapped')) 18 | # setup configuration 19 | self.setup(*args, **kwargs) 20 | return 21 | 22 | # init normal decorator 23 | super().__init__(*args, **kwargs) 24 | 25 | 26 | def setup_func(self, func): 27 | import inspect 28 | self._params = inspect.signature(func).parameters 29 | 30 | return func 31 | 32 | 33 | def wrapper(self, **kwargs): 34 | params = {k: v for k, v in kwargs.items() if k in self._params.keys()} 35 | 36 | return self.call(**params) 37 | 38 | 39 | 40 | class checkpoint(callback): 41 | """ 42 | Args: 43 | dirpath (string): dir name for saving checkpoints 44 | every (int): save every n epochs 45 | filename (string): checkpoint file name format 46 | """ 47 | dirpath = "model_checkpoints" 48 | every = 1 49 | filename = "{name}-{epoch}.pt" 50 | 51 | 52 | def wrapper(self, **kwargs): 53 | model = kwargs.get("model") 54 | epoch = kwargs.get("epoch") 55 | 56 | name = type(model).__name__ 57 | 58 | from pathlib import Path 59 | dirpath = Path(self.dirpath) 60 | dirpath.mkdir(parents = True, exist_ok = True) 61 | 62 | filename = self.filename.format( 63 | name = name, 64 | epoch = epoch, 65 | ) 66 | 67 | path = dirpath / filename 68 | 69 | if epoch % self.every == 0: 70 | super().wrapper( 71 | path = path, 72 | **kwargs 73 | ) 74 | 75 | 76 | class savemodel(checkpoint): 77 | """ 78 | Args: 79 | dirpath (string): dir name for saving checkpoints 80 | every (int): save every n epochs 81 | filename (string): checkpoint file name format, default is `{name}-{epoch}.pt` 82 | """ 83 | def wrapped(self, model, path): 84 | import torch 85 | torch.save(model.state_dict(), path) 86 | -------------------------------------------------------------------------------- /toad/nn/trainer/callback_test.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .callback import callback, savemodel 4 | from ..module import Module 5 | 6 | class TestModel(Module): 7 | def __init__(self, in_feats, out_feats): 8 | super().__init__() 9 | 10 | self.linear = nn.Linear(in_feats, out_feats) 11 | 12 | 13 | def test_callback(): 14 | @callback 15 | def hook(history, trainer): 16 | return history['a'] 17 | 18 | res = hook(epoch = 1, trainer = None, history = {"a": 3}) 19 | 20 | assert res == 3 21 | 22 | def test_checkpoint(): 23 | model = TestModel(10, 2) 24 | hook = savemodel(dirpath = '/dev', filename = "null") 25 | hook(model = model, epoch = 1) 26 | -------------------------------------------------------------------------------- /toad/nn/trainer/earlystop.py: -------------------------------------------------------------------------------- 1 | from .callback import callback 2 | from ...utils.decorator import Decorator 3 | 4 | 5 | class earlystopping(callback): 6 | """ 7 | Examples: 8 | >>> @earlystopping(delta = 1e-3, patience = 5) 9 | ... def auc(history): 10 | ...
return AUC(history['y_hat'], history['y']) 11 | """ 12 | delta = -1e-3 13 | patience = 10 14 | skip = 0 15 | 16 | def setup(self, delta = -1e-3, patience = 10, skip = 0): 17 | """ 18 | Args: 19 | delta (float): stop training if diff of new score is smaller than delta 20 | patience (int): patience of rounds to stop training 21 | skip (int): n rounds from starting training to warm up 22 | """ 23 | self.direction = 1.0 if delta > 0 else -1.0 24 | self.delta = delta * self.direction 25 | self.patience = patience 26 | self.skip = skip 27 | 28 | self.reset() 29 | 30 | 31 | def get_best_state(self): 32 | """get best state of model 33 | """ 34 | return self.best_state 35 | 36 | 37 | def reset(self): 38 | """ 39 | """ 40 | self.best_score = float('inf') * (-self.direction) 41 | self.best_state = None 42 | self._times = 0 43 | 44 | 45 | def wrapper(self, model, trainer = None, epoch = 0, **kwargs): 46 | # set skip round 47 | if epoch < self.skip: 48 | return False 49 | 50 | score = super().wrapper(model = model, epoch = epoch, **kwargs) 51 | diff = (score - self.best_score) * self.direction 52 | 53 | if diff > self.delta: 54 | self.best_state = model.state_dict() 55 | self.best_score = score 56 | self._times = 0 57 | return False 58 | 59 | self._times += 1 60 | if self._times >= self.patience: 61 | model.load_state_dict(self.best_state) 62 | 63 | if trainer: 64 | trainer.terminate() 65 | 66 | return True 67 | 68 | 69 | class loss_stopping(earlystopping): 70 | """scoring function 71 | """ 72 | def wrapped(self, history): 73 | return history['loss'].mean() 74 | -------------------------------------------------------------------------------- /toad/nn/trainer/earlystop_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .earlystop import earlystopping 3 | 4 | 5 | 6 | def test_earlystopping(): 7 | model = torch.nn.Linear(10, 10) 8 | 9 | @earlystopping(delta = -1, patience = 3) 10 | def scoring(history): 11 | return history['loss'] 12 | 13 | rounds = [] 14 | for i in range(10): 15 | if scoring(model = model, history = {"loss": 1}): 16 | break 17 | 18 | rounds.append(i) 19 | 20 | assert len(rounds) == 3 21 | 22 | 23 | def test_best_state(): 24 | model = torch.nn.Linear(10, 1) 25 | 26 | @earlystopping(delta = -1, patience = 1) 27 | def scoring(history): 28 | return history['loss'] 29 | 30 | with torch.no_grad(): 31 | model.weight.fill_(1.) 32 | 33 | # save init weight 34 | scoring(model = model, history = {"loss": 10}) 35 | assert scoring.best_state["weight"].sum().item() == 10 36 | 37 | # change weight 38 | with torch.no_grad(): 39 | model.weight.fill_(0.) 
40 | 41 | # save best weight 42 | scoring(model = model, history = {"loss": 5}) 43 | assert scoring.best_state["weight"].sum().item() == 0 44 | 45 | -------------------------------------------------------------------------------- /toad/nn/trainer/event.py: -------------------------------------------------------------------------------- 1 | from .callback import callback as Callback 2 | 3 | 4 | class Event: 5 | def __init__(self): 6 | self._events = {} 7 | 8 | def register(self, event, handler, every = 1): 9 | """register an event handler 10 | """ 11 | if not isinstance(handler, Callback): 12 | handler = Callback(handler) 13 | 14 | if event not in self._events: 15 | self._events[event] = [] 16 | 17 | handler._event_count = 0 18 | handler._event_every = every 19 | 20 | self._events[event].append(handler) 21 | 22 | 23 | def on(self, event, **kwargs): 24 | def wrapper(handler): 25 | self.register(event, handler, **kwargs) 26 | return handler 27 | 28 | return wrapper 29 | 30 | 31 | def emit(self, event, *args, **kwargs): 32 | """emit event 33 | """ 34 | if event not in self._events: 35 | return 36 | 37 | # trigger handler 38 | for handler in self._events[event]: 39 | # increase count 40 | handler._event_count += 1 41 | 42 | # trigger event 43 | if handler._event_count % handler._event_every == 0: 44 | handler(*args, **kwargs) 45 | 46 | 47 | def mute(self, event): 48 | """remove all handlers of an event 49 | """ 50 | if event in self._events: 51 | self._events.pop(event) 52 | -------------------------------------------------------------------------------- /toad/nn/trainer/event_test.py: -------------------------------------------------------------------------------- 1 | from .event import Event 2 | 3 | 4 | def test_event_trigger(): 5 | e = Event() 6 | 7 | counts = 0 8 | 9 | @e.on("test:trigger") 10 | def func(): 11 | nonlocal counts 12 | counts += 1 13 | 14 | e.emit("test:trigger") 15 | 16 | assert counts == 1 17 | 18 | 19 | def test_event_trigger_every(): 20 | e = Event() 21 | 22 | counts = 0 23 | 24 | @e.on("test:trigger", every = 2) 25 | def func(): 26 | nonlocal counts 27 | counts += 1 28 | 29 | for i in range(10): 30 | e.emit("test:trigger") 31 | 32 | assert counts == 5 33 |
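# --- Example: a minimal sketch of registering a handler directly with
# register() instead of the on() decorator; `epoch:end` is a made-up event
# name. emit() passes kwargs through, and the callback wrapper above filters
# them down to the handler's signature.
from toad.nn.trainer.event import Event

e = Event()

def on_epoch_end(epoch):
    print('epoch', epoch, 'done')

e.register('epoch:end', on_epoch_end, every = 2)

for i in range(4):
    e.emit('epoch:end', epoch = i, model = None)  # extra kwargs are dropped
# prints only for i = 1 and i = 3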
np.isscalar(value): 62 | value = np.array([value]) 63 | 64 | if not isinstance(value, np.ndarray): 65 | raise TypeError("value should be `torch.Tensor` or `scalar`") 66 | 67 | self._push(key, value) 68 | 69 | 70 | def start(self): 71 | global _history_stack 72 | _history_stack.append(self) 73 | 74 | return self 75 | 76 | 77 | def end(self): 78 | global _history_stack 79 | return _history_stack.pop() 80 | 81 | 82 | def __enter__(self): 83 | return self.start() 84 | 85 | 86 | def __exit__(self, exc_type, exc_val, exc_tb): 87 | return self.end() 88 | -------------------------------------------------------------------------------- /toad/nn/trainer/history_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from .history import History, get_current_history 4 | 5 | def test_history_log(): 6 | history = History() 7 | 8 | for i in range(10): 9 | history.log('tensor', torch.rand(3, 5)) 10 | 11 | assert history['tensor'].shape == (30, 5) 12 | 13 | 14 | def test_current_history(): 15 | history = History() 16 | 17 | with history: 18 | h = get_current_history() 19 | h.log('tensor', torch.rand(3, 5)) 20 | 21 | assert history['tensor'].shape == (3, 5) 22 | -------------------------------------------------------------------------------- /toad/nn/trainer/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/toad/nn/trainer/metrics.py -------------------------------------------------------------------------------- /toad/nn/trainer/trainer_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | from torch.utils.data import TensorDataset, DataLoader 7 | 8 | from .history import History 9 | from ..module import Module 10 | from .trainer import Trainer 11 | from .callback import callback 12 | from .earlystop import earlystopping 13 | 14 | 15 | DATASET_SIZE = 20000 16 | NUM_FEATS = 784 17 | NUM_CLASSES = 2 18 | 19 | X = torch.rand(DATASET_SIZE, NUM_FEATS, dtype = torch.float) 20 | y = torch.randint(NUM_CLASSES, size = (DATASET_SIZE,), dtype = torch.long) 21 | 22 | loader = DataLoader( 23 | TensorDataset(X, y), 24 | batch_size = 128, 25 | shuffle = True, 26 | ) 27 | 28 | class TestModel(Module): 29 | def __init__(self, in_feats, out_feats): 30 | super().__init__() 31 | 32 | self.linear = nn.Linear(in_feats, out_feats) 33 | 34 | def forward(self, x): 35 | x = self.linear(x) 36 | return F.relu(x) 37 | 38 | def fit_step(self, batch): 39 | x, y = batch 40 | y_hat = self(x) 41 | return F.cross_entropy(y_hat, y) 42 | 43 | 44 | def test_trainer(): 45 | model = TestModel(NUM_FEATS, NUM_CLASSES) 46 | trainer = Trainer(model, loader) 47 | trainer.train(epoch = 2) 48 | assert len(trainer.history) == 2 49 | 50 | 51 | def test_trainer_early_stopping(): 52 | model = TestModel(NUM_FEATS, NUM_CLASSES) 53 | 54 | @earlystopping(delta = -1.0, patience = 3) 55 | def scoring(history): 56 | return history['loss'].mean() 57 | 58 | trainer = Trainer(model, loader, early_stopping = scoring) 59 | trainer.train(epoch = 200) 60 | assert len(trainer.history) == 4 61 | 62 | 63 | def test_trainer_fit_step(): 64 | model = TestModel(NUM_FEATS, NUM_CLASSES) 65 | trainer = Trainer(model, loader) 66 | step_count = 0 67 | 68 | @trainer.fit_step 69 | def step(model, batch): 70 | x, y = batch 71 | y_hat = 
model(x) 72 | nonlocal step_count 73 | step_count += 1 74 | return F.cross_entropy(y_hat, y) 75 | 76 | trainer.train(epoch = 2) 77 | assert step_count > 1 78 | 79 | 80 | def test_multi_callbacks(): 81 | log = {} 82 | 83 | @callback 84 | def log_epoch(epoch): 85 | log['epoch'] = epoch 86 | 87 | @callback 88 | def log_loss(history): 89 | log['loss'] = history['loss'] 90 | 91 | model = TestModel(NUM_FEATS, NUM_CLASSES) 92 | trainer = Trainer(model) 93 | trainer.train(loader, epoch = 2, callback = [log_epoch, log_loss]) 94 | 95 | assert log['epoch'] == 1 96 | assert len(log['loss']) == 157 97 | 98 | 99 | def test_trainer_evaluate(): 100 | model = TestModel(NUM_FEATS, NUM_CLASSES) 101 | trainer = Trainer(model, loader) 102 | 103 | @trainer.fit_step 104 | def step(model, batch): 105 | x, y = batch 106 | y_hat = model(x) 107 | return F.cross_entropy(y_hat, y) 108 | 109 | history = trainer.evaluate(loader) 110 | 111 | assert len(history["loss"]) == 157 112 | 113 | 114 | 115 | class TestModel2(TestModel): 116 | def fit_step(self, batch, loss=None): 117 | x, y = batch 118 | y_hat = self(x) 119 | return loss(y_hat, y) 120 | 121 | 122 | def test_trainer_loss(): 123 | model = TestModel2(NUM_FEATS, NUM_CLASSES) 124 | trainer = Trainer(model, loader, loss = F.cross_entropy) 125 | trainer.train(epoch = 2) 126 | assert len(trainer.history) == 2 127 | 128 | 129 | # def test_trainer_distributed(): 130 | # model = TestModel(NUM_FEATS, NUM_CLASSES) 131 | # trainer = Trainer(model, loader) 132 | # trainer.distributed(workers = 2) 133 | # trainer.train(epoch = 5) 134 | 135 | 136 | 137 | ### distributed model test 138 | # from toad.nn.trainer.trainer import Trainer 139 | # from torchvision.transforms import ToTensor 140 | # import torch 141 | # from torch import nn 142 | # from torchvision import datasets 143 | # from toad.nn import Module 144 | # from torch.utils.data import DataLoader 145 | # import ray 146 | 147 | # class NeuralNetwork(Module): 148 | # def __init__(self): 149 | # super(NeuralNetwork, self).__init__() 150 | # self.flatten = nn.Flatten() 151 | # self.linear_relu_stack = nn.Sequential( 152 | # nn.Linear(28 * 28, 512), 153 | # nn.ReLU(), 154 | # nn.Linear(512, 512), 155 | # nn.ReLU(), 156 | # nn.Linear(512, 10), 157 | # nn.ReLU(), 158 | # ) 159 | # def forward(self, x): 160 | # x = self.flatten(x) 161 | # logits = self.linear_relu_stack(x) 162 | # return logits 163 | # def fit_step(self, batch): 164 | # X, y = batch 165 | # pred = self(X) 166 | # loss_fn=nn.CrossEntropyLoss() 167 | # return loss_fn(pred, y) 168 | 169 | 170 | # @pytest.mark.skip("distributed trainer skip") 171 | # def test_distribute_example(): 172 | # training_data = datasets.FashionMNIST( 173 | # root="~/data", 174 | # train=True, 175 | # download=True, 176 | # transform=ToTensor(), 177 | # ) 178 | # # Download test data from open datasets. 179 | # test_data = datasets.FashionMNIST( 180 | # root="~/data", 181 | # train=False, 182 | # download=True, 183 | # transform=ToTensor(), 184 | # ) 185 | # worker_batch_size = 64 // 4 186 | # # Create data loaders. 
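# (illustrative note, not in the original: `worker_batch_size` above assumes the global batch of 64 is split across 4 distributed workers, 64 // 4 = 16, matching the `batch_size=16` used below)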
 187 | # train_dataloader = DataLoader(training_data, batch_size=16) 188 | # test_dataloader = DataLoader(test_data, batch_size=16) 189 | # model=NeuralNetwork() 190 | # optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 191 | # trainer=Trainer(model,train_dataloader,optimizer) 192 | # trainer.distributed(address="ray://172.20.144.21:10001",num_works=4,use_gpu=False) 193 | # trainer.train(epoch=1) 194 | # trainer.evaluate(test_dataloader) 195 | -------------------------------------------------------------------------------- /toad/nn/zoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoencoder import BaseAutoEncoder, VAE 2 | -------------------------------------------------------------------------------- /toad/nn/zoo/autoencoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn.functional import relu, binary_cross_entropy 4 | 5 | from ..module import Module 6 | 7 | 8 | 9 | class BaseAutoEncoder(Module): 10 | def __init__(self, input, hidden, zipped): 11 | super().__init__() 12 | 13 | self.encoder = nn.Sequential( 14 | nn.Linear(input, hidden), 15 | nn.ReLU(), 16 | nn.Linear(hidden, zipped), 17 | ) 18 | 19 | self.decoder = nn.Sequential( 20 | nn.Linear(zipped, hidden), 21 | nn.ReLU(), 22 | nn.Linear(hidden, input), 23 | ) 24 | 25 | self.loss = nn.MSELoss() 26 | 27 | 28 | def encode(self, x): 29 | return self.encoder(x) 30 | 31 | def decode(self, x): 32 | return self.decoder(x) 33 | 34 | def forward(self, x): 35 | z = self.encode(x) 36 | return self.decode(z) 37 | 38 | def fit_step(self, x): 39 | return self.loss(self(x), x) 40 | 41 | 42 | 43 | class VAE(Module): 44 | def __init__(self, input, hidden, zipped): 45 | super().__init__() 46 | 47 | self.hidden_layer = nn.Linear(input, hidden) 48 | 49 | self.mu_layer = nn.Linear(hidden, zipped) 50 | self.var_layer = nn.Linear(hidden, zipped) 51 | 52 | self.decoder = nn.Sequential( 53 | nn.Linear(zipped, hidden), 54 | nn.ReLU(), 55 | nn.Linear(hidden, input), 56 | ) 57 | 58 | self.loss = nn.MSELoss() 59 | 60 | def encode(self, x): 61 | h = relu(self.hidden_layer(x)) 62 | mu = self.mu_layer(h) 63 | var = self.var_layer(h) # log-variance of the latent distribution 64 | 65 | std = torch.exp(var / 2) 66 | eps = torch.randn_like(std) # standard normal noise for the reparameterization trick 67 | 68 | z = mu + eps * std 69 | return z, mu, var 70 | 71 | def decode(self, x): 72 | return self.decoder(x) 73 | 74 | def forward(self, x): 75 | z, mu, var = self.encode(x) 76 | x_hat = self.decode(z) 77 | return x_hat, mu, var 78 | 79 | def fit_step(self, x): 80 | x_hat, mu, var = self(x) 81 | l = self.loss(x_hat, x) 82 | kld = -0.5 * torch.sum(1 + var - torch.pow(mu, 2) - torch.exp(var)) 83 | 84 | loss = l + kld 85 | return loss 86 | -------------------------------------------------------------------------------- /toad/nn/zoo/autoencoder_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import pytest 4 | import numpy as np 5 | from torch.utils.data import TensorDataset, DataLoader 6 | 7 | from .autoencoder import BaseAutoEncoder, VAE 8 | 9 | # skip testing with python 3.9 on linux 10 | if sys.version_info >= (3, 9) and sys.platform.startswith('linux'): 11 | pytest.skip("failed with python 3.9 on linux, need fix!", allow_module_level = True) 12 | 13 | 14 | X = torch.Tensor(np.random.rand(20000, 784)) 15 | 16 | loader = DataLoader( 17 | X, 18 | batch_size = 128, 19 | shuffle = True, 20 | ) 21 | 22 | def test_ae(): 23 | ae = 
BaseAutoEncoder(784, 200, 10) 24 | ae.fit(loader, epoch = 1) 25 | 26 | def test_vae(): 27 | vae = VAE(784, 200, 10) 28 | vae.fit(loader, epoch = 1) 29 | -------------------------------------------------------------------------------- /toad/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import roc_curve 5 | 6 | from .stats import IV, feature_bin_stats 7 | from .metrics import AUC 8 | from .tadpole import tadpole 9 | from .tadpole.utils import HEATMAP_CMAP, MAX_STYLE, add_annotate, add_text, reset_ylim 10 | from .utils import unpack_tuple, generate_str 11 | 12 | def badrate_plot(frame, x = None, target = 'target', by = None, 13 | freq = None, format = None, return_counts = False, 14 | return_proportion = False, return_frame = False): 15 | """plot for badrate 16 | 17 | Args: 18 | frame (DataFrame) 19 | x (str): column in frame that will be used as x axis 20 | target (str): target column in frame 21 | by (str): column in frame to group by when calculating badrate 22 | freq (str): offset aliases string by pandas 23 | http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases 24 | format (str): format string for time 25 | return_counts (bool): whether to return the counts plot 26 | return_proportion (bool): whether to return the proportion plot 27 | return_frame (bool): whether to return the grouping detail frame 28 | Returns: 29 | Axes: badrate plot 30 | Axes: counts plot 31 | Axes: proportion plot 32 | Dataframe: grouping detail data 33 | """ 34 | frame = frame.copy() 35 | markers = True 36 | 37 | if not isinstance(target, str): 38 | temp_name = generate_str() 39 | frame[temp_name] = target 40 | target = temp_name 41 | 42 | grouper = x 43 | if freq is not None: 44 | frame.loc[:, x] = pd.to_datetime(frame[x], format = format) 45 | grouper = pd.Grouper(key = x, freq = freq) 46 | 47 | if by is not None: 48 | grouper = [by, grouper] 49 | 50 | styles_count = frame[by].nunique() 51 | if styles_count > MAX_STYLE: 52 | markers = ['o'] * styles_count 53 | 54 | group = frame.groupby(grouper) 55 | table = group[target].agg(['sum', 'count']).reset_index() 56 | table['badrate'] = table['sum'] / table['count'] 57 | 58 | # set number dtype to object 59 | if np.issubdtype(table[x].dtype, np.number): 60 | table[x] = table[x].astype(str) 61 | 62 | 63 | rate_plot = tadpole.lineplot( 64 | x = x, 65 | y = 'badrate', 66 | hue = by, 67 | style = by, 68 | data = table, 69 | legend = 'full', 70 | markers = markers, 71 | dashes = False, 72 | ) 73 | 74 | # set y axis start with 0 75 | rate_plot.set_ylim(0, None) 76 | 77 | res = (rate_plot,) 78 | 79 | if return_counts: 80 | count_plot = tadpole.barplot( 81 | x = x, 82 | y = 'count', 83 | hue = by, 84 | data = table, 85 | ) 86 | res += (count_plot,) 87 | 88 | 89 | if return_proportion: 90 | table['prop'] = 0 91 | for v in table[x].unique(): 92 | mask = (table[x] == v) 93 | table.loc[mask, 'prop'] = table[mask]['count'] / table[mask]['count'].sum() 94 | 95 | prop_plot = tadpole.barplot( 96 | x = x, 97 | y = 'prop', 98 | hue = by, 99 | data = table, 100 | ) 101 | res += (prop_plot,) 102 | 103 | 104 | if return_frame: 105 | res += (table,) 106 | 107 | return unpack_tuple(res) 108 | 109 | 110 | def corr_plot(frame, figure_size = (20, 15), ax = None): 111 | """plot for correlation 112 | 113 | Args: 114 | frame (DataFrame): frame to draw plot 115 | Returns: 116 | Axes 117 | """ 118 | corr = frame.corr() 119 | 120 | mask = np.zeros_like(corr, dtype = bool) 121 | mask[np.triu_indices_from(mask)] = True 122 
| 123 | map_plot = tadpole.heatmap( 124 | corr, 125 | mask = mask, 126 | cmap = HEATMAP_CMAP, 127 | vmax = 1, 128 | vmin = -1, 129 | center = 0, 130 | square = True, 131 | cbar_kws = {"shrink": .5}, 132 | linewidths = .5, 133 | annot = True, 134 | fmt = '.2f', 135 | figure_size = figure_size, 136 | ax = ax, 137 | ) 138 | 139 | return map_plot 140 | 141 | 142 | def proportion_plot(x = None, keys = None, ax = None): 143 | """plot for comparing proportion in different dataset 144 | 145 | Args: 146 | x (Series|list): series or list of series data for plot 147 | keys (str|list): keys for each data 148 | 149 | Returns: 150 | Axes 151 | """ 152 | if not isinstance(x, list): 153 | x = [x] 154 | 155 | if keys is None: 156 | keys = [ 157 | x[ix].name 158 | if hasattr(x[ix], 'name') and x[ix].name is not None 159 | else ix 160 | for ix in range(len(x)) 161 | ] 162 | elif isinstance(keys, str): 163 | keys = [keys] 164 | 165 | x = map(pd.Series, x) 166 | data = pd.concat(x, keys = keys, names = ['keys']).reset_index() 167 | data = data.rename(columns = {data.columns[2]: 'value'}) 168 | 169 | prop_data = data.groupby('keys')['value'].value_counts( 170 | normalize = True, 171 | dropna = False, 172 | ).rename('proportion').reset_index() 173 | 174 | prop_plot = tadpole.barplot( 175 | x = 'value', 176 | y = 'proportion', 177 | hue = 'keys', 178 | data = prop_data, 179 | ax = ax, 180 | ) 181 | 182 | return prop_plot 183 | 184 | 185 | def roc_plot(score, target, compare = None, figsize = (14, 10), ax = None): 186 | """plot for roc 187 | 188 | Args: 189 | score (array-like): predicted score 190 | target (array-like): true target 191 | compare (array-like): another score for comparing with score 192 | 193 | Returns: 194 | Axes 195 | """ 196 | auc, fpr, tpr, thresholds = AUC(score, target, return_curve = True) 197 | 198 | if ax is None: 199 | fig, ax = plt.subplots(1, 1, figsize = figsize) 200 | 201 | ax.plot(fpr, tpr, label = 'ROC curve (area = %0.5f)' % auc) 202 | ax.fill_between(fpr, tpr, alpha = 0.3) 203 | if compare is not None: 204 | c_auc, c_fpr, c_tpr, _ = AUC(compare, target, return_curve = True) 205 | ax.plot(c_fpr, c_tpr, label = 'ROC compare (area = %0.5f)' % c_auc) 206 | ax.fill_between(c_fpr, c_tpr, alpha = 0.3) 207 | 208 | ax.plot([0, 1], [0, 1], color = 'red', linestyle = '--') 209 | plt.legend(loc = "lower right") 210 | 211 | return ax 212 | 213 | def ks_plot(score, target, figsize = (14, 10), ax = None): 214 | """plot for ks 215 | 216 | Args: 217 | score (array-like): predicted score 218 | target (array-like): true target 219 | figsize (tuple): size of the figure (width, height) 220 | 221 | Returns: 222 | Axes 223 | """ 224 | fpr, tpr, thresholds = roc_curve(target, score) 225 | 226 | if ax is None: 227 | fig, ax = plt.subplots(1, 1, figsize = figsize) 228 | 229 | ax.plot(thresholds[1 : ], tpr[1 : ], label = 'tpr') 230 | ax.plot(thresholds[1 : ], fpr[1 : ], label = 'fpr') 231 | ax.plot(thresholds[1 : ], (tpr - fpr)[1 : ], label = 'ks') 232 | 233 | ax.invert_xaxis() 234 | ax.legend() 235 | 236 | ks_value = max(tpr - fpr) 237 | x = np.argwhere(abs(fpr - tpr) == ks_value)[0, 0] 238 | thred_value = thresholds[x] 239 | ax.axvline(thred_value, color = 'r', linestyle = '--') 240 | plt.title(f'ks:{ks_value:.5f} threshold:{thred_value:.5f}') 241 | 242 | return ax 243 | 244 | def bin_plot(frame, x = None, target = 'target', iv = True, annotate_format = ".2f", 245 | return_frame = False, figsize = (12, 6), ax = None): 246 | """plot for bins 247 | 248 | Args: 249 | frame (DataFrame) 250 | x (str): 
column in frame that will be used as x axis 251 | target (str): target column in frame 252 | iv (bool): whether to show the IV value in the plot 253 | annotate_format (str): format string for the annotations of the chart 254 | return_frame (bool): whether to return the bin stats frame 255 | figsize (tuple): size of the figure (width, height) 256 | 257 | Returns: 258 | Axes: bin plot; DataFrame (when return_frame is True): contains good, bad, badrate, prop, y_prop, n_prop, woe, iv 259 | """ 260 | frame = frame.copy() 261 | 262 | if not isinstance(target, str): 263 | temp_name = generate_str() 264 | frame[temp_name] = target 265 | target = temp_name 266 | 267 | table = feature_bin_stats(frame, x, target) 268 | 269 | if ax is None: 270 | fig, ax = plt.subplots(figsize=figsize) 271 | 272 | ax = tadpole.barplot( 273 | x = x, 274 | y = 'prop', 275 | data = table, 276 | color = '#82C6E2', 277 | ax = ax, 278 | ) 279 | 280 | ax = add_annotate(ax, format = annotate_format) 281 | 282 | badrate_ax = ax.twinx() 283 | badrate_ax.grid(False) 284 | 285 | badrate_ax = tadpole.lineplot( 286 | x = x, 287 | y = 'badrate', 288 | data = table, 289 | color = '#D65F5F', 290 | ax = badrate_ax, 291 | ) 292 | 293 | badrate_ax.set_ylim([0, None]) 294 | badrate_ax = add_annotate(badrate_ax, format = annotate_format) 295 | 296 | if iv: 297 | ax = reset_ylim(ax) 298 | ax = add_text(ax, 'IV: {:.5f}'.format(table['iv'].sum())) 299 | 300 | res = (ax,) 301 | 302 | if return_frame: 303 | res += (table,) 304 | 305 | return unpack_tuple(res) 306 | -------------------------------------------------------------------------------- /toad/plot_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .plot import ( 6 | badrate_plot, 7 | corr_plot, 8 | proportion_plot, 9 | roc_plot, 10 | bin_plot, 11 | ) 12 | 13 | np.random.seed(1) 14 | 15 | LENGTH = 500 16 | 17 | A = np.random.rand(LENGTH) 18 | A[np.random.choice(LENGTH, 20, replace = False)] = np.nan 19 | 20 | B = np.random.randint(100, size = LENGTH) 21 | C = A + np.random.normal(0, 0.2, LENGTH) 22 | D = A + np.random.normal(0, 0.1, LENGTH) 23 | 24 | E = np.random.rand(LENGTH) 25 | E[np.random.choice(LENGTH, 480, replace = False)] = np.nan 26 | 27 | F = B + np.random.normal(0, 10, LENGTH) 28 | 29 | target = np.random.randint(2, size = LENGTH) 30 | 31 | frame = pd.DataFrame({ 32 | 'A': A, 33 | 'B': B, 34 | 'C': C, 35 | 'D': D, 36 | 'E': E, 37 | 'F': F, 38 | }) 39 | 40 | frame['target'] = target 41 | 42 | 43 | def test_badrate_plot(): 44 | g = badrate_plot( 45 | frame, 46 | x = 'A', 47 | target = 'target', 48 | return_counts = True, 49 | return_proportion = True, 50 | ) 51 | 52 | def test_badrate_plot_y_axis(): 53 | g = badrate_plot( 54 | frame, 55 | x = 'A', 56 | target = 'target', 57 | ) 58 | bottom, _ = g.get_ylim() 59 | assert bottom == 0 60 | 61 | def test_corr_plot(): 62 | g = corr_plot(frame) 63 | 64 | 65 | def test_proportion_plot(): 66 | g = proportion_plot(x = frame['target']) 67 | 68 | 69 | def test_roc_plot(): 70 | g = roc_plot(frame['B'], frame['target']) 71 | 72 | 73 | def test_bin_plot(): 74 | g = bin_plot(frame, x = 'B', target = 'target') 75 | 76 | 77 | def test_bin_plot_return_frame(): 78 | g, df = bin_plot(frame, x = 'B', target = 'target', return_frame = True) 79 | assert df.shape == (100, 10) 80 | -------------------------------------------------------------------------------- /toad/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .process import Processing, Mask, 
F 2 | from .partition import Partition, TimePartition -------------------------------------------------------------------------------- /toad/preprocessing/partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class Partition: 6 | def partition(self, data): 7 | """partition data 8 | 9 | Args: 10 | data (DataFrame): dataframe 11 | 12 | Yields: 13 | ndarray[bool]: boolean mask selecting the rows of the current partition 14 | str: suffix string for the current partition 15 | """ 16 | yield np.ones(len(data)).astype(bool), '' 17 | 18 | 19 | 20 | class TimePartition(Partition): 21 | """partition data by time delta 22 | 23 | Args: 24 | base (str): column name of base time 25 | filter (str): column name of the time to compare against base 26 | times (list): list of time deltas 27 | 28 | Example: 29 | 30 | >>> TimePartition('apply_time', 'query_time', ['30d', '90d', 'all']) 31 | 32 | """ 33 | def __init__(self, base, filter, times): 34 | self.base = base 35 | self.filter = filter 36 | self.times = times 37 | 38 | 39 | def partition(self, data): 40 | base = pd.to_datetime(data[self.base]) 41 | filter = pd.to_datetime(data[self.filter]) 42 | 43 | for t in self.times: 44 | if t != 'all': 45 | delta = pd.Timedelta(t) 46 | mask = filter > (base - delta) 47 | else: 48 | mask = np.ones(len(filter)).astype(bool) 49 | 50 | yield mask, '_' + t 51 | 52 | 53 | class ValuePartition(Partition): 54 | """partition data by column values 55 | 56 | Args: 57 | column (str): name of the column whose values define the partitions 58 | 59 | Example: 60 | 61 | >>> ValuePartition('status') 62 | 63 | """ 64 | def __init__(self, column): 65 | self.column = column 66 | 67 | 68 | def partition(self, data): 69 | data = data[self.column] 70 | unique = data.unique() 71 | 72 | for u in unique: 73 | if pd.isna(u): 74 | mask = data.isna() 75 | else: 76 | mask = (data == u) 77 | 78 | yield mask, '_' + str(u) -------------------------------------------------------------------------------- /toad/preprocessing/partition_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | from .partition import TimePartition, ValuePartition 7 | 8 | 9 | np.random.seed(1) 10 | 11 | ab = np.array(list('ABCDEFG')) 12 | 13 | history = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, 400, size = 500) 14 | open_time = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, size = 500) 15 | A = ab[np.random.choice(7, 500)] 16 | B = np.random.randint(10, size = 500).astype(float) 17 | B[np.random.choice(500, 10)] = np.nan 18 | 19 | 20 | df = pd.DataFrame({ 21 | 'history': history, 22 | 'open_time': open_time, 23 | 'A': A, 24 | 'B': B, 25 | }) 26 | 27 | 28 | def test_timepartition(): 29 | tp = TimePartition('open_time', 'history', ['90d', '180d']) 30 | mask, suffix = next(tp.partition(df)) 31 | assert mask.sum() == 93 32 | 33 | 34 | def test_timepartition_all(): 35 | tp = TimePartition('open_time', 'history', ['all']) 36 | mask, suffix = next(tp.partition(df)) 37 | assert mask.sum() == 500 38 | 39 | def test_valuepartition(): 40 | vp = ValuePartition('A') 41 | mask, suffix = next(vp.partition(df)) 42 | assert mask.sum() == 67 43 | 44 | def test_valuepartition_with_na(): 45 | vp = ValuePartition('B') 46 | s = 0 47 | for mask, suffix in vp.partition(df): 48 | s += mask.sum() 49 | 50 | assert s == 500 
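# --- illustrative sketch (not part of the original suite) ---
# A custom partition only needs to subclass `Partition` and yield
# `(mask, suffix)` pairs: `mask` is a boolean row selector and `suffix`
# is appended to the output column names by `Processing`.
# `EvenOddPartition` is a hypothetical example.
from .partition import Partition

class EvenOddPartition(Partition):
    def partition(self, data):
        # split rows by even/odd positional index
        mask = np.arange(len(data)) % 2 == 0
        yield mask, '_even'
        yield ~mask, '_odd'

def test_evenodd_partition_covers_all_rows():
    total = 0
    for mask, suffix in EvenOddPartition().partition(df):
        total += mask.sum()
    assert total == len(df)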
-------------------------------------------------------------------------------- /toad/preprocessing/process.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | _ALL_SYMBOL_ = '__all_symbol__' 5 | 6 | class Processing: 7 | """ 8 | 9 | Examples: 10 | 11 | >>> (Processing(data) 12 | ... .groupby('id') 13 | ... .partitionby(TimePartition( 14 | ... 'base_time', 15 | ... 'filter_time', 16 | ... ['30d', '60d', '180d', '365d', 'all'] 17 | ... )) 18 | ... .apply({'A': ['max', 'min', 'mean']}) 19 | ... .apply({'B': ['max', 'min', 'mean']}) 20 | ... .apply({'C': 'nunique'}) 21 | ... .apply({'D': { 22 | ... 'f': len, 23 | ... 'name': 'normal_count', 24 | ... 'mask': Mask('D').isin(['normal']), 25 | ... }}) 26 | ... .apply({'id': 'count'}) 27 | ... .exec() 28 | ... ) 29 | """ 30 | def __init__(self, data): 31 | self.data = data 32 | self.funcs = {} 33 | self.partitions = None 34 | 35 | def groupby(self, name): 36 | """group data by name 37 | 38 | Args: 39 | name (str): column name in data 40 | """ 41 | self._groupby = name 42 | return self 43 | 44 | def apply(self, f): 45 | """apply functions to data 46 | 47 | Args: 48 | f (dict|function): a config dict that keys are the column names and 49 | values are the functions, it will take the column series as the 50 | functions argument. if `f` is a function, it will take the whole 51 | dataframe as the argument. 52 | 53 | """ 54 | if not isinstance(f, dict): 55 | f = { 56 | _ALL_SYMBOL_: f 57 | } 58 | 59 | for k, v in f.items(): 60 | self.append_func(k, v) 61 | 62 | return self 63 | 64 | 65 | def append_func(self, col, func): 66 | if not isinstance(func, (list, tuple)): 67 | func = [func] 68 | 69 | if col not in self.funcs: 70 | self.funcs[col] = [] 71 | 72 | for f in func: 73 | self.funcs[col].append(self._convert_func(f)) 74 | 75 | 76 | def _convert_func(self, f): 77 | if isinstance(f, F): 78 | return f 79 | 80 | if not isinstance(f, dict): 81 | f = {'f': f} 82 | 83 | return F(**f) 84 | 85 | 86 | def partitionby(self, p): 87 | """partition data to multiple pieces, processing will process to all the pieces 88 | 89 | Args: 90 | p (Partition) 91 | """ 92 | self.partitions = p 93 | return self 94 | 95 | def exec(self): 96 | if self.partitions is None: 97 | return self.process(self.data) 98 | 99 | res = None 100 | for mask, suffix in self.partitions.partition(self.data): 101 | data = self.process(self.data[mask]) 102 | data = data.add_suffix(suffix) 103 | 104 | if res is None: 105 | res = data 106 | continue 107 | 108 | res = res.join(data, how = 'outer') 109 | 110 | return res 111 | 112 | 113 | 114 | def process(self, data): 115 | group = data.groupby(self._groupby) 116 | 117 | res = [] 118 | for col, l in self.funcs.items(): 119 | for f in l: 120 | g = group 121 | 122 | if f.need_filter: 123 | g = f.filter(data).groupby(self._groupby) 124 | 125 | if f.is_buildin: 126 | r = getattr(g[col], f.name)() 127 | r.name = f.name 128 | else: 129 | if col == _ALL_SYMBOL_: 130 | col = None 131 | 132 | r = g.apply(f, col = col) 133 | 134 | if isinstance(r, pd.Series): 135 | r = pd.DataFrame(r) 136 | 137 | res.append(r.add_prefix(col + '_')) 138 | 139 | return pd.concat(res, axis=1) 140 | 141 | 142 | 143 | class Mask: 144 | """a placeholder to select dataframe 145 | """ 146 | def __init__(self, column = None): 147 | self.column = column 148 | self.operators = [] 149 | 150 | def push(self, op, value): 151 | self.operators.append({ 152 | 'op': op, 153 | 'value': value, 154 | }) 155 | 156 | def replay(self, 
data): 157 | base = data 158 | if self.column is not None: 159 | base = data[self.column] 160 | 161 | for item in self.operators: 162 | v = item['value'] 163 | 164 | if isinstance(v, Mask): 165 | v = v.replay(data) 166 | 167 | f = getattr(base, item['op']) 168 | 169 | if v is None: 170 | base = f() 171 | continue 172 | 173 | base = f(v) 174 | 175 | return base 176 | 177 | def __eq__(self, other): 178 | self.push('__eq__', other) 179 | return self 180 | 181 | def __lt__(self, other): 182 | self.push('__lt__', other) 183 | return self 184 | 185 | def __gt__(self, other): 186 | self.push('__gt__', other) 187 | return self 188 | 189 | def __le__(self, other): 190 | self.push('__le__', other) 191 | return self 192 | 193 | def __ge__(self, other): 194 | self.push('__ge__', other) 195 | return self 196 | 197 | def __invert__(self): 198 | self.push('__invert__', None) 199 | return self 200 | 201 | def __and__(self, other): 202 | self.push('__and__', other) 203 | return self 204 | 205 | def __or__(self, other): 206 | self.push('__or__', other) 207 | return self 208 | 209 | def __xor__(self, other): 210 | self.push('__xor__', other) 211 | return self 212 | 213 | def isin(self, other): 214 | self.push('isin', other) 215 | return self 216 | 217 | def isna(self): 218 | self.push('isna', None) 219 | return self 220 | 221 | 222 | 223 | class F: 224 | """function class for processing 225 | """ 226 | def __init__(self, f, name = None, mask = None): 227 | self.f = f 228 | 229 | if name is None: 230 | if self.is_buildin: 231 | name = f 232 | else: 233 | name = f.__name__ 234 | 235 | self.__name__ = name 236 | 237 | self.mask = mask 238 | 239 | @property 240 | def name(self): 241 | return self.__name__ 242 | 243 | @property 244 | def is_buildin(self): 245 | return isinstance(self.f, str) 246 | 247 | @property 248 | def need_filter(self): 249 | return self.mask is not None 250 | 251 | def __call__(self, data, *args, col = None, **kwargs): 252 | if col in data: 253 | data = data[col] 254 | 255 | r = self.f(data, *args, **kwargs) 256 | 257 | if not isinstance(r, dict): 258 | r = { 259 | self.name: r 260 | } 261 | 262 | return pd.Series(r) 263 | 264 | 265 | def filter(self, data): 266 | if self.mask is None: 267 | return data 268 | 269 | mask = self.mask 270 | if isinstance(self.mask, Mask): 271 | mask = self.mask.replay(data) 272 | 273 | return data[mask] 274 | 275 | -------------------------------------------------------------------------------- /toad/preprocessing/process_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | from .process import Processing, Mask, F 7 | 8 | 9 | np.random.seed(1) 10 | 11 | history = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, 400, size = 500) 12 | open_time = np.full(500, np.datetime64('2020-03-01')) - np.random.randint(30, size = 500) 13 | A = np.random.randint(10, size = 500) 14 | B = np.random.rand(500) 15 | B[np.random.choice(500, 10)] = np.nan 16 | 17 | 18 | df = pd.DataFrame({ 19 | 'history': history, 20 | 'open_time': open_time, 21 | 'A': A, 22 | 'B': B, 23 | }) 24 | 25 | 26 | def test_mask(): 27 | m = Mask('A') > 3 28 | assert m.replay(df).sum() == (A > 3).sum() 29 | 30 | 31 | def test_mask_without_name(): 32 | m = Mask() > 3 33 | assert m.replay(A).sum() == (A > 3).sum() 34 | 35 | def test_mask_isin(): 36 | m = Mask('A').isin([1,2,3]) 37 | assert m.replay(df).sum() == df['A'].isin([1,2,3]).sum() 38 | 39 | def test_mask_isna(): 40 | m = 
Mask('A').isna() 41 | assert m.replay(df).sum() == df['A'].isna().sum() 42 | 43 | def test_f(): 44 | assert F(len)(A)[0] == 500 45 | 46 | def test_processing(): 47 | res = ( 48 | Processing(df) 49 | .groupby('open_time') 50 | .apply({'A': ['min', 'mean']}) 51 | .apply({'B': [ 52 | { 53 | 'f': 'size', 54 | 'mask': Mask('A') > 1, 55 | }, 56 | { 57 | 'f': len, 58 | }, 59 | ]}) 60 | .exec() 61 | ) 62 | 63 | assert res.size == 120 and res.loc['2020-02-29', 'B_size'] == 23 64 | 65 | 66 | def test_processing_with_partition(): 67 | from .partition import ValuePartition 68 | res = ( 69 | Processing(df) 70 | .groupby('open_time') 71 | .partitionby(ValuePartition('A')) 72 | .apply({'B': ['mean', 'size']}) 73 | .exec() 74 | ) 75 | 76 | assert res.size == 600 and res.loc['2020-02-29', 'B_size_1'] == 2 -------------------------------------------------------------------------------- /toad/scorecard_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.linear_model import LogisticRegression 5 | 6 | from .scorecard import ScoreCard, WOETransformer, Combiner 7 | 8 | np.random.seed(1) 9 | 10 | # Create a testing dataframe and a scorecard model. 11 | 12 | ab = np.array(list('ABCDEFG')) 13 | feature = np.random.randint(10, size = 500) 14 | target = np.random.randint(2, size = 500) 15 | str_feat = ab[np.random.choice(7, 500)] 16 | 17 | df = pd.DataFrame({ 18 | 'A': feature, 19 | 'B': str_feat, 20 | 'C': ab[np.random.choice(2, 500)], 21 | 'D': np.ones(500), 22 | }) 23 | 24 | card_config = { 25 | 'A': { 26 | '[-inf ~ 3)': 100, 27 | '[3 ~ 5)': 200, 28 | '[5 ~ 8)': 300, 29 | '[8 ~ inf)': 400, 30 | 'nan': 500, 31 | }, 32 | 'B': { 33 | ','.join(list('ABCD')): 200, 34 | ','.join(list('EF')): 400, 35 | 'else': 500, 36 | }, 37 | 'C': { 38 | 'A': 200, 39 | 'B': 100, 40 | }, 41 | } 42 | 43 | combiner = Combiner() 44 | bins = combiner.fit_transform(df, target, n_bins = 5) 45 | woe_transer = WOETransformer() 46 | woe = woe_transer.fit_transform(bins, target) 47 | 48 | # create a score card 49 | card = ScoreCard( 50 | combiner = combiner, 51 | transer = woe_transer, 52 | ) 53 | card.fit(woe, target) 54 | 55 | 56 | FUZZ_THRESHOLD = 1e-6 57 | TEST_SCORE = pytest.approx(453.5702462572068, FUZZ_THRESHOLD) 58 | TEST_PROBA = pytest.approx(0.4673322872985267, FUZZ_THRESHOLD) 59 | 60 | 61 | def test_representation(): 62 | repr(card) 63 | 64 | 65 | def test_load(): 66 | card = ScoreCard().load(card_config) 67 | score = card.predict(df) 68 | assert score[200] == 600 69 | 70 | 71 | def test_load_after_init_combiner(): 72 | card = ScoreCard( 73 | combiner = combiner, 74 | transer = woe_transer, 75 | ) 76 | card.load(card_config) 77 | score = card.predict(df) 78 | assert score[200] == 600 79 | 80 | 81 | def test_proba_to_score(): 82 | model = LogisticRegression() 83 | model.fit(woe, target) 84 | 85 | proba = model.predict_proba(woe)[:, 1] 86 | score = card.proba_to_score(proba) 87 | assert score[404] == TEST_SCORE 88 | 89 | 90 | def test_score_to_prob(): 91 | score = card.predict(df) 92 | proba = card.score_to_proba(score) 93 | assert proba[404] == TEST_PROBA 94 | 95 | 96 | def test_predict(): 97 | score = card.predict(df) 98 | assert score[404] == TEST_SCORE 99 | 100 | 101 | def test_predict_proba(): 102 | proba = card.predict_proba(df) 103 | assert proba[404, 1] == TEST_PROBA 104 | 105 | 106 | def test_card_feature_effect(): 107 | """ 108 | verify the `base effect of each feature` is consistent with assumption 109 | 
FEATURE_EFFECT is manually calculated with following logic: 110 | FEATURE_EFFECT = np.median(card.woe_to_score(df),axis = 0) 111 | """ 112 | FEATURE_EFFECT = pytest.approx(np.array([142.26368948220417, 152.82747912111066, 148.82665746001695, 0.]), FUZZ_THRESHOLD) 113 | assert card.base_effect.values == FEATURE_EFFECT 114 | 115 | 116 | def test_predict_sub_score(): 117 | score, sub = card.predict(df, return_sub=True) 118 | assert sub.loc[250, 'B'] == pytest.approx(162.09822360428146, FUZZ_THRESHOLD) 119 | 120 | 121 | def test_woe_to_score(): 122 | score = card.woe_to_score(woe) 123 | score = np.sum(score, axis=1) 124 | assert score[404] == TEST_SCORE 125 | 126 | 127 | def test_bin_to_score(): 128 | score = card.bin_to_score(bins) 129 | assert score[404] == TEST_SCORE 130 | 131 | 132 | def test_export_map(): 133 | card_map = card.export() 134 | assert card_map['B']['D'] == 159.26 135 | 136 | 137 | def test_card_map(): 138 | config = card.export() 139 | card_from_map = ScoreCard().load(config) 140 | score = card_from_map.predict(df) 141 | assert score[404] == 453.57 142 | 143 | 144 | def test_card_map_with_else(): 145 | card_from_map = ScoreCard().load(card_config) 146 | score = card_from_map.predict(df) 147 | assert score[80] == 1000 148 | 149 | 150 | def test_generate_testing_frame(): 151 | card = ScoreCard().load(card_config) 152 | frame = card.testing_frame() 153 | assert frame.loc[4, 'B'] == 'E' 154 | 155 | 156 | def test_export_frame(): 157 | card = ScoreCard().load(card_config) 158 | frame = card.export(to_frame=True) 159 | rows = frame[(frame['name'] == 'B') & (frame['value'] == 'else')].reset_index() 160 | assert rows.loc[0, 'score'] == 500 161 | 162 | 163 | def test_card_combiner_number_not_match(): 164 | c = combiner.export() 165 | c['A'] = [0, 3, 6, 8] 166 | com = Combiner().load(c) 167 | bins = com.transform(df) 168 | woe_transer = WOETransformer() 169 | woe = woe_transer.fit_transform(bins, target) 170 | 171 | card = ScoreCard( 172 | combiner=com, 173 | transer=woe_transer, 174 | ) 175 | 176 | with pytest.raises(Exception) as e: 177 | # will raise an exception when fitting a card 178 | card.fit(woe, target) 179 | 180 | assert '\'A\' is not matched' in str(e.value) 181 | 182 | 183 | def test_card_combiner_str_not_match(): 184 | c = combiner.export() 185 | c['C'] = [['A'], ['B'], ['C']] 186 | com = Combiner().load(c) 187 | bins = com.transform(df) 188 | woe_transer = WOETransformer() 189 | woe = woe_transer.fit_transform(bins, target) 190 | 191 | card = ScoreCard( 192 | combiner=com, 193 | transer=woe_transer, 194 | ) 195 | 196 | with pytest.raises(Exception) as e: 197 | # will raise an exception when fitting a card 198 | card.fit(woe, target) 199 | 200 | assert '\'C\' is not matched' in str(e.value) 201 | 202 | 203 | def test_card_with_less_X(): 204 | x = woe.drop(columns='A') 205 | card = ScoreCard( 206 | combiner=combiner, 207 | transer=woe_transer, 208 | ) 209 | 210 | card.fit(x, target) 211 | assert card.predict(df)[200] == pytest.approx(457.5903160102142, FUZZ_THRESHOLD) 212 | 213 | 214 | def test_card_predict_with_unknown_feature(): 215 | np.random.seed(9) 216 | unknown_df = df.copy() 217 | unknown_df.loc[200, 'C'] = 'U' 218 | assert card.predict(unknown_df)[200] == pytest.approx(456.41288777297257, FUZZ_THRESHOLD) 219 | 220 | 221 | def test_card_predict_with_unknown_feature_default_max(): 222 | np.random.seed(9) 223 | unknown_df = df.copy() 224 | unknown_df.loc[200, 'C'] = 'U' 225 | score, sub = card.predict(unknown_df, default = 'max', return_sub = True) 226 | 227 | 
assert sub.loc[200, 'C'] == card['C']['scores'].max() 228 | assert score[200] == pytest.approx(462.2871531373114, FUZZ_THRESHOLD) 229 | 230 | 231 | def test_card_predict_with_unknown_feature_default_with_value(): 232 | np.random.seed(9) 233 | unknown_df = df.copy() 234 | unknown_df.loc[200, 'C'] = 'U' 235 | score, sub = card.predict(unknown_df, default = 42, return_sub = True) 236 | 237 | assert sub.loc[200, 'C'] == 42 238 | assert score[200] == pytest.approx(355.46049567729443, FUZZ_THRESHOLD) 239 | 240 | 241 | def test_get_reason_vector(): 242 | """ 243 | verify the score reason of df is consistent with assumption 244 | DF_REASON is manually calculated with following logic: 245 | if score is lower than base_odds, select top k feature with lowest subscores where their corresponding subscores are lower than the base effect of features. 246 | if score is higher than base_odds, select top k feature with highest subscores where their corresponding subscores are higher than the base effect of features. 247 | 248 | e.g. xx.iloc[404] 249 | sub_scores: 151 159 143 0 250 | base_effect: 142 153 149 0 251 | diff_effect: +9 +6 -6 0 252 | 253 | total_score: 453(151+159+143+0) > base_odds(35) 254 | which is larger than base, hence, we try to find top `keep` features who contributed most to positivity 255 | find_largest_top_3: A(+9) B(+6) D(+0) 256 | """ 257 | reason = card.get_reason(df) 258 | assert reason.iloc[404]['top1'].tolist() == ['C', pytest.approx(142.9523920956781, FUZZ_THRESHOLD), 'B'] 259 | 260 | 261 | @pytest.mark.timeout(0.007) 262 | def test_predict_dict(): 263 | """ a test for scalar inference time cost """ 264 | proba = card.predict(df.iloc[404].to_dict()) 265 | assert proba == TEST_SCORE 266 | 267 | -------------------------------------------------------------------------------- /toad/selection_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .selection import drop_empty, drop_var, drop_corr, drop_iv, drop_vif, select, stepwise 6 | 7 | np.random.seed(1) 8 | 9 | LENGTH = 500 10 | 11 | A = np.random.rand(LENGTH) 12 | A[np.random.choice(LENGTH, 20, replace = False)] = np.nan 13 | 14 | B = np.random.randint(100, size = LENGTH) 15 | C = A + np.random.normal(0, 0.2, LENGTH) 16 | D = A + np.random.normal(0, 0.1, LENGTH) 17 | 18 | E = np.random.rand(LENGTH) 19 | E[np.random.choice(LENGTH, 480, replace = False)] = np.nan 20 | 21 | F = B + np.random.normal(0, 10, LENGTH) 22 | 23 | target = np.random.randint(2, size = LENGTH) 24 | 25 | frame = pd.DataFrame({ 26 | 'A': A, 27 | 'B': B, 28 | 'C': C, 29 | 'D': D, 30 | 'E': E, 31 | 'F': F, 32 | }) 33 | 34 | frame['target'] = target 35 | 36 | 37 | def test_drop_empty(): 38 | df = drop_empty(frame, threshold = 0.8) 39 | assert 'E' not in df 40 | 41 | def test_drop_var(): 42 | df = drop_var(frame, threshold = 0.1) 43 | assert 'A' not in df 44 | 45 | def test_drop_var_exclude(): 46 | df = drop_var(frame, threshold = 0.1, exclude = 'A') 47 | assert 'A' in df 48 | 49 | def test_drop_corr(): 50 | df = drop_corr(frame, target = 'target') 51 | assert set(['D', 'E', 'F', 'target']) == set(df.columns.tolist()) 52 | 53 | def test_drop_corr_with_string(): 54 | ab = np.array(list('ABCDEFG')) 55 | str_feat = pd.Series(ab[np.random.choice(7, 500)]) 56 | 57 | df = drop_corr(pd.concat((frame, str_feat.rename('str_f')), axis = 1), target = 'target') 58 | assert set(['D', 'E', 'F', 'target', 'str_f']) == set(df.columns.tolist()) 59 | 60 | def 
test_drop_iv(): 61 | df = drop_iv(frame, target = 'target', threshold = 0.25) 62 | assert 'B' not in df 63 | 64 | def test_select(): 65 | df = select(frame, target = 'target', empty = 0.8, iv = 0.2, corr = 0.7) 66 | assert ['D', 'F', 'target'] == df.columns.tolist() 67 | 68 | def test_select_exclude(): 69 | df = select(frame, target = 'target', empty = 0.8, iv = 0.2, corr = 0.7, exclude = ['A']) 70 | assert ['A', 'D', 'F', 'target'] == df.columns.tolist() 71 | 72 | def test_stepwise(): 73 | df = stepwise(frame.fillna(-1), target = 'target') 74 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist() 75 | 76 | def test_stepwise_backward(): 77 | df = stepwise(frame.fillna(-1), target = 'target', direction = 'backward') 78 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist() 79 | 80 | def test_stepwise_forward(): 81 | df = stepwise(frame.fillna(-1), target = 'target', direction = 'forward') 82 | assert ['C', 'E', 'F', 'target'] == df.columns.tolist() 83 | 84 | def test_stepwise_exclude(): 85 | df = stepwise(frame.fillna(-1), target = 'target', exclude = 'B') 86 | assert ['B', 'C', 'E', 'F', 'target'] == df.columns.tolist() 87 | 88 | def test_stepwise_return_drop(): 89 | df, drop_list = stepwise(frame.fillna(-1), target = 'target', return_drop = True) 90 | assert ['B', 'A', 'D'] == drop_list 91 | 92 | def test_stepwise_lr(): 93 | df = stepwise(frame.fillna(-1), target = 'target', estimator = 'lr', direction = 'forward') 94 | assert ['C', 'target'] == df.columns.tolist() 95 | 96 | def test_stepwise_ks(): 97 | df = stepwise(frame.fillna(-1), target = 'target', criterion = 'ks', direction = 'forward') 98 | assert ['A', 'C', 'target'] == df.columns.tolist() 99 | 100 | def test_stepwise_zero(): 101 | df = pd.DataFrame({ 102 | 'X': np.zeros(500), 103 | 'Z': np.random.rand(500), 104 | 'Y': np.random.randint(2, size = 500), 105 | }) 106 | df = stepwise(df, target = 'Y') 107 | assert set(['Z', 'Y']) == set(df.columns.tolist()) 108 | 109 | def test_stepwise_forward_when_best_is_first(): 110 | df = frame[['E', 'F', 'B', 'A', 'D', 'C', 'target']] 111 | df = stepwise(df.fillna(-1), target = 'target', direction = 'forward') 112 | assert ['E', 'F', 'C', 'target'] == df.columns.tolist() 113 | 114 | def test_drop_vif(): 115 | df = drop_vif(frame.fillna(-1), exclude = 'target') 116 | assert ['C', 'F', 'target'] == df.columns.tolist() 117 | -------------------------------------------------------------------------------- /toad/stats_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .stats import IV, WOE, gini, gini_cond, entropy_cond, quality, _IV, VIF 6 | 7 | 8 | np.random.seed(1) 9 | 10 | feature = np.random.rand(500) 11 | target = np.random.randint(2, size = 500) 12 | A = np.random.randint(100, size = 500) 13 | B = np.random.randint(100, size = 500) 14 | mask = np.random.randint(8, size = 500) 15 | 16 | df = pd.DataFrame({ 17 | 'feature': feature, 18 | 'target': target, 19 | 'A': A, 20 | 'B': B, 21 | }) 22 | 23 | 24 | def test_woe(): 25 | value = WOE(0.2, 0.3) 26 | assert value == pytest.approx(-0.4054651081081643) 27 | 28 | def test_iv_priv(): 29 | value, _ = _IV(df['feature'], df['target']) 30 | assert value == pytest.approx(0.010385942643745403) 31 | 32 | def test_iv(): 33 | value = IV(df['feature'], df['target'], n_bins = 10, method = 'dt') 34 | assert value == pytest.approx(0.2735917707743619) 35 | 36 | def test_iv_return_sub(): 37 | _, sub = IV(mask, df['target'], return_sub = True, 
n_bins = 10, method = 'dt') 38 | assert len(sub) == 8 39 | assert sub[4] == pytest.approx(0.006449386778057019) 40 | 41 | def test_iv_frame(): 42 | res = IV(df, 'target', n_bins = 10, method = 'chi') 43 | assert res.loc[0, 'A'] == pytest.approx(0.226363832867123) 44 | 45 | def test_gini(): 46 | value = gini(df['target']) 47 | assert value == 0.499352 48 | 49 | def test_gini_cond(): 50 | value = gini_cond(df['feature'], df['target']) 51 | assert value == pytest.approx(0.4970162601626016) 52 | 53 | def test_entropy_cond(): 54 | value = entropy_cond(df['feature'], df['target']) 55 | assert value == pytest.approx(0.6924990371522171) 56 | 57 | def test_quality(): 58 | result = quality(df, 'target') 59 | assert result.loc['feature', 'iv'] == 0.2735917707743619 60 | assert result.loc['A', 'gini'] == 0.49284164671885444 61 | assert result.loc['B', 'entropy'] == pytest.approx(0.6924956879070063, 5e-5) 62 | assert result.loc['feature', 'unique'] == 500 63 | 64 | def test_quality_iv_only(): 65 | result = quality(df, 'target', iv_only = True) 66 | assert np.isnan(result.loc['feature', 'gini']) 67 | 68 | def test_quality_with_merge(): 69 | result = quality(df, 'target', n_bins = 5, method = 'chi') 70 | assert result.loc['feature', 'iv'] == 0.13367825777558 71 | 72 | def test_quality_object_type_array_with_nan(): 73 | feature = np.array([np.nan, 'A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype = 'O')[mask] 74 | 75 | df = pd.DataFrame({ 76 | 'feature': feature, 77 | 'target': target, 78 | }) 79 | result = quality(df) 80 | assert result.loc['feature', 'iv'] == 0.016379338180530334 81 | 82 | def test_vif(): 83 | vif = VIF(df) 84 | assert vif['A'] == 2.969336442640111 85 | -------------------------------------------------------------------------------- /toad/tadpole/__init__.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | 3 | sns.set_palette('muted') 4 | 5 | from .base import Tadpole 6 | from .utils import tadpole_axes 7 | 8 | 9 | tadpole = Tadpole() -------------------------------------------------------------------------------- /toad/tadpole/base.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | from .utils import ( 3 | get_axes, 4 | tadpole_axes, 5 | FIG_SIZE, 6 | ) 7 | 8 | class Tadpole: 9 | def __getattr__(self, name): 10 | t = getattr(sns, name) 11 | if callable(t): 12 | return self.wrapsns(t) 13 | 14 | return t 15 | 16 | def wrapsns(self, f): 17 | @tadpole_axes 18 | def wrapper(*args, figure_size = FIG_SIZE, **kwargs): 19 | kw = kwargs.copy() 20 | if 'ax' not in kw: 21 | kw['ax'] = get_axes(size = figure_size) 22 | 23 | try: 24 | return f(*args, **kw) 25 | except: 26 | return f(*args, **kwargs) 27 | 28 | return wrapper 29 | -------------------------------------------------------------------------------- /toad/tadpole/fonts/NotoSansCJKsc-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amphibian-dev/toad/380c1e98d5f63d3433100ca23b6abf3a03d63e1f/toad/tadpole/fonts/NotoSansCJKsc-Regular.otf -------------------------------------------------------------------------------- /toad/tadpole/func.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /toad/tadpole/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import seaborn as sns 3 
| from functools import wraps 4 | import matplotlib.pyplot as plt 5 | from matplotlib.axes import Axes 6 | from matplotlib.font_manager import FontProperties 7 | 8 | sns.set_palette('muted') 9 | 10 | CURRENT_PATH = os.path.abspath(os.path.dirname(__file__)) 11 | FONT_FILE = 'NotoSansCJKsc-Regular.otf' 12 | FONTS_PATH = os.path.join(CURRENT_PATH, 'fonts', FONT_FILE) 13 | myfont = FontProperties(fname = os.path.abspath(FONTS_PATH)) 14 | sns.set(font = myfont.get_family()) 15 | 16 | HEATMAP_CMAP = sns.diverging_palette(240, 10, as_cmap = True) 17 | MAX_STYLE = 6 18 | FIG_SIZE = (12, 6) 19 | 20 | def get_axes(size = FIG_SIZE): 21 | _, ax = plt.subplots(figsize = size) 22 | return ax 23 | 24 | def reset_legend(axes): 25 | if axes.get_legend() is not None: 26 | axes.legend( 27 | loc='center left', 28 | bbox_to_anchor=(1, 0.5), 29 | framealpha = 0, 30 | prop = myfont, 31 | ) 32 | 33 | return axes 34 | 35 | def reset_ticklabels(axes): 36 | labels = [] 37 | if axes.get_xticklabels(): 38 | labels += axes.get_xticklabels() 39 | 40 | if axes.get_yticklabels(): 41 | labels += axes.get_yticklabels() 42 | 43 | for label in labels: 44 | label.set_fontproperties(myfont) 45 | 46 | return axes 47 | 48 | def reset_xticks(axes): 49 | for label in axes.get_xticklabels(): 50 | label.set_ha('left') 51 | label.set_rotation(-25) 52 | 53 | return axes 54 | 55 | 56 | def reset_title(axes): 57 | title = axes.get_title() 58 | 59 | if title: 60 | axes.set_title(title, fontproperties = myfont) 61 | 62 | return axes 63 | 64 | 65 | def reset_xylabels(axes): 66 | y_label = axes.get_ylabel() 67 | if y_label: 68 | axes.set_ylabel(y_label, fontproperties = myfont) 69 | 70 | x_label = axes.get_xlabel() 71 | if x_label: 72 | axes.set_xlabel(x_label, fontproperties = myfont) 73 | 74 | return axes 75 | 76 | 77 | def reset_ylim(axes): 78 | # for axes and twins 79 | for ax in axes.figure.axes: 80 | if ax.bbox.bounds == axes.bbox.bounds: 81 | bottom, top = ax.get_ylim() 82 | top += (top - bottom) * 0.1 83 | ax.set_ylim(bottom, top) 84 | 85 | return axes 86 | 87 | 88 | def fix_axes(axes): 89 | if not isinstance(axes, Axes): 90 | return axes 91 | 92 | functions = [reset_title, reset_xylabels, reset_ticklabels, reset_legend, reset_xticks] 93 | 94 | for func in functions: 95 | func(axes) 96 | return axes 97 | 98 | def tadpole_axes(fn): 99 | @wraps(fn) 100 | def func(*args, **kwargs): 101 | res = fn(*args, **kwargs) 102 | 103 | if not isinstance(res, tuple): 104 | return fix_axes(res) 105 | 106 | r = tuple() 107 | for i in res: 108 | r += (fix_axes(i),) 109 | 110 | return r 111 | 112 | return func 113 | 114 | 115 | 116 | def annotate(ax, x, y, space = 5, format = ".2f"): 117 | """ 118 | """ 119 | va = 'bottom' 120 | 121 | if y < 0: 122 | space *= -1 123 | va = 'top' 124 | 125 | ax.annotate( 126 | ("{:"+ format +"}").format(y), 127 | (x, y), 128 | xytext = (0, space), 129 | textcoords = "offset points", 130 | ha = 'center', 131 | va = va, 132 | ) 133 | 134 | 135 | 136 | def add_bar_annotate(ax, **kwargs): 137 | """ 138 | """ 139 | for rect in ax.patches: 140 | y_value = rect.get_height() 141 | x_value = rect.get_x() + rect.get_width() / 2 142 | 143 | annotate(ax, x_value, y_value, **kwargs) 144 | 145 | return ax 146 | 147 | 148 | def add_line_annotate(ax, **kwargs): 149 | """ 150 | """ 151 | for line in ax.lines: 152 | points = line.get_xydata() 153 | 154 | for point in points: 155 | annotate(ax, point[0], point[1], **kwargs) 156 | 157 | return ax 158 | 159 | 160 | def add_annotate(ax, **kwargs): 161 | if len(ax.lines) > 0: 162 | 
add_line_annotate(ax, **kwargs) 163 | 164 | if len(ax.patches) > 0: 165 | add_bar_annotate(ax, **kwargs) 166 | 167 | return ax 168 | 169 | 170 | def add_text(ax, text, loc = 'top left', offset = (0.01, 0.04)): 171 | x_min, x_max = ax.get_xlim() 172 | y_min, y_max = ax.get_ylim() 173 | 174 | x_offset = (x_max - x_min) * offset[0] 175 | y_offset = (y_max - y_min) * offset[1] 176 | 177 | if loc == 'top left': 178 | loc = (x_min + x_offset, y_max - y_offset) 179 | elif loc == 'top right': 180 | loc = (x_max - x_offset, y_max - y_offset) 181 | elif loc == 'bottom left': 182 | loc = (x_min + x_offset, y_min + y_offset) 183 | elif loc == 'bottom right': 184 | loc = (x_max - x_offset, y_min + y_offset) 185 | 186 | ax.text(*loc, text, fontsize = 'x-large') 187 | 188 | return ax 189 | -------------------------------------------------------------------------------- /toad/transform_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pyximport 6 | 7 | pyximport.install(setup_args={"include_dirs": np.get_include()}) 8 | 9 | from .transform import WOETransformer, Combiner, GBDTTransformer 10 | 11 | np.random.seed(1) 12 | 13 | ab = np.array(list('ABCDEFG')) 14 | feature = np.random.randint(10, size = 500) 15 | target = np.random.randint(2, size = 500) 16 | str_feat = ab[np.random.choice(7, 500)] 17 | uni_feat = np.ones(500) 18 | empty_feat = feature.astype(float) 19 | empty_feat[np.random.choice(500, 50, replace = False)] = np.nan 20 | 21 | df = pd.DataFrame({ 22 | 'A': feature, 23 | 'B': str_feat, 24 | 'C': uni_feat, 25 | 'D': empty_feat, 26 | 'target': target, 27 | }) 28 | 29 | 30 | 31 | def test_duplicated_keys(): 32 | dup_df = df.rename(columns = {"C": "A"}) 33 | with pytest.raises(Exception, match=r"X has duplicate keys `.*`"): 34 | WOETransformer().fit_transform(dup_df, target) 35 | 36 | def test_woe_transformer(): 37 | f = WOETransformer().fit_transform(feature, target) 38 | assert f[451] == pytest.approx(-0.17061154127869285) 39 | 40 | def test_woe_transformer_with_str(): 41 | f = WOETransformer().fit_transform(str_feat, target) 42 | assert f[451] == pytest.approx(-0.2198594761130199) 43 | 44 | def test_woe_transformer_with_unknown_group(): 45 | transer = WOETransformer().fit(str_feat, target) 46 | res = transer.transform(['Z'], default = 'min') 47 | assert res[0] == pytest.approx(-0.2198594761130199) 48 | 49 | def test_woe_transformer_frame(): 50 | res = WOETransformer().fit_transform(df, target) 51 | assert res.iloc[451, 1] == pytest.approx(-0.2198594761130199) 52 | 53 | def test_woe_transformer_dict(): 54 | transer = WOETransformer().fit(df, 'target') 55 | res = transer.transform({ 56 | "A": 6, 57 | "B": "C", 58 | "C": 1, 59 | "D": 2, 60 | }) 61 | assert res['B'].item() == pytest.approx(-0.09149433112609942) 62 | 63 | def test_woe_transformer_select_dtypes(): 64 | res = WOETransformer().fit_transform(df, target, select_dtypes = 'object') 65 | assert res.loc[451, 'A'] == 3 66 | 67 | def test_woe_transformer_exclude(): 68 | res = WOETransformer().fit_transform(df, target, exclude = 'A') 69 | assert res.loc[451, 'A'] == 3 70 | 71 | def test_woe_transformer_export_single(): 72 | transer = WOETransformer().fit(feature, target) 73 | t = transer.export() 74 | assert t[transer._default_name][5] == pytest.approx(0.3938235330926786) 75 | 76 | def test_woe_transformer_export(): 77 | transer = WOETransformer().fit(df, target) 78 | t = transer.export() 79 | assert t['C'][1] == 0 80 | 81 | 
def test_woe_transformer_load(): 82 | rules = { 83 | 'A': { 84 | 1: 0.1, 85 | 2: 0.2, 86 | 3: 0.3, 87 | } 88 | } 89 | 90 | transer = WOETransformer().load(rules) 91 | assert transer._rules['A']['woe'][1] == 0.2 92 | 93 | 94 | def test_combiner(): 95 | f = Combiner().fit_transform(feature, target, method = 'chi') 96 | assert f[451] == 3 97 | 98 | def test_combiner_with_str(): 99 | f = Combiner().fit_transform(str_feat, target, method = 'chi') 100 | assert f[451] == 0 101 | 102 | def test_combiner_unique_feature(): 103 | f = Combiner().fit_transform(uni_feat, target, method = 'chi') 104 | assert f[451] == 0 105 | 106 | def test_combiner_frame(): 107 | res = Combiner().fit_transform(df, target) 108 | assert res.iloc[404, 1] == 2 109 | 110 | def test_combiner_select_dtypes(): 111 | res = Combiner().fit_transform(df, target, select_dtypes = 'number') 112 | assert res.loc[451, 'B'] == 'G' 113 | 114 | def test_combiner_exclude(): 115 | res = Combiner().fit_transform(df, target, exclude = 'B') 116 | assert res.loc[451, 'B'] == 'G' 117 | 118 | def test_combiner_labels(): 119 | combiner = Combiner().fit(df, target) 120 | res = combiner.transform(df, labels = True) 121 | assert res.loc[451, 'A'] == '03.[3 ~ 4)' 122 | 123 | def test_combiner_single_feature(): 124 | combiner = Combiner().fit(df['A'], method = 'step', n_bins = 5) 125 | res = combiner.transform(df['A']) 126 | assert res[451] == 1 127 | 128 | def test_combiner_export(): 129 | combiner = Combiner().fit(df, target, method = 'chi', n_bins = 4) 130 | bins = combiner.export() 131 | assert isinstance(bins['B'][0], list) 132 | 133 | def test_combiner_update(): 134 | combiner = Combiner().fit(df, target, method = 'chi', n_bins = 4) 135 | combiner.update({'A': [1,2,3,4,5,6]}) 136 | bins = combiner.export() 137 | assert len(bins['A']) == 6 138 | 139 | def test_combiner_step(): 140 | combiner = Combiner().fit(df['A'], method = 'step', n_bins = 4) 141 | bins = combiner.export() 142 | assert bins['A'][1] == 4.5 143 | 144 | def test_combiner_target_in_frame(): 145 | combiner = Combiner().fit(df, 'target', n_bins = 4) 146 | bins = combiner.export() 147 | assert bins['A'][1] == 6 148 | 149 | def test_combiner_target_in_frame_kwargs(): 150 | combiner = Combiner().fit(df, y = 'target', n_bins = 4) 151 | bins = combiner.export() 152 | assert bins['A'][1] == 6 153 | 154 | def test_combiner_empty_separate(): 155 | combiner = Combiner() 156 | bins = combiner.fit_transform(df, 'target', n_bins = 4, empty_separate = True) 157 | mask = pd.isna(df['D']) 158 | assert (bins['D'][~mask] != 4).all() 159 | 160 | def test_combiner_labels_with_empty(): 161 | combiner = Combiner().fit(df, 'target', n_bins = 4, empty_separate = True) 162 | res = combiner.transform(df, labels = True) 163 | assert res.loc[2, 'D'] == '04.nan' 164 | 165 | def test_gbdt_transformer(): 166 | np.random.seed(1) 167 | 168 | df = pd.DataFrame({ 169 | 'A': np.random.rand(500), 170 | 'B': np.random.randint(10, size = 500), 171 | }) 172 | f = GBDTTransformer().fit_transform(df, target, n_estimators = 10, max_depth = 2) 173 | assert f.shape == (500, 40) 174 | -------------------------------------------------------------------------------- /toad/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .func import * 2 | from .decorator import * 3 | from .progress import Progress 4 | -------------------------------------------------------------------------------- /toad/utils/decorator.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from time import time 4 | from .func import save_json, read_json 5 | from functools import wraps, WRAPPER_ASSIGNMENTS 6 | 7 | 8 | 9 | class Decorator: 10 | """base decorator class 11 | """ 12 | _cls = None 13 | is_class = False 14 | 15 | def __init__(self, *args, is_class = False, **kwargs): 16 | self.is_class = is_class 17 | self.args = [] 18 | self.kwargs = {} 19 | 20 | if len(args) == 1 and callable(args[0]): 21 | self.fn = args[0] 22 | else: 23 | self.setup(*args, **kwargs) 24 | 25 | 26 | @property 27 | def fn(self): 28 | if hasattr(self, '__wrapped__'): 29 | return self.__wrapped__ 30 | 31 | return None 32 | 33 | @fn.setter 34 | def fn(self, func): 35 | if hasattr(self, 'setup_func'): 36 | func = self.setup_func(func) 37 | 38 | self.__wrapped__ = func 39 | 40 | def __call__(self, *args, **kwargs): 41 | if self.fn is None: 42 | self.fn = args[0] 43 | return self 44 | 45 | if self.is_class: 46 | self._cls = args[0] 47 | args = args[1:] 48 | 49 | return self.wrapper(*args, **kwargs) 50 | 51 | 52 | def __get__(self, instance, type = None): 53 | self.is_class = True 54 | self._cls = instance 55 | 56 | @wraps(self.__wrapped__) 57 | def func(*args, **kwargs): 58 | return self.__call__(instance, *args, **kwargs) 59 | 60 | return func 61 | 62 | 63 | def __getattribute__(self, name): 64 | if name in WRAPPER_ASSIGNMENTS: 65 | return getattr(self.__wrapped__, name) 66 | 67 | return object.__getattribute__(self, name) 68 | 69 | 70 | def setup(self, *args, **kwargs): 71 | self.args = args 72 | self.kwargs = kwargs 73 | 74 | for key in kwargs: 75 | setattr(self, key, kwargs[key]) 76 | 77 | 78 | def call(self, *args, **kwargs): 79 | if self._cls is not None: 80 | args = (self._cls, *args) 81 | 82 | return self.fn(*args, **kwargs) 83 | 84 | def wrapper(self, *args, **kwargs): 85 | return self.call(*args, **kwargs) 86 | 87 | 88 | class frame_exclude(Decorator): 89 | """decorator for excluding columns from a dataframe 90 | """ 91 | 92 | def wrapper(self, X, *args, exclude = None, **kwargs): 93 | if exclude is not None and isinstance(X, pd.DataFrame): 94 | X = X.drop(columns = exclude) 95 | 96 | return self.call(X, *args, **kwargs) 97 | 98 | 99 | class select_dtypes(Decorator): 100 | """decorator for selecting dataframe columns by dtype 101 | """ 102 | 103 | def wrapper(self, X, *args, select_dtypes = None, **kwargs): 104 | if select_dtypes is not None and isinstance(X, pd.DataFrame): 105 | X = X.select_dtypes(include = select_dtypes) 106 | 107 | return self.call(X, *args, **kwargs) 108 | 109 | 110 | class save_to_json(Decorator): 111 | """support saving the result to a json file 112 | """ 113 | def wrapper(self, *args, to_json = None, **kwargs): 114 | res = self.call(*args, **kwargs) 115 | 116 | if to_json is not None: 117 | save_json(res, to_json) 118 | 119 | return res 120 | 121 | 122 | class load_from_json(Decorator): 123 | """support loading data from a json file 124 | """ 125 | require_first = False 126 | 127 | def wrapper(self, *args, from_json = None, **kwargs): 128 | if from_json is not None: 129 | obj = read_json(from_json) 130 | args = (obj, *args) 131 | 132 | elif self.require_first and len(args) > 0 and isinstance(args[0], str): 133 | obj = read_json(args[0]) 134 | args = (obj, *args[1:]) 135 | 136 | return self.call(*args, **kwargs) 137 | 138 | 139 | class support_dataframe(Decorator): 140 | """decorator for supporting dataframe input, applying the function column by column 141 | """ 142 | require_target = True 143 | target = 'target' 144 | 145 | def 
wrapper(self, frame, *args, **kwargs): 146 | if not isinstance(frame, pd.DataFrame): 147 | return self.call(frame, *args, **kwargs) 148 | 149 | frame = frame.copy() 150 | if self.require_target and len(args) > 0 and isinstance(args[0], str): 151 | target = frame.pop(args[0]) 152 | args = (target,) + args[1:] 153 | elif self.target in kwargs and isinstance(kwargs[self.target], str): 154 | kwargs[self.target] = frame.pop(kwargs[self.target]) 155 | 156 | res = dict() 157 | for col in frame: 158 | r = self.call(frame[col], *args, **kwargs) 159 | 160 | if not isinstance(r, np.ndarray): 161 | r = [r] 162 | 163 | res[col] = r 164 | return pd.DataFrame(res) 165 | 166 | 167 | class proxy_docstring(Decorator): 168 | method_name = None 169 | 170 | def __get__(self, *args): 171 | func = super().__get__(*args) 172 | 173 | if self.method_name is not None and hasattr(self._cls, self.method_name): 174 | setattr(func, '__doc__', getattr(self._cls, self.method_name).__doc__) 175 | 176 | return func 177 | 178 | 179 | class support_numpy(Decorator): 180 | """decorator that lets a torch function accept numpy arrays 181 | """ 182 | def wrapper(self, *args, **kwargs): 183 | import torch 184 | 185 | has_numpy = False 186 | l_args = [] 187 | for a in args: 188 | if not isinstance(a, torch.Tensor): 189 | a = torch.tensor(a) 190 | has_numpy = True 191 | 192 | l_args.append(a) 193 | 194 | res = self.call(*l_args, **kwargs) 195 | 196 | # convert the result back to a numpy array only when the arguments contained one 197 | if has_numpy and isinstance(res, torch.Tensor): 198 | res = res.numpy() 199 | 200 | return res 201 | 202 | 203 | class xgb_loss(Decorator): 204 | """decorator for converting a function into an xgboost-compatible loss function 205 | 206 | Args: 207 | loss_func (callable): loss function 208 | **kwargs: other arguments for the loss function except `pred` and `label` 209 | 210 | Examples: 211 | 212 | >>> @xgb_loss(**kwargs) 213 | >>> def loss_func(pred, label, **kwargs): 214 | >>> ... 215 | >>> return loss 216 | >>> 217 | >>> # or use `xgb_loss` directly 218 | >>> xgb_func = xgb_loss(**kwargs)(loss_func) 219 | >>> 220 | >>> # use in xgb 221 | >>> model = xgb.XGBClassifier(objective = xgb_func) 222 | """ 223 | def wrapper(self, pred, label): 224 | from .func import derivative 225 | 226 | def partial_func(x): 227 | return self.call(x, label, **self.kwargs) 228 | 229 | grad = derivative(partial_func, pred, n=1, dx=1e-6) 230 | hess = derivative(partial_func, pred, n=2, dx=1e-6) 231 | 232 | return grad, hess 233 | 234 | 235 | class performance(Decorator): 236 | """decorator for analyzing code performance 237 | 238 | Args: 239 | loop (int): loop times, default `1` 240 | 241 | Examples: 242 | >>> @performance(loop = 100) 243 | >>> def func(): 244 | >>> ... # code 245 | >>> return res 246 | >>> 247 | >>> func() 248 | >>> 249 | >>> # or use `performance` in a `with` statement 250 | >>> with performance(): 251 | >>> ... 
# code 252 | """ 253 | loop = 1 254 | 255 | def wrapper(self, *args, **kwargs): 256 | costs = [] 257 | for _ in range(self.loop): 258 | start = time() 259 | res = self.call(*args, **kwargs) 260 | end = time() 261 | costs.append(end - start) 262 | 263 | self.analysis(costs) 264 | return res 265 | 266 | 267 | def analysis(self, costs): 268 | import numpy as np 269 | 270 | print('total cost: {:.5f}s'.format(np.sum(costs))) 271 | print("-"*40) 272 | data = { 273 | "Mean": np.mean(costs), 274 | "Min": np.min(costs), 275 | "Max": np.max(costs), 276 | "90%": np.percentile(costs, 90), 277 | "95%": np.percentile(costs, 95), 278 | "99%": np.percentile(costs, 99), 279 | } 280 | HEADER = "{:>8}"*len(data) 281 | BODY = "{:>7.3f}s"*len(data) 282 | print(HEADER.format(*data.keys())) 283 | print(BODY.format(*data.values())) 284 | 285 | 286 | def __enter__(self): 287 | self.start = time() 288 | return self 289 | 290 | def __exit__(self, exc_type, exc_value, traceback): 291 | self.end = time() 292 | self.analysis([self.end - self.start]) 293 | -------------------------------------------------------------------------------- /toad/utils/decorator_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from .decorator import ( 6 | Decorator, 7 | frame_exclude, 8 | xgb_loss, 9 | performance, 10 | ) 11 | 12 | np.random.seed(1) 13 | 14 | 15 | def func(): 16 | "This is a doc for method" 17 | pass 18 | 19 | 20 | def test_decorator_doc(): 21 | f = frame_exclude(func) 22 | 23 | assert f.__doc__ == 'This is a doc for method' 24 | 25 | 26 | def test_decorator_init_func(): 27 | class a(Decorator): 28 | def setup_func(self, func): 29 | return sum 30 | 31 | f = a(func) 32 | 33 | assert f([10, 20]) == 30 34 | 35 | 36 | def test_decorator_inherit(): 37 | class a(Decorator): 38 | bias = 0 39 | def wrapper(self, *args, a = 0, **kwargs): 40 | return self.call(a + self.bias) 41 | 42 | class b(a): 43 | def wrapper(self, *args, b = 0, **kwargs): 44 | a = super().wrapper(*args, **kwargs) 45 | b = self.call(b) 46 | return a + b 47 | 48 | f = b(bias = 2)(lambda x: x+1) 49 | assert f(a = 1, b = 2) == 7 50 | 51 | 52 | def test_xgb_loss(): 53 | def loss(x, y): 54 | return np.abs(x - y).sum() 55 | 56 | xgb_l = xgb_loss(loss) 57 | grad, hess = xgb_l(np.arange(3), np.arange(3, 6)) 58 | 59 | assert grad == pytest.approx(-3.0) 60 | assert hess == pytest.approx(0.0) 61 | 62 | 63 | def test_performance(): 64 | @performance(loop = 10) 65 | def func(x): 66 | from time import sleep 67 | sleep(0.01) 68 | return x**x 69 | 70 | assert func(2) == 4 71 | 72 | 73 | def test_performance_with_clause(): 74 | def func(x): 75 | from time import sleep 76 | sleep(0.01) 77 | return x**x 78 | 79 | with performance(): 80 | res = func(2) 81 | 82 | assert res == 4 83 | -------------------------------------------------------------------------------- /toad/utils/func_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from .func import ( 7 | np_unique, 8 | fillna, 9 | clip, 10 | diff_time_frame, 11 | bin_to_number, 12 | generate_target, 13 | generate_str, 14 | get_dummies, 15 | feature_splits, 16 | ) 17 | 18 | np.random.seed(1) 19 | feature = np.random.rand(500) 20 | target = np.random.randint(2, size = 500) 21 | 22 | 23 | 24 | def test_fillna(): 25 | res = fillna(np.array([1, 2, 3, np.nan, 4, 5])) 26 | assert res[3] == -1 27 | 28 | 29 
| def test_np_unique(): 30 | res = np_unique(np.array([np.nan, np.nan, np.nan])) 31 | assert len(res) == 1 32 | 33 | 34 | def test_clip(): 35 | res1 = clip(feature, quantile = (.05, .95)) 36 | res2 = clip(feature, quantile = 0.05) 37 | np.testing.assert_array_equal(res1, res2) 38 | 39 | 40 | def test_feature_splits(): 41 | value = feature_splits(feature, target) 42 | assert len(value) == 243 43 | 44 | 45 | @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher") 46 | def test_diff_time_frame(): 47 | time_data = [ 48 | { 49 | 'base': '2018-01', 50 | 'time1': '2018-04', 51 | 'time2': '2018-04-02', 52 | }, 53 | { 54 | 'base': '2018-01', 55 | 'time1': '2018-05', 56 | 'time2': '2018-04-05', 57 | }, 58 | { 59 | 'base': '2018-02', 60 | 'time1': '2018-04', 61 | 'time2': '2018-04-10', 62 | }, 63 | ] 64 | 65 | frame = pd.DataFrame(time_data) 66 | res = diff_time_frame(frame['base'], frame[['time1', 'time2']]) 67 | assert res.iloc[0, 1] == 91 68 | 69 | 70 | def test_bin_to_number(): 71 | s = pd.Series([ 72 | '1', 73 | '1-100', 74 | '-', 75 | '100-200', 76 | np.nan, 77 | '200-300', 78 | '300', 79 | '100-200', 80 | '>500', 81 | ]) 82 | 83 | res = s.apply(bin_to_number()) 84 | assert res[3] == 150 85 | 86 | def test_bin_to_number_for_frame(): 87 | df = pd.DataFrame([ 88 | { 89 | 'area_1': '100-200', 90 | 'area_2': '150~200', 91 | }, 92 | { 93 | 'area_1': '300-400', 94 | 'area_2': '200~250', 95 | }, 96 | { 97 | 'area_1': '200-300', 98 | 'area_2': '450~500', 99 | }, 100 | { 101 | 'area_1': '100-200', 102 | 'area_2': '250~300', 103 | }, 104 | ]) 105 | 106 | res = df.applymap(bin_to_number()) 107 | assert res.loc[1, 'area_2'] == 225 108 | 109 | def test_generate_target(): 110 | t = generate_target(len(feature), rate = 0.3, weight = feature) 111 | rate = t.sum() / len(t) 112 | assert rate == 0.3 113 | 114 | @pytest.mark.skip(reason = "result depends on global random state") 115 | def test_generate_str(): 116 | s = generate_str(size = 8) 117 | assert s == 'EPL5MTQK' 118 | 119 | def test_get_dummies_binary(): 120 | ab = np.array(list('ABCDEFG')) 121 | df = pd.DataFrame({ 122 | 'binary': ab[np.random.choice(2, 500)], 123 | 'multiple': ab[np.random.choice(5, 500)], 124 | }) 125 | data = get_dummies(df, binary_drop = True) 126 | 127 | assert 'binary_A' not in data.columns 128 | -------------------------------------------------------------------------------- /toad/utils/mixin.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | from copy import deepcopy 4 | from .decorator import save_to_json, load_from_json 5 | 6 | 7 | DEFAULT_NAME = '_feature_default_name_' 8 | 9 | 10 | class RulesMixin: 11 | _rules = {} 12 | 13 | def _parse_rule(self, rule): 14 | return rule 15 | 16 | def _format_rule(self, rule): 17 | return rule 18 | 19 | def default_rule(self): 20 | if len(self._rules) == 1: 21 | # return the only rule as default 22 | return next(iter(self._rules.values())) 23 | 24 | if self._default_name not in self._rules: 25 | raise Exception('cannot get default rule') 26 | 27 | return self._rules[self._default_name] 28 | 29 | @property 30 | def _default_name(self): 31 | return DEFAULT_NAME 32 | 33 | @property 34 | def rules(self): 35 | return self._rules 36 | 37 | @rules.setter 38 | def rules(self, value): 39 | self._rules = value 40 | 41 | 42 | @load_from_json(is_class = True, require_first = True) 43 | def load(self, rules, update = False, **kwargs): 44 | """load rules from dict or json file 45 | 46 | Args: 47 | rules (dict): dictionary of rules 48 | 
from_json (str|IOBase): json file of rules 49 | update (bool): whether to update existing rules instead of replacing them 50 | """ 51 | rules = deepcopy(rules) 52 | 53 | if not isinstance(rules, dict): 54 | rules = { 55 | self._default_name: rules, 56 | } 57 | 58 | for key in rules: 59 | rules[key] = self._parse_rule(rules[key], **kwargs) 60 | 61 | if update: 62 | self._rules.update(rules) 63 | else: 64 | self._rules = rules 65 | 66 | if hasattr(self, 'after_load'): 67 | self.after_load(rules) 68 | 69 | return self 70 | 71 | @save_to_json(is_class = True) 72 | def export(self, **kwargs): 73 | """export rules to dict or a json file 74 | 75 | Args: 76 | to_json (str|IOBase): json file to save rules 77 | 78 | Returns: 79 | dict: dictionary of rules 80 | """ 81 | res = {} 82 | for key in self._rules: 83 | res[key] = self._format_rule(self._rules[key], **kwargs) 84 | 85 | if hasattr(self, 'after_export'): 86 | res = self.after_export(res, **kwargs) 87 | 88 | return res 89 | 90 | def update(self, *args, **kwargs): 91 | """update rules 92 | 93 | Args: 94 | rules (dict): dictionary of rules 95 | from_json (str|IOBase): json file of rules 96 | """ 97 | return self.load(*args, update = True, **kwargs) 98 | 99 | 100 | def __len__(self): 101 | return len(self._rules.keys()) 102 | 103 | def __contains__(self, key): 104 | return key in self._rules 105 | 106 | def __getitem__(self, key): 107 | return self._rules[key] 108 | 109 | def __setitem__(self, key, value): 110 | self._rules[key] = value 111 | 112 | def __iter__(self): 113 | return iter(self._rules) 114 | 115 | 116 | 117 | 118 | RE_NUM = r'-?\d+(\.\d+)?' 119 | RE_SEP = r'[~-]' 120 | RE_BEGIN = r'(-inf|{num})'.format(num = RE_NUM) 121 | RE_END = r'(inf|{num})'.format(num = RE_NUM) 122 | RE_RANGE = r'\[{begin}\s*{sep}\s*{end}\)'.format( 123 | begin = RE_BEGIN, 124 | end = RE_END, 125 | sep = RE_SEP, 126 | ) 127 | 128 | 129 | 130 | 131 | 132 | class BinsMixin: 133 | EMPTY_BIN = -1 134 | ELSE_GROUP = 'else' 135 | NUMBER_EXP = re.compile(RE_RANGE) 136 | 137 | @classmethod 138 | def parse_bins(self, bins): 139 | """parse labeled bins to array 140 | """ 141 | if self._is_numeric(bins): 142 | return self._numeric_parser(bins) 143 | 144 | l = list() 145 | 146 | for item in bins: 147 | if item == self.ELSE_GROUP: 148 | l.append(item) 149 | else: 150 | l.append(item.split(',')) 151 | 152 | return np.array(l, dtype = object) 153 | 154 | 155 | @classmethod 156 | def format_bins(self, bins, index = False, ellipsis = None): 157 | """format bins to labels 158 | 159 | Args: 160 | bins (ndarray): bins to format 161 | index (bool): whether to add an index prefix 162 | ellipsis (int): max label length before truncating with an ellipsis, `None` for skipping truncation 163 | 164 | Returns: 165 | ndarray: array of labels 166 | """ 167 | l = list() 168 | 169 | if np.issubdtype(bins.dtype, np.number): 170 | has_empty = len(bins) > 0 and np.isnan(bins[-1]) 171 | 172 | if has_empty: 173 | bins = bins[:-1] 174 | 175 | sp_l = [-np.inf] + bins.tolist() + [np.inf] 176 | for i in range(len(sp_l) - 1): 177 | l.append('['+str(sp_l[i])+' ~ '+str(sp_l[i+1])+')') 178 | 179 | if has_empty: 180 | l.append('nan') 181 | else: 182 | for keys in bins: 183 | if isinstance(keys, str) and keys == self.ELSE_GROUP: 184 | l.append(keys) 185 | else: 186 | label = ','.join(keys) 187 | if ellipsis is not None: 188 | label = label[:ellipsis] + '..' 
if len(label) > ellipsis else label 189 | l.append(label) 190 | 191 | if index: 192 | l = ["{:02}.{}".format(ix, lab) for ix, lab in enumerate(l)] 193 | 194 | return np.array(l) 195 | 196 | 197 | @classmethod 198 | def _is_numeric(self, bins): 199 | m = self.NUMBER_EXP.match(bins[0]) 200 | 201 | return m is not None 202 | 203 | @classmethod 204 | def _numeric_parser(self, bins): 205 | l = list() 206 | 207 | for item in bins: 208 | 209 | if item == 'nan': 210 | l.append(np.nan) 211 | continue 212 | 213 | m = self.NUMBER_EXP.match(item) 214 | split = m.group(3) 215 | 216 | if split == 'inf': 217 | # split = np.inf 218 | continue 219 | 220 | split = float(split) 221 | 222 | l.append(split) 223 | 224 | return np.array(l) 225 | -------------------------------------------------------------------------------- /toad/utils/mixin_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from .mixin import RulesMixin, BinsMixin 4 | 5 | np.random.seed(1) 6 | 7 | class RulesObject(RulesMixin): 8 | def _parse_rule(self, rule): 9 | return { 10 | 'rule': rule 11 | } 12 | 13 | 14 | def _format_rule(self, rule): 15 | return 'rule -> %s' % rule['rule'] 16 | 17 | 18 | rules = {'A': 'rule_A'} 19 | 20 | def test_rule_parse(): 21 | r = RulesObject().load(rules) 22 | assert r.rules['A']['rule'] == 'rule_A' 23 | 24 | def test_rule_format(): 25 | r = RulesObject().load(rules) 26 | assert r.export()['A'] == 'rule -> rule_A' 27 | 28 | def test_save_update(): 29 | r = RulesObject().load(rules) 30 | r.update({'A': 'update_A'}) 31 | assert r.rules['A']['rule'] == 'update_A' 32 | 33 | def test_format_bins(): 34 | obj = BinsMixin() 35 | formated = obj.format_bins(np.array([2,4,6])) 36 | expect = ['[-inf ~ 2)', '[2 ~ 4)', '[4 ~ 6)', '[6 ~ inf)'] 37 | assert all([a == b for a, b in zip(formated, expect)]) 38 | 39 | def test_format_bins_with_index(): 40 | obj = BinsMixin() 41 | formated = obj.format_bins(np.array([2,4,6]), index = True) 42 | assert '01.[2 ~ 4)' in formated 43 | 44 | def test_format_bins_with_ellipsis(): 45 | obj = BinsMixin() 46 | formated = obj.format_bins(np.array([['A', 'B', 'C'], ['D', 'E']], dtype = object), ellipsis = 3) 47 | assert formated[0] == 'A,B..' 
and formated[1] == 'D,E' 48 | -------------------------------------------------------------------------------- /toad/utils/pickletracer.py: -------------------------------------------------------------------------------- 1 | import cloudpickle 2 | from pickle import Unpickler 3 | from cloudpickle import CloudPickler 4 | 5 | _global_tracer = None 6 | 7 | def get_current_tracer(): 8 | global _global_tracer 9 | # if _global_tracer is None: 10 | # raise ValueError("tracer is not initialized") 11 | return _global_tracer 12 | 13 | 14 | class Unpickler(Unpickler): 15 | """trace object dependencies during unpickling""" 16 | def find_class(self, module, name): 17 | tracer = get_current_tracer() 18 | tracer.add(module) 19 | return super().find_class(module, name) 20 | 21 | 22 | class Pickler(CloudPickler): 23 | """trace object dependencies during pickling""" 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | 27 | import types 28 | self._reduce_module = CloudPickler.dispatch_table[types.ModuleType] 29 | self.dispatch_table[types.ModuleType] = self.reduce_module 30 | 31 | 32 | def reduce_module(self, obj): 33 | tracer = get_current_tracer() 34 | tracer.add(obj.__name__) 35 | return self._reduce_module(obj) 36 | 37 | 38 | def __setattr__(self, name, value): 39 | if name == 'persistent_id': 40 | # fix torch module: skip persistent_id for torch Modules 41 | def wrapper_func(obj): 42 | from torch.nn import Module 43 | if isinstance(obj, Module): 44 | return None 45 | 46 | return value(obj) 47 | 48 | return super().__setattr__(name, wrapper_func) 49 | 50 | return super().__setattr__(name, value) 51 | 52 | 53 | class Tracer: 54 | def __init__(self): 55 | import re 56 | 57 | self._modules = set() 58 | self._ignore_modules = {"builtins"} 59 | self._temp_dispatch_table = {} 60 | 61 | # match python site packages path 62 | self._regex = re.compile(r".*python[\d\.]+\/site-packages/[\w-]+") 63 | 64 | def add(self, module): 65 | root = module.split(".")[0] 66 | 67 | if root in self._ignore_modules: 68 | return 69 | 70 | self._modules.add(root) 71 | 72 | def trace(self, obj): 73 | """trace `obj` by pickling and unpickling it 74 | """ 75 | import io 76 | dummy = io.BytesIO() 77 | 78 | with self: 79 | Pickler(dummy).dump(obj) 80 | dummy.seek(0) 81 | Unpickler(dummy).load() 82 | 83 | return self.get_deps() 84 | 85 | 86 | def get_deps(self): 87 | import sys 88 | 89 | deps = { 90 | "pip": [], 91 | "files": [], 92 | } 93 | 94 | for name in self._modules: 95 | if name not in sys.modules: 96 | # TODO: should raise error 97 | continue 98 | 99 | module = sys.modules[name] 100 | # package module 101 | if self._regex.match(module.__spec__.origin): 102 | # TODO: split pip and conda pkg 103 | deps["pip"].append(module) 104 | continue 105 | 106 | # local file module 107 | deps["files"].append(module) 108 | 109 | return deps 110 | 111 | 112 | def __enter__(self): 113 | global _global_tracer 114 | if _global_tracer is not None: 115 | raise ValueError("a tracer already exists") 116 | 117 | # save the CloudPickler global dispatch table 118 | self._temp_dispatch_table = CloudPickler.dispatch_table.copy() 119 | # setup the global tracer 120 | _global_tracer = self 121 | return self 122 | 123 | def __exit__(self, exc_type, exc_val, exc_tb): 124 | global _global_tracer 125 | 126 | # restore the CloudPickler dispatch table 127 | CloudPickler.dispatch_table = self._temp_dispatch_table 128 | # clean the global tracer 129 | _global_tracer = None 130 | 131 | 132 | 133 | 134 | def dump(obj, file, *args, **kwargs): 135 | return Pickler(file).dump(obj) 136 
| 137 | 138 | def load(file, *args, **kwargs): 139 | return Unpickler(file).load() 140 | 141 | -------------------------------------------------------------------------------- /toad/utils/pickletracer_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | from .pickletracer import Tracer, get_current_tracer 4 | 5 | 6 | def test_tracer_with_clause(): 7 | assert get_current_tracer() is None 8 | with Tracer() as t: 9 | assert get_current_tracer() == t 10 | 11 | assert get_current_tracer() is None 12 | 13 | 14 | @pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") 15 | def test_trace_pyfunc(): 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.linear_model import LinearRegression 19 | X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) 20 | # y = 1 * x_0 + 2 * x_1 + 3 21 | y = np.dot(X, np.array([1, 2])) + 3 22 | reg = LinearRegression().fit(X, y) 23 | reg.score(X, y) 24 | 25 | def func(data): 26 | # data = dfunc(data) 27 | df = pd.DataFrame(data) 28 | return df 29 | 30 | class Model: 31 | def __init__(self, model, pref): 32 | self.model = model 33 | self.pref = pref 34 | 35 | def predict(self, data): 36 | data = self.pref(data) 37 | return self.model.predict(data) 38 | 39 | 40 | m = Model(reg, func) 41 | 42 | deps = Tracer().trace(m) 43 | 44 | assert set([m.__name__ for m in deps['pip']]) == set(['numpy', 'pandas', 'cloudpickle', 'sklearn']) 45 | 46 | 47 | def test_default_cloudpickle(): 48 | import pandas as pd 49 | 50 | def func(data): 51 | # data = dfunc(data) 52 | df = pd.DataFrame(data) 53 | return df 54 | 55 | deps = Tracer().trace(func) 56 | 57 | import io 58 | import cloudpickle 59 | 60 | dummy = io.BytesIO() 61 | # this should be correct after trace object 62 | # test for restore cloudpickle global dispatch table 63 | cloudpickle.dump(func, dummy) 64 | -------------------------------------------------------------------------------- /toad/utils/progress/__init__.py: -------------------------------------------------------------------------------- 1 | from .progress import Progress 2 | -------------------------------------------------------------------------------- /toad/utils/progress/pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .progress import Progress 4 | 5 | 6 | class ProgressAccessor: 7 | def __init__(self, obj): 8 | self.obj = obj 9 | 10 | def apply(self, func, *args, **kwargs): 11 | if isinstance(self.obj, pd.Series): 12 | l = len(self.obj) 13 | else: 14 | # dataframe 15 | axis = kwargs.get("axis", 0) 16 | if axis == 'index': 17 | axis = 0 18 | elif axis == 'columns': 19 | axis = 1 20 | 21 | l = self.obj.size // self.obj.shape[axis] 22 | 23 | p = iter(Progress(range(l))) 24 | 25 | def wrapper(*args, **kwargs): 26 | next(p) 27 | return func(*args, **kwargs) 28 | 29 | res = self.obj.apply(wrapper, *args, **kwargs) 30 | p.end() 31 | return res 32 | 33 | 34 | class pandas_enable: 35 | def __init__(self): 36 | pd.api.extensions.register_dataframe_accessor("progress")(ProgressAccessor) 37 | pd.api.extensions.register_series_accessor("progress")(ProgressAccessor) 38 | 39 | def __enter__(self): 40 | return self 41 | 42 | def __exit__(self, exce_type, exce_value, exce_trace): 43 | pandas_disable() 44 | 45 | 46 | def pandas_disable(): 47 | if hasattr(pd.DataFrame, 'progress'): 48 | delattr(pd.DataFrame, 'progress') 49 | 50 | if hasattr(pd.Series, 'progress'): 51 | delattr(pd.Series, 'progress') 52 | 
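A minimal usage sketch of the `.progress` accessor registered above (illustrative data only; assumes `toad` is installed so `toad.utils.progress.pandas` is importable):

import numpy as np
import pandas as pd

from toad.utils.progress.pandas import pandas_enable

df = pd.DataFrame({"A": np.random.rand(10000)})

# the accessor only exists inside the `with` block and is removed again on exit
with pandas_enable():
    cols = df.progress.apply(lambda col: col * 2)              # column-wise (axis = 0)
    rows = df.progress.apply(lambda row: row.sum(), axis = 1)  # row-wise
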
-------------------------------------------------------------------------------- /toad/utils/progress/pandas_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from .pandas import pandas_enable, pandas_disable 4 | 5 | 6 | 7 | def test_pandas_with(): 8 | assert hasattr(pd.DataFrame, 'progress') == False 9 | assert hasattr(pd.Series, 'progress') == False 10 | with pandas_enable(): 11 | assert hasattr(pd.DataFrame, 'progress') == True 12 | assert hasattr(pd.Series, 'progress') == True 13 | assert hasattr(pd.DataFrame, 'progress') == False 14 | assert hasattr(pd.Series, 'progress') == False 15 | 16 | def test_pandas_disable(): 17 | assert hasattr(pd.DataFrame, 'progress') == False 18 | assert hasattr(pd.Series, 'progress') == False 19 | pandas_enable() 20 | assert hasattr(pd.DataFrame, 'progress') == True 21 | assert hasattr(pd.Series, 'progress') == True 22 | pandas_disable() 23 | assert hasattr(pd.DataFrame, 'progress') == False 24 | assert hasattr(pd.Series, 'progress') == False 25 | 26 | def test_dataframe_apply(): 27 | df = pd.DataFrame({ 28 | "A": np.random.rand(1000), 29 | "B": np.random.randint(10, size = (1000,)) 30 | }) 31 | 32 | with pandas_enable(): 33 | res = df.progress.apply(lambda x: x + 1) 34 | 35 | def test_dataframe_apply_axis(): 36 | df = pd.DataFrame({ 37 | "A": np.random.rand(1000), 38 | "B": np.random.randint(10, size = (1000,)) 39 | }) 40 | 41 | with pandas_enable(): 42 | res = df.progress.apply(lambda x: x + 1, axis = 1) 43 | 44 | 45 | def test_series_apply(): 46 | series = pd.Series(np.random.rand(2000)) 47 | 48 | with pandas_enable(): 49 | res = series.progress.apply(lambda x: x + 1) 50 | 51 | -------------------------------------------------------------------------------- /toad/utils/progress/progress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from time import time 3 | 4 | class Progress: 5 | """a text progress bar for iterables 6 | """ 7 | def __init__(self, iterable, size = None, interval = 0.1): 8 | """ 9 | Args: 10 | iterable: the iterable to wrap 11 | size (int): max size of iterable 12 | interval (float): minimum seconds between bar updates, default is `0.1` 13 | 14 | Attrs: 15 | BAR_LENGTH (int): bar length, default is `32` 16 | SYMBOL_DONE (str): symbol indicating completion 17 | SYMBOL_REST (str): symbol indicating the remaining part 18 | prefix (str): string template before progress bar 19 | suffix (str): string template after progress bar 20 | template (str): string template for rendering, `{prefix} {bar} {suffix}` 21 | """ 22 | self.iterable = iterable 23 | self.interval = interval 24 | 25 | self.batch = 1 26 | self.size = size 27 | if hasattr(iterable, '__len__'): 28 | self.size = len(iterable) 29 | 30 | # is pytorch dataloader 31 | if hasattr(iterable, 'batch_size'): 32 | self.batch = getattr(iterable, 'batch_size') 33 | self.size = len(iterable.dataset) 34 | 35 | 36 | self.reset() 37 | 38 | 39 | self.BAR_LENGTH = 32 40 | 41 | self.SYMBOL_DONE = '█' 42 | self.SYMBOL_REST = '.' 
43 | self.prefix = "" 44 | self.suffix = "" 45 | 46 | if self.size is None: 47 | self.template = "{prefix} {done} iters {time:.2f}s {tps}it/s {suffix}" 48 | else: 49 | self.template = "{prefix} {percent:3.0f}%|{bar}| [{done}/{size}] {time:.2f}s {suffix}" 50 | 51 | 52 | def __len__(self): 53 | return self.size 54 | 55 | 56 | def __iter__(self): 57 | self.reset() 58 | self.iterator = iter(self.iterable) 59 | return self 60 | 61 | 62 | def __next__(self): 63 | try: 64 | return self.next() 65 | except StopIteration as e: 66 | self.end() 67 | raise e 68 | 69 | 70 | def reset(self): 71 | # reset index 72 | self.idx = 0 73 | 74 | # reset time 75 | self.time = None 76 | self.start_time = time() 77 | self._last_time = self.start_time 78 | self.iterator = iter(self.iterable) 79 | 80 | 81 | def next(self): 82 | item = next(self.iterator) 83 | self.update() 84 | return item 85 | 86 | 87 | def update(self, idx = None, force = False): 88 | # update idx 89 | if idx is None: 90 | idx = self.idx + 1 91 | 92 | self.idx = idx 93 | 94 | curr_time = time() 95 | self.time = curr_time - self.start_time 96 | 97 | # skip update if delta is too small 98 | if not force and curr_time - self._last_time < self.interval: 99 | return 100 | 101 | self._last_time = curr_time 102 | 103 | # update bar 104 | self.flush() 105 | 106 | 107 | def end(self): 108 | """progress end 109 | """ 110 | self.update(idx = self.idx, force = True) 111 | self.print('\n') 112 | 113 | 114 | def flush(self): 115 | if self.size is None: 116 | done = self.idx * self.batch 117 | percent = 0 118 | bar = None 119 | else: 120 | done = min(self.idx * self.batch, self.size) 121 | percent = done / self.size 122 | 123 | bar = (self.SYMBOL_DONE * int(percent * self.BAR_LENGTH)).ljust(self.BAR_LENGTH, self.SYMBOL_REST) 124 | 125 | self.print('\r' + self.template.format( 126 | percent = percent * 100, 127 | bar = bar, 128 | done = done, 129 | size = self.size, 130 | time = self.time, 131 | tps = done / max(self.time, 1), 132 | prefix = self.prefix, 133 | suffix = self.suffix, 134 | )) 135 | 136 | 137 | def print(self, text): 138 | sys.stdout.write(text) 139 | sys.stdout.flush() 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /toad/utils/progress/progress_test.py: -------------------------------------------------------------------------------- 1 | from time import sleep, time 2 | from .progress import Progress 3 | 4 | 5 | class TestIterator: 6 | def __init__(self, size): 7 | self._size = size 8 | 9 | def __iter__(self): 10 | for i in range(self._size): 11 | yield i 12 | 13 | 14 | def test_progress(): 15 | p = Progress(range(100)) 16 | for i in p: 17 | sleep(0.01) 18 | assert p.idx == 100 19 | 20 | def test_progress_size(): 21 | p = Progress(range(9527)) 22 | assert p.size == 9527 23 | 24 | def test_iterator(): 25 | ti = TestIterator(100) 26 | p = Progress(ti) 27 | for i in p: 28 | sleep(0.01) 29 | assert p.idx == 100 30 | 31 | 32 | def test_multi_loop(): 33 | p = Progress(range(100)) 34 | for i in p: 35 | sleep(0.01) 36 | assert p.idx == 100 37 | 38 | for i in p: 39 | sleep(0.01) 40 | assert p.idx == 100 41 | 42 | def test_speed(): 43 | p = Progress(range(1000)) 44 | for i in p: 45 | sleep(0.001) 46 | assert p.idx == 1000 47 | -------------------------------------------------------------------------------- /toad/version.py: -------------------------------------------------------------------------------- 1 | __version_info__ = (0, 1, 5, 'final', 0) 2 | 3 | def get_version(version): 4 | 
main = '.'.join(str(x) for x in version[:3]) 5 | 6 | if version[3] == 'final': 7 | return main 8 | 9 | symbol = { 10 | 'alpha': 'a', 11 | 'beta': 'b', 12 | 'rc': 'rc', 13 | } 14 | 15 | return main + symbol[version[3]] + str(version[4]) 16 | 17 | __version__ = get_version(__version_info__) 18 | --------------------------------------------------------------------------------
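As a worked example of the version scheme above, a short sketch of what `get_version` returns for a few tuples (plain assertions; assumes `toad` is importable so `toad.version` resolves):

from toad.version import get_version

# 'final' releases drop the suffix entirely
assert get_version((0, 1, 5, 'final', 0)) == '0.1.5'
# pre-releases append the abbreviated stage symbol and serial number
assert get_version((0, 1, 5, 'beta', 2)) == '0.1.5b2'
assert get_version((1, 2, 0, 'rc', 1)) == '1.2.0rc1'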