├── .all-contributorsrc
├── .gitignore
├── LICENSE
├── README.md
├── arguments.py
├── baselines
│   ├── canonical_ensemble.py
│   └── canonical_resampling.py
├── data
│   ├── Mammo_0.1noised_test.csv
│   ├── Mammo_0.1noised_train.csv
│   ├── Mammo_0.1noised_valid.csv
│   ├── Mammo_0.25noised_test.csv
│   ├── Mammo_0.25noised_train.csv
│   ├── Mammo_0.25noised_valid.csv
│   ├── Mammo_0.4noised_test.csv
│   ├── Mammo_0.4noised_train.csv
│   ├── Mammo_0.4noised_valid.csv
│   ├── Mammo_test.csv
│   ├── Mammo_train.csv
│   └── Mammo_valid.csv
├── environment.py
├── main.py
├── mesa-example.ipynb
├── mesa.py
├── requirements.txt
├── sac_src
│   ├── model.py
│   ├── replay_memory.py
│   ├── sac.py
│   └── utils.py
└── utils.py
/.all-contributorsrc:
--------------------------------------------------------------------------------
1 | {
2 | "files": [
3 | "README.md"
4 | ],
5 | "imageSize": 100,
6 | "commit": false,
7 | "badgeTemplate": "
-orange.svg\">",
8 | "contributors": [
9 | {
10 | "login": "ZhiningLiu1998",
11 | "name": "Zhining Liu",
12 | "avatar_url": "https://avatars.githubusercontent.com/u/26108487?v=4",
13 | "profile": "http://zhiningliu.com",
14 | "contributions": [
15 | "ideas",
16 | "code"
17 | ]
18 | }
19 | ],
20 | "contributorsPerLine": 7,
21 | "projectName": "mesa",
22 | "projectOwner": "ZhiningLiu1998",
23 | "repoType": "github",
24 | "repoHost": "https://github.com",
25 | "skipCi": true
26 | }
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Zhining Liu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
MESA: Meta-sampler for imbalanced learning
2 |
3 |
4 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 | MESA: Boost Ensemble Imbalanced Learning with MEta-SAmpler (NeurIPS 2020)
33 |
34 |
35 |
43 |
44 | **MESA is a ***meta-learning-based ensemble learning framework*** for solving class-imbalanced learning problems. It is a task-agnostic general-purpose solution that is able to boost most of the existing machine learning models' performance on imbalanced data.**
45 |
46 |
47 |
48 | # Cite Us
49 |
50 | **If you find this repository helpful in your work or research, we would greatly appreciate citations to the following paper:**
51 |
52 | ```
53 | @inproceedings{liu2020mesa,
54 | title={MESA: Boost Ensemble Imbalanced Learning with MEta-SAmpler},
55 | author={Liu, Zhining and Wei, Pengfei and Jiang, Jing and Cao, Wei and Bian, Jiang and Chang, Yi},
56 | booktitle={Conference on Neural Information Processing Systems},
57 | year={2020},
58 | }
59 | ```
60 |
61 | # Table of Contents
62 |
63 | - [Cite Us](#cite-us)
64 | - [Table of Contents](#table-of-contents)
65 | - [Background](#background)
66 | - [About MESA](#about-mesa)
67 | - [Pros and Cons of MESA](#pros-and-cons-of-mesa)
68 | - [Requirements](#requirements)
69 | - [Usage](#usage)
70 | - [Running main.py](#running-mainpy)
71 | - [Running mesa-example.ipynb](#running-mesa-exampleipynb)
72 | - [Visualization and Results](#visualization-and-results)
73 | - [From mesa-example.ipynb](#from-mesa-exampleipynb)
74 | - [Class distribution of Mammography dataset](#class-distribution-of-mammography-dataset)
75 | - [Visualize the meta-training process](#visualize-the-meta-training-process)
76 | - [Comparison with baseline methods](#comparison-with-baseline-methods)
77 | - [Other results](#other-results)
78 | - [Dataset description](#dataset-description)
79 | - [Comparisons of MESA with under-sampling-based EIL methods](#comparisons-of-mesa-with-under-sampling-based-eil-methods)
80 | - [Comparisons of MESA with over-sampling-based EIL methods](#comparisons-of-mesa-with-over-sampling-based-eil-methods)
81 | - [Comparisons of MESA with resampling-based EIL methods](#comparisons-of-mesa-with-resampling-based-eil-methods)
82 | - [Miscellaneous](#miscellaneous)
83 | - [References](#references)
84 | - [Contributors ✨](#contributors-)
85 |
86 |
87 | # Background
88 |
89 | ## About MESA
90 |
91 | We introduce a novel ensemble imbalanced learning (EIL) framework named MESA. It adaptively resamples the training set over iterations to obtain multiple classifiers and forms a cascade ensemble model. Rather than following random heuristics, MESA directly learns a parameterized sampling strategy (i.e., the meta-sampler) from data to optimize the final metric. It consists of three parts: ***meta-sampling*** and ***ensemble training*** to build the ensemble classifier, and ***meta-training*** to optimize the meta-sampler.
92 |
93 | The figure below gives an overview of the MESA framework.
94 |
95 | 
96 |
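To make this workflow concrete, below is a minimal, illustrative sketch of the two ingredients the meta-sampler works with: the meta-state (normalized error histograms on the training and validation sets, hence a state of size `2 * num_bins`) and the Gaussian meta-sampling step that draws a strictly balanced majority subset around the sampler's output `mu` (spread controlled by `--sigma`). This is a simplification for intuition only, not the actual implementation in `mesa.py`; the function names are made up.

```python
import numpy as np

def error_histogram(errors, num_bins=5):
    # One half of the meta-state: normalized histogram of per-instance errors
    # (computed on the training set; the other half comes from the validation set).
    hist, _ = np.histogram(errors, bins=num_bins, range=(0.0, 1.0))
    return hist / max(hist.sum(), 1)

def meta_sample_indices(errors_maj, n_min, mu, sigma=0.2, random_state=None):
    # Weight each majority instance by a Gaussian of its current ensemble error
    # around mu, then draw as many majority indices as there are minority samples
    # (assumes n_min <= len(errors_maj)), giving the balanced training subset
    # for the next base learner.
    rng = np.random.RandomState(random_state)
    w = np.exp(-((errors_maj - mu) ** 2) / (2 * sigma ** 2)) + 1e-12
    return rng.choice(len(errors_maj), size=n_min, replace=False, p=w / w.sum())
```
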
97 | ## Pros and Cons of MESA
98 |
99 | Here are some personal thoughts on the advantages and disadvantages of MESA. More discussions are welcome!
100 |
101 | **Pros:**
102 | - 🍎 *Wide compatibility.*
103 | We decoupled the model-training and meta-training processes in MESA, making it compatible with most existing machine learning models.
104 | - 🍎 *High data efficiency.*
105 | MESA performs strictly balanced under-sampling to train each base-learner in the ensemble. This makes it more data-efficient than other methods, especially on highly skewed data sets.
106 | - 🍎 *Good performance.*
107 | The sampling strategy is optimized directly for final generalization performance, which we expect to yield a better ensemble model.
108 | - 🍎 *Transferability.*
109 | We use only task-agnostic meta-information during meta-training, which means that a meta-sampler can be directly used in unseen new tasks, thereby greatly reducing the computational cost brought about by meta-training.
110 |
111 | **Cons:**
112 | - 🍏 *Meta-training cost.*
113 | Meta-training repeats the ensemble training process multiple times, which can be costly in practice (by shrinking the dataset used in meta-training, the computational cost can be reduced at the cost of a minor performance loss).
114 | - 🍏 *Need to set aside a separate validation set for training.*
115 | The meta-state is formed by computing the error distribution on both the training and validation sets.
116 | - 🍏 *Possible unstable performance on small datasets.*
117 | Small datasets may cause the obtained error distribution statistics to be inaccurate/unstable, which will interfere with the meta-training process.
118 |
119 | # Requirements
120 | **Main dependencies:**
121 | - [Python](https://www.python.org/) (>=3.5)
122 | - [PyTorch](https://pytorch.org/) (=1.0.0)
123 | - [Gym](https://gym.openai.com/) (>=0.17.3)
124 | - [pandas](https://pandas.pydata.org/) (>=0.23.4)
125 | - [numpy](https://numpy.org/) (>=1.11)
126 | - [scikit-learn](https://scikit-learn.org/stable/) (>=0.20.1)
127 | - [imbalanced-learn](https://imbalanced-learn.readthedocs.io/en/stable/index.html) (=0.5.0, optional, for baseline methods)
128 |
129 | To install requirements, run:
130 |
131 | ```Shell
132 | pip install -r requirements.txt
133 | ```
134 |
135 | > **NOTE**: this implementation requires an old version of PyTorch (v1.0.0).
136 | > You may want to create a new conda environment to run our code. The step-by-step guide is as follows (using the CPU build of PyTorch as an example):
137 | > - `conda create --name mesa python=3.7.11`
138 | > - `conda activate mesa`
139 | > - `conda install pytorch-cpu==1.0.0 torchvision-cpu==0.2.1 cpuonly -c pytorch`
140 | > - `pip install -r requirements.txt`
141 | >
142 | > These commands should get you ready to run MESA. If you have any further questions, please feel free to open an issue or drop me an email.
143 |
144 | # Usage
145 |
146 | A typical usage example:
147 |
148 | ```python
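# NOTE: imports are omitted in this snippet. It assumes `parser` from arguments.py,
# `Mesa` from mesa.py, `DecisionTreeClassifier` from sklearn.tree, and the
# `Rater` / `load_dataset` helpers shipped with this repository.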
149 | # load dataset & prepare environment
150 | args = parser.parse_args()
151 | rater = Rater(args.metric)
152 | X_train, y_train, X_valid, y_valid, X_test, y_test = load_dataset(args.dataset)
153 | base_estimator = DecisionTreeClassifier()
154 |
155 | # meta-training
156 | mesa = Mesa(
157 | args=args,
158 | base_estimator=base_estimator,
159 | n_estimators=10)
160 | mesa.meta_fit(X_train, y_train, X_valid, y_valid, X_test, y_test)
161 |
162 | # ensemble training
163 | mesa.fit(X_train, y_train, X_valid, y_valid)
164 |
165 | # evaluate
166 | y_pred_test = mesa.predict_proba(X_test)[:, 1]
167 | score = rater.score(y_test, y_pred_test)
168 | ```
169 |
170 | ## Running [main.py](https://github.com/ZhiningLiu1998/mesa/blob/master/main.py)
171 |
172 | Here is an example:
173 |
174 | ```powershell
175 | python main.py --dataset Mammo --meta_verbose 10 --update_steps 1000
176 | ```
177 |
178 | You can get help with arguments by running:
179 |
180 | ```powershell
181 | python main.py --help
182 | ```
183 |
184 | ```
185 | optional arguments:
186 | # Soft Actor-critic Arguments
187 | -h, --help show this help message and exit
188 | --env-name ENV_NAME
189 | --policy POLICY Policy Type: Gaussian | Deterministic (default:
190 | Gaussian)
191 | --eval EVAL Evaluates a policy every 10 episodes (default:
192 | True)
193 | --gamma G discount factor for reward (default: 0.99)
194 | --tau G target smoothing coefficient(τ) (default: 0.01)
195 | --lr G learning rate (default: 0.001)
196 | --lr_decay_steps N step_size of StepLR learning rate decay scheduler
197 | (default: 10)
198 | --lr_decay_gamma N gamma of StepLR learning rate decay scheduler
199 | (default: 0.99)
200 | --alpha G Temperature parameter α determines the relative
201 | importance of the entropy term against the reward
202 | (default: 0.1)
203 | --automatic_entropy_tuning G
204 | Automatically adjust α (default: False)
205 | --seed N random seed (default: None)
206 | --batch_size N batch size (default: 64)
207 | --hidden_size N hidden size (default: 50)
208 | --updates_per_step N model updates per simulator step (default: 1)
209 | --update_steps N maximum number of steps (default: 1000)
210 | --start_steps N Steps sampling random actions (default: 500)
211 | --target_update_interval N
212 | Value target update per no. of updates per step
213 | (default: 1)
214 | --replay_size N size of replay buffer (default: 1000)
215 |
216 | # Mesa Arguments
217 | --cuda run on CUDA (default: False)
218 | --dataset N the dataset used for meta-training (default: Mammo)
219 | the metric used for evaluation (default: aucprc)
220 | --reward_coefficient N
221 | --num_bins N number of bins (default: 5). state-size = 2 *
222 | num_bins.
223 | --sigma N sigma of the Gaussian function used in meta-sampling
224 | (default: 0.2)
225 | --max_estimators N maximum number of base estimators in each meta-
226 | training episode (default: 10)
227 | --meta_verbose N number of episodes between verbose outputs. If 'full'
228 | print log for each base estimator (default: 10)
229 | --meta_verbose_mean_episodes N
230 | number of episodes used to compute the latest mean score
231 | in verbose outputs.
232 | --verbose N enable verbose when ensemble fit (default: False)
233 | --random_state N random_state (default: None)
234 | --train_ir N imbalance ratio of the training set after meta-
235 | sampling (default: 1)
236 | --train_ratio N the ratio of the data used in meta-training. set
237 | train_ratio<1 to use a random subset for meta-training
238 | (default: 1)
239 | ```
240 |
241 | ## Running [mesa-example.ipynb](https://github.com/ZhiningLiu1998/mesa/blob/master/mesa-example.ipynb)
242 |
243 | We include a highly imbalanced dataset [Mammography](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.datasets.fetch_datasets.html#imblearn.datasets.fetch_datasets) (#majority class instances = 10,923, #minority class instances = 260, imbalance ratio = 42.012) and its variants with flip label noise for quick testing and visualization of MESA and other baselines.
244 | You can use [mesa-example.ipynb](https://github.com/ZhiningLiu1998/mesa/blob/master/mesa-example.ipynb) to quickly:
245 | - conduct a comparative experiment
246 | - visualize the meta-training process of MESA
247 | - visualize the experimental results of MESA and other baselines
248 |
249 | **Please check [mesa-example.ipynb](https://github.com/ZhiningLiu1998/mesa/blob/master/mesa-example.ipynb) for more details.**
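
For a quick sanity check of those class counts, the original (un-noised) Mammography data can also be fetched through imbalanced-learn. This is an optional, illustrative snippet; the pre-split CSV files used by the notebook already live under `data/`:

```python
from collections import Counter
from imblearn.datasets import fetch_datasets

# Download the Mammography benchmark dataset via imbalanced-learn.
mammography = fetch_datasets()['mammography']
X, y = mammography.data, mammography.target
print(Counter(y))  # expect roughly 10,923 majority vs. 260 minority instances
```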
250 |
251 | # Visualization and Results
252 |
253 | ## From [mesa-example.ipynb](https://github.com/ZhiningLiu1998/mesa/blob/master/mesa-example.ipynb)
254 |
255 | ### Class distribution of Mammography dataset
256 | 
257 |
258 | ### Visualize the meta-training process
259 |
260 |
261 |
262 |
263 |
264 | ### Comparison with baseline methods
265 | 
266 |
267 | ## Other results
268 |
269 | ### Dataset description
270 |
271 | 
272 |
273 | ### Comparisons of MESA with under-sampling-based EIL methods
274 |
275 | 
276 |
277 | ### Comparisons of MESA with over-sampling-based EIL methods
278 |
279 | 
280 |
281 | ### Comparisons of MESA with resampling-based EIL methods
282 |
283 | 
284 |
285 |
286 |
287 | # Miscellaneous
288 |
289 | **Check out our previous work [Self-paced Ensemble](https://github.com/ZhiningLiu1998/self-paced-ensemble) (ICDE 2020).
290 | It is a simple heuristic-based method, but it is very fast and works reasonably well.**
291 |
292 | **This repository contains:**
293 | - Implementation of MESA
294 | - Implementation of 7 ensemble imbalanced learning baselines
295 | - `SMOTEBoost` [1]
296 | - `SMOTEBagging` [2]
297 | - `RAMOBoost` [3]
298 | - `RUSBoost` [4]
299 | - `UnderBagging` [5]
300 | - `BalanceCascade` [6]
301 | - `SelfPacedEnsemble` [7]
302 | - Implementation of 11 resampling imbalanced learning baselines [8]
303 |
304 | > **NOTE:** The implementations of the above baseline methods are based on [imbalanced-algorithms](https://github.com/dialnd/imbalanced-algorithms) and [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn).
305 |
306 | # References
307 |
308 | | # | Reference |
309 | |-----|-------|
310 | | [1] | N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer, Smoteboost: Improving prediction of the minority class in boosting. in European conference on principles of data mining and knowledge discovery. Springer, 2003, pp. 107–119|
311 | | [2] | S. Wang and X. Yao, Diversity analysis on imbalanced data sets by using ensemble models. in 2009 IEEE Symposium on Computational Intelligence and Data Mining. IEEE, 2009, pp. 324–331.|
312 | | [3] | Sheng Chen, Haibo He, and Edwardo A Garcia. 2010. RAMOBoost: ranked minority oversampling in boosting. IEEE Transactions on Neural Networks 21, 10 (2010), 1624–1642.|
313 | | [4] | C. Seiffert, T. M. Khoshgoftaar, J. Van Hulse, and A. Napolitano, Rusboost: A hybrid approach to alleviating class imbalance. IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans, vol. 40, no. 1, pp. 185–197, 2010.|
314 | | [5] | R. Barandela, R. M. Valdovinos, and J. S. Sánchez, New applications of ensembles of classifiers. Pattern Analysis & Applications, vol. 6, no. 3, pp. 245–256, 2003.|
315 | | [6] | X.-Y. Liu, J. Wu, and Z.-H. Zhou, Exploratory undersampling for class-imbalance learning. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539–550, 2009. |
316 | | [7] | Zhining Liu, Wei Cao, Zhifeng Gao, Jiang Bian, Hechang Chen, Yi Chang, and Tie-Yan Liu. Self-paced Ensemble for Highly Imbalanced Massive Data Classification. 2020 IEEE 36th International Conference on Data Engineering (ICDE). IEEE, 2020, pp. 841-852. |
317 | | [8] | Guillaume Lemaître, Fernando Nogueira, and Christos K. Aridas. Imbalanced-learn: A python toolbox to tackle the curse of imbalanced datasets in machine learning. Journal of Machine Learning Research, 18(17):1–5, 2017. |
318 | ## Contributors ✨
319 |
320 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
321 |
322 |
323 |
324 |
325 |
330 |
331 |
332 |
333 |
334 |
335 |
336 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
337 |
--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | parser = argparse.ArgumentParser(description='Mesa Arguments')
4 | parser.add_argument('--env-name', default="MESA-SAC")
5 |
6 | # SAC arguments
7 | parser.add_argument('--policy', default="Gaussian",
8 | help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
9 | parser.add_argument('--eval', type=bool, default=True,
10 | help='Evaluates a policy every 10 episodes (default: True)')
11 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
12 | help='discount factor for reward (default: 0.99)')
13 | parser.add_argument('--tau', type=float, default=0.01, metavar='G',
14 | help='target smoothing coefficient(τ) (default: 0.01)')
15 | parser.add_argument('--lr', type=float, default=0.001, metavar='G',
16 | help='learning rate (default: 0.001)')
17 | parser.add_argument('--lr_decay_steps', type=int, default=10, metavar='N',
18 | help='step_size of StepLR learning rate decay scheduler (default: 10)')
19 | parser.add_argument('--lr_decay_gamma', type=float, default=0.99, metavar='N',
20 | help='gamma of StepLR learning rate decay scheduler (default: 0.99)')
21 | parser.add_argument('--alpha', type=float, default=0.1, metavar='G',
22 | help='Temperature parameter α determines the relative importance of the entropy\
23 | term against the reward (default: 0.1)')
24 | parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
25 | help='Automatically adjust α (default: False)')
26 | parser.add_argument('--seed', type=int, default=None, metavar='N',
27 | help='random seed (default: None)')
28 | parser.add_argument('--batch_size', type=int, default=64, metavar='N',
29 | help='batch size (default: 64)')
30 | parser.add_argument('--hidden_size', type=int, default=50, metavar='N',
31 | help='hidden size (default: 50)')
32 | parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
33 | help='model updates per simulator step (default: 1)')
34 | parser.add_argument('--update_steps', type=int, default=1000, metavar='N',
35 | help='maximum number of steps (default: 1000)')
36 | parser.add_argument('--start_steps', type=int, default=500, metavar='N',
37 | help='Steps sampling random actions (default: 500)')
38 | parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
39 | help='Value target update per no. of updates per step (default: 1)')
40 | parser.add_argument('--replay_size', type=int, default=1000, metavar='N',
41 | help='size of replay buffer (default: 1000)')
42 | parser.add_argument('--cuda', action="store_true", default=False,
43 | help='run on CUDA (default: False)')
44 |
45 | # MESA arguments
46 | parser.add_argument('--dataset', type=str, default='Mammo', metavar='N',
47 | help='the dataset used for meta-training (default: Mammo)')
48 | parser.add_argument('--metric', type=str, default='aucprc', metavar='N',
49 | help='the metric used for evaluation (default: aucprc)')
50 | parser.add_argument('--reward_coefficient', type=float, default=100, metavar='N')
51 | parser.add_argument('--num_bins', type=int, default=5, metavar='N',
52 | help='number of bins (default: 5). state-size = 2 * num_bins.')
53 | parser.add_argument('--sigma', type=float, default=0.2, metavar='N',
54 | help='sigma of the Gaussian function used in meta-sampling (default: 0.2)')
55 | parser.add_argument('--max_estimators', type=int, default=10, metavar='N',
56 | help='maximum number of base estimators in each meta-training episode (default: 10)')
57 | parser.add_argument('--meta_verbose', type=int, default=10, metavar='N',
58 | help='number of episodes between verbose outputs. \
59 | If \'full\' print log for each base estimator (default: 10)')
60 | parser.add_argument('--meta_verbose_mean_episodes', type=int, default=25, metavar='N',
61 | help='number of episodes used to compute the latest mean score in verbose outputs.')
62 | parser.add_argument('--verbose', type=bool, default=False, metavar='N',
63 | help='enable verbose when ensemble fit (default: False)')
64 | parser.add_argument('--random_state', type=int, default=None, metavar='N',
65 | help='random_state (default: None)')
66 | parser.add_argument('--train_ir', type=float, default=1, metavar='N',
67 | help='imbalance ratio of the training set after meta-sampling (default: 1)')
68 | parser.add_argument('--train_ratio', type=float, default=1, metavar='N',
69 | help='the ratio of the data used in meta-training. \
70 | set train_ratio<1 to use a random subset for meta-training (default: 1)')
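# Typical use (as in the README usage example); `arguments` refers to this file:
#   from arguments import parser
#   args = parser.parse_args()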
--------------------------------------------------------------------------------
/baselines/canonical_ensemble.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 13 14:32:27 2020
4 | @author: ZhiningLiu1998
5 | mailto: zhining.liu@outlook.com
6 |
7 | NOTE: The implementation of SMOTEBoost/RUSBoost/RAMOBoost was obtained from
8 | imbalanced-algorithms: https://github.com/dialnd/imbalanced-algorithms
9 | """
10 |
11 | import numpy as np
12 | import sklearn
13 | from sklearn.base import is_regressor
14 | from sklearn.ensemble import AdaBoostClassifier
15 | from sklearn.tree import DecisionTreeClassifier
16 | from sklearn.ensemble.forest import BaseForest
17 | from sklearn.neighbors import NearestNeighbors
18 | from sklearn.preprocessing import normalize
19 | from sklearn.tree.tree import BaseDecisionTree
20 | from sklearn.utils import check_random_state
21 | from sklearn.utils import check_X_y
22 | from sklearn.utils import check_array
23 | from sklearn.preprocessing import binarize
24 | from utils import *
25 | from collections import Counter
26 | import warnings
27 | warnings.filterwarnings("ignore")
28 |
29 |
30 | class SMOTE(object):
31 | """Implementation of Synthetic Minority Over-Sampling Technique (SMOTE).
32 | SMOTE performs oversampling of the minority class by picking target
33 | minority class samples and their nearest minority class neighbors and
34 | generating new samples that linearly combine features of each target
35 | sample with features of its selected minority class neighbors [1].
36 | Parameters
37 | ----------
38 | k_neighbors : int, optional (default=5)
39 | Number of nearest neighbors.
40 | random_state : int or None, optional (default=None)
41 | If int, random_state is the seed used by the random number generator.
42 | If None, the random number generator is the RandomState instance used
43 | by np.random.
44 | References
45 | ----------
46 | .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, and P. Kegelmeyer. "SMOTE:
47 | Synthetic Minority Over-Sampling Technique." Journal of Artificial
48 | Intelligence Research (JAIR), 2002.
49 | """
50 |
51 | def __init__(self, k_neighbors=5, random_state=None):
52 | self.k = k_neighbors
53 | self.random_state = random_state
54 |
55 | def sample(self, n_samples):
56 | """Generate samples.
57 | Parameters
58 | ----------
59 | n_samples : int
60 | Number of new synthetic samples.
61 | Returns
62 | -------
63 | S : array, shape = [n_samples, n_features]
64 | Returns synthetic samples.
65 | """
66 | np.random.seed(seed=self.random_state)
67 |
68 | S = np.zeros(shape=(n_samples, self.n_features))
69 | # Calculate synthetic samples.
70 | for i in range(n_samples):
71 | j = np.random.randint(0, self.X.shape[0])
72 |
73 | # Find the NN for each sample.
74 | # Exclude the sample itself.
75 | nn = self.neigh.kneighbors(self.X[j].reshape(1, -1),
76 | return_distance=False)[:, 1:]
77 | nn_index = np.random.choice(nn[0])
78 |
79 | dif = self.X[nn_index] - self.X[j]
80 | gap = np.random.random()
81 |
82 | S[i, :] = self.X[j, :] + gap * dif[:]
83 |
84 | return S
85 |
86 | def fit(self, X):
87 | """Train model based on input data.
88 | Parameters
89 | ----------
90 | X : array-like, shape = [n_minority_samples, n_features]
91 | Holds the minority samples.
92 | """
93 | self.X = X
94 | self.n_minority_samples, self.n_features = self.X.shape
95 |
96 | # Learn nearest neighbors.
97 | self.neigh = NearestNeighbors(n_neighbors=self.k + 1)
98 | self.neigh.fit(self.X)
99 |
100 | return self
101 |
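# Minimal usage sketch for the SMOTE helper above (illustrative only):
#   X_min = X[y == minority_label]                       # minority-class rows only
#   smote = SMOTE(k_neighbors=5, random_state=42).fit(X_min)
#   X_syn = smote.sample(n_samples=100)                   # 100 synthetic minority samples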
102 | class SMOTEBoost(AdaBoostClassifier):
103 | """Implementation of SMOTEBoost.
104 | SMOTEBoost introduces data sampling into the AdaBoost algorithm by
105 | oversampling the minority class using SMOTE on each boosting iteration [1].
106 | This implementation inherits methods from the scikit-learn
107 | AdaBoostClassifier class, only modifying the `fit` method.
108 | Parameters
109 | ----------
110 | n_samples : int, optional (default=100)
111 | Number of new synthetic samples per boosting step.
112 | k_neighbors : int, optional (default=5)
113 | Number of nearest neighbors.
114 | base_estimator : object, optional (default=DecisionTreeClassifier)
115 | The base estimator from which the boosted ensemble is built.
116 | Support for sample weighting is required, as well as proper `classes_`
117 | and `n_classes_` attributes.
118 | n_estimators : int, optional (default=50)
119 | The maximum number of estimators at which boosting is terminated.
120 | In case of perfect fit, the learning procedure is stopped early.
121 | learning_rate : float, optional (default=1.)
122 | Learning rate shrinks the contribution of each classifier by
123 | ``learning_rate``. There is a trade-off between ``learning_rate`` and
124 | ``n_estimators``.
125 | algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
126 | If 'SAMME.R' then use the SAMME.R real boosting algorithm.
127 | ``base_estimator`` must support calculation of class probabilities.
128 | If 'SAMME' then use the SAMME discrete boosting algorithm.
129 | The SAMME.R algorithm typically converges faster than SAMME,
130 | achieving a lower test error with fewer boosting iterations.
131 | random_state : int or None, optional (default=None)
132 | If int, random_state is the seed used by the random number generator.
133 | If None, the random number generator is the RandomState instance used
134 | by np.random.
135 | References
136 | ----------
137 | .. [1] N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer.
138 | "SMOTEBoost: Improving Prediction of the Minority Class in
139 | Boosting." European Conference on Principles of Data Mining and
140 | Knowledge Discovery (PKDD), 2003.
141 | """
142 |
143 | def __init__(self,
144 | n_samples=100,
145 | k_neighbors=5,
146 | base_estimator=None,
147 | n_estimators=50,
148 | learning_rate=1.,
149 | algorithm='SAMME.R',
150 | random_state=None):
151 |
152 | self.n_samples = n_samples
153 | self.algorithm = algorithm
154 | self.smote = SMOTE(k_neighbors=k_neighbors,
155 | random_state=random_state)
156 |
157 | super(SMOTEBoost, self).__init__(
158 | base_estimator=base_estimator,
159 | n_estimators=n_estimators,
160 | learning_rate=learning_rate,
161 | random_state=random_state)
162 |
163 | def fit(self, X, y, sample_weight=None, minority_target=None):
164 | """Build a boosted classifier/regressor from the training set (X, y),
165 | performing SMOTE during each boosting step.
166 | Parameters
167 | ----------
168 | X : {array-like, sparse matrix} of shape = [n_samples, n_features]
169 | The training input samples. Sparse matrix can be CSC, CSR, COO,
170 | DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
171 | forced to DTYPE from tree._tree if the base classifier of this
172 | ensemble weighted boosting classifier is a tree or forest.
173 | y : array-like of shape = [n_samples]
174 | The target values (class labels in classification, real numbers in
175 | regression).
176 | sample_weight : array-like of shape = [n_samples], optional
177 | Sample weights. If None, the sample weights are initialized to
178 | 1 / n_samples.
179 | minority_target : int
180 | Minority class label.
181 | Returns
182 | -------
183 | self : object
184 | Returns self.
185 | Notes
186 | -----
187 | Based on the scikit-learn v0.18 AdaBoostClassifier and
188 | BaseWeightBoosting `fit` methods.
189 | """
190 | # Check that algorithm is supported.
191 | if self.algorithm not in ('SAMME', 'SAMME.R'):
192 | raise ValueError("algorithm %s is not supported" % self.algorithm)
193 |
194 | # Check parameters.
195 | if self.learning_rate <= 0:
196 | raise ValueError("learning_rate must be greater than zero")
197 |
198 | if (self.base_estimator is None or
199 | isinstance(self.base_estimator, (BaseDecisionTree,
200 | BaseForest))):
201 | DTYPE = np.float64 # from fast_dict.pxd
202 | dtype = DTYPE
203 | accept_sparse = 'csc'
204 | else:
205 | dtype = None
206 | accept_sparse = ['csr', 'csc']
207 |
208 | X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
209 | y_numeric=is_regressor(self))
210 |
211 | if sample_weight is None:
212 | # Initialize weights to 1 / n_samples.
213 | sample_weight = np.empty(X.shape[0], dtype=np.float64)
214 | sample_weight[:] = 1. / X.shape[0]
215 | else:
216 | sample_weight = check_array(sample_weight, ensure_2d=False)
217 | # Normalize existing weights.
218 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)
219 |
220 | # Check that the sample weights sum is positive.
221 | if sample_weight.sum() <= 0:
222 | raise ValueError(
223 | "Attempting to fit with a non-positive "
224 | "weighted number of samples.")
225 |
226 | if minority_target is None:
227 | # Determine the minority class label.
228 | stats_c_ = Counter(y)
229 | maj_c_ = max(stats_c_, key=stats_c_.get)
230 | min_c_ = min(stats_c_, key=stats_c_.get)
231 | self.minority_target = min_c_
232 | else:
233 | self.minority_target = minority_target
234 |
235 | # Check parameters.
236 | self._validate_estimator()
237 |
238 | # Clear any previous fit results.
239 | self.estimators_ = []
240 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
241 | self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
242 |
243 | random_state = check_random_state(self.random_state)
244 |
245 | self.total_training_instances = 0
246 | self.total_training_instances_list = []
247 | for iboost in range(self.n_estimators):
248 | # SMOTE step.
249 | X_min = X[np.where(y == self.minority_target)]
250 | self.smote.fit(X_min)
251 | X_syn = self.smote.sample(self.n_samples)
252 | y_syn = np.full(X_syn.shape[0], fill_value=self.minority_target,
253 | dtype=np.int64)
254 |
255 | # Normalize synthetic sample weights based on current training set.
256 | sample_weight_syn = np.empty(X_syn.shape[0], dtype=np.float64)
257 | sample_weight_syn[:] = 1. / X.shape[0]
258 |
259 | # print ('Boosting Iter: {} n_train: {} n_smote: {}'.format(
260 | # iboost, len(X_min), len(y_syn)))
261 |
262 | # Combine the original and synthetic samples.
263 | X = np.vstack((X, X_syn))
264 | y = np.append(y, y_syn)
265 |
266 | self.total_training_instances = self.total_training_instances + len(y)
267 | self.total_training_instances_list.append(self.total_training_instances)
268 | print(f'SMOTEBoost total training size: {self.total_training_instances}')
269 |
270 | # Combine the weights.
271 | sample_weight = \
272 | np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
273 | sample_weight = \
274 | np.squeeze(normalize(sample_weight, axis=0, norm='l1'))
275 |
276 | # X, y, sample_weight = shuffle(X, y, sample_weight,
277 | # random_state=random_state)
278 |
279 | # Boosting step.
280 | sample_weight, estimator_weight, estimator_error = self._boost(
281 | iboost,
282 | X, y,
283 | sample_weight,
284 | random_state)
285 |
286 | # Early termination.
287 | if sample_weight is None:
288 | print('sample_weight: {}'.format(sample_weight))
289 | break
290 |
291 | self.estimator_weights_[iboost] = estimator_weight
292 | self.estimator_errors_[iboost] = estimator_error
293 |
294 | # Stop if error is zero.
295 | # if estimator_error == 0:
296 | # print('error: {}'.format(estimator_error))
297 | # break
298 |
299 | sample_weight_sum = np.sum(sample_weight)
300 |
301 | # Stop if the sum of sample weights has become non-positive.
302 | if sample_weight_sum <= 0:
303 | print('sample_weight_sum: {}'.format(sample_weight_sum))
304 | break
305 |
306 | if iboost < self.n_estimators - 1:
307 | # Normalize.
308 | sample_weight /= sample_weight_sum
309 |
310 | return self
311 |
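# Usage sketch for SMOTEBoost (illustrative; any sklearn-style classifier that
# supports sample weighting can serve as the base estimator):
#   from sklearn.tree import DecisionTreeClassifier
#   clf = SMOTEBoost(base_estimator=DecisionTreeClassifier(),
#                    n_samples=100, k_neighbors=5, n_estimators=10)
#   clf.fit(X_train, y_train)
#   y_score = clf.predict_proba(X_test)[:, 1]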
312 | class RankedMinorityOversampler(object):
313 | """Implementation of Ranked Minority Oversampling (RAMO).
314 | Oversample the minority class by picking samples according to a specified
315 | sampling distribution.
316 | Parameters
317 | ----------
318 | k_neighbors_1 : int, optional (default=5)
319 | Number of nearest neighbors used to adjust the sampling probability of
320 | the minority examples.
321 | k_neighbors_2 : int, optional (default=5)
322 | Number of nearest neighbors used to generate the synthetic data
323 | instances.
324 | alpha : float, optional (default=0.3)
325 | Scaling coefficient.
326 | random_state : int or None, optional (default=None)
327 | If int, random_state is the seed used by the random number generator.
328 | If None, the random number generator is the RandomState instance used
329 | by np.random.
330 | """
331 |
332 | def __init__(self, k_neighbors_1=5, k_neighbors_2=5, alpha=0.3,
333 | random_state=None):
334 | self.k_neighbors_1 = k_neighbors_1
335 | self.k_neighbors_2 = k_neighbors_2
336 | self.alpha = alpha
337 | self.random_state = random_state
338 |
339 | def sample(self, n_samples):
340 | """Generate samples.
341 | Parameters
342 | ----------
343 | n_samples : int
344 | Number of new synthetic samples.
345 | Returns
346 | -------
347 | S : array, shape = [n_samples, n_features]
348 | Returns synthetic samples.
349 | """
350 | np.random.seed(seed=self.random_state)
351 |
352 | S = np.zeros(shape=(n_samples, self.n_features))
353 | # Calculate synthetic samples.
354 | for i in range(n_samples):
355 | # Choose a sample according to the sampling distribution, r.
356 | j = np.random.choice(self.n_minority_samples, p=self.r)
357 |
358 | # Find the NN for each sample.
359 | # Exclude the sample itself.
360 | nn = self.neigh_2.kneighbors(self.X_min[j].reshape(1, -1),
361 | return_distance=False)[:, 1:]
362 | nn_index = np.random.choice(nn[0])
363 |
364 | dif = self.X_min[nn_index] - self.X_min[j]
365 | gap = np.random.random()
366 |
367 | S[i, :] = self.X_min[j, :] + gap * dif[:]
368 |
369 | return S
370 |
371 | def fit(self, X, y, sample_weight=None, minority_target=None):
372 | """Train model based on input data.
373 | Parameters
374 | ----------
375 | X : array-like, shape = [n_total_samples, n_features]
376 | Holds the majority and minority samples.
377 | y : array-like, shape = [n_total_samples]
378 | Holds the class targets for samples.
379 | sample_weight : array-like of shape = [n_samples], optional
380 | Sample weights multiplier. If None, the multiplier is 1.
381 | minority_target : int, optional (default=None)
382 | Minority class label.
383 | """
384 | if minority_target is None:
385 | # Determine the minority class label.
386 | stats_c_ = Counter(y)
387 | maj_c_ = max(stats_c_, key=stats_c_.get)
388 | min_c_ = min(stats_c_, key=stats_c_.get)
389 | self.minority_target = min_c_
390 | else:
391 | self.minority_target = minority_target
392 |
393 | self.X_min = X[y == self.minority_target]
394 | self.n_minority_samples, self.n_features = self.X_min.shape
395 |
396 | neigh_1 = NearestNeighbors(n_neighbors=self.k_neighbors_1 + 1)
397 | neigh_1.fit(X)
398 | nn = neigh_1.kneighbors(self.X_min, return_distance=False)[:, 1:]
399 |
400 | if sample_weight is None:
401 | sample_weight_min = np.ones(shape=(self.n_minority_samples))  # one weight per minority sample; len() on the class label would raise a TypeError
402 | else:
403 | assert(len(y) == len(sample_weight))
404 | sample_weight_min = sample_weight[y == self.minority_target]
405 |
406 | self.r = np.zeros(shape=(self.n_minority_samples))
407 | for i in range(self.n_minority_samples):
408 | majority_neighbors = 0
409 | for n in nn[i]:
410 | if y[n] != self.minority_target:
411 | majority_neighbors += 1
412 |
413 | self.r[i] = 1. / (1 + np.exp(-self.alpha * majority_neighbors))
414 |
415 | self.r = (self.r * sample_weight_min).reshape(1, -1)
416 | self.r = np.squeeze(normalize(self.r, axis=1, norm='l1'))
417 |
418 | # Learn nearest neighbors.
419 | self.neigh_2 = NearestNeighbors(n_neighbors=self.k_neighbors_2 + 1)
420 | self.neigh_2.fit(self.X_min)
421 |
422 | return self
423 |
424 |
425 | class RAMOBoost(AdaBoostClassifier):
426 | """Implementation of RAMOBoost.
427 | RAMOBoost introduces data sampling into the AdaBoost algorithm by
428 | oversampling the minority class according to a specified sampling
429 | distribution on each boosting iteration [1].
430 | This implementation inherits methods from the scikit-learn
431 | AdaBoostClassifier class, only modifying the `fit` method.
432 | Parameters
433 | ----------
434 | n_samples : int, optional (default=100)
435 | Number of new synthetic samples per boosting step.
436 | k_neighbors_1 : int, optional (default=5)
437 | Number of nearest neighbors used to adjust the sampling probability of
438 | the minority examples.
439 | k_neighbors_2 : int, optional (default=5)
440 | Number of nearest neighbors used to generate the synthetic data
441 | instances.
442 | alpha : float, optional (default=0.3)
443 | Scaling coefficient.
444 | base_estimator : object, optional (default=DecisionTreeClassifier)
445 | The base estimator from which the boosted ensemble is built.
446 | Support for sample weighting is required, as well as proper `classes_`
447 | and `n_classes_` attributes.
448 | n_estimators : int, optional (default=50)
449 | The maximum number of estimators at which boosting is terminated.
450 | In case of perfect fit, the learning procedure is stopped early.
451 | learning_rate : float, optional (default=1.)
452 | Learning rate shrinks the contribution of each classifier by
453 | ``learning_rate``. There is a trade-off between ``learning_rate`` and
454 | ``n_estimators``.
455 | algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
456 | If 'SAMME.R' then use the SAMME.R real boosting algorithm.
457 | ``base_estimator`` must support calculation of class probabilities.
458 | If 'SAMME' then use the SAMME discrete boosting algorithm.
459 | The SAMME.R algorithm typically converges faster than SAMME,
460 | achieving a lower test error with fewer boosting iterations.
461 | random_state : int or None, optional (default=None)
462 | If int, random_state is the seed used by the random number generator.
463 | If None, the random number generator is the RandomState instance used
464 | by np.random.
465 | References
466 | ----------
467 | .. [1] S. Chen, H. He, and E. A. Garcia. "RAMOBoost: Ranked Minority
468 | Oversampling in Boosting". IEEE Transactions on Neural Networks,
469 | 2010.
470 | """
471 |
472 | def __init__(self,
473 | n_samples=100,
474 | k_neighbors_1=5,
475 | k_neighbors_2=5,
476 | alpha=0.3,
477 | base_estimator=None,
478 | n_estimators=50,
479 | learning_rate=1.,
480 | algorithm='SAMME.R',
481 | random_state=None):
482 |
483 | self.n_samples = n_samples
484 | self.algorithm = algorithm
485 | self.ramo = RankedMinorityOversampler(k_neighbors_1, k_neighbors_2,
486 | alpha, random_state=random_state)
487 |
488 | super(RAMOBoost, self).__init__(
489 | base_estimator=base_estimator,
490 | n_estimators=n_estimators,
491 | learning_rate=learning_rate,
492 | random_state=random_state)
493 |
494 | def fit(self, X, y, sample_weight=None, minority_target=None):
495 | """Build a boosted classifier/regressor from the training set (X, y),
496 | performing random undersampling during each boosting step.
497 | Parameters
498 | ----------
499 | X : {array-like, sparse matrix} of shape = [n_samples, n_features]
500 | The training input samples. Sparse matrix can be CSC, CSR, COO,
501 | DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
502 | forced to DTYPE from tree._tree if the base classifier of this
503 | ensemble weighted boosting classifier is a tree or forest.
504 | y : array-like of shape = [n_samples]
505 | The target values (class labels in classification, real numbers in
506 | regression).
507 | sample_weight : array-like of shape = [n_samples], optional
508 | Sample weights. If None, the sample weights are initialized to
509 | 1 / n_samples.
510 | minority_target : int
511 | Minority class label.
512 | Returns
513 | -------
514 | self : object
515 | Returns self.
516 | Notes
517 | -----
518 | Based on the scikit-learn v0.18 AdaBoostClassifier and
519 | BaseWeightBoosting `fit` methods.
520 | """
521 | # Check that algorithm is supported.
522 | if self.algorithm not in ('SAMME', 'SAMME.R'):
523 | raise ValueError("algorithm %s is not supported" % self.algorithm)
524 |
525 | # Check parameters.
526 | if self.learning_rate <= 0:
527 | raise ValueError("learning_rate must be greater than zero")
528 |
529 | if (self.base_estimator is None or
530 | isinstance(self.base_estimator, (BaseDecisionTree,
531 | BaseForest))):
532 | DTYPE = np.float64 # from fast_dict.pxd
533 | dtype = DTYPE
534 | accept_sparse = 'csc'
535 | else:
536 | dtype = None
537 | accept_sparse = ['csr', 'csc']
538 |
539 | X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
540 | y_numeric=is_regressor(self))
541 |
542 | if sample_weight is None:
543 | # Initialize weights to 1 / n_samples.
544 | sample_weight = np.empty(X.shape[0], dtype=np.float64)
545 | sample_weight[:] = 1. / X.shape[0]
546 | else:
547 | sample_weight = check_array(sample_weight, ensure_2d=False)
548 | # Normalize existing weights.
549 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)
550 |
551 | # Check that the sample weights sum is positive.
552 | if sample_weight.sum() <= 0:
553 | raise ValueError(
554 | "Attempting to fit with a non-positive "
555 | "weighted number of samples.")
556 |
557 | if minority_target is None:
558 | # Determine the minority class label.
559 | stats_c_ = Counter(y)
560 | maj_c_ = max(stats_c_, key=stats_c_.get)
561 | min_c_ = min(stats_c_, key=stats_c_.get)
562 | self.minority_target = min_c_
563 | else:
564 | self.minority_target = minority_target
565 |
566 | # Check parameters.
567 | self._validate_estimator()
568 |
569 | # Clear any previous fit results.
570 | self.estimators_ = []
571 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
572 | self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
573 |
574 | random_state = check_random_state(self.random_state)
575 |
576 | self.total_training_instances = 0
577 | self.total_training_instances_list = []
578 | for iboost in range(self.n_estimators):
579 | # RAMO step.
580 | self.ramo.fit(X, y, sample_weight=sample_weight)
581 | X_syn = self.ramo.sample(self.n_samples)
582 | y_syn = np.full(X_syn.shape[0], fill_value=self.minority_target,
583 | dtype=np.int64)
584 |
585 | # Combine the minority and majority class samples.
586 | X = np.vstack((X, X_syn))
587 | y = np.append(y, y_syn)
588 |
589 | self.total_training_instances = self.total_training_instances + len(y)
590 | self.total_training_instances_list.append(self.total_training_instances)
591 | print (f'RAMOBoost total training size: {self.total_training_instances}')
592 |
593 | # Normalize synthetic sample weights based on current training set.
594 | sample_weight_syn = np.empty(X_syn.shape[0], dtype=np.float64)
595 | sample_weight_syn[:] = 1. / X.shape[0]
596 |
597 | # Combine the weights.
598 | sample_weight = \
599 | np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
600 | sample_weight = \
601 | np.squeeze(normalize(sample_weight, axis=0, norm='l1'))
602 |
603 | # X, y, sample_weight = shuffle(X, y, sample_weight,
604 | # random_state=random_state)
605 |
606 | # Boosting step.
607 | sample_weight, estimator_weight, estimator_error = self._boost(
608 | iboost,
609 | X, y,
610 | sample_weight,
611 | random_state)
612 |
613 | # Early termination.
614 | # if sample_weight is None:
615 | # break
616 |
617 | self.estimator_weights_[iboost] = estimator_weight
618 | self.estimator_errors_[iboost] = estimator_error
619 |
620 | # Stop if error is zero.
621 | # if estimator_error == 0:
622 | # break
623 |
624 | sample_weight_sum = np.sum(sample_weight)
625 |
626 | # Stop if the sum of sample weights has become non-positive.
627 | # if sample_weight_sum <= 0:
628 | # break
629 |
630 | if iboost < self.n_estimators - 1:
631 | # Normalize.
632 | sample_weight /= sample_weight_sum
633 |
634 | return self
635 |
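# Usage sketch for RAMOBoost (illustrative): it mirrors SMOTEBoost, but seed
# minority samples are drawn from a ranked distribution (k_neighbors_1, alpha)
# before interpolating with k_neighbors_2 nearest minority neighbors:
#   clf = RAMOBoost(n_samples=100, k_neighbors_1=5, k_neighbors_2=5, alpha=0.3,
#                   n_estimators=10)
#   clf.fit(X_train, y_train)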
636 | class RandomUnderSampler(object):
637 | """Implementation of random undersampling (RUS).
638 | Undersample the majority class(es) by randomly picking samples with or
639 | without replacement.
640 | Parameters
641 | ----------
642 | with_replacement : bool, optional (default=True)
643 | Undersample with replacement.
644 | return_indices : bool, optional (default=False)
645 | Whether or not to return the indices of the samples randomly selected
646 | from the majority class.
647 | random_state : int or None, optional (default=None)
648 | If int, random_state is the seed used by the random number generator.
649 | If None, the random number generator is the RandomState instance used
650 | by np.random.
651 | """
652 |
653 | def __init__(self, with_replacement=True, return_indices=False,
654 | random_state=None):
655 | self.return_indices = return_indices
656 | self.with_replacement = with_replacement
657 | self.random_state = random_state
658 |
659 | def sample(self, n_samples):
660 | """Perform undersampling.
661 | Parameters
662 | ----------
663 | n_samples : int
664 | Number of samples to remove.
665 | Returns
666 | -------
667 | S : array, shape = [n_majority_samples - n_samples, n_features]
668 | Returns synthetic samples.
669 | """
670 | np.random.seed(seed=self.random_state)
671 |
672 | if self.n_majority_samples <= n_samples:
673 | n_samples = self.n_majority_samples
674 |
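# NOTE: as written below, this draws len(X_min) majority indices (yielding a
# balanced subset) rather than removing `n_samples` rows as the docstring suggests.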
675 | idx = np.random.choice(self.n_majority_samples,
676 | # size=self.n_majority_samples - n_samples,
677 | size=self.n_minority_samples,
678 | replace=self.with_replacement)
679 |
680 | if self.return_indices:
681 | return (self.X_maj[idx], idx)
682 | else:
683 | return self.X_maj[idx]
684 |
685 | def fit(self, X_maj, X_min):
686 | """Train model based on input data.
687 | Parameters
688 | ----------
689 | X_maj : array-like, shape = [n_majority_samples, n_features]
690 | Holds the majority samples.
X_min : array-like, shape = [n_minority_samples, n_features]
Holds the minority samples; their count sets the size of the drawn subset.
691 | """
692 | self.X_maj = X_maj
693 | self.X_min = X_min
694 | self.n_majority_samples, self.n_features = self.X_maj.shape
695 | self.n_minority_samples = self.X_min.shape[0]
696 |
697 | return self
698 |
699 | import pandas as pd
700 |
701 | class RUSBoost(AdaBoostClassifier):
702 | """Implementation of RUSBoost.
703 | RUSBoost introduces data sampling into the AdaBoost algorithm by
704 | undersampling the majority class using random undersampling (with or
705 | without replacement) on each boosting iteration [1].
706 | This implementation inherits methods from the scikit-learn
707 | AdaBoostClassifier class, only modifying the `fit` method.
708 | Parameters
709 | ----------
710 | n_samples : int, optional (default=100)
711 | Number of new synthetic samples per boosting step.
712 | min_ratio : float (default=1.0)
713 | Minimum ratio of majority to minority class samples to generate.
714 | with_replacement : bool, optional (default=True)
715 | Undersample with replacement.
716 | base_estimator : object, optional (default=DecisionTreeClassifier)
717 | The base estimator from which the boosted ensemble is built.
718 | Support for sample weighting is required, as well as proper `classes_`
719 | and `n_classes_` attributes.
720 | n_estimators : int, optional (default=50)
721 | The maximum number of estimators at which boosting is terminated.
722 | In case of perfect fit, the learning procedure is stopped early.
723 | learning_rate : float, optional (default=1.)
724 | Learning rate shrinks the contribution of each classifier by
725 | ``learning_rate``. There is a trade-off between ``learning_rate`` and
726 | ``n_estimators``.
727 | algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
728 | If 'SAMME.R' then use the SAMME.R real boosting algorithm.
729 | ``base_estimator`` must support calculation of class probabilities.
730 | If 'SAMME' then use the SAMME discrete boosting algorithm.
731 | The SAMME.R algorithm typically converges faster than SAMME,
732 | achieving a lower test error with fewer boosting iterations.
733 | random_state : int or None, optional (default=None)
734 | If int, random_state is the seed used by the random number generator.
735 | If None, the random number generator is the RandomState instance used
736 | by np.random.
737 | References
738 | ----------
739 | .. [1] C. Seiffert, T. M. Khoshgoftaar, J. V. Hulse, and A. Napolitano.
740 | "RUSBoost: Improving Classification Performance when Training Data
741 | is Skewed". International Conference on Pattern Recognition
742 | (ICPR), 2008.
743 | """
744 |
745 | def __init__(self,
746 | n_samples=100,
747 | min_ratio=1.0,
748 | with_replacement=True,
749 | base_estimator=None,
750 | n_estimators=10,
751 | learning_rate=1.,
752 | algorithm='SAMME.R',
753 | random_state=None):
754 |
755 | self.n_samples = n_samples
756 | self.min_ratio = min_ratio
757 | self.algorithm = algorithm
758 | self.rus = RandomUnderSampler(with_replacement=with_replacement,
759 | return_indices=True,
760 | random_state=random_state)
761 |
762 | super(RUSBoost, self).__init__(
763 | base_estimator=base_estimator,
764 | n_estimators=n_estimators,
765 | learning_rate=learning_rate,
766 | random_state=random_state)
767 |
768 | def fit(self, X, y, sample_weight=None, minority_target=None, verbose=False):
769 | """Build a boosted classifier/regressor from the training set (X, y),
770 | performing random undersampling during each boosting step.
771 | Parameters
772 | ----------
773 | X : {array-like, sparse matrix} of shape = [n_samples, n_features]
774 | The training input samples. Sparse matrix can be CSC, CSR, COO,
775 | DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
776 | forced to DTYPE from tree._tree if the base classifier of this
777 | ensemble weighted boosting classifier is a tree or forest.
778 | y : array-like of shape = [n_samples]
779 | The target values (class labels in classification, real numbers in
780 | regression).
781 | sample_weight : array-like of shape = [n_samples], optional
782 | Sample weights. If None, the sample weights are initialized to
783 | 1 / n_samples.
784 | minority_target : int
785 | Minority class label.
786 | Returns
787 | -------
788 | self : object
789 | Returns self.
790 | Notes
791 | -----
792 | Based on the scikit-learn v0.18 AdaBoostClassifier and
793 | BaseWeightBoosting `fit` methods.
794 | """
795 | # Check that algorithm is supported.
796 | if self.algorithm not in ('SAMME', 'SAMME.R'):
797 | raise ValueError("algorithm %s is not supported" % self.algorithm)
798 |
799 | # Check parameters.
800 | if self.learning_rate <= 0:
801 | raise ValueError("learning_rate must be greater than zero")
802 |
803 | if (self.base_estimator is None or
804 | isinstance(self.base_estimator, (BaseDecisionTree,
805 | BaseForest))):
806 | DTYPE = np.float64 # from fast_dict.pxd
807 | dtype = DTYPE
808 | accept_sparse = 'csc'
809 | else:
810 | dtype = None
811 | accept_sparse = ['csr', 'csc']
812 |
813 | X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
814 | y_numeric=is_regressor(self))
815 |
816 | if sample_weight is None:
817 | # Initialize weights to 1 / n_samples.
818 | sample_weight = np.empty(X.shape[0], dtype=np.float64)
819 | sample_weight[:] = 1. / X.shape[0]
820 | else:
821 | sample_weight = check_array(sample_weight, ensure_2d=False)
822 | # Normalize existing weights.
823 | sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)
824 |
825 | # Check that the sample weights sum is positive.
826 | if sample_weight.sum() <= 0:
827 | raise ValueError(
828 | "Attempting to fit with a non-positive "
829 | "weighted number of samples.")
830 |
831 | if minority_target is None:
832 | # Determine the minority class label.
833 | stats_c_ = Counter(y)
834 | maj_c_ = max(stats_c_, key=stats_c_.get)
835 | min_c_ = min(stats_c_, key=stats_c_.get)
836 | self.minority_target = min_c_
837 | else:
838 | self.minority_target = minority_target
839 |
840 | # Check parameters.
841 | self._validate_estimator()
842 |
843 | # Clear any previous fit results.
844 | self.estimators_ = []
845 | self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
846 | self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
847 |
848 | random_state = check_random_state(self.random_state)
849 |
850 | for iboost in range(self.n_estimators):
851 | # Random undersampling step.
852 | X_maj = X[np.where(y != self.minority_target)]
853 | X_min = X[np.where(y == self.minority_target)]
854 | self.rus.fit(X_maj, X_min)
855 | # self.rus.fit(X_maj)
856 |
857 | n_maj = X_maj.shape[0]
858 | n_min = X_min.shape[0]
859 | if n_maj - self.n_samples < int(n_min * self.min_ratio):
860 | self.n_samples = n_maj - int(n_min * self.min_ratio)
861 | X_rus, X_idx = self.rus.sample(self.n_samples)
862 |
863 | if verbose:
864 | print ('{:<12s} | Iter: {} X_maj: {} X_rus: {} X_min: {}'.format(
865 | 'RUSBoost', iboost, len(X_maj), len(X_rus), len(X_min)))
866 |
867 | y_rus = y[np.where(y != self.minority_target)][X_idx]
868 | y_min = y[np.where(y == self.minority_target)]
869 |
870 | sample_weight_rus = \
871 | sample_weight[np.where(y != self.minority_target)][X_idx]
872 | sample_weight_min = \
873 | sample_weight[np.where(y == self.minority_target)]
874 |
875 | # Combine the minority and majority class samples.
876 | X_train = np.vstack((X_rus, X_min))
877 | y_train = np.append(y_rus, y_min)
878 |
879 | # Combine the weights.
880 | sample_weight_train = \
881 | np.append(sample_weight_rus, sample_weight_min).reshape(-1, 1)
882 | sample_weight_train = \
883 | np.squeeze(normalize(sample_weight_train, axis=0, norm='l1'))
884 |
885 | # Boosting step.
886 | _, estimator_weight_train, estimator_error = self._boost(
887 | iboost,
888 | X_train, y_train,
889 | sample_weight_train,
890 | random_state)
891 |
892 | y_predict_proba = self.estimators_[-1].predict_proba(X)
893 | y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1),
894 | axis=0)
895 | # Instances incorrectly classified
896 | incorrect = y_predict != y
897 | # Error fraction
898 | estimator_error = np.mean(
899 | np.average(incorrect, weights=sample_weight, axis=0))
900 | n_classes = self.n_classes_
901 | classes = self.classes_
902 | y_codes = np.array([-1. / (n_classes - 1), 1.])
903 | y_coding = y_codes.take(classes == y[:, np.newaxis])
904 |             estimator_weight = (-1. * self.learning_rate
905 |                                 * ((n_classes - 1.) / n_classes)
906 |                                 * (y_coding * np.log(np.clip(y_predict_proba, 1e-10, None))).sum(axis=1))
907 |
908 | if not iboost == self.n_estimators - 1:
909 | # Only boost positive weights
910 | sample_weight *= np.exp(estimator_weight * ((sample_weight > 0) | (estimator_weight < 0)))
911 |
912 | # Early termination.
913 | if sample_weight is None:
914 | break
915 |
916 | self.estimator_weights_[iboost] = estimator_weight_train
917 | self.estimator_errors_[iboost] = estimator_error
918 |
919 | # Stop if error is zero.
920 | # if estimator_error == 0:
921 | # print('error: {}'.format(estimator_error))
922 | # break
923 |
924 | sample_weight_sum = np.sum(sample_weight)
925 |
926 | # Stop if the sum of sample weights has become non-positive.
927 | if sample_weight_sum <= 0:
928 | break
929 |
930 | if iboost < self.n_estimators - 1:
931 | # Normalize.
932 | sample_weight /= sample_weight_sum
933 |
934 | return self
935 |
936 | import pandas as pd
937 | from imblearn.over_sampling import SMOTE as SMOTE_IMB
938 | from sklearn.tree import DecisionTreeClassifier as DT
939 |
940 | class SMOTEBagging():
941 | def __init__(self,
942 | n_samples=100,
943 | min_ratio=1.0,
944 | with_replacement=True,
945 | base_estimator=None,
946 | n_estimators=10,
947 | learning_rate=1.,
948 | algorithm='SAMME.R',
949 | random_state=None):
950 |
951 | self.base_estimator = base_estimator
952 | self.n_estimators = n_estimators
953 | self.random_state = random_state
954 | self.estimators_ = []
955 |
956 | def fit(self, X, y, verbose=False):
957 |
958 | self.total_training_instances = 0
959 | self.total_training_instances_list = []
960 | self.estimators_ = []
961 | df = pd.DataFrame(X); df['label'] = y
962 | df_maj = df[df['label']==0]; n_maj = len(df_maj)
963 | df_min = df[df['label']==1]; n_min = len(df_min)
964 | cols = df.columns.tolist(); cols.remove('label')
965 |
966 | for ibagging in range(self.n_estimators):
967 | b = min(0.1*((ibagging%10)+1), 1)
968 | train_maj = df_maj.sample(frac=1, replace=True)
969 | train_min = df_min.sample(frac=(n_maj/n_min)*b, replace=True)
970 | n_min_train = train_min.shape[0]
971 | N = int((n_maj/n_min_train)*(1-b)*100)
972 | ratio = min((n_min_train + N) / n_maj, 1)
973 | df_k = train_maj.append(train_min)
974 |
975 | if N > 0:
976 | X_train, y_train = SMOTE_IMB(
977 | k_neighbors=min(5, len(train_min)-1),
978 | ratio=ratio,
979 | random_state=self.random_state,
980 | ).fit_resample(
981 | df_k[cols], df_k['label']
982 | )
983 | else:
984 | X_train, y_train = df_k[cols], df_k['label']
985 |
986 | self.total_training_instances = self.total_training_instances + len(y_train)
987 | self.total_training_instances_list.append(self.total_training_instances)
988 | if verbose:
989 | print ('{:<12s} | Iter: {} |b: {:.1f}|n_train: {}|n_smote: {}|n_total_train: {}'.format(
990 | 'SMOTEBagging', ibagging, b, len(y_train), len(y_train)-len(df_k), self.total_training_instances))
991 | model = clone(self.base_estimator).fit(X_train, y_train)
992 | self.estimators_.append(model)
993 |
994 | return self
995 |
996 | def predict_proba(self, X):
997 |
998 | y_pred = np.array([model.predict_proba(X)[:, 1] for model in self.estimators_]).mean(axis=0)
999 | if y_pred.ndim == 1:
1000 | y_pred = y_pred[:, np.newaxis]
1001 | if y_pred.shape[1] == 1:
1002 | y_pred = np.append(1-y_pred, y_pred, axis=1)
1003 | return y_pred
1004 |
1005 | def predict(self, X):
1006 |
1007 |         y_pred_binarized = binarize(self.predict_proba(X)[:,1].reshape(1,-1), threshold=0.5)[0]
1008 |         return y_pred_binarized
1009 |
1010 |
1011 | import pandas as pd
1012 | from sklearn.tree import DecisionTreeClassifier as DT
1013 |
1014 | class UnderBagging():
1015 | def __init__(self,
1016 | n_samples=100,
1017 | min_ratio=1.0,
1018 | with_replacement=True,
1019 | base_estimator=None,
1020 | n_estimators=10,
1021 | learning_rate=1.,
1022 | algorithm='SAMME.R',
1023 | random_state=None):
1024 |
1025 | self.base_estimator = base_estimator
1026 | self.n_estimators = n_estimators
1027 | self.random_state = random_state
1028 | self.estimators_ = []
1029 |
1030 | def fit(self, X, y, verbose=False):
1031 |
1032 | self.estimators_ = []
1033 | df = pd.DataFrame(X); df['label'] = y
1034 | df_maj = df[df['label']==0]; n_maj = len(df_maj)
1035 | df_min = df[df['label']==1]; n_min = len(df_min)
1036 | cols = df.columns.tolist(); cols.remove('label')
1037 |
1038 | for ibagging in range(self.n_estimators):
1039 | train_maj = df_maj.sample(n=int(n_min), random_state=self.random_state)
1040 | train_min = df_min
1041 | if verbose:
1042 | print ('{:<12s} | Iter: {} X_maj: {} X_rus: {} X_min: {}'.format(
1043 | 'UnderBagging', ibagging, len(df_maj), len(train_maj), len(train_min)))
1044 | df_k = train_maj.append(train_min)
1045 | X_train, y_train = df_k[cols], df_k['label']
1046 | model = clone(self.base_estimator).fit(X_train, y_train)
1047 | self.estimators_.append(model)
1048 |
1049 | return self
1050 |
1051 | def predict_proba(self, X):
1052 |
1053 | y_pred = np.array([model.predict_proba(X)[:, 1] for model in self.estimators_]).mean(axis=0)
1054 | if y_pred.ndim == 1:
1055 | y_pred = y_pred[:, np.newaxis]
1056 | if y_pred.shape[1] == 1:
1057 | y_pred = np.append(1-y_pred, y_pred, axis=1)
1058 | return y_pred
1059 |
1060 | def predict(self, X):
1061 |
1062 |         y_pred_binarized = binarize(self.predict_proba(X)[:,1].reshape(1,-1), threshold=0.5)[0]
1063 |         return y_pred_binarized
1064 |
1065 |
1066 | from sklearn.base import clone
1067 | class BalanceCascade():
1068 | """
1069 | The implementation of BalanceCascade.
1070 | Hyper-parameters:
1071 | base_estimator : scikit-learn classifier object
1072 | optional (default=DecisionTreeClassifier)
1073 | The base estimator from which the ensemble is built.
1074 | n_estimators: Number of iterations / estimators
1075 | k_bins: Number of hardness bins
1076 | """
1077 | def __init__(self, base_estimator=DT(), n_estimators=10, random_state=None):
1078 |
1079 | self.base_estimator = base_estimator
1080 | self.n_estimators = n_estimators
1081 | self.random_state = random_state
1082 | self.estimators_ = []
1083 | # Will be set in the fit function
1084 | self.feature_cols = None
1085 |
1086 | def _fit_baselearner(self, df_train):
1087 |
1088 | model = clone(self.base_estimator)
1089 | return model.fit(df_train[self.feature_cols], df_train['label'])
1090 |
1091 | def fit(self, X, y, verbose=False, visualize=False):
1092 |
1093 | self.estimators_ = []
1094 | # Initialize majority & minority set
1095 | df = pd.DataFrame(X); df['label'] = y
1096 | df_maj = df[y==0]; n_maj = df_maj.shape[0]
1097 | df_min = df[y==1]; n_min = df_min.shape[0]
1098 | self.feature_cols = df.columns.tolist()
1099 | self.feature_cols.remove('label')
1100 |
1101 | ir = n_min / n_maj
1102 | keep_fp_rate = np.power(ir, 1/(self.n_estimators-1))
1103 |
1104 | # Algorithm start
1105 | for ibagging in range(1, self.n_estimators):
1106 | df_train = df_maj.sample(n=n_min).append(df_min)
1107 | if visualize:
1108 | df_train.plot.scatter(x=0, y=1, s=3, c='label', colormap='coolwarm', title='Iter {} training set'.format(ibagging))
1109 | if verbose:
1110 | print ('{:<12s} | Iter: {} X_maj: {} X_rus: {} X_min: {}'.format(
1111 | 'Cascade', ibagging, len(df_maj), len(df_min), len(df_min)))
1112 | self.estimators_.append(self._fit_baselearner(df_train))
1113 | # drop "easy" majority samples
1114 |             df_maj['pred_proba'] = self.predict_proba(df_maj[self.feature_cols])[:, 1]
1115 | df_maj = df_maj.sort_values(by='pred_proba', ascending=False)[:int(keep_fp_rate*len(df_maj)+1)]
1116 |
1117 | return self
1118 |
1119 | def predict_proba(self, X):
1120 |
1121 | y_pred = np.array([model.predict_proba(X)[:, 1] for model in self.estimators_]).mean(axis=0)
1122 | if y_pred.ndim == 1:
1123 | y_pred = y_pred[:, np.newaxis]
1124 | if y_pred.shape[1] == 1:
1125 | y_pred = np.append(1-y_pred, y_pred, axis=1)
1126 | return y_pred
1127 |
1128 | def predict(self, X):
1129 |
1130 |         y_pred_binarized = binarize(self.predict_proba(X)[:,1].reshape(1,-1), threshold=0.5)[0]
1131 |         return y_pred_binarized
1132 |
1133 | class SelfPacedEnsemble():
1134 | """ Self-paced Ensemble (SPE)
1135 |
1136 | Parameters
1137 | ----------
1138 |     base_estimator : object, optional (default=sklearn.tree.DecisionTreeClassifier())
1139 | | The base estimator to fit on self-paced under-sampled subsets of the dataset.
1140 | | NO need to support sample weighting.
1141 | | Built-in `fit()`, `predict()`, `predict_proba()` methods are required.
1142 |
1143 | hardness_func : function, optional
1144 | | (default=`lambda y_true, y_pred: np.absolute(y_true-y_pred)`)
1145 | | User-specified classification hardness function
1146 | | | Parameters:
1147 | | | | y_true: 1-d array-like, shape = [n_samples]
1148 | | | | y_pred: 1-d array-like, shape = [n_samples]
1149 | | | Returns:
1150 | | | | hardness: 1-d array-like, shape = [n_samples]
1151 |
1152 | n_estimators : integer, optional (default=10)
1153 | | The number of base estimators in the ensemble.
1154 |
1155 | k_bins : integer, optional (default=10)
1156 |         | The number of hardness bins used to approximate the hardness distribution.
1157 |
1158 | random_state : integer / RandomState instance / None, optional (default=None)
1159 | | If integer, random_state is the seed used by the random number generator;
1160 | | If RandomState instance, random_state is the random number generator;
1161 | | If None, the random number generator is the RandomState instance used by
1162 | | `numpy.random`.
1163 |
1164 | Attributes
1165 | ----------
1166 | base_estimator_ : estimator
1167 | | The base estimator from which the ensemble is grown.
1168 |
1169 | estimators_ : list of estimator
1170 | | The collection of fitted base estimators.
1171 |
1172 |
1173 | Example:
1174 | ```
1175 | import numpy as np
1176 | from sklearn import datasets
1177 | from sklearn.tree import DecisionTreeClassifier
1178 | from src.self_paced_ensemble import SelfPacedEnsemble
1179 | from src.utils import (
1180 | make_binary_classification_target, imbalance_train_test_split)
1181 |
1182 | X, y = datasets.fetch_covtype(return_X_y=True)
1183 | y = make_binary_classification_target(y, 7, True)
1184 | X_train, X_test, y_train, y_test = imbalance_train_test_split(
1185 | X, y, test_size=0.2, random_state=42)
1186 |
1187 | def absolute_error(y_true, y_pred):
1188 | # Self-defined classification hardness function
1189 | return np.absolute(y_true - y_pred)
1190 |
1191 | spe = SelfPacedEnsemble(
1192 | base_estimator=DecisionTreeClassifier(),
1193 | hardness_func=absolute_error,
1194 | n_estimators=10,
1195 | k_bins=10,
1196 | random_state=42,
1197 | ).fit(
1198 | X=X_train,
1199 | y=y_train,
1200 | )
1201 | print('auc_prc_score: {}'.format(spe.score(X_test, y_test)))
1202 | ```
1203 |
1204 | """
1205 | def __init__(self,
1206 | base_estimator=DecisionTreeClassifier(),
1207 | hardness_func=cross_entropy,
1208 | n_estimators=10,
1209 | k_bins=10,
1210 | random_state=None):
1211 | self.base_estimator = base_estimator
1212 | self.estimators_ = []
1213 | self._hardness_func = hardness_func
1214 | self._n_estimators = n_estimators
1215 | self._k_bins = k_bins
1216 | self._random_state = random_state
1217 |
1218 | def _fit_base_estimator(self, X, y):
1219 | """Private function used to train a single base estimator."""
1220 | return sklearn.base.clone(self.base_estimator).fit(X, y)
1221 |
1222 | def _random_under_sampling(self, X_maj, y_maj, X_min, y_min):
1223 | """Private function used to perform random under-sampling."""
1224 | np.random.seed(self._random_state)
1225 | idx = np.random.choice(len(X_maj), len(X_min), replace=False)
1226 | X_train = np.concatenate([X_maj[idx], X_min])
1227 | y_train = np.concatenate([y_maj[idx], y_min])
1228 | return X_train, y_train
1229 |
1230 | def _self_paced_under_sampling(self,
1231 | X_maj, y_maj, X_min, y_min, i_estimator):
1232 | """Private function used to perform self-paced under-sampling."""
1233 | # Update hardness value estimation
1234 | y_pred_maj = self.predict_proba(X_maj)[:, 1]
1235 | hardness = self._hardness_func(y_maj, y_pred_maj)
1236 |
1237 |         # If hardness values are not distinguishable, perform random sampling
1238 | if hardness.max() == hardness.min():
1239 | X_train, y_train = self._random_under_sampling(X_maj, y_maj, X_min, y_min)
1240 | # Else allocate majority samples into k hardness bins
1241 | else:
1242 | step = (hardness.max()-hardness.min()) / self._k_bins
1243 | bins = []; ave_contributions = []
1244 | for i_bins in range(self._k_bins):
1245 | idx = (
1246 | (hardness >= i_bins*step + hardness.min()) &
1247 | (hardness < (i_bins+1)*step + hardness.min())
1248 | )
1249 | # Marginal samples with highest hardness value -> kth bin
1250 | if i_bins == (self._k_bins-1):
1251 | idx = idx | (hardness==hardness.max())
1252 | bins.append(X_maj[idx])
1253 | ave_contributions.append(hardness[idx].mean())
1254 |
1255 | # Update self-paced factor alpha
1256 | alpha = np.tan(np.pi*0.5*(i_estimator/(self._n_estimators-1)))
1257 |             # Calculate sampling weight
1258 | weights = 1 / (ave_contributions + alpha)
1259 | weights[np.isnan(weights)] = 0
1260 |             # Calculate the number of samples to draw from each bin
1261 | n_sample_bins = len(X_min) * weights / weights.sum()
1262 | n_sample_bins = n_sample_bins.astype(int)+1
1263 |
1264 | # Perform self-paced under-sampling
1265 | sampled_bins = []
1266 | for i_bins in range(self._k_bins):
1267 | if min(len(bins[i_bins]), n_sample_bins[i_bins]) > 0:
1268 | np.random.seed(self._random_state)
1269 | idx = np.random.choice(
1270 | len(bins[i_bins]),
1271 | min(len(bins[i_bins]), n_sample_bins[i_bins]),
1272 | replace=False)
1273 | sampled_bins.append(bins[i_bins][idx])
1274 | X_train_maj = np.concatenate(sampled_bins, axis=0)
1275 | y_train_maj = np.full(X_train_maj.shape[0], y_maj[0])
1276 | X_train = np.concatenate([X_train_maj, X_min])
1277 | y_train = np.concatenate([y_train_maj, y_min])
1278 |
1279 | return X_train, y_train
1280 |
1281 | def fit(self, X, y, label_maj=0, label_min=1, verbose=False):
1282 | """Build a self-paced ensemble of estimators from the training set (X, y).
1283 |
1284 | Parameters
1285 | ----------
1286 | X : {array-like, sparse matrix} of shape = [n_samples, n_features]
1287 | The training input samples. Sparse matrices are accepted only if
1288 | they are supported by the base estimator.
1289 |
1290 | y : array-like, shape = [n_samples]
1291 | The target values (class labels).
1292 |
1293 | label_maj : int, bool or float, optional (default=0)
1294 | The majority class label, default to be negative class.
1295 |
1296 | label_min : int, bool or float, optional (default=1)
1297 | The minority class label, default to be positive class.
1298 |
1299 | Returns
1300 |         -------
1301 | self : object
1302 | """
1303 | self.estimators_ = []
1304 | # Initialize by spliting majority / minority set
1305 | X_maj = X[y==label_maj]; y_maj = y[y==label_maj]
1306 | X_min = X[y==label_min]; y_min = y[y==label_min]
1307 |
1308 | # Random under-sampling in the 1st round (cold start)
1309 | X_train, y_train = self._random_under_sampling(
1310 | X_maj, y_maj, X_min, y_min)
1311 | self.estimators_.append(
1312 | self._fit_base_estimator(
1313 | X_train, y_train))
1314 |
1315 | # Loop start
1316 | for i_estimator in range(1, self._n_estimators):
1317 | X_train, y_train = self._self_paced_under_sampling(
1318 | X_maj, y_maj, X_min, y_min, i_estimator,)
1319 | if verbose:
1320 | print ('{:<12s} | Iter: {} X_maj: {} X_min: {} alpha: {:.3f}'.format(
1321 | 'SPEnsemble', i_estimator, len(X_maj), len(X_min), np.tan(np.pi*0.5*(i_estimator/(self._n_estimators-1)))))
1322 | self.estimators_.append(
1323 | self._fit_base_estimator(
1324 | X_train, y_train))
1325 |
1326 | return self
1327 |
1328 | def predict_proba(self, X):
1329 | """Predict class probabilities for X.
1330 |
1331 |         The predicted class probabilities of an input sample are computed as
1332 |         the mean predicted class probabilities of the base estimators in the
1333 |         ensemble. If base estimators do not implement a ``predict_proba``
1334 |         method, then it resorts to voting and the predicted class probabilities
1335 |         of an input sample represent the proportion of estimators predicting
1336 | each class.
1337 |
1338 | Parameters
1339 | ----------
1340 | X : {array-like, sparse matrix} of shape = [n_samples, n_features]
1341 | The training input samples. Sparse matrices are accepted only if
1342 | they are supported by the base estimator.
1343 |
1344 | Returns
1345 | -------
1346 | p : array of shape = [n_samples, n_classes]
1347 | The class probabilities of the input samples.
1348 | """
1349 | y_pred = np.array(
1350 | [model.predict_proba(X)[:, 1] for model in self.estimators_]
1351 | ).mean(axis=0)
1352 | if y_pred.ndim == 1:
1353 | y_pred = y_pred[:, np.newaxis]
1354 | if y_pred.shape[1] == 1:
1355 | y_pred = np.append(1-y_pred, y_pred, axis=1)
1356 | return y_pred
1357 |
1358 | def predict(self, X):
1359 | """Predict class for X.
1360 |
1361 | The predicted class of an input sample is computed as the class with
1362 | the highest mean predicted probability. If base estimators do not
1363 | implement a ``predict_proba`` method, then it resorts to voting.
1364 |
1365 | Parameters
1366 | ----------
1367 | X : {array-like, sparse matrix} of shape = [n_samples, n_features]
1368 | The training input samples. Sparse matrices are accepted only if
1369 | they are supported by the base estimator.
1370 |
1371 | Returns
1372 | -------
1373 | y : array of shape = [n_samples]
1374 | The predicted classes.
1375 | """
1376 | y_pred_binarized = sklearn.preprocessing.binarize(
1377 | self.predict_proba(X)[:,1].reshape(1,-1), threshold=0.5)[0]
1378 | return y_pred_binarized
1379 |
1380 | def score(self, X, y):
1381 | """Returns the average precision score (equivalent to the area under
1382 | the precision-recall curve) on the given test data and labels.
1383 |
1384 | Parameters
1385 | ----------
1386 | X : array-like, shape = (n_samples, n_features)
1387 | Test samples.
1388 |
1389 | y : array-like, shape = (n_samples) or (n_samples, n_outputs)
1390 | True labels for X.
1391 |
1392 | Returns
1393 | -------
1394 | score : float
1395 | Average precision of self.predict_proba(X)[:, 1] wrt. y.
1396 | """
1397 | return sklearn.metrics.average_precision_score(
1398 | y, self.predict_proba(X)[:, 1])
--------------------------------------------------------------------------------
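
A minimal usage sketch of the baseline ensembles defined in `canonical_ensemble.py` above, fitted on a synthetic imbalanced dataset. The `baselines.canonical_ensemble` import path (running from the repository root) and the `make_classification` data are assumptions made for illustration only.

```
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score

# Assumed import path; run from the repository root.
from baselines.canonical_ensemble import SelfPacedEnsemble, SMOTEBagging, UnderBagging

# Synthetic 95:5 imbalanced binary task (class 0 = majority, class 1 = minority).
X, y = make_classification(n_samples=5000, weights=[0.95, 0.05], random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=42)

candidates = [
    SelfPacedEnsemble(
        base_estimator=DecisionTreeClassifier(),
        hardness_func=lambda y_true, y_pred: np.absolute(y_true - y_pred),
        n_estimators=10, k_bins=10, random_state=42),
    SMOTEBagging(base_estimator=DecisionTreeClassifier(), n_estimators=10),
    UnderBagging(base_estimator=DecisionTreeClassifier(), n_estimators=10),
]
for clf in candidates:
    clf.fit(X_tr, y_tr)
    aucprc = average_precision_score(y_te, clf.predict_proba(X_te)[:, 1])
    print('{:<18s} AUCPRC: {:.3f}'.format(type(clf).__name__, aucprc))
```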
/baselines/canonical_resampling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 13 14:32:27 2019
4 | @author: v-zhinli
5 | mailto: znliu19@mails.jlu.edu.cn / zhining.liu@outlook.com
6 | """
7 |
8 | from imblearn.under_sampling import (
9 | ClusterCentroids,
10 | NearMiss,
11 | RandomUnderSampler,
12 | EditedNearestNeighbours,
13 | AllKNN,
14 | TomekLinks,
15 | OneSidedSelection,
16 | RepeatedEditedNearestNeighbours,
17 | CondensedNearestNeighbour,
18 | NeighbourhoodCleaningRule,
19 | )
20 | from imblearn.over_sampling import (
21 | RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE,
22 | )
23 | from imblearn.combine import (
24 | SMOTEENN, SMOTETomek,
25 | )
26 |
27 | from sklearn.tree import DecisionTreeClassifier as DT
28 | from collections import Counter
29 | from time import perf_counter  # time.clock() was removed in Python 3.8
30 | import pandas as pd
31 |
32 | class Error(Exception):
33 | pass
34 |
35 | class Resample_classifier(object):
36 | '''
37 | Re-sampling methods for imbalance classification, based on imblearn python package.
38 | imblearn url: https://github.com/scikit-learn-contrib/imbalanced-learn
39 | Hyper-parameters:
40 | base_estimator : scikit-learn classifier object
41 | optional (default=DecisionTreeClassifier)
42 | The base estimator used for training after re-sampling
43 | '''
44 | def __init__(self, base_estimator=DT(), resample_by='ORG'):
45 | self.base_estimator = base_estimator
46 | self.resample_by = resample_by
47 |
48 | def fit(self, X_train, y_train, verbose=False):
49 |         start_time = perf_counter()
50 |         X_train_resampled, y_train_resampled = self.resample(X_train, y_train, by=self.resample_by)
51 |         end_time = perf_counter()
52 | self._last_resample_info = 'Resampling method: {}, class distribution from {} to {}, time used {}s'.format(
53 | self.resample_by, dict(Counter(y_train)), dict(Counter(y_train_resampled)), end_time - start_time,
54 | )
55 | if verbose:
56 | print (self._last_resample_info)
57 | self.base_estimator.fit(X_train_resampled, y_train_resampled)
58 |
59 | def predict(self, X):
60 | return self.base_estimator.predict(X)
61 |
62 | def predict_proba(self, X):
63 | return self.base_estimator.predict_proba(X)
64 |
65 | def resample(self, X, y, by, random_state=None):
66 | '''
67 | by: String
68 | The method used to perform re-sampling
69 |             currently supported: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
70 |             'NM', 'CC', 'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
71 |             'ORG']
72 | '''
73 | if by == 'RUS':
74 | sampler = RandomUnderSampler(random_state=random_state)
75 | elif by == 'CNN':
76 | sampler = CondensedNearestNeighbour(random_state=random_state)
77 | elif by == 'ENN':
78 | sampler = EditedNearestNeighbours(random_state=random_state)
79 | elif by == 'NCR':
80 | sampler = NeighbourhoodCleaningRule(random_state=random_state)
81 | elif by == 'Tomek':
82 | sampler = TomekLinks(random_state=random_state)
83 | elif by == 'ALLKNN':
84 | sampler = AllKNN(random_state=random_state)
85 | elif by == 'OSS':
86 | sampler = OneSidedSelection(random_state=random_state)
87 | elif by == 'NM':
88 | sampler = NearMiss(random_state=random_state)
89 | elif by == 'CC':
90 | sampler = ClusterCentroids(random_state=random_state)
91 | elif by == 'ROS':
92 | sampler = RandomOverSampler(random_state=random_state)
93 | elif by == 'SMOTE':
94 | sampler = SMOTE(random_state=random_state)
95 | elif by == 'ADASYN':
96 | sampler = ADASYN(random_state=random_state)
97 | elif by == 'BorderSMOTE':
98 | sampler = BorderlineSMOTE(random_state=random_state)
99 | elif by == 'SMOTEENN':
100 | sampler = SMOTEENN(random_state=random_state)
101 | elif by == 'SMOTETomek':
102 | sampler = SMOTETomek(random_state=random_state)
103 | elif by == 'ORG':
104 | sampler = None
105 | else:
106 | raise Error('Unexpected \'by\' type {}'.format(by))
107 |
108 | if by != 'ORG':
109 | X_train, y_train = sampler.fit_resample(X, y)
110 | else:
111 | X_train, y_train = X, y
112 |
113 | return X_train, y_train
--------------------------------------------------------------------------------
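
A minimal sketch of how the `Resample_classifier` above might be used; the synthetic data and the `baselines.canonical_resampling` import path are illustrative assumptions.

```
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

# Assumed import path; run from the repository root.
from baselines.canonical_resampling import Resample_classifier

X, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

# Any key from the list documented in resample() works, e.g. 'RUS', 'SMOTE', 'SMOTEENN'.
clf = Resample_classifier(resample_by='SMOTE')
clf.fit(X_tr, y_tr, verbose=True)   # prints the class distribution before/after resampling
print('AUCPRC: {:.3f}'.format(
    average_precision_score(y_te, clf.predict_proba(X_te)[:, 1])))
```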
/environment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 8 02:27:20 2020
4 | @author: ZhiningLiu1998
5 | mailto: zhining.liu@outlook.com / v-zhinli@microsoft.com
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | import sklearn
11 | import warnings
12 | warnings.filterwarnings("ignore")
13 |
14 | from utils import (
15 | Rater, meta_sampling, histogram_error_distribution, imbalance_train_test_split,
16 | )
17 |
18 | class Ensemble():
19 | """A basic ensemble learning framework.
20 |
21 | Parameters
22 | ----------
23 | base_estimator : object (scikit-learn classifier)
24 | The base estimator used to build ensemble classifiers.
25 | NO need to support sample weighting.
26 | Built-in `fit()`, `predict()`, `predict_proba()` methods are required.
27 |
28 | Attributes
29 | ----------
30 | base_estimator_ : estimator
31 | The base estimator from which the ensemble is grown.
32 |
33 | estimators_ : list of estimators
34 | The collection of fitted sub-estimators.
35 | """
36 | def __init__(self, base_estimator):
37 | self.estimators_ = []
38 | if not sklearn.base.is_classifier(base_estimator):
39 | raise TypeError(f'Base estimator {base_estimator} is not a sklearn classifier.')
40 | self.base_estimator_ = base_estimator
41 |
42 | def fit_step(self, X, y):
43 |         """Build a new base classifier from the training set (X, y).
44 |
45 |         Parameters
46 |         ----------
47 |         X : array-like of shape = [n_samples, n_features]
48 |             The training instances.
49 |
50 |         y : array-like of shape = [n_samples]
51 |             The training labels.
52 |
53 | Returns
54 | ----------
55 | self : object (Ensemble)
56 | """
57 | self.estimators_.append(
58 | sklearn.base.clone(self.base_estimator_).fit(X, y)
59 | )
60 | return self
61 |
62 | def predict_proba(self, X):
63 | """Predict class probabilities for X.
64 |
65 |         The predicted class probabilities of an input sample are computed as the
66 | mean predicted class probabilities of the classifiers in the ensemble.
67 |
68 | Parameters
69 | ----------
70 | X : array-like of shape = [n_samples, n_features]
71 | The input data instances.
72 |
73 | Returns
74 | ----------
75 | p : array-like of shape [n_samples, n_classes]
76 | The class probabilities of the input samples.
77 | """
78 | y_pred = np.array(
79 | [model.predict_proba(X)[:, 1] for model in self.estimators_]
80 | ).mean(axis=0)
81 | if y_pred.ndim == 1:
82 | y_pred = y_pred[:, np.newaxis]
83 | if y_pred.shape[1] == 1:
84 | y_pred = np.append(1-y_pred, y_pred, axis=1)
85 | return y_pred
86 |
87 | def predict(self, X):
88 | """Predict classes for X.
89 |
90 | The predicted class of an input sample is computed as the mean
91 | prediction of the classifiers in the ensemble.
92 |
93 | Parameters
94 | ----------
95 | X : array-like of shape = [n_samples, n_features]
96 | The input data instances.
97 |
98 | Returns
99 | ----------
100 | y : array-like of shape = [n_samples]
101 | The predicted classes.
102 | """
103 | y_pred_binarized = sklearn.preprocessing.binarize(
104 | self.predict_proba(X)[:,1].reshape(1,-1), threshold=0.5)[0]
105 | return y_pred_binarized
106 |
107 | def score(self, X, y):
108 | """Return area under precision recall curve (AUCPRC) scores for X, y.
109 |
110 | Parameters
111 | ----------
112 | X : array-like of shape = [n_samples, n_features]
113 | The input data instances.
114 |
115 | y : array-like of shape = [n_samples]
116 | Labels for X.
117 |
118 |         Returns
119 |         ----------
120 |         score : float
121 |         """
122 |         return sklearn.metrics.average_precision_score(
123 | y, self.predict_proba(X)[:, 1])
124 |
125 | class EnsembleTrainingEnv(Ensemble):
126 | """The ensemble training environment in MESA.
127 |
128 | Parameters
129 | ----------
130 | args : arguments
131 | See arguments.py for more information.
132 |
133 | base_estimator : object (scikit-learn classifier)
134 | The base estimator used to build ensemble classifiers.
135 | NO need to support sample weighting.
136 | Built-in `fit()`, `predict()`, `predict_proba()` methods are required.
137 |
138 | Attributes
139 | ----------
140 | args : arguments
141 |
142 | rater : object (Rater)
143 |         Rater for evaluating classifier performance on class-imbalanced data.
144 | See arguments.py for more information.
145 |
146 | base_estimator_ : object (scikit-learn classifier)
147 | The base estimator from which the ensemble is grown.
148 |
149 | estimators_ : list of classifiers
150 | The collection of fitted sub-estimators.
151 | """
152 | def __init__(self, args, base_estimator):
153 |
154 | super(EnsembleTrainingEnv, self).__init__(
155 | base_estimator=base_estimator)
156 |
157 | self.base_estimator_ = base_estimator
158 | self.args = args
159 | self.rater = Rater(metric=args.metric)
160 |
161 | def load_data(self, X_train, y_train, X_valid, y_valid, X_test=None, y_test=None, train_ratio=1):
162 | """Load and preprocess the train/valid/test data into the environment."""
163 | self.flag_use_test_set = False if X_test is None or y_test is None else True
164 | if train_ratio < 1:
165 | print ('Using {:.2%} random subset for meta-training.'.format(train_ratio))
166 | _, X_train, _, y_train = imbalance_train_test_split(X_train, y_train, test_size=train_ratio)
167 | self.X_train, self.y_train = pd.DataFrame(X_train), pd.Series(y_train)
168 | self.X_valid, self.y_valid = pd.DataFrame(X_valid), pd.Series(y_valid)
169 | self.X_test, self.y_test = pd.DataFrame(X_test), pd.Series(y_test)
170 | self.mask_maj_train, self.mask_min_train = (y_train==0), (y_train==1)
171 | self.mask_maj_valid, self.mask_min_valid = (y_valid==0), (y_valid==1)
172 | self.n_min_samples = self.mask_min_train.sum()
173 | n_samples = int(self.n_min_samples*self.args.train_ir)
174 | if n_samples > self.mask_maj_train.sum():
175 | raise ValueError(f"\
176 | Argument 'train_ir' should be smaller than imbalance ratio,\n \
177 | Please set this parameter to < {self.mask_maj_train.sum()/self.mask_min_train.sum()}.\
178 | ")
179 | self.n_samples = n_samples
180 |
181 | def init(self):
182 | """Reset the environment."""
183 | self.estimators_ = []
184 | # buffer the predict probabilities for better efficiency
185 | # initialize
186 | self.y_pred_train_buffer = np.zeros_like(self.y_train)
187 | self.y_pred_valid_buffer = np.zeros_like(self.y_valid)
188 | if self.flag_use_test_set:
189 | self.y_pred_test_buffer = np.zeros_like(self.y_test)
190 | self._warm_up()
191 |
192 | def get_state(self):
193 | """Fetch the current state of the environment."""
194 | hist_train = histogram_error_distribution(
195 | self.y_train[self.mask_maj_train],
196 | self.y_pred_train_buffer[self.mask_maj_train],
197 | self.args.num_bins)
198 | hist_valid = histogram_error_distribution(
199 | self.y_valid[self.mask_maj_valid],
200 | self.y_pred_valid_buffer[self.mask_maj_valid],
201 | self.args.num_bins)
202 | hist_train = hist_train / hist_train.sum() * self.args.num_bins
203 | hist_valid = hist_valid / hist_valid.sum() * self.args.num_bins
204 | state = np.concatenate([hist_train, hist_valid])
205 | return state
206 |
207 | def step(self, action, verbose=False):
208 | """Perform an environment step.
209 |
210 | Parameters
211 | ----------
212 | action: float, in [0, 1]
213 | The action (mu) to execute in the environment.
214 |
215 | verbose: bool, optional (default=False)
216 | Whether to compute and return the information about the current ensemble.
217 |
218 | Returns
219 | ----------
220 | next_state : array-like of shape [state_size]
221 | The state of the environment after executing the action.
222 |
223 | reward : float
224 | The reward of taking the action.
225 |
226 | done : bool
227 | Indicates the end of an episode.
228 | True if the ensemble reaches the maximum number of base estimators.
229 |
230 | info : string
231 | Information about the current ensemble.
232 | Empty string if verbose == False.
233 | """
234 | # check action value
235 | if action < 0 or action > 1:
236 | raise ValueError("Action must be a float in [0, 1].")
237 |
238 | # perform meta-sampling
239 | X_maj_subset = meta_sampling(
240 | y_pred = self.y_pred_train_buffer[self.mask_maj_train],
241 | y_true = self.y_train[self.mask_maj_train],
242 | n_under_samples = self.n_samples,
243 | X = self.X_train[self.mask_maj_train],
244 | mu = action,
245 | sigma = self.args.sigma,
246 | random_state = self.args.random_state,)
247 | # build training subset (X_train_iter, y_train_iter)
248 | X_train_iter = pd.concat([X_maj_subset, self.X_train[self.mask_min_train]]).values
249 | y_train_iter = np.concatenate([np.zeros(X_maj_subset.shape[0]), np.ones(self.n_min_samples)])
250 |
251 | score_valid_before = self.rater.score(self.y_valid, self.y_pred_valid_buffer)
252 |
253 | # build a new base classifier from (X_train_iter, y_train_iter)
254 | self.fit_step(X_train_iter, y_train_iter)
255 | self.update_all_pred_buffer()
256 |
257 | score_valid = self.rater.score(self.y_valid, self.y_pred_valid_buffer)
258 |
259 | # obtain return values
260 | next_state = self.get_state()
261 | reward = score_valid - score_valid_before
262 | done = True if len(self.estimators_) >= self.args.max_estimators else False
263 | info = ''
264 |
265 | # fetch environment information if verbose==True
266 |         if self.args.meta_verbose == 'full' or verbose:
267 | score_train = self.rater.score(self.y_train, self.y_pred_train_buffer)
268 | score_test = self.rater.score(self.y_test, self.y_pred_test_buffer) if self.flag_use_test_set else 'NULL'
269 | info = 'k={:<3d}|{}| train {:.3f} | valid {:.3f} | '.format(
270 | len(self.estimators_)-1, self.args.metric, score_train, score_valid)
271 | info += 'test {:.3f}'.format(score_test) if self.flag_use_test_set else 'test NULL'
272 |
273 | return next_state, reward, done, info
274 |
275 | def update_all_pred_buffer(self):
276 | """Update all buffered predict probabilities."""
277 | n_clf = len(self.estimators_)
278 | self.y_pred_train_buffer = self._update_pred_buffer(n_clf, self.X_train, self.y_pred_train_buffer)
279 | self.y_pred_valid_buffer = self._update_pred_buffer(n_clf, self.X_valid, self.y_pred_valid_buffer)
280 | if self.flag_use_test_set:
281 | self.y_pred_test_buffer = self._update_pred_buffer(n_clf, self.X_test, self.y_pred_test_buffer)
282 | return
283 |
284 | def _update_pred_buffer(self, n_clf, X, y_pred_buffer):
285 | """Update buffered predict probabilities.
286 |
287 | Parameters
288 | ----------
289 | n_clf : int
290 | Current ensemble size.
291 |
292 | X : array-like of shape = [n_samples, n_features]
293 | The input data instances.
294 |
295 | y_pred_buffer : array-like of shape [n_samples]
296 | The buffered predict probabilities of X.
297 |
298 | Returns
299 | ----------
300 | y_pred_updated : array-like of shape [n_samples]
301 | """
302 | y_pred_last_clf = self.estimators_[-1].predict_proba(X)[:, 1]
303 | y_pred_buffer_updated = (y_pred_buffer * (n_clf-1) + y_pred_last_clf) / n_clf
304 | return y_pred_buffer_updated
305 |
306 | def _warm_up(self):
307 | """Train the first base classifier with random under-sampling."""
308 | X_maj = self.X_train[self.mask_maj_train]
309 | X_min = self.X_train[self.mask_min_train]
310 | X_maj_rus = X_maj.sample(n=self.n_samples, random_state=self.args.random_state)
311 | # X_maj_rus = X_maj
312 | X_train_rus = pd.concat([X_maj_rus, X_min]).values
313 | y_train_rus = np.concatenate([np.zeros(X_maj_rus.shape[0]), np.ones(X_min.shape[0])])
314 | self.fit_step(X_train_rus, y_train_rus)
315 | self.update_all_pred_buffer()
316 | return
--------------------------------------------------------------------------------
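
To make the meta-state built by `get_state()` above concrete: it concatenates two error histograms (training and validation) computed over majority-class samples, each normalized to sum to `num_bins`. The sketch below only approximates `histogram_error_distribution` (defined in `utils.py`, not shown here) with a plain `np.histogram` call; treat the exact binning as an assumption.

```
import numpy as np

def error_histogram(y_true, y_pred_proba, num_bins):
    # Per-sample prediction error in [0, 1], binned into num_bins equal-width bins.
    errors = np.absolute(y_true - y_pred_proba)
    hist, _ = np.histogram(errors, bins=num_bins, range=(0, 1))
    return hist

num_bins = 5
rng = np.random.RandomState(0)
y_true = np.zeros(100)                                      # majority-class labels
y_pred_train, y_pred_valid = rng.rand(100), rng.rand(100)   # buffered ensemble probabilities

hist_train = error_histogram(y_true, y_pred_train, num_bins)
hist_valid = error_histogram(y_true, y_pred_valid, num_bins)
state = np.concatenate([hist_train / hist_train.sum() * num_bins,
                        hist_valid / hist_valid.sum() * num_bins])
print(state.shape)   # (2 * num_bins,), matching state_size in mesa.py
```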
/main.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import pandas as pd
3 | import numpy as np
4 | import time
5 | from mesa import Mesa
6 | from arguments import parser
7 | from utils import Rater, load_dataset
8 | from sklearn.tree import DecisionTreeClassifier
9 |
10 | if __name__ == '__main__':
11 |
12 | # load dataset & prepare environment
13 | args = parser.parse_args()
14 | rater = Rater(args.metric)
15 | X_train, y_train, X_valid, y_valid, X_test, y_test = load_dataset(args.dataset)
16 | base_estimator = DecisionTreeClassifier(max_depth=None)
17 |
18 | # meta-training
19 | print ('\nStart meta-training of MESA ... ...\n')
20 | mesa = Mesa(
21 | args=args,
22 | base_estimator=base_estimator,
23 | n_estimators=args.max_estimators)
24 | mesa.meta_fit(X_train, y_train, X_valid, y_valid, X_test, y_test)
25 |
26 | # test
27 | print ('\nStart ensemble training of MESA ... ...\n')
28 | runs = 50
29 | scores_list, time_list = [], []
30 | for i_run in tqdm(range(runs)):
31 |         start_time = time.perf_counter()
32 |         mesa.fit(X_train, y_train, X_valid, y_valid, verbose=False)
33 |         end_time = time.perf_counter()
34 | time_list.append(end_time - start_time)
35 | score_train = rater.score(y_train, mesa.predict_proba(X_train)[:,1])
36 | score_valid = rater.score(y_valid, mesa.predict_proba(X_valid)[:,1])
37 | score_test = rater.score(y_test, mesa.predict_proba(X_test)[:,1])
38 | scores_list.append([score_train, score_valid, score_test])
39 |
40 | # print results to stdout
41 | df_scores = pd.DataFrame(scores_list, columns=['train', 'valid', 'test'])
42 | info = f'Dataset: {args.dataset}\nMESA {args.metric}|'
43 | for column in df_scores.columns:
44 | info += ' {} {:.3f}-{:.3f} |'.format(column, df_scores.mean()[column], df_scores.std()[column])
45 | info += ' {} runs (mean-std) |'.format(runs)
46 | info += ' ave run time: {:.2f}s'.format(np.mean(time_list))
47 | print (info)
--------------------------------------------------------------------------------
/mesa.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 8 02:27:20 2020
4 | @author: ZhiningLiu1998
5 | mailto: zhining.liu@outlook.com / v-zhinli@microsoft.com
6 | """
7 |
8 | import os
9 | import torch
10 | import pandas as pd
11 | import numpy as np
12 | from gym import spaces
13 | from sac_src.sac import SAC
14 | from sac_src.replay_memory import ReplayMemory
15 | from environment import EnsembleTrainingEnv
16 | from utils import *
17 |
18 | class Mesa(EnsembleTrainingEnv):
19 | """The ensemble imbalanced learning framework MESA.
20 |
21 | Parameters
22 | ----------
23 | args : arguments
24 | See arguments.py for more information.
25 |
26 | base_estimator : scikit-learn classifier object
27 | The base estimator used to build ensemble classifiers.
28 | NO need to support sample weighting.
29 | Built-in `fit()`, `predict()`, `predict_proba()` methods are required.
30 |
31 | n_estimators : int, optional (default=10)
32 | The number of base estimators used to form an MESA ensemble.
33 |
34 | Attributes
35 | ----------
36 | args : arguments
37 |
38 | rater : object (Rater)
39 | Rater for evaluate classifiers performance on class imabalanced data.
40 | See arguments.py for more information.
41 |
42 | base_estimator_ : object (scikit-learn classifier)
43 | The base estimator from which the ensemble is grown.
44 |
45 | estimators_ : list of classifiers
46 | The collection of fitted sub-estimators.
47 |
48 | n_estimators : int
49 | The number of base estimators used to form an MESA ensemble.
50 |
51 | meta_sampler : object (SAC)
52 | The meta-sampler in MESA.
53 |
54 | env : object (EnsembleTrainingEnv)
55 | The ensemble training environment in MESA.
56 |
57 | memory : object (ReplayMemory)
58 | The replay memory for Soft Actor-Critic training.
59 | """
60 | def __init__(self, args, base_estimator, n_estimators=10):
61 |
62 | super(Mesa, self).__init__(args, base_estimator)
63 |
64 | # state-size = 2 x num_bins
65 | state_size = int(args.num_bins*2)
66 | action_space = spaces.Box(low=0.0, high=1.0, shape=[1], dtype=np.float32)
67 |
68 | self.args = args
69 | self.n_estimators = n_estimators
70 | self.base_estimator_ = base_estimator
71 | self.meta_sampler = SAC(state_size, action_space, self.args)
72 | self.env = EnsembleTrainingEnv(args, base_estimator)
73 | self.memory = ReplayMemory(self.args.replay_size)
74 |
75 | def meta_fit(self, X_train, y_train, X_valid, y_valid, X_test=None, y_test=None):
76 | """Meta-training process of MESA.
77 |
78 | Parameters
79 | ----------
80 | X_train : array-like of shape = [n_training_samples, n_features]
81 | The training data instances.
82 |
83 | y_train : array-like of shape = [n_training_samples]
84 | Labels for X_train.
85 |
86 | X_valid : array-like of shape = [n_validation_samples, n_features]
87 | The validation data instances.
88 |
89 | y_valid : array-like of shape = [n_validation_samples]
90 | Labels for X_valid.
91 |
92 |         X_test : array-like of shape = [n_test_samples, n_features], optional (default=None)
93 |             The test data instances.
94 |
95 |         y_test : array-like of shape = [n_test_samples], optional (default=None)
96 |             Labels for X_test.
97 |
98 | Returns
99 | ----------
100 | self : object (Mesa)
101 | """
102 | # initialize replay memory and environment
103 | self.env.load_data(X_train, y_train, X_valid, y_valid, X_test, y_test, train_ratio=self.args.train_ratio)
104 | self.memory = memory_init_fulfill(self.args, ReplayMemory(self.args.replay_size))
105 |
106 | self.scores = []
107 | total_steps = self.args.update_steps + self.args.start_steps
108 | num_steps, num_updates, num_episodes = 0, 0, 0
109 |
110 | # start meta-training
111 | while num_steps < total_steps:
112 | self.env.init()
113 | state = self.env.get_state()
114 | done = False
115 |
116 | # for each episode
117 | while not done:
118 | num_steps += 1
119 |
120 | # take an action
121 | if num_steps >= self.args.start_steps:
122 | action, by = self.meta_sampler.select_action(state), 'mesa'
123 | else:
124 | action, by = self.meta_sampler.action_space.sample(), 'rand'
125 |
126 | # store transition
127 | next_state, reward, done, info = self.env.step(action[0])
128 | reward = reward * self.args.reward_coefficient
129 | self.memory.push(state, action, reward, next_state, float(done))
130 |
131 | # update meta-sampler parameters
132 | if num_steps > self.args.start_steps:
133 | for i in range(self.args.updates_per_step):
134 | _, _, _, _, _ = self.meta_sampler.update_parameters(
135 | self.memory, self.args.batch_size, num_updates)
136 | num_updates += self.args.updates_per_step
137 |
138 | # print log to stdout
139 |                     if self.args.meta_verbose == 'full':
140 | print ('Epi.{:<4d} updates{:<4d}| {} | {} by {}'.format(num_episodes, num_updates, info, action[0], by))
141 |
142 | if done:
143 | num_episodes += 1
144 | self.record_scores()
145 |                     # print the mean score of the latest args.meta_verbose_mean_episodes episodes to stdout
146 | self.verbose_mean_scores(num_episodes, num_updates, by)
147 |
148 | return self
149 |
150 | def record_scores(self):
151 | """Record the training/validation/test performance scores."""
152 | train_score = self.env.rater.score(self.env.y_train, self.env.y_pred_train_buffer)
153 | valid_score = self.env.rater.score(self.env.y_valid, self.env.y_pred_valid_buffer)
154 | test_score = self.env.rater.score(self.env.y_test, self.env.y_pred_test_buffer) if self.env.flag_use_test_set else 'NULL'
155 | self.scores.append([train_score, valid_score, test_score] if self.env.flag_use_test_set else [train_score, valid_score])
156 | return
157 |
158 | def verbose_mean_scores(self, num_episodes, num_updates, by):
159 | """Print mean score of latest n episodes to stdout.
160 |
161 | n = args.meta_verbose_mean_episodes
162 |
163 | Parameters
164 | ----------
165 | num_episodes : int
166 | The number of finished meta-training episodes.
167 |
168 | num_updates : int
169 | The number of finished meta-sampler updates.
170 |
171 | by : {'rand', 'mesa'}, string
172 | The way of selecting actions in the current episode.
173 | """
174 |         if self.args.meta_verbose == 'full' or (self.args.meta_verbose != 0 and num_episodes % self.args.meta_verbose == 0):
175 | view_bound = max(-self.args.meta_verbose_mean_episodes, -len(self.scores))
176 | recent_scores_mean = np.array(self.scores)[view_bound:].mean(axis=0)
177 | print ('Epi.{:<4d} updates {:<4d} |last-{}-mean-{}| train {:.3f} | valid {:.3f} | test {:.3f} | by {}'.format(
178 | num_episodes, num_updates, self.args.meta_verbose_mean_episodes, self.args.metric,
179 | recent_scores_mean[0], recent_scores_mean[1], recent_scores_mean[2], by))
180 | return
181 |
182 | def fit(self, X, y, X_valid, y_valid, n_estimators=None, verbose=False):
183 | """Build a MESA ensemble from training set (X, y) and validation set (X_valid, y_valid).
184 |
185 | Parameters
186 | ----------
187 | X : array-like of shape = [n_training_samples, n_features]
188 | The training data instances.
189 |
190 | y : array-like of shape = [n_training_samples]
191 | Labels for X.
192 |
193 | X_valid : array-like of shape = [n_validation_samples, n_features]
194 | The validation data instances.
195 |
196 | y_valid : array-like of shape = [n_validation_samples]
197 | Labels for X_valid.
198 |
199 | n_estimators : int, optional (default=self.n_estimators)
200 | The number of base estimators used to form an MESA ensemble.
201 |
202 | verbose: bool, optional (default=False)
203 | Whether to print progress messages to stdout.
204 |
205 | Returns
206 | ----------
207 | self : object (Mesa)
208 | """
209 | n_estimators = self.n_estimators if n_estimators is None else n_estimators
210 | self.load_data(X, y, X_valid, y_valid)
211 | self.init()
212 | self.actions_record = []
213 | for i in range(n_estimators-1):
214 | state = self.get_state()
215 | action = self.meta_sampler.select_action(state)
216 | self.actions_record.append(action[0])
217 | _, _, _, info = self.step(action[0], verbose)
218 | if verbose:
219 | print ('{:<12s} | action: {} {}'.format('Mesa', action, info))
220 | return self
221 |
222 | def save_meta_sampler(self, directory='save_model', suffix='meta_sampler'):
223 | """Save trained meta-sampler to files.
224 |
225 | Parameters
226 | ----------
227 | directory : string, optional (default='save_model')
228 | The directory to save files.
229 | Create the directory if it does not exist.
230 |
231 | suffix : string, optional (default='meta_sampler')
232 | The actor network will be saved in {directory}/actor_{suffix}.
233 | The critic network will be saved in {directory}/critic_{suffix}.
234 | """
235 | directory_path = f'{directory}/'
236 | if not os.path.exists(directory_path):
237 | os.makedirs(directory_path)
238 | actor_path = f'{directory_path}actor_{suffix}'
239 | critic_path = f'{directory_path}critic_{suffix}'
240 | self.meta_sampler.save_model(actor_path, critic_path)
241 | return
242 |
243 | def load_meta_sampler(self, directory='save_model', suffix='meta_sampler'):
244 | """Load trained meta-sampler from files.
245 |
246 | Parameters
247 | ----------
248 | directory : string, optional (default='save_model')
249 | The directory to load files.
250 |
251 | suffix : string, optional (default='meta_sampler')
252 | The actor network will be loaded from {directory}/actor_{suffix}.
253 | The critic network will be loaded from {directory}/critic_{suffix}.
254 | """
255 | directory_path = f'{directory}/'
256 | actor_path = f'{directory_path}actor_{suffix}'
257 | critic_path = f'{directory_path}critic_{suffix}'
258 | self.meta_sampler.load_model(actor_path, critic_path)
259 | return self
--------------------------------------------------------------------------------
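
An end-to-end sketch mirroring `main.py` plus the save/load round trip documented above. It assumes the defaults declared in `arguments.py` are usable as-is (`parser.parse_args([])`); adjust the arguments for a real run.

```
from sklearn.tree import DecisionTreeClassifier
from arguments import parser
from utils import load_dataset
from mesa import Mesa

# Assumption: every argument in arguments.py has a workable default value.
args = parser.parse_args([])
X_train, y_train, X_valid, y_valid, X_test, y_test = load_dataset(args.dataset)

mesa = Mesa(args=args, base_estimator=DecisionTreeClassifier(), n_estimators=10)
mesa.meta_fit(X_train, y_train, X_valid, y_valid, X_test, y_test)   # train the meta-sampler
mesa.save_meta_sampler(directory='save_model', suffix='meta_sampler')

# Later / elsewhere: reload the meta-sampler and build a new ensemble with it.
mesa.load_meta_sampler(directory='save_model', suffix='meta_sampler')
mesa.fit(X_train, y_train, X_valid, y_valid, verbose=True)
print(mesa.predict(X_test)[:10])
```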
/requirements.txt:
--------------------------------------------------------------------------------
1 | # torch==1.0.0
2 | gym==0.17.3
3 | tqdm==4.28.1
4 | pandas==0.23.4
5 | numpy==1.15.4
6 | seaborn==0.9.0
7 | imbalanced-learn==0.5.0
8 | scikit-learn==0.21
9 | jupyter==1.0.0
--------------------------------------------------------------------------------
/sac_src/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.distributions import Normal
5 |
6 | LOG_SIG_MAX = 2
7 | LOG_SIG_MIN = -20
8 | epsilon = 1e-6
9 |
10 | # Initialize Policy weights
11 | def weights_init_(m):
12 | if isinstance(m, nn.Linear):
13 | torch.nn.init.xavier_uniform_(m.weight, gain=1)
14 | torch.nn.init.constant_(m.bias, 0)
15 |
16 |
17 | class ValueNetwork(nn.Module):
18 | def __init__(self, num_inputs, hidden_dim):
19 | super(ValueNetwork, self).__init__()
20 |
21 | self.linear1 = nn.Linear(num_inputs, hidden_dim)
22 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
23 | self.linear3 = nn.Linear(hidden_dim, 1)
24 |
25 | self.apply(weights_init_)
26 |
27 | def forward(self, state):
28 | x = F.relu(self.linear1(state))
29 | x = F.relu(self.linear2(x))
30 | x = self.linear3(x)
31 | return x
32 |
33 |
34 | class QNetwork(nn.Module):
35 | def __init__(self, num_inputs, num_actions, hidden_dim):
36 | super(QNetwork, self).__init__()
37 |
38 | # Q1 architecture
39 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim)
40 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
41 | self.linear3 = nn.Linear(hidden_dim, 1)
42 |
43 | # Q2 architecture
44 | self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
45 | self.linear5 = nn.Linear(hidden_dim, hidden_dim)
46 | self.linear6 = nn.Linear(hidden_dim, 1)
47 |
48 | self.apply(weights_init_)
49 |
50 | def forward(self, state, action):
51 | xu = torch.cat([state, action], 1)
52 |
53 | x1 = F.relu(self.linear1(xu))
54 | x1 = F.relu(self.linear2(x1))
55 | x1 = self.linear3(x1)
56 |
57 | x2 = F.relu(self.linear4(xu))
58 | x2 = F.relu(self.linear5(x2))
59 | x2 = self.linear6(x2)
60 |
61 | return x1, x2
62 |
63 |
64 | class GaussianPolicy(nn.Module):
65 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
66 | super(GaussianPolicy, self).__init__()
67 |
68 | self.linear1 = nn.Linear(num_inputs, hidden_dim)
69 | # self.linear2 = nn.Linear(hidden_dim, hidden_dim)
70 | # self.linear3 = nn.Linear(hidden_dim, hidden_dim)
71 | # self.linear4 = nn.Linear(hidden_dim, hidden_dim)
72 |
73 | self.mean_linear = nn.Linear(hidden_dim, num_actions)
74 | self.log_std_linear = nn.Linear(hidden_dim, num_actions)
75 |
76 | self.apply(weights_init_)
77 |
78 | # action rescaling
79 | if action_space is None:
80 | self.action_scale = torch.tensor(1.)
81 | self.action_bias = torch.tensor(0.)
82 | else:
83 | self.action_scale = torch.FloatTensor(
84 | (action_space.high - action_space.low) / 2.)
85 | self.action_bias = torch.FloatTensor(
86 | (action_space.high + action_space.low) / 2.)
87 |
88 | def forward(self, state):
89 | x = F.relu(self.linear1(state))
90 | # x = F.relu(self.linear2(x))
91 | # x = F.relu(self.linear3(x))
92 | # x = F.relu(self.linear4(x))
93 | mean = self.mean_linear(x)
94 | log_std = self.log_std_linear(x)
95 | log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
96 | return mean, log_std
97 |
98 | def sample(self, state):
99 | mean, log_std = self.forward(state)
100 | std = log_std.exp()
101 | normal = Normal(mean, std)
102 | x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1))
103 | y_t = torch.tanh(x_t)
104 | action = y_t * self.action_scale + self.action_bias
105 | log_prob = normal.log_prob(x_t)
106 | # Enforcing Action Bound
107 | log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
108 | log_prob = log_prob.sum(1, keepdim=True)
109 | mean = torch.tanh(mean) * self.action_scale + self.action_bias
110 | return action, log_prob, mean
111 |
112 | def to(self, device):
113 | self.action_scale = self.action_scale.to(device)
114 | self.action_bias = self.action_bias.to(device)
115 | return super(GaussianPolicy, self).to(device)
116 |
117 |
118 | class DeterministicPolicy(nn.Module):
119 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
120 | super(DeterministicPolicy, self).__init__()
121 | self.linear1 = nn.Linear(num_inputs, hidden_dim)
122 | self.linear2 = nn.Linear(hidden_dim, hidden_dim)
123 |
124 | self.mean = nn.Linear(hidden_dim, num_actions)
125 | self.noise = torch.Tensor(num_actions)
126 |
127 | self.apply(weights_init_)
128 |
129 | # action rescaling
130 | if action_space is None:
131 |             self.action_scale = torch.tensor(1.)
132 |             self.action_bias = torch.tensor(0.)
133 | else:
134 | self.action_scale = torch.FloatTensor(
135 | (action_space.high - action_space.low) / 2.)
136 | self.action_bias = torch.FloatTensor(
137 | (action_space.high + action_space.low) / 2.)
138 |
139 | def forward(self, state):
140 | x = F.relu(self.linear1(state))
141 | x = F.relu(self.linear2(x))
142 | mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias
143 | return mean
144 |
145 | def sample(self, state):
146 | mean = self.forward(state)
147 | noise = self.noise.normal_(0., std=0.1)
148 | noise = noise.clamp(-0.25, 0.25)
149 | action = mean + noise
150 | return action, torch.tensor(0.), mean
151 |
152 | def to(self, device):
153 | self.action_scale = self.action_scale.to(device)
154 | self.action_bias = self.action_bias.to(device)
155 | self.noise = self.noise.to(device)
156 | return super(DeterministicPolicy, self).to(device)
157 |
--------------------------------------------------------------------------------
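
A small sketch of how `GaussianPolicy` squashes its Gaussian samples with `tanh` and rescales them into the `Box([0, 1])` action space that `mesa.py` constructs, so every sampled action is a valid under-sampling parameter mu. The state size and `hidden_dim` values below are arbitrary illustration choices.

```
import numpy as np
import torch
from gym import spaces
from sac_src.model import GaussianPolicy

action_space = spaces.Box(low=0.0, high=1.0, shape=[1], dtype=np.float32)
policy = GaussianPolicy(num_inputs=10, num_actions=1, hidden_dim=50,
                        action_space=action_space)

state = torch.rand(1, 10)                 # stand-in for a 2*num_bins meta-state
action, log_prob, mean = policy.sample(state)
print(action.item(), mean.item())         # both lie in [0, 1] after tanh + rescaling
```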
/sac_src/replay_memory.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 | class ReplayMemory:
5 | def __init__(self, capacity):
6 | self.capacity = capacity
7 | self.buffer = []
8 | self.position = 0
9 |
10 | def push(self, state, action, reward, next_state, done):
11 | if len(self.buffer) < self.capacity:
12 | self.buffer.append(None)
13 | self.buffer[self.position] = (state, action, reward, next_state, done)
14 | self.position = (self.position + 1) % self.capacity
15 |
16 | def sample(self, batch_size):
17 | batch = random.sample(self.buffer, batch_size)
18 | state, action, reward, next_state, done = map(np.stack, zip(*batch))
19 | return state, action, reward, next_state, done
20 |
21 | def __len__(self):
22 | return len(self.buffer)
23 |
--------------------------------------------------------------------------------
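
A tiny usage sketch of the ring-buffer replay memory above, pushed with toy transitions; the shapes mirror what `Mesa.meta_fit()` stores (state, action, reward, next_state, done-mask).

```
import numpy as np
from sac_src.replay_memory import ReplayMemory

memory = ReplayMemory(capacity=1000)
for _ in range(64):
    state, next_state = np.random.rand(8), np.random.rand(8)
    memory.push(state, np.array([0.3]), 0.01, next_state, 0.0)

state, action, reward, next_state, done = memory.sample(batch_size=32)
print(state.shape, action.shape, reward.shape)   # (32, 8) (32, 1) (32,)
```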
/sac_src/sac.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torch.nn.functional as F
4 | from torch.optim import Adam, lr_scheduler
5 | from sac_src.utils import soft_update, hard_update
6 | from sac_src.model import GaussianPolicy, QNetwork, DeterministicPolicy
7 |
8 |
9 | class SAC(object):
10 | def __init__(self, num_inputs, action_space, args):
11 |
12 | self.gamma = args.gamma
13 | self.tau = args.tau
14 | self.alpha = args.alpha
15 | self.action_space = action_space
16 | self.learning_rate = args.lr
17 |
18 | self.policy_type = args.policy
19 | self.target_update_interval = args.target_update_interval
20 | self.automatic_entropy_tuning = args.automatic_entropy_tuning
21 |
22 | self.device = torch.device("cuda" if args.cuda else "cpu")
23 |
24 | self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
25 | self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
26 |
27 | self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
28 | hard_update(self.critic_target, self.critic)
29 |
30 | if self.policy_type == "Gaussian":
31 | # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
32 | if self.automatic_entropy_tuning == True:
33 | self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
34 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
35 | self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
36 |
37 | self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
38 | self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
39 |
40 | else:
41 | self.alpha = 0
42 | self.automatic_entropy_tuning = False
43 | self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
44 | self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
45 |
46 | self.policy_scheduler = lr_scheduler.StepLR(self.critic_optim, step_size=args.lr_decay_steps, gamma=args.lr_decay_gamma)
47 |
48 | def learning_rate_decay(self, decay_ratio=0.5):
49 | self.learning_rate = self.learning_rate * decay_ratio
50 | self.critic_optim = Adam(self.critic.parameters(), lr=self.learning_rate)
51 | self.policy_optim = Adam(self.policy.parameters(), lr=self.learning_rate)
52 |
53 | def select_action(self, state, eval=False):
54 | state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
55 | if eval == False:
56 | action, _, _ = self.policy.sample(state)
57 | else:
58 | _, _, action = self.policy.sample(state)
59 | return action.detach().cpu().numpy()[0]
60 |
61 | def update_parameters(self, memory, batch_size, updates):
62 | # Sample a batch from memory
63 | state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
64 |
65 | state_batch = torch.FloatTensor(state_batch).to(self.device)
66 | next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
67 | action_batch = torch.FloatTensor(action_batch).to(self.device)
68 | reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
69 | mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
70 |
71 | with torch.no_grad():
72 | next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
73 | qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
74 | min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
75 | next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)
76 |
77 | qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step
78 | qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
79 | qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
80 |
81 | pi, log_pi, _ = self.policy.sample(state_batch)
82 |
83 | qf1_pi, qf2_pi = self.critic(state_batch, pi)
84 | min_qf_pi = torch.min(qf1_pi, qf2_pi)
85 |
86 | policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
87 |
88 | self.critic_optim.zero_grad()
89 | qf1_loss.backward()
90 | self.critic_optim.step()
91 |
92 | self.critic_optim.zero_grad()
93 | qf2_loss.backward()
94 | self.critic_optim.step()
95 |
96 | self.policy_optim.zero_grad()
97 | policy_loss.backward()
98 | self.policy_optim.step()
99 | self.policy_scheduler.step()
100 |
101 | if self.automatic_entropy_tuning:
102 | alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
103 |
104 | self.alpha_optim.zero_grad()
105 | alpha_loss.backward()
106 | self.alpha_optim.step()
107 |
108 | self.alpha = self.log_alpha.exp()
109 | alpha_tlogs = self.alpha.clone() # For TensorboardX logs
110 | else:
111 | alpha_loss = torch.tensor(0.).to(self.device)
112 | alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs
113 |
114 |
115 | if updates % self.target_update_interval == 0:
116 | soft_update(self.critic_target, self.critic, self.tau)
117 |
118 | return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()
119 |
120 | # Save model parameters
121 | def save_model(self, actor_path, critic_path):
122 | torch.save(self.policy.state_dict(), actor_path)
123 | torch.save(self.critic.state_dict(), critic_path)
124 |
125 | # Load model parameters
126 | def load_model(self, actor_path, critic_path):
127 | print('Loading models from {} and {}'.format(actor_path, critic_path))
128 | if actor_path is not None:
129 | self.policy.load_state_dict(torch.load(actor_path))
130 | if critic_path is not None:
131 | self.critic.load_state_dict(torch.load(critic_path))
132 |
133 |
--------------------------------------------------------------------------------
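A rough construction sketch for the SAC agent above (a hypothetical set-up, not the repository's entry point: the hyper-parameter values, the 20-dim input, and the gym-style Box action space are assumptions; the real defaults live in arguments.py and main.py):

    from types import SimpleNamespace
    import gym.spaces
    from sac_src.sac import SAC
    from sac_src.replay_memory import ReplayMemory

    args = SimpleNamespace(
        gamma=0.99, tau=0.01, alpha=0.1, lr=1e-3,        # assumed values
        policy='Gaussian', target_update_interval=1,
        automatic_entropy_tuning=False, cuda=False,
        hidden_size=64, lr_decay_steps=10, lr_decay_gamma=0.99,
    )
    action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,))   # a single action in [0, 1]
    agent = SAC(num_inputs=20, action_space=action_space, args=args)

    memory = ReplayMemory(capacity=1000)
    # ... fill `memory` with transitions via memory.push(...), then take one gradient step:
    # q1_loss, q2_loss, pi_loss, alpha_loss, alpha = agent.update_parameters(memory, batch_size=64, updates=1)
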
/sac_src/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 |
4 | def create_log_gaussian(mean, log_std, t):
5 | quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2))
6 | l = mean.shape
7 | log_z = log_std
8 | z = l[-1] * math.log(2 * math.pi)
9 | log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z
10 | return log_p
11 |
12 | def logsumexp(inputs, dim=None, keepdim=False):
13 | if dim is None:
14 | inputs = inputs.view(-1)
15 | dim = 0
16 | s, _ = torch.max(inputs, dim=dim, keepdim=True)
17 | outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log()
18 | if not keepdim:
19 | outputs = outputs.squeeze(dim)
20 | return outputs
21 |
22 | def soft_update(target, source, tau):
23 | for target_param, param in zip(target.parameters(), source.parameters()):
24 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
25 |
26 | def hard_update(target, source):
27 | for target_param, param in zip(target.parameters(), source.parameters()):
28 | target_param.data.copy_(param.data)
29 |
--------------------------------------------------------------------------------
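soft_update above implements Polyak averaging, θ_target ← (1 − τ)·θ_target + τ·θ_source, while hard_update copies the source weights verbatim. A tiny self-contained check (the Linear modules are hypothetical stand-ins for the actor/critic networks):

    import torch
    import torch.nn as nn
    from sac_src.utils import soft_update, hard_update

    source = nn.Linear(4, 2)
    target = nn.Linear(4, 2)

    hard_update(target, source)                  # target now holds an exact copy of source
    assert torch.equal(target.weight, source.weight)

    with torch.no_grad():
        source.weight.add_(1.0)                  # pretend the source network was trained
    soft_update(target, source, tau=0.01)        # target moves 1% of the way toward source
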
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 8 02:27:20 2020
4 | @author: ZhiningLiu1998
5 | mailto: zhining.liu@outlook.com / v-zhinli@microsoft.com
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | from sklearn.metrics import (
11 | f1_score,
12 | average_precision_score,
13 | matthews_corrcoef,
14 | )
15 | from sklearn.model_selection import train_test_split
16 |
17 | class Rater():
18 |     """Rater for evaluating classifier performance on class-imbalanced data.
19 |
20 | Parameters
21 | ----------
22 | metric : {'aucprc', 'mcc', 'fscore'}, optional (default='aucprc')
23 | Specify the performance metric used for evaluation.
24 | If 'aucprc' then use Area Under Precision-Recall Curve.
25 | If 'mcc' then use Matthews Correlation Coefficient.
26 | If 'fscore' then use F1-score, also known as balanced F-score or F-measure.
27 | Passing other values raises an exception.
28 |
29 | threshold : float, optional (default=0.5)
30 | The threshold used for binarizing the predicted probability.
31 |         It does not affect the AUCPRC score.
32 |
33 | Attributes
34 | ----------
35 | metric_ : string
36 | The performance metric used for evaluation.
37 |
38 | threshold_ : float
39 | The predict threshold.
40 | """
41 | def __init__(self, metric='aucprc', threshold=0.5):
42 |
43 |         if metric not in ['aucprc', 'mcc', 'fscore']:
44 |             raise ValueError(f'Metric {metric} is not supported.'
45 |                              '\nSupported metrics: [aucprc, mcc, fscore].')
46 |
47 | self.metric_ = metric
48 | self.threshold_ = threshold
49 |
50 | def score(self, y_true, y_pred):
51 | """Score function.
52 |
53 | Parameters
54 | ----------
55 | y_true : array-like of shape = [n_samples]
56 | The ground truth labels.
57 |
58 | y_pred : array-like of shape = [n_samples]
59 |             The predicted probabilities.
60 |
61 | Returns
62 | ----------
63 | score: float
64 | """
65 | if self.metric_ == 'aucprc':
66 |             return average_precision_score(y_true, y_pred)
67 | elif self.metric_ == 'mcc':
68 | y_pred_b = y_pred.copy()
69 | y_pred_b[y_pred_b < self.threshold_] = 0
70 | y_pred_b[y_pred_b >= self.threshold_] = 1
71 | return matthews_corrcoef(y_true, y_pred_b)
72 | elif self.metric_ == 'fscore':
73 | y_pred_b = y_pred.copy()
74 | y_pred_b[y_pred_b < self.threshold_] = 0
75 | y_pred_b[y_pred_b >= self.threshold_] = 1
76 | return f1_score(y_true, y_pred_b)
77 |
78 | def load_dataset(dataset_name):
79 |     """Util function that loads training/validation/test data from the /data folder.
80 |
81 | Parameters
82 | ----------
83 | dataset_name : string
84 | Name of the target dataset.
85 |         Train/validation/test data are expected to be saved in .csv files with
86 |         the suffix _{train/valid/test}.csv. Labels should be in the last column,
87 |         named 'label'.
88 |
89 | Returns
90 | ----------
91 | X_train, y_train, X_valid, y_valid, X_test, y_test
92 |         numpy arrays (the .values extracted from the loaded DataFrames / Series)
93 | """
94 | df_train = pd.read_csv(f'data/{dataset_name}_train.csv')
95 | X_train = df_train[df_train.columns.tolist()[:-1]]
96 | y_train = df_train['label']
97 | df_valid = pd.read_csv(f'data/{dataset_name}_valid.csv')
98 | X_valid = df_valid[df_valid.columns.tolist()[:-1]]
99 | y_valid = df_valid['label']
100 | df_test = pd.read_csv(f'data/{dataset_name}_test.csv')
101 | X_test = df_test[df_test.columns.tolist()[:-1]]
102 | y_test = df_test['label']
103 | return X_train.values, y_train.values, \
104 | X_valid.values, y_valid.values, \
105 | X_test.values, y_test.values
106 |
107 | def histogram_error_distribution(y_true, y_pred, bins):
108 |     """Util function that computes the error histogram.
109 |
110 | Parameters
111 | ----------
112 | y_true : array-like of shape = [n_samples]
113 | The ground truth labels.
114 |
115 | y_pred : array-like of shape = [n_samples]
116 |         The predicted probabilities.
117 |
118 | bins : int, number of bins in the histogram
119 |
120 | Returns
121 | ----------
122 | hist : array-like of shape = [bins]
123 | """
124 | error = np.absolute(y_true - y_pred)
125 | hist, _ = np.histogram(error, bins=bins)
126 | return hist
127 |
128 | def gaussian_prob(x, mu, sigma):
129 |     """The Gaussian probability density function.
130 |
131 | Parameters
132 | ----------
133 | x : float
134 | Input number.
135 |
136 | mu : float
137 | Parameter mu of the Gaussian function.
138 |
139 | sigma : float
140 | Parameter sigma of the Gaussian function.
141 |
142 | Returns
143 | ----------
144 | output : float
145 | """
146 | return (1 / (sigma * np.sqrt(2*np.pi))) * np.exp(-0.5*np.power((x-mu)/sigma, 2))
147 |
148 | def meta_sampling(y_pred, y_true, X, n_under_samples, mu, sigma, random_state=None):
149 | """The meta-sampling process in MESA.
150 |
151 | Parameters
152 | ----------
153 | y_pred : array-like of shape = [n_samples]
154 |         The predicted probabilities.
155 |
156 | y_true : array-like of shape = [n_samples]
157 | The ground truth labels.
158 |
159 | X : array-like of shape = [n_samples, n_features]
160 | The original data to be meta-sampled.
161 |
162 | n_under_samples : int, <= n_samples
163 | The expected number of instances in the subset after meta-sampling.
164 |
165 | mu : float
166 | Parameter mu of the Gaussian function.
167 |
168 | sigma : float
169 | Parameter sigma of the Gaussian function.
170 |
171 | random_state : int or None, optional (default=None)
172 | If int, random_state is the seed used by the random number generator.
173 | If None, the random number generator is the RandomState instance used
174 | by np.random.
175 |
176 | Returns
177 | ----------
178 | X_subset : array-like of shape = [n_under_samples, n_features]
179 | The subset after meta-sampling.
180 | """
181 | sample_weights = gaussian_prob(np.absolute(y_true - y_pred), mu, sigma)
182 | X_subset = pd.DataFrame(X).sample(n_under_samples, weights=sample_weights, random_state=random_state)
183 | return X_subset
184 |
185 | def imbalance_train_test_split(X, y, test_size, random_state=None):
186 |     '''Train/test split that guarantees the same class distribution across the resulting splits.'''
187 | classes = np.unique(y)
188 | X_trains, y_trains, X_tests, y_tests = [], [], [], []
189 | for label in classes:
190 | inds = (y==label)
191 | X_label, y_label = X[inds], y[inds]
192 | X_train, X_test, y_train, y_test = train_test_split(
193 | X_label, y_label, test_size=test_size, random_state=random_state)
194 | X_trains.append(X_train)
195 | X_tests.append(X_test)
196 | y_trains.append(y_train)
197 | y_tests.append(y_test)
198 | X_train = np.concatenate(X_trains)
199 | X_test = np.concatenate(X_tests)
200 | y_train = np.concatenate(y_trains)
201 | y_test = np.concatenate(y_tests)
202 | return X_train, X_test, y_train, y_test
203 |
204 | def state_scale(state, scale):
205 |     '''Normalize a meta-state so that its entries sum to 2 * scale.'''
206 | return state / state.sum() * 2 * scale
207 |
208 | def memory_init_fulfill(args, memory):
209 |     '''Pre-fill the replay memory with synthetic transitions representing under-fitting, learning, and over-fitting states.'''
210 | num_bins = args.num_bins
211 | memory_size = args.replay_size
212 | error_in_bins = np.linspace(0, 1, num_bins)
213 | mu = 0.3
214 | unfitted, midfitted, fitted = \
215 | gaussian_prob(error_in_bins, 1, mu), \
216 | gaussian_prob(error_in_bins, 0.5, mu), \
217 | gaussian_prob(error_in_bins, 0, mu)
218 | underfitting_state = state_scale(np.concatenate([unfitted, unfitted]), num_bins)
219 | learning_state = state_scale(np.concatenate([midfitted, midfitted]), num_bins)
220 | overfitting_state = state_scale(np.concatenate([fitted, midfitted]), num_bins)
221 | noise_scale = 0.5
222 | num_per_transitions = int(memory_size/3)
223 | for i in range(num_per_transitions):
224 | state = underfitting_state + np.random.rand(num_bins*2) * noise_scale
225 | next_state = underfitting_state + np.random.rand(num_bins*2) * noise_scale
226 | memory.push(state, [0.9], args.reward_coefficient * 0.05, next_state, 0)
227 | for i in range(num_per_transitions):
228 | state = learning_state + np.random.rand(num_bins*2) * noise_scale
229 | next_state = learning_state + np.random.rand(num_bins*2) * noise_scale
230 | memory.push(state, [0.5], args.reward_coefficient * 0.05, next_state, 0)
231 | for i in range(num_per_transitions):
232 | state = overfitting_state + np.random.rand(num_bins*2) * noise_scale
233 | next_state = overfitting_state + np.random.rand(num_bins*2) * noise_scale
234 | memory.push(state, [0.1], args.reward_coefficient * 0.05, next_state, 0)
235 | return memory
236 |
237 | def transform(y):  # reshape 1-d labels/probabilities into a two-column [P(y=0), P(y=1)] array
238 | if y.ndim == 1:
239 | y = y[:, np.newaxis]
240 | if y.shape[1] == 1:
241 | y = np.append(1-y, y, axis=1)
242 | return y
243 |
244 | def cross_entropy(y_pred, y_true, epsilon=1e-4):
245 | '''Cross-entropy error function.'''
246 | y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
247 | y_pred = transform(y_pred)
248 | y_true = transform(y_true)
249 | return (-y_true*np.log(y_pred)).sum(axis=1)
250 |
251 | def slide_mean(data, window_half):
252 |     '''Sliding (moving) mean, used for smoother visualization.'''
253 | result = []
254 | for i in range(len(data)):
255 | lower_bound = max(i-window_half, 0)
256 |         upper_bound = min(i+window_half+1, len(data))  # slice end is exclusive, so allow len(data)
257 | result.append(np.mean(data[lower_bound:upper_bound]))
258 | return result
--------------------------------------------------------------------------------
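A short sketch of how the helpers in /utils.py fit together (the 'Mammo' dataset name comes from the data/ folder; the random predictions and the sampling parameters below are placeholders):

    import numpy as np
    from utils import Rater, load_dataset, histogram_error_distribution, meta_sampling

    X_train, y_train, X_valid, y_valid, X_test, y_test = load_dataset('Mammo')

    rater = Rater(metric='aucprc')
    y_pred_valid = np.random.rand(len(y_valid))            # placeholder probabilities
    print('AUCPRC:', rater.score(y_valid, y_pred_valid))

    # error histogram of the kind used to build the meta-state
    hist = histogram_error_distribution(y_valid, y_pred_valid, bins=10)

    # Gaussian-weighted undersampling around error level mu
    y_pred_train = np.random.rand(len(y_train))
    subset = meta_sampling(y_pred_train, y_train, X_train, n_under_samples=100,
                           mu=0.3, sigma=0.2, random_state=42)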