├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── aliccp
│   │   └── dataset description.md
│   ├── avazu
│   │   └── dataset description.md
│   ├── criteo
│   │   └── dataset description.md
│   ├── movielens-1m
│   │   └── dataset desciption.md
│   └── preprocess.py
├── fs_run.py
├── models
│   ├── basemodel.py
│   ├── config.yaml
│   ├── fs
│   │   ├── adafs.py
│   │   ├── autofield.py
│   │   ├── gbdt.py
│   │   ├── lasso.py
│   │   ├── lpfs.py
│   │   ├── mvfs.py
│   │   ├── no_selection.py
│   │   ├── optfs.py
│   │   ├── optfs_old.py
│   │   ├── permutation.py
│   │   ├── rf.py
│   │   ├── sfs.py
│   │   ├── shark.py
│   │   └── xgb.py
│   ├── layers.py
│   └── rec
│       ├── dcn.py
│       ├── deepfm.py
│       ├── fibinet.py
│       ├── fm.py
│       ├── mlp.py
│       └── widedeep.py
├── nni
│   └── search_spaces
│       ├── config.json
│       └── fs
│           ├── adafs.json
│           ├── autofield.json
│           ├── gbdt.json
│           ├── lasso.json
│           └── optfs.json
├── nni_tune.py
├── requirements.txt
└── utils
    ├── datasets.py
    ├── fs_trainer.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # ERASE: Benchmarking Feature Selection Methods for Deep Recommender Systems
3 |
4 |
5 |
6 |
7 | In this repo, our scripts are divided into two parts: `dataset preprocess` and `run fs`.
8 |
9 | You can also download the preprocessed dataset from Huggingface [ERASE_Dataset](https://huggingface.co/datasets/Jia-py/ERASE_Dataset)
10 |
11 | Please note that you need to run the following script from the root directory of the project.
12 |
13 | # Package Requirements
14 |
15 | * torch
16 | * pandas
17 | * numpy
18 | * nni
19 |
20 | ## File Structure
21 |
22 | ```
23 | - checkpoints
24 | - checkpoints_for_retrain
25 | - data
26 |   - avazu
27 |     - preprocessed_avazu.csv # your data should be put here
28 |   - criteo
29 |     - preprocessed_criteo.csv # your data should be put here
30 |   - movielens-1m
31 |   - aliccp
32 |   - preprocess.py # preprocess script
33 | - nni
34 |   - search_spaces
35 |     - fs
36 |       - specific-method.json # the hyperparameter search space for each method in fs
37 |     - config.json # some hyperparameters related to general training, e.g., number of selected fields, learning rate
38 | - notebooks # some test notebooks
39 | - utils
40 |   - datasets.py # read datasets
41 |   - fs_trainer.py # trainer for feature selection
42 |   - utils.py # some utility functions
43 | - fs_run.py # main script to run feature selection
44 | - nni_tune.py # run the nni tune
45 | - requirements.txt # python libraries needed for this repository
46 | ```
47 |
48 | ## Dataset Preprocess
49 |
50 | ```bash
51 | python data/preprocess.py --dataset=[avazu/criteo] --data_path=[default is data/]
52 | ```
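
The raw files that `data/preprocess.py` reads are not shipped with the repo; below is a minimal sketch of the expected layout and one concrete invocation (file names are inferred from the script itself, adjust them if yours differ):

```bash
# raw files read by data/preprocess.py (place them here yourself):
#   data/avazu/train.csv, data/avazu/valid.csv, data/avazu/test.csv
#   data/criteo/train.txt
python data/preprocess.py --dataset=criteo --data_path=data/
# writes data/criteo/preprocessed_criteo.csv (preprocessed_avazu.csv for avazu)
```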
53 |
54 | ## Run FS & ES
55 |
56 | ### Parameters in fs_run.py
57 |
58 | * dataset: (avazu/criteo)
59 | * model: backbone model (mlp)
60 | * fs: feature selection method (no_selection/autofield/adafs/optfs/gbdt/lasso/gbr/pca/shark/sfs/lpfs/mvfs)
61 | * seed: random seed (a specific number, or 0 for no fixed seed)
62 | * device: cuda or cpu
63 | * data_path: your data path (default is `data/`)
64 | * batch_size
65 | * dataset_shuffle: (True or False)
66 | * embedding_dim: embedding size (default is 8)
67 | * train_or_search: whether to run the training/search stage (True/False)
68 | * retrain: whether to run the retrain stage (True/False)
69 | * k: number of selected fields (specific number)
70 | * learning_rate
71 | * epoch: training epoch (default 100)
72 | * patience: patience of earlystopper (default 3)
73 | * num_workers: num_workers in dataloader (default 32)
74 | * nni: whether to use nni to tune hyperparameters (default False)
75 | * rank_path: if you only want to retrain, specify the path of the feature rank file (see the retrain-only example below)
76 | * read_feature_rank: whether to use pre-saved feature rank
77 |
78 | ### Feature Selection
79 |
80 | ```bash
81 | python fs_run.py --model=[model_name] --fs=[feature_selection_method] --train_or_search=True --retrain=True
82 | ```
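
If a `feature_rank.npy` has already been saved and you only need the retrain stage, a command along the following lines should work (the `--rank_path` value is only a placeholder for your own folder under `checkpoints_for_retrain/`, and `--k` is however many fields you want to keep):

```bash
python fs_run.py --model=mlp --fs=autofield --train_or_search=False --retrain=True \
    --read_feature_rank=True --rank_path=autofield_no_selection_avazu --k=10
```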
83 |
84 |
85 |
86 | # More experimental results
87 |
88 | 1. Overall experimental results of feature selection for deep recommender systems.
89 |
90 |
91 |
92 | 2. Experimental results on more backbone models with different number of selected features on Avazu.
93 |
94 | 
95 |
96 | 3. Experimental results on more backbone models with different number of selected features on Criteo.
97 |
98 | 
99 |
100 |
101 | # Citation
102 |
103 | If you find our work useful, please consider citing our paper below. Thank you!
104 | ```
105 | @inproceedings{jia2024erase,
106 | title={ERASE: Benchmarking Feature Selection Methods for Deep Recommender Systems},
107 | author={Jia, Pengyue and Wang, Yejing and Du, Zhaocheng and Zhao, Xiangyu and Wang, Yichao and Chen, Bo and Wang, Wanyu and Guo, Huifeng and Tang, Ruiming},
108 | booktitle={Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
109 | pages={5194--5205},
110 | year={2024}
111 | }
112 | ```
113 |
--------------------------------------------------------------------------------
/data/aliccp/dataset description.md:
--------------------------------------------------------------------------------
1 | The dataset is preprocessed by Datawhale; refer to [torch-rechub](https://github.com/datawhalechina/torch-rechub/tree/main/examples/ranking)
--------------------------------------------------------------------------------
/data/avazu/dataset description.md:
--------------------------------------------------------------------------------
1 | ## Dataset Description
2 |
3 | [Click-Through Rate Prediction | Kaggle](https://www.kaggle.com/competitions/avazu-ctr-prediction/data)
4 |
5 | ## File descriptions
6 |
7 | - **train** - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies.
8 | - **test** - Test set. 1 day of ads for testing your model predictions.
9 | - **sampleSubmission.csv** - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.
10 |
11 | ## Data fields
12 |
13 | - id: ad identifier
14 | - click: 0/1 for non-click/click
15 | - hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
16 | - C1 -- anonymized categorical variable
17 | - banner_pos
18 | - site_id
19 | - site_domain
20 | - site_category
21 | - app_id
22 | - app_domain
23 | - app_category
24 | - device_id
25 | - device_ip
26 | - device_model
27 | - device_type
28 | - device_conn_type
29 | - C14-C21 -- anonymized categorical variables
30 |
31 | ## Notes
32 |
33 | Since the test file does not contain labels, we only use the training file.
34 |
35 | ## read dtypes
36 |
37 | ```bash
38 | Memory usage of dataframe is 7402.76 MB
39 | Memory usage after optimization is: 1773.58 MB
40 | Decreased by 76.0%
41 | dtypes: click int8
42 | hour int16
43 | C1 int8
44 | banner_pos int8
45 | site_id int16
46 | site_domain int16
47 | site_category int8
48 | app_id int16
49 | app_domain int16
50 | app_category int8
51 | device_id int32
52 | device_ip int32
53 | device_model int16
54 | device_type int8
55 | device_conn_type int8
56 | C14 int16
57 | C15 int8
58 | C16 int8
59 | C17 int16
60 | C18 int8
61 | C19 int8
62 | C20 int16
63 | C21 int8
64 | dtype: object
65 | preprocess avazu done!
66 | ```
--------------------------------------------------------------------------------
/data/criteo/dataset description.md:
--------------------------------------------------------------------------------
1 | ------ Display Advertising Challenge ------
2 |
3 | Dataset: dac-v1
4 |
5 | This dataset contains feature values and click feedback for millions of display
6 | ads. Its purpose is to benchmark algorithms for clickthrough rate (CTR) prediction.
7 | It has been used for the Display Advertising Challenge hosted by Kaggle:
8 | https://www.kaggle.com/c/criteo-display-ad-challenge/
9 |
10 | ===================================================
11 |
12 | Full description:
13 |
14 | This dataset contains 2 files:
15 | train.txt
16 | test.txt
17 | corresponding to the training and test parts of the data.
18 |
19 | ====================================================
20 |
21 | Dataset construction:
22 |
23 | The training dataset consists of a portion of Criteo's traffic over a period
24 | of 7 days. Each row corresponds to a display ad served by Criteo and the first
25 | column indicates whether this ad has been clicked or not.
26 | The positive (clicked) and negative (non-clicked) examples have both been
27 | subsampled (but at different rates) in order to reduce the dataset size.
28 |
29 | There are 13 features taking integer values (mostly count features) and 26
30 | categorical features. The values of the categorical features have been hashed
31 | onto 32 bits for anonymization purposes.
32 | The semantics of these features are undisclosed. Some features may have missing values.
33 |
34 | The rows are chronologically ordered.
35 |
36 | The test set is computed in the same way as the training set but it
37 | corresponds to events on the day following the training period.
38 | The first column (label) has been removed.
39 |
40 | ====================================================
41 |
42 | Format:
43 |
44 | The columns are tab separated with the following schema:
45 | <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
46 |
47 | When a value is missing, the field is just empty.
48 | There is no label field in the test set.
49 |
50 | ====================================================
51 |
52 | Dataset assembled by Olivier Chapelle (o.chapelle@criteo.com)
53 |
54 | ```
55 | Memory usage of dataframe is 13989.45 MB
56 | Memory usage after optimization is: 3322.49 MB
57 | Decreased by 76.2%
58 | dtypes: 0 int8
59 | 1 int8
60 | 2 int8
61 | 3 int8
62 | 4 int8
63 | 5 int16
64 | 6 int16
65 | 7 int8
66 | 8 int8
67 | 9 int8
68 | 10 int8
69 | 11 int8
70 | 12 int8
71 | 13 int8
72 | 14 int16
73 | 15 int16
74 | 16 int32
75 | 17 int32
76 | 18 int16
77 | 19 int8
78 | 20 int16
79 | 21 int16
80 | 22 int8
81 | 23 int32
82 | 24 int16
83 | 25 int32
84 | 26 int16
85 | 27 int8
86 | 28 int16
87 | 29 int32
88 | 30 int8
89 | 31 int16
90 | 32 int16
91 | 33 int8
92 | 34 int32
93 | 35 int8
94 | 36 int8
95 | 37 int32
96 | 38 int8
97 | 39 int32
98 | dtype: object
99 | 2023-09-06 16:00:58
100 | save to file...
101 | preprocess criteo done!
102 | ```
--------------------------------------------------------------------------------
/data/movielens-1m/dataset desciption.md:
--------------------------------------------------------------------------------
1 | The dataset is preprocessed by Datawhale; refer to [torch-rechub](https://github.com/datawhalechina/torch-rechub/tree/main/examples/matching)
--------------------------------------------------------------------------------
/data/preprocess.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from tqdm import tqdm
4 | from sklearn.preprocessing import LabelEncoder
5 | import sys
6 | sys.path.append('utils/')
7 | from utils import print_time
8 |
9 | def reduce_mem_usage(df):
10 | start_mem = df.memory_usage().sum() / 1024**2
11 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
12 |
13 | for col in df.columns:
14 | col_type = df[col].dtype
15 | if col_type != object:
16 | c_min = df[col].min()
17 | c_max = df[col].max()
18 | if str(col_type)[:3] == 'int':
19 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
20 | df[col] = df[col].astype(np.int8)
21 | elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
22 | df[col] = df[col].astype(np.uint8)
23 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
24 | df[col] = df[col].astype(np.int16)
25 | elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
26 | df[col] = df[col].astype(np.uint16)
27 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
28 | df[col] = df[col].astype(np.int32)
29 | elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
30 | df[col] = df[col].astype(np.uint32)
31 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
32 | df[col] = df[col].astype(np.int64)
33 | elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
34 | df[col] = df[col].astype(np.uint64)
35 | elif str(col_type)[:5] == 'float':
36 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
37 | df[col] = df[col].astype(np.float16)
38 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
39 | df[col] = df[col].astype(np.float32)
40 | else:
41 | df[col] = df[col].astype(np.float64)
42 |
43 | end_mem = df.memory_usage().sum() / 1024**2
44 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
45 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
46 | print('dtypes: ', df.dtypes)
47 | return df
48 |
49 | def preprocess_avazu(data_path, feature_value_filter=False, threshold=4):
50 | print('start reading file...')
51 | df_train = pd.read_csv(data_path + 'train.csv')
52 | df_val = pd.read_csv(data_path + 'valid.csv')
53 | df_test = pd.read_csv(data_path + 'test.csv')
54 | df = pd.concat([df_train, df_val, df_test])
55 | del df_train, df_val, df_test
56 | print('finish reading file...')
57 | df.drop(columns=['id'], inplace=True)
58 | # transform hour to hour
59 | # df['hour:token'] = pd.to_datetime(df['timestamp:float'], format='%y%m%d%H')
60 | # df['hour:token'] = df['hour:token'].dt.hour
61 | # df.drop(['timestamp:float'], axis=1, inplace=True)
62 | sparse_features = [f for f in df.columns]
63 | # df = df.fillna('-1')
64 |
65 | if feature_value_filter:
66 | print('start replace values')
67 | tqdm.pandas(desc='pandas bar')
68 | def replace_values(series):
69 | counts = series.value_counts()
70 | return series.apply(lambda x: -99 if counts[x] < threshold else x)
71 |         df = df.progress_apply(replace_values)  # use the tqdm progress_apply registered above (parallel_apply would require pandarallel)
72 | print('finish replace values')
73 | df = df.astype(str)
74 |
75 | tk0 = tqdm(sparse_features, desc='LabelEncoder')
76 | for feat in tk0:
77 | lbe = LabelEncoder()
78 | df[feat] = lbe.fit_transform(df[feat])
79 | df = df.infer_objects()
80 | df = reduce_mem_usage(df)
81 | df.to_csv(data_path + 'preprocessed_avazu.csv', index=False)
82 |
83 | def preprocess_criteo(data_path, feature_value_filter=False, threshold=4):
84 | print_time('start reading file...')
85 | # df = pd.read_csv(data_path + 'criteo.inter', sep='\t')
86 | df = pd.read_csv(data_path + 'train.txt', sep='\t', header=None)
87 | print(df)
88 | print_time('finish reading file...')
89 | '''
90 | Index([ (label)0,
91 | (float)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
92 | (object)14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
93 | 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
94 | dtype='int64')
95 | '''
96 | df.columns= [str(x) for x in list(range(40))]
97 | dense_features = [f for f in df.columns.tolist() if (df[f].dtype in ['int64', 'float64'] and f != '0')]
98 | sparse_features = [f for f in df.columns.tolist() if df[f].dtype in ['object']]
99 |
100 | print_time('fill nan...')
101 | df[sparse_features] = df[sparse_features].fillna('-999')
102 | df[dense_features] = df[dense_features].fillna(-999)
103 |
104 | print_time('convert float features...')
105 | import math
106 | for feat in dense_features:
107 | df[feat] = df[feat].apply(lambda x:str(int(math.log(x) ** 2)) if x > 2 else str(int(x)-2))
108 | all_features = [f for f in df.columns]
109 |
110 | # df = df.astype(str)
111 | print_time('label encoding...')
112 | tk0 = tqdm(all_features, desc='LabelEncoder')
113 | for feat in tk0:
114 | lbe = LabelEncoder()
115 | df[feat] = lbe.fit_transform(df[feat])
116 | df = df.infer_objects()
117 |     # set display.max_rows so the full dtype listing is printed
118 | pd.set_option('display.max_rows', None)
119 | df = reduce_mem_usage(df)
120 |
121 | print_time('save to file...')
122 | df.to_csv(data_path + 'preprocessed_criteo.csv', index=False)
123 |
124 |
125 |
126 | if __name__ == '__main__':
127 | import argparse
128 | parser = argparse.ArgumentParser()
129 | parser.add_argument('--dataset', type=str, default='avazu', help='avazu, criteo')
130 | parser.add_argument('--data_path', type=str, default='data/', help='data path')
131 |
132 | args = parser.parse_args()
133 |
134 | if args.dataset == 'avazu':
135 | preprocess_avazu(args.data_path + args.dataset + '/')
136 | print('preprocess avazu done!')
137 | elif args.dataset == 'criteo':
138 | preprocess_criteo(args.data_path + args.dataset + '/')
139 | print('preprocess criteo done!')
--------------------------------------------------------------------------------
/fs_run.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import numpy as np
4 | import torch
5 | import os
6 | import argparse
7 | import yaml
8 | import nni
9 | import time
10 | import datetime as dt
11 | from tqdm import tqdm
12 | import utils.utils as utils
13 | from utils.fs_trainer import modeltrainer
14 | from utils.datasets import read_dataset
15 | from models.basemodel import BaseModel
16 |
17 |
18 | def main(args):
19 |     if args.seed != 0:
20 |         utils.seed_everything(args.seed)
21 | 
22 |     if args.train_or_search:
23 |         utils.print_time('start train or search...')
24 | 
25 |         if args.fs in ['gbdt', 'lasso', 'permutation','rf','xgb']: # machine learning feature selection
26 |             features, unique_values, data = read_dataset(args.dataset, args.data_path, args.batch_size, args.dataset_shuffle, num_workers=args.num_workers, machine_learning_method = True)
27 |             ml_start_time = dt.datetime.now()
28 |             feature_rank = utils.machine_learning_selection(args, args.fs, features, unique_values, data, args.k)
29 |             ml_end_time = dt.datetime.now()
30 |             print('machine learning feature selection time: {} s'.format((ml_end_time - ml_start_time).total_seconds()))
31 |             model_path = 'checkpoints/' + args.model + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
32 |             utils.print_time(model_path)
33 |             utils.print_time('feature rank:')
34 |             utils.print_time(feature_rank)
35 |             if not os.path.exists(model_path):
36 |                 os.makedirs(model_path)
37 |             np.save(model_path + 'feature_rank.npy', feature_rank)
38 | 
39 |         else:
40 |             features, label, train_dataloader, val_dataloader, test_dataloader, unique_values = read_dataset(args.dataset, args.data_path, batch_size=args.batch_size, shuffle=args.dataset_shuffle, num_workers=args.num_workers, machine_learning_method=False)
41 | 
42 |             print(features)
43 |             print(unique_values)
44 | 
45 |             model = BaseModel(args, args.model, args.fs, args.es, unique_values, features)
46 |             model.fs.mode = 'train'
47 |             trainer = modeltrainer(args, model, args.model, args.device, epochs=args.epoch, retrain=False)
48 |             trainer.fit(train_dataloader, val_dataloader)
49 |             auc = trainer.test(test_dataloader, ['auc', 'logloss'])
50 |             if args.retrain is False and args.nni:
51 |                 nni.report_final_result(auc)
52 |             # print selected features
53 |             # if the feature selection module provides save_selection, save the selected feature ranking
54 |             if hasattr(model.fs, 'save_selection'):
55 |                 model_path = 'checkpoints/' + args.model + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
56 |                 res = model.fs.save_selection(k=args.k)
57 |                 if isinstance(res, np.ndarray):
58 |                     feature_rank = res
59 |                 else:
60 |                     feature_rank = res(val_dataloader,model,torch.device(args.device))
61 |                 utils.print_time('feature rank:')
62 |                 utils.print_time(feature_rank)
63 |                 np.save(model_path + 'feature_rank.npy', feature_rank)
64 |     # if retrain, retrain
65 |     if args.retrain:
66 |         utils.print_time('start retrain...')
67 |         # if train_or_search is skipped, you must place the feature_rank.npy in the following path manually
68 |         if not args.train_or_search:
69 |             model_path = 'checkpoints_for_retrain/' + args.rank_path + '/'
70 |             if not os.path.exists(model_path):
71 |                 raise FileNotFoundError('Only retraining was chosen; please make sure the files generated during searching are placed under checkpoints_for_retrain/<fs>_<es>_<dataset>/')
72 |         else:
73 |             model_path = 'checkpoints/' + args.model + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
74 |         # read selection results
75 |         if args.read_feature_rank: # need to read selected features
76 |             feature_rank = np.load(model_path + 'feature_rank.npy')
77 |             selected_features = feature_rank[0][:args.k]
78 |             utils.print_time('feature rank: {}'.format(feature_rank))
79 |             utils.print_time('selected features: {}'.format(selected_features))
80 |             features, label, train_dataloader, val_dataloader, test_dataloader, unique_values = read_dataset(args.dataset, args.data_path, batch_size=args.batch_size, shuffle=args.dataset_shuffle, num_workers=args.num_workers, use_fields=selected_features, machine_learning_method=False)
81 |         else:
82 |             features, label, train_dataloader, val_dataloader, test_dataloader, unique_values = read_dataset(args.dataset, args.data_path, batch_size=args.batch_size, shuffle=args.dataset_shuffle, num_workers=args.num_workers, use_fields=None, machine_learning_method=False)
83 | 
84 |         model = BaseModel(args, args.model, args.fs, args.es, unique_values, features)
85 |         if model.fs.load_checkpoint:
86 |             model.load_state_dict(torch.load(model_path + 'model_search.pth'))
87 |         # if args.fs == 'optfs':
88 |         #     tmp_model = torch.load(model_path + 'model_search.pth')
89 |         #     # param_dict = {k:v for k, v in tmp_model.items() if 'fs.gate' in k or 'embedding.weight' in k}
90 |         #     param_dict = {k:v for k, v in tmp_model.items() if 'fs.mask_weight' in k}
91 |         #     model_dict = model.state_dict()
92 |         #     model_dict.update(param_dict)
93 |         #     model.load_state_dict(model_dict)
94 |         if hasattr(model.fs, 'before_retrain'):
95 |             model.fs.before_retrain()
96 |         model.fs.mode = 'retrain'
97 | 
98 |         trainer = modeltrainer(args, model, args.model, args.device, epochs=args.epoch, retrain=True)
99 |         trainer.fit(train_dataloader, val_dataloader)
100 |         auc = trainer.test(test_dataloader, ['auc', 'logloss'])
101 |         if args.nni:
102 |             nni.report_final_result(auc)
103 |
104 |
105 | if __name__ == '__main__':
106 |
107 |     parser = argparse.ArgumentParser()
108 |     parser.add_argument('--dataset', type=str, default='avazu', help='avazu, criteo, movielens-1m, aliccp')
109 |     parser.add_argument('--model', type=str, default='mlp', help='mlp, ...')
110 |     parser.add_argument('--fs', type=str, default='no_selection', help='feature selection methods: no_selection, autofield, adafs, optfs, gbdt, lasso, gbr, pca, shark, sfs, lpfs, mvfs')
111 |     parser.add_argument('--es', type=str, default='no_selection', help='embedding search methods: no_selection, ...')
112 |     parser.add_argument('--seed', type=int, default=0, help='random seed, 0 represents not setting the random seed')
113 |     parser.add_argument('--device',type=str, default='cuda' if torch.cuda.is_available() else 'cpu', help='cpu, cuda')
114 |     parser.add_argument('--data_path', type=str, default='data/', help='data path') # ~/autodl-tmp/ or data/
115 |     parser.add_argument('--batch_size', type=int, default=4096, help='batch size')
116 |     parser.add_argument('--dataset_shuffle', type=bool, default=True, help='whether to shuffle the dataset')
117 |     parser.add_argument('--embedding_dim', type=int, default=8, help='embedding dimension')
118 |     parser.add_argument('--train_or_search', type=utils.str2bool, default=True, help='whether to train or search')
119 |     parser.add_argument('--retrain', type=utils.str2bool, default=True, help='whether to retrain')
120 |     parser.add_argument('--k', type=int, default=0, help='top k features')
121 |     parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate')
122 |     parser.add_argument('--epoch', type=int, default=1, help='epoch')
123 |     parser.add_argument('--patience', type=int, default=1, help='early stopping patience')
124 |     parser.add_argument('--num_workers', type=int, default=32, help='num_workers')
125 |     parser.add_argument('--nni', type=bool, default=False, help='whether to use nni')
126 |     parser.add_argument('--rank_path', type=str, default='None', help='if only retrain, no train, please specify the path of feature_rank file. e.g., autofield_no_selection_avazu')
127 |     parser.add_argument('--read_feature_rank', type=utils.str2bool, default=True, help='whether to use pre-saved feature rank')
128 | 
129 |     args = parser.parse_args()
130 | 
131 |     # k
132 |     if args.k == 0:
133 |         if args.dataset == 'avazu':
134 |             args.k = 22
135 |         elif args.dataset == 'criteo':
136 |             args.k = 39
137 | 
138 |     with open('models/config.yaml', 'r') as file:
139 |         data = yaml.safe_load(file)
140 |     args.__dict__.update(data)
141 | 
142 |     # read tune parameters from nni
143 |     if args.nni:
144 |         tuner_params = nni.get_next_parameter()
145 |         for key in tuner_params:
146 |             if key[:2] == 'fs':
147 |                 args.fs_config[args.fs][key[3:]] = tuner_params[key]
148 |             elif key[:2] == 'es':
149 |                 args.es_config[args.es][key[3:]] = tuner_params[key]
150 |             else:
151 |                 args.__dict__[key] = tuner_params[key]
152 | 
153 |     # print args
154 |     for key in args.__dict__:
155 |         if key not in ['fs_config', 'es_config', 'rec_config']:
156 |             print(key, ':', args.__dict__[key])
157 |         else:
158 |             print(key, ':')
159 |             for key2 in args.__dict__[key]:
160 |                 if key2 in [args.model, args.fs, args.es]:
161 |                     print('\t', key2, ':', args.__dict__[key][key2])
162 | 
163 |     args.timestr = str(time.time())
164 | 
165 |     main(args)
--------------------------------------------------------------------------------
/models/basemodel.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | import torch
3 | from torch import Tensor
4 | import torch.nn as nn
5 | import numpy as np
6 | import torch.nn.functional as F
7 | from torch.nn.modules.module import Module
8 | from utils.utils import get_model
9 |
10 | class BaseModel(nn.Module):
11 | def __init__(self, args, backbone_model_name, fs, es, unique_values, features):
12 | super(BaseModel, self).__init__()
13 | # embedding table
14 | self.embedding = nn.Embedding(sum(unique_values), embedding_dim = args.embedding_dim)
15 | torch.nn.init.normal_(self.embedding.weight.data, mean=0, std=0.01)
16 |         self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))  # per-field offsets so every field indexes one shared embedding table
17 |
18 | self.input_dims = args.embedding_dim * len(unique_values)
19 |
20 | self.bb = get_model(backbone_model_name, 'rec')(args, self.input_dims) # backbone model name
21 | self.fs = get_model(fs, 'fs')(args, unique_values, features) # feature selection method
22 | self.es = get_model(es, 'es')() # embedding search method
23 | self.args = args
24 |
25 | def forward(self, x, current_epoch, current_step):
26 | raw_x = x.clone().detach()
27 | x = self.embedding(x + x.new_tensor(self.offsets))
28 | x = self.es(x)
29 | x = self.fs(x, current_epoch, current_step, raw_data = raw_x)
30 | x = self.bb(x)
31 | return x
32 |
33 | def set_optimizer(self):
34 | optimizer_bb = torch.optim.Adam([params for name,params in self.named_parameters() if ('fs' not in name and 'es' not in name) or 'bb' in name], lr = self.args.learning_rate)
35 |
36 | if [params for name,params in self.named_parameters() if 'fs' in name] != []:
37 | optimizer_fs = torch.optim.Adam([params for name,params in self.named_parameters() if 'fs' in name and 'bb' not in name], lr = self.args.learning_rate)
38 | else:
39 | optimizer_fs = None
40 |
41 | if [params for name,params in self.named_parameters() if 'es' in name] != []:
42 | optimizer_es = torch.optim.Adam([params for name,params in self.named_parameters() if 'es' in name and 'bb' not in name], lr = self.args.learning_rate)
43 | else:
44 | optimizer_es = None
45 | return {'optimizer_bb': optimizer_bb, 'optimizer_fs': optimizer_fs, 'optimizer_es': optimizer_es}
46 |
--------------------------------------------------------------------------------
/models/config.yaml:
--------------------------------------------------------------------------------
1 | fs_config:
2 | adafs:
3 | pretrain_epoch: 0
4 | hidden_size: 16
5 | dropout: 0.2
6 | update_frequency: 4
7 | autofield:
8 | update_frequency: 10
9 | gbdt:
10 | learning_rate: 0.1
11 | n_estimators: 100
12 | subsample: 1.0
13 | min_samples_split: 2
14 | min_samples_leaf: 1
15 | min_weight_fraction_leaf: 0.0
16 | max_depth: 3
17 | n_iter_no_change: 3
18 | gbr:
19 | lasso:
20 | alpha: 0.001
21 | fit_intercept: true
22 | copy_X: True
23 | max_iter: 1000
24 | tol: 0.0001
25 | positive: false
26 | selection: cyclic
27 | lpfs:
28 | no_selection:
29 | optfs:
30 | gamma: 5000
31 | pretrain_epoch: 5
32 | pca:
33 | shark:
34 | sfs:
35 | num_batch_sampling: 100
36 | rf:
37 | xgb:
38 | mvfs:
39 | pretrain_epoch: 0
40 | sub_network_num: 6
41 | dropout: 0.2
42 | l: 0.2
43 | es_config:
44 | no_selection:
45 | rec_config:
46 | mlp:
--------------------------------------------------------------------------------
/models/fs/adafs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from models.layers import MLP
4 |
5 | class adafs(nn.Module):
6 |
7 | def __init__(self, args, unique_values, features):
8 | super(adafs, self).__init__()
9 |
10 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
11 | self.feature_num = len(unique_values)
12 |         self.batchnorm_bb = nn.BatchNorm1d(args.embedding_dim) # the _bb suffix puts this parameter into the backbone (bb) optimizer
13 | self.hidden_size = args.fs_config[args.fs]['hidden_size']
14 | self.dropout = args.fs_config[args.fs]['dropout']
15 | self.mlp = MLP(self.feature_num * args.embedding_dim, False, [self.hidden_size, self.feature_num], self.dropout)
16 | self.mlp_bb = MLP(self.feature_num * args.embedding_dim, False, [self.hidden_size, self.feature_num], self.dropout)
17 | self.mode = 'train'
18 | if self.mode == 'retrain':
19 | raise Exception('adafs should not be used in retrain mode')
20 | self.optimizer_method = 'normal'
21 | self.update_frequency = args.fs_config[args.fs]['update_frequency']
22 |
23 | self.load_checkpoint = False
24 |
25 | def forward(self, x, current_epoch, current_step, raw_data):
26 | b,f,e = x.shape
27 | if self.optimizer_method == 'darts':
28 | if current_epoch is not None and current_epoch < self.pretrain_epoch: # current_epoch not None (in training or validation) and current_epoch <= self.pretrain_epoch
29 | x = x.transpose(1,2)
30 | x = self.batchnorm_bb(x)
31 | return x.transpose(1,2)
32 | else:
33 | x = x.transpose(1,2)
34 | x = self.batchnorm_bb(x)
35 | weight = self.mlp(x.reshape(b, -1))
36 | weight = torch.softmax(weight, dim=-1)
37 | x = torch.mul(x, weight.unsqueeze(1))
38 | return x.transpose(1,2)
39 | elif self.optimizer_method == 'normal':
40 | x = x.transpose(1,2)
41 | x = self.batchnorm_bb(x)
42 | weight = self.mlp_bb(x.reshape(b, -1))
43 | weight = torch.softmax(weight, dim=-1)
44 | x = torch.mul(x, weight.unsqueeze(1))
45 | return x.transpose(1,2)
--------------------------------------------------------------------------------
/models/fs/autofield.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 | class autofield(nn.Module):
6 |
7 | def __init__(self, args, unique_values, features):
8 | super(autofield, self).__init__()
9 |
10 | self.feature_num = len(unique_values)
11 | self.device = args.device
12 | self.args = args
13 | self.features = np.array(features)
14 |
15 | self.gate = {features[field_idx]: torch.Tensor(np.ones([1,2])*0.5).to(self.device) for field_idx in range(self.feature_num)}
16 | self.gate = {features[field_idx]: nn.Parameter(self.gate[features[field_idx]], requires_grad=True) for field_idx in range(self.feature_num)}
17 | self.gate = nn.ParameterDict(self.gate)
18 | self.tau = 1.0
19 |
20 | self.mode = 'train'
21 | self.optimizer_method = 'darts'
22 | self.update_frequency = args.fs_config[args.fs]['update_frequency']
23 | self.load_checkpoint = False
24 |
25 | def forward(self, x, current_epoch, current_step, raw_data):
26 | b,f,e = x.shape
27 | if self.mode == 'retrain':
28 | return x
29 | elif self.mode == 'train':
30 | if self.tau > 0.01:
31 | self.tau -= 0.00005
32 | gate_ = torch.ones([1,f,1]).to(self.device)
33 | for field_idx in range(self.feature_num):
34 | gate_[:,field_idx,:] = torch.nn.functional.gumbel_softmax(self.gate[self.features[field_idx]], tau=self.tau, hard=False, dim=-1)[:,-1].reshape(1,1,1)
35 | x = x * gate_
36 | return x
37 |
38 | def save_selection(self, k):
39 | selected_idx = []
40 | gate = torch.concat([self.gate[self.features[field_idx]] for field_idx in range(self.feature_num)], dim=0)[:,-1]
41 | indices = torch.argsort(gate, descending=True)
42 | ranked_importance = gate[indices].detach().cpu().numpy()
43 | ranked_features = [self.features[i] for i in indices]
44 | return np.array([ranked_features, ranked_importance])
45 | # for i in indices:
46 | # selected_idx.append(i.item())
47 | # if len(selected_idx) == k:
48 | # break
49 | # return self.features[selected_idx]
50 |
51 |
52 |
--------------------------------------------------------------------------------
/models/fs/gbdt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class gbdt(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(gbdt, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/lasso.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class lasso(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(lasso, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/lpfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 | class lpfs(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(lpfs, self).__init__()
8 |
9 | self.feature_num = len(features)
10 | self.features = features
11 | self.x = nn.Parameter(torch.ones(self.feature_num, 1).to(args.device))
12 | self.epochs = args.epoch
13 | self.epsilon_update_frequency = 100
14 | self.device = args.device
15 | self.epsilon = 0.1
16 |
17 | self.load_checkpoint = False
18 | self.optimizer_method = 'normal'
19 |
20 | def forward(self, x, current_epoch, current_step, raw_data):
21 | b,f,e = x.shape
22 | if current_step % self.epsilon_update_frequency == 0:
23 | self.epsilon = self.epsilon * 0.9978
24 | g = self.lpfs_pp(self.x, self.epsilon).reshape(1, f, 1)
25 | x_ = torch.zeros_like(x)
26 | x_ = x * g
27 | return x_
28 |
29 | def lpfs_pp(self, x, epsilon, alpha=10, tau=2, init_val=1.0):
30 | g1 = x*x/(x*x+epsilon)
31 | g2 = alpha * epsilon ** (1.0/tau)*torch.atan(x)
32 | g = torch.where(x>0, g2+g1, g2-g1)/init_val
33 | return g
34 |
35 | def save_selection(self, k):
36 | # gate = torch.concat([self.gate[self.features[field_idx]] for field_idx in range(self.feature_num)], dim=0)[:,-1]
37 | gate = self.x.reshape(self.feature_num)
38 | indices = torch.argsort(gate, descending=True)
39 | ranked_importance = gate[indices].detach().cpu().numpy()
40 | ranked_features = [self.features[i] for i in indices]
41 | print(ranked_features)
42 | print(ranked_importance)
43 | return np.array([ranked_features, ranked_importance])
44 |
45 |
--------------------------------------------------------------------------------
/models/fs/mvfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from models.layers import MLP
4 |
5 | class mvfs(nn.Module):
6 |
7 | def __init__(self, args, unique_values, features):
8 | super(mvfs, self).__init__()
9 |
10 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
11 | self.feature_num = len(unique_values)
12 | self.sub_network_num = args.fs_config[args.fs]['sub_network_num']
13 | self.dropout = args.fs_config[args.fs]['dropout']
14 | self.l = args.fs_config[args.fs]['l']
15 | self.sub_network_list_bb = torch.nn.ModuleList()
16 | for i in range(self.sub_network_num):
17 | self.sub_network_list_bb.append(nn.Linear(self.feature_num*args.embedding_dim, self.feature_num))
18 | self.W_g_bb = nn.Parameter(torch.Tensor(self.sub_network_num, self.sub_network_num * self.feature_num))
19 | self.b_g_bb = nn.Parameter(torch.Tensor(self.sub_network_num))
20 |
21 | self.W = nn.Parameter(torch.Tensor(self.sub_network_num))
22 |
23 | self.mode = 'train'
24 | if self.mode == 'retrain':
25 |             raise Exception('mvfs should not be used in retrain mode')
26 | self.optimizer_method = 'normal'
27 |
28 | self.load_checkpoint = False
29 | self.t = 1
30 |
31 | def forward(self, x, current_epoch, current_step, raw_data):
32 | b,f,e = x.shape
33 | if current_epoch is not None and current_epoch < self.pretrain_epoch:
34 | return x
35 | else:
36 | self.t += 0.001
37 | C = []
38 | for i in range(self.sub_network_num):
39 | C.append(torch.softmax(self.sub_network_list_bb[i](x.reshape(b, -1)), dim=1))
40 | r = self.W_g_bb.unsqueeze(0) @ torch.cat(C, dim=1).reshape(b,-1,1) + self.b_g_bb.reshape(1,-1,1)
41 | r = torch.softmax(r, dim=1) # b, K, 1
42 | I = torch.mul(r, torch.stack(C, dim=1))
43 | I = I.sum(dim=1) # b, f
44 | # 𝑠𝑛 = 0.5 ∗ (1 + tanh(𝜏 · (𝐼𝑛 − 𝑙)))
45 | if self.t < 5:
46 | s = 0.5 * (1 + torch.tanh(5 * (I - self.l)))
47 | else:
48 | s = 0.5 * (1 + torch.tanh(self.t * (I-self.l)))
49 | x = x * s.unsqueeze(2)
50 | return x
51 |
52 |
--------------------------------------------------------------------------------
/models/fs/no_selection.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class no_selection(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(no_selection, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/optfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 |
6 | class optfs(nn.Module):
7 |
8 | def __init__(self, args, unique_values, features):
9 | super().__init__()
10 |
11 | self.mask_weight = nn.Parameter(torch.Tensor(np.sum(unique_values), 1))
12 | nn.init.constant_(self.mask_weight, 0.5)
13 | self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))
14 |
15 | self.mode = 'train'
16 | self.device = args.device
17 | self.features = features
18 |
19 | self.gamma = args.fs_config[args.fs]['gamma']
20 | if args.dataset == 'avazu':
21 | self.gamma = 5000
22 | elif args.dataset == 'criteo':
23 | self.gamma = 2000
24 | else:
25 | self.gamma = 2000
26 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
27 | self.load_checkpoint = True
28 | self.optimizer_method = 'normal'
29 |
30 | self.temp_increase = self.gamma ** (1./ (self.pretrain_epoch-1))
31 | self.temp = 1
32 | self.current_epoch = -1
33 |
34 | def sigmoid(self, x):
35 | return float(1./(1.+np.exp(-x)))
36 |
37 | def compute_mask(self, raw_data, temp, ticket):
38 | scaling = 1./ self.sigmoid(0.5)
39 | mask_weight = F.embedding(raw_data + raw_data.new_tensor(self.offsets), self.mask_weight)
40 | if ticket:
41 | mask = (mask_weight > 0).float()
42 | else:
43 | mask = torch.sigmoid(temp * mask_weight)
44 | return scaling * mask
45 |
46 | def forward(self, x, current_epoch, current_step, raw_data):
47 | b,f,e = x.shape
48 | if current_epoch != self.current_epoch:
49 | self.temp *= self.temp_increase
50 | self.current_epoch = current_epoch
51 | if self.mode == 'retrain':
52 | ticket = True
53 | else:
54 | ticket = False
55 | mask = self.compute_mask(raw_data, self.temp, ticket)
56 |
57 | return x * mask
58 |
59 | def before_retrain(self):
60 | # print remain
61 | ratio = float((self.mask_weight > 0).sum()) / self.mask_weight.numel()
62 | print('remain: ', ratio)
63 |
--------------------------------------------------------------------------------
/models/fs/optfs_old.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class optfs(nn.Module):
5 |
6 | def __init__(self, args, unique_values, features):
7 | super().__init__()
8 |
9 | self.gate = {features[field_idx]: torch.Tensor(unique_values[field_idx], 1).to(args.device) for field_idx in range(len(features))}
10 | for feature in features:
11 | torch.nn.init.xavier_uniform_(self.gate[feature].data)
12 |
13 | self.raw_gate = {features[field_idx]: self.gate[features[field_idx]].clone().detach().to(args.device) for field_idx in range(len(features))}
14 | self.raw_gc = torch.concat([self.raw_gate[feature] for feature in features], dim=0)
15 |
16 | self.g = {feature: torch.ones_like(self.gate[feature]).to(args.device) for feature in features}
17 | self.gate = {feature: nn.Parameter(self.gate[feature], requires_grad=True) for feature in features}
18 | self.gate = torch.nn.ParameterDict(self.gate)
19 |
20 | self.mode = 'train'
21 | self.device = args.device
22 | self.features = features
23 |
24 | self.gamma = args.fs_config[args.fs]['gamma']
25 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
26 | self.load_checkpoint = True
27 | self.optimizer_method = 'normal'
28 |
29 | def forward(self, x, current_epoch, current_step, raw_data):
30 | b,f,e = x.shape
31 | gc = torch.concat([self.gate[feature] for feature in self.features], dim=0)
32 |         if current_epoch is not None: # i.e., in training or validation
33 | t = self.gamma ** (current_epoch / self.pretrain_epoch)
34 |         else: # current_epoch is None, i.e., in test or retrain
35 | t = self.gamma
36 | if self.mode == 'train':
37 | self.g_tmp = torch.sigmoid(gc * t) / torch.sigmoid(self.raw_gc)
38 |             # slice g_tmp back into the per-field gates g
39 | for feature in self.features:
40 | self.g[feature] = self.g_tmp[:len(self.gate[feature])]
41 | self.g_tmp = self.g_tmp[len(self.gate[feature]):]
42 | x_ = torch.zeros_like(x).to(self.device)
43 | for j in range(f):
44 | feature = self.features[j]
45 | x_[:,j,:] = x[:,j,:] * self.g[feature][raw_data[:,j]]
46 | elif self.mode == 'retrain':
47 | # self.g_tmp = torch.concat([self.g[feature] for feature in self.features], dim=0)
48 | x_ = torch.zeros_like(x).to(self.device)
49 | for j in range(f):
50 | feature = self.features[j]
51 | x_[:,j,:] = x[:,j,:] * self.g[feature][raw_data[:,j]]
52 |
53 |
54 |
55 | # for i in range(b):
56 | # for j in range(f):
57 | # feature = self.features[j]
58 | # x_[i,j,:] = x[i,j,:] * self.g[feature][raw_data[i,j]]
59 | # for j in range(f):
60 | # feature = self.features[j]
61 | # x_[:,j,:] = x[:,j,:] * self.g[feature][raw_data[:,j]]
62 |
63 | return x_
64 |
65 | def before_retrain(self):
66 |         # binarize the gates: entries of self.gate <= 0 become 0, the rest become 1
67 | self.gate.requires_grad_(False)
68 | for feature in self.features:
69 | self.gate[feature][self.gate[feature] <= 0] = 0
70 | self.gate[feature][self.gate[feature] > 0] = 1
71 | print('feature:', feature, 'keep ratio:', torch.sum(self.gate[feature])/self.gate[feature].shape[0])
72 | self.g = {feature: nn.Parameter(self.gate[feature].clone().detach().to(self.device)) for feature in self.features}
73 | self.g = torch.nn.ParameterDict(self.g)
74 | self.g.requires_grad_(False)
75 |
--------------------------------------------------------------------------------
/models/fs/permutation.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class permutation(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(permutation, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/rf.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class rf(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(rf, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/sfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import tqdm
5 |
6 |
7 | class sfs(nn.Module):
8 |
9 | def __init__(self, args, unique_values, features):
10 | super(sfs, self).__init__()
11 | self.load_checkpoint = False
12 | self.optimizer_method = 'normal'
13 |
14 | self.feature_num = len(unique_values)
15 | self.device = args.device
16 | self.args = args
17 | self.criterion = torch.nn.BCELoss()
18 | self.features = np.array(features)
19 |
20 | opt = args.fs_config[args.fs]
21 | #self.cr = opt['cr']
22 | self.num_batch_sampling = opt['num_batch_sampling']
23 |
24 | self.mode = 'train'
25 | self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))
26 | print(self.offsets)
27 | print(self.feature_num)
28 | self.mask = nn.Parameter(torch.ones([self.feature_num,1]))
29 | self.mask.requires_grad = False
30 |
31 | def forward(self, x, current_epoch, current_step, raw_data):
32 | return x*self.mask
33 |
34 | def save_selection(self, k):
35 | def prun(dataloader,model,device):
36 | model.fs.mask.requires_grad = True
37 | for i, (c_data, labels) in enumerate(dataloader):
38 | if i == model.fs.num_batch_sampling:
39 | break
40 | c_data, labels = c_data.to(device), labels.to(device)
41 | out = model(c_data,0,i)
42 | loss =self.criterion(out, labels.float().unsqueeze(-1))
43 | model.zero_grad()
44 | loss.backward()
45 | grads = torch.abs(model.fs.mask.grad)
46 | if i == 0:
47 | moving_average_grad = grads
48 | else:
49 | moving_average_grad = ((moving_average_grad * i) + grads) / (i + 1)
50 | grads = torch.flatten(moving_average_grad)
51 | importance = grads / grads.sum()
52 | feature_rank = torch.argsort(importance, descending=True)
53 | ranked_importance = importance[feature_rank].detach().cpu().numpy()
54 | ranked_features = [self.features[i] for i in feature_rank]
55 | return np.array([ranked_features, ranked_importance])
56 | return prun
57 |
58 |
59 |
60 |
61 |
62 |
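
Note that save_selection above returns a closure rather than a ranking: the caller invokes it with a dataloader, the wrapping model and a device, and it temporarily enables gradients on the all-ones field mask to rank fields by the moving average of the absolute mask gradient. A rough usage sketch, where model, val_dataloader and the device string are assumptions rather than names defined in this file:

# assumed: `model` is the basemodel wrapping this fs module (exposing model.fs),
# and `val_dataloader` yields (features, label) batches as in utils/fs_trainer.py
prun = model.fs.save_selection(k=10)              # k is accepted but unused by sfs itself
ranked = prun(val_dataloader, model, 'cuda:0')    # array of shape (2, num_fields)
ranked_features, ranked_importance = ranked
print(ranked_features[:5], ranked_importance[:5])
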
--------------------------------------------------------------------------------
/models/fs/shark.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import tqdm
5 | import numpy as np
6 |
7 | class shark(nn.Module):
8 | def __init__(self, args, unique_values, features):
9 | super(shark, self).__init__()
10 | self.feature_num = len(unique_values)
11 | self.features = np.array(features)
12 |         # required attributes
13 | self.load_checkpoint = False
14 | self.optimizer_method = 'normal'
15 | self.criterion = torch.nn.BCELoss()
16 | self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))
17 |
18 | def forward(self, x, current_epoch, current_step, raw_data):
19 | return x
20 |
21 | def save_selection(self, k):
22 | def selection(test_dataloader, model, device):
23 | tk0 = tqdm.tqdm(test_dataloader, desc="f-permutation", smoothing=0, mininterval=1.0)
24 | model = model.to(device)
25 | num = 0
26 | # importance = torch.zeros(len(model.offsets)).to(device) # save importance for each field
27 | importance = np.zeros(len(model.offsets))
28 | expectation = torch.zeros((len(model.offsets))).to(device)
29 |             for x, y in tk0:
30 | x = x.to(device)
31 | y = y.to(device)
32 | embs = model.embedding(x + x.new_tensor(self.offsets))
33 | if len(expectation.shape) == 1:
34 | expectation = torch.zeros((len(model.offsets), embs.shape[2])).to(device)
35 | expectation += torch.sum(embs, dim=0)
36 | num += x.shape[0]
37 | expectation = expectation / num
38 | expectation = expectation.reshape(1, len(model.offsets), -1)
39 | # expectation = torch.zeros((1, len(model.offsets), 8)).to(device)
40 | num = 0
41 | new_dataloader = torch.utils.data.DataLoader(test_dataloader.dataset, batch_size=1, num_workers=16)
42 | tk0 = tqdm.tqdm(new_dataloader, desc="f-permutation", smoothing=0, mininterval=1.0)
43 | for i, (x, y) in enumerate(tk0):
44 | x = x.to(device)
45 | y = y.to(device)
46 | model.zero_grad()
47 | embs = model.embedding(x + x.new_tensor(self.offsets))
48 | # expectation = torch.mean(embs, dim=0)
49 | expectation_resize = expectation.repeat(x.shape[0], 1,1)
50 | right_part = expectation_resize - embs
51 | y_pred = model(x, current_epoch=None, current_step=i)
52 | loss = self.criterion(y_pred, y.float().reshape(-1, 1))
53 | # cal gradient for each embedding
54 | loss.backward()
55 | # get gradient
56 | gradients = F.embedding(x + x.new_tensor(self.offsets),model.embedding.weight.grad).to(device)
57 | # use the torch.gradient
58 | # cal importance
59 | error = gradients * right_part # b,f,e
60 | error = torch.sum(error, dim=2) # b,f
61 | error = torch.sum(abs(error), dim=0) # f
62 | importance += error.detach().cpu().numpy()
63 | num += x.shape[0]
64 | importance = importance / num
65 | # sort importance
66 | feature_rank = np.argsort(importance)[::-1]
67 | ranked_importance = importance[feature_rank]
68 | ranked_features = [self.features[i] for i in feature_rank]
69 | return np.array([ranked_features, ranked_importance])
70 | return selection
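
The selection closure above scores each field with a first-order Taylor argument: replacing a sample's embedding e by its dataset-wide expectation E[e] changes the loss by approximately grad · (E[e] - e), and the absolute value of that term, summed over samples, is the field's importance. A minimal numpy sketch of that scoring step on synthetic tensors (not repository data):

import numpy as np

b, f, e = 4, 3, 8                                     # batch, fields, embedding dim (synthetic)
grads = np.random.randn(b, f, e)                      # d(loss)/d(embedding) per sample and field
embs = np.random.randn(b, f, e)                       # the embeddings actually used
expectation = embs.mean(axis=0, keepdims=True)        # E[e] per field, as in the first loop above

error = grads * (expectation - embs)                  # first-order change in the loss, elementwise
importance = np.abs(error.sum(axis=2)).sum(axis=0)    # sum over embedding dim, then over samples
print(np.argsort(importance)[::-1])                   # fields ranked, most important first
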
--------------------------------------------------------------------------------
/models/fs/xgb.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class xgb(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(xgb, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/layers.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | from itertools import combinations
4 |
5 | class MLP(nn.Module):
6 |
7 | def __init__(self, input_dim, output_layer=True, dims=None, dropout=0):
8 | super().__init__()
9 | if dims is None:
10 | dims = []
11 | layers = list()
12 | for i_dim in dims:
13 | layers.append(nn.Linear(input_dim, i_dim))
14 | layers.append(nn.BatchNorm1d(i_dim))
15 | layers.append(nn.ReLU())
16 | layers.append(nn.Dropout(p=dropout))
17 | input_dim = i_dim
18 | if output_layer:
19 | layers.append(nn.Linear(input_dim, 1))
20 | self.mlp = nn.Sequential(*layers)
21 |
22 | def forward(self, x):
23 | return self.mlp(x)
24 |
25 | class CrossNetwork(nn.Module):
26 | """CrossNetwork mentioned in the DCN paper.
27 |
28 | Args:
29 | input_dim (int): input dim of input tensor
30 |
31 | Shape:
32 | - Input: `(batch_size, *)`
33 | - Output: `(batch_size, *)`
34 |
35 | """
36 |
37 | def __init__(self, input_dim, num_layers):
38 | super().__init__()
39 | self.num_layers = num_layers
40 | self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)])
41 | self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])
42 |
43 | def forward(self, x):
44 | """
45 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
46 | """
47 | x0 = x
48 | for i in range(self.num_layers):
49 | xw = self.w[i](x)
50 | x = x0 * xw + self.b[i] + x
51 | return x
52 |
53 | class SENETLayer(nn.Module):
54 | """
55 | A weighted feature gating system in the SENet paper
56 | Args:
57 | num_fields (int): number of feature fields
58 |
59 | Shape:
60 |         - Input: `(batch_size, *)`
61 | - Output: `(batch_size, *)`
62 | """
63 | def __init__(self, num_fields, reduction_ratio=3):
64 | super(SENETLayer, self).__init__()
65 | reduced_size = max(1, int(num_fields/ reduction_ratio))
66 | self.mlp = nn.Sequential(nn.Linear(num_fields, reduced_size, bias=False),
67 | nn.ReLU(),
68 | nn.Linear(reduced_size, num_fields, bias=False),
69 | nn.ReLU())
70 | def forward(self, x):
71 | z = torch.mean(x, dim=-1, out=None)
72 | a = self.mlp(z)
73 | v = x*a.unsqueeze(-1)
74 | return v
75 |
76 | class BiLinearInteractionLayer(nn.Module):
77 | """
78 |     Bilinear feature interaction module, an improved pairwise-interaction scheme building on the FFM model
79 |     Args:
80 |         num_fields (int): number of feature fields
81 |         bilinear_type (str): the type of bilinear interaction function
82 |     Shape:
83 |         - Input: `(batch_size, *)`
84 | - Output: `(batch_size, *)`
85 | """
86 | def __init__(self, input_dim, num_fields, bilinear_type = "field_interaction"):
87 | super(BiLinearInteractionLayer, self).__init__()
88 | self.bilinear_type = bilinear_type
89 | if self.bilinear_type == "field_all":
90 | self.bilinear_layer = nn.Linear(input_dim, input_dim, bias=False)
91 | elif self.bilinear_type == "field_each":
92 | self.bilinear_layer = nn.ModuleList([nn.Linear(input_dim, input_dim, bias=False) for i in range(num_fields)])
93 | elif self.bilinear_type == "field_interaction":
94 | self.bilinear_layer = nn.ModuleList([nn.Linear(input_dim, input_dim, bias=False) for i,j in combinations(range(num_fields), 2)])
95 | else:
96 | raise NotImplementedError()
97 |
98 | def forward(self, x):
99 | feature_emb = torch.split(x, 1, dim=1)
100 | if self.bilinear_type == "field_all":
101 | bilinear_list = [self.bilinear_layer(v_i)*v_j for v_i, v_j in combinations(feature_emb, 2)]
102 | elif self.bilinear_type == "field_each":
103 | bilinear_list = [self.bilinear_layer[i](feature_emb[i])*feature_emb[j] for i,j in combinations(range(len(feature_emb)), 2)]
104 | elif self.bilinear_type == "field_interaction":
105 | bilinear_list = [self.bilinear_layer[i](v[0])*v[1] for i,v in enumerate(combinations(feature_emb, 2))]
106 | return torch.cat(bilinear_list, dim=1)
107 |
108 | class FactorizationMachine(torch.nn.Module):
109 | def __init__(self, reduce_sum=True):
110 | super().__init__()
111 | self.reduce_sum = reduce_sum
112 |
113 | def forward(self, x):
114 | """
115 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
116 | :return : tensor of size (batch_size, 1) if reduce_sum
117 | tensor of size (batch_size, embed_dim) else
118 | """
119 | square_of_sum = torch.sum(x, dim=1) ** 2
120 | sum_of_square = torch.sum(x ** 2, dim=1)
121 | ix = square_of_sum - sum_of_square
122 | if self.reduce_sum:
123 | ix = torch.sum(ix, dim=1, keepdim=True)
124 | return 0.5 * ix
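
FactorizationMachine above relies on the standard FM identity: the sum of pairwise dot products of the field embeddings equals 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), which is what the square_of_sum / sum_of_square lines compute. A small check on a synthetic tensor (not repository code):

import torch
from itertools import combinations

x = torch.randn(2, 4, 8)                                      # (batch, num_fields, embed_dim)
pairwise = sum((x[:, i] * x[:, j]).sum(-1, keepdim=True)
               for i, j in combinations(range(4), 2))          # explicit O(f^2) pairwise sum
fm = 0.5 * ((x.sum(1) ** 2 - (x ** 2).sum(1)).sum(1, keepdim=True))
print(torch.allclose(pairwise, fm, atol=1e-5))                 # True
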
--------------------------------------------------------------------------------
/models/rec/dcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, CrossNetwork
4 |
5 | class dcn(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(dcn, self).__init__()
9 | self.dims = input_dim
10 |
11 | self.cn = CrossNetwork(self.dims, num_layers=2)
12 | self.mlp = MLP(self.dims, False, dims=[32,16], dropout=0.2)
13 | self.linear = nn.Linear(self.dims + 16, 1)
14 |
15 | def forward(self, x):
16 | b,f,e = x.shape
17 | x = x.reshape(b,-1)
18 | cn_out = self.cn(x)
19 | mlp_out = self.mlp(x)
20 | x = torch.cat([cn_out, mlp_out], dim=1)
21 | x = self.linear(x)
22 | x = torch.sigmoid(x)
23 | return x
--------------------------------------------------------------------------------
/models/rec/deepfm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, FactorizationMachine
4 |
5 | class deepfm(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(deepfm, self).__init__()
9 | self.dims = input_dim
10 |
11 | self.dropout = 0.2
12 | self.dnn = MLP(self.dims, True, dims=[32, 16], dropout=self.dropout)
13 | self.fm = FactorizationMachine(reduce_sum=True)
14 |
15 | def forward(self, x):
16 | b,f,e = x.shape
17 | output_fm = self.fm(x)
18 | x_dnn = x.reshape(b,-1)
19 | x_dnn = self.dnn(x_dnn)
20 | output = output_fm + x_dnn
21 | output = torch.sigmoid(output)
22 | return output
--------------------------------------------------------------------------------
/models/rec/fibinet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, SENETLayer, BiLinearInteractionLayer
4 |
5 | class fibinet(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(fibinet, self).__init__()
9 |
10 |         # embedding dimension is taken from args, so it stays in sync with the embedding layer
11 | embedding_dim = args.embedding_dim
12 | self.dims = input_dim
13 | self.num_fields = self.dims // embedding_dim
14 | self.senet_layer = SENETLayer(self.num_fields, reduction_ratio=3)
15 | self.bilinear_interaction = BiLinearInteractionLayer(embedding_dim, self.num_fields, bilinear_type="field_interaction")
16 | self.hidden_size = self.num_fields * (self.num_fields - 1) * embedding_dim
17 | self.mlp = MLP(self.hidden_size, True, dims=[32,16], dropout=0.2)
18 |
19 | def forward(self, x):
20 | b,f,e = x.shape
21 | embed_senet = self.senet_layer(x)
22 | embed_bi1 = self.bilinear_interaction(x)
23 | embed_bi2 = self.bilinear_interaction(embed_senet)
24 | shallow_part = torch.flatten(torch.cat([embed_bi1, embed_bi2], dim=1), start_dim=1)
25 | mlp_out = self.mlp(shallow_part)
26 | output = torch.sigmoid(mlp_out)
27 | return output
28 |
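
The hidden_size above follows from the bilinear layer emitting one embed_dim vector per field pair: two bilinear passes (one on the raw embeddings, one on the SENET-reweighted embeddings) give 2 * C(num_fields, 2) * embedding_dim = num_fields * (num_fields - 1) * embedding_dim inputs to the MLP. A quick sanity check, assuming 22 fields and an embedding dim of 8 purely for illustration:

from math import comb

num_fields, embedding_dim = 22, 8                 # illustrative, not taken from config.yaml
per_pass = comb(num_fields, 2) * embedding_dim    # one vector of size embedding_dim per field pair
hidden_size = 2 * per_pass                        # raw + SENET-reweighted bilinear outputs
assert hidden_size == num_fields * (num_fields - 1) * embedding_dim
print(hidden_size)                                # 3696
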
--------------------------------------------------------------------------------
/models/rec/fm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, FactorizationMachine
4 |
5 | class fm(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(fm, self).__init__()
9 | self.dims = input_dim
10 |
11 | self.fm = FactorizationMachine(reduce_sum=True)
12 |
13 | def forward(self, x):
14 | b,f,e = x.shape
15 | output_fm = self.fm(x)
16 | x = torch.sigmoid(output_fm)
17 | return x
--------------------------------------------------------------------------------
/models/rec/mlp.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | import torch
4 |
5 | # class mlp(nn.Module):
6 | # def __init__(self, input_size, hidden_size = 16, output_size = 16, dropout = 0.2):
7 | # super(mlp, self).__init__()
8 | # self.fc1 = nn.Linear(input_size, hidden_size)
9 | # self.dropout1 = nn.Dropout(dropout)
10 | # self.fc2 = nn.Linear(hidden_size, hidden_size)
11 | # self.dropout2 = nn.Dropout(dropout)
12 | # self.fc3 = nn.Linear(hidden_size, output_size)
13 | # self.output_layer = nn.Linear(output_size, 1)
14 | # self.init_weights()
15 |
16 | # # optimizer
17 | # self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
18 |
19 | # def init_weights(self):
20 | # for m in self.modules():
21 | # if isinstance(m, nn.Linear):
22 | # nn.init.normal_(m.weight.data, mean=0, std=0.01)
23 | # nn.init.constant_(m.bias.data, 0)
24 |
25 | # def forward(self, x):
26 | # b = x.shape[0]
27 | # x = x.reshape(b, -1)
28 | # x = self.fc1(x)
29 | # x = torch.relu(x)
30 | # x = self.dropout1(x)
31 | # x = torch.relu(self.fc2(x))
32 | # x = self.dropout2(x)
33 | # x = self.fc3(x)
34 | # x = self.output_layer(x)
35 | # x = torch.sigmoid(x)
36 | # return x
37 |
38 | class mlp(nn.Module):
39 | def __init__(self, args, input_dim, embed_dims = [16,16], dropout = 0.2, output_layer=True):
40 | super().__init__()
41 | layers = list()
42 | self.mlps = nn.ModuleList()
43 | self.out_layer = output_layer
44 | for embed_dim in embed_dims:
45 | layers.append(nn.Linear(input_dim, embed_dim))
46 | layers.append(nn.BatchNorm1d(embed_dim))
47 | layers.append(nn.ReLU())
48 | layers.append(nn.Dropout(p=dropout))
49 | input_dim = embed_dim
50 | self.mlps.append(nn.Sequential(*layers))
51 | layers = list()
52 | if self.out_layer:
53 | self.out = nn.Linear(input_dim, 1)
54 |
55 | def forward(self, x):
56 | """
57 | :param x: Float tensor of size ``(batch_size, embed_dim)``
58 | """
59 | b = x.shape[0]
60 | x = x.reshape(b,-1)
61 | for layer in self.mlps:
62 | x = layer(x)
63 | if self.out_layer:
64 | x = self.out(x)
65 | x = torch.sigmoid(x)
66 | return x
--------------------------------------------------------------------------------
/models/rec/widedeep.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from ..layers import MLP
5 |
6 | class widedeep(nn.Module):
7 |
8 | def __init__(self, args, input_dim):
9 | super(widedeep, self).__init__()
10 | self.dims = input_dim
11 |
12 | self.mlp = MLP(self.dims, True, dims=[32,16], dropout=0.2)
13 | self.linear = nn.Linear(self.dims, 1)
14 |
15 | def forward(self, x):
16 | b,f,e = x.shape
17 | x = x.reshape(b,-1)
18 | mlp_out = self.mlp(x)
19 | linear_out = self.linear(x)
20 | x = mlp_out + linear_out
21 | x = torch.sigmoid(x)
22 | return x
--------------------------------------------------------------------------------
/nni/search_spaces/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "k": {"_type": "randint", "_value": [15,23]},
3 | "learning_rate": {"_type": "choice", "_value": [0.001, 0.0005, 0.0001]}
4 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/adafs.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_pretrain_epoch": {"_type": "choice", "_value": [1,2,3,4,5,6]},
3 | "fs_hidden_size": {"_type": "choice", "_value": [16,32,64,128]},
4 | "fs_dropout": {"_type": "choice", "_value": [0, 0.2, 0.4, 0.6]},
5 | "fs_update_frequency": {"_type": "choice", "_value": [1,2,4,8,16]}
6 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/autofield.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_update_frequency": {"_type": "choice", "_value": [5,10,15,20,30]}
3 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/gbdt.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_learning_rate": {"_type": "choice", "_value": [0.1,0.01,0.001,0.0001]},
3 | "fs_n_estimators": {"_type": "choice", "_value": [50,100,200]},
4 | "fs_subsample": {"_type": "choice", "_value": [0.5, 0.7, 1.0]},
5 | "fs_min_samples_split": {"_type": "choice", "_value": [2, 4, 8, 16]},
6 | "fs_min_samples_leaf": {"_type": "choice", "_value": [1, 2, 4, 8, 16]},
7 | "fs_min_weight_fraction_leaf": {"_type": "choice", "_value": [0.0, 0.3, 0.5]},
8 | "fs_max_depth": {"_type": "choice", "_value": [3, 6, 9]},
9 | "fs_n_iter_no_change": {"_type": "choice", "_value": [null, 3, 6, 9]}
10 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/lasso.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_alpha": {"_type": "choice", "_value": [0.0001,0.001,0.01,0.1,1.0]},
3 | "fs_fit_intercept": {"_type": "choice", "_value": [true, false]},
4 | "fs_copy_X": {"_type": "choice", "_value": [true,false]},
5 | "fs_max_iter": {"_type": "choice", "_value": [100,200,500,1000,2000]},
6 | "fs_tol": {"_type": "choice", "_value": [1e-5,1e-4,1e-3,1e-2]},
7 | "fs_positive": {"_type": "choice", "_value": [true,false]},
8 | "fs_selection": {"_type": "choice", "_value": ["cyclic","random"]}
9 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/optfs.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_epochs": {"_type": "choice", "_value": [5,10,15]},
3 | "fs_gamma": {"_type": "choice", "_value": [200,500,1000,2000,5000,10000]}
4 | }
--------------------------------------------------------------------------------
/nni_tune.py:
--------------------------------------------------------------------------------
1 | import nni
2 | import argparse
3 | import json
4 | import os
5 | import re
6 | from nni.experiment import Experiment
7 | from utils.utils import str2bool
8 |
9 |
10 | if __name__ == '__main__':
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--dataset', type=str, default='criteo', help='avazu, criteo')
13 | parser.add_argument('--model', type=str, default='mlp')
14 | parser.add_argument('--fs', type=str, default='no_selection')
15 | parser.add_argument('--es', type=str, default='no_selection')
16 | parser.add_argument('--data_path', type=str, default='data/')
17 | parser.add_argument('--train_or_search', type=str2bool, default=True, help='whether to train or search')
18 | parser.add_argument('--retrain', type=str2bool, default=True, help='whether to retrain')
19 | parser.add_argument('--k', type=int, default=0, help='top k features, if set, use just this k')
20 | parser.add_argument('--port', type=int, default=8080, help='port of nni server')
21 |
22 | args = parser.parse_args()
23 | script_name = None
24 | if args.es != 'no_selection':
25 | script_name = 'es_run.py'
26 | else:
27 | script_name = 'fs_run.py'
28 |
29 | field_num = 0
30 | if args.dataset == 'avazu':
31 | field_num = 22
32 | elif args.dataset == 'criteo':
33 | field_num = 39
34 |
35 | fs_search_space, es_search_space, model_search_space = None, None, None
36 | fs_search_space_path = 'nni/search_spaces/fs/' + args.fs + '.json'
37 | es_search_space_path = 'nni/search_spaces/es/' + args.es + '.json'
38 | model_search_space_path = 'nni/search_spaces/config.json'
39 | if not os.path.exists(fs_search_space_path):
40 |         print('fs search space does not exist, skip')
41 | else:
42 | with open(fs_search_space_path, 'r') as f:
43 | fs_search_space = json.load(f)
44 | if not os.path.exists(es_search_space_path):
45 |         print('es search space does not exist, skip')
46 | else:
47 | with open(es_search_space_path, 'r') as f:
48 | es_search_space = json.load(f)
49 | with open(model_search_space_path, 'r') as f:
50 | model_search_space = json.load(f)
51 | search_space = {}
52 | if fs_search_space is not None:
53 | search_space.update(fs_search_space)
54 | if es_search_space is not None:
55 | search_space.update(es_search_space)
56 | search_space.update(model_search_space)
57 |
58 | if args.k == 0:
59 |         # if k is not specified, search k over roughly field_num * 0.8 to field_num
60 | search_space["k"] = {"_type": "randint", "_value": [int(field_num * 0.8), field_num]}
61 | else:
62 |         # if k is specified, fix it as the only choice
63 | search_space["k"] = {"_type": "choice", "_value": [args.k]}
64 |
65 | experiment = Experiment('local')
66 | experiment.config.experiment_name = args.dataset + '_' + args.model + '_' + args.fs + '_' + args.es
67 | experiment.config.trial_command = 'python {} --dataset={} --model={} --fs={} --es={} --data_path={} --nni=True --train_or_search={} --retrain={} --k={}'.format(script_name, args.dataset, args.model, args.fs, args.es, args.data_path, args.train_or_search, args.retrain, args.k)
68 | experiment.config.trial_code_directory = '.' # code directory
69 | experiment.config.experiment_working_directory = 'experiments/' # working directory
70 | if not os.path.exists(experiment.config.experiment_working_directory):
71 | os.makedirs(experiment.config.experiment_working_directory)
72 | experiment.config.search_space = search_space
73 |
74 | experiment.config.tuner.name = 'TPE'
75 | experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
76 |
77 | experiment.config.max_trial_number = 16
78 | experiment.config.trial_concurrency = 8
79 | experiment.config.max_experiment_duration = '24h'
80 |
81 | experiment.config.trial_gpu_number = 1
82 | experiment.config.training_service.use_active_gpu = True
83 |
84 | experiment.run(args.port)
85 | # experiment_id = nni.get_experiment_id()
86 | # # get the best parameters
87 | # experiment_dir = os.path.join('nni-experiments',experiment_id, 'trials')
88 | # auc_value, logloss_value = 0.0, 100.0
89 | # best_trial = None
90 | # for trial in os.listdir(experiment_dir):
91 | # file_path = os.path.join(experiment_dir, trial, 'trial.log')
92 | # auc_pattern = r"test auc: ([0-9.]+)"
93 | # logloss_pattern = r"test logloss: ([0-9.]+)"
94 | # with open(file_path, "r") as file:
95 | # lines = file.readlines()
96 | # auc_match = re.search(auc_pattern, lines[-2])
97 | # logloss_match = re.search(logloss_pattern, lines[-1])
98 | # if auc_match:
99 | # auc_value = max(auc_value, float(auc_match.group(1)))
100 | # if auc_value == float(auc_match.group(1)):
101 | # best_trial = trial
102 | # if logloss_match:
103 | # logloss_value = min(logloss_value, float(logloss_match.group(1)))
104 | # print('best trial: ', best_trial)
105 | # print('best auc: ', auc_value)
106 | # print('best logloss: ', logloss_value)
107 | # print('best parameters:')
108 | # best_trial_para_path = os.path.join(experiment_dir, best_trial, 'parameter.cfg')
109 | # with open(best_trial_para_path, 'r') as file:
110 | # lines = file.readlines()
111 | # print(lines)
112 |
113 | experiment.stop()
114 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy~=1.19.3
2 | pandas~=1.4.4
3 | scikit_learn~=1.2.2
4 | torch~=1.11.0
5 | tqdm~=4.65.0
6 | pyyaml
7 | nni
8 | xgboost
--------------------------------------------------------------------------------
/utils/datasets.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | import torch
5 | from torch.utils.data import TensorDataset, DataLoader
6 | from sklearn.preprocessing import LabelEncoder
7 |
8 | def read_dataset(dataset_name, data_path, batch_size, shuffle, num_workers, use_fields=None, machine_learning_method=False):
9 | if machine_learning_method:
10 | if dataset_name == 'avazu':
11 | return read_avazu_ml(data_path, batch_size, shuffle)
12 | elif dataset_name == 'criteo':
13 | return read_criteo_ml(data_path, batch_size, shuffle)
14 | elif dataset_name == 'movielens-1m':
15 | return read_movielens1m_ml(data_path, batch_size, shuffle)
16 | elif dataset_name == 'aliccp':
17 | return read_aliccp_ml(data_path, batch_size, shuffle)
18 | elif not machine_learning_method:
19 | if dataset_name == 'avazu':
20 | return read_avazu(data_path, batch_size, shuffle, num_workers, use_fields)
21 | elif dataset_name == 'criteo':
22 | return read_criteo(data_path, batch_size, shuffle, num_workers, use_fields)
23 | elif dataset_name == 'movielens-1m':
24 | return read_movielens1m(data_path, batch_size, shuffle, num_workers, use_fields)
25 | elif dataset_name == 'aliccp':
26 | return read_aliccp(data_path, batch_size, shuffle, num_workers, use_fields)
27 |
28 | def read_avazu(data_path, batch_size, shuffle, num_workers, use_fields=None):
29 | dtypes = {
30 | 'click': np.int8,
31 | 'hour':np.int16,
32 | 'C1':np.int8,
33 | 'banner_pos':np.int8,
34 | 'site_id':np.int16,
35 | 'site_domain':np.int16,
36 | 'site_category':np.int8,
37 | 'app_id':np.int16,
38 | 'app_domain':np.int16,
39 | 'app_category':np.int8,
40 | 'device_id':np.int32,
41 | 'device_ip':np.int32,
42 | 'device_model':np.int16,
43 | 'device_type':np.int8,
44 | 'device_conn_type':np.int8,
45 | 'C14':np.int16,
46 | 'C15':np.int8,
47 | 'C16':np.int8,
48 | 'C17':np.int16,
49 | 'C18':np.int8,
50 | 'C19':np.int8,
51 | 'C20':np.int16,
52 | 'C21':np.int8
53 | }
54 | print('start reading avazu...')
55 | if use_fields is None:
56 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes)
57 | else:
58 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes, usecols=list(use_fields)+['click'])
59 | print('finish reading avazu.')
60 | train_idx = int(df.shape[0] * 0.7)
61 | val_idx = int(df.shape[0] * 0.9)
62 | features = [f for f in df.columns if f not in ['click']]
63 | unique_values = [df[col].max()+1 for col in features]
64 | label = 'click'
65 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
66 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
67 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
68 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
69 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
70 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
71 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
72 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
73 |
74 | def read_avazu_ml(data_path, batch_size, shuffle, use_fields=None):
75 | dtypes = {
76 | 'click': np.int8,
77 | 'hour':np.int16,
78 | 'C1':np.int8,
79 | 'banner_pos':np.int8,
80 | 'site_id':np.int16,
81 | 'site_domain':np.int16,
82 | 'site_category':np.int8,
83 | 'app_id':np.int16,
84 | 'app_domain':np.int16,
85 | 'app_category':np.int8,
86 | 'device_id':np.int32,
87 | 'device_ip':np.int32,
88 | 'device_model':np.int16,
89 | 'device_type':np.int8,
90 | 'device_conn_type':np.int8,
91 | 'C14':np.int16,
92 | 'C15':np.int8,
93 | 'C16':np.int8,
94 | 'C17':np.int16,
95 | 'C18':np.int8,
96 | 'C19':np.int8,
97 | 'C20':np.int16,
98 | 'C21':np.int8
99 | }
100 | print('start reading avazu...')
101 | if use_fields is None:
102 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes)
103 | # df.drop(columns=['item_id:token'], inplace=True)
104 | else:
105 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes, usecols=list(use_fields)+['click'])
106 | print('finish reading avazu.')
107 | train_idx = int(df.shape[0] * 0.7)
108 | val_idx = int(df.shape[0] * 0.9)
109 | features = [f for f in df.columns if f not in ['click']]
110 | unique_values = [df[col].max()+1 for col in features]
111 | label = 'click'
112 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
113 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
114 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
115 |
116 | def read_criteo(data_path, batch_size, shuffle, num_workers, use_fields=None):
117 | dtypes = {
118 | '0': np.int8,
119 | '1': np.int8,
120 | '2': np.int8,
121 | '3': np.int8,
122 | '4': np.int8,
123 | '5': np.int16,
124 | '6': np.int16,
125 | '7': np.int8,
126 | '8': np.int8,
127 | '9': np.int8,
128 | '10': np.int8,
129 | '11': np.int8,
130 | '12': np.int8,
131 | '13': np.int8,
132 | '14': np.int16,
133 | '15': np.int16,
134 | '16': np.int32,
135 | '17': np.int32,
136 | '18': np.int16,
137 | '19': np.int8,
138 | '20': np.int16,
139 | '21': np.int16,
140 | '22': np.int8,
141 | '23': np.int32,
142 | '24': np.int16,
143 | '25': np.int32,
144 | '26': np.int16,
145 | '27': np.int8,
146 | '28': np.int16,
147 | '29': np.int32,
148 | '30': np.int8,
149 | '31': np.int16,
150 | '32': np.int16,
151 | '33': np.int8,
152 | '34': np.int32,
153 | '35': np.int8,
154 | '36': np.int8,
155 | '37': np.int32,
156 | '38': np.int8,
157 | '39': np.int32
158 | }
159 | print('start reading criteo...')
160 | if use_fields is None:
161 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes)
162 | # df.drop(columns=['index:float'], inplace=True)
163 | else:
164 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes, usecols=list(use_fields)+['0'])
165 | print('finish reading criteo.')
166 | train_idx = int(df.shape[0] * 0.7)
167 | val_idx = int(df.shape[0] * 0.9)
168 | features = [f for f in df.columns if f not in ['0']]
169 | unique_values = [df[col].max()+1 for col in features]
170 | label = '0'
171 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
172 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
173 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
174 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
175 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
176 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
177 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
178 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
179 |
180 |
181 | def read_criteo_ml(data_path, batch_size, shuffle, use_fields=None):
182 | dtypes = {
183 | '0': np.int8,
184 | '1': np.int8,
185 | '2': np.int8,
186 | '3': np.int8,
187 | '4': np.int8,
188 | '5': np.int16,
189 | '6': np.int16,
190 | '7': np.int8,
191 | '8': np.int8,
192 | '9': np.int8,
193 | '10': np.int8,
194 | '11': np.int8,
195 | '12': np.int8,
196 | '13': np.int8,
197 | '14': np.int16,
198 | '15': np.int16,
199 | '16': np.int32,
200 | '17': np.int32,
201 | '18': np.int16,
202 | '19': np.int8,
203 | '20': np.int16,
204 | '21': np.int16,
205 | '22': np.int8,
206 | '23': np.int32,
207 | '24': np.int16,
208 | '25': np.int32,
209 | '26': np.int16,
210 | '27': np.int8,
211 | '28': np.int16,
212 | '29': np.int32,
213 | '30': np.int8,
214 | '31': np.int16,
215 | '32': np.int16,
216 | '33': np.int8,
217 | '34': np.int32,
218 | '35': np.int8,
219 | '36': np.int8,
220 | '37': np.int32,
221 | '38': np.int8,
222 | '39': np.int32
223 | }
224 | print('start reading criteo...')
225 | if use_fields is None:
226 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes)
227 | # df.drop(columns=['index:float'], inplace=True)
228 | else:
229 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes, usecols=list(use_fields)+['0'])
230 | print('finish reading criteo.')
231 | train_idx = int(df.shape[0] * 0.7)
232 | val_idx = int(df.shape[0] * 0.9)
233 | features = [f for f in df.columns if f not in ['0']]
234 | unique_values = [df[col].max()+1 for col in features]
235 | label = '0'
236 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
237 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
238 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
239 |
240 | def read_movielens1m(data_path, batch_size, shuffle, num_workers, use_fields=None):
241 | print('start reading movielens 1m...')
242 | if use_fields is None:
243 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'))
244 | else:
245 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'), usecols=list(use_fields)+['rating'])
246 | print('finish reading movielens 1m.')
247 | df['rating'] = df['rating'].apply(lambda x: 1 if x > 3 else 0)
248 | df = df.sample(frac=1, random_state=43) # shuffle
249 | train_idx = int(df.shape[0] * 0.7)
250 | val_idx = int(df.shape[0] * 0.9)
251 | features = [f for f in df.columns if f not in ['rating']]
252 | for feature in features:
253 | le = LabelEncoder()
254 | df[feature] = le.fit_transform(df[feature])
255 | unique_values = [df[col].max()+1 for col in features]
256 | label = 'rating'
257 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
258 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
259 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
260 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
261 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
262 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
263 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
264 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
265 |
266 | def read_movielens1m_ml(data_path, batch_size, shuffle, use_fields=None):
267 | print('start reading movielens 1m...')
268 | if use_fields is None:
269 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'))
270 | # df.drop(columns=['item_id:token'], inplace=True)
271 | else:
272 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'), usecols=list(use_fields)+['rating'])
273 | print('finish reading movielens 1m.')
274 | df['rating'] = df['rating'].apply(lambda x: 1 if x > 3 else 0)
275 | df = df.sample(frac=1, random_state=43) # shuffle
276 | train_idx = int(df.shape[0] * 0.7)
277 | val_idx = int(df.shape[0] * 0.9)
278 | features = [f for f in df.columns if f not in ['rating']]
279 | for feature in features:
280 | le = LabelEncoder()
281 | df[feature] = le.fit_transform(df[feature])
282 | unique_values = [df[col].max()+1 for col in features]
283 | label = 'rating'
284 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
285 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
286 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
287 |
288 | def read_aliccp(data_path, batch_size, shuffle, num_workers, use_fields=None):
289 | print('start reading aliccp...')
290 | data_type = {'click':np.int8, 'purchase': np.int8, '101':np.int32, '121':np.uint8, '122':np.uint8, '124':np.uint8, '125':np.uint8, '126':np.uint8, '127':np.uint8, '128':np.uint8, '129':np.uint8, '205':np.int32, '206':np.int16, '207':np.int32, '210':np.int32, '216':np.int32, '508':np.int16, '509':np.int32, '702':np.int32, '853':np.int32, '301':np.int8, '109_14':np.int16, '110_14':np.int32, '127_14':np.int32, '150_14':np.int32, 'D109_14': np.float16, 'D110_14': np.float16, 'D127_14': np.float16, 'D150_14': np.float16, 'D508': np.float16, 'D509': np.float16, 'D702': np.float16, 'D853': np.float16}
291 | if use_fields is None:
292 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), dtype=data_type)
293 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), dtype=data_type)
294 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), dtype=data_type)
295 | df = pd.concat([df1, df2, df3])
296 | else:
297 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
298 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
299 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
300 | df = pd.concat([df1, df2, df3])
301 | print('finish reading aliccp.')
302 | # df = df.sample(frac=1) # shuffle
303 | train_idx = int(df.shape[0] * 0.5)
304 | val_idx = int(df.shape[0] * 0.75)
305 | features = []
306 | for f in df.columns:
307 | if f not in ['click','purchase'] and f[:1] != 'D':
308 | features.append(f)
309 | if '301' in features:
310 | df['301'] = df['301'] - 1
311 | unique_values = [df[col].max()+1 for col in features]
312 | label = 'click'
313 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
314 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
315 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
316 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
317 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
318 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
319 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
320 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
321 |
322 | def read_aliccp_ml(data_path, batch_size, shuffle, use_fields=None):
323 | print('start reading aliccp...')
324 | data_type = {'click':np.int8, 'purchase': np.int8, '101':np.int32, '121':np.uint8, '122':np.uint8, '124':np.uint8, '125':np.uint8, '126':np.uint8, '127':np.uint8, '128':np.uint8, '129':np.uint8, '205':np.int32, '206':np.int16, '207':np.int32, '210':np.int32, '216':np.int32, '508':np.int16, '509':np.int32, '702':np.int32, '853':np.int32, '301':np.int8, '109_14':np.int16, '110_14':np.int32, '127_14':np.int32, '150_14':np.int32, 'D109_14': np.float16, 'D110_14': np.float16, 'D127_14': np.float16, 'D150_14': np.float16, 'D508': np.float16, 'D509': np.float16, 'D702': np.float16, 'D853': np.float16}
325 | if use_fields is None:
326 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), dtype=data_type)
327 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), dtype=data_type)
328 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), dtype=data_type)
329 | df = pd.concat([df1, df2, df3])
330 | else:
331 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
332 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
333 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
334 | df = pd.concat([df1, df2, df3])
335 | print('finish reading aliccp.')
336 | # df = df.sample(frac=1) # shuffle
337 | train_idx = int(df.shape[0] * 0.5)
338 | val_idx = int(df.shape[0] * 0.75)
339 | features = []
340 | for f in df.columns:
341 | if f not in ['click','purchase'] and f[:1] != 'D':
342 | features.append(f)
343 |     if '301' in features: df['301'] = df['301'] - 1
344 | unique_values = [df[col].max()+1 for col in features]
345 | label = 'click'
346 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
347 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
348 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
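
Each reader above returns unique_values (per-field vocabulary sizes computed as max + 1) so that downstream modules can flatten per-field categorical IDs into a single shared embedding table via cumulative offsets, as done in models/fs/optfs.py, sfs.py and shark.py. A minimal sketch with made-up vocabulary sizes:

import numpy as np

unique_values = [3, 5, 2]                                  # made-up per-field vocab sizes
offsets = np.array((0, *np.cumsum(unique_values)[:-1]))    # [0, 3, 8]
raw_row = np.array([2, 4, 1])                              # one sample's per-field IDs
flat_ids = raw_row + offsets                               # [2, 7, 9]: rows of one flat embedding table
print(offsets, flat_ids, sum(unique_values))               # the table needs sum(unique_values) = 10 rows
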
--------------------------------------------------------------------------------
/utils/fs_trainer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import tqdm
4 | import os
5 | import nni
6 | import datetime as dt
7 | from utils.utils import EarlyStopper
8 | from sklearn.metrics import roc_auc_score, log_loss
9 |
10 | class modeltrainer():
11 | def __init__(self, args, model, model_name, device, epochs, retrain):
12 | self.args = args
13 | self.model = model
14 | self.optimizers = model.set_optimizer() # dict of optimizers
15 | self.criterion = torch.nn.BCELoss()
16 | self.device = torch.device(device)
17 | self.model.to(self.device)
18 | self.n_epoch = epochs
19 | self.model_path = 'checkpoints/' + model_name + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
20 | self.early_stopper = EarlyStopper(patience=args.patience)
21 | self.retrain = retrain
22 |
23 | def train_one_epoch(self, train_dataloader, val_dataloader, epoch_i, log_interval=10):
24 | self.model.train()
25 | total_loss = 0
26 | val_iter = iter(val_dataloader)
27 | tk0 = tqdm.tqdm(train_dataloader, desc="train", smoothing=0, mininterval=1.0)
28 | for i, (x, y) in enumerate(tk0):
29 | # x_dict = {k: v.to(self.device) for k, v in x_dict.items()} #tensor to GPU
30 | x = x.to(self.device)
31 | y = y.to(self.device)
32 | y_pred = self.model(x, current_epoch=epoch_i, current_step=i)
33 | loss = self.criterion(y_pred, y.float().reshape(-1, 1))
34 | # optfs l1 norm
35 | if self.args.fs == 'optfs' and not self.retrain:
36 | reg_loss = torch.sum(torch.sigmoid(self.model.fs.temp * self.model.fs.mask_weight))
37 | # g = torch.concat([self.model.fs.g[feature] for feature in self.model.fs.features], dim=0)
38 | # l1_loss = torch.norm(g, p=1) * 2e-9
39 | if self.args.dataset == 'avazu':
40 | loss = loss + reg_loss * 4e-9
41 | elif self.args.dataset == 'criteo':
42 | loss = loss + reg_loss * 1e-8
43 | elif self.args.dataset == 'movielens-1m':
44 | loss = loss + reg_loss * 1e-4
45 | elif self.args.dataset == 'aliccp':
46 | loss = loss + reg_loss * 1e-8
47 | else:
48 |                     print('please set the reg_loss hyperparameter for optfs in fs_trainer.py')
49 |
50 | self.model.zero_grad()
51 | # self.optimizer.zero_grad()
52 | loss.backward()
53 | self.optimizers['optimizer_bb'].step()
54 | if self.args.fs == 'optfs' and not self.retrain:
55 | self.optimizers['optimizer_fs'].step()
56 | total_loss += loss.item()
57 | if (i + 1) % log_interval == 0:
58 | tk0.set_postfix(loss=total_loss / log_interval)
59 | total_loss = 0
60 |
61 | # other optimizers
62 | if self.model.fs.optimizer_method == 'darts' and i % self.model.fs.update_frequency == 0:
63 | self.optimizers['optimizer_fs'].zero_grad()
64 | try:
65 | batch = next(val_iter)
66 | except StopIteration:
67 | val_iter = iter(val_dataloader)
68 | batch = next(val_iter)
69 | x_,y_ = batch
70 | x_, y_ = x_.to(self.device), y_.to(self.device)
71 | y_pred_ = self.model(x_, current_epoch=epoch_i, current_step=i)
72 | loss_ = self.criterion(y_pred_, y_.float().reshape(-1, 1))
73 | loss_.backward()
74 | self.optimizers['optimizer_fs'].step()
75 | elif self.args.fs == 'lpfs':
76 | p = self.optimizers['optimizer_fs'].param_groups[0]['params'][0]
77 | self.optimizers['optimizer_fs'].step()
78 | thr = 0.01 * self.args.learning_rate
79 | in1 = p.data > thr
80 | in2 = p.data < -thr
81 | in3 = ~(in1 | in2)
82 | p.data[in1] -= thr
83 | p.data[in2] += thr
84 | p.data[in3] = 0.0
85 |
86 |
87 |
88 | def fit(self, train_dataloader, val_dataloader=None):
89 | all_start_time = dt.datetime.now()
90 | epoch_time_lis = []
91 | for epoch_i in range(self.n_epoch):
92 | print('epoch:', epoch_i)
93 | epoch_start_time = dt.datetime.now()
94 | self.train_one_epoch(train_dataloader, val_dataloader, epoch_i)
95 | epoch_end_time = dt.datetime.now()
96 | epoch_time_lis.append((epoch_end_time - epoch_start_time).total_seconds())
97 | if val_dataloader:
98 | auc = self.evaluate(val_dataloader, epoch_i)
99 | # nni
100 | if self.args.nni:
101 | nni.report_intermediate_result(auc.item())
102 | print('epoch:', epoch_i, 'validation: auc:', auc)
103 | if self.early_stopper.stop_training(auc, self.model.state_dict()):
104 | print(f'validation: best auc: {self.early_stopper.best_auc}')
105 | self.model.load_state_dict(self.early_stopper.best_weights)
106 | break
107 |                 # reset the early stopper during adafs/mvfs pretraining
108 | if self.args.fs in ['adafs','mvfs'] and epoch_i < self.model.fs.pretrain_epoch:
109 | print('reset early stopper due to pretraining')
110 | self.early_stopper.trial_counter = 0
111 | self.early_stopper.best_auc = 0
112 | self.early_stopper.best_weights = None
113 | all_end_time = dt.datetime.now()
114 | print('all training time: {} s'.format((all_end_time - all_start_time).total_seconds()))
115 | print('average epoch time: {} s'.format(sum(epoch_time_lis) / len(epoch_time_lis)))
116 | if not os.path.exists(self.model_path):
117 | os.makedirs(self.model_path)
118 | if self.model.fs.mode != 'retrain':
119 | torch.save(self.model.state_dict(), os.path.join(self.model_path, "model_search.pth")) #save best auc model
120 | # else:
121 | # torch.save(self.model.state_dict(), os.path.join(self.model_path, "model_retrain.pth"))
122 |
123 | def evaluate(self, data_loader, current_epoch):
124 | self.model.eval()
125 | targets, predicts = list(), list()
126 | with torch.no_grad():
127 | tk0 = tqdm.tqdm(data_loader, desc="validation", smoothing=0, mininterval=1.0)
128 | for i, (x, y) in enumerate(tk0):
129 | x = x.to(self.device)
130 | # x_dict = {k: v.to(self.device) for k, v in x_dict.items()}
131 | y = y.to(self.device)
132 |                 y_pred = self.model(x, current_epoch, current_step=i) # pass the epoch so epoch-dependent fs modules (e.g. optfs) behave consistently during validation
133 | targets.extend(y.tolist())
134 | predicts.extend(y_pred.tolist())
135 | return roc_auc_score(targets, predicts)
136 |
137 | def test(self, data_loader, evaluate_fns):
138 | self.model.eval()
139 | targets, predicts = list(), list()
140 | with torch.no_grad():
141 | tk0 = tqdm.tqdm(data_loader, desc="test", smoothing=0, mininterval=1.0)
142 | start_time = dt.datetime.now()
143 | for i, (x, y) in enumerate(tk0):
144 | x = x.to(self.device)
145 | y = y.to(self.device)
146 | y_pred = self.model(x, current_epoch=None, current_step=i)
147 | targets.extend(y.tolist())
148 | predicts.extend(y_pred.tolist())
149 | end_time = dt.datetime.now()
150 | print('infer time: {} s'.format((end_time - start_time).total_seconds()))
151 | for evaluate_fn in evaluate_fns:
152 | if evaluate_fn == 'auc':
153 | auc = roc_auc_score(targets, predicts)
154 | print('test auc:', auc)
155 | elif evaluate_fn == 'logloss':
156 | logloss = log_loss(targets, predicts)
157 | print('test logloss:', logloss)
158 | return auc
159 |
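
The lpfs branch in train_one_epoch above applies a soft-threshold (proximal) step to the gate parameter after the optimizer update: entries within +-thr of zero are set exactly to zero, and the remaining entries are shrunk toward zero by thr. A minimal tensor sketch of the same update, with a synthetic gate and an exaggerated threshold for illustration (the trainer uses thr = 0.01 * learning_rate):

import torch

p = torch.tensor([0.50, 0.004, -0.20, -0.003])    # synthetic gate values
thr = 0.01                                        # exaggerated for illustration
in1, in2 = p > thr, p < -thr
in3 = ~(in1 | in2)
p[in1] -= thr                                     # shrink positive gates toward zero
p[in2] += thr                                     # shrink negative gates toward zero
p[in3] = 0.0                                      # clamp near-zero gates exactly to zero
print(p)                                          # tensor([ 0.4900,  0.0000, -0.1900,  0.0000])
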
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import torch
4 | import os
5 | import copy
6 | import importlib.util
7 | import datetime
8 |
9 | def seed_everything(seed):
10 | random.seed(seed)
11 | np.random.seed(seed)
12 | torch.manual_seed(seed)
13 | torch.cuda.manual_seed_all(seed)
14 |
15 | def get_model(model_name: str, model_type: str):
16 | """
17 | Automatically select model class based on model name
18 |
19 | Args:
20 | model_name (str): model name
21 | model_type (str): rec, fs, es
22 |
23 | Returns:
24 | Recommender: model class
25 | Dict: model configuration dict
26 | """
27 | model_file_name = model_name.lower()
28 | model_module = None
29 | module_path = '.'.join(['models', model_type, model_file_name])
30 | if importlib.util.find_spec(module_path, __name__):
31 | model_module = importlib.import_module(module_path, __name__)
32 | else:
33 | raise ValueError(f'`model_name` [{model_name}] is not the name of an existing model.')
34 | model_class = getattr(model_module, model_name)
35 | # dir = os.path.dirname(model_module.__file__)
36 | # conf = dict()
37 | # fname = os.path.join(os.path.dirname(dir), 'basemodel', 'basemodel.yaml')
38 | # conf.update(parser_yaml(fname))
39 | # for name in ['all', model_file_name]:
40 | # fname = os.path.join(dir, 'config', name+'.yaml')
41 | # if os.path.isfile(fname):
42 | # conf = deep_update(conf, parser_yaml(fname))
43 | return model_class
44 |
45 |
46 | class EarlyStopper(object):
47 |     """Early stops the training if validation AUC doesn't improve after a given patience.
48 |
49 | Args:
50 | patience (int): How long to wait after last time validation auc improved.
51 | """
52 |
53 | def __init__(self, patience):
54 | self.patience = patience
55 | self.trial_counter = 0
56 | self.best_auc = 0
57 | self.best_weights = None
58 |
59 | def stop_training(self, val_auc, weights):
60 | """whether to stop training.
61 |
62 | Args:
63 | val_auc (float): auc score in val data.
64 | weights (tensor): the weights of model
65 | """
66 | if val_auc > self.best_auc:
67 | self.best_auc = val_auc
68 | self.trial_counter = 0
69 | self.best_weights = copy.deepcopy(weights)
70 | return False
71 | elif self.trial_counter + 1 < self.patience:
72 | self.trial_counter += 1
73 | return False
74 | else:
75 | return True
76 |
77 | def machine_learning_selection(args, fs, features, unique_values, data, k):
78 | train_x, train_y, val_x, val_y, test_x, test_y = data
79 | features = np.array(features)
80 | if fs == 'lasso':
81 | from sklearn.linear_model import Lasso
82 | lasso = Lasso(
83 | alpha=args.fs_config[args.fs]['alpha'],
84 | fit_intercept=args.fs_config[args.fs]['fit_intercept'],
85 | copy_X=args.fs_config[args.fs]['copy_X'],
86 | max_iter=args.fs_config[args.fs]['max_iter'],
87 | tol=args.fs_config[args.fs]['tol'],
88 | positive=args.fs_config[args.fs]['positive'],
89 | selection=args.fs_config[args.fs]['selection']
90 | )
91 | lasso.fit(train_x, train_y)
92 | field_importance = abs(lasso.coef_)
93 | rank = field_importance.argsort()[::-1]
94 | ranked_features = features[rank]
95 | ranked_importance = field_importance[rank]
96 | return np.array([ranked_features, ranked_importance])
97 | select_idx = []  # unreachable: the ranked array is already returned above
98 | for i in range(k):
99 | print(features[rank[i]], field_importance[rank[i]])
100 | select_idx.append(rank[i])
101 | return features[select_idx]
102 | elif fs == 'gbdt':
103 | from sklearn.ensemble import GradientBoostingClassifier
104 | gbdt = GradientBoostingClassifier(
105 | learning_rate=args.fs_config[args.fs]['learning_rate'],
106 | n_estimators=args.fs_config[args.fs]['n_estimators'],
107 | subsample=args.fs_config[args.fs]['subsample'],
108 | min_samples_split=args.fs_config[args.fs]['min_samples_split'],
109 | min_samples_leaf=args.fs_config[args.fs]['min_samples_leaf'],
110 | min_weight_fraction_leaf=args.fs_config[args.fs]['min_weight_fraction_leaf'],
111 | max_depth=args.fs_config[args.fs]['max_depth'],
112 | n_iter_no_change=args.fs_config[args.fs]['n_iter_no_change'],
113 | verbose=1
114 | )
115 | gbdt.fit(train_x, train_y)
116 | field_importance = gbdt.feature_importances_
117 | rank = field_importance.argsort()[::-1]
118 | ranked_features = features[rank]
119 | ranked_importance = field_importance[rank]
120 | return np.array([ranked_features, ranked_importance])
121 | select_idx = []  # unreachable: the ranked array is already returned above
122 | for i in range(k):
123 | print(features[rank[i]], field_importance[rank[i]])
124 | select_idx.append(rank[i])
125 | return features[select_idx]
126 | elif fs == 'gbr':
127 | from sklearn.ensemble import GradientBoostingRegressor
128 | gbr = GradientBoostingRegressor()
129 | gbr.fit(train_x, train_y)
130 | field_importance = gbr.feature_importances_
131 | rank = field_importance.argsort()[::-1]
132 | ranked_features = features[rank]
133 | ranked_importance = field_importance[rank]
134 | return np.array([ranked_features, ranked_importance])
135 | select_idx = []  # unreachable: the ranked array is already returned above
136 | for i in range(k):
137 | print(features[rank[i]], field_importance[rank[i]])
138 | select_idx.append(rank[i])
139 | return features[select_idx]
140 | elif fs == 'pca':
141 | from sklearn.decomposition import PCA
142 | pca = PCA(n_components=k)
143 | pca.fit(train_x)
144 | # use the absolute loadings of the first principal component as field importances
145 | field_importance = abs(pca.components_[0])
146 | rank = field_importance.argsort()[::-1]
147 | ranked_features = features[rank]
148 | ranked_importance = field_importance[rank]
149 | return np.array([ranked_features, ranked_importance])
150 | select_idx = []  # unreachable: the ranked array is already returned above
151 | for i in range(k):
152 | print(features[rank[i]], field_importance[rank[i]])
153 | select_idx.append(rank[i])
154 | return features[select_idx]
155 | elif fs == 'permutation':
156 | from sklearn.ensemble import GradientBoostingClassifier
157 | from sklearn.linear_model import Ridge
158 | from sklearn.inspection import permutation_importance
159 | from sklearn.neural_network import MLPClassifier
160 | from sklearn.linear_model import LogisticRegression
161 | from sklearn.ensemble import RandomForestClassifier
162 | model = RandomForestClassifier(n_estimators=10, max_depth=None, n_jobs=6, verbose=1).fit(train_x, train_y)
163 | # model = LogisticRegression(verbose=1,multi_class='ovr',n_jobs=32).fit(train_x, train_y)
164 | # model = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=3, hidden_layer_sizes=(16,16)).fit(train_x, train_y)
165 | field_importance = permutation_importance(model, train_x, train_y, n_jobs=5)
166 | rank = field_importance.importances_mean.argsort()[::-1]
167 | ranked_features = features[rank]
168 | ranked_importance = field_importance.importances_mean[rank]
169 | return np.array([ranked_features, ranked_importance])
170 | elif fs == 'rf':
171 | from sklearn.ensemble import RandomForestClassifier
172 | model = RandomForestClassifier(n_estimators=10, max_depth=None, n_jobs=6, verbose=1).fit(train_x, train_y)
173 | field_importance = model.feature_importances_
174 | rank = field_importance.argsort()[::-1]
175 | ranked_features = features[rank]
176 | ranked_importance = field_importance[rank]
177 | return np.array([ranked_features, ranked_importance])
178 | elif fs == 'xgb':
179 | from xgboost import XGBClassifier
180 | model = XGBClassifier(n_estimators=10, max_depth=None, n_jobs=6, verbosity=1).fit(train_x, train_y)  # XGBClassifier takes 'verbosity' (0-3), not 'verbose'
181 | field_importance = model.feature_importances_
182 | rank = field_importance.argsort()[::-1]
183 | ranked_features = features[rank]
184 | ranked_importance = field_importance[rank]
185 | return np.array([ranked_features, ranked_importance])
186 | else: raise ValueError(f'unknown feature selection method: {fs}')  # guard against unsupported --fs values
187 |
188 | def print_time(message):
189 | print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S '), message)
190 |
191 | def str2bool(v):
192 | if isinstance(v, bool):
193 | return v
194 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
195 | return True
196 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
197 | return False
198 | else:
199 | raise argparse.ArgumentTypeError('Boolean value expected.')
--------------------------------------------------------------------------------
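A minimal usage sketch for the helpers in utils.py, assuming the repository root is on the Python path; the dummy weights, the AUC sequence, and the 'DeepFM' class name are illustrative assumptions, not repository code:

import torch
from utils.utils import seed_everything, get_model, EarlyStopper, str2bool

seed_everything(42)                              # fix all RNG seeds for reproducibility

# ModelClass = get_model('DeepFM', 'rec')        # would import models/rec/deepfm.py and return the class named 'DeepFM'

early_stopper = EarlyStopper(patience=3)
dummy_weights = {'w': torch.zeros(1)}            # stand-in for model.state_dict()
for epoch, val_auc in enumerate([0.70, 0.72, 0.71, 0.71, 0.71]):
    if early_stopper.stop_training(val_auc, dummy_weights):
        print(f'stop at epoch {epoch}, best val auc {early_stopper.best_auc}')
        break

print(str2bool('yes'))                           # True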