├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── aliccp
│   │   └── dataset description.md
│   ├── avazu
│   │   └── dataset description.md
│   ├── criteo
│   │   └── dataset description.md
│   ├── movielens-1m
│   │   └── dataset desciption.md
│   └── preprocess.py
├── fs_run.py
├── models
│   ├── basemodel.py
│   ├── config.yaml
│   ├── fs
│   │   ├── adafs.py
│   │   ├── autofield.py
│   │   ├── gbdt.py
│   │   ├── lasso.py
│   │   ├── lpfs.py
│   │   ├── mvfs.py
│   │   ├── no_selection.py
│   │   ├── optfs.py
│   │   ├── optfs_old.py
│   │   ├── permutation.py
│   │   ├── rf.py
│   │   ├── sfs.py
│   │   ├── shark.py
│   │   └── xgb.py
│   ├── layers.py
│   └── rec
│       ├── dcn.py
│       ├── deepfm.py
│       ├── fibinet.py
│       ├── fm.py
│       ├── mlp.py
│       └── widedeep.py
├── nni
│   └── search_spaces
│       ├── config.json
│       └── fs
│           ├── adafs.json
│           ├── autofield.json
│           ├── gbdt.json
│           ├── lasso.json
│           └── optfs.json
├── nni_tune.py
├── requirements.txt
└── utils
    ├── datasets.py
    ├── fs_trainer.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # ERASE: Benchmarking Feature Selection Methods for Deep Recommender Systems
3 |
4 |
5 |
6 |
7 | In this repo, our scripts are divided into two parts: `dataset preprocess` and `run fs`.
8 |
9 | You can also download the preprocessed dataset from Huggingface [ERASE_Dataset](https://huggingface.co/datasets/Jia-py/ERASE_Dataset)
10 |
11 | Please note that you need to run the following script from the root directory of the project.
12 |
13 | # Package Requirements
14 |
15 | * torch
16 | * pandas
17 | * numpy
18 | * nni
19 |
20 | ## File Structure
21 |
22 | ```
23 | - checkpoints
24 | - checkpoints_for_retrain
25 | - data
26 |   - avazu
27 |     - preprocessed_avazu.csv # your data should be put here
28 |   - criteo
29 |     - preprocessed_criteo.csv # your data should be put here
30 |   - movielens-1m
31 |   - aliccp
32 |   - preprocess.py # preprocess script
33 | - nni
34 |   - search_spaces
35 |     - fs
36 |       - specific-method.json # the hyperparameter search space for each method in fs
37 |     - config.json # some hyperparameters related to general training, e.g., number of selected fields, learning rate
38 | - notebooks # some test notebooks
39 | - utils
40 |   - datasets.py # read datasets
41 |   - fs_trainer.py # trainer for feature selection
42 |   - utils.py # some utility functions
43 | - fs_run.py # main script to run feature selection
44 | - nni_tune.py # run the nni tune
45 | - requirements.txt # python libraries needed for this repository
46 | ```
47 |
48 | ## Dataset Preprocess
49 |
50 | ```bash
51 | python data/preprocess.py --dataset=[avazu/criteo] --data_path=[default is data/]
52 | ```
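
The raw files that `data/preprocess.py` reads are not shipped with the repo; below is a minimal sketch of the expected layout and one concrete invocation (file names are inferred from the script itself, adjust them if yours differ):

```bash
# raw files read by data/preprocess.py (place them here yourself):
#   data/avazu/train.csv, data/avazu/valid.csv, data/avazu/test.csv
#   data/criteo/train.txt
python data/preprocess.py --dataset=criteo --data_path=data/
# writes data/criteo/preprocessed_criteo.csv (preprocessed_avazu.csv for avazu)
```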
53 |
54 | ## Run FS & ES
55 |
56 | ### Parameters in fs_run.py
57 |
58 | * dataset: (avazu/criteo)
59 | * model: backbone model (mlp)
60 | * fs: feature selection method (no_selection/autofield/adafs/optfs/gbdt/lasso/gbr/pca/shark/sfs/lpfs/mvfs)
61 | * seed: random seed (a specific number, or 0 for no fixed seed)
62 | * device: cuda or cpu
63 | * data_path: your data path (default is `data/`)
64 | * batch_size
65 | * dataset_shuffle: (True or False)
66 | * embedding_dim: embedding size (default is 8)
67 | * train_or_search: whether to run the training/search stage (True/False)
68 | * retrain: whether to run the retrain stage (True/False)
69 | * k: number of selected fields (specific number)
70 | * learning_rate
71 | * epoch: training epoch (default 100)
72 | * patience: patience of earlystopper (default 3)
73 | * num_workers: num_workers in dataloader (default 32)
74 | * nni: whether to use nni to tune hyperparameters (default False)
75 | * rank_path: if you only want to retrain, specify the path of the feature rank file (see the retrain-only example below)
76 | * read_feature_rank: whether to use pre-saved feature rank
77 |
78 | ### Feature Selection
79 |
80 | ```bash
81 | python fs_run.py --model=[model_name] --fs=[feature_selection_method] --train_or_search=True --retrain=True
82 | ```
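
If a `feature_rank.npy` has already been saved and you only need the retrain stage, a command along the following lines should work (the `--rank_path` value is only a placeholder for your own folder under `checkpoints_for_retrain/`, and `--k` is however many fields you want to keep):

```bash
python fs_run.py --model=mlp --fs=autofield --train_or_search=False --retrain=True \
    --read_feature_rank=True --rank_path=autofield_no_selection_avazu --k=10
```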
83 |
84 |
85 |
86 | # More experimental results
87 |
88 | 1. Overall experimental results of feature selection for deep recommender systems.
89 |
90 |
91 |
92 | 2. Experimental results on more backbone models with different number of selected features on Avazu.
93 |
94 | 
95 |
96 | 3. Experimental results on more backbone models with different number of selected features on Criteo.
97 |
98 | 
99 |
100 |
101 | # Citation
102 |
103 | If you find our work useful, please consider citing our paper below. Thank you!
104 | ```
105 | @inproceedings{jia2024erase,
106 | title={ERASE: Benchmarking Feature Selection Methods for Deep Recommender Systems},
107 | author={Jia, Pengyue and Wang, Yejing and Du, Zhaocheng and Zhao, Xiangyu and Wang, Yichao and Chen, Bo and Wang, Wanyu and Guo, Huifeng and Tang, Ruiming},
108 | booktitle={Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
109 | pages={5194--5205},
110 | year={2024}
111 | }
112 | ```
113 |
--------------------------------------------------------------------------------
/data/aliccp/dataset description.md:
--------------------------------------------------------------------------------
1 | The dataset is preprocessed by Datawhale; refer to [torch-rechub](https://github.com/datawhalechina/torch-rechub/tree/main/examples/ranking)
--------------------------------------------------------------------------------
/data/avazu/dataset description.md:
--------------------------------------------------------------------------------
1 | ## Dataset Description
2 |
3 | [Click-Through Rate Prediction | Kaggle](https://www.kaggle.com/competitions/avazu-ctr-prediction/data)
4 |
5 | ## File descriptions
6 |
7 | - **train** - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies.
8 | - **test** - Test set. 1 day of ads for testing your model predictions.
9 | - **sampleSubmission.csv** - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.
10 |
11 | ## Data fields
12 |
13 | - id: ad identifier
14 | - click: 0/1 for non-click/click
15 | - hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
16 | - C1 -- anonymized categorical variable
17 | - banner_pos
18 | - site_id
19 | - site_domain
20 | - site_category
21 | - app_id
22 | - app_domain
23 | - app_category
24 | - device_id
25 | - device_ip
26 | - device_model
27 | - device_type
28 | - device_conn_type
29 | - C14-C21 -- anonymized categorical variables
30 |
31 | ## Notes
32 |
33 | Since the test file does not contain labels, we only use the training file.
34 |
35 | ## read dtypes
36 |
37 | ```bash
38 | Memory usage of dataframe is 7402.76 MB
39 | Memory usage after optimization is: 1773.58 MB
40 | Decreased by 76.0%
41 | dtypes: click int8
42 | hour int16
43 | C1 int8
44 | banner_pos int8
45 | site_id int16
46 | site_domain int16
47 | site_category int8
48 | app_id int16
49 | app_domain int16
50 | app_category int8
51 | device_id int32
52 | device_ip int32
53 | device_model int16
54 | device_type int8
55 | device_conn_type int8
56 | C14 int16
57 | C15 int8
58 | C16 int8
59 | C17 int16
60 | C18 int8
61 | C19 int8
62 | C20 int16
63 | C21 int8
64 | dtype: object
65 | preprocess avazu done!
66 | ```
--------------------------------------------------------------------------------
/data/criteo/dataset description.md:
--------------------------------------------------------------------------------
1 | ------ Display Advertising Challenge ------
2 |
3 | Dataset: dac-v1
4 |
5 | This dataset contains feature values and click feedback for millions of display
6 | ads. Its purpose is to benchmark algorithms for clickthrough rate (CTR) prediction.
7 | It has been used for the Display Advertising Challenge hosted by Kaggle:
8 | https://www.kaggle.com/c/criteo-display-ad-challenge/
9 |
10 | ===================================================
11 |
12 | Full description:
13 |
14 | This dataset contains 2 files:
15 | train.txt
16 | test.txt
17 | corresponding to the training and test parts of the data.
18 |
19 | ====================================================
20 |
21 | Dataset construction:
22 |
23 | The training dataset consists of a portion of Criteo's traffic over a period
24 | of 7 days. Each row corresponds to a display ad served by Criteo and the first
25 | column indicates whether this ad has been clicked or not.
26 | The positive (clicked) and negative (non-clicked) examples have both been
27 | subsampled (but at different rates) in order to reduce the dataset size.
28 |
29 | There are 13 features taking integer values (mostly count features) and 26
30 | categorical features. The values of the categorical features have been hashed
31 | onto 32 bits for anonymization purposes.
32 | The semantics of these features are undisclosed. Some features may have missing values.
33 |
34 | The rows are chronologically ordered.
35 |
36 | The test set is computed in the same way as the training set but it
37 | corresponds to events on the day following the training period.
38 | The first column (label) has been removed.
39 |
40 | ====================================================
41 |
42 | Format:
43 |
44 | The columns are tab separated with the following schema:
45 | <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
46 |
47 | When a value is missing, the field is just empty.
48 | There is no label field in the test set.
49 |
50 | ====================================================
51 |
52 | Dataset assembled by Olivier Chapelle (o.chapelle@criteo.com)
53 |
54 | ```
55 | Memory usage of dataframe is 13989.45 MB
56 | Memory usage after optimization is: 3322.49 MB
57 | Decreased by 76.2%
58 | dtypes: 0 int8
59 | 1 int8
60 | 2 int8
61 | 3 int8
62 | 4 int8
63 | 5 int16
64 | 6 int16
65 | 7 int8
66 | 8 int8
67 | 9 int8
68 | 10 int8
69 | 11 int8
70 | 12 int8
71 | 13 int8
72 | 14 int16
73 | 15 int16
74 | 16 int32
75 | 17 int32
76 | 18 int16
77 | 19 int8
78 | 20 int16
79 | 21 int16
80 | 22 int8
81 | 23 int32
82 | 24 int16
83 | 25 int32
84 | 26 int16
85 | 27 int8
86 | 28 int16
87 | 29 int32
88 | 30 int8
89 | 31 int16
90 | 32 int16
91 | 33 int8
92 | 34 int32
93 | 35 int8
94 | 36 int8
95 | 37 int32
96 | 38 int8
97 | 39 int32
98 | dtype: object
99 | 2023-09-06 16:00:58
100 | save to file...
101 | preprocess criteo done!
102 | ```
--------------------------------------------------------------------------------
/data/movielens-1m/dataset desciption.md:
--------------------------------------------------------------------------------
1 | The dataset is preprocessed by Datawhale; refer to [torch-rechub](https://github.com/datawhalechina/torch-rechub/tree/main/examples/matching)
--------------------------------------------------------------------------------
/data/preprocess.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from tqdm import tqdm
4 | from sklearn.preprocessing import LabelEncoder
5 | import sys
6 | sys.path.append('utils/')
7 | from utils import print_time
8 |
9 | def reduce_mem_usage(df):
10 | start_mem = df.memory_usage().sum() / 1024**2
11 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
12 |
13 | for col in df.columns:
14 | col_type = df[col].dtype
15 | if col_type != object:
16 | c_min = df[col].min()
17 | c_max = df[col].max()
18 | if str(col_type)[:3] == 'int':
19 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
20 | df[col] = df[col].astype(np.int8)
21 | elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
22 | df[col] = df[col].astype(np.uint8)
23 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
24 | df[col] = df[col].astype(np.int16)
25 | elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
26 | df[col] = df[col].astype(np.uint16)
27 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
28 | df[col] = df[col].astype(np.int32)
29 | elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
30 | df[col] = df[col].astype(np.uint32)
31 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
32 | df[col] = df[col].astype(np.int64)
33 | elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
34 | df[col] = df[col].astype(np.uint64)
35 | elif str(col_type)[:5] == 'float':
36 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
37 | df[col] = df[col].astype(np.float16)
38 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
39 | df[col] = df[col].astype(np.float32)
40 | else:
41 | df[col] = df[col].astype(np.float64)
42 |
43 | end_mem = df.memory_usage().sum() / 1024**2
44 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
45 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
46 | print('dtypes: ', df.dtypes)
47 | return df
48 |
49 | def preprocess_avazu(data_path, feature_value_filter=False, threshold=4):
50 | print('start reading file...')
51 | df_train = pd.read_csv(data_path + 'train.csv')
52 | df_val = pd.read_csv(data_path + 'valid.csv')
53 | df_test = pd.read_csv(data_path + 'test.csv')
54 | df = pd.concat([df_train, df_val, df_test])
55 | del df_train, df_val, df_test
56 | print('finish reading file...')
57 | df.drop(columns=['id'], inplace=True)
58 | # transform hour to hour
59 | # df['hour:token'] = pd.to_datetime(df['timestamp:float'], format='%y%m%d%H')
60 | # df['hour:token'] = df['hour:token'].dt.hour
61 | # df.drop(['timestamp:float'], axis=1, inplace=True)
62 | sparse_features = [f for f in df.columns]
63 | # df = df.fillna('-1')
64 |
65 | if feature_value_filter:
66 | print('start replace values')
67 | tqdm.pandas(desc='pandas bar')
68 | def replace_values(series):
69 | counts = series.value_counts()
70 | return series.apply(lambda x: -99 if counts[x] < threshold else x)
71 |         df = df.progress_apply(replace_values)  # use the tqdm progress_apply registered above (parallel_apply would require pandarallel)
72 | print('finish replace values')
73 | df = df.astype(str)
74 |
75 | tk0 = tqdm(sparse_features, desc='LabelEncoder')
76 | for feat in tk0:
77 | lbe = LabelEncoder()
78 | df[feat] = lbe.fit_transform(df[feat])
79 | df = df.infer_objects()
80 | df = reduce_mem_usage(df)
81 | df.to_csv(data_path + 'preprocessed_avazu.csv', index=False)
82 |
83 | def preprocess_criteo(data_path, feature_value_filter=False, threshold=4):
84 | print_time('start reading file...')
85 | # df = pd.read_csv(data_path + 'criteo.inter', sep='\t')
86 | df = pd.read_csv(data_path + 'train.txt', sep='\t', header=None)
87 | print(df)
88 | print_time('finish reading file...')
89 | '''
90 | Index([ (label)0,
91 | (float)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
92 | (object)14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
93 | 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
94 | dtype='int64')
95 | '''
96 | df.columns= [str(x) for x in list(range(40))]
97 | dense_features = [f for f in df.columns.tolist() if (df[f].dtype in ['int64', 'float64'] and f != '0')]
98 | sparse_features = [f for f in df.columns.tolist() if df[f].dtype in ['object']]
99 |
100 | print_time('fill nan...')
101 | df[sparse_features] = df[sparse_features].fillna('-999')
102 | df[dense_features] = df[dense_features].fillna(-999)
103 |
104 | print_time('convert float features...')
105 | import math
106 | for feat in dense_features:
107 | df[feat] = df[feat].apply(lambda x:str(int(math.log(x) ** 2)) if x > 2 else str(int(x)-2))
108 | all_features = [f for f in df.columns]
109 |
110 | # df = df.astype(str)
111 | print_time('label encoding...')
112 | tk0 = tqdm(all_features, desc='LabelEncoder')
113 | for feat in tk0:
114 | lbe = LabelEncoder()
115 | df[feat] = lbe.fit_transform(df[feat])
116 | df = df.infer_objects()
117 |     # set display.max_rows so the full dtype listing is printed
118 | pd.set_option('display.max_rows', None)
119 | df = reduce_mem_usage(df)
120 |
121 | print_time('save to file...')
122 | df.to_csv(data_path + 'preprocessed_criteo.csv', index=False)
123 |
124 |
125 |
126 | if __name__ == '__main__':
127 | import argparse
128 | parser = argparse.ArgumentParser()
129 | parser.add_argument('--dataset', type=str, default='avazu', help='avazu, criteo')
130 | parser.add_argument('--data_path', type=str, default='data/', help='data path')
131 |
132 | args = parser.parse_args()
133 |
134 | if args.dataset == 'avazu':
135 | preprocess_avazu(args.data_path + args.dataset + '/')
136 | print('preprocess avazu done!')
137 | elif args.dataset == 'criteo':
138 | preprocess_criteo(args.data_path + args.dataset + '/')
139 | print('preprocess criteo done!')
--------------------------------------------------------------------------------
/fs_run.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas as pd
3 | import numpy as np
4 | import torch
5 | import os
6 | import argparse
7 | import yaml
8 | import nni
9 | import time
10 | import datetime as dt
11 | from tqdm import tqdm
12 | import utils.utils as utils
13 | from utils.fs_trainer import modeltrainer
14 | from utils.datasets import read_dataset
15 | from models.basemodel import BaseModel
16 |
17 |
18 | def main(args):
19 |     if args.seed != 0:
20 |         utils.seed_everything(args.seed)
21 | 
22 |     if args.train_or_search:
23 |         utils.print_time('start train or search...')
24 | 
25 |         if args.fs in ['gbdt', 'lasso', 'permutation','rf','xgb']: # machine learning feature selection
26 |             features, unique_values, data = read_dataset(args.dataset, args.data_path, args.batch_size, args.dataset_shuffle, num_workers=args.num_workers, machine_learning_method = True)
27 |             ml_start_time = dt.datetime.now()
28 |             feature_rank = utils.machine_learning_selection(args, args.fs, features, unique_values, data, args.k)
29 |             ml_end_time = dt.datetime.now()
30 |             print('machine learning feature selection time: {} s'.format((ml_end_time - ml_start_time).total_seconds()))
31 |             model_path = 'checkpoints/' + args.model + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
32 |             utils.print_time(model_path)
33 |             utils.print_time('feature rank:')
34 |             utils.print_time(feature_rank)
35 |             if not os.path.exists(model_path):
36 |                 os.makedirs(model_path)
37 |             np.save(model_path + 'feature_rank.npy', feature_rank)
38 | 
39 |         else:
40 |             features, label, train_dataloader, val_dataloader, test_dataloader, unique_values = read_dataset(args.dataset, args.data_path, batch_size=args.batch_size, shuffle=args.dataset_shuffle, num_workers=args.num_workers, machine_learning_method=False)
41 | 
42 |             print(features)
43 |             print(unique_values)
44 | 
45 |             model = BaseModel(args, args.model, args.fs, args.es, unique_values, features)
46 |             model.fs.mode = 'train'
47 |             trainer = modeltrainer(args, model, args.model, args.device, epochs=args.epoch, retrain=False)
48 |             trainer.fit(train_dataloader, val_dataloader)
49 |             auc = trainer.test(test_dataloader, ['auc', 'logloss'])
50 |             if args.retrain is False and args.nni:
51 |                 nni.report_final_result(auc)
52 |             # print selected features
53 |             # if the feature selection module provides save_selection, save the selected feature ranking
54 |             if hasattr(model.fs, 'save_selection'):
55 |                 model_path = 'checkpoints/' + args.model + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
56 |                 res = model.fs.save_selection(k=args.k)
57 |                 if isinstance(res, np.ndarray):
58 |                     feature_rank = res
59 |                 else:
60 |                     feature_rank = res(val_dataloader,model,torch.device(args.device))
61 |                 utils.print_time('feature rank:')
62 |                 utils.print_time(feature_rank)
63 |                 np.save(model_path + 'feature_rank.npy', feature_rank)
64 |     # if retrain, retrain
65 |     if args.retrain:
66 |         utils.print_time('start retrain...')
67 |         # if train_or_search is skipped, you must place the feature_rank.npy in the following path manually
68 |         if not args.train_or_search:
69 |             model_path = 'checkpoints_for_retrain/' + args.rank_path + '/'
70 |             if not os.path.exists(model_path):
71 |                 raise FileNotFoundError('Only retraining was chosen; please make sure the files generated during searching are placed under checkpoints_for_retrain/<fs>_<es>_<dataset>/')
72 |         else:
73 |             model_path = 'checkpoints/' + args.model + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
74 |         # read selection results
75 |         if args.read_feature_rank: # need to read selected features
76 |             feature_rank = np.load(model_path + 'feature_rank.npy')
77 |             selected_features = feature_rank[0][:args.k]
78 |             utils.print_time('feature rank: {}'.format(feature_rank))
79 |             utils.print_time('selected features: {}'.format(selected_features))
80 |             features, label, train_dataloader, val_dataloader, test_dataloader, unique_values = read_dataset(args.dataset, args.data_path, batch_size=args.batch_size, shuffle=args.dataset_shuffle, num_workers=args.num_workers, use_fields=selected_features, machine_learning_method=False)
81 |         else:
82 |             features, label, train_dataloader, val_dataloader, test_dataloader, unique_values = read_dataset(args.dataset, args.data_path, batch_size=args.batch_size, shuffle=args.dataset_shuffle, num_workers=args.num_workers, use_fields=None, machine_learning_method=False)
83 | 
84 |         model = BaseModel(args, args.model, args.fs, args.es, unique_values, features)
85 |         if model.fs.load_checkpoint:
86 |             model.load_state_dict(torch.load(model_path + 'model_search.pth'))
87 |         # if args.fs == 'optfs':
88 |         #     tmp_model = torch.load(model_path + 'model_search.pth')
89 |         #     # param_dict = {k:v for k, v in tmp_model.items() if 'fs.gate' in k or 'embedding.weight' in k}
90 |         #     param_dict = {k:v for k, v in tmp_model.items() if 'fs.mask_weight' in k}
91 |         #     model_dict = model.state_dict()
92 |         #     model_dict.update(param_dict)
93 |         #     model.load_state_dict(model_dict)
94 |         if hasattr(model.fs, 'before_retrain'):
95 |             model.fs.before_retrain()
96 |         model.fs.mode = 'retrain'
97 | 
98 |         trainer = modeltrainer(args, model, args.model, args.device, epochs=args.epoch, retrain=True)
99 |         trainer.fit(train_dataloader, val_dataloader)
100 |         auc = trainer.test(test_dataloader, ['auc', 'logloss'])
101 |         if args.nni:
102 |             nni.report_final_result(auc)
103 |
104 |
105 | if __name__ == '__main__':
106 |
107 |     parser = argparse.ArgumentParser()
108 |     parser.add_argument('--dataset', type=str, default='avazu', help='avazu, criteo, movielens-1m, aliccp')
109 |     parser.add_argument('--model', type=str, default='mlp', help='mlp, ...')
110 |     parser.add_argument('--fs', type=str, default='no_selection', help='feature selection methods: no_selection, autofield, adafs, optfs, gbdt, lasso, gbr, pca, shark, sfs, lpfs, mvfs')
111 |     parser.add_argument('--es', type=str, default='no_selection', help='embedding search methods: no_selection, ...')
112 |     parser.add_argument('--seed', type=int, default=0, help='random seed, 0 represents not setting the random seed')
113 |     parser.add_argument('--device',type=str, default='cuda' if torch.cuda.is_available() else 'cpu', help='cpu, cuda')
114 |     parser.add_argument('--data_path', type=str, default='data/', help='data path') # ~/autodl-tmp/ or data/
115 |     parser.add_argument('--batch_size', type=int, default=4096, help='batch size')
116 |     parser.add_argument('--dataset_shuffle', type=bool, default=True, help='whether to shuffle the dataset')
117 |     parser.add_argument('--embedding_dim', type=int, default=8, help='embedding dimension')
118 |     parser.add_argument('--train_or_search', type=utils.str2bool, default=True, help='whether to train or search')
119 |     parser.add_argument('--retrain', type=utils.str2bool, default=True, help='whether to retrain')
120 |     parser.add_argument('--k', type=int, default=0, help='top k features')
121 |     parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate')
122 |     parser.add_argument('--epoch', type=int, default=1, help='epoch')
123 |     parser.add_argument('--patience', type=int, default=1, help='early stopping patience')
124 |     parser.add_argument('--num_workers', type=int, default=32, help='num_workers')
125 |     parser.add_argument('--nni', type=bool, default=False, help='whether to use nni')
126 |     parser.add_argument('--rank_path', type=str, default='None', help='if only retrain, no train, please specify the path of feature_rank file. e.g., autofield_no_selection_avazu')
127 |     parser.add_argument('--read_feature_rank', type=utils.str2bool, default=True, help='whether to use pre-saved feature rank')
128 | 
129 |     args = parser.parse_args()
130 | 
131 |     # k
132 |     if args.k == 0:
133 |         if args.dataset == 'avazu':
134 |             args.k = 22
135 |         elif args.dataset == 'criteo':
136 |             args.k = 39
137 | 
138 |     with open('models/config.yaml', 'r') as file:
139 |         data = yaml.safe_load(file)
140 |     args.__dict__.update(data)
141 | 
142 |     # read tune parameters from nni
143 |     if args.nni:
144 |         tuner_params = nni.get_next_parameter()
145 |         for key in tuner_params:
146 |             if key[:2] == 'fs':
147 |                 args.fs_config[args.fs][key[3:]] = tuner_params[key]
148 |             elif key[:2] == 'es':
149 |                 args.es_config[args.es][key[3:]] = tuner_params[key]
150 |             else:
151 |                 args.__dict__[key] = tuner_params[key]
152 | 
153 |     # print args
154 |     for key in args.__dict__:
155 |         if key not in ['fs_config', 'es_config', 'rec_config']:
156 |             print(key, ':', args.__dict__[key])
157 |         else:
158 |             print(key, ':')
159 |             for key2 in args.__dict__[key]:
160 |                 if key2 in [args.model, args.fs, args.es]:
161 |                     print('\t', key2, ':', args.__dict__[key][key2])
162 | 
163 |     args.timestr = str(time.time())
164 | 
165 |     main(args)
--------------------------------------------------------------------------------
/models/basemodel.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | import torch
3 | from torch import Tensor
4 | import torch.nn as nn
5 | import numpy as np
6 | import torch.nn.functional as F
7 | from torch.nn.modules.module import Module
8 | from utils.utils import get_model
9 |
10 | class BaseModel(nn.Module):
11 | def __init__(self, args, backbone_model_name, fs, es, unique_values, features):
12 | super(BaseModel, self).__init__()
13 | # embedding table
14 | self.embedding = nn.Embedding(sum(unique_values), embedding_dim = args.embedding_dim)
15 | torch.nn.init.normal_(self.embedding.weight.data, mean=0, std=0.01)
16 |         self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))  # per-field offsets so every field indexes one shared embedding table
17 |
18 | self.input_dims = args.embedding_dim * len(unique_values)
19 |
20 | self.bb = get_model(backbone_model_name, 'rec')(args, self.input_dims) # backbone model name
21 | self.fs = get_model(fs, 'fs')(args, unique_values, features) # feature selection method
22 | self.es = get_model(es, 'es')() # embedding search method
23 | self.args = args
24 |
25 | def forward(self, x, current_epoch, current_step):
26 | raw_x = x.clone().detach()
27 | x = self.embedding(x + x.new_tensor(self.offsets))
28 | x = self.es(x)
29 | x = self.fs(x, current_epoch, current_step, raw_data = raw_x)
30 | x = self.bb(x)
31 | return x
32 |
33 | def set_optimizer(self):
34 | optimizer_bb = torch.optim.Adam([params for name,params in self.named_parameters() if ('fs' not in name and 'es' not in name) or 'bb' in name], lr = self.args.learning_rate)
35 |
36 | if [params for name,params in self.named_parameters() if 'fs' in name] != []:
37 | optimizer_fs = torch.optim.Adam([params for name,params in self.named_parameters() if 'fs' in name and 'bb' not in name], lr = self.args.learning_rate)
38 | else:
39 | optimizer_fs = None
40 |
41 | if [params for name,params in self.named_parameters() if 'es' in name] != []:
42 | optimizer_es = torch.optim.Adam([params for name,params in self.named_parameters() if 'es' in name and 'bb' not in name], lr = self.args.learning_rate)
43 | else:
44 | optimizer_es = None
45 | return {'optimizer_bb': optimizer_bb, 'optimizer_fs': optimizer_fs, 'optimizer_es': optimizer_es}
46 |
--------------------------------------------------------------------------------
/models/config.yaml:
--------------------------------------------------------------------------------
1 | fs_config:
2 | adafs:
3 | pretrain_epoch: 0
4 | hidden_size: 16
5 | dropout: 0.2
6 | update_frequency: 4
7 | autofield:
8 | update_frequency: 10
9 | gbdt:
10 | learning_rate: 0.1
11 | n_estimators: 100
12 | subsample: 1.0
13 | min_samples_split: 2
14 | min_samples_leaf: 1
15 | min_weight_fraction_leaf: 0.0
16 | max_depth: 3
17 | n_iter_no_change: 3
18 | gbr:
19 | lasso:
20 | alpha: 0.001
21 | fit_intercept: true
22 | copy_X: True
23 | max_iter: 1000
24 | tol: 0.0001
25 | positive: false
26 | selection: cyclic
27 | lpfs:
28 | no_selection:
29 | optfs:
30 | gamma: 5000
31 | pretrain_epoch: 5
32 | pca:
33 | shark:
34 | sfs:
35 | num_batch_sampling: 100
36 | rf:
37 | xgb:
38 | mvfs:
39 | pretrain_epoch: 0
40 | sub_network_num: 6
41 | dropout: 0.2
42 | l: 0.2
43 | es_config:
44 | no_selection:
45 | rec_config:
46 | mlp:
--------------------------------------------------------------------------------
/models/fs/adafs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from models.layers import MLP
4 |
5 | class adafs(nn.Module):
6 |
7 | def __init__(self, args, unique_values, features):
8 | super(adafs, self).__init__()
9 |
10 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
11 | self.feature_num = len(unique_values)
12 |         self.batchnorm_bb = nn.BatchNorm1d(args.embedding_dim) # the _bb suffix puts this parameter into the backbone (bb) optimizer
13 | self.hidden_size = args.fs_config[args.fs]['hidden_size']
14 | self.dropout = args.fs_config[args.fs]['dropout']
15 | self.mlp = MLP(self.feature_num * args.embedding_dim, False, [self.hidden_size, self.feature_num], self.dropout)
16 | self.mlp_bb = MLP(self.feature_num * args.embedding_dim, False, [self.hidden_size, self.feature_num], self.dropout)
17 | self.mode = 'train'
18 | if self.mode == 'retrain':
19 | raise Exception('adafs should not be used in retrain mode')
20 | self.optimizer_method = 'normal'
21 | self.update_frequency = args.fs_config[args.fs]['update_frequency']
22 |
23 | self.load_checkpoint = False
24 |
25 | def forward(self, x, current_epoch, current_step, raw_data):
26 | b,f,e = x.shape
27 | if self.optimizer_method == 'darts':
28 | if current_epoch is not None and current_epoch < self.pretrain_epoch: # current_epoch not None (in training or validation) and current_epoch <= self.pretrain_epoch
29 | x = x.transpose(1,2)
30 | x = self.batchnorm_bb(x)
31 | return x.transpose(1,2)
32 | else:
33 | x = x.transpose(1,2)
34 | x = self.batchnorm_bb(x)
35 | weight = self.mlp(x.reshape(b, -1))
36 | weight = torch.softmax(weight, dim=-1)
37 | x = torch.mul(x, weight.unsqueeze(1))
38 | return x.transpose(1,2)
39 | elif self.optimizer_method == 'normal':
40 | x = x.transpose(1,2)
41 | x = self.batchnorm_bb(x)
42 | weight = self.mlp_bb(x.reshape(b, -1))
43 | weight = torch.softmax(weight, dim=-1)
44 | x = torch.mul(x, weight.unsqueeze(1))
45 | return x.transpose(1,2)
--------------------------------------------------------------------------------
/models/fs/autofield.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 | class autofield(nn.Module):
6 |
7 | def __init__(self, args, unique_values, features):
8 | super(autofield, self).__init__()
9 |
10 | self.feature_num = len(unique_values)
11 | self.device = args.device
12 | self.args = args
13 | self.features = np.array(features)
14 |
15 | self.gate = {features[field_idx]: torch.Tensor(np.ones([1,2])*0.5).to(self.device) for field_idx in range(self.feature_num)}
16 | self.gate = {features[field_idx]: nn.Parameter(self.gate[features[field_idx]], requires_grad=True) for field_idx in range(self.feature_num)}
17 | self.gate = nn.ParameterDict(self.gate)
18 | self.tau = 1.0
19 |
20 | self.mode = 'train'
21 | self.optimizer_method = 'darts'
22 | self.update_frequency = args.fs_config[args.fs]['update_frequency']
23 | self.load_checkpoint = False
24 |
25 | def forward(self, x, current_epoch, current_step, raw_data):
26 | b,f,e = x.shape
27 | if self.mode == 'retrain':
28 | return x
29 | elif self.mode == 'train':
30 | if self.tau > 0.01:
31 | self.tau -= 0.00005
32 | gate_ = torch.ones([1,f,1]).to(self.device)
33 | for field_idx in range(self.feature_num):
34 | gate_[:,field_idx,:] = torch.nn.functional.gumbel_softmax(self.gate[self.features[field_idx]], tau=self.tau, hard=False, dim=-1)[:,-1].reshape(1,1,1)
35 | x = x * gate_
36 | return x
37 |
38 | def save_selection(self, k):
39 | selected_idx = []
40 | gate = torch.concat([self.gate[self.features[field_idx]] for field_idx in range(self.feature_num)], dim=0)[:,-1]
41 | indices = torch.argsort(gate, descending=True)
42 | ranked_importance = gate[indices].detach().cpu().numpy()
43 | ranked_features = [self.features[i] for i in indices]
44 | return np.array([ranked_features, ranked_importance])
45 | # for i in indices:
46 | # selected_idx.append(i.item())
47 | # if len(selected_idx) == k:
48 | # break
49 | # return self.features[selected_idx]
50 |
51 |
52 |
--------------------------------------------------------------------------------
/models/fs/gbdt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class gbdt(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(gbdt, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/lasso.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class lasso(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(lasso, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/lpfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 |
5 | class lpfs(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(lpfs, self).__init__()
8 |
9 | self.feature_num = len(features)
10 | self.features = features
11 | self.x = nn.Parameter(torch.ones(self.feature_num, 1).to(args.device))
12 | self.epochs = args.epoch
13 | self.epsilon_update_frequency = 100
14 | self.device = args.device
15 | self.epsilon = 0.1
16 |
17 | self.load_checkpoint = False
18 | self.optimizer_method = 'normal'
19 |
20 | def forward(self, x, current_epoch, current_step, raw_data):
21 | b,f,e = x.shape
22 | if current_step % self.epsilon_update_frequency == 0:
23 | self.epsilon = self.epsilon * 0.9978
24 | g = self.lpfs_pp(self.x, self.epsilon).reshape(1, f, 1)
25 | x_ = torch.zeros_like(x)
26 | x_ = x * g
27 | return x_
28 |
29 | def lpfs_pp(self, x, epsilon, alpha=10, tau=2, init_val=1.0):
30 | g1 = x*x/(x*x+epsilon)
31 | g2 = alpha * epsilon ** (1.0/tau)*torch.atan(x)
32 | g = torch.where(x>0, g2+g1, g2-g1)/init_val
33 | return g
34 |
35 | def save_selection(self, k):
36 | # gate = torch.concat([self.gate[self.features[field_idx]] for field_idx in range(self.feature_num)], dim=0)[:,-1]
37 | gate = self.x.reshape(self.feature_num)
38 | indices = torch.argsort(gate, descending=True)
39 | ranked_importance = gate[indices].detach().cpu().numpy()
40 | ranked_features = [self.features[i] for i in indices]
41 | print(ranked_features)
42 | print(ranked_importance)
43 | return np.array([ranked_features, ranked_importance])
44 |
45 |
--------------------------------------------------------------------------------
/models/fs/mvfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from models.layers import MLP
4 |
5 | class mvfs(nn.Module):
6 |
7 | def __init__(self, args, unique_values, features):
8 | super(mvfs, self).__init__()
9 |
10 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
11 | self.feature_num = len(unique_values)
12 | self.sub_network_num = args.fs_config[args.fs]['sub_network_num']
13 | self.dropout = args.fs_config[args.fs]['dropout']
14 | self.l = args.fs_config[args.fs]['l']
15 | self.sub_network_list_bb = torch.nn.ModuleList()
16 | for i in range(self.sub_network_num):
17 | self.sub_network_list_bb.append(nn.Linear(self.feature_num*args.embedding_dim, self.feature_num))
18 | self.W_g_bb = nn.Parameter(torch.Tensor(self.sub_network_num, self.sub_network_num * self.feature_num))
19 | self.b_g_bb = nn.Parameter(torch.Tensor(self.sub_network_num))
20 |
21 | self.W = nn.Parameter(torch.Tensor(self.sub_network_num))
22 |
23 | self.mode = 'train'
24 | if self.mode == 'retrain':
25 |             raise Exception('mvfs should not be used in retrain mode')
26 | self.optimizer_method = 'normal'
27 |
28 | self.load_checkpoint = False
29 | self.t = 1
30 |
31 | def forward(self, x, current_epoch, current_step, raw_data):
32 | b,f,e = x.shape
33 | if current_epoch is not None and current_epoch < self.pretrain_epoch:
34 | return x
35 | else:
36 | self.t += 0.001
37 | C = []
38 | for i in range(self.sub_network_num):
39 | C.append(torch.softmax(self.sub_network_list_bb[i](x.reshape(b, -1)), dim=1))
40 | r = self.W_g_bb.unsqueeze(0) @ torch.cat(C, dim=1).reshape(b,-1,1) + self.b_g_bb.reshape(1,-1,1)
41 | r = torch.softmax(r, dim=1) # b, K, 1
42 | I = torch.mul(r, torch.stack(C, dim=1))
43 | I = I.sum(dim=1) # b, f
44 | # 𝑠𝑛 = 0.5 ∗ (1 + tanh(𝜏 · (𝐼𝑛 − 𝑙)))
45 | if self.t < 5:
46 | s = 0.5 * (1 + torch.tanh(5 * (I - self.l)))
47 | else:
48 | s = 0.5 * (1 + torch.tanh(self.t * (I-self.l)))
49 | x = x * s.unsqueeze(2)
50 | return x
51 |
52 |
--------------------------------------------------------------------------------
/models/fs/no_selection.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class no_selection(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(no_selection, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/optfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 |
6 | class optfs(nn.Module):
7 |
8 | def __init__(self, args, unique_values, features):
9 | super().__init__()
10 |
11 | self.mask_weight = nn.Parameter(torch.Tensor(np.sum(unique_values), 1))
12 | nn.init.constant_(self.mask_weight, 0.5)
13 | self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))
14 |
15 | self.mode = 'train'
16 | self.device = args.device
17 | self.features = features
18 |
19 | self.gamma = args.fs_config[args.fs]['gamma']
20 | if args.dataset == 'avazu':
21 | self.gamma = 5000
22 | elif args.dataset == 'criteo':
23 | self.gamma = 2000
24 | else:
25 | self.gamma = 2000
26 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
27 | self.load_checkpoint = True
28 | self.optimizer_method = 'normal'
29 |
30 | self.temp_increase = self.gamma ** (1./ (self.pretrain_epoch-1))
31 | self.temp = 1
32 | self.current_epoch = -1
33 |
34 | def sigmoid(self, x):
35 | return float(1./(1.+np.exp(-x)))
36 |
37 | def compute_mask(self, raw_data, temp, ticket):
38 | scaling = 1./ self.sigmoid(0.5)
39 | mask_weight = F.embedding(raw_data + raw_data.new_tensor(self.offsets), self.mask_weight)
40 | if ticket:
41 | mask = (mask_weight > 0).float()
42 | else:
43 | mask = torch.sigmoid(temp * mask_weight)
44 | return scaling * mask
45 |
46 | def forward(self, x, current_epoch, current_step, raw_data):
47 | b,f,e = x.shape
48 | if current_epoch != self.current_epoch:
49 | self.temp *= self.temp_increase
50 | self.current_epoch = current_epoch
51 | if self.mode == 'retrain':
52 | ticket = True
53 | else:
54 | ticket = False
55 | mask = self.compute_mask(raw_data, self.temp, ticket)
56 |
57 | return x * mask
58 |
59 | def before_retrain(self):
60 | # print remain
61 | ratio = float((self.mask_weight > 0).sum()) / self.mask_weight.numel()
62 | print('remain: ', ratio)
63 |
--------------------------------------------------------------------------------
/models/fs/optfs_old.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class optfs(nn.Module):
5 |
6 | def __init__(self, args, unique_values, features):
7 | super().__init__()
8 |
9 | self.gate = {features[field_idx]: torch.Tensor(unique_values[field_idx], 1).to(args.device) for field_idx in range(len(features))}
10 | for feature in features:
11 | torch.nn.init.xavier_uniform_(self.gate[feature].data)
12 |
13 | self.raw_gate = {features[field_idx]: self.gate[features[field_idx]].clone().detach().to(args.device) for field_idx in range(len(features))}
14 | self.raw_gc = torch.concat([self.raw_gate[feature] for feature in features], dim=0)
15 |
16 | self.g = {feature: torch.ones_like(self.gate[feature]).to(args.device) for feature in features}
17 | self.gate = {feature: nn.Parameter(self.gate[feature], requires_grad=True) for feature in features}
18 | self.gate = torch.nn.ParameterDict(self.gate)
19 |
20 | self.mode = 'train'
21 | self.device = args.device
22 | self.features = features
23 |
24 | self.gamma = args.fs_config[args.fs]['gamma']
25 | self.pretrain_epoch = args.fs_config[args.fs]['pretrain_epoch']
26 | self.load_checkpoint = True
27 | self.optimizer_method = 'normal'
28 |
29 | def forward(self, x, current_epoch, current_step, raw_data):
30 | b,f,e = x.shape
31 | gc = torch.concat([self.gate[feature] for feature in self.features], dim=0)
32 |         if current_epoch is not None: # i.e., in training or validation
33 | t = self.gamma ** (current_epoch / self.pretrain_epoch)
34 |         else: # current_epoch is None, i.e., in test or retrain
35 | t = self.gamma
36 | if self.mode == 'train':
37 | self.g_tmp = torch.sigmoid(gc * t) / torch.sigmoid(self.raw_gc)
38 |             # slice g_tmp back into the per-field gates g
39 | for feature in self.features:
40 | self.g[feature] = self.g_tmp[:len(self.gate[feature])]
41 | self.g_tmp = self.g_tmp[len(self.gate[feature]):]
42 | x_ = torch.zeros_like(x).to(self.device)
43 | for j in range(f):
44 | feature = self.features[j]
45 | x_[:,j,:] = x[:,j,:] * self.g[feature][raw_data[:,j]]
46 | elif self.mode == 'retrain':
47 | # self.g_tmp = torch.concat([self.g[feature] for feature in self.features], dim=0)
48 | x_ = torch.zeros_like(x).to(self.device)
49 | for j in range(f):
50 | feature = self.features[j]
51 | x_[:,j,:] = x[:,j,:] * self.g[feature][raw_data[:,j]]
52 |
53 |
54 |
55 | # for i in range(b):
56 | # for j in range(f):
57 | # feature = self.features[j]
58 | # x_[i,j,:] = x[i,j,:] * self.g[feature][raw_data[i,j]]
59 | # for j in range(f):
60 | # feature = self.features[j]
61 | # x_[:,j,:] = x[:,j,:] * self.g[feature][raw_data[:,j]]
62 |
63 | return x_
64 |
65 | def before_retrain(self):
66 |         # binarize the gates: entries of self.gate <= 0 become 0, the rest become 1
67 | self.gate.requires_grad_(False)
68 | for feature in self.features:
69 | self.gate[feature][self.gate[feature] <= 0] = 0
70 | self.gate[feature][self.gate[feature] > 0] = 1
71 | print('feature:', feature, 'keep ratio:', torch.sum(self.gate[feature])/self.gate[feature].shape[0])
72 | self.g = {feature: nn.Parameter(self.gate[feature].clone().detach().to(self.device)) for feature in self.features}
73 | self.g = torch.nn.ParameterDict(self.g)
74 | self.g.requires_grad_(False)
75 |
--------------------------------------------------------------------------------
/models/fs/permutation.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class permutation(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(permutation, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/rf.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class rf(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(rf, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/fs/sfs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import tqdm
5 |
6 |
7 | class sfs(nn.Module):
8 |
9 | def __init__(self, args, unique_values, features):
10 | super(sfs, self).__init__()
11 | self.load_checkpoint = False
12 | self.optimizer_method = 'normal'
13 |
14 | self.feature_num = len(unique_values)
15 | self.device = args.device
16 | self.args = args
17 | self.criterion = torch.nn.BCELoss()
18 | self.features = np.array(features)
19 |
20 | opt = args.fs_config[args.fs]
21 | #self.cr = opt['cr']
22 | self.num_batch_sampling = opt['num_batch_sampling']
23 |
24 | self.mode = 'train'
25 | self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))
26 | print(self.offsets)
27 | print(self.feature_num)
28 | self.mask = nn.Parameter(torch.ones([self.feature_num,1]))
29 | self.mask.requires_grad = False
30 |
31 | def forward(self, x, current_epoch, current_step, raw_data):
32 | return x*self.mask
33 |
34 | def save_selection(self, k):
35 | def prun(dataloader,model,device):
36 | model.fs.mask.requires_grad = True
37 | for i, (c_data, labels) in enumerate(dataloader):
38 | if i == model.fs.num_batch_sampling:
39 | break
40 | c_data, labels = c_data.to(device), labels.to(device)
41 | out = model(c_data,0,i)
42 | loss =self.criterion(out, labels.float().unsqueeze(-1))
43 | model.zero_grad()
44 | loss.backward()
45 | grads = torch.abs(model.fs.mask.grad)
46 | if i == 0:
47 | moving_average_grad = grads
48 | else:
49 | moving_average_grad = ((moving_average_grad * i) + grads) / (i + 1)
50 | grads = torch.flatten(moving_average_grad)
51 | importance = grads / grads.sum()
52 | feature_rank = torch.argsort(importance, descending=True)
53 | ranked_importance = importance[feature_rank].detach().cpu().numpy()
54 | ranked_features = [self.features[i] for i in feature_rank]
55 | return np.array([ranked_features, ranked_importance])
56 | return prun
57 |
58 |
59 |
60 |
61 |
62 |
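
Note that save_selection above returns a closure rather than a ranking: the caller invokes it with a dataloader, the wrapping model and a device, and it temporarily enables gradients on the all-ones field mask to rank fields by the moving average of the absolute mask gradient. A rough usage sketch, where model, val_dataloader and the device string are assumptions rather than names defined in this file:

# assumed: `model` is the basemodel wrapping this fs module (exposing model.fs),
# and `val_dataloader` yields (features, label) batches as in utils/fs_trainer.py
prun = model.fs.save_selection(k=10)              # k is accepted but unused by sfs itself
ranked = prun(val_dataloader, model, 'cuda:0')    # array of shape (2, num_fields)
ranked_features, ranked_importance = ranked
print(ranked_features[:5], ranked_importance[:5])
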
--------------------------------------------------------------------------------
/models/fs/shark.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import tqdm
5 | import numpy as np
6 |
7 | class shark(nn.Module):
8 | def __init__(self, args, unique_values, features):
9 | super(shark, self).__init__()
10 | self.feature_num = len(unique_values)
11 | self.features = np.array(features)
12 |         # required attributes
13 | self.load_checkpoint = False
14 | self.optimizer_method = 'normal'
15 | self.criterion = torch.nn.BCELoss()
16 | self.offsets = np.array((0, *np.cumsum(unique_values)[:-1]))
17 |
18 | def forward(self, x, current_epoch, current_step, raw_data):
19 | return x
20 |
21 | def save_selection(self, k):
22 | def selection(test_dataloader, model, device):
23 | tk0 = tqdm.tqdm(test_dataloader, desc="f-permutation", smoothing=0, mininterval=1.0)
24 | model = model.to(device)
25 | num = 0
26 | # importance = torch.zeros(len(model.offsets)).to(device) # save importance for each field
27 | importance = np.zeros(len(model.offsets))
28 | expectation = torch.zeros((len(model.offsets))).to(device)
29 |             for x, y in tk0:
30 | x = x.to(device)
31 | y = y.to(device)
32 | embs = model.embedding(x + x.new_tensor(self.offsets))
33 | if len(expectation.shape) == 1:
34 | expectation = torch.zeros((len(model.offsets), embs.shape[2])).to(device)
35 | expectation += torch.sum(embs, dim=0)
36 | num += x.shape[0]
37 | expectation = expectation / num
38 | expectation = expectation.reshape(1, len(model.offsets), -1)
39 | # expectation = torch.zeros((1, len(model.offsets), 8)).to(device)
40 | num = 0
41 | new_dataloader = torch.utils.data.DataLoader(test_dataloader.dataset, batch_size=1, num_workers=16)
42 | tk0 = tqdm.tqdm(new_dataloader, desc="f-permutation", smoothing=0, mininterval=1.0)
43 | for i, (x, y) in enumerate(tk0):
44 | x = x.to(device)
45 | y = y.to(device)
46 | model.zero_grad()
47 | embs = model.embedding(x + x.new_tensor(self.offsets))
48 | # expectation = torch.mean(embs, dim=0)
49 | expectation_resize = expectation.repeat(x.shape[0], 1,1)
50 | right_part = expectation_resize - embs
51 | y_pred = model(x, current_epoch=None, current_step=i)
52 | loss = self.criterion(y_pred, y.float().reshape(-1, 1))
53 | # cal gradient for each embedding
54 | loss.backward()
55 | # get gradient
56 | gradients = F.embedding(x + x.new_tensor(self.offsets),model.embedding.weight.grad).to(device)
57 | # use the torch.gradient
58 | # cal importance
59 | error = gradients * right_part # b,f,e
60 | error = torch.sum(error, dim=2) # b,f
61 | error = torch.sum(abs(error), dim=0) # f
62 | importance += error.detach().cpu().numpy()
63 | num += x.shape[0]
64 | importance = importance / num
65 | # sort importance
66 | feature_rank = np.argsort(importance)[::-1]
67 | ranked_importance = importance[feature_rank]
68 | ranked_features = [self.features[i] for i in feature_rank]
69 | return np.array([ranked_features, ranked_importance])
70 | return selection
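
The selection closure above scores each field with a first-order Taylor argument: replacing a sample's embedding e by its dataset-wide expectation E[e] changes the loss by approximately grad · (E[e] - e), and the absolute value of that term, summed over samples, is the field's importance. A minimal numpy sketch of that scoring step on synthetic tensors (not repository data):

import numpy as np

b, f, e = 4, 3, 8                                     # batch, fields, embedding dim (synthetic)
grads = np.random.randn(b, f, e)                      # d(loss)/d(embedding) per sample and field
embs = np.random.randn(b, f, e)                       # the embeddings actually used
expectation = embs.mean(axis=0, keepdims=True)        # E[e] per field, as in the first loop above

error = grads * (expectation - embs)                  # first-order change in the loss, elementwise
importance = np.abs(error.sum(axis=2)).sum(axis=0)    # sum over embedding dim, then over samples
print(np.argsort(importance)[::-1])                   # fields ranked, most important first
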
--------------------------------------------------------------------------------
/models/fs/xgb.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class xgb(nn.Module):
6 | def __init__(self, args, unique_values, features):
7 | super(xgb, self).__init__()
8 |
9 |         # required attributes
10 | self.load_checkpoint = False
11 | self.optimizer_method = 'normal'
12 |
13 | def forward(self, x, current_epoch, current_step, raw_data):
14 | return x
--------------------------------------------------------------------------------
/models/layers.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | from itertools import combinations
4 |
5 | class MLP(nn.Module):
6 |
7 | def __init__(self, input_dim, output_layer=True, dims=None, dropout=0):
8 | super().__init__()
9 | if dims is None:
10 | dims = []
11 | layers = list()
12 | for i_dim in dims:
13 | layers.append(nn.Linear(input_dim, i_dim))
14 | layers.append(nn.BatchNorm1d(i_dim))
15 | layers.append(nn.ReLU())
16 | layers.append(nn.Dropout(p=dropout))
17 | input_dim = i_dim
18 | if output_layer:
19 | layers.append(nn.Linear(input_dim, 1))
20 | self.mlp = nn.Sequential(*layers)
21 |
22 | def forward(self, x):
23 | return self.mlp(x)
24 |
25 | class CrossNetwork(nn.Module):
26 | """CrossNetwork mentioned in the DCN paper.
27 |
28 | Args:
29 | input_dim (int): input dim of input tensor
30 |
31 | Shape:
32 | - Input: `(batch_size, *)`
33 | - Output: `(batch_size, *)`
34 |
35 | """
36 |
37 | def __init__(self, input_dim, num_layers):
38 | super().__init__()
39 | self.num_layers = num_layers
40 | self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)])
41 | self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])
42 |
43 | def forward(self, x):
44 | """
45 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
46 | """
47 | x0 = x
48 | for i in range(self.num_layers):
49 | xw = self.w[i](x)
50 | x = x0 * xw + self.b[i] + x
51 | return x
52 |
53 | class SENETLayer(nn.Module):
54 | """
55 | A weighted feature gating system in the SENet paper
56 | Args:
57 | num_fields (int): number of feature fields
58 |
59 | Shape:
60 |         - Input: `(batch_size, *)`
61 | - Output: `(batch_size, *)`
62 | """
63 | def __init__(self, num_fields, reduction_ratio=3):
64 | super(SENETLayer, self).__init__()
65 | reduced_size = max(1, int(num_fields/ reduction_ratio))
66 | self.mlp = nn.Sequential(nn.Linear(num_fields, reduced_size, bias=False),
67 | nn.ReLU(),
68 | nn.Linear(reduced_size, num_fields, bias=False),
69 | nn.ReLU())
70 | def forward(self, x):
71 | z = torch.mean(x, dim=-1, out=None)
72 | a = self.mlp(z)
73 | v = x*a.unsqueeze(-1)
74 | return v
75 |
76 | class BiLinearInteractionLayer(nn.Module):
77 | """
78 |     Bilinear feature interaction module, an improved pairwise-interaction scheme building on the FFM model
79 |     Args:
80 |         num_fields (int): number of feature fields
81 |         bilinear_type (str): the type of bilinear interaction function
82 |     Shape:
83 |         - Input: `(batch_size, *)`
84 | - Output: `(batch_size, *)`
85 | """
86 | def __init__(self, input_dim, num_fields, bilinear_type = "field_interaction"):
87 | super(BiLinearInteractionLayer, self).__init__()
88 | self.bilinear_type = bilinear_type
89 | if self.bilinear_type == "field_all":
90 | self.bilinear_layer = nn.Linear(input_dim, input_dim, bias=False)
91 | elif self.bilinear_type == "field_each":
92 | self.bilinear_layer = nn.ModuleList([nn.Linear(input_dim, input_dim, bias=False) for i in range(num_fields)])
93 | elif self.bilinear_type == "field_interaction":
94 | self.bilinear_layer = nn.ModuleList([nn.Linear(input_dim, input_dim, bias=False) for i,j in combinations(range(num_fields), 2)])
95 | else:
96 | raise NotImplementedError()
97 |
98 | def forward(self, x):
99 | feature_emb = torch.split(x, 1, dim=1)
100 | if self.bilinear_type == "field_all":
101 | bilinear_list = [self.bilinear_layer(v_i)*v_j for v_i, v_j in combinations(feature_emb, 2)]
102 | elif self.bilinear_type == "field_each":
103 | bilinear_list = [self.bilinear_layer[i](feature_emb[i])*feature_emb[j] for i,j in combinations(range(len(feature_emb)), 2)]
104 | elif self.bilinear_type == "field_interaction":
105 | bilinear_list = [self.bilinear_layer[i](v[0])*v[1] for i,v in enumerate(combinations(feature_emb, 2))]
106 | return torch.cat(bilinear_list, dim=1)
107 |
108 | class FactorizationMachine(torch.nn.Module):
109 | def __init__(self, reduce_sum=True):
110 | super().__init__()
111 | self.reduce_sum = reduce_sum
112 |
113 | def forward(self, x):
114 | """
115 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
116 | :return : tensor of size (batch_size, 1) if reduce_sum
117 | tensor of size (batch_size, embed_dim) else
118 | """
119 | square_of_sum = torch.sum(x, dim=1) ** 2
120 | sum_of_square = torch.sum(x ** 2, dim=1)
121 | ix = square_of_sum - sum_of_square
122 | if self.reduce_sum:
123 | ix = torch.sum(ix, dim=1, keepdim=True)
124 | return 0.5 * ix
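
FactorizationMachine above relies on the standard FM identity: the sum of pairwise dot products of the field embeddings equals 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), which is what the square_of_sum / sum_of_square lines compute. A small check on a synthetic tensor (not repository code):

import torch
from itertools import combinations

x = torch.randn(2, 4, 8)                                      # (batch, num_fields, embed_dim)
pairwise = sum((x[:, i] * x[:, j]).sum(-1, keepdim=True)
               for i, j in combinations(range(4), 2))          # explicit O(f^2) pairwise sum
fm = 0.5 * ((x.sum(1) ** 2 - (x ** 2).sum(1)).sum(1, keepdim=True))
print(torch.allclose(pairwise, fm, atol=1e-5))                 # True
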
--------------------------------------------------------------------------------
/models/rec/dcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, CrossNetwork
4 |
5 | class dcn(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(dcn, self).__init__()
9 | self.dims = input_dim
10 |
11 | self.cn = CrossNetwork(self.dims, num_layers=2)
12 | self.mlp = MLP(self.dims, False, dims=[32,16], dropout=0.2)
13 | self.linear = nn.Linear(self.dims + 16, 1)
14 |
15 | def forward(self, x):
16 | b,f,e = x.shape
17 | x = x.reshape(b,-1)
18 | cn_out = self.cn(x)
19 | mlp_out = self.mlp(x)
20 | x = torch.cat([cn_out, mlp_out], dim=1)
21 | x = self.linear(x)
22 | x = torch.sigmoid(x)
23 | return x
--------------------------------------------------------------------------------
/models/rec/deepfm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, FactorizationMachine
4 |
5 | class deepfm(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(deepfm, self).__init__()
9 | self.dims = input_dim
10 |
11 | self.dropout = 0.2
12 | self.dnn = MLP(self.dims, True, dims=[32, 16], dropout=self.dropout)
13 | self.fm = FactorizationMachine(reduce_sum=True)
14 |
15 | def forward(self, x):
16 | b,f,e = x.shape
17 | output_fm = self.fm(x)
18 | x_dnn = x.reshape(b,-1)
19 | x_dnn = self.dnn(x_dnn)
20 | output = output_fm + x_dnn
21 | output = torch.sigmoid(output)
22 | return output
--------------------------------------------------------------------------------
/models/rec/fibinet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, SENETLayer, BiLinearInteractionLayer
4 |
5 | class fibinet(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(fibinet, self).__init__()
9 |
10 |         # embedding dimension is taken from args, so it stays in sync with the embedding layer
11 | embedding_dim = args.embedding_dim
12 | self.dims = input_dim
13 | self.num_fields = self.dims // embedding_dim
14 | self.senet_layer = SENETLayer(self.num_fields, reduction_ratio=3)
15 | self.bilinear_interaction = BiLinearInteractionLayer(embedding_dim, self.num_fields, bilinear_type="field_interaction")
16 | self.hidden_size = self.num_fields * (self.num_fields - 1) * embedding_dim
17 | self.mlp = MLP(self.hidden_size, True, dims=[32,16], dropout=0.2)
18 |
19 | def forward(self, x):
20 | b,f,e = x.shape
21 | embed_senet = self.senet_layer(x)
22 | embed_bi1 = self.bilinear_interaction(x)
23 | embed_bi2 = self.bilinear_interaction(embed_senet)
24 | shallow_part = torch.flatten(torch.cat([embed_bi1, embed_bi2], dim=1), start_dim=1)
25 | mlp_out = self.mlp(shallow_part)
26 | output = torch.sigmoid(mlp_out)
27 | return output
28 |
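
The hidden_size above follows from the bilinear layer emitting one embed_dim vector per field pair: two bilinear passes (one on the raw embeddings, one on the SENET-reweighted embeddings) give 2 * C(num_fields, 2) * embedding_dim = num_fields * (num_fields - 1) * embedding_dim inputs to the MLP. A quick sanity check, assuming 22 fields and an embedding dim of 8 purely for illustration:

from math import comb

num_fields, embedding_dim = 22, 8                 # illustrative, not taken from config.yaml
per_pass = comb(num_fields, 2) * embedding_dim    # one vector of size embedding_dim per field pair
hidden_size = 2 * per_pass                        # raw + SENET-reweighted bilinear outputs
assert hidden_size == num_fields * (num_fields - 1) * embedding_dim
print(hidden_size)                                # 3696
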
--------------------------------------------------------------------------------
/models/rec/fm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from ..layers import MLP, FactorizationMachine
4 |
5 | class fm(nn.Module):
6 |
7 | def __init__(self, args, input_dim):
8 | super(fm, self).__init__()
9 | self.dims = input_dim
10 |
11 | self.fm = FactorizationMachine(reduce_sum=True)
12 |
13 | def forward(self, x):
14 | b,f,e = x.shape
15 | output_fm = self.fm(x)
16 | x = torch.sigmoid(output_fm)
17 | return x
--------------------------------------------------------------------------------
/models/rec/mlp.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 | import torch
4 |
5 | # class mlp(nn.Module):
6 | # def __init__(self, input_size, hidden_size = 16, output_size = 16, dropout = 0.2):
7 | # super(mlp, self).__init__()
8 | # self.fc1 = nn.Linear(input_size, hidden_size)
9 | # self.dropout1 = nn.Dropout(dropout)
10 | # self.fc2 = nn.Linear(hidden_size, hidden_size)
11 | # self.dropout2 = nn.Dropout(dropout)
12 | # self.fc3 = nn.Linear(hidden_size, output_size)
13 | # self.output_layer = nn.Linear(output_size, 1)
14 | # self.init_weights()
15 |
16 | # # optimizer
17 | # self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
18 |
19 | # def init_weights(self):
20 | # for m in self.modules():
21 | # if isinstance(m, nn.Linear):
22 | # nn.init.normal_(m.weight.data, mean=0, std=0.01)
23 | # nn.init.constant_(m.bias.data, 0)
24 |
25 | # def forward(self, x):
26 | # b = x.shape[0]
27 | # x = x.reshape(b, -1)
28 | # x = self.fc1(x)
29 | # x = torch.relu(x)
30 | # x = self.dropout1(x)
31 | # x = torch.relu(self.fc2(x))
32 | # x = self.dropout2(x)
33 | # x = self.fc3(x)
34 | # x = self.output_layer(x)
35 | # x = torch.sigmoid(x)
36 | # return x
37 |
38 | class mlp(nn.Module):
39 | def __init__(self, args, input_dim, embed_dims = [16,16], dropout = 0.2, output_layer=True):
40 | super().__init__()
41 | layers = list()
42 | self.mlps = nn.ModuleList()
43 | self.out_layer = output_layer
44 | for embed_dim in embed_dims:
45 | layers.append(nn.Linear(input_dim, embed_dim))
46 | layers.append(nn.BatchNorm1d(embed_dim))
47 | layers.append(nn.ReLU())
48 | layers.append(nn.Dropout(p=dropout))
49 | input_dim = embed_dim
50 | self.mlps.append(nn.Sequential(*layers))
51 | layers = list()
52 | if self.out_layer:
53 | self.out = nn.Linear(input_dim, 1)
54 |
55 | def forward(self, x):
56 | """
57 | :param x: Float tensor of size ``(batch_size, embed_dim)``
58 | """
59 | b = x.shape[0]
60 | x = x.reshape(b,-1)
61 | for layer in self.mlps:
62 | x = layer(x)
63 | if self.out_layer:
64 | x = self.out(x)
65 | x = torch.sigmoid(x)
66 | return x
--------------------------------------------------------------------------------
/models/rec/widedeep.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from ..layers import MLP
5 |
6 | class widedeep(nn.Module):
7 |
8 | def __init__(self, args, input_dim):
9 | super(widedeep, self).__init__()
10 | self.dims = input_dim
11 |
12 | self.mlp = MLP(self.dims, True, dims=[32,16], dropout=0.2)
13 | self.linear = nn.Linear(self.dims, 1)
14 |
15 | def forward(self, x):
16 | b,f,e = x.shape
17 | x = x.reshape(b,-1)
18 | mlp_out = self.mlp(x)
19 | linear_out = self.linear(x)
20 | x = mlp_out + linear_out
21 | x = torch.sigmoid(x)
22 | return x
--------------------------------------------------------------------------------
/nni/search_spaces/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "k": {"_type": "randint", "_value": [15,23]},
3 | "learning_rate": {"_type": "choice", "_value": [0.001, 0.0005, 0.0001]}
4 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/adafs.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_pretrain_epoch": {"_type": "choice", "_value": [1,2,3,4,5,6]},
3 | "fs_hidden_size": {"_type": "choice", "_value": [16,32,64,128]},
4 | "fs_dropout": {"_type": "choice", "_value": [0, 0.2, 0.4, 0.6]},
5 | "fs_update_frequency": {"_type": "choice", "_value": [1,2,4,8,16]}
6 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/autofield.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_update_frequency": {"_type": "choice", "_value": [5,10,15,20,30]}
3 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/gbdt.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_learning_rate": {"_type": "choice", "_value": [0.1,0.01,0.001,0.0001]},
3 | "fs_n_estimators": {"_type": "choice", "_value": [50,100,200]},
4 | "fs_subsample": {"_type": "choice", "_value": [0.5, 0.7, 1.0]},
5 | "fs_min_samples_split": {"_type": "choice", "_value": [2, 4, 8, 16]},
6 | "fs_min_samples_leaf": {"_type": "choice", "_value": [1, 2, 4, 8, 16]},
7 | "fs_min_weight_fraction_leaf": {"_type": "choice", "_value": [0.0, 0.3, 0.5]},
8 | "fs_max_depth": {"_type": "choice", "_value": [3, 6, 9]},
9 | "fs_n_iter_no_change": {"_type": "choice", "_value": [null, 3, 6, 9]}
10 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/lasso.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_alpha": {"_type": "choice", "_value": [0.0001,0.001,0.01,0.1,1.0]},
3 | "fs_fit_intercept": {"_type": "choice", "_value": [true, false]},
4 | "fs_copy_X": {"_type": "choice", "_value": [true,false]},
5 | "fs_max_iter": {"_type": "choice", "_value": [100,200,500,1000,2000]},
6 | "fs_tol": {"_type": "choice", "_value": [1e-5,1e-4,1e-3,1e-2]},
7 | "fs_positive": {"_type": "choice", "_value": [true,false]},
8 | "fs_selection": {"_type": "choice", "_value": ["cyclic","random"]}
9 | }
--------------------------------------------------------------------------------
/nni/search_spaces/fs/optfs.json:
--------------------------------------------------------------------------------
1 | {
2 | "fs_epochs": {"_type": "choice", "_value": [5,10,15]},
3 | "fs_gamma": {"_type": "choice", "_value": [200,500,1000,2000,5000,10000]}
4 | }
--------------------------------------------------------------------------------
/nni_tune.py:
--------------------------------------------------------------------------------
1 | import nni
2 | import argparse
3 | import json
4 | import os
5 | import re
6 | from nni.experiment import Experiment
7 | from utils.utils import str2bool
8 |
9 |
10 | if __name__ == '__main__':
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--dataset', type=str, default='criteo', help='avazu, criteo')
13 | parser.add_argument('--model', type=str, default='mlp')
14 | parser.add_argument('--fs', type=str, default='no_selection')
15 | parser.add_argument('--es', type=str, default='no_selection')
16 | parser.add_argument('--data_path', type=str, default='data/')
17 | parser.add_argument('--train_or_search', type=str2bool, default=True, help='whether to train or search')
18 | parser.add_argument('--retrain', type=str2bool, default=True, help='whether to retrain')
19 | parser.add_argument('--k', type=int, default=0, help='top k features, if set, use just this k')
20 | parser.add_argument('--port', type=int, default=8080, help='port of nni server')
21 |
22 | args = parser.parse_args()
23 | script_name = None
24 | if args.es != 'no_selection':
25 | script_name = 'es_run.py'
26 | else:
27 | script_name = 'fs_run.py'
28 |
29 | field_num = 0
30 | if args.dataset == 'avazu':
31 | field_num = 22
32 | elif args.dataset == 'criteo':
33 | field_num = 39
34 |
35 | fs_search_space, es_search_space, model_search_space = None, None, None
36 | fs_search_space_path = 'nni/search_spaces/fs/' + args.fs + '.json'
37 | es_search_space_path = 'nni/search_spaces/es/' + args.es + '.json'
38 | model_search_space_path = 'nni/search_spaces/config.json'
39 | if not os.path.exists(fs_search_space_path):
40 |         print('fs search space does not exist, skip')
41 | else:
42 | with open(fs_search_space_path, 'r') as f:
43 | fs_search_space = json.load(f)
44 | if not os.path.exists(es_search_space_path):
45 |         print('es search space does not exist, skip')
46 | else:
47 | with open(es_search_space_path, 'r') as f:
48 | es_search_space = json.load(f)
49 | with open(model_search_space_path, 'r') as f:
50 | model_search_space = json.load(f)
51 | search_space = {}
52 | if fs_search_space is not None:
53 | search_space.update(fs_search_space)
54 | if es_search_space is not None:
55 | search_space.update(es_search_space)
56 | search_space.update(model_search_space)
57 |
58 | if args.k == 0:
59 |         # if k is not specified, search k over roughly field_num * 0.8 to field_num
60 | search_space["k"] = {"_type": "randint", "_value": [int(field_num * 0.8), field_num]}
61 | else:
62 |         # if k is specified, fix it as the only choice
63 | search_space["k"] = {"_type": "choice", "_value": [args.k]}
64 |
65 | experiment = Experiment('local')
66 | experiment.config.experiment_name = args.dataset + '_' + args.model + '_' + args.fs + '_' + args.es
67 | experiment.config.trial_command = 'python {} --dataset={} --model={} --fs={} --es={} --data_path={} --nni=True --train_or_search={} --retrain={} --k={}'.format(script_name, args.dataset, args.model, args.fs, args.es, args.data_path, args.train_or_search, args.retrain, args.k)
68 | experiment.config.trial_code_directory = '.' # code directory
69 | experiment.config.experiment_working_directory = 'experiments/' # working directory
70 | if not os.path.exists(experiment.config.experiment_working_directory):
71 | os.makedirs(experiment.config.experiment_working_directory)
72 | experiment.config.search_space = search_space
73 |
74 | experiment.config.tuner.name = 'TPE'
75 | experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
76 |
77 | experiment.config.max_trial_number = 16
78 | experiment.config.trial_concurrency = 8
79 | experiment.config.max_experiment_duration = '24h'
80 |
81 | experiment.config.trial_gpu_number = 1
82 | experiment.config.training_service.use_active_gpu = True
83 |
84 | experiment.run(args.port)
85 | # experiment_id = nni.get_experiment_id()
86 | # # get the best parameters
87 | # experiment_dir = os.path.join('nni-experiments',experiment_id, 'trials')
88 | # auc_value, logloss_value = 0.0, 100.0
89 | # best_trial = None
90 | # for trial in os.listdir(experiment_dir):
91 | # file_path = os.path.join(experiment_dir, trial, 'trial.log')
92 | # auc_pattern = r"test auc: ([0-9.]+)"
93 | # logloss_pattern = r"test logloss: ([0-9.]+)"
94 | # with open(file_path, "r") as file:
95 | # lines = file.readlines()
96 | # auc_match = re.search(auc_pattern, lines[-2])
97 | # logloss_match = re.search(logloss_pattern, lines[-1])
98 | # if auc_match:
99 | # auc_value = max(auc_value, float(auc_match.group(1)))
100 | # if auc_value == float(auc_match.group(1)):
101 | # best_trial = trial
102 | # if logloss_match:
103 | # logloss_value = min(logloss_value, float(logloss_match.group(1)))
104 | # print('best trial: ', best_trial)
105 | # print('best auc: ', auc_value)
106 | # print('best logloss: ', logloss_value)
107 | # print('best parameters:')
108 | # best_trial_para_path = os.path.join(experiment_dir, best_trial, 'parameter.cfg')
109 | # with open(best_trial_para_path, 'r') as file:
110 | # lines = file.readlines()
111 | # print(lines)
112 |
113 | experiment.stop()
114 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy~=1.19.3
2 | pandas~=1.4.4
3 | scikit_learn~=1.2.2
4 | torch~=1.11.0
5 | tqdm~=4.65.0
6 | pyyaml
7 | nni
8 | xgboost
--------------------------------------------------------------------------------
/utils/datasets.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import os
4 | import torch
5 | from torch.utils.data import TensorDataset, DataLoader
6 | from sklearn.preprocessing import LabelEncoder
7 |
8 | def read_dataset(dataset_name, data_path, batch_size, shuffle, num_workers, use_fields=None, machine_learning_method=False):
9 | if machine_learning_method:
10 | if dataset_name == 'avazu':
11 | return read_avazu_ml(data_path, batch_size, shuffle)
12 | elif dataset_name == 'criteo':
13 | return read_criteo_ml(data_path, batch_size, shuffle)
14 | elif dataset_name == 'movielens-1m':
15 | return read_movielens1m_ml(data_path, batch_size, shuffle)
16 | elif dataset_name == 'aliccp':
17 | return read_aliccp_ml(data_path, batch_size, shuffle)
18 | elif not machine_learning_method:
19 | if dataset_name == 'avazu':
20 | return read_avazu(data_path, batch_size, shuffle, num_workers, use_fields)
21 | elif dataset_name == 'criteo':
22 | return read_criteo(data_path, batch_size, shuffle, num_workers, use_fields)
23 | elif dataset_name == 'movielens-1m':
24 | return read_movielens1m(data_path, batch_size, shuffle, num_workers, use_fields)
25 | elif dataset_name == 'aliccp':
26 | return read_aliccp(data_path, batch_size, shuffle, num_workers, use_fields)
27 |
28 | def read_avazu(data_path, batch_size, shuffle, num_workers, use_fields=None):
29 | dtypes = {
30 | 'click': np.int8,
31 | 'hour':np.int16,
32 | 'C1':np.int8,
33 | 'banner_pos':np.int8,
34 | 'site_id':np.int16,
35 | 'site_domain':np.int16,
36 | 'site_category':np.int8,
37 | 'app_id':np.int16,
38 | 'app_domain':np.int16,
39 | 'app_category':np.int8,
40 | 'device_id':np.int32,
41 | 'device_ip':np.int32,
42 | 'device_model':np.int16,
43 | 'device_type':np.int8,
44 | 'device_conn_type':np.int8,
45 | 'C14':np.int16,
46 | 'C15':np.int8,
47 | 'C16':np.int8,
48 | 'C17':np.int16,
49 | 'C18':np.int8,
50 | 'C19':np.int8,
51 | 'C20':np.int16,
52 | 'C21':np.int8
53 | }
54 | print('start reading avazu...')
55 | if use_fields is None:
56 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes)
57 | else:
58 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes, usecols=list(use_fields)+['click'])
59 | print('finish reading avazu.')
60 | train_idx = int(df.shape[0] * 0.7)
61 | val_idx = int(df.shape[0] * 0.9)
62 | features = [f for f in df.columns if f not in ['click']]
63 | unique_values = [df[col].max()+1 for col in features]
64 | label = 'click'
65 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
66 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
67 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
68 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
69 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
70 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
71 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
72 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
73 |
74 | def read_avazu_ml(data_path, batch_size, shuffle, use_fields=None):
75 | dtypes = {
76 | 'click': np.int8,
77 | 'hour':np.int16,
78 | 'C1':np.int8,
79 | 'banner_pos':np.int8,
80 | 'site_id':np.int16,
81 | 'site_domain':np.int16,
82 | 'site_category':np.int8,
83 | 'app_id':np.int16,
84 | 'app_domain':np.int16,
85 | 'app_category':np.int8,
86 | 'device_id':np.int32,
87 | 'device_ip':np.int32,
88 | 'device_model':np.int16,
89 | 'device_type':np.int8,
90 | 'device_conn_type':np.int8,
91 | 'C14':np.int16,
92 | 'C15':np.int8,
93 | 'C16':np.int8,
94 | 'C17':np.int16,
95 | 'C18':np.int8,
96 | 'C19':np.int8,
97 | 'C20':np.int16,
98 | 'C21':np.int8
99 | }
100 | print('start reading avazu...')
101 | if use_fields is None:
102 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes)
103 | # df.drop(columns=['item_id:token'], inplace=True)
104 | else:
105 | df = pd.read_csv(os.path.join(data_path, 'avazu/preprocessed_avazu.csv'), dtype = dtypes, usecols=list(use_fields)+['click'])
106 | print('finish reading avazu.')
107 | train_idx = int(df.shape[0] * 0.7)
108 | val_idx = int(df.shape[0] * 0.9)
109 | features = [f for f in df.columns if f not in ['click']]
110 | unique_values = [df[col].max()+1 for col in features]
111 | label = 'click'
112 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
113 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
114 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
115 |
116 | def read_criteo(data_path, batch_size, shuffle, num_workers, use_fields=None):
117 | dtypes = {
118 | '0': np.int8,
119 | '1': np.int8,
120 | '2': np.int8,
121 | '3': np.int8,
122 | '4': np.int8,
123 | '5': np.int16,
124 | '6': np.int16,
125 | '7': np.int8,
126 | '8': np.int8,
127 | '9': np.int8,
128 | '10': np.int8,
129 | '11': np.int8,
130 | '12': np.int8,
131 | '13': np.int8,
132 | '14': np.int16,
133 | '15': np.int16,
134 | '16': np.int32,
135 | '17': np.int32,
136 | '18': np.int16,
137 | '19': np.int8,
138 | '20': np.int16,
139 | '21': np.int16,
140 | '22': np.int8,
141 | '23': np.int32,
142 | '24': np.int16,
143 | '25': np.int32,
144 | '26': np.int16,
145 | '27': np.int8,
146 | '28': np.int16,
147 | '29': np.int32,
148 | '30': np.int8,
149 | '31': np.int16,
150 | '32': np.int16,
151 | '33': np.int8,
152 | '34': np.int32,
153 | '35': np.int8,
154 | '36': np.int8,
155 | '37': np.int32,
156 | '38': np.int8,
157 | '39': np.int32
158 | }
159 | print('start reading criteo...')
160 | if use_fields is None:
161 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes)
162 | # df.drop(columns=['index:float'], inplace=True)
163 | else:
164 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes, usecols=list(use_fields)+['0'])
165 | print('finish reading criteo.')
166 | train_idx = int(df.shape[0] * 0.7)
167 | val_idx = int(df.shape[0] * 0.9)
168 | features = [f for f in df.columns if f not in ['0']]
169 | unique_values = [df[col].max()+1 for col in features]
170 | label = '0'
171 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
172 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
173 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
174 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
175 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
176 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
177 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
178 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
179 |
180 |
181 | def read_criteo_ml(data_path, batch_size, shuffle, use_fields=None):
182 | dtypes = {
183 | '0': np.int8,
184 | '1': np.int8,
185 | '2': np.int8,
186 | '3': np.int8,
187 | '4': np.int8,
188 | '5': np.int16,
189 | '6': np.int16,
190 | '7': np.int8,
191 | '8': np.int8,
192 | '9': np.int8,
193 | '10': np.int8,
194 | '11': np.int8,
195 | '12': np.int8,
196 | '13': np.int8,
197 | '14': np.int16,
198 | '15': np.int16,
199 | '16': np.int32,
200 | '17': np.int32,
201 | '18': np.int16,
202 | '19': np.int8,
203 | '20': np.int16,
204 | '21': np.int16,
205 | '22': np.int8,
206 | '23': np.int32,
207 | '24': np.int16,
208 | '25': np.int32,
209 | '26': np.int16,
210 | '27': np.int8,
211 | '28': np.int16,
212 | '29': np.int32,
213 | '30': np.int8,
214 | '31': np.int16,
215 | '32': np.int16,
216 | '33': np.int8,
217 | '34': np.int32,
218 | '35': np.int8,
219 | '36': np.int8,
220 | '37': np.int32,
221 | '38': np.int8,
222 | '39': np.int32
223 | }
224 | print('start reading criteo...')
225 | if use_fields is None:
226 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes)
227 | # df.drop(columns=['index:float'], inplace=True)
228 | else:
229 | df = pd.read_csv(os.path.join(data_path, 'criteo/preprocessed_criteo.csv'), dtype = dtypes, usecols=list(use_fields)+['0'])
230 | print('finish reading criteo.')
231 | train_idx = int(df.shape[0] * 0.7)
232 | val_idx = int(df.shape[0] * 0.9)
233 | features = [f for f in df.columns if f not in ['0']]
234 | unique_values = [df[col].max()+1 for col in features]
235 | label = '0'
236 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
237 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
238 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
239 |
240 | def read_movielens1m(data_path, batch_size, shuffle, num_workers, use_fields=None):
241 | print('start reading movielens 1m...')
242 | if use_fields is None:
243 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'))
244 | else:
245 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'), usecols=list(use_fields)+['rating'])
246 | print('finish reading movielens 1m.')
247 | df['rating'] = df['rating'].apply(lambda x: 1 if x > 3 else 0)
248 | df = df.sample(frac=1, random_state=43) # shuffle
249 | train_idx = int(df.shape[0] * 0.7)
250 | val_idx = int(df.shape[0] * 0.9)
251 | features = [f for f in df.columns if f not in ['rating']]
252 | for feature in features:
253 | le = LabelEncoder()
254 | df[feature] = le.fit_transform(df[feature])
255 | unique_values = [df[col].max()+1 for col in features]
256 | label = 'rating'
257 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
258 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
259 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
260 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
261 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
262 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
263 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
264 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
265 |
266 | def read_movielens1m_ml(data_path, batch_size, shuffle, use_fields=None):
267 | print('start reading movielens 1m...')
268 | if use_fields is None:
269 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'))
270 | # df.drop(columns=['item_id:token'], inplace=True)
271 | else:
272 | df = pd.read_csv(os.path.join(data_path, 'movielens-1m/ml-1m.csv'), usecols=list(use_fields)+['rating'])
273 | print('finish reading movielens 1m.')
274 | df['rating'] = df['rating'].apply(lambda x: 1 if x > 3 else 0)
275 | df = df.sample(frac=1, random_state=43) # shuffle
276 | train_idx = int(df.shape[0] * 0.7)
277 | val_idx = int(df.shape[0] * 0.9)
278 | features = [f for f in df.columns if f not in ['rating']]
279 | for feature in features:
280 | le = LabelEncoder()
281 | df[feature] = le.fit_transform(df[feature])
282 | unique_values = [df[col].max()+1 for col in features]
283 | label = 'rating'
284 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
285 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
286 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
287 |
288 | def read_aliccp(data_path, batch_size, shuffle, num_workers, use_fields=None):
289 | print('start reading aliccp...')
290 | data_type = {'click':np.int8, 'purchase': np.int8, '101':np.int32, '121':np.uint8, '122':np.uint8, '124':np.uint8, '125':np.uint8, '126':np.uint8, '127':np.uint8, '128':np.uint8, '129':np.uint8, '205':np.int32, '206':np.int16, '207':np.int32, '210':np.int32, '216':np.int32, '508':np.int16, '509':np.int32, '702':np.int32, '853':np.int32, '301':np.int8, '109_14':np.int16, '110_14':np.int32, '127_14':np.int32, '150_14':np.int32, 'D109_14': np.float16, 'D110_14': np.float16, 'D127_14': np.float16, 'D150_14': np.float16, 'D508': np.float16, 'D509': np.float16, 'D702': np.float16, 'D853': np.float16}
291 | if use_fields is None:
292 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), dtype=data_type)
293 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), dtype=data_type)
294 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), dtype=data_type)
295 | df = pd.concat([df1, df2, df3])
296 | else:
297 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
298 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
299 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
300 | df = pd.concat([df1, df2, df3])
301 | print('finish reading aliccp.')
302 | # df = df.sample(frac=1) # shuffle
303 | train_idx = int(df.shape[0] * 0.5)
304 | val_idx = int(df.shape[0] * 0.75)
305 | features = []
306 | for f in df.columns:
307 | if f not in ['click','purchase'] and f[:1] != 'D':
308 | features.append(f)
309 | if '301' in features:
310 | df['301'] = df['301'] - 1
311 | unique_values = [df[col].max()+1 for col in features]
312 | label = 'click'
313 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
314 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
315 | train_x, val_x, test_x = torch.tensor(train_x.values, dtype=torch.long), torch.tensor(val_x.values, dtype=torch.long), torch.tensor(test_x.values, dtype=torch.long)
316 | train_y, val_y, test_y = torch.tensor(train_y.values, dtype=torch.long), torch.tensor(val_y.values, dtype=torch.long), torch.tensor(test_y.values, dtype=torch.long)
317 | train_dataloader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
318 | val_dataloader = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
319 | test_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
320 | return features, label, train_dataloader, val_dataloader, test_dataloader, unique_values
321 |
322 | def read_aliccp_ml(data_path, batch_size, shuffle, use_fields=None):
323 | print('start reading aliccp...')
324 | data_type = {'click':np.int8, 'purchase': np.int8, '101':np.int32, '121':np.uint8, '122':np.uint8, '124':np.uint8, '125':np.uint8, '126':np.uint8, '127':np.uint8, '128':np.uint8, '129':np.uint8, '205':np.int32, '206':np.int16, '207':np.int32, '210':np.int32, '216':np.int32, '508':np.int16, '509':np.int32, '702':np.int32, '853':np.int32, '301':np.int8, '109_14':np.int16, '110_14':np.int32, '127_14':np.int32, '150_14':np.int32, 'D109_14': np.float16, 'D110_14': np.float16, 'D127_14': np.float16, 'D150_14': np.float16, 'D508': np.float16, 'D509': np.float16, 'D702': np.float16, 'D853': np.float16}
325 | if use_fields is None:
326 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), dtype=data_type)
327 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), dtype=data_type)
328 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), dtype=data_type)
329 | df = pd.concat([df1, df2, df3])
330 | else:
331 | df1 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_train.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
332 | df2 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_val.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
333 | df3 = pd.read_csv(os.path.join(data_path, 'aliccp/ali_ccp_test.csv'), usecols=list(use_fields)+['click'], dtype=data_type)
334 | df = pd.concat([df1, df2, df3])
335 | print('finish reading aliccp.')
336 | # df = df.sample(frac=1) # shuffle
337 | train_idx = int(df.shape[0] * 0.5)
338 | val_idx = int(df.shape[0] * 0.75)
339 | features = []
340 | for f in df.columns:
341 | if f not in ['click','purchase'] and f[:1] != 'D':
342 | features.append(f)
343 |     if '301' in features: df['301'] = df['301'] - 1
344 | unique_values = [df[col].max()+1 for col in features]
345 | label = 'click'
346 | train_x, val_x, test_x = df[features][:train_idx], df[features][train_idx:val_idx], df[features][val_idx:]
347 | train_y, val_y, test_y = df[label][:train_idx], df[label][train_idx:val_idx], df[label][val_idx:]
348 | return features, unique_values, (train_x, train_y, val_x, val_y, test_x, test_y)
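
Each reader above returns unique_values (per-field vocabulary sizes computed as max + 1) so that downstream modules can flatten per-field categorical IDs into a single shared embedding table via cumulative offsets, as done in models/fs/optfs.py, sfs.py and shark.py. A minimal sketch with made-up vocabulary sizes:

import numpy as np

unique_values = [3, 5, 2]                                  # made-up per-field vocab sizes
offsets = np.array((0, *np.cumsum(unique_values)[:-1]))    # [0, 3, 8]
raw_row = np.array([2, 4, 1])                              # one sample's per-field IDs
flat_ids = raw_row + offsets                               # [2, 7, 9]: rows of one flat embedding table
print(offsets, flat_ids, sum(unique_values))               # the table needs sum(unique_values) = 10 rows
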
--------------------------------------------------------------------------------
/utils/fs_trainer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import tqdm
4 | import os
5 | import nni
6 | import datetime as dt
7 | from utils.utils import EarlyStopper
8 | from sklearn.metrics import roc_auc_score, log_loss
9 |
10 | class modeltrainer():
11 | def __init__(self, args, model, model_name, device, epochs, retrain):
12 | self.args = args
13 | self.model = model
14 | self.optimizers = model.set_optimizer() # dict of optimizers
15 | self.criterion = torch.nn.BCELoss()
16 | self.device = torch.device(device)
17 | self.model.to(self.device)
18 | self.n_epoch = epochs
19 | self.model_path = 'checkpoints/' + model_name + '_' + args.fs + '_' + args.es + '_' + args.dataset + '_' + args.timestr + '/'
20 | self.early_stopper = EarlyStopper(patience=args.patience)
21 | self.retrain = retrain
22 |
23 | def train_one_epoch(self, train_dataloader, val_dataloader, epoch_i, log_interval=10):
24 | self.model.train()
25 | total_loss = 0
26 | val_iter = iter(val_dataloader)
27 | tk0 = tqdm.tqdm(train_dataloader, desc="train", smoothing=0, mininterval=1.0)
28 | for i, (x, y) in enumerate(tk0):
29 | # x_dict = {k: v.to(self.device) for k, v in x_dict.items()} #tensor to GPU
30 | x = x.to(self.device)
31 | y = y.to(self.device)
32 | y_pred = self.model(x, current_epoch=epoch_i, current_step=i)
33 | loss = self.criterion(y_pred, y.float().reshape(-1, 1))
34 | # optfs l1 norm
35 | if self.args.fs == 'optfs' and not self.retrain:
36 | reg_loss = torch.sum(torch.sigmoid(self.model.fs.temp * self.model.fs.mask_weight))
37 | # g = torch.concat([self.model.fs.g[feature] for feature in self.model.fs.features], dim=0)
38 | # l1_loss = torch.norm(g, p=1) * 2e-9
39 | if self.args.dataset == 'avazu':
40 | loss = loss + reg_loss * 4e-9
41 | elif self.args.dataset == 'criteo':
42 | loss = loss + reg_loss * 1e-8
43 | elif self.args.dataset == 'movielens-1m':
44 | loss = loss + reg_loss * 1e-4
45 | elif self.args.dataset == 'aliccp':
46 | loss = loss + reg_loss * 1e-8
47 | else:
48 |                     print('please set the reg_loss hyperparameter for optfs in fs_trainer.py')
49 |
50 | self.model.zero_grad()
51 | # self.optimizer.zero_grad()
52 | loss.backward()
53 | self.optimizers['optimizer_bb'].step()
54 | if self.args.fs == 'optfs' and not self.retrain:
55 | self.optimizers['optimizer_fs'].step()
56 | total_loss += loss.item()
57 | if (i + 1) % log_interval == 0:
58 | tk0.set_postfix(loss=total_loss / log_interval)
59 | total_loss = 0
60 |
61 | # other optimizers
62 | if self.model.fs.optimizer_method == 'darts' and i % self.model.fs.update_frequency == 0:
63 | self.optimizers['optimizer_fs'].zero_grad()
64 | try:
65 | batch = next(val_iter)
66 | except StopIteration:
67 | val_iter = iter(val_dataloader)
68 | batch = next(val_iter)
69 | x_,y_ = batch
70 | x_, y_ = x_.to(self.device), y_.to(self.device)
71 | y_pred_ = self.model(x_, current_epoch=epoch_i, current_step=i)
72 | loss_ = self.criterion(y_pred_, y_.float().reshape(-1, 1))
73 | loss_.backward()
74 | self.optimizers['optimizer_fs'].step()
75 | elif self.args.fs == 'lpfs':
76 | p = self.optimizers['optimizer_fs'].param_groups[0]['params'][0]
77 | self.optimizers['optimizer_fs'].step()
78 | thr = 0.01 * self.args.learning_rate
79 | in1 = p.data > thr
80 | in2 = p.data < -thr
81 | in3 = ~(in1 | in2)
82 | p.data[in1] -= thr
83 | p.data[in2] += thr
84 | p.data[in3] = 0.0
85 |
86 |
87 |
88 | def fit(self, train_dataloader, val_dataloader=None):
89 | all_start_time = dt.datetime.now()
90 | epoch_time_lis = []
91 | for epoch_i in range(self.n_epoch):
92 | print('epoch:', epoch_i)
93 | epoch_start_time = dt.datetime.now()
94 | self.train_one_epoch(train_dataloader, val_dataloader, epoch_i)
95 | epoch_end_time = dt.datetime.now()
96 | epoch_time_lis.append((epoch_end_time - epoch_start_time).total_seconds())
97 | if val_dataloader:
98 | auc = self.evaluate(val_dataloader, epoch_i)
99 | # nni
100 | if self.args.nni:
101 | nni.report_intermediate_result(auc.item())
102 | print('epoch:', epoch_i, 'validation: auc:', auc)
103 | if self.early_stopper.stop_training(auc, self.model.state_dict()):
104 | print(f'validation: best auc: {self.early_stopper.best_auc}')
105 | self.model.load_state_dict(self.early_stopper.best_weights)
106 | break
107 |                 # reset the early stopper during adafs/mvfs pretraining
108 | if self.args.fs in ['adafs','mvfs'] and epoch_i < self.model.fs.pretrain_epoch:
109 | print('reset early stopper due to pretraining')
110 | self.early_stopper.trial_counter = 0
111 | self.early_stopper.best_auc = 0
112 | self.early_stopper.best_weights = None
113 | all_end_time = dt.datetime.now()
114 | print('all training time: {} s'.format((all_end_time - all_start_time).total_seconds()))
115 | print('average epoch time: {} s'.format(sum(epoch_time_lis) / len(epoch_time_lis)))
116 | if not os.path.exists(self.model_path):
117 | os.makedirs(self.model_path)
118 | if self.model.fs.mode != 'retrain':
119 | torch.save(self.model.state_dict(), os.path.join(self.model_path, "model_search.pth")) #save best auc model
120 | # else:
121 | # torch.save(self.model.state_dict(), os.path.join(self.model_path, "model_retrain.pth"))
122 |
123 | def evaluate(self, data_loader, current_epoch):
124 | self.model.eval()
125 | targets, predicts = list(), list()
126 | with torch.no_grad():
127 | tk0 = tqdm.tqdm(data_loader, desc="validation", smoothing=0, mininterval=1.0)
128 | for i, (x, y) in enumerate(tk0):
129 | x = x.to(self.device)
130 | # x_dict = {k: v.to(self.device) for k, v in x_dict.items()}
131 | y = y.to(self.device)
132 |                 y_pred = self.model(x, current_epoch, current_step=i) # pass the epoch so epoch-dependent fs modules (e.g. optfs) behave consistently during validation
133 | targets.extend(y.tolist())
134 | predicts.extend(y_pred.tolist())
135 | return roc_auc_score(targets, predicts)
136 |
137 | def test(self, data_loader, evaluate_fns):
138 | self.model.eval()
139 | targets, predicts = list(), list()
140 | with torch.no_grad():
141 | tk0 = tqdm.tqdm(data_loader, desc="test", smoothing=0, mininterval=1.0)
142 | start_time = dt.datetime.now()
143 | for i, (x, y) in enumerate(tk0):
144 | x = x.to(self.device)
145 | y = y.to(self.device)
146 | y_pred = self.model(x, current_epoch=None, current_step=i)
147 | targets.extend(y.tolist())
148 | predicts.extend(y_pred.tolist())
149 | end_time = dt.datetime.now()
150 | print('infer time: {} s'.format((end_time - start_time).total_seconds()))
151 | for evaluate_fn in evaluate_fns:
152 | if evaluate_fn == 'auc':
153 | auc = roc_auc_score(targets, predicts)
154 | print('test auc:', auc)
155 | elif evaluate_fn == 'logloss':
156 | logloss = log_loss(targets, predicts)
157 | print('test logloss:', logloss)
158 | return auc
159 |
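
The lpfs branch in train_one_epoch above applies a soft-threshold (proximal) step to the gate parameter after the optimizer update: entries within +-thr of zero are set exactly to zero, and the remaining entries are shrunk toward zero by thr. A minimal tensor sketch of the same update, with a synthetic gate and an exaggerated threshold for illustration (the trainer uses thr = 0.01 * learning_rate):

import torch

p = torch.tensor([0.50, 0.004, -0.20, -0.003])    # synthetic gate values
thr = 0.01                                        # exaggerated for illustration
in1, in2 = p > thr, p < -thr
in3 = ~(in1 | in2)
p[in1] -= thr                                     # shrink positive gates toward zero
p[in2] += thr                                     # shrink negative gates toward zero
p[in3] = 0.0                                      # clamp near-zero gates exactly to zero
print(p)                                          # tensor([ 0.4900,  0.0000, -0.1900,  0.0000])
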
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import torch
4 | import os
5 | import copy
6 | import importlib.util
7 | import datetime
8 |
9 | def seed_everything(seed):
10 | random.seed(seed)
11 | np.random.seed(seed)
12 | torch.manual_seed(seed)
13 | torch.cuda.manual_seed_all(seed)
14 |
15 | def get_model(model_name: str, model_type: str):
16 | """
17 | Automatically select model class based on model name
18 |
19 | Args:
20 | model_name (str): model name
21 | model_type (str): rec, fs, es
22 |
23 | Returns:
24 | Recommender: model class
25 | Dict: model configuration dict
26 | """
27 | model_file_name = model_name.lower()
28 | model_module = None
29 | module_path = '.'.join(['models', model_type, model_file_name])
30 | if importlib.util.find_spec(module_path, __name__):
31 | model_module = importlib.import_module(module_path, __name__)
32 | else:
33 | raise ValueError(f'`model_name` [{model_name}] is not the name of an existing model.')
34 | model_class = getattr(model_module, model_name)
35 | # dir = os.path.dirname(model_module.__file__)
36 | # conf = dict()
37 | # fname = os.path.join(os.path.dirname(dir), 'basemodel', 'basemodel.yaml')
38 | # conf.update(parser_yaml(fname))
39 | # for name in ['all', model_file_name]:
40 | # fname = os.path.join(dir, 'config', name+'.yaml')
41 | # if os.path.isfile(fname):
42 | # conf = deep_update(conf, parser_yaml(fname))
43 | return model_class
44 |
45 |
46 | class EarlyStopper(object):
47 |     """Early stops the training if validation AUC doesn't improve after a given patience.
48 |
49 | Args:
50 | patience (int): How long to wait after last time validation auc improved.
51 | """
52 |
53 | def __init__(self, patience):
54 | self.patience = patience
55 | self.trial_counter = 0
56 | self.best_auc = 0
57 | self.best_weights = None
58 |
59 | def stop_training(self, val_auc, weights):
60 | """whether to stop training.
61 |
62 | Args:
63 | val_auc (float): auc score in val data.
64 | weights (tensor): the weights of model
65 | """
66 | if val_auc > self.best_auc:
67 | self.best_auc = val_auc
68 | self.trial_counter = 0
69 | self.best_weights = copy.deepcopy(weights)
70 | return False
71 | elif self.trial_counter + 1 < self.patience:
72 | self.trial_counter += 1
73 | return False
74 | else:
75 | return True
76 |
77 | def machine_learning_selection(args, fs, features, unique_values, data, k):
78 | train_x, train_y, val_x, val_y, test_x, test_y = data
79 | features = np.array(features)
80 | if fs == 'lasso':
81 | from sklearn.linear_model import Lasso
82 | lasso = Lasso(
83 | alpha=args.fs_config[args.fs]['alpha'],
84 | fit_intercept=args.fs_config[args.fs]['fit_intercept'],
85 | copy_X=args.fs_config[args.fs]['copy_X'],
86 | max_iter=args.fs_config[args.fs]['max_iter'],
87 | tol=args.fs_config[args.fs]['tol'],
88 | positive=args.fs_config[args.fs]['positive'],
89 | selection=args.fs_config[args.fs]['selection']
90 | )
91 | lasso.fit(train_x, train_y)
92 | field_importance = abs(lasso.coef_)
93 | rank = field_importance.argsort()[::-1]
94 | ranked_features = features[rank]
95 | ranked_importance = field_importance[rank]
96 | return np.array([ranked_features, ranked_importance])
97 | select_idx = []  # unreachable: the ranked array is already returned above
98 | for i in range(k):
99 | print(features[rank[i]], field_importance[rank[i]])
100 | select_idx.append(rank[i])
101 | return features[select_idx]
102 | elif fs == 'gbdt':
103 | from sklearn.ensemble import GradientBoostingClassifier
104 | gbdt = GradientBoostingClassifier(
105 | learning_rate=args.fs_config[args.fs]['learning_rate'],
106 | n_estimators=args.fs_config[args.fs]['n_estimators'],
107 | subsample=args.fs_config[args.fs]['subsample'],
108 | min_samples_split=args.fs_config[args.fs]['min_samples_split'],
109 | min_samples_leaf=args.fs_config[args.fs]['min_samples_leaf'],
110 | min_weight_fraction_leaf=args.fs_config[args.fs]['min_weight_fraction_leaf'],
111 | max_depth=args.fs_config[args.fs]['max_depth'],
112 | n_iter_no_change=args.fs_config[args.fs]['n_iter_no_change'],
113 | verbose=1
114 | )
115 | gbdt.fit(train_x, train_y)
116 | field_importance = gbdt.feature_importances_
117 | rank = field_importance.argsort()[::-1]
118 | ranked_features = features[rank]
119 | ranked_importance = field_importance[rank]
120 | return np.array([ranked_features, ranked_importance])
121 | select_idx = []  # unreachable: the ranked array is already returned above
122 | for i in range(k):
123 | print(features[rank[i]], field_importance[rank[i]])
124 | select_idx.append(rank[i])
125 | return features[select_idx]
126 | elif fs == 'gbr':
127 | from sklearn.ensemble import GradientBoostingRegressor
128 | gbr = GradientBoostingRegressor()
129 | gbr.fit(train_x, train_y)
130 | field_importance = gbr.feature_importances_
131 | rank = field_importance.argsort()[::-1]
132 | ranked_features = features[rank]
133 | ranked_importance = field_importance[rank]
134 | return np.array([ranked_features, ranked_importance])
135 | select_idx = []  # unreachable: the ranked array is already returned above
136 | for i in range(k):
137 | print(features[rank[i]], field_importance[rank[i]])
138 | select_idx.append(rank[i])
139 | return features[select_idx]
140 | elif fs == 'pca':
141 | from sklearn.decomposition import PCA
142 | pca = PCA(n_components=k)
143 | pca.fit(train_x)
144 | # use the absolute loadings of the first principal component as field importances
145 | field_importance = abs(pca.components_[0])
146 | rank = field_importance.argsort()[::-1]
147 | ranked_features = features[rank]
148 | ranked_importance = field_importance[rank]
149 | return np.array([ranked_features, ranked_importance])
150 | select_idx = []  # unreachable: the ranked array is already returned above
151 | for i in range(k):
152 | print(features[rank[i]], field_importance[rank[i]])
153 | select_idx.append(rank[i])
154 | return features[select_idx]
155 | elif fs == 'permutation':
156 | from sklearn.ensemble import GradientBoostingClassifier
157 | from sklearn.linear_model import Ridge
158 | from sklearn.inspection import permutation_importance
159 | from sklearn.neural_network import MLPClassifier
160 | from sklearn.linear_model import LogisticRegression
161 | from sklearn.ensemble import RandomForestClassifier
162 | model = RandomForestClassifier(n_estimators=10, max_depth=None, n_jobs=6, verbose=1).fit(train_x, train_y)
163 | # model = LogisticRegression(verbose=1,multi_class='ovr',n_jobs=32).fit(train_x, train_y)
164 | # model = MLPClassifier(verbose=True, early_stopping=True, n_iter_no_change=3, hidden_layer_sizes=(16,16)).fit(train_x, train_y)
165 | field_importance = permutation_importance(model, train_x, train_y, n_jobs=5)
166 | rank = field_importance.importances_mean.argsort()[::-1]
167 | ranked_features = features[rank]
168 | ranked_importance = field_importance.importances_mean[rank]
169 | return np.array([ranked_features, ranked_importance])
170 | elif fs == 'rf':
171 | from sklearn.ensemble import RandomForestClassifier
172 | model = RandomForestClassifier(n_estimators=10, max_depth=None, n_jobs=6, verbose=1).fit(train_x, train_y)
173 | field_importance = model.feature_importances_
174 | rank = field_importance.argsort()[::-1]
175 | ranked_features = features[rank]
176 | ranked_importance = field_importance[rank]
177 | return np.array([ranked_features, ranked_importance])
178 | elif fs == 'xgb':
179 | from xgboost import XGBClassifier
180 | model = XGBClassifier(n_estimators=10, max_depth=None, n_jobs=6, verbosity=1).fit(train_x, train_y)  # XGBClassifier takes 'verbosity' (0-3), not 'verbose'
181 | field_importance = model.feature_importances_
182 | rank = field_importance.argsort()[::-1]
183 | ranked_features = features[rank]
184 | ranked_importance = field_importance[rank]
185 | return np.array([ranked_features, ranked_importance])
186 | else: raise ValueError(f'unknown feature selection method: {fs}')  # guard against unsupported --fs values
187 |
188 | def print_time(message):
189 | print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S '), message)
190 |
191 | def str2bool(v):
192 | if isinstance(v, bool):
193 | return v
194 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
195 | return True
196 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
197 | return False
198 | else:
199 | raise argparse.ArgumentTypeError('Boolean value expected.')
--------------------------------------------------------------------------------
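A minimal usage sketch for the helpers in utils.py, assuming the repository root is on the Python path; the dummy weights, the AUC sequence, and the 'DeepFM' class name are illustrative assumptions, not repository code:

import torch
from utils.utils import seed_everything, get_model, EarlyStopper, str2bool

seed_everything(42)                              # fix all RNG seeds for reproducibility

# ModelClass = get_model('DeepFM', 'rec')        # would import models/rec/deepfm.py and return the class named 'DeepFM'

early_stopper = EarlyStopper(patience=3)
dummy_weights = {'w': torch.zeros(1)}            # stand-in for model.state_dict()
for epoch, val_auc in enumerate([0.70, 0.72, 0.71, 0.71, 0.71]):
    if early_stopper.stop_training(val_auc, dummy_weights):
        print(f'stop at epoch {epoch}, best val auc {early_stopper.best_auc}')
        break

print(str2bool('yes'))                           # True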