├── LICENSE ├── README.md ├── cache └── .gitignore ├── ckp └── .gitignore ├── data ├── beijing │ ├── airbnb_clean.csv │ └── house_clean.csv └── hdb │ ├── hdb_clean.csv │ └── school_clean.csv ├── environment.yml ├── log └── .gitignore ├── runs └── .gitignore └── src ├── align ├── __init__.py ├── cal_sim_score.py ├── exact_align_game.py └── sim_align_game.py ├── metric ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── accuracy.cpython-38.pyc │ ├── base.cpython-38.pyc │ ├── mae.cpython-38.pyc │ ├── r2_score.cpython-38.pyc │ └── rmse.cpython-38.pyc ├── accuracy.py ├── base.py ├── mae.py ├── r2_score.py └── rmse.py ├── model ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── __init__.cpython-39.pyc ├── base │ ├── DLRM.py │ ├── MLP.py │ ├── SplitNN.py │ ├── __init__.py │ └── __pycache__ │ │ ├── DLRM.cpython-38.pyc │ │ ├── MLP.cpython-38.pyc │ │ ├── SplitNN.cpython-38.pyc │ │ └── __init__.cpython-38.pyc ├── game │ ├── GameMergeSimModel.py │ └── __init__.py └── vertical_fl │ ├── AvgSimModel.py │ ├── ConcatSimModel.py │ ├── ExactModel.py │ ├── FeatureSimModel.py │ ├── FedSimModel.py │ ├── MergeSimModel.py │ ├── MergeSimModelV2.py │ ├── OnePartyModel.py │ ├── OrderSimModel.py │ ├── SimModel.py │ ├── SimModel.py.bak │ ├── ThresholdSimModel.py │ ├── Top1SimModel.py │ ├── TwoPartyModel.py │ ├── __init__.py │ └── __pycache__ │ ├── ExactModel.cpython-38.pyc │ ├── FeatureSimModel.cpython-38.pyc │ ├── FedSimModel.cpython-38.pyc │ ├── FedSimModel.cpython-39.pyc │ ├── MergeSimModel.cpython-38.pyc │ ├── OnePartyModel.cpython-38.pyc │ ├── SimModel.cpython-38.pyc │ ├── SimModel.cpython-39.pyc │ ├── Top1SimModel.cpython-38.pyc │ ├── TwoPartyModel.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ └── __init__.cpython-39.pyc ├── preprocess ├── __init__.py ├── __pycache__ │ └── __init__.cpython-38.pyc ├── add │ └── __init__.py ├── beijing │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── beijing_loder.cpython-38.pyc │ ├── beijing_loder.py │ ├── clean_airbnb.py │ └── clean_house.py ├── company │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── company_loader.cpython-38.pyc │ ├── clean_company.py │ └── company_loader.py ├── game.old │ ├── __init__.py │ ├── negative_sample_steam_interact.py │ ├── sample_steam_interact.py │ ├── split_data.py │ └── steam_ign_loader.py ├── game │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── game_loader.cpython-38.pyc │ ├── clean_rawg.py │ ├── clean_steam.py │ └── game_loader.py ├── gesis │ ├── __init__.py │ └── sav_to_csv.py ├── hdb │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── hdb_loader.cpython-38.pyc │ ├── clean_hdb.py │ ├── clean_school.py │ └── hdb_loader.py ├── krebs │ ├── __init__.py │ └── generate_dataset.py ├── ml_dataset │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── frog.cpython-38.pyc │ │ ├── miniboone.cpython-38.pyc │ │ └── two_party_loader.cpython-38.pyc │ ├── frog.py │ ├── miniboone.py │ └── two_party_loader.py ├── nytaxi │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── ny_loader.cpython-38.pyc │ ├── clean_airbnb.py │ ├── clean_citibike.py │ ├── clean_tlc.py │ ├── filter_kaggle.py │ └── ny_loader.py ├── sklearn │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── syn_data_generator.cpython-38.pyc │ └── syn_data_generator.py └── song │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── song_loader.cpython-38.pyc │ ├── clean_fma.py │ ├── 
clean_msd.py │ └── song_loader.py ├── priv_scripts ├── train_beijing_A.py ├── train_beijing_avgsim.py ├── train_beijing_featuresim.py ├── train_beijing_fedsim.py ├── train_beijing_top1sim.py ├── train_hdb_A.py ├── train_hdb_avgsim.py ├── train_hdb_featuresim.py ├── train_hdb_fedsim.py └── train_hdb_top1sim.py ├── train_beijing_A.py ├── train_beijing_B.py ├── train_beijing_avgsim.py ├── train_beijing_featuresim.py ├── train_beijing_fedsim.py ├── train_beijing_fedsim_multi.py ├── train_beijing_top1sim.py ├── train_hdb_A.py ├── train_hdb_B.py ├── train_hdb_avgsim.py ├── train_hdb_featuresim.py ├── train_hdb_fedsim.py ├── train_hdb_top1sim.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc ├── privacy.cpython-38.pyc └── utils.cpython-38.pyc ├── privacy.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FedSim 2 | [![GitHub license](https://img.shields.io/github/license/Xtra-Computing/FedSim)](https://github.com/Xtra-Computing/FedSim/edit/main/LICENSE) 3 | ![PyTorch](https://img.shields.io/badge/torch-1.8.2-orange) 4 | 5 | 6 | 7 | FedSim is a **coupled vertical federated learning framework** that boosts the training with record similarities. 8 | 9 | 10 | ## Requirements 11 | 1. 
Install conda 4.14 by following https://www.anaconda.com/products/distribution 12 | 2. Clone this repo by 13 | ```bash 14 | git clone https://github.com/JerryLife/FedSim.git 15 | ``` 16 | 3. Create the environment (named `fedsim`) and install the required basic modules. 17 | ```bash 18 | conda env create -f environment.yml 19 | conda activate fedsim 20 | ``` 21 | 4. Install `torch` and `torchvision` with `pip` according to your CUDA version. For RTX 3090, we installed `torch==1.8.2` and `torchvision==0.9.2` as below. 22 | ```bash 23 | pip3 install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu111 24 | ``` 25 | 5. Ensure all the required folders are created (they should already exist after cloning). 26 | ```bash 27 | mkdir -p runs ckp log cache 28 | ``` 29 | ## Datasets 30 | In this repo, due to the size limit, we include two datasets, `house` and `hdb`, in the `data/` folder. 31 | ``` 32 | data 33 | ├── beijing (house) 34 | │ ├── airbnb_clean.csv (Secondary) 35 | │ └── house_clean.csv (Primary) 36 | └── hdb (hdb) 37 | ├── hdb_clean.csv (Primary) 38 | └── school_clean.csv (Secondary) 39 | ``` 40 | ## Linkage and Training 41 | The linkage and training of each dataset are combined in a single script. 42 | ### FedSim without adding noise 43 | The scripts without added noise are located under `src/` and are named `src/train_<dataset>_<method>.py`. You can run each script with 44 | 45 | 46 | > python src/train_<dataset>_<method>.py [-g gpu_index] [-p perturbed_noise_on_similarity] [-k number_of_neighbors] [--mlp-merge] [-ds] [-dw] 47 | 48 | * `-g/--gpu`: GPU index to run this script on. If the GPU with this index is not available, the CPU will be used instead. 49 | * `-k/--top-k`: Number of neighbors to extract from possible matches, which should be less than the value of `knn_k`. ($K$ in the paper) 50 | * `-p/--leak-p`: The leakage probability of the Bloom filters. ($\tau$ in the paper) 51 | * `--mlp-merge`: whether to replace the CNN merge model with an MLP merge model 52 | * `-ds/--disable-sort`: whether to disable the sort gate 53 | * `-dw/--disable-weight`: whether to disable the weight gate 54 | 55 | Taking the house dataset as an example: 56 | ```bash 57 | python src/train_beijing_fedsim.py -g 1 -p 1e0 -k 5 -ds 58 | ``` 59 | runs FedSim on the house dataset with $\tau=1$ (no added noise), $K=5$, CNN merging, the sort gate disabled, and the weight gate enabled. 60 | 61 | ### FedSim with noise added 62 | The scripts with added noise are located in `src/priv_scripts` and follow the same format as the scripts without noise; the only differences are some hyperparameter settings. You can run these scripts with a similar command. For example, 63 | ```bash 64 | python src/priv_scripts/train_beijing_fedsim.py -g 1 -p 1e-2 -k 5 -ds 65 | ``` 66 | runs FedSim on the house dataset with noise satisfying $\tau=0.01$, $K=5$, CNN merging, the sort gate disabled, and the weight gate enabled. A sweep over several privacy budgets is sketched below.
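Several privacy budgets can also be evaluated in one go. The snippet below is a minimal sketch (not part of the repository's scripts) that sweeps `-p` over a few values on the hdb dataset using the existing `src/priv_scripts/train_hdb_fedsim.py`; the GPU index and `-k` value are placeholder choices.

```bash
# Minimal sketch: sweep the Bloom-filter leakage probability (tau in the paper)
# using the privacy-aware hdb script shipped in src/priv_scripts.
for p in 1e-1 1e-2 1e-3; do
    python src/priv_scripts/train_hdb_fedsim.py -g 0 -p "$p" -k 5
done
```

67 | 68 | ## Citation 69 | ```bib 70 | @inproceedings{NEURIPS2022_84b74416, 71 | author = {Wu, Zhaomin and Li, Qinbin and He, Bingsheng}, 72 | booktitle = {Advances in Neural Information Processing Systems}, 73 | editor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A.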
Oh}, 74 | pages = {21087--21100}, 75 | publisher = {Curran Associates, Inc.}, 76 | title = {A Coupled Design of Exploiting Record Similarity for Practical Vertical Federated Learning}, 77 | url = {https://proceedings.neurips.cc/paper_files/paper/2022/file/84b744165a0597360caad96b06e69313-Paper-Conference.pdf}, 78 | volume = {35}, 79 | year = {2022} 80 | } 81 | ``` 82 | 83 | 84 | -------------------------------------------------------------------------------- /cache/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /ckp/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: fedsim 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=conda_forge 7 | - _openmp_mutex=4.5=1_llvm 8 | - absl-py=0.12.0=pyhd8ed1ab_0 9 | - aiohttp=3.7.4=py38h497a2fe_0 10 | - argon2-cffi=20.1.0=py38h497a2fe_2 11 | - async-timeout=3.0.1=py_1000 12 | - async_generator=1.10=py_0 13 | - attrs=20.3.0=pyhd3deb0d_0 14 | - backcall=0.2.0=pyh9f0ad1d_0 15 | - backports=1.0=py_2 16 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 17 | - blas=2.108=mkl 18 | - blas-devel=3.9.0=8_mkl 19 | - bleach=3.3.0=pyh44b312d_0 20 | - blinker=1.4=py_1 21 | - bottleneck=1.3.2=py38h5c078b8_3 22 | - brotlipy=0.7.0=py38h497a2fe_1001 23 | - c-ares=1.17.1=h7f98852_1 24 | - ca-certificates=2021.10.26=h06a4308_2 25 | - cached-property=1.5.2=hd8ed1ab_1 26 | - cached_property=1.5.2=pyha770c72_1 27 | - cachetools=4.2.1=pyhd8ed1ab_0 28 | - certifi=2021.10.8=py38h06a4308_2 29 | - cffi=1.14.5=py38ha65f79e_0 30 | - chardet=4.0.0=py38h578d9bd_1 31 | - click=7.1.2=pyh9f0ad1d_0 32 | - cryptography=3.4.7=py38ha5dfef3_0 33 | - cudatoolkit=11.0.3=h15472ef_8 34 | - cycler=0.10.0=py_2 35 | - dbus=1.13.18=hb2f20db_0 36 | - decorator=4.4.2=py_0 37 | - defusedxml=0.7.1=pyhd8ed1ab_0 38 | - entrypoints=0.3=pyhd8ed1ab_1003 39 | - expat=2.3.0=h9c3ff4c_0 40 | - faiss=1.7.0=py38cuda110h60a57df_5_cuda 41 | - faiss-gpu=1.7.0=h788eb59_5 42 | - fontconfig=2.13.1=hba837de_1004 43 | - freetype=2.10.4=h0708190_1 44 | - gettext=0.19.8.1=h0b5b191_1005 45 | - glib=2.68.0=h9c3ff4c_2 46 | - glib-tools=2.68.0=h9c3ff4c_2 47 | - google-auth=1.28.0=pyh44b312d_0 48 | - google-auth-oauthlib=0.4.1=py_2 49 | - grpcio=1.36.1=py38hdd6454d_0 50 | - gst-plugins-base=1.14.0=hbbd80ab_1 51 | - gstreamer=1.14.0=h28cd5cc_2 52 | - h5py=3.1.0=nompi_py38hafa665b_100 53 | - hdf5=1.10.6=nompi_h6a2412b_1114 54 | - icu=58.2=hf484d3e_1000 55 | - idna=2.10=pyh9f0ad1d_0 56 | - importlib-metadata=3.10.0=py38h578d9bd_0 57 | - ipykernel=5.5.3=py38hd0cf306_0 58 | - ipython=7.23.0=py38hd0cf306_0 59 | - ipython_genutils=0.2.0=py_1 60 | - ipywidgets=7.6.3=pyhd3deb0d_0 61 | - jedi=0.18.0=py38h578d9bd_2 62 | - jinja2=2.11.3=pyh44b312d_0 63 | - joblib=1.0.1=pyhd8ed1ab_0 64 | - jpeg=9b=h024ee3a_2 65 | - jsonschema=3.2.0=pyhd8ed1ab_3 66 | - jupyter=1.0.0=py38h578d9bd_6 67 | - jupyter_client=6.1.12=pyhd8ed1ab_0 68 | - jupyter_console=6.4.0=pyhd8ed1ab_0 69 | - jupyter_core=4.7.1=py38h578d9bd_0 70 | - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 71 | - 
jupyterlab_widgets=1.0.0=pyhd8ed1ab_1 72 | - kiwisolver=1.3.1=py38h1fd1430_1 73 | - krb5=1.17.2=h926e7f8_0 74 | - lcms2=2.11=h396b838_0 75 | - ld_impl_linux-64=2.35.1=hea4e1c9_2 76 | - libblas=3.9.0=8_mkl 77 | - libcblas=3.9.0=8_mkl 78 | - libclang=11.1.0=default_ha53f305_0 79 | - libcurl=7.76.1=hc4aaa36_0 80 | - libedit=3.1.20191231=he28a2e2_2 81 | - libev=4.33=h516909a_1 82 | - libevent=2.1.10=hcdb4288_3 83 | - libfaiss=1.7.0=cuda110h8045045_5_cuda 84 | - libfaiss-avx2=1.7.0=cuda110h1234567_5_cuda 85 | - libffi=3.3=h58526e2_2 86 | - libgcc-ng=9.3.0=h2828fa1_18 87 | - libgfortran-ng=9.3.0=hff62375_18 88 | - libgfortran5=9.3.0=hff62375_18 89 | - libglib=2.68.0=h3e27bee_2 90 | - libiconv=1.16=h516909a_0 91 | - liblapack=3.9.0=8_mkl 92 | - liblapacke=3.9.0=8_mkl 93 | - libllvm11=11.1.0=hf817b99_0 94 | - libnghttp2=1.43.0=h812cca2_0 95 | - libopenblas=0.3.12=pthreads_h4812303_1 96 | - libpng=1.6.37=h21135ba_2 97 | - libpq=13.1=hfd2b0eb_2 98 | - libprotobuf=3.15.6=h780b84a_0 99 | - libsodium=1.0.18=h36c2ea0_1 100 | - libssh2=1.9.0=ha56f1ee_6 101 | - libstdcxx-ng=9.3.0=h6de172a_18 102 | - libtiff=4.1.0=h2733197_1 103 | - libuuid=2.32.1=h7f98852_1000 104 | - libuv=1.41.0=h7f98852_0 105 | - libwebp-base=1.2.0=h7f98852_2 106 | - libxcb=1.13=h7f98852_1003 107 | - libxkbcommon=1.0.3=he3ba5ed_0 108 | - libxml2=2.9.10=hb55368b_3 109 | - llvm-openmp=11.1.0=h4bd325d_0 110 | - lz4-c=1.9.3=h9c3ff4c_0 111 | - markdown=3.3.4=pyhd8ed1ab_0 112 | - markupsafe=1.1.1=py38h497a2fe_3 113 | - matplotlib=3.3.4=py38h578d9bd_0 114 | - matplotlib-base=3.3.4=py38h0efea84_0 115 | - matplotlib-inline=0.1.2=pyhd8ed1ab_2 116 | - mistune=0.8.4=py38h497a2fe_1003 117 | - mkl=2020.4=h726a3e6_304 118 | - mkl-devel=2020.4=ha770c72_305 119 | - mkl-include=2020.4=h726a3e6_304 120 | - multidict=5.1.0=py38h497a2fe_1 121 | - mysql-common=8.0.23=ha770c72_1 122 | - mysql-libs=8.0.23=h935591d_1 123 | - nbclient=0.5.3=pyhd8ed1ab_0 124 | - nbconvert=6.0.7=py38h578d9bd_3 125 | - nbformat=5.1.3=pyhd8ed1ab_0 126 | - ncurses=6.2=h58526e2_4 127 | - nest-asyncio=1.5.1=pyhd8ed1ab_0 128 | - networkx=2.5=py_0 129 | - ninja=1.10.2=h4bd325d_0 130 | - nltk=3.5=py_0 131 | - notebook=6.3.0=pyha770c72_1 132 | - nspr=4.30=h9c3ff4c_0 133 | - nss=3.63=hb5efdd6_0 134 | - numpy=1.20.2=py38h9894fe3_0 135 | - oauthlib=3.0.1=py_0 136 | - olefile=0.46=pyh9f0ad1d_1 137 | - openssl=1.1.1m=h7f8727e_0 138 | - packaging=20.9=pyh44b312d_0 139 | - pandas=1.2.3=py38ha9443f7_0 140 | - pandoc=2.12=h7f98852_0 141 | - pandocfilters=1.4.2=py_1 142 | - parso=0.8.2=pyhd8ed1ab_0 143 | - pcre=8.44=he1b5a44_0 144 | - pexpect=4.8.0=pyh9f0ad1d_2 145 | - phe=1.4.0=py38h1cdfbd6_1 146 | - pickleshare=0.7.5=py_1003 147 | - pillow=8.1.2=py38he98fc37_0 148 | - pip=21.0.1=pyhd8ed1ab_0 149 | - prometheus_client=0.10.1=pyhd8ed1ab_0 150 | - prompt-toolkit=3.0.18=pyha770c72_0 151 | - prompt_toolkit=3.0.18=hd8ed1ab_0 152 | - protobuf=3.15.6=py38h709712a_0 153 | - pthread-stubs=0.4=h36c2ea0_1001 154 | - ptyprocess=0.7.0=pyhd3deb0d_0 155 | - pyasn1=0.4.8=py_0 156 | - pyasn1-modules=0.2.7=py_0 157 | - pycparser=2.20=pyh9f0ad1d_2 158 | - pygments=2.8.1=pyhd8ed1ab_0 159 | - pyjwt=2.0.1=pyhd8ed1ab_1 160 | - pyopenssl=20.0.1=pyhd8ed1ab_0 161 | - pyparsing=2.4.7=pyh9f0ad1d_0 162 | - pyqt=5.9.2=py38h05f1152_4 163 | - pyqt5-sip=4.19.18=py38h709712a_7 164 | - pyrsistent=0.17.3=py38h497a2fe_2 165 | - pysocks=1.7.1=py38h578d9bd_3 166 | - python=3.8.8=hffdb5ce_0_cpython 167 | - python-dateutil=2.8.1=py_0 168 | - python-wget=3.2=py_0 169 | - python_abi=3.8=1_cp38 170 | - pytz=2021.1=pyhd8ed1ab_0 171 | - 
pyzmq=22.0.3=py38h2035c66_1 172 | - qt=5.9.7=h5867ecd_1 173 | - qtconsole=5.0.3=pyhd8ed1ab_0 174 | - qtpy=1.9.0=py_0 175 | - readline=8.0=he28a2e2_2 176 | - regex=2021.3.17=py38h497a2fe_0 177 | - requests=2.25.1=pyhd3deb0d_0 178 | - requests-oauthlib=1.3.0=pyh9f0ad1d_0 179 | - rsa=4.7.2=pyh44b312d_0 180 | - scikit-learn=0.24.1=py38ha9443f7_0 181 | - scipy=1.6.2=py38h7b17777_0 182 | - send2trash=1.5.0=py_0 183 | - setuptools=49.6.0=py38h578d9bd_3 184 | - sip=4.19.13=py38he6710b0_0 185 | - six=1.15.0=pyh9f0ad1d_0 186 | - sortedcontainers=2.3.0=pyhd8ed1ab_0 187 | - sqlite=3.35.3=h74cdb3f_0 188 | - tabulate=0.8.9=pyhd8ed1ab_0 189 | - tensorboard=2.4.1=pyhd8ed1ab_0 190 | - tensorboard-plugin-wit=1.8.0=pyh44b312d_0 191 | - terminado=0.9.4=py38h578d9bd_0 192 | - testpath=0.4.4=py_0 193 | - threadpoolctl=2.1.0=pyh5ca1d4c_0 194 | - tk=8.6.10=h21135ba_1 195 | - tornado=6.1=py38h497a2fe_1 196 | - tqdm=4.59.0=pyhd8ed1ab_0 197 | - traitlets=5.0.5=py_0 198 | - typing-extensions=3.7.4.3=0 199 | - typing_extensions=3.7.4.3=py_0 200 | - urllib3=1.26.4=pyhd8ed1ab_0 201 | - wcwidth=0.2.5=pyh9f0ad1d_2 202 | - webencodings=0.5.1=py_1 203 | - werkzeug=1.0.1=pyh9f0ad1d_0 204 | - wheel=0.36.2=pyhd3deb0d_0 205 | - widgetsnbextension=3.5.1=py38h578d9bd_4 206 | - xorg-libxau=1.0.9=h7f98852_0 207 | - xorg-libxdmcp=1.1.3=h7f98852_0 208 | - xz=5.2.5=h516909a_1 209 | - yarl=1.6.3=py38h497a2fe_1 210 | - zeromq=4.3.4=h9c3ff4c_0 211 | - zipp=3.4.1=pyhd8ed1ab_0 212 | - zlib=1.2.11=h516909a_1010 213 | - zstd=1.4.9=ha95c52a_0 214 | - pip: 215 | - captum==0.3.1 216 | - deprecation==2.1.0 217 | - hiddenlayer==0.3 218 | - lesshash-bloomfilter==0.0.5 219 | - opencv-python==4.6.0.66 220 | - pytorch-ranger==0.1.1 221 | - torch-optimizer==0.1.0 222 | - torchinfo==1.6.3 223 | - torchsummaryx==1.3.0 224 | - torchviz==0.0.2 225 | -------------------------------------------------------------------------------- /log/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /runs/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /src/align/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/align/__init__.py -------------------------------------------------------------------------------- /src/align/cal_sim_score.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | from nltk.metrics.distance import edit_distance 7 | import pickle 8 | from tqdm import tqdm 9 | 10 | 11 | def similarity_score(a: str, b: str): 12 | return 1 - edit_distance(a, b) / max(len(a), len(b)) 13 | 14 | 15 | def cal_sim_score(steam_data_path, ign_data_path, out_sim_score_path, seed=0): 16 | random.seed(seed) 17 | np.random.seed(seed) 18 | os.environ['PYTHONHASHSEED'] = str(seed) 19 | 20 | # load raw data from steam and ign 21 | steam_data_df = pd.read_csv(steam_data_path) 22 | ign_data_df = pd.read_csv(ign_data_path) 23 | 24 | steam_title_df = steam_data_df['title'] 25 | ign_title_df = 
ign_data_df['title'] 26 | 27 | # extract data 28 | data_steam = steam_title_df.to_numpy() 29 | data_ign = ign_title_df.to_numpy() 30 | data_steam = np.unique(data_steam) 31 | data_ign = np.unique(data_ign) 32 | print("Unique steam titles: {}, unique ign titles: {}".format(len(data_steam), len(data_ign))) 33 | 34 | # blocking 35 | block_dict_steam = {} 36 | for record in data_steam: 37 | key = record[:3] 38 | if key in block_dict_steam: 39 | block_dict_steam[key].append(record) 40 | else: 41 | block_dict_steam[key] = [record] 42 | 43 | block_dict_ign = {} 44 | for record in data_ign: 45 | key = record[:3] 46 | if key in block_dict_ign: 47 | block_dict_ign[key].append(record) 48 | else: 49 | block_dict_ign[key] = [record] 50 | 51 | print("#blocks in steam: {}".format(len(block_dict_steam))) 52 | print("#blocks in ign: {}".format(len(block_dict_ign))) 53 | 54 | # Compare 55 | title_sim_scores = {} 56 | for s_key, s_block in tqdm(block_dict_steam.items()): 57 | if s_key not in block_dict_ign: 58 | continue 59 | 60 | i_block = block_dict_ign[s_key] 61 | for s_title in s_block: 62 | for i_title in i_block: 63 | sim_score = similarity_score(s_title, i_title) 64 | title_sim_scores[(s_title, i_title)] = sim_score 65 | 66 | # print("Block {} matched".format(s_key)) 67 | print("Got {} pairs".format(len(title_sim_scores))) 68 | 69 | # Save similarity scores 70 | print("Saving to {}".format(out_sim_score_path)) 71 | with open(out_sim_score_path, 'wb') as f: 72 | pickle.dump(title_sim_scores, f) 73 | print("Saved") 74 | 75 | 76 | if __name__ == '__main__': 77 | print("Calculate similarity scores for train, val & test") 78 | os.chdir(sys.path[0] + "/../../") # change working directory 79 | root = "data/" 80 | steam_data_train_path = root + "steam_data_train.csv" 81 | steam_data_val_path = root + "steam_data_val.csv" 82 | steam_data_test_path = root + "steam_data_test.csv" 83 | ign_data_path = root + "ign_game_clean.csv" 84 | out_sim_aligned_train_path = root + "sim_score_game_train.csv" 85 | out_sim_aligned_val_path = root + "sim_score_game_val.csv" 86 | out_sim_aligned_test_path = root + "sim_score_game_test.csv" 87 | cal_sim_score(steam_data_train_path, ign_data_path, out_sim_aligned_train_path) 88 | cal_sim_score(steam_data_val_path, ign_data_path, out_sim_aligned_val_path) 89 | cal_sim_score(steam_data_test_path, ign_data_path, out_sim_aligned_test_path) 90 | print("Done.") 91 | -------------------------------------------------------------------------------- /src/align/exact_align_game.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import random 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | 9 | 10 | def clean_title(title: str): 11 | game = title.lower().replace(' ', '') 12 | game = ''.join(filter(str.isalnum, game)) 13 | return game 14 | 15 | 16 | def exact_align_game(steam_data_path, ign_data_path, out_exact_aligned_path, save_unmatched=False, seed=0): 17 | """ 18 | Exact align 'game' dataset on column 'title' 19 | :param steam_data_path: path of pre-processed steam data (sampled and negative sampled) 20 | :param ign_data_path: Cleaned IGN data 21 | :param out_exact_aligned_path: Output path for the aligned data 22 | :param save_unmatched: Whether to save unmatched steam records. If True, unknown features will be N/A. 
23 | :param seed: Random seed 24 | :return: 25 | """ 26 | random.seed(seed) 27 | np.random.seed(seed) 28 | os.environ['PYTHONHASHSEED'] = str(seed) 29 | 30 | # load ign titles 31 | ign_title = set() 32 | ign_map = {} 33 | cnt = 0 34 | with open(ign_data_path, 'r') as csvfile: 35 | csv_reader = csv.reader(csvfile) 36 | next(csv_reader) 37 | for row in csv_reader: 38 | game = clean_title(row[0]) 39 | ign_title.add(game) 40 | ign_map[game] = cnt 41 | cnt += 1 42 | print('ign title num: ', len(ign_title)) 43 | 44 | # load steam titles 45 | steam_title = set() 46 | steam_map = {} 47 | cnt = 0 48 | with open(steam_data_path, 'r') as csvfile: 49 | csv_reader = csv.reader(csvfile) 50 | next(csv_reader) 51 | for row in csv_reader: 52 | game = clean_title(row[0]) 53 | steam_title.add(game) 54 | steam_map[game] = [row[1], cnt] 55 | cnt += 1 56 | print('steam title num: ', len(steam_title)) 57 | 58 | # match titles exactly 59 | align_title = ign_title.intersection(steam_title) 60 | align_title = list(align_title) 61 | print('align title num', len(align_title)) 62 | 63 | # find index of records to be aligned 64 | align_info = [] 65 | for a in align_title: 66 | align_info.append([int(steam_map[a][0]), steam_map[a][1], ign_map[a]]) 67 | align_info_df = pd.DataFrame(align_info, columns=['appid', 'steam_index', 'ign_index']) 68 | 69 | # align records from ign 70 | ign_game = pd.read_csv(ign_data_path) 71 | ign_game['ign_index'] = range(ign_game.shape[0]) 72 | ign_align = pd.merge(align_info_df, ign_game, on='ign_index', sort=False) 73 | print(ign_align) 74 | ign_align = ign_align.drop(columns=['title', 'steam_index', 'ign_index']) 75 | 76 | # align records from steam 77 | steam_data = pd.read_csv(steam_data_path) 78 | if save_unmatched: 79 | aligned_data_df = pd.merge(steam_data, ign_align, on='appid', how='left', sort=True) 80 | else: 81 | aligned_data_df = pd.merge(steam_data, ign_align, on='appid', sort=True) 82 | 83 | aligned_data_df.to_csv(out_exact_aligned_path, index=False) 84 | 85 | 86 | if __name__ == '__main__': 87 | os.chdir(sys.path[0] + "/../../") # change working directory 88 | root = "data/" 89 | steam_data_train_path = root + "steam_data_train.csv" 90 | steam_data_val_path = root + "steam_data_val.csv" 91 | steam_data_test_path = root + "steam_data_test.csv" 92 | ign_data_path = root + "ign_game_clean.csv" 93 | out_exact_aligned_train_path = root + "exact_aligned_game_unmatch_train.csv" 94 | out_exact_aligned_val_path = root + "exact_aligned_game_unmatch_val.csv" 95 | out_exact_aligned_test_path = root + "exact_aligned_game_unmatch_test.csv" 96 | exact_align_game(steam_data_train_path, ign_data_path, out_exact_aligned_train_path, save_unmatched=True) 97 | exact_align_game(steam_data_val_path, ign_data_path, out_exact_aligned_val_path, save_unmatched=True) 98 | exact_align_game(steam_data_test_path, ign_data_path, out_exact_aligned_test_path, save_unmatched=True) 99 | print("Done.") 100 | -------------------------------------------------------------------------------- /src/align/sim_align_game.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import random 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | import pickle 9 | 10 | 11 | def sim_align_game(steam_data_path, ign_data_path, sim_score_path, out_sim_aligned_path, seed=0): 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | os.environ['PYTHONHASHSEED'] = str(seed) 15 | 16 | ign_map = {} 17 | cnt 
= 0 18 | with open(ign_data_path, 'r') as csvfile: 19 | csv_reader = csv.reader(csvfile) 20 | next(csv_reader) 21 | for row in csv_reader: 22 | game = row[0] 23 | assert game not in ign_map 24 | ign_map[game] = cnt 25 | cnt += 1 26 | csvfile.close() 27 | 28 | steam_map = {} 29 | cnt = 0 30 | with open(steam_data_path, 'r') as csvfile: 31 | csv_reader = csv.reader(csvfile) 32 | next(csv_reader) 33 | for row in csv_reader: 34 | game = row[0] 35 | if game not in steam_map: 36 | steam_map[game] = int(row[1]) 37 | # else: 38 | # assert steam_map[game] == int(row[1]) 39 | cnt += 1 40 | csvfile.close() 41 | 42 | with open(sim_score_path, 'rb') as f: 43 | sim_scores = pickle.load(f) 44 | 45 | # align on titles 46 | align_info = [] 47 | num_pairs = len(sim_scores) 48 | print("Start aligning, got {} pairs".format(num_pairs)) 49 | for (s_title, i_title), score in sim_scores.items(): 50 | if s_title in steam_map: 51 | align_info.append([steam_map[s_title], ign_map[i_title], score]) 52 | else: 53 | num_pairs -= 1 54 | align_info_df = pd.DataFrame(align_info, columns=['appid', 'ign_index', 'sim_score']) 55 | print("Aligning finished. {} pairs remained.".format(num_pairs)) 56 | 57 | # merge ign records 58 | print("Merging ign records") 59 | ign_game = pd.read_csv(ign_data_path) 60 | ign_game['ign_index'] = range(ign_game.shape[0]) 61 | ign_align = pd.merge(align_info_df, ign_game, on='ign_index', sort=False) 62 | ign_align = ign_align.drop(columns=['ign_index']) 63 | ign_align.rename({'title': 'ign_title'}, axis=1, inplace=True) 64 | print("Finished merging with ign records, got {} lines".format(len(ign_align.index))) 65 | 66 | # merge steam records 67 | print("Merging steam records") 68 | steam_data = pd.read_csv(steam_data_path) 69 | steam_data.rename({'title': 'steam_title'}, axis=1, inplace=True) 70 | two_party_data_df = pd.merge(steam_data, ign_align, how='left', on='appid', sort=True) 71 | print("Finished merging with steam records, got {} lines".format(len(two_party_data_df.index))) 72 | 73 | # save aligned data to file 74 | print("Saving aligned data to file") 75 | two_party_data_df.to_csv(out_sim_aligned_path, index=False) 76 | print("Saved") 77 | 78 | 79 | if __name__ == '__main__': 80 | os.chdir(sys.path[0] + "/../../") # change working directory 81 | root = "data/" 82 | steam_data_train_path = root + "steam_data_train.csv" 83 | steam_data_val_path = root + "steam_data_val.csv" 84 | steam_data_test_path = root + "steam_data_test.csv" 85 | ign_data_path = root + "ign_game_clean.csv" 86 | sim_score_train_path = root + "sim_score_game_train.csv" 87 | sim_score_val_path = root + "sim_score_game_val.csv" 88 | sim_score_test_path = root + "sim_score_game_test.csv" 89 | out_sim_aligned_train_path = root + "sim_aligned_game_train.csv" 90 | out_sim_aligned_val_path = root + "sim_aligned_game_val.csv" 91 | out_sim_aligned_test_path = root + "sim_aligned_game_test.csv" 92 | print("Align training set") 93 | sim_align_game(steam_data_train_path, ign_data_path, sim_score_train_path, out_sim_aligned_train_path) 94 | print("Align validation set") 95 | sim_align_game(steam_data_val_path, ign_data_path, sim_score_val_path, out_sim_aligned_val_path) 96 | print("Align test set") 97 | sim_align_game(steam_data_test_path, ign_data_path, sim_score_test_path, out_sim_aligned_test_path) 98 | print("Done.") 99 | -------------------------------------------------------------------------------- /src/metric/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy 
import Accuracy 2 | from .r2_score import R2Score 3 | from .rmse import RMSE 4 | from .mae import MAE 5 | -------------------------------------------------------------------------------- /src/metric/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/metric/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/metric/__pycache__/accuracy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/metric/__pycache__/accuracy.cpython-38.pyc -------------------------------------------------------------------------------- /src/metric/__pycache__/base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/metric/__pycache__/base.cpython-38.pyc -------------------------------------------------------------------------------- /src/metric/__pycache__/mae.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/metric/__pycache__/mae.cpython-38.pyc -------------------------------------------------------------------------------- /src/metric/__pycache__/r2_score.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/metric/__pycache__/r2_score.cpython-38.pyc -------------------------------------------------------------------------------- /src/metric/__pycache__/rmse.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/metric/__pycache__/rmse.cpython-38.pyc -------------------------------------------------------------------------------- /src/metric/accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base import BaseMetric 4 | 5 | 6 | class Accuracy(BaseMetric): 7 | def __init__(self): 8 | super().__init__('Accuracy') 9 | self.worst = -1 10 | 11 | def __call__(self, pred, label): 12 | all_pred = pred.flatten() 13 | assert all_pred.shape == label.shape 14 | return np.count_nonzero(all_pred == label) / label.shape[0] 15 | -------------------------------------------------------------------------------- /src/metric/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class BaseMetric(abc.ABC): 5 | def __init__(self, name): 6 | self.name = name 7 | self.worst = None 8 | 9 | @abc.abstractmethod 10 | def __call__(self, pred, label) -> float: 11 | """ 12 | Calculate score for pred and label 13 | :param pred: Size (n_samples, n_pred_dim) 14 | :param label: Size (n_samples) 15 | :return: Score 16 | """ 17 | pass 18 | 19 | -------------------------------------------------------------------------------- /src/metric/mae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
sklearn.metrics import mean_absolute_error 3 | 4 | from .base import BaseMetric 5 | 6 | 7 | class MAE(BaseMetric): 8 | def __init__(self): 9 | super().__init__("MAE") 10 | self.worst = np.inf 11 | 12 | def __call__(self, pred, label): 13 | all_pred = pred.flatten() 14 | assert all_pred.shape == label.shape 15 | return mean_absolute_error(label, all_pred) 16 | -------------------------------------------------------------------------------- /src/metric/r2_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import r2_score 3 | 4 | from .base import BaseMetric 5 | 6 | 7 | class R2Score(BaseMetric): 8 | def __init__(self): 9 | super().__init__('R2_Score') 10 | self.worst = -np.inf 11 | 12 | def __call__(self, pred, label): 13 | all_pred = pred.flatten() 14 | assert all_pred.shape == label.shape 15 | return r2_score(label, all_pred) 16 | -------------------------------------------------------------------------------- /src/metric/rmse.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error 3 | 4 | from .base import BaseMetric 5 | 6 | 7 | class RMSE(BaseMetric): 8 | def __init__(self): 9 | super().__init__('RMSE') 10 | self.worst = np.inf 11 | 12 | def __call__(self, pred, label): 13 | all_pred = pred.flatten() 14 | assert all_pred.shape == label.shape 15 | return np.sqrt(mean_squared_error(label, all_pred)) 16 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/__init__.py -------------------------------------------------------------------------------- /src/model/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /src/model/base/DLRM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DLRM(nn.Module): 6 | def __init__(self, top_mlp_units, dense_mlp_units, emb_dim, counts, denses): 7 | super().__init__() 8 | num_fea = len(counts) + len(denses) 9 | self.num_cat = len(counts) 10 | self.num_dense = len(denses) 11 | 12 | embs = [nn.Embedding(cnt, emb_dim) for cnt in counts] 13 | self.embs = nn.ModuleList(embs) 14 | 15 | dense_mlps = [] 16 | for d in denses: 17 | mlp = [] 18 | prev = d 19 | for unit in dense_mlp_units: 20 | mlp.append(nn.Linear(prev, unit)) 21 | mlp.append(nn.LeakyReLU()) 22 | prev = unit 23 | mlp.append(nn.Linear(prev, emb_dim)) 24 | mlp.append(nn.LeakyReLU()) 25 | dense_mlps.append(nn.Sequential(*mlp)) 26 | self.dense_mlps = nn.ModuleList(dense_mlps) 27 | 28 | top_mlp = [] 29 | # prev = 30 | prev = emb_dim * self.num_dense + 
int(num_fea * (num_fea - 1) / 2) 31 | for unit in top_mlp_units: 32 | top_mlp.append(nn.Linear(prev, unit)) 33 | top_mlp.append(nn.LeakyReLU()) 34 | prev = unit 35 | top_mlp.append(nn.Dropout(0.5)) 36 | top_mlp.append(nn.Linear(prev, 1)) 37 | top_mlp.append(nn.Sigmoid()) 38 | self.top_mlp = nn.Sequential(*top_mlp) 39 | 40 | def forward(self, raw_inputs): 41 | inputs = [x for x in raw_inputs.T] 42 | 43 | cat_embs = [] 44 | dense_embs = [] 45 | 46 | for i in range(self.num_cat): 47 | emb = self.embs[i](inputs[i].long()) 48 | # emb = self.cat_mlps[i](emb) 49 | cat_embs.append(emb) 50 | 51 | for i in range(self.num_dense): 52 | emb = self.dense_mlps[i](inputs[self.num_cat + i].reshape(-1, 1).float()) 53 | dense_embs.append(emb) 54 | 55 | # out = torch.cat(cat_embs + dense_embs, dim=1) 56 | out = self.interact_features(dense_embs, cat_embs) 57 | out = self.top_mlp(out) 58 | out = torch.flatten(out) 59 | 60 | return out 61 | 62 | def interact_features(self, x, ly): 63 | # concatenate dense and sparse features 64 | (batch_size, d) = x[0].shape 65 | T = torch.cat(x + ly, dim=1).view((batch_size, -1, d)) 66 | 67 | # perform a dot product 68 | Z = torch.bmm(T, torch.transpose(T, 1, 2)) 69 | _, ni, nj = Z.shape 70 | offset = 0 71 | li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) 72 | lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) 73 | Zflat = Z[:, li, lj] 74 | 75 | # concatenate dense features and interactions 76 | R = torch.cat(x + [Zflat], dim=1) 77 | return R 78 | 79 | def _initialize_weights(self): 80 | for m in self.modules(): 81 | if isinstance(m, nn.Linear): 82 | nn.init.xavier_uniform_(m.weight) 83 | nn.init.zeros_(m.bias) 84 | elif isinstance(m, nn.Embedding): 85 | nn.init.uniform_(m.weight, -0.05, 0.05) -------------------------------------------------------------------------------- /src/model/base/MLP.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class MLP(nn.Module): 7 | def __init__(self, input_size, hidden_sizes: list, output_size=1, activation=None): 8 | super(MLP, self).__init__() 9 | self.hidden_sizes = hidden_sizes 10 | self.activation = activation 11 | if len(hidden_sizes) != 0: 12 | self.fc_layers = nn.ModuleList([nn.Linear(input_size, hidden_sizes[0])]) 13 | for i in range(len(hidden_sizes) - 1): 14 | self.fc_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1])) 15 | self.fc_layers.append(nn.Linear(hidden_sizes[-1], output_size)) 16 | else: 17 | self.fc_layers = nn.ModuleList([nn.Linear(input_size, output_size)]) 18 | 19 | def forward(self, X): 20 | if len(list(self.fc_layers)) == 0: 21 | return X 22 | 23 | if len((list(self.fc_layers))) == 1: 24 | out = X 25 | else: 26 | out = F.relu(self.fc_layers[0](X)) 27 | 28 | for fc in self.fc_layers[1:-1]: 29 | out = F.relu(fc(out)) 30 | 31 | if self.activation == 'sigmoid': 32 | out = torch.sigmoid(self.fc_layers[-1](out)) 33 | elif self.activation == 'tanh': 34 | out = torch.tanh(self.fc_layers[-1](out)) 35 | elif self.activation == 'relu': 36 | out = torch.relu(self.fc_layers[-1](out)) 37 | elif self.activation is None: 38 | out = self.fc_layers[-1](out) 39 | else: 40 | assert False 41 | return out 42 | 43 | 44 | class DropoutInputMLP(MLP): 45 | def __init__(self, dropout_rate=0.5, *args, **kwargs): 46 | super().__init__(*args, **kwargs) 47 | self.dropout = nn.Dropout(dropout_rate) 48 | 49 | def forward(self, X): 50 | out = self.dropout(X) 51 | return 
super().forward(out) 52 | -------------------------------------------------------------------------------- /src/model/base/SplitNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class SplitNN(nn.Module): 7 | def __init__(self, local_models, local_input_dims, agg_model): 8 | super().__init__() 9 | self.local_input_dims = local_input_dims 10 | self.agg_model = agg_model 11 | self.local_models = nn.ModuleList(local_models) 12 | 13 | self.n_local_models = len(self.local_models) 14 | assert self.n_local_models == len(self.local_input_dims) 15 | 16 | @staticmethod 17 | def split_features(x, dims: list): 18 | """ 19 | split features of x according to dims 20 | :param x: two-dimensional matrix 21 | :param dims: list of int 22 | :return: 23 | """ 24 | assert sum(dims) == x.shape[1], f"{sum(dims)=}, {x.shape[1]=}" 25 | xs = [] 26 | s_pos = 0 27 | for dim in dims: 28 | xs.append(x[:, s_pos: s_pos+dim]) 29 | s_pos += dim 30 | return xs 31 | 32 | def forward(self, X): 33 | Xs = self.split_features(X, self.local_input_dims) 34 | local_out = torch.cat([self.local_models[i](Xs[i]) 35 | for i in range(self.n_local_models)], dim=1) 36 | if self.agg_model is not None: 37 | out = self.agg_model(local_out) 38 | else: 39 | out = local_out 40 | return out 41 | -------------------------------------------------------------------------------- /src/model/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .DLRM import DLRM 2 | from .MLP import MLP, DropoutInputMLP 3 | from .SplitNN import SplitNN 4 | -------------------------------------------------------------------------------- /src/model/base/__pycache__/DLRM.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/base/__pycache__/DLRM.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/base/__pycache__/MLP.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/base/__pycache__/MLP.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/base/__pycache__/SplitNN.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/base/__pycache__/SplitNN.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/base/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/base/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/game/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/game/__init__.py -------------------------------------------------------------------------------- 
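A note on the `ThresholdSimModel` source that follows: its `match` override delegates to the parent `SimModel.match` with a similarity threshold and then strips the leading similarity-score column from the matched arrays. A minimal, self-contained sketch of that filtering idea (hypothetical arrays, not the repository's API):

```python
import numpy as np

# Each matched row is assumed to start with a similarity score,
# followed by the linked features (a simplification of the real output).
matched = np.array([[0.95, 1.0, 2.0],
                    [0.40, 3.0, 4.0],
                    [0.85, 5.0, 6.0]])
sim_threshold = 0.8

kept = matched[matched[:, 0] >= sim_threshold]  # keep pairs above the threshold
kept_features = kept[:, 1:]                     # drop the similarity column
```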
/src/model/vertical_fl/ThresholdSimModel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import abc 3 | import pickle 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.cluster import MiniBatchKMeans 8 | from sklearn.preprocessing import MinMaxScaler 9 | 10 | from tqdm import tqdm 11 | import deprecation 12 | 13 | from .SimModel import SimModel 14 | 15 | 16 | class ThresholdSimModel(SimModel): 17 | def __init__(self, num_common_features, sim_threshold=0.0, **kwargs): 18 | super().__init__(num_common_features, **kwargs) 19 | self.sim_threshold = sim_threshold 20 | 21 | def match(self, data1, data2, labels, idx=None, preserve_key=False, sim_threshold=0.0, grid_min=-3., grid_max=3.01, 22 | grid_width=0.2, knn_k=3, tree_leaf_size=40, radius=0.1) -> tuple: 23 | [matched_data1, matched_data2], ordered_labels, data_indices = \ 24 | super().match(data1, data2, labels, idx=idx, preserve_key=preserve_key, sim_threshold=self.sim_threshold) 25 | # remove similarity score from data 26 | return (matched_data1[:, 1:], matched_data2[:, 1:]), ordered_labels, data_indices -------------------------------------------------------------------------------- /src/model/vertical_fl/TwoPartyModel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import abc 3 | import pickle 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.cluster import MiniBatchKMeans 8 | from sklearn.preprocessing import MinMaxScaler 9 | 10 | from tqdm import tqdm 11 | import deprecation 12 | 13 | from .OnePartyModel import BaseModel 14 | 15 | 16 | class TwoPartyBaseModel(abc.ABC, BaseModel): 17 | def __init__(self, num_common_features, drop_key=True, grid_min=-3., grid_max=3.01, grid_width=0.2, 18 | knn_k=3, tree_leaf_size=40, kd_tree_radius=0.1, 19 | dataset_type='syn', **kwargs): 20 | 21 | super().__init__(**kwargs) 22 | assert dataset_type in ['syn', 'real'] 23 | self.dataset_type = dataset_type 24 | self.drop_key = drop_key 25 | self.num_common_features = num_common_features 26 | self.tree_radius = kd_tree_radius 27 | self.tree_leaf_size = tree_leaf_size 28 | self.knn_k = knn_k 29 | self.grid_min = grid_min 30 | self.grid_max = grid_max 31 | self.grid_width = grid_width 32 | self.sim_scaler = None 33 | 34 | @abc.abstractmethod 35 | def match(self, data1, data2, labels, idx=None, preserve_key=False, sim_threshold=0.0, 36 | grid_min=-3., grid_max=3.01, grid_width=0.2, knn_k=3, tree_leaf_size=40, radius=0.1) -> tuple: 37 | """ 38 | Match the data of two parties, return the matched data 39 | :param radius: 40 | :param knn_k: 41 | :param tree_leaf_size: 42 | :param idx: Index of data1, only for evaluation. It should not be involved in linkage. 
43 | :param sim_threshold: threshold of similarity score, everything below the threshold will be removed 44 | :param data1: data in party 1 45 | :param data2: data in party 2 46 | :param labels: labels (in party 1) 47 | :param preserve_key: whether to preserve common features in the output 48 | :return: [matched_data1, matched_data2], matched_labels 49 | Each line refers to one sample 50 | """ 51 | raise NotImplementedError 52 | 53 | # def train_combine(self, data1, data2, labels, data_cache_path=None): 54 | # train_X, val_X, test_X, train_y, val_y, test_y, train_idx, val_idx, test_idx = \ 55 | # self.prepare_train_combine(data1, data2, labels, data_cache_path) 56 | # 57 | # return self._train(train_X, val_X, test_X, train_y, val_y, test_y, 58 | # train_idx[:, 0], val_idx[:, 0], test_idx[:, 0]) 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/model/vertical_fl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__init__.py -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/ExactModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/ExactModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/FeatureSimModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/FeatureSimModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/FedSimModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/FedSimModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/FedSimModel.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/FedSimModel.cpython-39.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/MergeSimModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/MergeSimModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/OnePartyModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/OnePartyModel.cpython-38.pyc -------------------------------------------------------------------------------- 
/src/model/vertical_fl/__pycache__/SimModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/SimModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/SimModel.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/SimModel.cpython-39.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/Top1SimModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/Top1SimModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/TwoPartyModel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/TwoPartyModel.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/model/vertical_fl/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/model/vertical_fl/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /src/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/__init__.py -------------------------------------------------------------------------------- /src/preprocess/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/add/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/add/__init__.py -------------------------------------------------------------------------------- /src/preprocess/beijing/__init__.py: -------------------------------------------------------------------------------- 1 | from .beijing_loder import * -------------------------------------------------------------------------------- 
/src/preprocess/beijing/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/beijing/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/beijing/__pycache__/beijing_loder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/beijing/__pycache__/beijing_loder.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/beijing/beijing_loder.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from utils import move_item_to_start_, move_item_to_end_ 4 | 5 | 6 | def load_house(house_path): 7 | print("Loading house from {}".format(house_path)) 8 | house_data = pd.read_csv(house_path) 9 | 10 | house_data.drop(columns=['lon', 'lat'], inplace=True) 11 | 12 | house_data.info(verbose=True) 13 | 14 | labels = house_data['totalPrice'].to_numpy() 15 | house_data = house_data.drop(columns=['totalPrice']).to_numpy() 16 | 17 | return house_data, labels 18 | 19 | 20 | def load_both(house_path, airbnb_path, active_party='house'): 21 | print("Loading house from {}".format(house_path)) 22 | house_data = pd.read_csv(house_path) 23 | print("Loading airbnb from {}".format(airbnb_path)) 24 | airbnb_data = pd.read_csv(airbnb_path) 25 | 26 | if active_party == 'house': 27 | labels = house_data['totalPrice'].to_numpy() 28 | house_data.drop(columns=['totalPrice'], inplace=True) 29 | 30 | # move lon and lat to end 31 | house_cols = list(house_data.columns) 32 | move_item_to_end_(house_cols, ['lon', 'lat']) 33 | house_data = house_data[house_cols] 34 | print("Current house columns {}".format(house_data.columns)) 35 | 36 | # move lon and lat to start 37 | airbnb_cols = list(airbnb_data.columns) 38 | move_item_to_start_(airbnb_cols, ['lon', 'lat']) 39 | airbnb_data = airbnb_data[airbnb_cols] 40 | print("Current airbnb columns {}".format(airbnb_data.columns)) 41 | 42 | data1 = house_data.to_numpy() 43 | data2 = airbnb_data.to_numpy() 44 | else: 45 | raise NotImplementedError 46 | 47 | return [data1, data2], labels 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/preprocess/beijing/clean_airbnb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | 6 | def clean_airbnb(airbnb_path, out_airbnb_path): 7 | airbnb_data = pd.read_csv(airbnb_path) 8 | 9 | # remove useless columns and NA 10 | airbnb_data.drop(columns=['id', 'name', 'host_id', 'host_name', 'last_review'], inplace=True) 11 | airbnb_data.dropna(inplace=True) 12 | 13 | # remove extreme high prices 14 | airbnb_data = airbnb_data[airbnb_data['price'] < 3000] 15 | 16 | airbnb_data.rename(columns={'latitude': 'lat', 'longitude': 'lon'}, inplace=True) 17 | 18 | airbnb_data = pd.get_dummies(airbnb_data, 19 | columns=['neighbourhood', 'room_type'], 20 | prefix=['nbr', 'rt'], drop_first=True) 21 | 22 | print("Got columns " + str(airbnb_data.columns)) 23 | print("Got {} lines".format(len(airbnb_data.index))) 24 | 25 | airbnb_data.to_csv(out_airbnb_path, index=False) 26 | 27 | 28 | if __name__ == '__main__': 29 | 
os.chdir(sys.path[0] + "/../../../data/beijing") # change working directory 30 | clean_airbnb("airbnb.csv", "airbnb_clean.csv") 31 | -------------------------------------------------------------------------------- /src/preprocess/beijing/clean_house.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | 6 | def clean_house(house_path, out_house_path, include_cid=False): 7 | house_data = pd.read_csv(house_path, encoding="iso-8859-1", parse_dates=['tradeTime'], 8 | dtype={'Cid': 'category'}) 9 | 10 | house_data.dropna(inplace=True) 11 | 12 | house_data['buildingType'] = house_data['buildingType'].astype('int') 13 | 14 | # remove the houses sold before 2013 15 | house_data = house_data[house_data['tradeTime'].dt.year > 2012] 16 | 17 | house_data['trade_year'] = house_data['tradeTime'].dt.year 18 | house_data['trade_month'] = house_data['tradeTime'].dt.month 19 | 20 | # rename longitude and latitude 21 | house_data.rename(columns={'Lng': 'lon', 'Lat': 'lat', 'Cid': 'cid'}, inplace=True) 22 | 23 | # # filter too large data 24 | # house_data = house_data[house_data['DOM'] < 365] 25 | 26 | # remove non-numeric values in constructionTime 27 | house_data['constructionTime'] = house_data['constructionTime'].str.extract('(\d+)', expand=False) 28 | 29 | # remove non-numeric values in floor 30 | house_data['floor'] = house_data['floor'].str.extract('(\d+)', expand=False) 31 | 32 | # remove houses with prices extremely large or small [10w, 2000w) 33 | house_data = house_data[house_data['totalPrice'] >= 10] 34 | house_data = house_data[house_data['totalPrice'] < 1000] 35 | 36 | # one-hot categorical features 37 | if include_cid: 38 | house_data = pd.get_dummies(house_data, 39 | columns=['cid', 'district', 'buildingType', 'renovationCondition', 40 | 'buildingStructure', 'trade_year', 'trade_month'], 41 | prefix=['cid', 'did', 'bt', 'rc', 'bs', 'ty', 'tm'], drop_first=True) 42 | else: 43 | house_data = pd.get_dummies(house_data, 44 | columns=['district', 'buildingType', 'renovationCondition', 'buildingStructure', 45 | 'trade_year', 'trade_month'], 46 | prefix=['did', 'bt', 'rc', 'bs', 'ty', 'tm'], drop_first=True) 47 | 48 | # price is not needed to predict totalPrice, otherwise totalPrice = price * squares 49 | house_data.drop(columns=['url', 'id', 'communityAverage', 'price', 'tradeTime'], inplace=True) 50 | 51 | print("Got columns " + str(house_data.columns)) 52 | print("Got {} lines".format(len(house_data.index))) 53 | 54 | house_data.dropna(inplace=True) 55 | 56 | house_data.to_csv(out_house_path, index=False) 57 | 58 | 59 | if __name__ == '__main__': 60 | os.chdir(sys.path[0] + "/../../../data/beijing") # change working directory 61 | clean_house("house.csv", "house_clean.csv") 62 | 63 | -------------------------------------------------------------------------------- /src/preprocess/company/__init__.py: -------------------------------------------------------------------------------- 1 | from .company_loader import load_both, load_loan -------------------------------------------------------------------------------- /src/preprocess/company/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/company/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- 
/src/preprocess/company/__pycache__/company_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/company/__pycache__/company_loader.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/company/clean_company.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | import re 5 | 6 | 7 | def clean_company(company_path, out_path): 8 | company_df = pd.read_csv("company.csv") 9 | us_company_df = company_df[company_df['country'] == 'united states'] 10 | us_company_df = us_company_df.applymap(lambda s: s.lower() if type(s) == str else s) 11 | us_company_df.drop(columns=['Unnamed: 0', 'domain', 'year founded', '']) -------------------------------------------------------------------------------- /src/preprocess/company/company_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | from utils import move_item_to_start_, move_item_to_end_ 6 | 7 | 8 | def load_loan(loan_path): 9 | print("Loading loan from {}".format(loan_path)) 10 | loan_df = pd.read_csv(loan_path) 11 | loan_df.drop(columns=['Name'], inplace=True) 12 | 13 | loan_df.info(verbose=True) 14 | 15 | labels = loan_df['SBA_Appv'].to_numpy() 16 | loan_data = loan_df.drop(columns=['SBA_Appv']).to_numpy() 17 | 18 | return loan_data, labels 19 | 20 | 21 | def load_both(loan_path, company_path, host_party='loan'): 22 | if host_party == 'loan': 23 | print("Loading loan from {}".format(loan_path)) 24 | loan_df = pd.read_csv(loan_path) 25 | 26 | print("Loading company from {}".format(company_path)) 27 | company_df = pd.read_csv(company_path) 28 | 29 | labels = loan_df['SBA_Appv'].to_numpy() 30 | loan_df.drop(columns=['SBA_Appv'], inplace=True) 31 | 32 | loan_cols = list(loan_df.columns) 33 | move_item_to_end_(loan_cols, ['Name']) 34 | loan_df = loan_df[loan_cols] 35 | print("Current loan columns {}".format(loan_df.columns)) 36 | 37 | company_cols = list(company_df.columns) 38 | move_item_to_start_(company_cols, ['name']) 39 | company_df = company_df[company_cols] 40 | print("Current company columns {}".format(company_df.columns)) 41 | 42 | data1 = loan_df.to_numpy() 43 | data2 = company_df.to_numpy() 44 | else: 45 | assert False 46 | 47 | return [data1, data2], labels 48 | 49 | 50 | if __name__ == '__main__': 51 | os.chdir(sys.path[0] + "/../../../data/company") # change working directory 52 | company_df = pd.read_csv("company_clean.csv") 53 | loan_df = pd.read_csv("loan_clean.csv") 54 | 55 | merge_df = company_df.merge(loan_df, how='inner', on='title') 56 | 57 | -------------------------------------------------------------------------------- /src/preprocess/game.old/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/game.old/__init__.py -------------------------------------------------------------------------------- /src/preprocess/game.old/negative_sample_steam_interact.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def 
negative_sample_steam_interact(interact_sample_path, game_path, out_data_path, seed=0): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ['PYTHONHASHSEED'] = str(seed) 12 | 13 | interact_df = pd.read_csv(interact_sample_path) 14 | print(interact_df.shape) 15 | 16 | steam_game = pd.read_csv(game_path) 17 | # app_id = steam_game['appid'] 18 | # print(len(app_id)) 19 | app_id = interact_df['appid'].unique().tolist() 20 | print(len(app_id)) 21 | 22 | grouped = interact_df.groupby('steamid') 23 | negative = [] 24 | partial_cnt = 0 25 | ratio = 1 26 | cnt = 0 27 | print('start') 28 | for index, value in grouped: 29 | exist_id = set(value['appid']) 30 | length = len(exist_id) 31 | 32 | if length * (ratio + 1) >= len(app_id): 33 | for a in app_id: 34 | if a not in exist_id: 35 | negative.append([index, a]) 36 | partial_cnt += 1 37 | else: 38 | while len(exist_id) != length * (ratio + 1): 39 | new_id = app_id[random.randint(0, len(app_id) - 1)] 40 | if new_id not in exist_id: 41 | exist_id.add(new_id) 42 | negative.append([index, new_id]) 43 | # if cnt >= 5: 44 | # break 45 | cnt += 1 46 | if cnt % 1000 == 0: 47 | print('current step', cnt) 48 | 49 | print('group cnt', cnt) 50 | print('partial cnt', partial_cnt) 51 | print('negative sample cnt', len(negative)) 52 | 53 | negative_df = pd.DataFrame(negative, columns=['steamid', 'appid']) 54 | negative_df['label'] = list(np.zeros(negative_df.shape[0], dtype=np.int)) 55 | print(negative_df.shape) 56 | 57 | interact_df['label'] = list(np.ones(interact_df.shape[0], dtype=np.int)) 58 | print(interact_df.shape) 59 | 60 | all_interact_df = interact_df.append(negative_df) 61 | print(all_interact_df.shape) 62 | print(all_interact_df['appid'].value_counts()) 63 | print(all_interact_df['steamid'].value_counts()) 64 | print(all_interact_df.duplicated().any()) 65 | 66 | all_data_df = pd.merge(steam_game, all_interact_df, on='appid', sort=False) 67 | print(all_data_df) 68 | print(all_data_df['appid'].value_counts()) 69 | print(all_data_df['steamid'].value_counts()) 70 | print(all_data_df.duplicated().any()) 71 | 72 | all_data_df.to_csv(out_data_path, index=False) 73 | 74 | 75 | if __name__ == '__main__': 76 | os.chdir(sys.path[0] + "/../../../") # change working directory 77 | root = "data/" 78 | interact_steam_sample_path = root + "steam_interact_sample.csv" 79 | steam_game_path = root + "steam_game_clean.csv" 80 | out_steam_data_path = root + "steam_data.csv" 81 | negative_sample_steam_interact(interact_steam_sample_path, steam_game_path, out_steam_data_path) 82 | print("Done.") 83 | -------------------------------------------------------------------------------- /src/preprocess/game.old/sample_steam_interact.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | def sample_steam_interact(interact_path, game_path, out_interact_sample_path, seed=0): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ['PYTHONHASHSEED'] = str(seed) 12 | 13 | interact_df = pd.read_csv(interact_path, header=None) 14 | interact_df.columns = ['steamid', 'appid'] 15 | print(interact_df.shape) 16 | # print(interact_df['appid'].value_counts()) 17 | # print(interact_df['steamid'].value_counts()) 18 | # print(interact_df.duplicated().any()) 19 | 20 | steam = pd.read_csv(game_path) 21 | app_id_df = steam[['appid']] 22 | print(app_id_df.shape) 23 | 24 | merged = pd.merge(app_id_df, interact_df, on='appid', sort=False) 25 | 
print(merged.shape) 26 | # print(merged['appid'].value_counts()) 27 | # print(merged['steamid'].value_counts()) 28 | # print(merged.duplicated().any()) 29 | 30 | merged_sub = merged.sample(n=23000000, random_state=seed) 31 | print(merged_sub.shape) 32 | print(merged_sub['appid'].value_counts()) 33 | print(merged_sub['steamid'].value_counts()) 34 | print(merged_sub.duplicated().any()) 35 | 36 | cnts = merged_sub['steamid'].value_counts() 37 | cnts_df = pd.DataFrame({'steamid': cnts.index, 'cnts': cnts.values}) 38 | print(cnts_df) 39 | 40 | filters = cnts_df[cnts_df['cnts'] >= 20] 41 | print(filters) 42 | 43 | filters_id_df = filters[['steamid']] 44 | print(filters_id_df) 45 | 46 | sample = pd.merge(filters_id_df, merged_sub, on='steamid', sort=False) 47 | print(sample.shape) 48 | print(sample['appid'].value_counts()) 49 | print(sample['steamid'].value_counts()) 50 | print(sample.duplicated().any()) 51 | 52 | sample.to_csv(out_interact_sample_path, index=False) 53 | 54 | 55 | if __name__ == '__main__': 56 | os.chdir(sys.path[0] + "/../../../") # change working directory 57 | root = "data/" 58 | steam_interact_path = root + "steam_interact.csv" 59 | steam_game_path = root + "steam_game_clean.csv" 60 | out_steam_interact_sample_path = root + "steam_interact_sample.csv" 61 | sample_steam_interact(steam_interact_path, steam_game_path, out_steam_interact_sample_path) 62 | print("Done.") -------------------------------------------------------------------------------- /src/preprocess/game.old/split_data.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | import os 3 | import sys 4 | import pandas as pd 5 | 6 | 7 | def split_df(data_path, val_rate=0.1, test_rate=0.2, seed=0, save=False): 8 | """ 9 | Split steam data to train, test, validation set. Output to the same directory as steam_data_path by default. 10 | Generate 3 new files. 11 | :param data_path: 12 | :param val_rate: rate of validation set w.r.t. global dataset 13 | :param test_rate: rate of test set w.r.t. global dataset 14 | :param seed: random seed 15 | :return: 16 | """ 17 | os.environ['PYTHONHASHSEED'] = str(seed) 18 | 19 | print("Splitting...") 20 | data_df = pd.read_csv(data_path) 21 | train_val_df, test_df = train_test_split(data_df, test_size=test_rate, random_state=seed) 22 | split_val_rate = val_rate / (1. 
- test_rate) 23 | train_df, val_df = train_test_split(train_val_df, test_size=split_val_rate, random_state=seed) 24 | 25 | if save: 26 | base_path = data_path.rsplit('.', 1)[0] 27 | train_df.to_csv(base_path + "_train.csv", index=False) 28 | print("Saved to " + base_path + "_train.csv") 29 | val_df.to_csv(base_path + "_val.csv", index=False) 30 | print("Saved to " + base_path + "_val.csv") 31 | test_df.to_csv(base_path + "_test.csv", index=False) 32 | print("Saved to " + base_path + "_test.csv") 33 | 34 | return train_df, val_df, test_df 35 | 36 | 37 | if __name__ == '__main__': 38 | os.chdir(sys.path[0] + "/../../../") # change working directory 39 | root = "data/" 40 | steam_data_path = root + "steam_data.csv" 41 | split_df(steam_data_path, val_rate=0.1, test_rate=0.2, save=True) 42 | -------------------------------------------------------------------------------- /src/preprocess/game.old/steam_ign_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class SteamIgnLoader: 5 | def __init__(self, steam_train_data_path, steam_val_data_path, steam_test_data_path, ign_data_path): 6 | self.steam_train_data = pd.read_csv(steam_train_data_path) 7 | self.steam_val_data = pd.read_csv(steam_val_data_path) 8 | self.steam_test_data = pd.read_csv(steam_test_data_path) 9 | self.ign_data = pd.read_csv(ign_data_path) 10 | 11 | def load_parties(self): 12 | return self.steam_train_data, self.steam_val_data, self.steam_test_data, self.ign_data -------------------------------------------------------------------------------- /src/preprocess/game/__init__.py: -------------------------------------------------------------------------------- 1 | from .game_loader import load_rawg, load_both, load_steam -------------------------------------------------------------------------------- /src/preprocess/game/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/game/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/game/__pycache__/game_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/game/__pycache__/game_loader.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/game/clean_rawg.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | import re 5 | 6 | 7 | def clean_rawg(rawg_path, out_rawg_path): 8 | print("Loading rawg dataset") 9 | rawg_df = pd.read_csv(rawg_path, parse_dates=['released', 'updated']) 10 | 11 | print("Dropping unrelated columns") 12 | rawg_df.drop(columns=['id', 'slug', 'tba', 'metacritic', 'developers', 'publishers', 13 | 'esrb_rating', 'rating_top', 'ratings_count'], inplace=True) 14 | 15 | print("Mapping website to bool") 16 | rawg_df['has_website'] = pd.isnull(rawg_df['website']) 17 | rawg_df.drop(columns=['website'], inplace=True) 18 | 19 | print("Converting date to relative year") 20 | rawg_df['released_year_before_2020'] = 2020 - rawg_df['released'].dt.year 21 | rawg_df['updated_year_before_2020'] = 2020 - rawg_df['updated'].dt.year 22 | 
rawg_df.drop(columns=['released', 'updated'], inplace=True) 23 | 24 | print("Mapping platforms and genres to dummies") 25 | rawg_df['platforms'].str.replace("||", "|") 26 | rawg_df['genres'].str.replace("||", "|") 27 | platform_df = rawg_df['platforms'].str.get_dummies(sep='|') 28 | genre_df = rawg_df['genres'].str.get_dummies(sep='|') 29 | rawg_df.drop(columns=['platforms', 'genres'], inplace=True) 30 | rawg_df = pd.concat([rawg_df, platform_df, genre_df], axis=1) 31 | 32 | rawg_df.dropna(inplace=True) 33 | rawg_df = rawg_df[rawg_df['playtime'] < 100] 34 | 35 | # remove non-alphanumeric characters and switch to lower cases 36 | rawg_df['name'] = rawg_df['name'].apply(lambda s: re.sub(r'\W', '', s).lower()) 37 | rawg_df.drop_duplicates(subset='name', keep='first', inplace=True) 38 | 39 | print("Saving to file, got {} samples".format(len(rawg_df.index))) 40 | rawg_df.to_csv(out_rawg_path, index=False) 41 | print("Done") 42 | 43 | 44 | if __name__ == '__main__': 45 | os.chdir(sys.path[0] + "/../../../data/game") # change working directory 46 | clean_rawg("rawg.csv", "rawg_clean.csv") 47 | -------------------------------------------------------------------------------- /src/preprocess/game/clean_steam.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | import re 5 | 6 | 7 | def clean_steam(steam_path, out_steam_path): 8 | steam_df = pd.read_csv(steam_path, parse_dates=['release_date']) 9 | 10 | steam_df.dropna(inplace=True) 11 | steam_df.drop(columns=['appid', 'release_date', 'developer', 'publisher'], inplace=True) 12 | 13 | cat_df = steam_df['categories'].str.get_dummies(sep=';').add_prefix('cat_') 14 | # tag_df = steam_df['steamspy_tags'].str.get_dummies(sep=';').add_prefix('tag_') 15 | steam_df.drop(columns=['categories', 'steamspy_tags'], inplace=True) 16 | steam_df = pd.concat([steam_df, cat_df], axis=1) 17 | 18 | print("Mapping platforms and genres to dummies") 19 | steam_df['platforms'].str.replace("||", "|") 20 | steam_df['genres'].str.replace("||", "|") 21 | platform_df = steam_df['platforms'].str.get_dummies(sep='|').add_prefix('pf_') 22 | genre_df = steam_df['genres'].str.get_dummies(sep='|').add_prefix('gn_') 23 | steam_df.drop(columns=['platforms', 'genres'], inplace=True) 24 | steam_df = pd.concat([steam_df, platform_df, genre_df], axis=1) 25 | 26 | # steam_df = pd.get_dummies(steam_df, columns=['owners'], prefix=['owner'], drop_first=True) 27 | steam_df['owners'] = steam_df['owners'].apply(lambda x: 28 | '0-20000' if x == '0-20000' else 29 | # '20000-50000' if x == '20000-50000' else 30 | '20000-200000000') 31 | steam_df['owners'] = pd.factorize(steam_df['owners'])[0] 32 | steam_df.dropna(inplace=True) 33 | 34 | # remove non-alphanumeric characters and switch to lower cases 35 | steam_df['name'] = steam_df['name'].apply(lambda s: re.sub(r'\W', '', s).lower()) 36 | steam_df.drop_duplicates(subset='name', keep='first', inplace=True) 37 | 38 | print("Saving to file, got {} samples".format(len(steam_df.index))) 39 | steam_df.to_csv(out_steam_path, index=False) 40 | print("Done") 41 | 42 | 43 | if __name__ == '__main__': 44 | os.chdir(sys.path[0] + "/../../../data/game") # change working directory 45 | clean_steam("steam.csv", "steam_clean.csv") 46 | -------------------------------------------------------------------------------- /src/preprocess/game/game_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from utils 
import move_item_to_start_, move_item_to_end_ 4 | 5 | 6 | def load_rawg(rawg_path): 7 | print("Loading rawg from {}".format(rawg_path)) 8 | rawg_data = pd.read_csv(rawg_path) 9 | 10 | rawg_data.drop(columns=['name'], inplace=True) 11 | 12 | rawg_data.info(verbose=True) 13 | 14 | labels = rawg_data['rating'].to_numpy() 15 | rawg_data = rawg_data.drop(columns=['rating']).to_numpy() 16 | 17 | return rawg_data, labels 18 | 19 | 20 | def load_steam(steam_path): 21 | print("Loading steam from {}".format(steam_path)) 22 | steam_data = pd.read_csv(steam_path) 23 | 24 | steam_data.drop(columns=['name'], inplace=True) 25 | 26 | steam_data.info(verbose=True) 27 | 28 | labels = steam_data['owners'].to_numpy() 29 | steam_data = steam_data.drop(columns=['owners']).to_numpy() 30 | 31 | return steam_data, labels 32 | 33 | 34 | def load_both(rawg_path, steam_path, active_party='rawg'): 35 | print("Loading rawg from {}".format(rawg_path)) 36 | rawg_data = pd.read_csv(rawg_path) 37 | print("Loading steam from {}".format(steam_path)) 38 | steam_data = pd.read_csv(steam_path) 39 | 40 | if active_party == 'rawg': 41 | labels = steam_data['owners'].to_numpy() 42 | steam_data = steam_data.drop(columns=['owners'])  # keep as DataFrame; converted to numpy after the column filtering below 43 | 44 | steam_data.drop(list(steam_data.filter(regex='pf')), axis=1, inplace=True) 45 | steam_data.drop(list(steam_data.filter(regex='gn')), axis=1, inplace=True) 46 | 47 | # move the key column 'name' to end 48 | rawg_cols = list(rawg_data.columns) 49 | move_item_to_end_(rawg_cols, ['name']) 50 | rawg_data = rawg_data[rawg_cols] 51 | print("Current rawg columns {}".format(rawg_data.columns)) 52 | 53 | # move the key column 'name' to start 54 | steam_cols = list(steam_data.columns) 55 | move_item_to_start_(steam_cols, ['name']) 56 | steam_data = steam_data[steam_cols] 57 | print("Current steam columns {}".format(steam_data.columns)) 58 | 59 | data1 = rawg_data.to_numpy() 60 | data2 = steam_data.to_numpy() 61 | elif active_party == 'steam': 62 | labels = steam_data['owners'].to_numpy() 63 | steam_data = steam_data.drop(columns=['owners']) 64 | 65 | steam_data.drop(list(steam_data.filter(regex='pf')), axis=1, inplace=True) 66 | steam_data.drop(list(steam_data.filter(regex='gn')), axis=1, inplace=True) 67 | 68 | # move keys to end 69 | steam_cols = list(steam_data.columns) 70 | move_item_to_end_(steam_cols, ['name']) 71 | steam_data = steam_data[steam_cols] 72 | print("Current steam columns {}".format(steam_data.columns)) 73 | 74 | # move the key column 'name' to start 75 | rawg_cols = list(rawg_data.columns) 76 | move_item_to_start_(rawg_cols, ['name']) 77 | rawg_data = rawg_data[rawg_cols] 78 | print("Current rawg columns {}".format(rawg_data.columns)) 79 | 80 | data1 = steam_data.to_numpy() 81 | data2 = rawg_data.to_numpy() 82 | else: 83 | assert False 84 | 85 | return [data1, data2], labels 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /src/preprocess/gesis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/gesis/__init__.py -------------------------------------------------------------------------------- /src/preprocess/gesis/sav_to_csv.py: -------------------------------------------------------------------------------- 1 | import recordlinkage 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/preprocess/hdb/__init__.py:
-------------------------------------------------------------------------------- 1 | from .hdb_loader import load_both, load_hdb -------------------------------------------------------------------------------- /src/preprocess/hdb/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/hdb/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/hdb/__pycache__/hdb_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/hdb/__pycache__/hdb_loader.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/hdb/clean_hdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | import json 5 | 6 | from tqdm import tqdm 7 | import pandas as pd 8 | 9 | 10 | def get_blk_loc(hdb_path, out_hdb_w_blk_loc_path): 11 | hdb_df = pd.read_csv(hdb_path, parse_dates=['month']) 12 | 13 | hdb_df['address'] = hdb_df['block'] + " " + hdb_df['street_name'] 14 | addrs_unique = hdb_df['address'].drop_duplicates(keep='first') 15 | 16 | # get the address of blocks by OneMap API 17 | latitude = [] 18 | longitude = [] 19 | blk_no = [] 20 | road_name = [] 21 | postal_code = [] 22 | address = [] 23 | for addr in tqdm(addrs_unique): 24 | query_string = 'https://developers.onemap.sg/commonapi/search?searchVal=' + str( 25 | addr) + '&returnGeom=Y&getAddrDetails=Y' 26 | resp = requests.get(query_string) 27 | 28 | # Convert JSON into Python Object 29 | data_geo_location = json.loads(resp.content) 30 | if data_geo_location['found'] != 0: 31 | latitude.append(data_geo_location['results'][0]['LATITUDE']) 32 | longitude.append(data_geo_location['results'][0]['LONGITUDE']) 33 | blk_no.append(data_geo_location['results'][0]['BLK_NO']) # this one is a unique block No. 
34 | road_name.append(data_geo_location['results'][0]['ROAD_NAME']) 35 | postal_code.append(data_geo_location['results'][0]['POSTAL']) 36 | address.append(addr) 37 | # print(str(addr) + " ,Lat: " + data_geo_location['results'][0]['LATITUDE'] + " Long: " + 38 | # data_geo_location['results'][0]['LONGITUDE']) 39 | else: 40 | print("No Results") 41 | 42 | print("Converting to dataframe") 43 | block_loc_df = pd.DataFrame({'address': address, 'lat': latitude, 'lon': longitude}) 44 | print("Joining with HDB data") 45 | hdb_df = hdb_df.merge(block_loc_df, on='address', how='left') 46 | print("Saving to {}".format(out_hdb_w_blk_loc_path)) 47 | hdb_df.to_csv(out_hdb_w_blk_loc_path, index=False) 48 | 49 | 50 | def clean_hdb(hdb_path, out_hdb_path): 51 | hdb_df = pd.read_csv(hdb_path, parse_dates=['month']) 52 | hdb_df.dropna(inplace=True) 53 | hdb_df.drop(columns=['month', 'block', 'street_name', 'address'], inplace=True) 54 | 55 | hdb_df['lease_commence_year_before_2020'] = 2020 - hdb_df['lease_commence_date'] 56 | hdb_df.drop(columns=['lease_commence_date', 'remaining_lease'], inplace=True) 57 | 58 | hdb_df = pd.get_dummies(hdb_df, 59 | columns=['town', 'flat_type', 'storey_range', 'flat_model'], 60 | prefix=['tn', 'ft', 'sr', 'fm'], drop_first=True) 61 | 62 | hdb_df['resale_price'] = hdb_df['resale_price'] / 1000 # change to kS$ 63 | 64 | hdb_df.to_csv(out_hdb_path, index=False) 65 | 66 | 67 | if __name__ == '__main__': 68 | os.chdir(sys.path[0] + "/../../../data/hdb") # change working directory 69 | # get_blk_loc("resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv", 70 | # "hdb_2017_onwards_w_blk_loc.csv") 71 | clean_hdb("hdb_2017_onwards_w_blk_loc.csv", 72 | "hdb_clean.csv") -------------------------------------------------------------------------------- /src/preprocess/hdb/clean_school.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | import requests 5 | import json 6 | 7 | from tqdm import tqdm 8 | import pandas as pd 9 | 10 | 11 | def clean_school(rank_path_list, out_path): 12 | school_df_list = [] 13 | for i, rank_path in enumerate(rank_path_list): 14 | school_name_list = [] 15 | n_places_after_2b_list = [] 16 | vacancy_rate_list = [] 17 | with open(rank_path, 'r') as f: 18 | for line in f: 19 | out_params = re.split('–', line) # This '–' is not minus symbol '-' in the keyboard 20 | school_name = out_params[0].strip().lower() 21 | in_params = re.split('[(),]', out_params[1]) 22 | n_places_after_2b = int(in_params[0]) 23 | vacancy_rate = eval(in_params[2].strip()) 24 | school_name_list.append(school_name) 25 | n_places_after_2b_list.append(n_places_after_2b) 26 | vacancy_rate_list.append(vacancy_rate) 27 | school_df_i = pd.DataFrame({ 28 | 'school_name': school_name_list, 29 | 'n_places_{}'.format(i): n_places_after_2b_list, 30 | 'vacancy_rate_{}'.format(i): vacancy_rate_list 31 | }) 32 | school_df_i.set_index('school_name', inplace=True) 33 | school_df_list.append(school_df_i) 34 | 35 | all_school_df = school_df_list[0].join(school_df_list[1:]) 36 | all_school_df.to_csv(out_path, index=True) 37 | 38 | 39 | def get_school_loc(school_summary_path, out_path): 40 | school_df = pd.read_csv(school_summary_path) 41 | school_list = school_df['school_name'].to_list() 42 | names = [] 43 | lats = [] 44 | lons = [] 45 | for name in tqdm(school_list): 46 | query_str = "https://developers.onemap.sg/commonapi/search?searchVal=" + str( 47 | name) + "&returnGeom=Y&getAddrDetails=Y" 48 | resp =
requests.get(query_str) 49 | 50 | # Convert JSON into Python Object 51 | try: 52 | data_geo_location = json.loads(resp.content) 53 | except json.decoder.JSONDecodeError: 54 | print("Failed to retrieve result") 55 | continue 56 | if data_geo_location['found'] != 0: 57 | lats.append(data_geo_location['results'][0]['LATITUDE']) 58 | lons.append(data_geo_location['results'][0]['LONGITUDE']) 59 | names.append(name) 60 | else: 61 | print("No Results") 62 | 63 | school_loc_df = pd.DataFrame({'school_name': names, 64 | 'lat': lats, 65 | 'lon': lons}).set_index('school_name') 66 | school_df = school_df.set_index('school_name').join(school_loc_df) 67 | school_df.dropna(inplace=True) 68 | school_df.to_csv(out_path) 69 | 70 | 71 | if __name__ == '__main__': 72 | os.chdir(sys.path[0] + "/../../../data/hdb") # change working directory 73 | # clean_school(["primary_school_rank_2015.txt", 74 | # "primary_school_rank_2016.txt", 75 | # "primary_school_rank_2017.txt", 76 | # "primary_school_rank_2018.txt"], 77 | # "school_summary.csv") 78 | get_school_loc("school_summary.csv", "school_clean.csv") 79 | -------------------------------------------------------------------------------- /src/preprocess/hdb/hdb_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from utils import move_item_to_start_, move_item_to_end_ 4 | 5 | 6 | def load_hdb(hdb_path): 7 | print("Loading hdb from {}".format(hdb_path)) 8 | hdb_data = pd.read_csv(hdb_path) 9 | 10 | hdb_data.drop(columns=['lon', 'lat'], inplace=True) 11 | 12 | hdb_data.info(verbose=True) 13 | 14 | labels = hdb_data['resale_price'].to_numpy() 15 | hdb_data = hdb_data.drop(columns=['resale_price']).to_numpy() 16 | 17 | return hdb_data, labels 18 | 19 | 20 | def load_both(hdb_path, airbnb_path, active_party='hdb'): 21 | print("Loading hdb from {}".format(hdb_path)) 22 | hdb_data = pd.read_csv(hdb_path) 23 | print("Loading school from {}".format(airbnb_path)) 24 | school_data = pd.read_csv(airbnb_path) 25 | 26 | if active_party == 'hdb': 27 | labels = hdb_data['resale_price'].to_numpy() 28 | hdb_data.drop(columns=['resale_price'], inplace=True) 29 | 30 | # move lon and lat to end 31 | hdb_cols = list(hdb_data.columns) 32 | move_item_to_end_(hdb_cols, ['lon', 'lat']) 33 | hdb_data = hdb_data[hdb_cols] 34 | print("Current hdb columns {}".format(hdb_data.columns)) 35 | 36 | school_data.drop(columns=['school_name'], inplace=True) 37 | 38 | # move lon and lat to start 39 | school_cols = list(school_data.columns) 40 | move_item_to_start_(school_cols, ['lon', 'lat']) 41 | school_data = school_data[school_cols] 42 | print("Current school columns {}".format(school_data.columns)) 43 | 44 | data1 = hdb_data.to_numpy() 45 | data2 = school_data.to_numpy() 46 | else: 47 | raise NotImplementedError 48 | 49 | return [data1, data2], labels 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/preprocess/krebs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/krebs/__init__.py -------------------------------------------------------------------------------- /src/preprocess/krebs/generate_dataset.py: -------------------------------------------------------------------------------- 1 | import recordlinkage.datasets 2 | import os 3 | import sys 4 | 5 | 6 | 7 | if __name__ == '__main__': 8 | 9 | os.chdir(sys.path[0] +
"/../../../data/krebs") # change working directory 10 | os.environ["RL_DATA"] = os.getcwd() # set path of recordinglinkage dataset 11 | 12 | data = recordlinkage.datasets.load_krebsregister(block=1) 13 | 14 | pass -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/ml_dataset/__init__.py -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/ml_dataset/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/__pycache__/frog.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/ml_dataset/__pycache__/frog.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/__pycache__/miniboone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/ml_dataset/__pycache__/miniboone.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/__pycache__/two_party_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/ml_dataset/__pycache__/two_party_loader.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/frog.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import wget 5 | import zipfile 6 | 7 | 8 | def load_frog(path, download=True, label_col='Species'): 9 | assert label_col in ['Family', 'Genus', 'Species'], "Undefined label column {}".format(label_col) 10 | 11 | if download and not os.path.isfile(path): 12 | print("Downloading frog dataset") 13 | wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/00406/Anuran%20Calls%20(MFCCs).zip", 14 | out="data/frog.zip") 15 | with zipfile.ZipFile("data/frog.zip", 'r') as zip_ref: 16 | zip_ref.extractall("data/") 17 | 18 | data_labels_df = pd.read_csv(path, usecols=range(0, 25)) 19 | data_df = data_labels_df.iloc[:, :22] 20 | labels_df = data_labels_df[label_col].astype('category').cat.codes 21 | 22 | data = data_df.to_numpy() 23 | labels = labels_df.to_numpy() 24 | 25 | return data, labels -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/miniboone.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import wget 4 | 5 | 6 | def load_miniboone(path, download=True): 7 | if download and not os.path.isfile(path): 8 | print("Downloading 
MiniBooNE dataset") 9 | wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt", 10 | out=path) 11 | 12 | with open(path, 'r') as f: 13 | line = next(f) 14 | n_signal_events, n_background_events = [int(x) for x in line.split()] 15 | 16 | data = np.loadtxt(path, skiprows=1) 17 | label_signal = np.ones([n_signal_events, 1]) 18 | label_background = np.zeros([n_background_events, 1]) 19 | labels = np.concatenate([label_signal, label_background], axis=0) 20 | assert labels.shape[0] == data.shape[0] 21 | data_labels = np.concatenate([data, labels], axis=1) 22 | random_state = np.random.RandomState(0) 23 | random_state.shuffle(data_labels) 24 | return data_labels[:, :-1], data_labels[:, -1].flatten() 25 | 26 | -------------------------------------------------------------------------------- /src/preprocess/ml_dataset/two_party_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import random 5 | from sklearn.datasets import load_svmlight_file 6 | import pickle 7 | import inspect 8 | from scipy.sparse import csr_matrix 9 | 10 | 11 | class TwoPartyLoader: 12 | def __init__(self, num_features, num_common_features: int, 13 | common_feature_noise_scale=0.0, data_fmt='libsvm', dataset_name=None, cache_path=None, 14 | n_classes=2, seed=0): 15 | """ 16 | :param cache_path: path for cache of the object 17 | :param dataset_name: name of the dataset 18 | :param num_features_per_party: number of features on both party, including common features 19 | :param num_common_features: number of common features 20 | """ 21 | self.cache_path = cache_path 22 | self.dataset_name = dataset_name 23 | self.data_fmt = data_fmt 24 | self.n_classes = n_classes 25 | self.common_feature_noise_scale = common_feature_noise_scale 26 | self.num_common_features = num_common_features 27 | self.num_features = num_features 28 | self.seeds = list(range(seed, seed + 3)) 29 | 30 | self.X = None 31 | self.y = None 32 | self.Xs = None 33 | 34 | def load_dataset(self, path=None, use_cache=True, scale_label=False): 35 | """ 36 | :param use_cache: whether to use cache 37 | :param path: path of the ml dataset 38 | :param scale_label: whether to scale back the label from [0,1] to int. True in covtype.scale01. 39 | :return: features, labels 40 | """ 41 | if use_cache and self.X is not None and self.y is not None: 42 | assert self.num_features == self.X.shape[1], "Total number of features mismatch." 43 | return self.X, self.y 44 | 45 | assert path is not None 46 | print("Loading {} dataset".format(self.dataset_name)) 47 | if inspect.isfunction(self.data_fmt): 48 | X, y = self.data_fmt(path) 49 | elif self.data_fmt == 'libsvm': 50 | X, y = load_svmlight_file(path) 51 | X = X.toarray() 52 | 53 | # hard code for a strange dataset whose labels are 1 & 2 54 | if self.dataset_name == 'covtype.binary': 55 | y -= 1 56 | elif self.data_fmt == 'csv': 57 | dataset = np.loadtxt(path, delimiter=',', skiprows=1) 58 | X = dataset[:, :-1] 59 | y = dataset[:, -1].reshape(-1) 60 | else: 61 | assert False, "Unsupported ML dataset format" 62 | 63 | if scale_label: 64 | y = np.rint(y * (self.n_classes - 1)).astype(np.int) 65 | 66 | assert self.num_features == X.shape[1], "Total number of features mismatch." 
67 | print("Done") 68 | if use_cache: 69 | self.X, self.y = X, y 70 | 71 | return X, y 72 | 73 | def load_parties(self, path=None, use_cache=True, scale_label=False): 74 | X, y = self.load_dataset(path, use_cache, scale_label) 75 | if use_cache and self.Xs is not None: 76 | print("Loading parties from cache") 77 | return self.Xs, self.y 78 | 79 | # assuming all the features are useful 80 | print("Splitting features to two parties") 81 | 82 | # randomly divide trained features to two parties 83 | shuffle_state = np.random.RandomState(self.seeds[0]) 84 | shuffle_state.shuffle(X.T) # shuffle columns 85 | trained_features = X[:, self.num_common_features:] 86 | trained_features1 = trained_features[:, :trained_features.shape[1] // 2] 87 | trained_features2 = trained_features[:, trained_features.shape[1] // 2:] 88 | 89 | # append common features 90 | common_features = X[:, :self.num_common_features] 91 | noise_state = np.random.RandomState(self.seeds[2]) 92 | noised_common_features = common_features.copy() + noise_state.normal( 93 | scale=self.common_feature_noise_scale, size=common_features.shape) 94 | X1 = np.concatenate([trained_features1, common_features], axis=1) 95 | X2 = np.concatenate([noised_common_features, trained_features2], axis=1) 96 | 97 | assert X1.shape[1] + X2.shape[1] - self.num_common_features == self.X.shape[1] 98 | 99 | if use_cache: 100 | # refresh cached Xs 101 | self.Xs = [X1, X2] 102 | print("Done") 103 | return [X1, X2], y 104 | 105 | def to_pickle(self, save_path: str): 106 | with open(save_path, 'wb') as f: 107 | pickle.dump(self, f) 108 | 109 | @staticmethod 110 | def from_pickle(load_path: str): 111 | with open(load_path, 'rb') as f: 112 | return pickle.load(f) 113 | 114 | 115 | class ThreePartyLoader: 116 | def __init__(self, num_features, num_common_features: int, 117 | common_feature_noise_scale=0.0, data_fmt='libsvm', dataset_name=None, cache_path=None, 118 | n_classes=2, seed=0): 119 | """ 120 | :param cache_path: path for cache of the object 121 | :param dataset_name: name of the dataset 122 | :param num_features_per_party: number of features on both party, including common features 123 | :param num_common_features: number of common features 124 | """ 125 | self.cache_path = cache_path 126 | self.dataset_name = dataset_name 127 | self.data_fmt = data_fmt 128 | self.n_classes = n_classes 129 | self.common_feature_noise_scale = common_feature_noise_scale 130 | self.num_common_features = num_common_features 131 | self.num_features = num_features 132 | self.seeds = list(range(seed, seed + 3)) 133 | 134 | self.X = None 135 | self.y = None 136 | self.Xs = None 137 | 138 | def load_dataset(self, path=None, use_cache=True, scale_label=False): 139 | """ 140 | :param use_cache: whether to use cache 141 | :param path: path of the ml dataset 142 | :param scale_label: whether to scale back the label from [0,1] to int. True in covtype.scale01. 143 | :return: features, labels 144 | """ 145 | if use_cache and self.X is not None and self.y is not None: 146 | assert self.num_features == self.X.shape[1], "Total number of features mismatch." 
147 | return self.X, self.y 148 | 149 | assert path is not None 150 | print("Loading {} dataset".format(self.dataset_name)) 151 | if inspect.isfunction(self.data_fmt): 152 | X, y = self.data_fmt(path) 153 | elif self.data_fmt == 'libsvm': 154 | X, y = load_svmlight_file(path) 155 | X = X.toarray() 156 | 157 | # hard code for a strange dataset whose labels are 1 & 2 158 | if self.dataset_name == 'covtype.binary': 159 | y -= 1 160 | elif self.data_fmt == 'csv': 161 | dataset = np.loadtxt(path, delimiter=',', skiprows=1) 162 | X = dataset[:, :-1] 163 | y = dataset[:, -1].reshape(-1) 164 | else: 165 | assert False, "Unsupported ML dataset format" 166 | 167 | if scale_label: 168 | y = np.rint(y * (self.n_classes - 1)).astype(np.int) 169 | 170 | assert self.num_features == X.shape[1], "Total number of features mismatch." 171 | print("Done") 172 | if use_cache: 173 | self.X, self.y = X, y 174 | 175 | return X, y 176 | 177 | def load_parties(self, path=None, use_cache=True, scale_label=False): 178 | X, y = self.load_dataset(path, use_cache, scale_label) 179 | if use_cache and self.Xs is not None: 180 | print("Loading parties from cache") 181 | return self.Xs, self.y 182 | 183 | # assuming all the features are useful 184 | print("Splitting features to two parties") 185 | 186 | # randomly divide trained features to three parties 187 | shuffle_state = np.random.RandomState(self.seeds[0]) 188 | shuffle_state.shuffle(X.T) # shuffle columns 189 | trained_features = X[:, self.num_common_features:] 190 | trained_features1, trained_features2, trained_features3 = np.split(trained_features, 3, axis=1) 191 | 192 | # append common features 193 | common_features = X[:, :self.num_common_features] 194 | noise_state1 = np.random.RandomState(self.seeds[1]) 195 | noise_state2 = np.random.RandomState(self.seeds[2]) 196 | noised_common_features1 = common_features.copy() + noise_state1.normal( 197 | scale=self.common_feature_noise_scale, size=common_features.shape) 198 | noised_common_features2 = common_features.copy() + noise_state2.normal( 199 | scale=self.common_feature_noise_scale, size=common_features.shape) 200 | X1 = np.concatenate([trained_features1, common_features], axis=1) 201 | X2 = np.concatenate([noised_common_features1, trained_features2, noised_common_features2], axis=1) 202 | X3 = np.concatenate([noised_common_features2, trained_features3], axis=1) 203 | 204 | if use_cache: 205 | # refresh cached Xs 206 | self.Xs = [X1, X2, X3] 207 | print("Done") 208 | return [X1, X2, X3], y 209 | 210 | def to_pickle(self, save_path: str): 211 | with open(save_path, 'wb') as f: 212 | pickle.dump(self, f) 213 | 214 | @staticmethod 215 | def from_pickle(load_path: str): 216 | with open(load_path, 'rb') as f: 217 | return pickle.load(f) 218 | -------------------------------------------------------------------------------- /src/preprocess/nytaxi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/nytaxi/__init__.py -------------------------------------------------------------------------------- /src/preprocess/nytaxi/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/nytaxi/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- 
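Returning to src/preprocess/ml_dataset/two_party_loader.py above, a minimal sketch of how TwoPartyLoader splits one table into two vertically partitioned parties. This is illustrative only and not a repository file; the synthetic generator below and the import path (src/ on PYTHONPATH) are assumptions:

import numpy as np

from preprocess.ml_dataset.two_party_loader import TwoPartyLoader  # assumption: src/ is on PYTHONPATH


def synthetic_table(path):  # callable data_fmt: load_dataset calls it with the given path
    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 12))  # 12 features; the loader treats 4 of them (after column shuffling) as common
    y = rng.randint(0, 2, size=100)
    return X, y


loader = TwoPartyLoader(num_features=12, num_common_features=4,
                        common_feature_noise_scale=0.1,
                        data_fmt=synthetic_table, dataset_name='demo', seed=0)
[X1, X2], y = loader.load_parties(path='unused')  # path is only forwarded to synthetic_table
# X1 = party-1 features + exact common features; X2 = noised common features + party-2 features
print(X1.shape, X2.shape)  # (100, 8) (100, 8)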
/src/preprocess/nytaxi/__pycache__/ny_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/nytaxi/__pycache__/ny_loader.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/nytaxi/clean_airbnb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | 6 | 7 | def clean_airbnb(raw_airbnb_path, out_airbnb_path): 8 | raw_airbnb_data = pd.read_csv(raw_airbnb_path) 9 | 10 | # fill null value 11 | raw_airbnb_data.fillna({'reviews_per_month': 0}, inplace=True) 12 | raw_airbnb_data.fillna({'name': "null"}, inplace=True) 13 | raw_airbnb_data.fillna({'host_name': "null"}, inplace=True) 14 | raw_airbnb_data.fillna({'last_review': "null"}, inplace=True) 15 | 16 | assert (raw_airbnb_data.isnull().sum().to_numpy() == 0).all() 17 | 18 | # remove abnormal high prices larger than $1,000 per day 19 | raw_airbnb_data = raw_airbnb_data[raw_airbnb_data['price'] < 1000] 20 | 21 | # add the length of name as a feature 22 | raw_airbnb_data["name_length"] = raw_airbnb_data['name'].map(str).apply(len) 23 | 24 | # set all the minimum nights larger than 30 to 30 25 | raw_airbnb_data.loc[(raw_airbnb_data.minimum_nights > 30), 'minimum_nights'] = 30 26 | 27 | raw_airbnb_data.drop(columns=['id', 'host_id', 'host_name', 'name', 'last_review'], inplace=True) 28 | 29 | # set categorical to one-hot 30 | out_airbnb_data = pd.get_dummies(raw_airbnb_data, 31 | columns=['neighbourhood_group', 'neighbourhood', 'room_type'], 32 | prefix=['nhg', 'nh', 'rt'], drop_first=True) 33 | 34 | out_airbnb_data.to_csv(out_airbnb_path) 35 | 36 | 37 | if __name__ == '__main__': 38 | os.chdir(sys.path[0] + "/../../../data/nytaxi") # change working directory 39 | clean_airbnb("AB_NYC_2019.csv", "airbnb_clean.csv") 40 | 41 | -------------------------------------------------------------------------------- /src/preprocess/nytaxi/clean_citibike.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | 5 | import pandas as pd 6 | 7 | 8 | def fill_zero_padding(s: str): 9 | """ 10 | Fill a datetime string with zero paddings in days and months 11 | :param s: original datetime string with format '%-m/%-d/%Y %H:%M:%S' 12 | :return: 13 | """ 14 | month, day, other = s.split('/') 15 | month = "0" + month if len(month) == 1 else month 16 | day = "0" + day if len(day) == 1 else day 17 | return month + "/" + day + "/" + other 18 | 19 | 20 | def clean_bike(bike_ori_data_path, out_bike_data_path, sample_n=None): 21 | print("Reading from {}".format(bike_ori_data_path)) 22 | date_parser = lambda x: datetime.strptime(fill_zero_padding(x), '%m/%d/%Y %H:%M:%S') 23 | bike_ori_data = pd.read_csv(bike_ori_data_path, parse_dates=['starttime', 'stoptime'], 24 | date_parser=date_parser) 25 | 26 | print("Remove all nonsense data") 27 | bike_ori_data.dropna(inplace=True) 28 | bike_ori_data = bike_ori_data[bike_ori_data['tripduration'] < 2000] 29 | 30 | print("Remove useless features from dataset") 31 | bike_ori_data.drop(columns=['bikeid', 'usertype', 'start station name', 'end station name'], inplace=True) 32 | 33 | print("Get pick-up and drop-off hour") 34 | bike_ori_data['start_hour'] = bike_ori_data['starttime'].dt.hour 35 | bike_ori_data['end_hour'] = 
bike_ori_data['stoptime'].dt.hour 36 | 37 | print("Drop specific time information") 38 | bike_ori_data.drop(columns=['starttime', 'stoptime'], inplace=True) 39 | 40 | print("Rename columns") 41 | bike_ori_data.rename(columns={'start station id': 'start_id', 42 | 'end station id': 'end_id', 43 | 'start station longitude': 'start_lon', 44 | 'start station latitude': 'start_lat', 45 | 'end station longitude': 'end_lon', 46 | 'end station latitude': 'end_lat'}, inplace=True) 47 | 48 | print("Change birth year to age") 49 | bike_ori_data['age'] = bike_ori_data['birth year'].apply(lambda x: 2016 - x) 50 | bike_ori_data.drop(columns=['birth year'], inplace=True) 51 | 52 | print("Columns: " + str(bike_ori_data.columns)) 53 | 54 | out_bike_data = pd.get_dummies(bike_ori_data, 55 | columns=['gender', 'start_id', 'end_id'], 56 | prefix=['gender', 'sid', 'eid'], drop_first=True) 57 | 58 | print("sampling from dataset") 59 | if sample_n is not None: 60 | out_bike_data = out_bike_data.sample(n=sample_n, random_state=0) 61 | 62 | print("Saving cleaned dataset to {}".format(out_bike_data_path)) 63 | out_bike_data.to_pickle(out_bike_data_path) 64 | print("Saved {} samples to file".format(len(out_bike_data.index))) 65 | 66 | 67 | if __name__ == '__main__': 68 | os.chdir(sys.path[0] + "/../../../data/nytaxi") # change working directory 69 | clean_bike("201606-citibike-tripdata.csv", "bike_201606_clean_sample_2e5.pkl", sample_n=200000) 70 | -------------------------------------------------------------------------------- /src/preprocess/nytaxi/clean_tlc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | 6 | 7 | def clean_tlc_for_airbnb(tlc_ori_data_path, out_tlc_data_path, sample_n=None, keep_col=None): 8 | print("Reading from {}".format(tlc_ori_data_path)) 9 | tlc_ori_data = pd.read_csv(tlc_ori_data_path, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime']) 10 | 11 | print("get pick-up and drop-off hour") 12 | tlc_ori_data.drop(columns=['store_and_fwd_flag'], inplace=True) 13 | 14 | print("get pick-up and drop-off hour") 15 | tlc_ori_data['pickup_hour'] = tlc_ori_data['tpep_pickup_datetime'].dt.hour 16 | tlc_ori_data['dropoff_hour'] = tlc_ori_data['tpep_dropoff_datetime'].dt.hour 17 | 18 | print("drop specific time information") 19 | tlc_ori_data.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], inplace=True) 20 | 21 | print("divide pickup and dropoff dataset") 22 | tlc_ori_data_pickup = tlc_ori_data.drop(columns=['dropoff_hour', 'dropoff_longitude', 'dropoff_latitude']) 23 | tlc_ori_data_pickup['is_pickup'] = 1 24 | tlc_ori_data_pickup.rename(columns={'pickup_hour': 'hour', 25 | 'pickup_longitude': 'lon', 26 | 'pickup_latitude': 'lat'}, inplace=True) 27 | tlc_ori_data_dropoff = tlc_ori_data.drop(columns=['pickup_hour', 'pickup_longitude', 'pickup_latitude']) 28 | tlc_ori_data_dropoff.rename(columns={'dropoff_hour': 'hour', 29 | 'dropoff_longitude': 'lon', 30 | 'dropoff_latitude': 'lat'}, inplace=True) 31 | tlc_ori_data_dropoff['is_pickup'] = 0 32 | 33 | print("concat pickup and dropoff dataset by rows") 34 | out_tlc_data = pd.concat([tlc_ori_data_pickup, tlc_ori_data_dropoff]) 35 | print("Finished, print all the columns:") 36 | print(out_tlc_data.dtypes) 37 | 38 | if keep_col is None: 39 | print("make categorical features one-hot") 40 | out_tlc_data = pd.get_dummies(out_tlc_data, 41 | columns=['hour', 'VendorID', 'RatecodeID', 'payment_type'], 42 | prefix=['hr', 'vid', 'rid', 'pt'], 
drop_first=True) 43 | else: 44 | print("Filter columns {}".format(keep_col)) 45 | out_tlc_data = out_tlc_data[keep_col + ['lon', 'lat']] 46 | print("make categorical features one-hot") 47 | dummy_col, dummy_prefix = [], [] 48 | col_prefix = { 49 | 'hour': 'hr', 50 | 'VendorID': 'vid', 51 | 'RatecodeID': 'rid', 52 | 'payment_type': 'pt' 53 | } 54 | for col, prefix in col_prefix.items(): 55 | if col in out_tlc_data.columns: 56 | dummy_col.append(col) 57 | dummy_prefix.append(prefix) 58 | out_tlc_data = pd.get_dummies(out_tlc_data, columns=dummy_col, prefix=dummy_prefix, drop_first=True) 59 | 60 | print("sampling from dataset") 61 | if sample_n is not None: 62 | out_tlc_data = out_tlc_data.sample(n=sample_n, random_state=0) 63 | 64 | print("Saving cleaned dataset to {}".format(out_tlc_data_path)) 65 | out_tlc_data.to_csv(out_tlc_data_path, index=False) 66 | print("Saved {} samples to file".format(len(out_tlc_data.index))) 67 | 68 | 69 | def clean_tlc_for_bike(tlc_ori_data_path, out_tlc_data_path, sample_n=None): 70 | print("Reading from {}".format(tlc_ori_data_path)) 71 | tlc_ori_data = pd.read_csv(tlc_ori_data_path, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime']) 72 | 73 | print("Drop values that are not reasonable") 74 | tlc_ori_data.dropna(inplace=True) 75 | tlc_ori_data = tlc_ori_data[tlc_ori_data['trip_distance'] > 0] 76 | tlc_ori_data = tlc_ori_data[tlc_ori_data['trip_distance'] < 10] 77 | 78 | print("get duration of the trip") 79 | tlc_ori_data['taxi_duration'] = (tlc_ori_data['tpep_dropoff_datetime'] 80 | - tlc_ori_data['tpep_pickup_datetime']).astype('timedelta64[s]') 81 | tlc_ori_data = tlc_ori_data[tlc_ori_data['taxi_duration'] > 0] 82 | tlc_ori_data = tlc_ori_data[tlc_ori_data['taxi_duration'] < 10000] 83 | 84 | print("get pick-up and drop-off hour") 85 | tlc_ori_data['start_hour'] = tlc_ori_data['tpep_pickup_datetime'].dt.hour 86 | tlc_ori_data['end_hour'] = tlc_ori_data['tpep_dropoff_datetime'].dt.hour 87 | 88 | print("drop specific time information") 89 | tlc_ori_data.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], inplace=True) 90 | 91 | print("divide pickup and dropoff dataset") 92 | tlc_ori_data.rename(columns={'pickup_longitude': 'start_lon', 93 | 'pickup_latitude': 'start_lat', 94 | 'dropoff_longitude': 'end_lon', 95 | 'dropoff_latitude': 'end_lat'}, inplace=True) 96 | 97 | print("Drop useless features") 98 | out_tlc_data = tlc_ori_data[['start_lon', 'start_lat', 'end_lon', 'end_lat', 99 | 'start_hour', 'end_hour', 'trip_distance', 'taxi_duration']] 100 | 101 | print("sampling from dataset") 102 | if sample_n is not None: 103 | out_tlc_data = out_tlc_data.sample(n=sample_n, random_state=0) 104 | 105 | print("Saving cleaned dataset to {}".format(out_tlc_data_path)) 106 | out_tlc_data.to_pickle(out_tlc_data_path) 107 | print("Saved {} samples to file".format(len(out_tlc_data.index))) 108 | 109 | 110 | if __name__ == '__main__': 111 | os.chdir(sys.path[0] + "/../../../data/nytaxi") # change working directory 112 | # clean_tlc("yellow_tripdata_2016-06.csv", "taxi_201606_clean.csv", sample_n=None) 113 | # clean_tlc_for_airbnb("yellow_tripdata_2016-06.csv", "taxi_201606_clean_sample_1e6.csv", 114 | # sample_n=1000000, keep_col=['RatecodeID', 'tip_amount']) 115 | clean_tlc_for_bike("yellow_tripdata_2016-06.csv", "taxi_201606_clean_sample_1e5.pkl", 116 | sample_n=100000) 117 | -------------------------------------------------------------------------------- /src/preprocess/nytaxi/filter_kaggle.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | 6 | 7 | def filter_kaggle(kaggle_train_path, kaggle_out_path): 8 | print("Start filtering") 9 | 10 | print("Loading training data from csv files") 11 | kaggle_train = pd.read_csv(kaggle_train_path, index_col=0, parse_dates=['key']) 12 | 13 | print("Filtering data") 14 | filtered_train = kaggle_train.loc['2009-01-01': '2009-01-31'] 15 | print("Finished filtering training set, got {} samples".format(len(filtered_train.index))) 16 | 17 | print("Saving the filtered data") 18 | filtered_train.to_csv(kaggle_out_path) 19 | print("Done") 20 | 21 | 22 | if __name__ == '__main__': 23 | os.chdir(sys.path[0] + "/../../../data/nytaxi") # change working directory 24 | filter_kaggle("kaggle_train_ori.csv", "kaggle_data.csv") 25 | -------------------------------------------------------------------------------- /src/preprocess/nytaxi/ny_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from utils import move_item_to_start_, move_item_to_end_ 4 | 5 | 6 | class NYAirbnbTaxiLoader: 7 | def __init__(self, airbnb_path, taxi_path=None, link=False): 8 | print("Loading airbnb from {}".format(airbnb_path)) 9 | self.airbnb_data = pd.read_csv(airbnb_path) 10 | print("Loaded.") 11 | if taxi_path is not None: 12 | print("Loading taxi from {}".format(taxi_path)) 13 | self.taxi_data = pd.read_csv(taxi_path) 14 | print("Loaded.") 15 | 16 | if link: 17 | self.labels = self.airbnb_data['price'].to_numpy() 18 | self.airbnb_data.drop(columns=['price'], inplace=True) 19 | 20 | # move lon and lat to end of airbnb 21 | ab_cols = list(self.airbnb_data) 22 | ab_cols.insert(len(ab_cols), ab_cols.pop(ab_cols.index('longitude'))) 23 | ab_cols.insert(len(ab_cols), ab_cols.pop(ab_cols.index('latitude'))) 24 | self.airbnb_data = self.airbnb_data[ab_cols] 25 | print("Current airbnb columns: " + str(list(self.airbnb_data))) 26 | self.airbnb_data = self.airbnb_data.to_numpy() 27 | 28 | # move lon and lat to the front of taxi 29 | tx_cols = list(self.taxi_data) 30 | tx_cols.insert(0, tx_cols.pop(tx_cols.index('lat'))) 31 | tx_cols.insert(0, tx_cols.pop(tx_cols.index('lon'))) 32 | self.taxi_data = self.taxi_data[tx_cols] 33 | print("Current taxi columns: " + str(list(self.taxi_data))) 34 | self.taxi_data = self.taxi_data.to_numpy() 35 | else: 36 | self.airbnb_data.drop(columns=['longitude', 'latitude'], inplace=True) 37 | self.labels = self.airbnb_data['price'].to_numpy() 38 | self.airbnb_data = self.airbnb_data.drop(columns=['price']).to_numpy() 39 | 40 | def load_single(self): 41 | return self.airbnb_data, self.labels 42 | 43 | def load_parties(self): 44 | return [self.airbnb_data, self.taxi_data], self.labels 45 | 46 | 47 | class NYBikeTaxiLoader: 48 | def __init__(self, bike_path, taxi_path=None, link=False): 49 | print("Loading bike from {}".format(bike_path)) 50 | self.bike_data = pd.read_pickle(bike_path) 51 | # self.bike_data = self.bike_data.head(10000) 52 | # print("Remove N/A from bike") 53 | # self.bike_data.dropna() 54 | print("Loaded.") 55 | if taxi_path is not None: 56 | print("Loading taxi from {}".format(taxi_path)) 57 | self.taxi_data = pd.read_pickle(taxi_path) 58 | print("Loaded.") 59 | 60 | if link: 61 | self.labels = self.bike_data['tripduration'].to_numpy() 62 | self.bike_data.drop(columns=['tripduration'], inplace=True) 63 | 64 | # move lon and lat to end of airbnb 65 | bike_cols = list(self.bike_data) 66 
| move_item_to_end_(bike_cols, ['start_lon', 'start_lat', 'end_lon', 'end_lat', 67 | 'start_hour', 'end_hour']) 68 | self.bike_data = self.bike_data[bike_cols] 69 | self.bike_data.drop(columns=['start_hour', 'end_hour'], inplace=True) 70 | print("Current bike columns: " + str(list(self.bike_data))) 71 | self.bike_data = self.bike_data.to_numpy() 72 | 73 | # move lon and lat to the front of taxi 74 | tx_cols = list(self.taxi_data) 75 | move_item_to_start_(tx_cols, ['start_lon', 'start_lat', 'end_lon', 'end_lat', 76 | 'start_hour', 'end_hour']) 77 | self.taxi_data = self.taxi_data[tx_cols] 78 | self.taxi_data.drop(columns=['start_hour', 'end_hour'], inplace=True) 79 | print("Current taxi columns: " + str(list(self.taxi_data))) 80 | self.taxi_data = self.taxi_data.to_numpy() 81 | else: 82 | print("Remove columns that are used for linkage") 83 | self.bike_data.drop(columns=['start_lon', 'start_lat', 'end_lon', 'end_lat', 84 | 'start_hour', 'end_hour'], inplace=True) 85 | print('Extract labels') 86 | self.labels = self.bike_data['tripduration'].to_numpy() 87 | print("Extract data") 88 | self.bike_data = self.bike_data.drop(columns=['tripduration']).to_numpy() 89 | 90 | def load_single(self): 91 | return self.bike_data, self.labels 92 | 93 | def load_parties(self): 94 | return [self.bike_data, self.taxi_data], self.labels 95 | -------------------------------------------------------------------------------- /src/preprocess/sklearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/sklearn/__init__.py -------------------------------------------------------------------------------- /src/preprocess/sklearn/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/sklearn/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/sklearn/__pycache__/syn_data_generator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/sklearn/__pycache__/syn_data_generator.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/song/__init__.py: -------------------------------------------------------------------------------- 1 | from .song_loader import load_both, load_msd, load_fma -------------------------------------------------------------------------------- /src/preprocess/song/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/song/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/preprocess/song/__pycache__/song_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/preprocess/song/__pycache__/song_loader.cpython-38.pyc -------------------------------------------------------------------------------- 
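A minimal sketch of how the two loaders above are meant to be used. The file names are the outputs produced by clean_citibike.py and clean_tlc.py above; the working directory is assumed to be data/nytaxi and the import path assumes src/ is on sys.path.

from preprocess.nytaxi.ny_loader import NYBikeTaxiLoader

# single-party: linkage columns are dropped, labels are trip durations
bike_only = NYBikeTaxiLoader(bike_path="bike_201606_clean_sample_2e5.pkl")
X, y = bike_only.load_single()

# two-party: bike data keeps its linkage columns (coordinates) at the end,
# taxi data keeps them at the front, so a later blocking step can align the two tables
linked = NYBikeTaxiLoader(bike_path="bike_201606_clean_sample_2e5.pkl",
                          taxi_path="taxi_201606_clean_sample_1e5.pkl",
                          link=True)
[bike_X, taxi_X], y = linked.load_parties()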
/src/preprocess/song/clean_fma.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import re 5 | 6 | import pandas as pd 7 | 8 | 9 | def clean_fma(fma_path, out_clean_fma_path): 10 | fma_titles = [] 11 | fma_data = [] 12 | fma_labels = [] 13 | print("Reformatting fma data") 14 | with open(fma_path, 'rb') as f: 15 | fma_data_labels = pickle.load(f) 16 | for title, datum, label in fma_data_labels: 17 | title = re.sub(r'\W', '', title) 18 | if len(title) > 0: 19 | fma_titles.append(title.lower()) 20 | fma_data.append(datum) 21 | fma_labels.append(label) 22 | fma_df = pd.DataFrame(fma_data) 23 | fma_df['title'] = fma_titles 24 | fma_df['label'] = fma_labels 25 | 26 | # remove duplicate titles 27 | fma_df.set_index('title', inplace=True) 28 | fma_df = fma_df[~fma_df.index.duplicated(keep="first")] 29 | 30 | print("Saving to {}".format(out_clean_fma_path)) 31 | fma_df.to_csv(out_clean_fma_path) 32 | print("Done") 33 | 34 | 35 | if __name__ == '__main__': 36 | os.chdir(sys.path[0] + "/../../../data/song") # change working directory 37 | clean_fma("fma.pkl", "fma_clean.csv") 38 |
-------------------------------------------------------------------------------- /src/preprocess/song/clean_msd.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import re 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from tqdm import tqdm 11 | import h5py 12 | import deprecation 13 | 14 | 15 | def generate_msd(million_song_dir, tracks_per_year_path, out_msd_path): 16 | num_songs = 0 17 | for _ in Path(million_song_dir).rglob('*.h5'): 18 | num_songs += 1 19 | print("There are {} songs in the dataset".format(num_songs)) 20 | 21 | id_to_title_artist = {} 22 | with open(tracks_per_year_path, 'r') as f: 23 | for i, row in enumerate(f): 24 | year, track_id, artist, title = row.split('<SEP>')  # fields in tracks_per_year.txt are delimited by '<SEP>' 25 | assert track_id not in id_to_title_artist # id should not be duplicated 26 | id_to_title_artist[track_id] = [title, artist, year] 27 | 28 | timbre_vec_all = np.zeros([0, 90]) # 12 average features and 78 covariance features 29 | track_info_all = [] 30 | for path in tqdm(Path(million_song_dir).rglob('*.h5'), total=num_songs): 31 | with h5py.File(path, 'r') as f: 32 | # read track id 33 | assert len(f['analysis']['songs']) == 1, "More than 1 song in a h5 file" 34 | track_id = f['analysis']['songs'][0]['track_id'].decode('utf-8') 35 | 36 | if track_id in id_to_title_artist: 37 | # get title, artist and year from id_to_title_artist 38 | track_info = id_to_title_artist[track_id] 39 | track_info_all.append([track_id] + track_info) 40 | 41 | # read timbre information similarly to YearPredictionMSD.txt in UCI 42 | timbre_matrix = f['analysis']['segments_timbre'] 43 | timbre_avg = np.average(timbre_matrix, axis=0) # 12 timbre averages 44 | timbre_cov_matrix = np.cov(timbre_matrix, rowvar=False) # 12 x 12 covariance matrix 45 | timbre_cov = timbre_cov_matrix[np.triu_indices(timbre_cov_matrix.shape[0])] # flatten upper triangle 46 | timbre_vec = np.concatenate([timbre_avg, timbre_cov]).reshape(1, -1) 47 | timbre_vec_all = np.concatenate([timbre_vec_all, timbre_vec], axis=0) 48 | 49 | print("Finished.
Got {} tracks".format(len(track_info_all))) 50 | 51 | msd_df = pd.DataFrame(track_info_all, columns=['track_id', 'title', 'artist', 'year']) 52 | msd_df = pd.concat([msd_df, pd.DataFrame(timbre_vec_all)], axis=1) 53 | 54 | print("Saving to file") 55 | msd_df.to_csv(out_msd_path, index=False) 56 | print("Saved to {}".format(out_msd_path)) 57 | 58 | 59 | @deprecation.deprecated() 60 | def __clean_msd(msd_path, out_clean_msd_path): 61 | msd_titles = [] 62 | msd_data = [] 63 | msd_labels = [] 64 | print("Reformatting msd data") 65 | with open(msd_path, 'rb') as f: 66 | msd_data_labels = pickle.load(f) 67 | for title, datum, label in msd_data_labels: 68 | # title = "".join(title.split()) # remove all whitespaces 69 | title = re.sub(r'\W', '', title) 70 | if len(title) > 0: 71 | msd_titles.append(title.lower()) 72 | msd_data.append(datum) 73 | msd_labels.append(label) 74 | msd_df = pd.DataFrame(msd_data) 75 | msd_df['title'] = msd_titles 76 | msd_df['label'] = msd_labels 77 | 78 | # remove duplicate titles 79 | msd_df.set_index('title', inplace=True) 80 | msd_df = msd_df[~msd_df.index.duplicated(keep="first")] 81 | 82 | # filter out extreme years 83 | msd_df = msd_df[msd_df['label'] > 1970] 84 | 85 | print("Saving to {}".format(out_clean_msd_path)) 86 | msd_df.to_csv(out_clean_msd_path) 87 | print("Done") 88 | 89 | 90 | def clean_msd(msd_path, out_clean_msd_path): 91 | print("Loading from {}".format(msd_path)) 92 | msd_df = pd.read_csv(msd_path) 93 | print("Loaded {} tracks".format(len(msd_df.index))) 94 | 95 | print("Encode titles") 96 | msd_df['title'] = msd_df['title'].apply(lambda s: re.sub(r'\W', '', s).lower()) 97 | msd_df = msd_df[msd_df['title'].str.len() > 0] 98 | print("Done") 99 | 100 | print("Removing duplicated titles") 101 | # remove duplicate titles 102 | msd_df.set_index('title', inplace=True) 103 | msd_df = msd_df[~msd_df.index.duplicated(keep="first")] 104 | print("Done. 
Got {} tracks".format(len(msd_df.index))) 105 | 106 | print("Saving to {}".format(out_clean_msd_path)) 107 | msd_df.to_csv(out_clean_msd_path) 108 | print("Done") 109 | 110 | 111 | if __name__ == '__main__': 112 | os.chdir(sys.path[0] + "/../../../data/song") # change working directory 113 | 114 | generate_msd("MillionSong", "tracks_per_year.txt", "msd_full.csv") 115 | clean_msd("msd_full.csv", "msd_clean.csv") 116 | -------------------------------------------------------------------------------- /src/preprocess/song/song_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | from utils import move_item_to_start_, move_item_to_end_ 6 | 7 | 8 | def load_msd(msd_path): 9 | print("Loading MSD from {}".format(msd_path)) 10 | msd_df = pd.read_csv(msd_path) 11 | msd_df.drop(columns=['track_id', 'title', 'artist'], inplace=True) 12 | 13 | # msd_df = msd_df[msd_df['year'] > 1960] 14 | msd_df.info(verbose=True) 15 | 16 | labels = msd_df['year'].to_numpy() 17 | msd_data = msd_df.drop(columns=['year']).to_numpy() 18 | 19 | return msd_data, labels 20 | 21 | 22 | def load_fma(fma_path): 23 | print("Loading FMA from {}".format(fma_path)) 24 | fma_df = pd.read_csv(fma_path) 25 | fma_df.drop(columns=['title'], inplace=True) 26 | 27 | fma_df.info(verbose=True) 28 | 29 | labels = fma_df['label'].to_numpy() 30 | fma_data = fma_df.drop(columns=['label']).to_numpy() 31 | 32 | return fma_data, labels 33 | 34 | 35 | def load_both(msd_path, fma_path, host_party='msd'): 36 | if host_party == 'msd': 37 | print("Loading MSD from {}".format(msd_path)) 38 | msd_df = pd.read_csv(msd_path) 39 | 40 | print("Loading FMA from {}".format(fma_path)) 41 | fma_df = pd.read_csv(fma_path) 42 | 43 | msd_df.drop(columns=['track_id', 'artist'], inplace=True) 44 | 45 | labels = msd_df['year'].to_numpy() 46 | msd_df.drop(columns=['year'], inplace=True) 47 | fma_df.drop(columns=['label'], inplace=True) 48 | 49 | msd_cols = list(msd_df.columns) 50 | move_item_to_end_(msd_cols, ['title']) 51 | msd_df = msd_df[msd_cols] 52 | print("Current MSD columns {}".format(msd_df.columns)) 53 | 54 | fma_cols = list(fma_df.columns) 55 | move_item_to_start_(fma_cols, ['title']) 56 | fma_df = fma_df[fma_cols] 57 | print("Current FMA columns {}".format(fma_df.columns)) 58 | 59 | data1 = msd_df.to_numpy() 60 | data2 = fma_df.to_numpy() 61 | elif host_party == 'fma': 62 | print("Loading MSD from {}".format(msd_path)) 63 | msd_df = pd.read_csv(msd_path) 64 | 65 | print("Loading FMA from {}".format(fma_path)) 66 | fma_df = pd.read_csv(fma_path) 67 | 68 | msd_df.drop(columns=['track_id', 'artist', 'year'], inplace=True) 69 | labels = fma_df['label'].to_numpy() 70 | fma_df.drop(columns=['label'], inplace=True) 71 | 72 | msd_cols = list(msd_df.columns) 73 | move_item_to_start_(msd_cols, ['title']) 74 | msd_df = msd_df[msd_cols] 75 | print("Current MSD columns {}".format(msd_df.columns)) 76 | 77 | fma_cols = list(fma_df.columns) 78 | move_item_to_end_(fma_cols, ['title']) 79 | fma_df = fma_df[fma_cols] 80 | print("Current FMA columns {}".format(fma_df.columns)) 81 | 82 | data1 = fma_df.to_numpy() 83 | data2 = msd_df.to_numpy() 84 | else: 85 | assert False 86 | 87 | return [data1, data2], labels 88 | 89 | 90 | if __name__ == '__main__': 91 | os.chdir(sys.path[0] + "/../../../data/song") # change working directory 92 | fma_df = pd.read_csv("fma_clean.csv") 93 | msd_df = pd.read_csv("msd_clean.csv") 94 | 95 | merge_df = fma_df.merge(msd_df, how='inner', on='title') 
96 | 97 | -------------------------------------------------------------------------------- /src/priv_scripts/train_beijing_A.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 7 | from sklearn.linear_model import LinearRegression 8 | import sklearn.metrics as metrics 9 | import numpy as np 10 | 11 | from model.vertical_fl.OnePartyModel import OnePartyModel 12 | from preprocess.beijing import load_house 13 | 14 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 15 | 16 | os.chdir(sys.path[0] + "/../../") # change working directory 17 | 18 | root = "data/beijing/" 19 | dataset = "house_clean.csv" 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 23 | parser.add_argument('-g', '--gpu', type=int, default=3) 24 | args = parser.parse_args() 25 | 26 | X, y = load_house(root + dataset) 27 | print("X got {} dimensions".format(X.shape[1])) 28 | name = "beijing_house" 29 | # reg = LinearRegression().fit(X, y) 30 | # score = np.sqrt(metrics.mean_squared_error(reg.predict(X), y)) 31 | # print(score) 32 | 33 | model = OnePartyModel(model_name=name + "_" + now_string, 34 | task='regression', 35 | metrics=['r2_score', 'rmse'], 36 | n_classes=2, 37 | val_rate=0.1, 38 | test_rate=0.2, 39 | device='cuda:{}'.format(args.gpu), 40 | hidden_sizes=[400, 200], 41 | train_batch_size=4096, 42 | test_batch_size=4096, 43 | num_epochs=100, 44 | learning_rate=3e-3, 45 | weight_decay=1e-5, 46 | num_workers=4 if sys.gettrace() is None else 0, 47 | use_scheduler=False, 48 | sche_factor=0.1, 49 | sche_patience=10, 50 | sche_threshold=0.0001, 51 | writer_path="runs/{}_{}".format(name, now_string), 52 | model_save_path="ckp/{}_{}.pth".format(name, now_string) 53 | ) 54 | model.train_single(X, y, scale=True) 55 | -------------------------------------------------------------------------------- /src/priv_scripts/train_beijing_avgsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | os.chdir(sys.path[0] + "/../../") # change working directory 7 | sys.path.append(os.path.join(os.getcwd(), "src")) 8 | 9 | from model.vertical_fl.MergeSimModel import MergeSimModel 10 | from preprocess.beijing import load_both 11 | 12 | 13 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 14 | root = "data/beijing/" 15 | house_dataset = root + "house_clean.csv" 16 | airbnb_dataset = root + "airbnb_clean.csv" 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 20 | parser.add_argument('-g', '--gpu', type=int, default=0) 21 | args = parser.parse_args() 22 | 23 | num_common_features = 2 24 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 25 | name = "beijing_avgsim_p_{:.0E}".format(args.leak_p) 26 | 27 | 28 | model = MergeSimModel(num_common_features=num_common_features, 29 | sim_hidden_sizes=[10, 10], 30 | merge_mode='avg', 31 | feature_wise_sim=False, 32 | task='regression', 33 | metrics=['r2_score', 'rmse'], 34 | dataset_type='real', 35 | blocking_method='knn_priv_float', 36 | n_classes=2, 37 | grid_min=-10.0, 38 | grid_max=10.0, 39 | grid_width=1.5, 40 | knn_k=100, 41 | kd_tree_radius=1e-2, 42 | tree_leaf_size=1000, 43 | 
model_name=name + "_" + now_string, 44 | val_rate=0.1, 45 | test_rate=0.2, 46 | drop_key=True, 47 | device='cuda:{}'.format(args.gpu), 48 | hidden_sizes=[200, 100], 49 | train_batch_size=128, 50 | test_batch_size=1024 * 4, 51 | num_epochs=100, 52 | learning_rate=3e-3, 53 | weight_decay=1e-5, 54 | sim_learning_rate=3e-3, 55 | sim_weight_decay=1e-5, 56 | sim_batch_size=4096, 57 | update_sim_freq=1, 58 | num_workers=4 if sys.gettrace() is None else 0, 59 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 60 | writer_path="runs/{}_{}".format(name, now_string), 61 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 62 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 63 | log_dir="log/{}_{}/".format(name, now_string), 64 | # SplitNN parameters 65 | local_hidden_sizes=[[200], [200]], 66 | agg_hidden_sizes=[100], 67 | cut_dims=[100, 100], 68 | 69 | # private link parameters 70 | link_epsilon=3e-2, 71 | link_delta=3e-2, 72 | link_threshold_t=1e-2, 73 | sim_leak_p=args.leak_p, 74 | link_n_jobs=-1, 75 | ) 76 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim_p_base.pkl".format(name), scale=True) 77 | 78 | -------------------------------------------------------------------------------- /src/priv_scripts/train_beijing_featuresim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | os.chdir(sys.path[0] + "/../../") # change working directory 7 | sys.path.append(os.path.join(os.getcwd(), "src")) 8 | 9 | from model.vertical_fl.FeatureSimModel import FeatureSimModel 10 | from preprocess.beijing import load_both 11 | 12 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 13 | root = "data/beijing/" 14 | house_dataset = root + "house_clean.csv" 15 | airbnb_dataset = root + "airbnb_clean.csv" 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 19 | parser.add_argument('-g', '--gpu', type=int, default=0) 20 | args = parser.parse_args() 21 | 22 | num_common_features = 2 23 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 24 | name = "beijing_featuresim_p_{:.0E}".format(args.leak_p) 25 | 26 | model = FeatureSimModel(num_common_features=num_common_features, 27 | feature_wise_sim=False, 28 | task='regression', 29 | metrics=['r2_score', 'rmse'], 30 | dataset_type='real', 31 | blocking_method='knn_priv_float', 32 | n_classes=2, 33 | grid_min=-10.0, 34 | grid_max=10.0, 35 | grid_width=1.5, 36 | knn_k=100, 37 | kd_tree_radius=1e-2, 38 | tree_leaf_size=1000, 39 | model_name=name + "_" + now_string, 40 | val_rate=0.1, 41 | test_rate=0.2, 42 | drop_key=True, 43 | device='cuda:{}'.format(args.gpu), 44 | hidden_sizes=[200, 100], 45 | train_batch_size=128, 46 | test_batch_size=1024 * 4, 47 | num_epochs=100, 48 | learning_rate=3e-3, 49 | weight_decay=1e-5, 50 | num_workers=4 if sys.gettrace() is None else 0, 51 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 52 | writer_path="runs/{}_{}".format(name, now_string), 53 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 54 | # SplitNN parameters 55 | local_hidden_sizes=[[200], [200]], 56 | agg_hidden_sizes=[100], 57 | cut_dims=[100, 100], 58 | 59 | # private link parameters 60 | link_epsilon=3e-2, 61 | link_delta=3e-2, 62 | link_threshold_t=1e-2, 63 | sim_leak_p=args.leak_p, 64 | link_n_jobs=-1, 65 | ) 66 | 
model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim_p_base.pkl".format(name), scale=True) 67 | # model.train_splitnn(X1, X2, y, scale=True) 68 | -------------------------------------------------------------------------------- /src/priv_scripts/train_beijing_fedsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | os.chdir(sys.path[0] + "/../../") # change working directory 7 | sys.path.append(os.path.join(os.getcwd(), "src")) 8 | 9 | from model.vertical_fl.FedSimModel import FedSimModel 10 | from preprocess.beijing import load_both 11 | 12 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 13 | 14 | root = "data/beijing/" 15 | house_dataset = root + "house_clean.csv" 16 | airbnb_dataset = root + "airbnb_clean.csv" 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('-p', '--leak-p', type=float, default=1) 20 | parser.add_argument('-g', '--gpu', type=int, default=0) 21 | parser.add_argument('-k', '--top-k', type=int, default=20) 22 | args = parser.parse_args() 23 | 24 | num_common_features = 2 25 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 26 | name = "beijing_fedsim_p_{:.0E}".format(args.leak_p) 27 | 28 | model = FedSimModel(num_common_features=num_common_features, 29 | raw_output_dim=10, 30 | feature_wise_sim=False, 31 | task='regression', 32 | metrics=['r2_score', 'rmse'], 33 | dataset_type='real', 34 | blocking_method='knn_priv_float', 35 | n_classes=2, 36 | grid_min=(115.5, 39), 37 | grid_max=(116.5, 40), 38 | grid_width=(0.1, 0.1), 39 | knn_k=100, 40 | filter_top_k=args.top_k, 41 | kd_tree_radius=1e-2, 42 | tree_leaf_size=100, 43 | model_name=name + "_" + now_string, 44 | val_rate=0.1, 45 | test_rate=0.2, 46 | drop_key=True, 47 | device='cuda:{}'.format(args.gpu), 48 | hidden_sizes=[200, 100], 49 | train_batch_size=128, 50 | test_batch_size=1024 * 4, 51 | num_epochs=100, 52 | learning_rate=3e-3, 53 | weight_decay=1e-5, 54 | update_sim_freq=1, 55 | num_workers=4 if sys.gettrace() is None else 0, 56 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 57 | writer_path="runs/{}_{}".format(name, now_string), 58 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 59 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 60 | log_dir="log/{}_{}/".format(name, now_string), 61 | # SplitNN parameters 62 | local_hidden_sizes=[[200], [200]], 63 | agg_hidden_sizes=[100], 64 | cut_dims=[100, 100], 65 | 66 | # fedsim parameters 67 | use_conv=True, 68 | merge_hidden_sizes=[400], 69 | sim_hidden_sizes=[10], 70 | merge_model_save_path="ckp/{}_{}_merge.pth".format(name, now_string), 71 | merge_dropout_p=0.8, 72 | conv_n_channels=8, 73 | conv_kernel_v_size=7, 74 | 75 | # private link parameters 76 | link_epsilon=3e-2, 77 | link_delta=3e-2, 78 | link_threshold_t=1e-2, 79 | sim_leak_p=args.leak_p, 80 | link_n_jobs=-1, 81 | ) 82 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim_p_base.pkl".format(args.leak_p), scale=True) 83 | # model.train_splitnn(X1, X2, y, scale=True) 84 | -------------------------------------------------------------------------------- /src/priv_scripts/train_beijing_top1sim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.Top1SimModel import 
Top1SimModel 7 | from preprocess.beijing import load_both 8 | 9 | os.chdir(sys.path[0] + "/../../") # change working directory 10 | sys.path.append(os.path.join(os.getcwd(), "src")) 11 | 12 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 13 | root = "data/beijing/" 14 | house_dataset = root + "house_clean.csv" 15 | airbnb_dataset = root + "airbnb_clean.csv" 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 19 | parser.add_argument('-g', '--gpu', type=int, default=0) 20 | args = parser.parse_args() 21 | 22 | num_common_features = 2 23 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 24 | name = "beijing_top1sim_p_{:.0E}".format(args.leak_p) 25 | 26 | model = Top1SimModel(num_common_features=num_common_features, 27 | task='regression', 28 | dataset_type='real', 29 | blocking_method='knn_priv_float', 30 | metrics=['r2_score', 'rmse'], 31 | n_classes=2, 32 | grid_min=-10.0, 33 | grid_max=10.0, 34 | grid_width=1.5, 35 | knn_k=100, 36 | kd_tree_radius=0.01, 37 | tree_leaf_size=1000, 38 | model_name=name + "_" + now_string, 39 | val_rate=0.1, 40 | test_rate=0.2, 41 | drop_key=True, 42 | device='cuda:{}'.format(args.gpu), 43 | hidden_sizes=[200, 100], 44 | train_batch_size=1024 * 4, 45 | test_batch_size=1024 * 4, 46 | num_epochs=50, 47 | learning_rate=3e-3, 48 | weight_decay=1e-5, 49 | num_workers=4 if sys.gettrace() is None else 0, 50 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 51 | writer_path="runs/{}_{}".format(name, now_string), 52 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 53 | # SplitNN parameters 54 | local_hidden_sizes=[[200], [200]], 55 | agg_hidden_sizes=[100], 56 | cut_dims=[100, 100], 57 | 58 | # private link parameters 59 | link_epsilon=3e-2, 60 | link_delta=3e-2, 61 | link_threshold_t=1e-2, 62 | sim_leak_p=args.leak_p, 63 | link_n_jobs=-1, 64 | ) 65 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim_p_base.pkl".format(name), scale=True) 66 | # model.train_splitnn(X1, X2, y, scale=True) 67 | -------------------------------------------------------------------------------- /src/priv_scripts/train_hdb_A.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | os.chdir(sys.path[0] + "/../../") # change working directory 7 | sys.path.append(os.path.join(os.getcwd(), "src")) 8 | 9 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 10 | from sklearn.linear_model import LinearRegression 11 | import sklearn.metrics as metrics 12 | import numpy as np 13 | 14 | from model.vertical_fl.OnePartyModel import OnePartyModel 15 | from preprocess.hdb import load_hdb 16 | 17 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 18 | 19 | root = "data/hdb/" 20 | dataset = "hdb_clean.csv" 21 | 22 | X, y = load_hdb(root + dataset) 23 | print("X got {} dimensions".format(X.shape[1])) 24 | name = "hdb_A" 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('-g', '--gpu', type=int, default=0) 28 | args = parser.parse_args() 29 | 30 | model = OnePartyModel(model_name=name + "_" + now_string, 31 | task='regression', 32 | metrics=['r2_score', 'rmse'], 33 | n_classes=2, 34 | val_rate=0.1, 35 | test_rate=0.2, 36 | device='cuda:{}'.format(args.gpu), 37 | hidden_sizes=[400, 200], 38 | train_batch_size=4096, 39 | test_batch_size=4096, 40 | 
num_epochs=200, 41 | learning_rate=1e-2, 42 | weight_decay=1e-5, 43 | num_workers=4 if sys.gettrace() is None else 0, 44 | use_scheduler=False, 45 | sche_factor=0.1, 46 | sche_patience=10, 47 | sche_threshold=0.0001, 48 | writer_path="runs/{}_{}".format(name, now_string), 49 | model_save_path="ckp/{}_{}.pth".format(name, now_string) 50 | ) 51 | model.train_single(X, y, scale=True) 52 | -------------------------------------------------------------------------------- /src/priv_scripts/train_hdb_avgsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | os.chdir(sys.path[0] + "/../../") # change working directory 7 | sys.path.append(os.path.join(os.getcwd(), "src")) 8 | 9 | from model.vertical_fl.MergeSimModel import MergeSimModel 10 | from preprocess.hdb import load_both 11 | 12 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 13 | root = "data/hdb/" 14 | hdb_dataset = root + "hdb_clean.csv" 15 | school_dataset = root + "school_clean.csv" 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 19 | parser.add_argument('-g', '--gpu', type=int, default=0) 20 | args = parser.parse_args() 21 | 22 | num_common_features = 2 23 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 24 | name = "hdb_avgsim" 25 | 26 | model = MergeSimModel(num_common_features=num_common_features, 27 | sim_hidden_sizes=[10, 10], 28 | merge_mode='avg', 29 | feature_wise_sim=False, 30 | task='regression', 31 | metrics=['r2_score', 'rmse'], 32 | dataset_type='real', 33 | blocking_method='knn_priv_float', 34 | n_classes=2, 35 | grid_min=-10.0, 36 | grid_max=10.0, 37 | grid_width=1.5, 38 | knn_k=50, 39 | kd_tree_radius=1e-2, 40 | tree_leaf_size=1000, 41 | model_name=name + "_" + now_string, 42 | val_rate=0.1, 43 | test_rate=0.2, 44 | drop_key=True, 45 | device='cuda:{}'.format(args.gpu), 46 | hidden_sizes=[200, 100], 47 | train_batch_size=128, 48 | test_batch_size=1024 * 4, 49 | num_epochs=100, 50 | learning_rate=1e-3, 51 | weight_decay=1e-5, 52 | sim_learning_rate=1e-3, 53 | sim_weight_decay=1e-5, 54 | sim_batch_size=4096, 55 | update_sim_freq=1, 56 | num_workers=4 if sys.gettrace() is None else 0, 57 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 58 | writer_path="runs/{}_{}".format(name, now_string), 59 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 60 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 61 | log_dir="log/{}_{}/".format(name, now_string), 62 | # SplitNN parameters 63 | local_hidden_sizes=[[200], [200]], 64 | agg_hidden_sizes=[400], 65 | cut_dims=[100, 100], 66 | 67 | # private link parameters 68 | link_epsilon=5e-3, 69 | link_delta=5e-3, 70 | link_threshold_t=1e-2, 71 | sim_leak_p=args.leak_p, 72 | link_n_jobs=-1, 73 | ) 74 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim_p_base.pkl".format(name), scale=True) 75 | -------------------------------------------------------------------------------- /src/priv_scripts/train_hdb_featuresim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import numpy as np 6 | 7 | 8 | os.chdir(sys.path[0] + "/../../") # change working directory 9 | sys.path.append(os.path.join(os.getcwd(), "src")) 10 | 11 | from model.vertical_fl.FeatureSimModel import 
FeatureSimModel 12 | from preprocess.hdb import load_both 13 | 14 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 15 | os.chdir(sys.path[0] + "/../../") # change working directory 16 | root = "data/hdb/" 17 | hdb_dataset = root + "hdb_clean.csv" 18 | school_dataset = root + "school_clean.csv" 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 22 | parser.add_argument('-g', '--gpu', type=int, default=0) 23 | args = parser.parse_args() 24 | 25 | num_common_features = 2 26 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 27 | name = "hdb_featuresim" 28 | 29 | model = FeatureSimModel(num_common_features=num_common_features, 30 | feature_wise_sim=False, 31 | task='regression', 32 | metrics=['r2_score', 'rmse'], 33 | dataset_type='real', 34 | blocking_method='knn_priv_float', 35 | n_classes=2, 36 | grid_min=-10.0, 37 | grid_max=10.0, 38 | grid_width=1.5, 39 | knn_k=50, 40 | kd_tree_radius=1e-2, 41 | tree_leaf_size=1000, 42 | model_name=name + "_" + now_string, 43 | val_rate=0.1, 44 | test_rate=0.2, 45 | drop_key=True, 46 | device='cuda:{}'.format(args.gpu), 47 | hidden_sizes=[200, 100], 48 | train_batch_size=128, 49 | test_batch_size=1024 * 4, 50 | num_epochs=100, 51 | learning_rate=1e-3, 52 | weight_decay=1e-5, 53 | num_workers=4 if sys.gettrace() is None else 0, 54 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 55 | writer_path="runs/{}_{}".format(name, now_string), 56 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 57 | 58 | # SplitNN parameters 59 | local_hidden_sizes=[[200], [200]], 60 | agg_hidden_sizes=[100], 61 | cut_dims=[100, 100], 62 | 63 | # private link parameters 64 | link_epsilon=5e-3, 65 | link_delta=5e-3, 66 | link_threshold_t=1e-2, 67 | sim_leak_p=args.leak_p, 68 | link_n_jobs=-1, 69 | ) 70 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim_p_base.pkl".format(name), scale=True) 71 | # model.train_splitnn(X1, X2, y, scale=True) 72 | -------------------------------------------------------------------------------- /src/priv_scripts/train_hdb_fedsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import numpy as np 6 | 7 | os.chdir(sys.path[0] + "/../../") # change working directory 8 | sys.path.append(os.path.join(os.getcwd(), "src")) 9 | 10 | from model.vertical_fl.FedSimModel import FedSimModel 11 | from preprocess.hdb import load_both 12 | 13 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 14 | os.chdir(sys.path[0] + "/../../") # change working directory 15 | root = "data/hdb/" 16 | hdb_dataset = root + "hdb_clean.csv" 17 | school_dataset = root + "school_clean.csv" 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 21 | parser.add_argument('-g', '--gpu', type=int, default=0) 22 | args = parser.parse_args() 23 | 24 | num_common_features = 2 25 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 26 | name = "hdb_fedsim" 27 | 28 | model = FedSimModel(num_common_features=num_common_features, 29 | raw_output_dim=10, 30 | feature_wise_sim=False, 31 | task='regression', 32 | metrics=['r2_score', 'rmse'], 33 | dataset_type='real', 34 | blocking_method='knn_priv_float', 35 | n_classes=2, 36 | grid_min=-10.0, 37 | grid_max=10.0, 38 | grid_width=1.5, 39 | knn_k=50, 40 | 
kd_tree_radius=1e-2, 41 | tree_leaf_size=1000, 42 | model_name=name + "_" + now_string, 43 | val_rate=0.1, 44 | test_rate=0.2, 45 | drop_key=True, 46 | device='cuda:{}'.format(args.gpu), 47 | hidden_sizes=[200, 100], 48 | train_batch_size=128, 49 | test_batch_size=1024 * 4, 50 | num_epochs=100, 51 | learning_rate=1e-3, 52 | weight_decay=1e-4, 53 | update_sim_freq=1, 54 | num_workers=4 if sys.gettrace() is None else 0, 55 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 56 | writer_path="runs/{}_{}".format(name, now_string), 57 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 58 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 59 | log_dir="log/{}_{}/".format(name, now_string), 60 | # SplitNN parameters 61 | local_hidden_sizes=[[200], [200]], 62 | agg_hidden_sizes=[100], 63 | cut_dims=[100, 100], 64 | 65 | # fedsim parameters 66 | use_conv=True, 67 | merge_hidden_sizes=[400], 68 | sim_hidden_sizes=[10], 69 | merge_model_save_path="ckp/{}_{}_merge.pth".format(name, now_string), 70 | merge_dropout_p=0.2, 71 | conv_n_channels=8, 72 | conv_kernel_v_size=7, 73 | 74 | # private link parameters 75 | link_epsilon=5e-3, 76 | link_delta=5e-3, 77 | link_threshold_t=1e-2, 78 | sim_leak_p=args.leak_p, 79 | link_n_jobs=-1, 80 | ) 81 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim_p_base.pkl", scale=True) 82 | # model.train_splitnn(X1, X2, y, scale=True) 83 | -------------------------------------------------------------------------------- /src/priv_scripts/train_hdb_top1sim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | os.chdir(sys.path[0] + "/../../") # change working directory 7 | sys.path.append(os.path.join(os.getcwd(), "src")) 8 | 9 | from model.vertical_fl.Top1SimModel import Top1SimModel 10 | from preprocess.hdb import load_both 11 | 12 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 13 | root = "data/hdb/" 14 | hdb_dataset = root + "hdb_clean.csv" 15 | school_dataset = root + "school_clean.csv" 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 19 | parser.add_argument('-g', '--gpu', type=int, default=0) 20 | args = parser.parse_args() 21 | 22 | num_common_features = 2 23 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 24 | name = "hdb_top1sim" 25 | 26 | model = Top1SimModel(num_common_features=num_common_features, 27 | task='regression', 28 | dataset_type='real', 29 | blocking_method='knn_priv_float', 30 | metrics=['r2_score', 'rmse'], 31 | n_classes=2, 32 | grid_min=-10.0, 33 | grid_max=10.0, 34 | grid_width=1.5, 35 | knn_k=20, 36 | kd_tree_radius=0.01, 37 | tree_leaf_size=1000, 38 | model_name=name + "_" + now_string, 39 | val_rate=0.1, 40 | test_rate=0.2, 41 | drop_key=True, 42 | device='cuda:{}'.format(args.gpu), 43 | hidden_sizes=[200, 100], 44 | train_batch_size=1024 * 4, 45 | test_batch_size=1024 * 4, 46 | num_epochs=200, 47 | learning_rate=1e-2, 48 | weight_decay=1e-5, 49 | # IMPORTANT: Set num_workers to 0 to prevent deadlock on RTX3090 for unknown reason. 
50 | num_workers=0 if sys.gettrace() is None else 0, 51 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 52 | writer_path="runs/{}_{}".format(name, now_string), 53 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 54 | # SplitNN parameters 55 | local_hidden_sizes=[[200], [200]], 56 | agg_hidden_sizes=[100], 57 | cut_dims=[100, 100], 58 | 59 | # private link parameters 60 | link_epsilon=5e-3, 61 | link_delta=5e-3, 62 | link_threshold_t=1e-2, 63 | sim_leak_p=args.leak_p, 64 | link_n_jobs=-1, 65 | ) 66 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim_p_base.pkl", scale=True) 67 | 68 | -------------------------------------------------------------------------------- /src/train_beijing_A.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 7 | from sklearn.linear_model import LinearRegression 8 | import sklearn.metrics as metrics 9 | import numpy as np 10 | 11 | from model.vertical_fl.OnePartyModel import OnePartyModel 12 | from preprocess.beijing import load_house 13 | 14 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 15 | 16 | os.chdir(sys.path[0] + "/../") # change working directory 17 | 18 | root = "data/beijing/" 19 | dataset = "house_clean.csv" 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 23 | parser.add_argument('-g', '--gpu', type=int, default=3) 24 | args = parser.parse_args() 25 | 26 | X, y = load_house(root + dataset) 27 | print("X got {} dimensions".format(X.shape[1])) 28 | name = "beijing_house" 29 | # reg = LinearRegression().fit(X, y) 30 | # score = np.sqrt(metrics.mean_squared_error(reg.predict(X), y)) 31 | # print(score) 32 | 33 | model = OnePartyModel(model_name=name + "_" + now_string, 34 | task='regression', 35 | metrics=['r2_score', 'rmse'], 36 | n_classes=2, 37 | val_rate=0.1, 38 | test_rate=0.2, 39 | device='cuda:{}'.format(args.gpu), 40 | hidden_sizes=[400, 200], 41 | train_batch_size=4096, 42 | test_batch_size=4096, 43 | num_epochs=100, 44 | learning_rate=3e-3, 45 | weight_decay=1e-5, 46 | num_workers=4 if sys.gettrace() is None else 0, 47 | use_scheduler=False, 48 | sche_factor=0.1, 49 | sche_patience=10, 50 | sche_threshold=0.0001, 51 | writer_path="runs/{}_{}".format(name, now_string), 52 | model_save_path="ckp/{}_{}.pth".format(name, now_string) 53 | ) 54 | model.train_single(X, y, scale=True) 55 | -------------------------------------------------------------------------------- /src/train_beijing_B.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import pickle 6 | 7 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 8 | from sklearn.linear_model import LinearRegression 9 | import sklearn.metrics as metrics 10 | import numpy as np 11 | 12 | from model.vertical_fl.OnePartyModel import OnePartyModel 13 | from preprocess.beijing import load_house, load_both 14 | 15 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 16 | 17 | os.chdir(sys.path[0] + "/../") # change working directory 18 | 19 | root = "data/beijing/" 20 | house_dataset = root + "house_clean.csv" 21 | airbnb_dataset = root + "airbnb_clean.csv" 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('-p', 
'--leak-p', type=float, default=1.0) 25 | parser.add_argument('-g', '--gpu', type=int, default=3) 26 | args = parser.parse_args() 27 | 28 | num_common_features = 2 29 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 30 | data_cache_path = "cache/beijing_sim.pkl" 31 | print("Loading data from cache") 32 | with open(data_cache_path, 'rb') as f: 33 | train_dataset, val_dataset, test_dataset, y_scaler, sim_scaler = pickle.load(f) 34 | print("Done") 35 | train_X, train_y, train_idx = train_dataset.top1_dataset 36 | val_X, val_y, val_idx = val_dataset.top1_dataset 37 | test_X, test_y, test_idx = test_dataset.top1_dataset 38 | train_X = train_X[:, X1.shape[1] - num_common_features:] 39 | val_X = val_X[:, X1.shape[1] - num_common_features:] 40 | test_X = test_X[:, X1.shape[1] - num_common_features:] 41 | 42 | print("X got {} dimensions".format(X2.shape[1])) 43 | name = "beijing_B" 44 | # reg = LinearRegression().fit(X, y) 45 | # score = np.sqrt(metrics.mean_squared_error(reg.predict(X), y)) 46 | # print(score) 47 | 48 | model = OnePartyModel(model_name=name + "_" + now_string, 49 | task='regression', 50 | metrics=['r2_score', 'rmse'], 51 | n_classes=2, 52 | val_rate=0.1, 53 | test_rate=0.2, 54 | device='cuda:{}'.format(args.gpu), 55 | hidden_sizes=[400, 200], 56 | train_batch_size=4096, 57 | test_batch_size=4096, 58 | num_epochs=100, 59 | learning_rate=3e-3, 60 | weight_decay=1e-5, 61 | num_workers=4 if sys.gettrace() is None else 0, 62 | use_scheduler=False, 63 | sche_factor=0.1, 64 | sche_patience=10, 65 | sche_threshold=0.0001, 66 | writer_path="runs/{}_{}".format(name, now_string), 67 | model_save_path="ckp/{}_{}.pth".format(name, now_string) 68 | ) 69 | model._train(train_X, val_X, test_X, train_y, val_y, test_y, y_scaler=y_scaler) 70 | 71 | -------------------------------------------------------------------------------- /src/train_beijing_avgsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.MergeSimModel import MergeSimModel 7 | from preprocess.beijing import load_both 8 | 9 | 10 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 11 | os.chdir(sys.path[0] + "/../") # change working directory 12 | root = "data/beijing/" 13 | house_dataset = root + "house_clean.csv" 14 | airbnb_dataset = root + "airbnb_clean.csv" 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 18 | parser.add_argument('-g', '--gpu', type=int, default=0) 19 | parser.add_argument('-k', '--top-k', type=int, default=None) 20 | args = parser.parse_args() 21 | 22 | num_common_features = 2 23 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 24 | name = "beijing_avgsim_p_{:.0E}".format(args.leak_p) 25 | 26 | 27 | model = MergeSimModel(num_common_features=num_common_features, 28 | sim_hidden_sizes=[10, 10], 29 | merge_mode='avg', 30 | feature_wise_sim=False, 31 | task='regression', 32 | metrics=['r2_score', 'rmse'], 33 | dataset_type='real', 34 | blocking_method='knn', 35 | n_classes=2, 36 | grid_min=-10.0, 37 | grid_max=10.0, 38 | grid_width=1.5, 39 | knn_k=100, 40 | filter_top_k=args.top_k, 41 | kd_tree_radius=1e-2, 42 | tree_leaf_size=1000, 43 | model_name=name + "_" + now_string, 44 | val_rate=0.1, 45 | test_rate=0.2, 46 | drop_key=True, 47 | device='cuda:{}'.format(args.gpu), 48 | hidden_sizes=[200, 
100], 49 | train_batch_size=128, 50 | test_batch_size=1024 * 4, 51 | num_epochs=100, 52 | learning_rate=3e-3, 53 | weight_decay=1e-5, 54 | sim_learning_rate=3e-3, 55 | sim_weight_decay=1e-5, 56 | sim_batch_size=4096, 57 | update_sim_freq=1, 58 | num_workers=4 if sys.gettrace() is None else 0, 59 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 60 | writer_path="runs/{}_{}".format(name, now_string), 61 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 62 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 63 | log_dir="log/{}_{}/".format(name, now_string), 64 | # SplitNN parameters 65 | local_hidden_sizes=[[200], [200]], 66 | agg_hidden_sizes=[100], 67 | cut_dims=[100, 100], 68 | 69 | # private link parameters 70 | link_epsilon=3e-2, 71 | link_delta=3e-2, 72 | link_threshold_t=1e-2, 73 | sim_leak_p=args.leak_p, 74 | link_n_jobs=-1, 75 | ) 76 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim.pkl".format(name), scale=True) 77 | 78 | -------------------------------------------------------------------------------- /src/train_beijing_featuresim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.FeatureSimModel import FeatureSimModel 7 | from preprocess.beijing import load_both 8 | 9 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 10 | os.chdir(sys.path[0] + "/../") # change working directory 11 | root = "data/beijing/" 12 | house_dataset = root + "house_clean.csv" 13 | airbnb_dataset = root + "airbnb_clean.csv" 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 17 | parser.add_argument('-g', '--gpu', type=int, default=0) 18 | parser.add_argument('-k', '--top-k', type=int, default=None) 19 | args = parser.parse_args() 20 | 21 | num_common_features = 2 22 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 23 | name = "beijing_featuresim_p_{:.0E}".format(args.leak_p) 24 | 25 | model = FeatureSimModel(num_common_features=num_common_features, 26 | feature_wise_sim=False, 27 | task='regression', 28 | metrics=['r2_score', 'rmse'], 29 | dataset_type='real', 30 | blocking_method='knn', 31 | n_classes=2, 32 | grid_min=-10.0, 33 | grid_max=10.0, 34 | grid_width=1.5, 35 | knn_k=100, 36 | filter_top_k=args.top_k, 37 | kd_tree_radius=1e-2, 38 | tree_leaf_size=1000, 39 | model_name=name + "_" + now_string, 40 | val_rate=0.1, 41 | test_rate=0.2, 42 | drop_key=True, 43 | device='cuda:{}'.format(args.gpu), 44 | hidden_sizes=[200, 100], 45 | train_batch_size=128, 46 | test_batch_size=1024 * 4, 47 | num_epochs=100, 48 | learning_rate=3e-3, 49 | weight_decay=1e-5, 50 | num_workers=4 if sys.gettrace() is None else 0, 51 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 52 | writer_path="runs/{}_{}".format(name, now_string), 53 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 54 | # SplitNN parameters 55 | local_hidden_sizes=[[200], [200]], 56 | agg_hidden_sizes=[100], 57 | cut_dims=[100, 100], 58 | 59 | # private link parameters 60 | link_epsilon=3e-2, 61 | link_delta=3e-2, 62 | link_threshold_t=1e-2, 63 | sim_leak_p=args.leak_p, 64 | link_n_jobs=-1, 65 | ) 66 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim.pkl".format(name), scale=True) 67 | # model.train_splitnn(X1, X2, y, scale=True) 68 | 
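All of the similarity-based Beijing scripts (avgsim and featuresim above, fedsim and top1sim below) pass data_cache_path="cache/beijing_sim.pkl" to train_splitnn, so the kNN blocking result is computed once and reused across runs, and train_beijing_B.py reads the same pickle back for the party-B baseline. The sketch below is not a file in the repository; it only restates that read path, assuming a previous run has already written the cache and that it is executed from src/ (so the relative path resolves and the dataset classes under src/ are importable for unpickling).

import pickle

# Layout of the shared blocking cache, as loaded in train_beijing_B.py:
# three dataset splits plus the label scaler and the similarity scaler.
with open("cache/beijing_sim.pkl", "rb") as f:
    train_dataset, val_dataset, test_dataset, y_scaler, sim_scaler = pickle.load(f)

# Each split also exposes a top-1-matched view: features, labels, and record indices.
train_X, train_y, train_idx = train_dataset.top1_dataset
print("cached training split:", train_X.shape, train_y.shape)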
-------------------------------------------------------------------------------- /src/train_beijing_fedsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.FedSimModel import FedSimModel 7 | from preprocess.beijing import load_both 8 | 9 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 10 | os.chdir(sys.path[0] + "/../") # change working directory 11 | root = "data/beijing/" 12 | house_dataset = root + "house_clean.csv" 13 | airbnb_dataset = root + "airbnb_clean.csv" 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-p', '--leak-p', type=float, default=1) 17 | parser.add_argument('-g', '--gpu', type=int, default=0) 18 | parser.add_argument('-k', '--top-k', type=int, default=None) 19 | parser.add_argument('--mlp-merge', action='store_true') 20 | parser.add_argument('-ds', '--disable-sort', action='store_true') 21 | parser.add_argument('-dw', '--disable-weight', action='store_true') 22 | args = parser.parse_args() 23 | 24 | num_common_features = 2 25 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 26 | name = "beijing_fedsim_p_{:.0E}".format(args.leak_p) 27 | 28 | model = FedSimModel(num_common_features=num_common_features, 29 | raw_output_dim=10, 30 | feature_wise_sim=False, 31 | task='regression', 32 | metrics=['r2_score', 'rmse'], 33 | dataset_type='real', 34 | blocking_method='knn', 35 | n_classes=2, 36 | grid_min=(115.5, 39), 37 | grid_max=(116.5, 40), 38 | grid_width=(0.1, 0.1), 39 | knn_k=100, 40 | filter_top_k=args.top_k, 41 | kd_tree_radius=1e-2, 42 | tree_leaf_size=100, 43 | model_name=name + "_" + now_string, 44 | val_rate=0.1, 45 | test_rate=0.2, 46 | drop_key=True, 47 | device='cuda:{}'.format(args.gpu), 48 | hidden_sizes=[200, 100], 49 | train_batch_size=128, 50 | test_batch_size=1024 * 4, 51 | num_epochs=100, 52 | learning_rate=1e-3, 53 | weight_decay=1e-5, 54 | update_sim_freq=1, 55 | num_workers=4 if sys.gettrace() is None else 0, 56 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 57 | writer_path="runs/{}_{}".format(name, now_string), 58 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 59 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 60 | log_dir="log/{}_{}/".format(name, now_string), 61 | # SplitNN parameters 62 | local_hidden_sizes=[[200], [200]], 63 | agg_hidden_sizes=[100], 64 | cut_dims=[100, 100], 65 | 66 | # fedsim parameters 67 | use_conv=True, 68 | merge_hidden_sizes=[400], 69 | sim_hidden_sizes=[10], 70 | merge_model_save_path="ckp/{}_{}_merge.pth".format(name, now_string), 71 | merge_dropout_p=0.3, 72 | conv_n_channels=8, 73 | conv_kernel_v_size=7, 74 | mlp_merge=[1600, 1000, 400] if args.mlp_merge else None, 75 | disable_sort=args.disable_sort, 76 | disable_weight=args.disable_weight, 77 | 78 | # private link parameters 79 | link_epsilon=3e-2, 80 | link_delta=3e-2, 81 | link_threshold_t=1e-2, 82 | sim_leak_p=args.leak_p, 83 | link_n_jobs=-1, 84 | ) 85 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim.pkl", scale=True) 86 | # model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim.pkl", scale=True, torch_seed=0, 87 | # splitnn_model_path="ckp/beijing_fedsim_p_1E+00_2022-01-22-16-05-04.pth", 88 | # sim_model_path="ckp/beijing_fedsim_p_1E+00_2022-01-22-16-05-04_sim.pth", 89 | # 
merge_model_path="ckp/beijing_fedsim_p_1E+00_2022-01-22-16-05-04_merge.pth", evaluate_only=True) 90 | # model.train_splitnn(X1, X2, y, scale=True) 91 | -------------------------------------------------------------------------------- /src/train_beijing_fedsim_multi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import numpy as np 6 | 7 | from model.vertical_fl.FedSimModel import FedSimModel 8 | from preprocess.beijing import load_both 9 | from utils.utils import equal_split 10 | 11 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 12 | os.chdir(sys.path[0] + "/../") # change working directory 13 | root = "data/beijing/" 14 | house_dataset = root + "house_clean.csv" 15 | airbnb_dataset = root + "airbnb_clean.csv" 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('-p', '--leak-p', type=float, default=1) 19 | parser.add_argument('-g', '--gpu', type=int, default=0) 20 | parser.add_argument('-k', '--top-k', type=int, default=None) 21 | parser.add_argument('--mlp-merge', action='store_true') 22 | parser.add_argument('-ds', '--disable-sort', action='store_true') 23 | parser.add_argument('-dw', '--disable-weight', action='store_true') 24 | parser.add_argument('-n', '--n-parties', type=int, default=2) 25 | parser.add_argument('-vp', '--valid-parties', nargs='+', type=int, help='list of valid secondary parties (starts at 1)', default=None) 26 | args = parser.parse_args() 27 | 28 | 29 | num_common_features = 2 30 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 31 | name = "beijing_fedsim_p_{:.0E}".format(args.leak_p) 32 | 33 | if args.valid_parties is None: 34 | valid_parties = list(range(1, args.n_parties)) 35 | else: 36 | assert max(args.valid_parties) <= args.n_parties - 1 37 | valid_parties = args.valid_parties 38 | 39 | X2_splits = np.array_split(X2[:, num_common_features:], args.n_parties - 1, axis=1) 40 | valid_X2_splits = [X2_splits[i-1] for i in valid_parties] 41 | X2_mask = [] 42 | for i, split in enumerate(X2_splits): 43 | X2_mask += [i + 1 in valid_parties for _ in range(split.shape[1])] 44 | n_valid_parties = len(valid_X2_splits) + 1 45 | input_dims = [X1.shape[1] - num_common_features] + [X.shape[1] for X in valid_X2_splits] 46 | valid_X2 = np.concatenate([X2[:, :num_common_features]] + valid_X2_splits, axis=1) 47 | assert np.count_nonzero(X2_mask) + num_common_features == valid_X2.shape[1] 48 | data_mask = [True for _ in range(X1.shape[1] - num_common_features)] + X2_mask 49 | 50 | model = FedSimModel(num_common_features=num_common_features, 51 | raw_output_dim=10, 52 | feature_wise_sim=False, 53 | task='regression', 54 | metrics=['r2_score', 'rmse'], 55 | dataset_type='real', 56 | blocking_method='knn', 57 | n_classes=2, 58 | grid_min=(115.5, 39), 59 | grid_max=(116.5, 40), 60 | grid_width=(0.1, 0.1), 61 | knn_k=100, 62 | filter_top_k=args.top_k, 63 | kd_tree_radius=1e-2, 64 | tree_leaf_size=100, 65 | model_name=name + "_" + now_string, 66 | val_rate=0.1, 67 | test_rate=0.2, 68 | drop_key=True, 69 | device='cuda:{}'.format(args.gpu), 70 | hidden_sizes=[200, 100], 71 | train_batch_size=128, 72 | test_batch_size=1024 * 4, 73 | num_epochs=100, 74 | learning_rate=1e-3, 75 | weight_decay=1e-5, 76 | update_sim_freq=1, 77 | num_workers=4 if sys.gettrace() is None else 0, 78 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 79 | 
writer_path="runs/{}_{}".format(name, now_string), 80 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 81 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 82 | log_dir="log/{}_{}/".format(name, now_string), 83 | # SplitNN parameters 84 | local_hidden_sizes=[[200] for _ in range(n_valid_parties)], 85 | agg_hidden_sizes=[100], 86 | cut_dims=[100 for _ in range(n_valid_parties)], 87 | 88 | # fedsim parameters 89 | use_conv=True, 90 | merge_hidden_sizes=[400], 91 | sim_hidden_sizes=[10], 92 | merge_model_save_path="ckp/{}_{}_merge.pth".format(name, now_string), 93 | merge_dropout_p=0.3, 94 | conv_n_channels=8, 95 | conv_kernel_v_size=7, 96 | mlp_merge=[1600, 1000, 400] if args.mlp_merge else None, 97 | disable_sort=args.disable_sort, 98 | disable_weight=args.disable_weight, 99 | 100 | # private link parameters 101 | link_epsilon=3e-2, 102 | link_delta=3e-2, 103 | link_threshold_t=1e-2, 104 | sim_leak_p=args.leak_p, 105 | link_n_jobs=-1, 106 | ) 107 | model.train_splitnn(X1, valid_X2, y, data_cache_path="cache/beijing_sim.pkl", scale=True, n_parties=n_valid_parties, 108 | force_input_dims=input_dims, data_mask=data_mask) 109 | 110 | 111 | # model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim.pkl", scale=True, torch_seed=0, 112 | # splitnn_model_path="ckp/beijing_fedsim_p_1E+00_2022-01-22-16-05-04.pth", 113 | # sim_model_path="ckp/beijing_fedsim_p_1E+00_2022-01-22-16-05-04_sim.pth", 114 | # merge_model_path="ckp/beijing_fedsim_p_1E+00_2022-01-22-16-05-04_merge.pth", evaluate_only=True) 115 | # model.train_splitnn(X1, X2, y, scale=True) 116 | -------------------------------------------------------------------------------- /src/train_beijing_top1sim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.Top1SimModel import Top1SimModel 7 | from preprocess.beijing import load_both 8 | 9 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 10 | os.chdir(sys.path[0] + "/../") # change working directory 11 | root = "data/beijing/" 12 | house_dataset = root + "house_clean.csv" 13 | airbnb_dataset = root + "airbnb_clean.csv" 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 17 | parser.add_argument('-g', '--gpu', type=int, default=0) 18 | parser.add_argument('-k', '--top-k', type=int, default=None) 19 | args = parser.parse_args() 20 | 21 | num_common_features = 2 22 | [X1, X2], y = load_both(house_path=house_dataset, airbnb_path=airbnb_dataset, active_party='house') 23 | name = "beijing_top1sim_p_{:.0E}".format(args.leak_p) 24 | 25 | model = Top1SimModel(num_common_features=num_common_features, 26 | task='regression', 27 | dataset_type='real', 28 | blocking_method='knn', 29 | metrics=['r2_score', 'rmse'], 30 | n_classes=2, 31 | grid_min=-10.0, 32 | grid_max=10.0, 33 | grid_width=1.5, 34 | knn_k=100, 35 | filter_top_k=args.top_k, 36 | kd_tree_radius=0.01, 37 | tree_leaf_size=1000, 38 | model_name=name + "_" + now_string, 39 | val_rate=0.1, 40 | test_rate=0.2, 41 | drop_key=True, 42 | device='cuda:{}'.format(args.gpu), 43 | hidden_sizes=[200, 100], 44 | train_batch_size=1024 * 4, 45 | test_batch_size=1024 * 4, 46 | num_epochs=50, 47 | learning_rate=3e-3, 48 | weight_decay=1e-5, 49 | num_workers=4 if sys.gettrace() is None else 0, 50 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 51 | 
writer_path="runs/{}_{}".format(name, now_string), 52 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 53 | # SplitNN parameters 54 | local_hidden_sizes=[[200], [200]], 55 | agg_hidden_sizes=[100], 56 | cut_dims=[100, 100], 57 | 58 | # private link parameters 59 | link_epsilon=3e-2, 60 | link_delta=3e-2, 61 | link_threshold_t=1e-2, 62 | sim_leak_p=args.leak_p, 63 | link_n_jobs=-1, 64 | ) 65 | model.train_splitnn(X1, X2, y, data_cache_path="cache/beijing_sim.pkl".format(name), scale=True) 66 | # model.train_splitnn(X1, X2, y, scale=True) 67 | -------------------------------------------------------------------------------- /src/train_hdb_A.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 7 | from sklearn.linear_model import LinearRegression 8 | import sklearn.metrics as metrics 9 | import numpy as np 10 | 11 | from model.vertical_fl.OnePartyModel import OnePartyModel 12 | from preprocess.hdb import load_hdb 13 | 14 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 15 | 16 | os.chdir(sys.path[0] + "/../") # change working directory 17 | 18 | root = "data/hdb/" 19 | dataset = "hdb_clean.csv" 20 | 21 | X, y = load_hdb(root + dataset) 22 | print("X got {} dimensions".format(X.shape[1])) 23 | name = "hdb_A" 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('-g', '--gpu', type=int, default=0) 27 | args = parser.parse_args() 28 | 29 | model = OnePartyModel(model_name=name + "_" + now_string, 30 | task='regression', 31 | metrics=['r2_score', 'rmse'], 32 | n_classes=2, 33 | val_rate=0.1, 34 | test_rate=0.2, 35 | device='cuda:{}'.format(args.gpu), 36 | hidden_sizes=[400, 200], 37 | train_batch_size=4096, 38 | test_batch_size=4096, 39 | num_epochs=200, 40 | learning_rate=1e-2, 41 | weight_decay=1e-5, 42 | num_workers=4 if sys.gettrace() is None else 0, 43 | use_scheduler=False, 44 | sche_factor=0.1, 45 | sche_patience=10, 46 | sche_threshold=0.0001, 47 | writer_path="runs/{}_{}".format(name, now_string), 48 | model_save_path="ckp/{}_{}.pth".format(name, now_string) 49 | ) 50 | model.train_single(X, y, scale=True) 51 | -------------------------------------------------------------------------------- /src/train_hdb_B.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import pickle 6 | 7 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 8 | from sklearn.linear_model import LinearRegression 9 | import sklearn.metrics as metrics 10 | import numpy as np 11 | 12 | from model.vertical_fl.OnePartyModel import OnePartyModel 13 | from preprocess.hdb import load_hdb, load_both 14 | 15 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 16 | 17 | os.chdir(sys.path[0] + "/../") # change working directory 18 | 19 | num_common_features = 2 20 | root = "data/hdb/" 21 | hdb_dataset = root + "hdb_clean.csv" 22 | school_dataset = root + "school_clean.csv" 23 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 24 | name = "hdb_B" 25 | data_cache_path = "cache/hdb_sim.pkl" 26 | print("Loading data from cache") 27 | with open(data_cache_path, 'rb') as f: 28 | train_dataset, val_dataset, test_dataset, y_scaler, sim_scaler = pickle.load(f) 29 | print("Done") 30 | train_X, train_y, 
train_idx = train_dataset.top1_dataset 31 | val_X, val_y, val_idx = val_dataset.top1_dataset 32 | test_X, test_y, test_idx = test_dataset.top1_dataset 33 | train_X = train_X[:, X1.shape[1] - num_common_features:] 34 | val_X = val_X[:, X1.shape[1] - num_common_features:] 35 | test_X = test_X[:, X1.shape[1] - num_common_features:] 36 | 37 | print("X got {} dimensions".format(train_X.shape[1])) 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('-g', '--gpu', type=int, default=0) 41 | args = parser.parse_args() 42 | 43 | model = OnePartyModel(model_name=name + "_" + now_string, 44 | task='regression', 45 | metrics=['r2_score', 'rmse'], 46 | n_classes=2, 47 | val_rate=0.1, 48 | test_rate=0.2, 49 | device='cuda:{}'.format(args.gpu), 50 | hidden_sizes=[400, 200], 51 | train_batch_size=4096, 52 | test_batch_size=4096, 53 | num_epochs=200, 54 | learning_rate=1e-2, 55 | weight_decay=1e-5, 56 | num_workers=4 if sys.gettrace() is None else 0, 57 | use_scheduler=False, 58 | sche_factor=0.1, 59 | sche_patience=10, 60 | sche_threshold=0.0001, 61 | writer_path="runs/{}_{}".format(name, now_string), 62 | model_save_path="ckp/{}_{}.pth".format(name, now_string) 63 | ) 64 | model._train(train_X, val_X, test_X, train_y, val_y, test_y, y_scaler=y_scaler) 65 | -------------------------------------------------------------------------------- /src/train_hdb_avgsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.MergeSimModel import MergeSimModel 7 | from preprocess.hdb import load_both 8 | 9 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 10 | os.chdir(sys.path[0] + "/../") # change working directory 11 | root = "data/hdb/" 12 | hdb_dataset = root + "hdb_clean.csv" 13 | school_dataset = root + "school_clean.csv" 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 17 | parser.add_argument('-g', '--gpu', type=int, default=0) 18 | parser.add_argument('-k', '--top-k', type=int, default=None) 19 | args = parser.parse_args() 20 | 21 | num_common_features = 2 22 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 23 | name = "hdb_avgsim" 24 | 25 | model = MergeSimModel(num_common_features=num_common_features, 26 | sim_hidden_sizes=[10, 10], 27 | merge_mode='avg', 28 | feature_wise_sim=False, 29 | task='regression', 30 | metrics=['r2_score', 'rmse'], 31 | dataset_type='real', 32 | blocking_method='knn', 33 | n_classes=2, 34 | grid_min=-10.0, 35 | grid_max=10.0, 36 | grid_width=1.5, 37 | knn_k=50, 38 | filter_top_k=args.top_k, 39 | kd_tree_radius=1e-2, 40 | tree_leaf_size=1000, 41 | model_name=name + "_" + now_string, 42 | val_rate=0.1, 43 | test_rate=0.2, 44 | drop_key=True, 45 | device='cuda:{}'.format(args.gpu), 46 | hidden_sizes=[200, 100], 47 | train_batch_size=128, 48 | test_batch_size=1024 * 4, 49 | num_epochs=100, 50 | learning_rate=1e-3, 51 | weight_decay=1e-5, 52 | sim_learning_rate=1e-3, 53 | sim_weight_decay=1e-5, 54 | sim_batch_size=4096, 55 | update_sim_freq=1, 56 | num_workers=4 if sys.gettrace() is None else 0, 57 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 58 | writer_path="runs/{}_{}".format(name, now_string), 59 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 60 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 61 | log_dir="log/{}_{}/".format(name, 
now_string), 62 | # SplitNN parameters 63 | local_hidden_sizes=[[200], [200]], 64 | agg_hidden_sizes=[400], 65 | cut_dims=[100, 100], 66 | 67 | # private link parameters 68 | link_epsilon=5e-3, 69 | link_delta=5e-3, 70 | link_threshold_t=1e-2, 71 | sim_leak_p=args.leak_p, 72 | link_n_jobs=-1, 73 | ) 74 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim.pkl".format(name), scale=True) 75 | -------------------------------------------------------------------------------- /src/train_hdb_featuresim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import numpy as np 6 | 7 | from model.vertical_fl.FeatureSimModel import FeatureSimModel 8 | from preprocess.hdb import load_both 9 | 10 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 11 | os.chdir(sys.path[0] + "/../") # change working directory 12 | root = "data/hdb/" 13 | hdb_dataset = root + "hdb_clean.csv" 14 | school_dataset = root + "school_clean.csv" 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 18 | parser.add_argument('-g', '--gpu', type=int, default=0) 19 | parser.add_argument('-k', '--top-k', type=int, default=None) 20 | args = parser.parse_args() 21 | 22 | num_common_features = 2 23 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 24 | name = "hdb_featuresim" 25 | 26 | model = FeatureSimModel(num_common_features=num_common_features, 27 | feature_wise_sim=False, 28 | task='regression', 29 | metrics=['r2_score', 'rmse'], 30 | dataset_type='real', 31 | blocking_method='knn', 32 | n_classes=2, 33 | grid_min=-10.0, 34 | grid_max=10.0, 35 | grid_width=1.5, 36 | knn_k=50, 37 | filter_top_k=args.top_k, 38 | kd_tree_radius=1e-2, 39 | tree_leaf_size=1000, 40 | model_name=name + "_" + now_string, 41 | val_rate=0.1, 42 | test_rate=0.2, 43 | drop_key=True, 44 | device='cuda:{}'.format(args.gpu), 45 | hidden_sizes=[200, 100], 46 | train_batch_size=128, 47 | test_batch_size=1024 * 4, 48 | num_epochs=100, 49 | learning_rate=1e-3, 50 | weight_decay=1e-5, 51 | num_workers=4 if sys.gettrace() is None else 0, 52 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 53 | writer_path="runs/{}_{}".format(name, now_string), 54 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 55 | 56 | # SplitNN parameters 57 | local_hidden_sizes=[[200], [200]], 58 | agg_hidden_sizes=[100], 59 | cut_dims=[100, 100], 60 | 61 | # private link parameters 62 | link_epsilon=5e-3, 63 | link_delta=5e-3, 64 | link_threshold_t=1e-2, 65 | sim_leak_p=args.leak_p, 66 | link_n_jobs=-1, 67 | ) 68 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim.pkl".format(name), scale=True) 69 | # model.train_splitnn(X1, X2, y, scale=True) 70 | -------------------------------------------------------------------------------- /src/train_hdb_fedsim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | import numpy as np 6 | 7 | from model.vertical_fl.FedSimModel import FedSimModel 8 | from preprocess.hdb import load_both 9 | 10 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 11 | os.chdir(sys.path[0] + "/../") # change working directory 12 | root = "data/hdb/" 13 | hdb_dataset = root + "hdb_clean.csv" 14 | school_dataset = root + "school_clean.csv" 15 | 16 | parser = 
argparse.ArgumentParser() 17 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 18 | parser.add_argument('-g', '--gpu', type=int, default=0) 19 | parser.add_argument('-k', '--top-k', type=int, default=None) 20 | parser.add_argument('--mlp-merge', action='store_true') 21 | parser.add_argument('-ds', '--disable-sort', action='store_true') 22 | parser.add_argument('-dw', '--disable-weight', action='store_true') 23 | args = parser.parse_args() 24 | 25 | num_common_features = 2 26 | [X1, X2], y = load_both(hdb_path=hdb_dataset, airbnb_path=school_dataset, active_party='hdb') 27 | name = "hdb_fedsim" 28 | 29 | model = FedSimModel(num_common_features=num_common_features, 30 | raw_output_dim=10, 31 | feature_wise_sim=False, 32 | task='regression', 33 | metrics=['r2_score', 'rmse'], 34 | dataset_type='real', 35 | blocking_method='knn', 36 | n_classes=2, 37 | grid_min=-10.0, 38 | grid_max=10.0, 39 | grid_width=1.5, 40 | knn_k=50, 41 | filter_top_k=args.top_k, 42 | kd_tree_radius=1e-2, 43 | tree_leaf_size=1000, 44 | model_name=name + "_" + now_string, 45 | val_rate=0.1, 46 | test_rate=0.2, 47 | drop_key=True, 48 | device='cuda:{}'.format(args.gpu), 49 | hidden_sizes=[200, 100], 50 | train_batch_size=128, 51 | test_batch_size=1024 * 4, 52 | num_epochs=100, 53 | learning_rate=1e-3, 54 | weight_decay=1e-4, 55 | update_sim_freq=1, 56 | num_workers=4 if sys.gettrace() is None else 0, 57 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 58 | writer_path="runs/{}_{}".format(name, now_string), 59 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 60 | sim_model_save_path="ckp/{}_{}_sim.pth".format(name, now_string), 61 | log_dir="log/{}_{}/".format(name, now_string), 62 | # SplitNN parameters 63 | local_hidden_sizes=[[200], [200]], 64 | agg_hidden_sizes=[100], 65 | cut_dims=[100, 100], 66 | 67 | # fedsim parameters 68 | use_conv=True, 69 | merge_hidden_sizes=[400], 70 | sim_hidden_sizes=[10], 71 | merge_model_save_path="ckp/{}_{}_merge.pth".format(name, now_string), 72 | merge_dropout_p=0.2, 73 | conv_n_channels=8, 74 | conv_kernel_v_size=7, 75 | mlp_merge=[1600, 1000, 400] if args.mlp_merge else None, 76 | disable_sort=args.disable_sort, 77 | disable_weight=args.disable_weight, 78 | 79 | # private link parameters 80 | link_epsilon=5e-3, 81 | link_delta=5e-3, 82 | link_threshold_t=1e-2, 83 | sim_leak_p=args.leak_p, 84 | link_n_jobs=-1, 85 | ) 86 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim.pkl", scale=True) 87 | # model.train_splitnn(X1, X2, y, scale=True) 88 | -------------------------------------------------------------------------------- /src/train_hdb_top1sim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import argparse 5 | 6 | from model.vertical_fl.Top1SimModel import Top1SimModel 7 | from preprocess.hdb import load_both 8 | 9 | now_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 10 | os.chdir(sys.path[0] + "/../") # change working directory 11 | root = "data/hdb/" 12 | hdb_dataset = root + "hdb_clean.csv" 13 | school_dataset = root + "school_clean.csv" 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-p', '--leak-p', type=float, default=1.0) 17 | parser.add_argument('-g', '--gpu', type=int, default=0) 18 | parser.add_argument('-k', '--top-k', type=int, default=None) 19 | args = parser.parse_args() 20 | 21 | num_common_features = 2 22 | [X1, X2], y = load_both(hdb_path=hdb_dataset, 
airbnb_path=school_dataset, active_party='hdb') 23 | name = "hdb_top1sim" 24 | 25 | model = Top1SimModel(num_common_features=num_common_features, 26 | task='regression', 27 | dataset_type='real', 28 | blocking_method='knn', 29 | metrics=['r2_score', 'rmse'], 30 | n_classes=2, 31 | grid_min=-10.0, 32 | grid_max=10.0, 33 | grid_width=1.5, 34 | knn_k=50, 35 | filter_top_k=args.top_k, 36 | kd_tree_radius=0.01, 37 | tree_leaf_size=1000, 38 | model_name=name + "_" + now_string, 39 | val_rate=0.1, 40 | test_rate=0.2, 41 | drop_key=True, 42 | device='cuda:{}'.format(args.gpu), 43 | hidden_sizes=[200, 100], 44 | train_batch_size=1024 * 4, 45 | test_batch_size=1024 * 4, 46 | num_epochs=200, 47 | learning_rate=1e-2, 48 | weight_decay=1e-5, 49 | # IMPORTANT: Set num_workers to 0 to prevent deadlock on RTX3090 for unknown reason. 50 | num_workers=0 if sys.gettrace() is None else 0, 51 | use_scheduler=False, sche_factor=0.1, sche_patience=10, sche_threshold=0.0001, 52 | writer_path="runs/{}_{}".format(name, now_string), 53 | model_save_path="ckp/{}_{}.pth".format(name, now_string), 54 | # SplitNN parameters 55 | local_hidden_sizes=[[200], [200]], 56 | agg_hidden_sizes=[100], 57 | cut_dims=[100, 100], 58 | 59 | # private link parameters 60 | link_epsilon=5e-3, 61 | link_delta=5e-3, 62 | link_threshold_t=1e-2, 63 | sim_leak_p=args.leak_p, 64 | link_n_jobs=-1, 65 | ) 66 | model.train_splitnn(X1, X2, y, data_cache_path="cache/hdb_sim.pkl", scale=True) 67 | 68 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import \ 2 | get_split_points, \ 3 | move_item_to_end_, \ 4 | move_item_to_start_, \ 5 | scaled_edit_distance, \ 6 | custom_index_cpu_to_gpu_multiple, \ 7 | DroppingPriorityQueue 8 | -------------------------------------------------------------------------------- /src/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/utils/__pycache__/privacy.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/utils/__pycache__/privacy.cpython-38.pyc -------------------------------------------------------------------------------- /src/utils/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xtra-Computing/FedSim/c2364e3ec44383afd754f2cdda86afc68cedc0e4/src/utils/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /src/utils/privacy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import erf 3 | from scipy.optimize import bisect 4 | from phe import paillier 5 | 6 | 7 | class SimNoiseScale: 8 | def __init__(self, bf_std, noise_scale=None, sim_leak_p=None): 9 | self.bf_std = bf_std 10 | 11 | if np.isclose(sim_leak_p, 0.0): 12 | assert False, "The required noise is np.inf to reach 0 leaking probability" 13 | 14 | noise_to_p = lambda x: erf(np.sqrt(x ** 2 + 1) / (2 * np.sqrt(2) * x * bf_std)) 15 
| 16 | if noise_scale is None and sim_leak_p is not None: 17 | # plus 1e-12 to ensure the noise is sufficient for probability p 18 | noise_scale = bisect(lambda x: noise_to_p(x) - sim_leak_p, 1e-8, 1e8, xtol=1e-12) + 1e-12 19 | elif sim_leak_p is None and noise_scale is not None: 20 | sim_leak_p = noise_to_p(noise_scale) 21 | else: 22 | assert False, "noise_scale={}, sim_leak_p={}".format(noise_scale, sim_leak_p) 23 | 24 | self.noise_scale = noise_scale 25 | self.sim_leak_p = sim_leak_p 26 | 27 | 28 | def l2_distance_with_he(a, encrypted_b, encrypted_b_square, 29 | private_key: paillier.PaillierPrivateKey): 30 | """ 31 | Calculate l2 distance with partial homomorphic encryption 32 | :param encrypted_b_square: 33 | :param encrypted_b: 34 | :param a: array 1 35 | :param private_key: private key to decrypt result 36 | :return: real distance 37 | """ 38 | encrypted_dists = a * a - 2 * a * encrypted_b + encrypted_b_square 39 | encrypted_dist = sum(encrypted_dists) 40 | dist = private_key.decrypt(encrypted_dist) 41 | return dist 42 | 43 | 44 | def jaccard_sim_with_he(a, b, public_key: paillier.PaillierPublicKey, 45 | private_key: paillier.PaillierPrivateKey): 46 | """ 47 | Calculate jaccard similarity with partial homomorphic encryption 48 | :param a: bit vector 1 49 | :param b: bit vector 2 (to be encrypted) 50 | :param public_key: public key to encrypt value 51 | :param private_key: private key to decrypt result 52 | :return: real jaccard similarity 53 | """ 54 | encrypted_b = [public_key.encrypt(b_i) for b_i in b] 55 | raise NotImplementedError 56 | -------------------------------------------------------------------------------- /src/utils/utils.py: -------------------------------------------------------------------------------- 1 | from nltk.metrics.distance import edit_distance 2 | import faiss 3 | from queue import Queue 4 | from sortedcontainers import SortedList 5 | 6 | 7 | def get_split_points(array, size): 8 | assert size > 1 9 | 10 | prev = array[0] 11 | split_points = [0] 12 | for i in range(1, size): 13 | if prev != array[i]: 14 | prev = array[i] 15 | split_points.append(i) 16 | 17 | split_points.append(size) 18 | return split_points 19 | 20 | 21 | def move_item_to_end_(arr, items): 22 | for item in items: 23 | arr.insert(len(arr), arr.pop(arr.index(item))) 24 | 25 | 26 | def move_item_to_start_(arr, items): 27 | for item in items[::-1]: 28 | arr.insert(0, arr.pop(arr.index(item))) 29 | 30 | 31 | def scaled_edit_distance(a: str, b: str): 32 | return edit_distance(a, b) / max(len(a), len(b)) 33 | 34 | 35 | def custom_index_cpu_to_gpu_multiple(resources, index, co=None, gpu_nos=None): 36 | vres = faiss.GpuResourcesVector() 37 | vdev = faiss.IntVector() 38 | if gpu_nos is None: 39 | gpu_nos = range(len(resources)) 40 | for i, res in zip(gpu_nos, resources): 41 | vdev.push_back(i) 42 | vres.push_back(res) 43 | index = faiss.index_cpu_to_gpu_multiple(vres, vdev, index, co) 44 | index.referenced_objects = resources 45 | return index 46 | 47 | 48 | class DroppingPriorityQueue: 49 | """ 50 | Priority queue with maximum size. 
Tail will be automatically dropped when reaching max size 51 |     """ 52 |     def __init__(self, maxsize=None, reverse=False): 53 |         self.reverse = reverse 54 |         self.maxsize = maxsize 55 |         self._queue = SortedList() 56 | 57 |     def put(self, item): 58 |         self._queue.add(item) 59 |         if self.maxsize is not None and len(self._queue) > self.maxsize: 60 |             if self.reverse: 61 |                 self._queue.pop(0) 62 |             else: 63 |                 self._queue.pop(-1) 64 | 65 |     def get(self): 66 |         if self.reverse: 67 |             return self._queue.pop(-1) 68 |         else: 69 |             return self._queue.pop(0) 70 | 71 |     def __len__(self): 72 |         return len(self._queue) 73 | 74 | 75 | def equal_split(n, k): 76 |     if n % k == 0: 77 |         return [n // k for _ in range(k)] 78 |     else: 79 |         return [n // k for _ in range(k - 1)] + [n // k + n % k] 80 | --------------------------------------------------------------------------------
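In src/utils/privacy.py, SimNoiseScale ties the scale of the noise added to similarities to the probability sim_leak_p that a similarity value leaks through noise_to_p(x) = erf(sqrt(x^2 + 1) / (2 * sqrt(2) * x * bf_std)), and it recovers the scale from a target probability by root finding with bisect. The stand-alone sketch below is not part of the repository; it reproduces that relation with illustrative values (bf_std = 1.0, p = 0.5) chosen so the bisection bracket contains a sign change, since the smallest reachable probability depends on bf_std.

import numpy as np
from scipy.optimize import bisect
from scipy.special import erf

bf_std = 1.0  # illustrative stand-in for SimNoiseScale's bf_std parameter
noise_to_p = lambda x: erf(np.sqrt(x ** 2 + 1) / (2 * np.sqrt(2) * x * bf_std))

# Forward: a given noise scale implies a leak probability (larger scale, smaller probability).
print("p at noise scale 1.0:", noise_to_p(1.0))

# Inverse: recover the noise scale for a target leak probability, as SimNoiseScale does.
p = 0.5
scale = bisect(lambda x: noise_to_p(x) - p, 1e-8, 1e8, xtol=1e-12) + 1e-12
print("scale for p = 0.5:", scale, "-> p:", noise_to_p(scale))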