├── LICENSE ├── README.md ├── data ├── .DS_Store ├── celeba │ └── data │ │ ├── test │ │ └── all_data_niid_1_keep_25_test_8.json │ │ └── train │ │ └── all_data_niid_1_keep_25_train_8.json ├── mnist │ ├── README.md │ └── generate_niid.py ├── nist │ ├── README.md │ ├── data │ │ └── my_sample.py │ ├── preprocess.sh │ ├── preprocess │ │ ├── data_to_json.py │ │ ├── data_to_json.sh │ │ ├── get_data.sh │ │ ├── get_file_dirs.py │ │ ├── get_hashes.py │ │ ├── group_by_writer.py │ │ └── match_hashes.py │ └── stats.sh ├── sent140 │ ├── README.md │ ├── preprocess.sh │ ├── preprocess │ │ ├── combine_data.py │ │ ├── data_to_json.py │ │ ├── data_to_json.sh │ │ └── get_data.sh │ └── stats.sh ├── shakespeare │ ├── .DS_Store │ ├── README.md │ ├── data │ │ ├── .DS_Store │ │ ├── test │ │ │ └── all_data_niid_2_keep_100_test_8.json │ │ └── train │ │ │ └── all_data_niid_2_keep_100_train_8.json │ ├── preprocess.sh │ ├── preprocess │ │ ├── data_to_json.sh │ │ ├── gen_all_data.py │ │ ├── get_data.sh │ │ ├── preprocess_shakespeare.py │ │ └── shake_utils.py │ └── stats.sh ├── synthetic_0.5_0.5 │ ├── README.md │ ├── data │ │ ├── test │ │ │ └── mytest.json │ │ └── train │ │ │ └── mytrain.json │ └── generate_synthetic.py ├── synthetic_0_0 │ ├── README.md │ ├── data │ │ ├── test │ │ │ └── mytest.json │ │ └── train │ │ │ └── mytrain.json │ └── generate_synthetic.py ├── synthetic_1_1 │ ├── README.md │ ├── data │ │ ├── test │ │ │ └── mytest.json │ │ └── train │ │ │ └── mytrain.json │ └── generate_synthetic.py └── synthetic_iid │ ├── README.md │ ├── data │ ├── test │ │ └── mytest.json │ └── train │ │ └── mytrain.json │ └── generate_iid.py ├── flearn ├── .DS_Store ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── client.cpython-36.pyc │ ├── celeba │ │ ├── __init__.py │ │ └── cnn.py │ ├── client.py │ ├── mnist │ │ ├── __init__.py │ │ └── mclr.py │ ├── nist │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ └── __init__.cpython-36.pyc │ │ ├── cnn.py │ │ └── mclr.py │ ├── sent140 │ │ ├── get_embs.py │ │ ├── get_embs.sh │ │ └── stacked_lstm.py │ ├── shakespeare │ │ └── stacked_lstm.py │ └── synthetic │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── mclr.cpython-36.pyc │ │ └── mclr.py ├── optimizer │ ├── __pycache__ │ │ └── pgd.cpython-36.pyc │ ├── pgd.py │ └── pggd.py ├── trainers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── fedavg.cpython-36.pyc │ │ ├── fedbase.cpython-36.pyc │ │ └── fedprox.cpython-36.pyc │ ├── fedavg.py │ ├── fedbase.py │ ├── feddane.py │ └── fedprox.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-38.pyc │ ├── model_utils.cpython-36.pyc │ ├── model_utils.cpython-38.pyc │ └── tf_utils.cpython-36.pyc │ ├── language_utils.py │ ├── model_utils.py │ ├── tf_utils.py │ └── utils.py ├── main.py ├── requirements.txt ├── run_fedavg.sh ├── run_fedprox.sh ├── run_scripts.sh ├── submod_scripts.sh ├── submod_scripts_sent140.sh ├── submod_scripts_shakespeare.sh └── utils ├── __init__.py ├── language_utils.py ├── model_utils.py ├── preprocess.sh ├── remove_users.py ├── sample.py ├── split_data.py ├── stats.py ├── tf_utils.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Melodi Lab @ The University of Washington 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in 
the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Diverse Client Selection for Federated Learning via Submodular Maximization 2 | 3 | ## Code for ICLR 2022 paper: 4 | 5 | Title: Diverse Client Selection for Federated Learning via Submodular Maximization [pdf] [presentation]\ 6 | Authors: Ravikumar Balakrishnan* (Intel Labs), Tian Li* (CMU), Tianyi Zhou* (UW), Nageen Himayat (Intel Labs), Virginia Smith (CMU), Jeff Bilmes (UW)\ 7 | Institutes: Intel Labs, Carnegie Mellon University, University of Washington 8 | 9 |
10 | @inproceedings{
11 | balakrishnan2022diverse,
12 | title={Diverse Client Selection for Federated Learning via Submodular Maximization},
13 | author={Ravikumar Balakrishnan and Tian Li and Tianyi Zhou and Nageen Himayat and Virginia Smith and Jeff Bilmes},
14 | booktitle={International Conference on Learning Representations},
15 | year={2022},
16 | url={https://openreview.net/forum?id=nwKXyFvaUm}
17 | }
18 | 19 | 20 | Abstract\ 21 | In every communication round of federated learning, a random subset of clients communicate their model updates back to the server which then aggregates them all. The optimal size of this subset is not known and several studies have shown that typically random selection does not perform very well in terms of convergence, learning efficiency and fairness. We, in this paper, propose to select a small diverse subset of clients, namely those carrying representative gradient information, and we transmit only these updates to the server. Our aim is for updating via only a subset to approximate updating via aggregating all client information. We achieve this by choosing a subset that maximizes a submodular facility location function defined over gradient space. We introduce “federated averaging with diverse client selection (DivFL)”. We provide a thorough analysis of its convergence in the heterogeneous setting and apply it both to synthetic and to real datasets. Empirical results show several benefits to our approach including improved learning efficiency, faster convergence and also more uniform (i.e., fair) performance across clients. We further show a communication-efficient version of DivFL that can still outperform baselines on the above metrics. 22 | 23 | ## Preparation 24 | 25 | ### Dataset generation 26 | 27 | We **already provide four synthetic datasets** that are used in the paper under corresponding folders. For all datasets, see the `README` files in separate `data/$dataset` folders for instructions on preprocessing and/or sampling data. 28 | 29 | The statistics of real federated datasets are summarized as follows. 30 | 31 |
32 | 33 | | Dataset | Devices | Samples | Samples/device mean (stdev) | 34 | | ------------- | ------------- | ----- | --- |
35 | | MNIST | 1,000 | 69,035 | 69 (106) | 36 | | FEMNIST | 200 | 18,345 | 92 (159) | 37 | | Shakespeare | 143 | 517,106 | 3,616 (6,808) | 38 | | Sent140 | 772 | 40,783 | 53 (32) | 39 | 40 |
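
For intuition only: the abstract above describes selecting clients by greedily maximizing a submodular facility-location function defined over gradient space, and then aggregating only the selected updates. The sketch below illustrates that selection step. It is not the implementation shipped in `flearn/trainers/`; the function name, the use of flattened client gradients, and the shifted negative-Euclidean similarity are assumptions made for this example.

```
import numpy as np

def select_diverse_clients(grads, k):
    """Illustrative sketch only (not the repo's trainer code).
    Greedily pick k clients maximizing the facility-location value
    F(S) = sum_i max_{j in S} sim(i, j) over pairwise gradient similarities."""
    n = len(grads)
    dists = np.linalg.norm(grads[:, None, :] - grads[None, :, :], axis=2)
    sim = dists.max() - dists              # larger value = more similar, non-negative
    selected = []
    best_cover = np.zeros(n)               # max similarity of each client to the set S
    for _ in range(k):
        # marginal gain of adding candidate j: F(S + {j}) - F(S)
        gains = np.maximum(sim, best_cover[:, None]).sum(axis=0) - best_cover.sum()
        gains[selected] = -np.inf          # never pick the same client twice
        j = int(np.argmax(gains))
        selected.append(j)
        best_cover = np.maximum(best_cover, sim[:, j])
    return selected

# Toy usage: 100 clients with flattened 500-dimensional gradients, select 10.
rng = np.random.default_rng(0)
client_grads = rng.normal(size=(100, 500))
print(select_diverse_clients(client_grads, k=10))
```

In DivFL, only the updates of the selected subset are then transmitted to and aggregated by the server, which is what yields the communication savings discussed in the abstract.
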
41 | 42 | ### Downloading dependencies 43 | 44 | ``` 45 | pip3 install -r requirements.txt 46 | ``` 47 | 48 | ## References 49 | See our [DivFL](https://openreview.net/pdf?id=nwKXyFvaUm) paper for more details as well as all references. 50 | 51 | ## Acknowledgements 52 | Our implementation is based on [FedProx](https://github.com/litian96/FedProx). 53 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/data/.DS_Store -------------------------------------------------------------------------------- /data/mnist/README.md: -------------------------------------------------------------------------------- 1 | # MNIST Dataset 2 | 3 | First download the raw data [here](https://drive.google.com/file/d/1Vp_gJHw4pPqwMUSgodhFOqUglAQyaOGD/view?usp=sharing), put `mnist-original.mat` under the folder `data/mldata/`. 4 | 5 | To generate non-iid data: 6 | 7 | ``` 8 | mkdir test 9 | mkdir train 10 | python generate_niid.py 11 | ``` 12 | 13 | Or you can download the dataset [here](https://drive.google.com/file/d/1cU_LcBAUZvfZWveOMhG4G5Fg9uFXhVdf/view?usp=sharing), unzip it and put the `train` and `test` folder under `data`. 14 | 15 | The layout of the folders under `./mnist` should be: 16 | 17 | ``` 18 | | data 19 | 20 | ----| mldata 21 | 22 | ---- ----| mnist-original.mat 23 | 24 | ----| train 25 | 26 | ---- ----| train_file_name.json 27 | 28 | ----| test 29 | 30 | ---- ----| test_file_name.json 31 | 32 | | generate_niid.py 33 | | README.md 34 | ``` 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /data/mnist/generate_niid.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_mldata 2 | from tqdm import trange 3 | import numpy as np 4 | import random 5 | import json 6 | import os 7 | 8 | # Setup directory for train/test data 9 | train_path = './data/train/all_data_0_niid_0_keep_10_train_9.json' 10 | test_path = './data/test/all_data_0_niid_0_keep_10_test_9.json' 11 | dir_path = os.path.dirname(train_path) 12 | if not os.path.exists(dir_path): 13 | os.makedirs(dir_path) 14 | dir_path = os.path.dirname(test_path) 15 | if not os.path.exists(dir_path): 16 | os.makedirs(dir_path) 17 | 18 | # Get MNIST data, normalize, and divide by level 19 | mnist = fetch_mldata('MNIST original', data_home='./data') 20 | mu = np.mean(mnist.data.astype(np.float32), 0) 21 | sigma = np.std(mnist.data.astype(np.float32), 0) 22 | mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001) 23 | mnist_data = [] 24 | for i in trange(10): 25 | idx = mnist.target==i 26 | mnist_data.append(mnist.data[idx]) 27 | 28 | print([len(v) for v in mnist_data]) 29 | 30 | ###### CREATE USER DATA SPLIT ####### 31 | # Assign 10 samples to each user 32 | X = [[] for _ in range(1000)] 33 | y = [[] for _ in range(1000)] 34 | idx = np.zeros(10, dtype=np.int64) 35 | for user in range(1000): 36 | for j in range(2): 37 | l = (user+j)%10 38 | X[user] += mnist_data[l][idx[l]:idx[l]+5].tolist() 39 | y[user] += (l*np.ones(5)).tolist() 40 | idx[l] += 5 41 | print(idx) 42 | 43 | # Assign remaining sample by power law 44 | user = 0 45 | props = np.random.lognormal(0, 2.0, (10,100,2)) 46 | props = np.array([[[len(v)-1000]] for v in mnist_data])*props/np.sum(props,(1,2), keepdims=True) 47 | #idx = 1000*np.ones(10, 
dtype=np.int64) 48 | for user in trange(1000): 49 | for j in range(2): 50 | l = (user+j)%10 51 | num_samples = int(props[l,user//10,j]) 52 | #print(num_samples) 53 | if idx[l] + num_samples < len(mnist_data[l]): 54 | X[user] += mnist_data[l][idx[l]:idx[l]+num_samples].tolist() 55 | y[user] += (l*np.ones(num_samples)).tolist() 56 | idx[l] += num_samples 57 | 58 | print(idx) 59 | 60 | # Create data structure 61 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 62 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 63 | 64 | # Setup 1000 users 65 | for i in trange(1000, ncols=120): 66 | uname = 'f_{0:05d}'.format(i) 67 | 68 | combined = list(zip(X[i], y[i])) 69 | random.shuffle(combined) 70 | X[i][:], y[i][:] = zip(*combined) 71 | num_samples = len(X[i]) 72 | train_len = int(0.9*num_samples) 73 | test_len = num_samples - train_len 74 | 75 | train_data['users'].append(uname) 76 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]} 77 | train_data['num_samples'].append(train_len) 78 | test_data['users'].append(uname) 79 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]} 80 | test_data['num_samples'].append(test_len) 81 | 82 | print(train_data['num_samples']) 83 | print(sum(train_data['num_samples'])) 84 | 85 | with open(train_path,'w') as outfile: 86 | json.dump(train_data, outfile) 87 | with open(test_path, 'w') as outfile: 88 | json.dump(test_data, outfile) 89 | -------------------------------------------------------------------------------- /data/nist/README.md: -------------------------------------------------------------------------------- 1 | # FEMNIST Dataset 2 | 3 | ## Setup Instructions 4 | 5 | 6 | You can download the dataset [here](https://drive.google.com/file/d/1tCEcJgRJ8NdRo11UJZR6WSKMNdmox4GC/view?usp=sharing), unzip it and put the `train` and `test` folder under `data`. 7 | 8 | 9 | The FEMNIST data we used in the paper is a subsampled (and repartitioned) version of the original full dataset in order to impose additional statistical heterogeneity. The above dataset is generated by the following instruction: 10 | 11 | (1) First, 12 | 13 | Run preprocess.sh with a choice of the following tags: 14 | 15 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. is included in the 'Notes' section 16 | - ```--iu``` := number of users, if iid sampling; expressed as a fraction of the total number of users; default is 0.01 17 | - ```--sf``` := fraction of data to sample, written as a decimal; default is 0.1 18 | - ```-k``` := minimum number of samples per user 19 | - ```-t``` := 'user' to partition users into train-test groups, or 'sample' to partition each user's samples into train-test groups 20 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 21 | 22 | 23 | And then run: 24 | 25 | ``` 26 | ./preprocess.sh -s niid --sf 0.5 -k 0 -tf 0.8 -t sample 27 | ``` 28 | 29 | 30 | (Make sure to delete the rem\_user\_data, sampled\_data, test, and train subfolders in the data directory before re-running preprocess.sh.) 
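
As a quick sanity check after the re-partitioning step below, the generated JSON can be inspected directly. This snippet is illustrative only (not part of the repo) and assumes it is run from `data/nist/data/`, where `my_sample.py` writes `train/mytrain.json` and `test/mytest.json`:

```
import json

# Illustrative check, not part of the repo.
with open('train/mytrain.json') as f:
    train = json.load(f)
# my_sample.py creates 200 users (cf. the FEMNIST row of the statistics table),
# each holding samples from 3 of the 10 lowercase classes.
print(len(train['users']), sum(train['num_samples']))
```
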
31 | 32 | (2) And then re-partition the data: 33 | 34 | ``` 35 | cd data 36 | python my_sample.py 37 | ``` 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /data/nist/data/my_sample.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import json 3 | import math 4 | import numpy as np 5 | import os 6 | import sys 7 | import random 8 | from tqdm import trange 9 | 10 | from PIL import Image 11 | 12 | NUM_USER = 200 13 | CLASS_PER_USER = 3 # from 10 lowercase characters 14 | 15 | 16 | def relabel_class(c): 17 | ''' 18 | maps hexadecimal class value (string) to a decimal number 19 | returns: 20 | - 0 through 9 for classes representing respective numbers 21 | - 10 through 35 for classes representing respective uppercase letters 22 | - 36 through 61 for classes representing respective lowercase letters 23 | ''' 24 | if c.isdigit() and int(c) < 40: 25 | return (int(c) - 30) 26 | elif int(c, 16) <= 90: # uppercase 27 | return (int(c, 16) - 55) 28 | else: 29 | return (int(c, 16) - 61) # lowercase 30 | 31 | def load_image(file_name): 32 | '''read in a png 33 | Return: a flatted list representing the image 34 | ''' 35 | size = (28, 28) 36 | img = Image.open(file_name) 37 | gray = img.convert('L') 38 | gray.thumbnail(size, Image.ANTIALIAS) 39 | arr = np.asarray(gray).copy() 40 | vec = arr.flatten() 41 | vec = vec / 255 # scale all pixel values to between 0 and 1 42 | vec = vec.tolist() 43 | 44 | return vec 45 | 46 | 47 | def main(): 48 | file_dir = "raw_data/by_class" 49 | 50 | train_path = "train/mytrain.json" 51 | test_path = "test/mytest.json" 52 | 53 | X = [[] for _ in range(NUM_USER)] 54 | y = [[] for _ in range(NUM_USER)] 55 | 56 | nist_data = {} 57 | 58 | 59 | for class_ in os.listdir(file_dir): 60 | 61 | real_class = relabel_class(class_) 62 | if real_class >= 36 and real_class <= 45: 63 | full_img_path = file_dir + "/" + class_ + "/train_" + class_ 64 | all_files_this_class = os.listdir(full_img_path) 65 | random.shuffle(all_files_this_class) 66 | sampled_files_this_class = all_files_this_class[:4000] 67 | imgs = [] 68 | for img in sampled_files_this_class: 69 | imgs.append(load_image(full_img_path + "/" + img)) 70 | class_ = relabel_class(class_) 71 | print(class_) 72 | nist_data[class_-36] = imgs # a list of list, key is (0, 25) 73 | print(len(imgs)) 74 | 75 | num_samples = np.random.lognormal(4, 1, (NUM_USER)) + 5 76 | 77 | idx = np.zeros(10, dtype=np.int64) 78 | 79 | for user in range(NUM_USER): 80 | num_sample_per_class = int(num_samples[user] / CLASS_PER_USER) 81 | if num_sample_per_class < 2: 82 | num_sample_per_class = 2 83 | 84 | for j in range(CLASS_PER_USER): 85 | class_id = (user + j) % 10 86 | if idx[class_id] + num_sample_per_class < len(nist_data[class_id]): 87 | idx[class_id] = 0 88 | X[user] += nist_data[class_id][idx[class_id]: (idx[class_id] + num_sample_per_class)] 89 | y[user] += (class_id * np.ones(num_sample_per_class)).tolist() 90 | idx[class_id] += num_sample_per_class 91 | 92 | # Create data structure 93 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 94 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 95 | 96 | for i in trange(NUM_USER, ncols=120): 97 | uname = 'f_{0:05d}'.format(i) 98 | 99 | combined = list(zip(X[i], y[i])) 100 | random.shuffle(combined) 101 | X[i][:], y[i][:] = zip(*combined) 102 | num_samples = len(X[i]) 103 | train_len = int(0.9 * num_samples) 104 | test_len = num_samples - train_len 105 | 
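        # 90/10 per-user train/test split, stored in the JSON layout used throughout
        # this repo: {'users': [...], 'user_data': {uname: {'x': ..., 'y': ...}}, 'num_samples': [...]}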
106 | train_data['users'].append(uname) 107 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]} 108 | train_data['num_samples'].append(train_len) 109 | test_data['users'].append(uname) 110 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]} 111 | test_data['num_samples'].append(test_len) 112 | 113 | with open(train_path, 'w') as outfile: 114 | json.dump(train_data, outfile) 115 | with open(test_path, 'w') as outfile: 116 | json.dump(test_data, outfile) 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | 122 | -------------------------------------------------------------------------------- /data/nist/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #rm -rf rem_user_data sampled_data test train 4 | 5 | # download data and convert to .json format 6 | 7 | if [ ! -d "data/all_data" ] || [ ! "$(ls -A data/all_data)" ]; then 8 | cd preprocess 9 | ./data_to_json.sh 10 | cd .. 11 | fi 12 | 13 | NAME="nist" # name of the dataset, equivalent to directory name 14 | 15 | cd ../../utils 16 | 17 | # ./preprocess.sh -s niid --sf 0.05 -k 64 -t sample 18 | # ./preprocess.sh --name nist -s niid --sf 1.0 -k 0 -t sample 19 | # ./preprocess.sh --name sent140 -s niid --sf 1.0 -k 1 -t sample 20 | ./preprocess.sh --name $NAME $@ 21 | 22 | cd ../data/$NAME 23 | -------------------------------------------------------------------------------- /data/nist/preprocess/data_to_json.py: -------------------------------------------------------------------------------- 1 | # Converts a list of (writer, [list of (file,class)]) tuples into a json object 2 | # of the form: 3 | # {users: [bob, etc], num_samples: [124, etc.], 4 | # user_data: {bob : {x:[img1,img2,etc], y:[class1,class2,etc]}, etc}} 5 | # where 'img_' is a vectorized representation of the corresponding image 6 | 7 | from __future__ import division 8 | import json 9 | import math 10 | import numpy as np 11 | import os 12 | import sys 13 | 14 | from PIL import Image 15 | 16 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 17 | utils_dir = os.path.join(utils_dir, 'utils') 18 | sys.path.append(utils_dir) 19 | 20 | import utils 21 | 22 | 23 | MAX_WRITERS = 100 # max number of writers per json file. 
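# Example outputs of relabel_class (defined below), which maps the hexadecimal
# class-folder names to labels: '30' -> 0 (digit '0'), '39' -> 9 (digit '9'),
# '41' -> 10 ('A'), '5a' -> 35 ('Z'), '61' -> 36 ('a'), '7a' -> 61 ('z').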
24 | 25 | 26 | def relabel_class(c): 27 | ''' 28 | maps hexadecimal class value (string) to a decimal number 29 | returns: 30 | - 0 through 9 for classes representing respective numbers 31 | - 10 through 35 for classes representing respective uppercase letters 32 | - 36 through 61 for classes representing respective lowercase letters 33 | ''' 34 | if c.isdigit() and int(c) < 40: 35 | return (int(c) - 30) 36 | elif int(c, 16) <= 90: # uppercase 37 | return (int(c, 16) - 55) 38 | else: 39 | return (int(c, 16) - 61) 40 | 41 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 42 | 43 | ibwd = os.path.join(parent_path, 'data', 'intermediate', 'images_by_writer') 44 | writers = utils.load_obj(ibwd) 45 | 46 | num_json = int(math.ceil(len(writers) / MAX_WRITERS)) 47 | 48 | users = [[] for _ in range(num_json)] 49 | num_samples = [[] for _ in range(num_json)] 50 | user_data = [{} for _ in range(num_json)] 51 | 52 | writer_count = 0 53 | json_index = 0 54 | for (w, l) in writers: 55 | 56 | users[json_index].append(w) 57 | num_samples[json_index].append(len(l)) 58 | user_data[json_index][w] = {'x': [], 'y': []} 59 | 60 | size = 28, 28 # original image size is 128, 128 61 | for (f, c) in l: 62 | file_path = os.path.join(parent_path, f) 63 | img = Image.open(file_path) 64 | gray = img.convert('L') 65 | gray.thumbnail(size, Image.ANTIALIAS) 66 | arr = np.asarray(gray).copy() 67 | vec = arr.flatten() 68 | vec = vec / 255 # scale all pixel values to between 0 and 1 69 | vec = vec.tolist() 70 | 71 | nc = relabel_class(c) 72 | 73 | user_data[json_index][w]['x'].append(vec) 74 | user_data[json_index][w]['y'].append(nc) 75 | 76 | writer_count += 1 77 | if writer_count == MAX_WRITERS: 78 | 79 | all_data = {} 80 | all_data['users'] = users[json_index] 81 | all_data['num_samples'] = num_samples[json_index] 82 | all_data['user_data'] = user_data[json_index] 83 | 84 | file_name = 'all_data_%d.json' % json_index 85 | file_path = os.path.join(parent_path, 'data', 'all_data', file_name) 86 | 87 | print('writing %s' % file_name) 88 | 89 | with open(file_path, 'w') as outfile: 90 | json.dump(all_data, outfile) 91 | 92 | writer_count = 0 93 | json_index += 1 94 | -------------------------------------------------------------------------------- /data/nist/preprocess/data_to_json.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # assumes that the script is run in the preprocess folder 4 | 5 | if [ ! -d "../data" ]; then 6 | mkdir ../data 7 | fi 8 | if [ ! -d "../data/raw_data" ]; then 9 | echo "------------------------------" 10 | echo "downloading data" 11 | mkdir ../data/raw_data 12 | ./get_data.sh 13 | echo "finished downloading data" 14 | fi 15 | 16 | if [ ! -d "../data/intermediate" ]; then # stores .pkl files during preprocessing 17 | mkdir ../data/intermediate 18 | fi 19 | 20 | if [ ! -f ../data/intermediate/class_file_dirs.pkl ]; then 21 | echo "------------------------------" 22 | echo "extracting file directories of images" 23 | python3 get_file_dirs.py 24 | echo "finished extracting file directories of images" 25 | fi 26 | 27 | if [ ! -f ../data/intermediate/class_file_hashes.pkl ]; then 28 | echo "------------------------------" 29 | echo "calculating image hashes" 30 | python3 get_hashes.py 31 | echo "finished calculating image hashes" 32 | fi 33 | 34 | if [ ! 
-f ../data/intermediate/write_with_class.pkl ]; then 35 | echo "------------------------------" 36 | echo "assigning class labels to write images" 37 | python3 match_hashes.py 38 | echo "finished assigning class labels to write images" 39 | fi 40 | 41 | if [ ! -f ../data/intermediate/images_by_writer.pkl ]; then 42 | echo "------------------------------" 43 | echo "grouping images by writer" 44 | python3 group_by_writer.py 45 | echo "finished grouping images by writer" 46 | fi 47 | 48 | if [ ! -d "../data/all_data" ]; then 49 | mkdir ../data/all_data 50 | fi 51 | if [ ! "$(ls -A ../data/all_data)" ]; then 52 | echo "------------------------------" 53 | echo "converting data to .json format" 54 | python3 data_to_json.py 55 | echo "finished converting data to .json format" 56 | fi 57 | -------------------------------------------------------------------------------- /data/nist/preprocess/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # assumes that the script is run in the preprocess folder 4 | 5 | cd ../data/raw_data 6 | wget https://s3.amazonaws.com/nist-srd/SD19/by_class.zip 7 | wget https://s3.amazonaws.com/nist-srd/SD19/by_write.zip 8 | unzip by_class.zip 9 | rm by_class.zip 10 | unzip by_write.zip 11 | rm by_write.zip 12 | cd ../../preprocess 13 | -------------------------------------------------------------------------------- /data/nist/preprocess/get_file_dirs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Creates .pkl files for: 3 | 1. list of directories of every image in 'by_class' 4 | 2. list of directories of every image in 'by_write' 5 | the hierarchal structure of the data is as follows: 6 | - by_class -> classes -> folders containing images -> images 7 | - by_write -> folders containing writers -> writer -> types of images -> images 8 | the directories written into the files are of the form 'raw_data/...' 
9 | ''' 10 | 11 | import os 12 | import sys 13 | 14 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 15 | utils_dir = os.path.join(utils_dir, 'utils') 16 | sys.path.append(utils_dir) 17 | 18 | import utils 19 | 20 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 21 | 22 | class_files = [] # (class, file directory) 23 | write_files = [] # (writer, file directory) 24 | 25 | class_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_class') 26 | rel_class_dir = os.path.join('data', 'raw_data', 'by_class') 27 | classes = os.listdir(class_dir) 28 | 29 | for cl in classes: 30 | cldir = os.path.join(class_dir, cl) 31 | rel_cldir = os.path.join(rel_class_dir, cl) 32 | subcls = os.listdir(cldir) 33 | 34 | subcls = [s for s in subcls if (('hsf' in s) and ('mit' not in s))] 35 | 36 | for subcl in subcls: 37 | subcldir = os.path.join(cldir, subcl) 38 | rel_subcldir = os.path.join(rel_cldir, subcl) 39 | images = os.listdir(subcldir) 40 | image_dirs = [os.path.join(rel_subcldir, i) for i in images] 41 | 42 | for image_dir in image_dirs: 43 | class_files.append((cl, image_dir)) 44 | 45 | write_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_write') 46 | rel_write_dir = os.path.join('data', 'raw_data', 'by_write') 47 | write_parts = os.listdir(write_dir) 48 | 49 | for write_part in write_parts: 50 | writers_dir = os.path.join(write_dir, write_part) 51 | rel_writers_dir = os.path.join(rel_write_dir, write_part) 52 | writers = os.listdir(writers_dir) 53 | 54 | for writer in writers: 55 | writer_dir = os.path.join(writers_dir, writer) 56 | rel_writer_dir = os.path.join(rel_writers_dir, writer) 57 | wtypes = os.listdir(writer_dir) 58 | 59 | for wtype in wtypes: 60 | type_dir = os.path.join(writer_dir, wtype) 61 | rel_type_dir = os.path.join(rel_writer_dir, wtype) 62 | images = os.listdir(type_dir) 63 | image_dirs = [os.path.join(rel_type_dir, i) for i in images] 64 | 65 | for image_dir in image_dirs: 66 | write_files.append((writer, image_dir)) 67 | 68 | utils.save_obj( 69 | class_files, 70 | os.path.join(parent_path, 'data', 'intermediate', 'class_file_dirs')) 71 | utils.save_obj( 72 | write_files, 73 | os.path.join(parent_path, 'data', 'intermediate', 'write_file_dirs')) 74 | -------------------------------------------------------------------------------- /data/nist/preprocess/get_hashes.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import sys 4 | 5 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 6 | utils_dir = os.path.join(utils_dir, 'utils') 7 | sys.path.append(utils_dir) 8 | 9 | import utils 10 | 11 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 12 | 13 | cfd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_dirs') 14 | wfd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_dirs') 15 | class_file_dirs = utils.load_obj(cfd) 16 | write_file_dirs = utils.load_obj(wfd) 17 | 18 | class_file_hashes = [] 19 | write_file_hashes = [] 20 | 21 | count = 0 22 | for tup in class_file_dirs: 23 | if (count%100000 == 0): 24 | print('hashed %d class images' % count) 25 | 26 | (cclass, cfile) = tup 27 | file_path = os.path.join(parent_path, cfile) 28 | 29 | chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest() 30 | 31 | class_file_hashes.append((cclass, cfile, chash)) 32 | 33 | count += 1 34 | 35 | cfhd = 
os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes') 36 | utils.save_obj(class_file_hashes, cfhd) 37 | 38 | count = 0 39 | for tup in write_file_dirs: 40 | if (count%100000 == 0): 41 | print('hashed %d write images' % count) 42 | 43 | (cclass, cfile) = tup 44 | file_path = os.path.join(parent_path, cfile) 45 | 46 | chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest() 47 | 48 | write_file_hashes.append((cclass, cfile, chash)) 49 | 50 | count += 1 51 | 52 | wfhd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_hashes') 53 | utils.save_obj(write_file_hashes, wfhd) 54 | -------------------------------------------------------------------------------- /data/nist/preprocess/group_by_writer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 5 | utils_dir = os.path.join(utils_dir, 'utils') 6 | sys.path.append(utils_dir) 7 | 8 | import utils 9 | 10 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 11 | 12 | wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class') 13 | write_class = utils.load_obj(wwcd) 14 | 15 | writers = [] # each entry is a (writer, [list of (file, class)]) tuple 16 | cimages = [] 17 | (cw, _, _) = write_class[0] 18 | for (w, f, c) in write_class: 19 | if w != cw: 20 | writers.append((cw, cimages)) 21 | cw = w 22 | cimages = [(f, c)] 23 | cimages.append((f, c)) 24 | writers.append((cw, cimages)) 25 | 26 | ibwd = os.path.join(parent_path, 'data', 'intermediate', 'images_by_writer') 27 | utils.save_obj(writers, ibwd) 28 | -------------------------------------------------------------------------------- /data/nist/preprocess/match_hashes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 5 | utils_dir = os.path.join(utils_dir, 'utils') 6 | sys.path.append(utils_dir) 7 | 8 | import utils 9 | 10 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 11 | 12 | cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes') 13 | wfhd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_hashes') 14 | class_file_hashes = utils.load_obj(cfhd) # each elem is (class, file dir, hash) 15 | write_file_hashes = utils.load_obj(wfhd) # each elem is (writer, file dir, hash) 16 | 17 | class_hash_dict = {} 18 | for i in range(len(class_file_hashes)): 19 | (c, f, h) = class_file_hashes[len(class_file_hashes)-i-1] 20 | class_hash_dict[h] = (c, f) 21 | 22 | write_classes = [] 23 | for tup in write_file_hashes: 24 | (w, f, h) = tup 25 | write_classes.append((w, f, class_hash_dict[h][0])) 26 | 27 | wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class') 28 | utils.save_obj(write_classes, wwcd) 29 | -------------------------------------------------------------------------------- /data/nist/stats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME="nist" 4 | 5 | cd ../../utils 6 | 7 | python3 stats.py --name $NAME 8 | 9 | cd ../data/$NAME -------------------------------------------------------------------------------- /data/sent140/README.md: -------------------------------------------------------------------------------- 1 | # 
Sentiment140 Dataset 2 | 3 | ## Setup Instructions 4 | 5 | You can download the dataset [here](https://drive.google.com/file/d/1pgHf4DUZkGI6q-NLjBzMawX5yn4Y40k0/view?usp=sharing), unzip it and put the `train` and `test` folder under `data`. 6 | 7 | If you would like to run on Sent140, you also need to download a pre-trained embedding file using the following commands (this may take 3-5 minutes): 8 | 9 | ``` 10 | cd FedProx_clean_code/flearn/models/sent140 11 | bash get_embs.sh 12 | ``` 13 | 14 | The above dataset is sampled using the following instructions: 15 | 16 | Run preprocess.sh with a choice of the following tags: 17 | 18 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. is included in the 'Notes' section 19 | - ```--iu``` := number of users, if iid sampling; expressed as a fraction of the total number of users; default is 0.01 20 | - ```--sf``` := fraction of data to sample, written as a decimal; default is 0.1 21 | - ```-k``` := minimum number of samples per user 22 | - ```-t``` := 'user' to partition users into train-test groups, or 'sample' to partition each user's samples into train-test groups 23 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 24 | 25 | 26 | Instruction used to generate Sent140 in the paper: 27 | 28 | ``` 29 | ./preprocess.sh -s niid --sf 0.3 -k 30 -tf 0.8 -t sample 30 | ``` 31 | 32 | 33 | (Make sure to delete the rem\_user\_data, sampled\_data, test, and train subfolders in the data directory before re-running preprocess.sh.) 34 | 35 | 36 | -------------------------------------------------------------------------------- /data/sent140/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # download data and convert to .json format 4 | 5 | if [ ! -d "data/all_data" ] || [ ! "$(ls -A data/all_data)" ]; then 6 | cd preprocess 7 | ./data_to_json.sh 8 | cd .. 
9 | fi 10 | 11 | NAME="sent140" # name of the dataset, equivalent to directory name 12 | 13 | cd ../../utils 14 | 15 | ./preprocess.sh --name $NAME $@ 16 | 17 | cd ../data/$NAME -------------------------------------------------------------------------------- /data/sent140/preprocess/combine_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | each row of created .csv file is of the form: 3 | polarity, id, date, query, user, comment, test_or_training 4 | ''' 5 | 6 | import csv 7 | import os 8 | 9 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 10 | 11 | train_file_name = os.path.join(parent_path, 'data', 'raw_data', 'training.csv') 12 | 13 | training = [] 14 | with open(train_file_name, 'rt', encoding='ISO-8859-1') as f: 15 | reader = csv.reader(f) 16 | training = list(reader) 17 | 18 | test_file_name = os.path.join(parent_path, 'data', 'raw_data', 'test.csv') 19 | 20 | test = [] 21 | with open(test_file_name, 'rt', encoding='ISO-8859-1') as f: 22 | reader = csv.reader(f) 23 | test = list(reader) 24 | 25 | out_file_name = os.path.join(parent_path, 'data', 'intermediate', 'all_data.csv') 26 | 27 | with open (out_file_name, 'w') as f: 28 | writer = csv.writer(f) 29 | 30 | for row in training: 31 | row.append('training') 32 | writer.writerow(row) 33 | 34 | for row in test: 35 | row.append('test') 36 | writer.writerow(row) 37 | -------------------------------------------------------------------------------- /data/sent140/preprocess/data_to_json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import os 4 | 5 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 6 | 7 | data_dir = os.path.join(parent_path, 'data', 'intermediate', 'all_data.csv') 8 | 9 | data = [] 10 | with open(data_dir, 'rt', encoding='ISO-8859-1') as f: 11 | reader = csv.reader(f) 12 | data = list(reader) 13 | 14 | data = sorted(data, key=lambda x: x[4]) 15 | 16 | # ------------ 17 | # get # of users in data, and list of users (note automatically sorted) 18 | 19 | num_users = 1 20 | cuser = data[0][4] 21 | users = [cuser] 22 | 23 | for i in range(len(data)): 24 | row = data[i] 25 | tuser = row[4] 26 | if tuser != cuser: 27 | num_users += 1 28 | cuser = tuser 29 | users.append(tuser) 30 | 31 | # ------------ 32 | # get # of samples for each user 33 | 34 | num_samples = [0 for _ in range(num_users)] 35 | cuser = data[0][4] 36 | user_i = 0 37 | 38 | for i in range(len(data)): 39 | row = data[i] 40 | tuser = row[4] 41 | if tuser != cuser: 42 | cuser = tuser 43 | user_i += 1 44 | num_samples[user_i] += 1 45 | 46 | # ------------ 47 | # create user_data 48 | 49 | user_data = {} 50 | row_i = 0 51 | 52 | for u in users: 53 | user_data[u] = {'x': [], 'y': []} 54 | 55 | while ((row_i < len(data)) and (data[row_i][4] == u)): 56 | row = data[row_i] 57 | 58 | user_data[u]['x'].append(row[1:]) 59 | user_data[u]['y'].append(row[0]) 60 | 61 | row_i += 1 62 | 63 | # ------------ 64 | # create .json file 65 | 66 | all_data = {} 67 | all_data['users'] = users 68 | all_data['num_samples'] = num_samples 69 | all_data['user_data'] = user_data 70 | 71 | file_path = os.path.join(parent_path, 'data', 'all_data', 'all_data.json') 72 | 73 | with open(file_path, 'w') as outfile: 74 | json.dump(all_data, outfile) 75 | -------------------------------------------------------------------------------- /data/sent140/preprocess/data_to_json.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -d "../data" ]; then 4 | mkdir ../data 5 | fi 6 | if [ ! -d "../data/raw_data" ]; then 7 | mkdir ../data/raw_data 8 | fi 9 | if [ ! -f ../data/raw_data/test.csv ]; then 10 | echo "------------------------------" 11 | echo "retrieving raw data" 12 | 13 | ./get_data.sh 14 | echo "finished retrieving raw data" 15 | fi 16 | 17 | if [ ! -d "../data/intermediate" ]; then 18 | echo "------------------------------" 19 | echo "combining raw_data .csv files" 20 | mkdir ../data/intermediate 21 | python3 combine_data.py 22 | echo "finished combining raw_data .csv files" 23 | fi 24 | 25 | if [ ! -d "../data/all_data" ]; then 26 | echo "------------------------------" 27 | echo "converting data to .json format" 28 | mkdir ../data/all_data 29 | python3 data_to_json.py 30 | echo "finished converting data to .json format" 31 | fi 32 | -------------------------------------------------------------------------------- /data/sent140/preprocess/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd ../data/raw_data 4 | 5 | if [ ! -f trainingandtestdata.zip ]; then 6 | wget --no-check-certificate http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip 7 | fi 8 | 9 | unzip trainingandtestdata.zip 10 | 11 | mv training.1600000.processed.noemoticon.csv training.csv 12 | mv testdata.manual.2009.06.14.csv test.csv 13 | 14 | rm trainingandtestdata.zip 15 | 16 | cd ../../preprocess -------------------------------------------------------------------------------- /data/sent140/stats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME="sent140" 4 | 5 | cd ../../utils 6 | 7 | python3 stats.py --name $NAME 8 | 9 | cd ../data/$NAME -------------------------------------------------------------------------------- /data/shakespeare/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/data/shakespeare/.DS_Store -------------------------------------------------------------------------------- /data/shakespeare/README.md: -------------------------------------------------------------------------------- 1 | # Shakespeare Dataset 2 | 3 | ## Setup Instructions 4 | 5 | 6 | You can download the dataset [here](https://drive.google.com/file/d/1cVpkJA0cIKN4t-n5Fl95sPlAzhGgE_ZJ/view?usp=sharing), unzip it and put the `train` and `test` folder under `data`. 7 | 8 | The above dataset is sampled using the following instructions: 9 | 10 | - Run preprocess.sh with a choice of the following tags: 11 | 12 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. is included in the 'Notes' section 13 | - ```--iu``` := number of users, if i.i.d. 
sampling; expressed as a fraction of the total number of users; default is 0.01 14 | - ```--sf``` := fraction of data to sample, written as a decimal; default is 0.1 15 | - ```-k``` := minimum number of samples per user 16 | - ```-t``` := 'user' to partition users into train-test groups, or 'sample' to partition each user's samples into train-test groups; default is 'sample' 17 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.8 18 | - ```--raw``` := include users' raw text data in all_data.json 19 | 20 | Instruction used to generate Shakespeare in the paper: 21 | 22 | ``` 23 | ./preprocess.sh -s niid --sf 0.2 -k 0 -tf 0.8 -t sample 24 | ``` 25 | 26 | 27 | Make sure to delete the rem\_user\_data, sampled\_data, test, and train subfolders in the data directory before re-running preprocess.sh 28 | 29 | -------------------------------------------------------------------------------- /data/shakespeare/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/data/shakespeare/data/.DS_Store -------------------------------------------------------------------------------- /data/shakespeare/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # download data and convert to .json format 4 | 5 | RAWTAG="" 6 | if [[ $@ = *"--raw"* ]]; then 7 | RAWTAG="--raw" 8 | fi 9 | if [ ! -d "data/all_data" ] || [ ! "$(ls -A data/all_data)" ]; then 10 | cd preprocess 11 | ./data_to_json.sh $RAWTAG 12 | cd .. 13 | fi 14 | 15 | NAME="shakespeare" 16 | 17 | cd ../../utils 18 | 19 | ./preprocess.sh --name $NAME $@ 20 | 21 | cd ../data/$NAME -------------------------------------------------------------------------------- /data/shakespeare/preprocess/data_to_json.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -d "../data" ]; then 4 | mkdir ../data 5 | fi 6 | 7 | if [ ! -d "../data/raw_data" ]; then 8 | mkdir ../data/raw_data 9 | fi 10 | 11 | if [ ! -f ../data/raw_data/raw_data.txt ]; then 12 | ./get_data.sh 13 | fi 14 | 15 | if [ ! -d "../data/raw_data/by_play_and_character" ]; then 16 | echo "dividing txt data between users" 17 | python3 preprocess_shakespeare.py ../data/raw_data/raw_data.txt ../data/raw_data/ 18 | fi 19 | 20 | RAWTAG="" 21 | if [[ $@ = *"--raw"* ]]; then 22 | RAWTAG="--raw" 23 | fi 24 | if [ ! -d "../data/all_data" ]; then 25 | mkdir ../data/all_data 26 | fi 27 | if [ ! 
"$(ls -A ../data/all_data)" ]; then 28 | echo "generating all_data.json" 29 | python3 gen_all_data.py $RAWTAG 30 | fi -------------------------------------------------------------------------------- /data/shakespeare/preprocess/gen_all_data.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import json 4 | import os 5 | 6 | from shake_utils import parse_data_in 7 | 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('--raw', 11 | help='include users\' raw .txt data in respective .json files', 12 | action="store_true") 13 | 14 | parser.set_defaults(raw=False) 15 | 16 | args = parser.parse_args() 17 | 18 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 19 | 20 | users_and_plays_path = os.path.join(parent_path, 'data', 'raw_data', 'users_and_plays.json') 21 | txt_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_play_and_character') 22 | json_data = parse_data_in(txt_dir, users_and_plays_path, args.raw) 23 | json_path = os.path.join(parent_path, 'data', 'all_data', 'all_data.json') 24 | with open(json_path, 'w') as outfile: 25 | json.dump(json_data, outfile) 26 | -------------------------------------------------------------------------------- /data/shakespeare/preprocess/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd ../data/raw_data 4 | 5 | wget http://www.gutenberg.org/files/100/old/1994-01-100.zip 6 | unzip 1994-01-100.zip 7 | rm 1994-01-100.zip 8 | mv 100.txt raw_data.txt 9 | 10 | cd ../../preprocess -------------------------------------------------------------------------------- /data/shakespeare/preprocess/preprocess_shakespeare.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | import random 5 | import re 6 | import sys 7 | RANDOM_SEED = 1234 8 | # Regular expression to capture an actors name, and line continuation 9 | CHARACTER_RE = re.compile(r'^ ([a-zA-Z][a-zA-Z ]*)\. (.*)') 10 | CONT_RE = re.compile(r'^ (.*)') 11 | # The Comedy of Errors has errors in its indentation so we need to use 12 | # different regular expressions. 13 | COE_CHARACTER_RE = re.compile(r'^([a-zA-Z][a-zA-Z ]*)\. (.*)') 14 | COE_CONT_RE = re.compile(r'^(.*)') 15 | 16 | def _match_character_regex(line, comedy_of_errors=False): 17 | return (COE_CHARACTER_RE.match(line) if comedy_of_errors 18 | else CHARACTER_RE.match(line)) 19 | 20 | def _match_continuation_regex(line, comedy_of_errors=False): 21 | return ( 22 | COE_CONT_RE.match(line) if comedy_of_errors else CONT_RE.match(line)) 23 | 24 | def _split_into_plays(shakespeare_full): 25 | """Splits the full data by play.""" 26 | # List of tuples (play_name, dict from character to list of lines) 27 | plays = [] 28 | discarded_lines = [] # Track discarded lines. 29 | slines = shakespeare_full.splitlines(True)[1:] 30 | 31 | # skip contents, the sonnets, and all's well that ends well 32 | author_count = 0 33 | start_i = 0 34 | for i, l in enumerate(slines): 35 | if 'by William Shakespeare' in l: 36 | author_count += 1 37 | if author_count == 2: 38 | start_i = i - 5 39 | break 40 | slines = slines[start_i:] 41 | 42 | current_character = None 43 | comedy_of_errors = False 44 | for i, line in enumerate(slines): 45 | # This marks the end of the plays in the file. 
46 | if i > 124195 - start_i: 47 | break 48 | # This is a pretty good heuristic for detecting the start of a new play: 49 | if 'by William Shakespeare' in line: 50 | current_character = None 51 | characters = collections.defaultdict(list) 52 | # The title will be 2, 3, 4, 5, 6, or 7 lines above "by William Shakespeare". 53 | if slines[i - 2].strip(): 54 | title = slines[i - 2] 55 | elif slines[i - 3].strip(): 56 | title = slines[i - 3] 57 | elif slines[i - 4].strip(): 58 | title = slines[i - 4] 59 | elif slines[i - 5].strip(): 60 | title = slines[i - 5] 61 | elif slines[i - 6].strip(): 62 | title = slines[i - 6] 63 | else: 64 | title = slines[i - 7] 65 | title = title.strip() 66 | 67 | assert title, ( 68 | 'Parsing error on line %d. Expecting title 2 or 3 lines above.' % 69 | i) 70 | comedy_of_errors = (title == 'THE COMEDY OF ERRORS') 71 | # Degenerate plays are removed at the end of the method. 72 | plays.append((title, characters)) 73 | continue 74 | match = _match_character_regex(line, comedy_of_errors) 75 | if match: 76 | character, snippet = match.group(1), match.group(2) 77 | # Some character names are written with multiple casings, e.g., SIR_Toby 78 | # and SIR_TOBY. To normalize the character names, we uppercase each name. 79 | # Note that this was not done in the original preprocessing and is a 80 | # recent fix. 81 | character = character.upper() 82 | if not (comedy_of_errors and character.startswith('ACT ')): 83 | characters[character].append(snippet) 84 | current_character = character 85 | continue 86 | else: 87 | current_character = None 88 | continue 89 | elif current_character: 90 | match = _match_continuation_regex(line, comedy_of_errors) 91 | if match: 92 | if comedy_of_errors and match.group(1).startswith('<'): 93 | current_character = None 94 | continue 95 | else: 96 | characters[current_character].append(match.group(1)) 97 | continue 98 | # Didn't consume the line. 99 | line = line.strip() 100 | if line and i > 2646: 101 | # Before 2646 are the sonnets, which we expect to discard. 102 | discarded_lines.append('%d:%s' % (i, line)) 103 | # Remove degenerate "plays". 104 | return [play for play in plays if len(play[1]) > 1], discarded_lines 105 | 106 | def _remove_nonalphanumerics(filename): 107 | return re.sub('\\W+', '_', filename) 108 | 109 | def play_and_character(play, character): 110 | return _remove_nonalphanumerics((play + '_' + character).replace(' ', '_')) 111 | 112 | def _get_train_test_by_character(plays, test_fraction=0.2): 113 | """ 114 | Splits character data into train and test sets. 
115 | if test_fraction <= 0, returns {} for all_test_examples 116 | plays := list of (play, dict) tuples where play is a string and dict 117 | is a dictionary with character names as keys 118 | """ 119 | skipped_characters = 0 120 | all_train_examples = collections.defaultdict(list) 121 | all_test_examples = collections.defaultdict(list) 122 | 123 | def add_examples(example_dict, example_tuple_list): 124 | for play, character, sound_bite in example_tuple_list: 125 | example_dict[play_and_character( 126 | play, character)].append(sound_bite) 127 | 128 | users_and_plays = {} 129 | for play, characters in plays: 130 | curr_characters = list(characters.keys()) 131 | for c in curr_characters: 132 | users_and_plays[play_and_character(play, c)] = play 133 | for character, sound_bites in characters.items(): 134 | examples = [(play, character, sound_bite) 135 | for sound_bite in sound_bites] 136 | if len(examples) <= 2: 137 | skipped_characters += 1 138 | # Skip characters with fewer than 2 lines since we need at least one 139 | # train and one test line. 140 | continue 141 | train_examples = examples 142 | if test_fraction > 0: 143 | num_test = max(int(len(examples) * test_fraction), 1) 144 | train_examples = examples[:-num_test] 145 | test_examples = examples[-num_test:] 146 | assert len(test_examples) == num_test 147 | assert len(train_examples) >= len(test_examples) 148 | add_examples(all_test_examples, test_examples) 149 | add_examples(all_train_examples, train_examples) 150 | return users_and_plays, all_train_examples, all_test_examples 151 | 152 | def _write_data_by_character(examples, output_directory): 153 | """Writes a collection of data files by play & character.""" 154 | if not os.path.exists(output_directory): 155 | os.makedirs(output_directory) 156 | for character_name, sound_bites in examples.items(): 157 | filename = os.path.join(output_directory, character_name + '.txt') 158 | with open(filename, 'w') as output: 159 | for sound_bite in sound_bites: 160 | output.write(sound_bite + '\n') 161 | 162 | def main(argv): 163 | print('Splitting .txt data between users') 164 | input_filename = argv[0] 165 | with open(input_filename, 'r') as input_file: 166 | shakespeare_full = input_file.read() 167 | plays, discarded_lines = _split_into_plays(shakespeare_full) 168 | print ('Discarded %d lines' % len(discarded_lines)) 169 | users_and_plays, all_examples, _ = _get_train_test_by_character(plays, test_fraction=-1.0) 170 | output_directory = argv[1] 171 | with open(os.path.join(output_directory, 'users_and_plays.json'), 'w') as ouf: 172 | json.dump(users_and_plays, ouf) 173 | _write_data_by_character(all_examples, 174 | os.path.join(output_directory, 175 | 'by_play_and_character/')) 176 | 177 | if __name__ == '__main__': 178 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /data/shakespeare/preprocess/shake_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | helper functions for preprocessing shakespeare data 3 | ''' 4 | 5 | import json 6 | import os 7 | import re 8 | 9 | def __txt_to_data(txt_dir, seq_length=80): 10 | raw_text = "" 11 | with open(txt_dir,'r') as inf: 12 | raw_text = inf.read() 13 | raw_text = raw_text.replace('\n', ' ') 14 | raw_text = re.sub(r" *", r' ', raw_text) 15 | dataX = [] 16 | dataY = [] 17 | for i in range(0, len(raw_text) - seq_length, 1): 18 | seq_in = raw_text[i:i + seq_length] 19 | seq_out = raw_text[i + seq_length] 20 | dataX.append(seq_in) 21 | 
dataY.append(seq_out) 22 | return dataX, dataY 23 | 24 | def parse_data_in(data_dir, users_and_plays_path, raw=False): 25 | ''' 26 | returns dictionary with keys: users, num_samples, user_data 27 | raw := bool representing whether to include raw text in all_data 28 | if raw is True, then user_data key 29 | removes users with no data 30 | ''' 31 | with open(users_and_plays_path, 'r') as inf: 32 | users_and_plays = json.load(inf) 33 | files = os.listdir(data_dir) 34 | users = [] 35 | hierarchies = [] 36 | num_samples = [] 37 | user_data = {} 38 | for f in files: 39 | user = f[:-4] 40 | passage = '' 41 | filename = os.path.join(data_dir, f) 42 | with open(filename, 'r') as inf: 43 | passage = inf.read() 44 | dataX, dataY = __txt_to_data(filename) 45 | if(len(dataX) > 0): 46 | users.append(user) 47 | if raw: 48 | user_data[user] = {'raw': passage} 49 | else: 50 | user_data[user] = {} 51 | user_data[user]['x'] = dataX 52 | user_data[user]['y'] = dataY 53 | hierarchies.append(users_and_plays[user]) 54 | num_samples.append(len(dataY)) 55 | all_data = {} 56 | all_data['users'] = users 57 | all_data['hierarchies'] = hierarchies 58 | all_data['num_samples'] = num_samples 59 | all_data['user_data'] = user_data 60 | return all_data -------------------------------------------------------------------------------- /data/shakespeare/stats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME="shakespeare" 4 | 5 | cd ../../utils 6 | 7 | python3 stats.py --name $NAME 8 | 9 | cd ../data/$NAME -------------------------------------------------------------------------------- /data/synthetic_0.5_0.5/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | python generate_synthetic.py 3 | ``` -------------------------------------------------------------------------------- /data/synthetic_0.5_0.5/generate_synthetic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import numpy as np 4 | import os 5 | import sys 6 | import random 7 | from tqdm import trange 8 | import math 9 | 10 | 11 | NUM_USER = 30 12 | 13 | def softmax(x): 14 | ex = np.exp(x) 15 | sum_ex = np.sum( np.exp(x)) 16 | return ex/sum_ex 17 | 18 | 19 | def generate_synthetic(alpha, beta, iid): 20 | 21 | dimension = 60 22 | NUM_CLASS = 10 23 | 24 | samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 50 25 | print(samples_per_user) 26 | num_samples = np.sum(samples_per_user) 27 | 28 | X_split = [[] for _ in range(NUM_USER)] 29 | y_split = [[] for _ in range(NUM_USER)] 30 | 31 | 32 | #### define some eprior #### 33 | mean_W = np.random.normal(0, alpha, NUM_USER) 34 | mean_b = mean_W 35 | B = np.random.normal(0, beta, NUM_USER) 36 | mean_x = np.zeros((NUM_USER, dimension)) 37 | 38 | diagonal = np.zeros(dimension) 39 | for j in range(dimension): 40 | diagonal[j] = np.power((j+1), -1.2) 41 | cov_x = np.diag(diagonal) 42 | 43 | for i in range(NUM_USER): 44 | if iid == 1: 45 | mean_x[i] = np.ones(dimension) * B[i] # all zeros 46 | else: 47 | mean_x[i] = np.random.normal(B[i], 1, dimension) 48 | print(mean_x[i]) 49 | 50 | if iid == 1: 51 | W_global = np.random.normal(0, 1, (dimension, NUM_CLASS)) 52 | b_global = np.random.normal(0, 1, NUM_CLASS) 53 | 54 | for i in range(NUM_USER): 55 | 56 | W = np.random.normal(mean_W[i], 1, (dimension, NUM_CLASS)) 57 | b = np.random.normal(mean_b[i], 1, NUM_CLASS) 58 | 59 | if iid == 1: 60 | W = W_global 61 | b = b_global 62 | 
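        # Summary of the generative model below: each user i has a local softmax model
        # W ~ N(mean_W[i], 1), b ~ N(mean_b[i], 1), where alpha controls how far the
        # per-user model means spread apart; features are x ~ N(mean_x[i], cov_x), where
        # beta controls how far the per-user feature means spread apart; labels are
        # y = argmax(softmax(x @ W + b)). With iid == 1, a single global (W, b) is shared.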
63 | xx = np.random.multivariate_normal(mean_x[i], cov_x, samples_per_user[i]) 64 | yy = np.zeros(samples_per_user[i]) 65 | 66 | for j in range(samples_per_user[i]): 67 | tmp = np.dot(xx[j], W) + b 68 | yy[j] = np.argmax(softmax(tmp)) 69 | 70 | X_split[i] = xx.tolist() 71 | y_split[i] = yy.tolist() 72 | 73 | print("{}-th users has {} exampls".format(i, len(y_split[i]))) 74 | 75 | 76 | return X_split, y_split 77 | 78 | 79 | 80 | def main(): 81 | 82 | 83 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 84 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 85 | 86 | train_path = "data/train/mytrain.json" 87 | test_path = "data/test/mytest.json" 88 | 89 | #X, y = generate_synthetic(alpha=0, beta=0, iid=0) # synthetiv (0,0) 90 | X, y = generate_synthetic(alpha=0.5, beta=0.5, iid=0) # synthetic (0.5, 0.5) 91 | #X, y = generate_synthetic(alpha=1, beta=1, iid=0) # synthetic (1,1) 92 | #X, y = generate_synthetic(alpha=0, beta=0, iid=1) # synthetic_IID 93 | 94 | 95 | # Create data structure 96 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 97 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 98 | 99 | for i in trange(NUM_USER, ncols=120): 100 | 101 | uname = 'f_{0:05d}'.format(i) 102 | combined = list(zip(X[i], y[i])) 103 | random.shuffle(combined) 104 | X[i][:], y[i][:] = zip(*combined) 105 | num_samples = len(X[i]) 106 | train_len = int(0.9 * num_samples) 107 | test_len = num_samples - train_len 108 | 109 | train_data['users'].append(uname) 110 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]} 111 | train_data['num_samples'].append(train_len) 112 | test_data['users'].append(uname) 113 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]} 114 | test_data['num_samples'].append(test_len) 115 | 116 | 117 | with open(train_path,'w') as outfile: 118 | json.dump(train_data, outfile) 119 | with open(test_path, 'w') as outfile: 120 | json.dump(test_data, outfile) 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | 126 | -------------------------------------------------------------------------------- /data/synthetic_0_0/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | python generate_synthetic.py 3 | ``` -------------------------------------------------------------------------------- /data/synthetic_0_0/generate_synthetic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import numpy as np 4 | import os 5 | import sys 6 | import random 7 | from tqdm import trange 8 | import math 9 | 10 | 11 | NUM_USER = 30 12 | 13 | def softmax(x): 14 | ex = np.exp(x) 15 | sum_ex = np.sum( np.exp(x)) 16 | return ex/sum_ex 17 | 18 | 19 | def generate_synthetic(alpha, beta, iid): 20 | 21 | dimension = 60 22 | NUM_CLASS = 10 23 | 24 | samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 50 25 | print(samples_per_user) 26 | num_samples = np.sum(samples_per_user) 27 | 28 | X_split = [[] for _ in range(NUM_USER)] 29 | y_split = [[] for _ in range(NUM_USER)] 30 | 31 | 32 | #### define some eprior #### 33 | mean_W = np.random.normal(0, alpha, NUM_USER) 34 | mean_b = mean_W 35 | B = np.random.normal(0, beta, NUM_USER) 36 | mean_x = np.zeros((NUM_USER, dimension)) 37 | 38 | diagonal = np.zeros(dimension) 39 | for j in range(dimension): 40 | diagonal[j] = np.power((j+1), -1.2) 41 | cov_x = np.diag(diagonal) 42 | 43 | for i in range(NUM_USER): 44 | if iid == 1: 45 | 
mean_x[i] = np.ones(dimension) * B[i] # all zeros 46 | else: 47 | mean_x[i] = np.random.normal(B[i], 1, dimension) 48 | print(mean_x[i]) 49 | 50 | if iid == 1: 51 | W_global = np.random.normal(0, 1, (dimension, NUM_CLASS)) 52 | b_global = np.random.normal(0, 1, NUM_CLASS) 53 | 54 | for i in range(NUM_USER): 55 | 56 | W = np.random.normal(mean_W[i], 1, (dimension, NUM_CLASS)) 57 | b = np.random.normal(mean_b[i], 1, NUM_CLASS) 58 | 59 | if iid == 1: 60 | W = W_global 61 | b = b_global 62 | 63 | xx = np.random.multivariate_normal(mean_x[i], cov_x, samples_per_user[i]) 64 | yy = np.zeros(samples_per_user[i]) 65 | 66 | for j in range(samples_per_user[i]): 67 | tmp = np.dot(xx[j], W) + b 68 | yy[j] = np.argmax(softmax(tmp)) 69 | 70 | X_split[i] = xx.tolist() 71 | y_split[i] = yy.tolist() 72 | 73 | print("{}-th user has {} examples".format(i, len(y_split[i]))) 74 | 75 | 76 | return X_split, y_split 77 | 78 | 79 | 80 | def main(): 81 | 82 | 83 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 84 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 85 | 86 | train_path = "data/train/mytrain.json" 87 | test_path = "data/test/mytest.json" 88 | 89 | X, y = generate_synthetic(alpha=0, beta=0, iid=0) # synthetic (0,0) 90 | #X, y = generate_synthetic(alpha=0.5, beta=0.5, iid=0) # synthetic (0.5, 0.5) 91 | #X, y = generate_synthetic(alpha=1, beta=1, iid=0) # synthetic (1,1) 92 | #X, y = generate_synthetic(alpha=0, beta=0, iid=1) # synthetic_IID 93 | 94 | 95 | # Create data structure 96 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 97 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 98 | 99 | for i in trange(NUM_USER, ncols=120): 100 | 101 | uname = 'f_{0:05d}'.format(i) 102 | combined = list(zip(X[i], y[i])) 103 | random.shuffle(combined) 104 | X[i][:], y[i][:] = zip(*combined) 105 | num_samples = len(X[i]) 106 | train_len = int(0.9 * num_samples) 107 | test_len = num_samples - train_len 108 | 109 | train_data['users'].append(uname) 110 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]} 111 | train_data['num_samples'].append(train_len) 112 | test_data['users'].append(uname) 113 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]} 114 | test_data['num_samples'].append(test_len) 115 | 116 | 117 | with open(train_path,'w') as outfile: 118 | json.dump(train_data, outfile) 119 | with open(test_path, 'w') as outfile: 120 | json.dump(test_data, outfile) 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | 126 | -------------------------------------------------------------------------------- /data/synthetic_1_1/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | python generate_synthetic.py 3 | ``` -------------------------------------------------------------------------------- /data/synthetic_1_1/generate_synthetic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import numpy as np 4 | import os 5 | import sys 6 | import random 7 | from tqdm import trange 8 | import math 9 | 10 | 11 | NUM_USER = 30 12 | 13 | 14 | def softmax(x): 15 | ex = np.exp(x) 16 | sum_ex = np.sum( np.exp(x)) 17 | return ex/sum_ex 18 | 19 | 20 | def generate_synthetic(alpha, beta, iid): 21 | 22 | dimension = 60 23 | NUM_CLASS = 10 24 | 25 | samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 50 26 | #samples_per_user = 50*np.ones((NUM_USER,)) 27 | #samples_per_user = 
np.array(samples_per_user, int) 28 | print(samples_per_user) 29 | num_samples = np.sum(samples_per_user) 30 | 31 | X_split = [[] for _ in range(NUM_USER)] 32 | y_split = [[] for _ in range(NUM_USER)] 33 | 34 | 35 | #### define some priors #### 36 | mean_W = np.random.normal(0, alpha, NUM_USER) 37 | mean_b = mean_W 38 | B = np.random.normal(0, beta, NUM_USER) 39 | mean_x = np.zeros((NUM_USER, dimension)) 40 | 41 | diagonal = np.zeros(dimension) 42 | for j in range(dimension): 43 | diagonal[j] = np.power((j+1), -1.2) 44 | cov_x = np.diag(diagonal) 45 | 46 | for i in range(NUM_USER): 47 | mean_x[i] = np.random.normal(B[i], 1, dimension) 48 | print(mean_x[i]) 49 | 50 | 51 | for i in range(NUM_USER): 52 | 53 | W = np.random.normal(mean_W[i], 1, (dimension, NUM_CLASS)) 54 | b = np.random.normal(mean_b[i], 1, NUM_CLASS) 55 | 56 | xx = np.random.multivariate_normal(mean_x[i], cov_x, samples_per_user[i]) 57 | yy = np.zeros(samples_per_user[i]) 58 | 59 | for j in range(samples_per_user[i]): 60 | tmp = np.dot(xx[j], W) + b 61 | yy[j] = np.argmax(softmax(tmp)) 62 | 63 | X_split[i] = xx.tolist() 64 | y_split[i] = yy.tolist() 65 | 66 | print("{}-th user has {} examples".format(i, len(y_split[i]))) 67 | 68 | 69 | return X_split, y_split 70 | 71 | 72 | 73 | def main(): 74 | 75 | 76 | train_path = "data/synthetic_1_1/data/train/mytrain_equal.json" 77 | test_path = "data/synthetic_1_1/data/test/mytest_equal.json" 78 | 79 | X, y = generate_synthetic(alpha=1, beta=1, iid=0) # synthetic (1,1) 80 | 81 | 82 | # Create data structure 83 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 84 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 85 | 86 | for i in trange(NUM_USER, ncols=120): 87 | 88 | uname = 'f_{0:05d}'.format(i) 89 | combined = list(zip(X[i], y[i])) 90 | random.shuffle(combined) 91 | X[i][:], y[i][:] = zip(*combined) 92 | num_samples = len(X[i]) 93 | train_len = int(0.9 * num_samples) 94 | test_len = num_samples - train_len 95 | 96 | train_data['users'].append(uname) 97 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]} 98 | train_data['num_samples'].append(train_len) 99 | test_data['users'].append(uname) 100 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]} 101 | test_data['num_samples'].append(test_len) 102 | 103 | 104 | with open(train_path,'w') as outfile: 105 | json.dump(train_data, outfile) 106 | with open(test_path, 'w') as outfile: 107 | json.dump(test_data, outfile) 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /data/synthetic_iid/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | python generate_iid.py 3 | ``` -------------------------------------------------------------------------------- /data/synthetic_iid/generate_iid.py: -------------------------------------------------------------------------------- 1 | import json, math, os, sys 2 | import numpy as np 3 | import random 4 | from tqdm import trange 5 | 6 | 7 | NUM_USER = 30 8 | 9 | def softmax(x): 10 | ex = np.exp(x) 11 | sum_ex = np.sum(np.exp(x)) 12 | return ex/sum_ex 13 | 14 | def generate_synthetic(alpha, beta, iid): 15 | dimension = 60 16 | NUM_CLASS = 10 17 | 18 | samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 50 19 | #samples_per_user = 50*np.ones((NUM_USER,)) 20 | samples_per_user = np.array(samples_per_user, int) 21 | print(samples_per_user) 22 | num_samples 
= np.sum(samples_per_user) 23 | 24 | X_split = [[] for _ in range(NUM_USER)] 25 | y_split = [[] for _ in range(NUM_USER)] 26 | 27 | #### define some priors #### 28 | mean_x = np.zeros((NUM_USER, dimension)) 29 | 30 | diagonal = np.zeros(dimension) 31 | for j in range(dimension): 32 | diagonal[j] = np.power((j+1), -1.2) 33 | cov_x = np.diag(diagonal) 34 | 35 | for i in range(NUM_USER): 36 | mean_x[i] = np.zeros(dimension) 37 | 38 | W = np.random.normal(0, 1, (dimension, NUM_CLASS)) 39 | b = np.random.normal(0, 1, NUM_CLASS) 40 | 41 | for i in range(NUM_USER): 42 | xx = np.random.multivariate_normal(mean_x[i], cov_x, samples_per_user[i]) 43 | yy = np.zeros(samples_per_user[i]) 44 | 45 | for j in range(samples_per_user[i]): 46 | tmp = np.dot(xx[j], W) + b 47 | yy[j] = np.argmax(softmax(tmp)) 48 | 49 | X_split[i] = xx.tolist() 50 | y_split[i] = yy.tolist() 51 | 52 | print("{}-th user has {} examples".format(i, len(y_split[i]))) 53 | 54 | return X_split, y_split 55 | 56 | 57 | 58 | def main(): 59 | train_path = "data/synthetic_iid/data/train/mytrain_equal.json" 60 | test_path = "data/synthetic_iid/data/test/mytest_equal.json" 61 | 62 | X, y = generate_synthetic(alpha=0, beta=0, iid=1) 63 | 64 | # Create data structure 65 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]} 66 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]} 67 | 68 | for i in trange(NUM_USER, ncols=120): 69 | 70 | uname = 'f_{0:05d}'.format(i) 71 | combined = list(zip(X[i], y[i])) 72 | random.shuffle(combined) 73 | X[i][:], y[i][:] = zip(*combined) 74 | num_samples = len(X[i]) 75 | train_len = int(0.9 * num_samples) 76 | test_len = num_samples - train_len 77 | 78 | train_data['users'].append(uname) 79 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]} 80 | train_data['num_samples'].append(train_len) 81 | test_data['users'].append(uname) 82 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]} 83 | test_data['num_samples'].append(test_len) 84 | 85 | with open(train_path, 'w') as outfile: 86 | json.dump(train_data, outfile) 87 | with open(test_path, 'w') as outfile: 88 | json.dump(test_data, outfile) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | 94 | -------------------------------------------------------------------------------- /flearn/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/.DS_Store -------------------------------------------------------------------------------- /flearn/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/__init__.py -------------------------------------------------------------------------------- /flearn/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/models/__pycache__/client.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/__pycache__/client.cpython-36.pyc
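All four synthetic generators above write LEAF-style JSON files whose top-level keys are `users`, `user_data` and `num_samples`. A minimal sketch of how such an output can be inspected, assuming the relative `data/train/mytrain.json` path used by the non-IID generators (the path and working directory are assumptions; adjust for the `*_equal.json` files written by `generate_iid.py`):

```python
import json

# Path assumption: run from the dataset folder (e.g. data/synthetic_0.5_0.5/)
# so that the relative path used by generate_synthetic.py resolves.
with open("data/train/mytrain.json") as f:
    train = json.load(f)

print(len(train["users"]))          # 30 users (NUM_USER)
print(sum(train["num_samples"]))    # total number of training samples
uname = train["users"][0]           # user ids look like 'f_00000'
x = train["user_data"][uname]["x"]  # list of 60-dimensional feature vectors
y = train["user_data"][uname]["y"]  # float class labels in [0, 10)
print(len(x), len(x[0]), y[:5])
```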
-------------------------------------------------------------------------------- /flearn/models/celeba/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/celeba/__init__.py -------------------------------------------------------------------------------- /flearn/models/celeba/cnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from tqdm import trange 5 | 6 | 7 | from flearn.utils.tf_utils import graph_size, process_grad 8 | from flearn.utils.model_utils import batch_data, batch_data_celeba, process_x, process_y 9 | 10 | IMAGE_SIZE = 84 11 | IMAGES_DIR = os.path.join('..', 'data', 'celeba', 'data', 'raw', 'img_align_celeba') 12 | 13 | 14 | class Model(object): 15 | def __init__(self, num_classes, optimizer, seed=1): 16 | # params 17 | self.num_classes = num_classes 18 | 19 | # create computation graph 20 | self.graph = tf.Graph() 21 | with self.graph.as_default(): 22 | tf.set_random_seed(123 + seed) 23 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, \ 24 | self.loss = self.create_model(optimizer) 25 | self.saver = tf.train.Saver() 26 | 27 | config = tf.ConfigProto() 28 | config.gpu_options.allow_growth = True 29 | self.sess = tf.Session(graph=self.graph, config=config) 30 | 31 | # find memory footprint and compute cost of the model 32 | self.size = graph_size(self.graph) 33 | with self.graph.as_default(): 34 | self.sess.run(tf.global_variables_initializer()) 35 | metadata = tf.RunMetadata() 36 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 37 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 38 | 39 | def create_model(self, optimizer): 40 | input_ph = tf.placeholder(tf.float32, shape=(None, IMAGE_SIZE, IMAGE_SIZE, 3)) 41 | out = input_ph 42 | for _ in range(4): 43 | out = tf.layers.conv2d(out, 32, 3, padding='same') 44 | out = tf.layers.batch_normalization(out, training=True) 45 | out = tf.layers.max_pooling2d(out, 2, 2, padding='same') 46 | out = tf.nn.relu(out) 47 | out = tf.reshape(out, (-1, int(np.prod(out.get_shape()[1:])))) 48 | logits = tf.layers.dense(out, self.num_classes) 49 | label_ph = tf.placeholder(tf.int64, shape=(None,)) 50 | loss = tf.losses.sparse_softmax_cross_entropy(labels=label_ph, logits=logits) 51 | predictions = { 52 | "classes": tf.argmax(input=logits, axis=1), 53 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 54 | } 55 | grads_and_vars = optimizer.compute_gradients(loss) 56 | grads, _ = zip(*grads_and_vars) 57 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 58 | eval_metric_ops = tf.count_nonzero(tf.equal(label_ph, tf.argmax(input=logits, axis=1))) 59 | 60 | return input_ph, label_ph, train_op, grads, eval_metric_ops, loss 61 | 62 | 63 | 64 | def set_params(self, model_params=None): 65 | if model_params is not None: 66 | with self.graph.as_default(): 67 | all_vars = tf.trainable_variables() 68 | for variable, value in zip(all_vars, model_params): 69 | variable.load(value, self.sess) 70 | 71 | def get_params(self): 72 | with self.graph.as_default(): 73 | model_params = self.sess.run(tf.trainable_variables()) 74 | return model_params 75 | 76 | def get_gradients(self, data, model_len): 77 | num_samples = len(data['y']) # Need model len 78 | with 
self.graph.as_default(): 79 | grads = self.sess.run(self.grads, 80 | feed_dict={self.features: process_x(data['x']), self.labels: process_y(data['y'])}) 81 | grads = process_grad(grads) 82 | 83 | return num_samples, grads 84 | 85 | 86 | def solve_inner(self, data, num_epochs=1, batch_size=32): 87 | '''Solves local optimization problem''' 88 | 89 | with self.graph.as_default(): 90 | _, grads = self.get_gradients(data, 610) # Ignore the hardcoding, it's not used anywhere 91 | 92 | for _ in trange(num_epochs, desc='Epoch: ', leave=False, ncols=120): 93 | for X, y in batch_data_celeba(data, batch_size): 94 | with self.graph.as_default(): 95 | self.sess.run(self.train_op, 96 | feed_dict={self.features: X, self.labels: y}) 97 | soln = self.get_params() 98 | comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 99 | return soln, comp, grads 100 | 101 | def solve_sgd(self, mini_batch_data): 102 | with self.graph.as_default(): 103 | grads, loss, _ = self.sess.run([self.grads, self.loss, self.train_op], 104 | feed_dict={self.features: mini_batch_data[0], 105 | self.labels: mini_batch_data[1]}) 106 | weights = self.get_params() 107 | return grads, loss, weights 108 | 109 | def test(self, data): 110 | ''' 111 | Args: 112 | data: dict of the form {'x': [list], 'y': [list]} 113 | ''' 114 | with self.graph.as_default(): 115 | tot_correct, loss = self.sess.run([self.eval_metric_ops, self.loss], 116 | feed_dict={self.features: process_x(data['x']), 117 | self.labels: process_y(data['y'])}) 118 | return tot_correct, loss 119 | 120 | 121 | def close(self): 122 | self.sess.close() -------------------------------------------------------------------------------- /flearn/models/client.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class Client(object): 5 | 6 | def __init__(self, id, group=None, train_data={'x':[],'y':[]}, eval_data={'x':[],'y':[]}, model=None): 7 | self.model = model 8 | self.id = id # integer 9 | self.group = group 10 | self.train_data = {k: np.array(v) for k, v in train_data.items()} 11 | self.eval_data = {k: np.array(v) for k, v in eval_data.items()} 12 | self.num_samples = len(self.train_data['y']) 13 | self.test_samples = len(self.eval_data['y']) 14 | self.updatevec = np.append(model.get_params()[0].flatten(), model.get_params()[1]) 15 | 16 | def set_params(self, model_params): 17 | '''set model parameters''' 18 | self.model.set_params(model_params) 19 | 20 | def get_params(self): 21 | '''get model parameters''' 22 | return self.model.get_params() 23 | 24 | def get_grads(self, model_len): 25 | '''get model gradient''' 26 | return self.model.get_gradients(self.train_data, model_len) 27 | 28 | def solve_grad(self): 29 | '''get model gradient with cost''' 30 | bytes_w = self.model.size 31 | grads = self.model.get_gradients(self.train_data) 32 | comp = self.model.flops * self.num_samples 33 | bytes_r = self.model.size 34 | return ((self.num_samples, grads), (bytes_w, comp, bytes_r)) 35 | 36 | def solve_inner(self, num_epochs=1, batch_size=10): 37 | '''Solves local optimization problem 38 | 39 | Return: 40 | 1: num_samples: number of samples used in training 41 | 1: soln: local optimization solution 42 | 2: bytes read: number of bytes received 43 | 2: comp: number of FLOPs executed in training process 44 | 2: bytes_write: number of bytes transmitted 45 | ''' 46 | 47 | bytes_w = self.model.size 48 | soln, comp, grads = self.model.solve_inner(self.train_data, num_epochs, batch_size) 49 | 
bytes_r = self.model.size 50 | return (self.num_samples, soln), (bytes_w, comp, bytes_r), grads 51 | 52 | def solve_iters(self, num_iters=1, batch_size=10): 53 | '''Solves local optimization problem 54 | 55 | Return: 56 | 1: num_samples: number of samples used in training 57 | 1: soln: local optimization solution 58 | 2: bytes read: number of bytes received 59 | 2: comp: number of FLOPs executed in training process 60 | 2: bytes_write: number of bytes transmitted 61 | ''' 62 | 63 | bytes_w = self.model.size 64 | soln, comp = self.model.solve_iters(self.train_data, num_iters, batch_size) 65 | bytes_r = self.model.size 66 | return (self.num_samples, soln), (bytes_w, comp, bytes_r) 67 | 68 | def train_error_and_loss(self): 69 | tot_correct, loss = self.model.test(self.train_data) 70 | return tot_correct, loss, self.num_samples 71 | 72 | 73 | def test(self): 74 | '''tests current model on local eval_data 75 | 76 | Return: 77 | tot_correct: total #correct predictions 78 | test_samples: int 79 | ''' 80 | tot_correct, loss = self.model.test(self.eval_data) 81 | return tot_correct, self.test_samples 82 | -------------------------------------------------------------------------------- /flearn/models/mnist/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/mnist/__init__.py -------------------------------------------------------------------------------- /flearn/models/mnist/mclr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tqdm import trange 4 | 5 | from flearn.utils.model_utils import batch_data, batch_data_multiple_iters 6 | from flearn.utils.tf_utils import graph_size 7 | from flearn.utils.tf_utils import process_grad 8 | 9 | 10 | class Model(object): 11 | ''' 12 | Assumes that images are 28px by 28px 13 | ''' 14 | 15 | def __init__(self, num_classes, optimizer, seed=1): 16 | 17 | # params 18 | self.num_classes = num_classes 19 | 20 | # create computation graph 21 | self.graph = tf.Graph() 22 | with self.graph.as_default(): 23 | tf.set_random_seed(123+seed) 24 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, self.loss = self.create_model(optimizer) 25 | self.saver = tf.train.Saver() 26 | self.sess = tf.Session(graph=self.graph) 27 | 28 | # find memory footprint and compute cost of the model 29 | self.size = graph_size(self.graph) 30 | with self.graph.as_default(): 31 | self.sess.run(tf.global_variables_initializer()) 32 | metadata = tf.RunMetadata() 33 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 34 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 35 | 36 | def create_model(self, optimizer): 37 | """Model function for Logistic Regression.""" 38 | features = tf.placeholder(tf.float32, shape=[None, 784], name='features') 39 | labels = tf.placeholder(tf.int64, shape=[None,], name='labels') 40 | logits = tf.layers.dense(inputs=features, units=self.num_classes, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001)) 41 | predictions = { 42 | "classes": tf.argmax(input=logits, axis=1), 43 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 44 | } 45 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 46 | 47 | grads_and_vars = optimizer.compute_gradients(loss) 48 | grads, _ = zip(*grads_and_vars) 49 | train_op = 
optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 50 | eval_metric_ops = tf.count_nonzero(tf.equal(labels, predictions["classes"])) 51 | return features, labels, train_op, grads, eval_metric_ops, loss 52 | 53 | def set_params(self, model_params=None): 54 | if model_params is not None: 55 | with self.graph.as_default(): 56 | all_vars = tf.trainable_variables() 57 | for variable, value in zip(all_vars, model_params): 58 | variable.load(value, self.sess) 59 | 60 | def get_params(self): 61 | with self.graph.as_default(): 62 | model_params = self.sess.run(tf.trainable_variables()) 63 | return model_params 64 | 65 | def get_gradients(self, data, model_len): 66 | 67 | grads = np.zeros(model_len) 68 | num_samples = len(data['y']) 69 | 70 | with self.graph.as_default(): 71 | model_grads = self.sess.run(self.grads, 72 | feed_dict={self.features: data['x'], self.labels: data['y']}) 73 | grads = process_grad(model_grads) 74 | 75 | return num_samples, grads 76 | 77 | def solve_inner(self, data, num_epochs=1, batch_size=32): 78 | '''Solves local optimization problem''' 79 | for _ in trange(num_epochs, desc='Epoch: ', leave=False, ncols=120): 80 | for X, y in batch_data(data, batch_size): 81 | with self.graph.as_default(): 82 | self.sess.run(self.train_op, 83 | feed_dict={self.features: X, self.labels: y}) 84 | soln = self.get_params() 85 | comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 86 | return soln, comp 87 | 88 | def solve_iters(self, data, num_iters=1, batch_size=32): 89 | '''Solves local optimization problem''' 90 | 91 | for X, y in batch_data_multiple_iters(data, batch_size, num_iters): 92 | with self.graph.as_default(): 93 | self.sess.run(self.train_op, feed_dict={self.features: X, self.labels: y}) 94 | soln = self.get_params() 95 | comp = 0 96 | return soln, comp 97 | 98 | def test(self, data): 99 | ''' 100 | Args: 101 | data: dict of the form {'x': [list], 'y': [list]} 102 | ''' 103 | with self.graph.as_default(): 104 | tot_correct, loss = self.sess.run([self.eval_metric_ops, self.loss], 105 | feed_dict={self.features: data['x'], self.labels: data['y']}) 106 | return tot_correct, loss 107 | 108 | def close(self): 109 | self.sess.close() 110 | -------------------------------------------------------------------------------- /flearn/models/nist/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/nist/__init__.py -------------------------------------------------------------------------------- /flearn/models/nist/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/nist/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/models/nist/cnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tqdm import trange 4 | 5 | from flearn.utils.model_utils import batch_data 6 | from flearn.utils.tf_utils import graph_size 7 | from flearn.utils.tf_utils import process_grad 8 | 9 | 10 | class Model(object): 11 | def __init__(self, num_classes, optimizer, seed=1): 12 | # params 13 | self.num_classes = num_classes 14 | 15 | # create computation graph 16 | self.graph = tf.Graph() 
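# Note (editorial annotation): the constructor below builds a TF1-style static graph for this
# client model: create_model() returns the input/label placeholders, the training op, the
# per-variable gradient tensors, the correct-prediction counter and the loss, all seeded with
# 123 + seed for reproducibility; graph_size() and tf.profiler then record the model's size in
# bytes (self.size) and its FLOP count (self.flops).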
17 | with self.graph.as_default(): 18 | tf.set_random_seed(123 + seed) 19 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, self.loss, self.predictions = self.create_model(optimizer) 20 | self.saver = tf.train.Saver() 21 | config = tf.ConfigProto() 22 | config.gpu_options.allow_growth = True 23 | self.sess = tf.Session(graph=self.graph, config=config) 24 | 25 | # find memory footprint and compute cost of the model 26 | self.size = graph_size(self.graph) 27 | with self.graph.as_default(): 28 | self.sess.run(tf.global_variables_initializer()) 29 | metadata = tf.RunMetadata() 30 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 31 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 32 | 33 | def create_model(self, optimizer): 34 | """Model function for CNN.""" 35 | features = tf.placeholder(tf.float32, shape=[None, 784], name='features') 36 | labels = tf.placeholder(tf.int64, shape=[None, ], name='labels') 37 | input_layer = tf.reshape(features, [-1, 28, 28, 1]) 38 | conv1 = tf.layers.conv2d( 39 | inputs=input_layer, 40 | filters=16, 41 | kernel_size=[5, 5], 42 | padding="same", 43 | activation=tf.nn.relu) 44 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) 45 | conv2 = tf.layers.conv2d( 46 | inputs=pool1, 47 | filters=32, 48 | kernel_size=[5, 5], 49 | padding="same", 50 | activation=tf.nn.relu) 51 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) 52 | 53 | pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 32]) 54 | dense = tf.layers.dense(inputs=pool2_flat, units=128, activation=tf.nn.relu) 55 | 56 | logits = tf.layers.dense(inputs=dense, units=self.num_classes) 57 | predictions = { 58 | "classes": tf.argmax(input=logits, axis=1), 59 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 60 | } 61 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 62 | grads_and_vars = optimizer.compute_gradients(loss) 63 | grads, _ = zip(*grads_and_vars) 64 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 65 | eval_metric_ops = tf.count_nonzero(tf.equal(labels, predictions["classes"])) 66 | 67 | return features, labels, train_op, grads, eval_metric_ops, loss, predictions['classes'] 68 | 69 | 70 | def set_params(self, model_params=None): 71 | if model_params is not None: 72 | with self.graph.as_default(): 73 | all_vars = tf.trainable_variables() 74 | for variable, value in zip(all_vars, model_params): 75 | variable.load(value, self.sess) 76 | 77 | def get_params(self): 78 | with self.graph.as_default(): 79 | model_params = self.sess.run(tf.trainable_variables()) 80 | return model_params 81 | 82 | def get_gradients(self, mini_batch_data, model_len): 83 | 84 | #grads = np.zeros(model_len) 85 | num_samples = len(mini_batch_data['y']) 86 | with self.graph.as_default(): 87 | model_grads = self.sess.run(self.grads, 88 | feed_dict={self.features: mini_batch_data['x'], 89 | self.labels: mini_batch_data['y']}) 90 | 91 | grads = process_grad(model_grads) 92 | 93 | return num_samples, grads 94 | 95 | 96 | def solve_inner(self, data, num_epochs=1, batch_size=32): 97 | 98 | with self.graph.as_default(): 99 | _, grads = self.get_gradients(data, 610) # Ignore the hardcoding, it's not used anywhere 100 | '''Solves local optimization problem''' 101 | for _ in range(num_epochs): 102 | for X, y in batch_data(data, batch_size): 103 | with self.graph.as_default(): 104 | self.sess.run(self.train_op, 
feed_dict={self.features: X, self.labels: y}) 105 | soln = self.get_params() 106 | comp = num_epochs * (len(data['y']) // batch_size) * batch_size * self.flops 107 | return soln, comp, grads 108 | 109 | def solve_sgd(self, mini_batch_data): 110 | with self.graph.as_default(): 111 | grads, loss, _ = self.sess.run([self.grads, self.loss, self.train_op], 112 | feed_dict={self.features: mini_batch_data[0], 113 | self.labels: mini_batch_data[1]}) 114 | 115 | weights = self.get_params() 116 | return grads, loss, weights 117 | 118 | def get_loss(self, data): 119 | with self.graph.as_default(): 120 | loss = self.sess.run(self.loss, feed_dict={self.features: data['x'], self.labels: data['y']}) 121 | return loss 122 | 123 | 124 | def test(self, data): 125 | ''' 126 | Args: 127 | data: dict of the form {'x': [list], 'y': [list]} 128 | ''' 129 | with self.graph.as_default(): 130 | tot_correct, loss = self.sess.run([self.eval_metric_ops, self.loss], 131 | feed_dict={self.features: data['x'], self.labels: data['y']}) 132 | return tot_correct, loss 133 | 134 | def close(self): 135 | self.sess.close() 136 | -------------------------------------------------------------------------------- /flearn/models/nist/mclr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tqdm import trange 4 | 5 | from flearn.utils.model_utils import batch_data, batch_data_multiple_iters 6 | from flearn.utils.tf_utils import graph_size 7 | from flearn.utils.tf_utils import process_grad 8 | 9 | 10 | class Model(object): 11 | ''' 12 | Assumes that images are 28px by 28px 13 | ''' 14 | 15 | def __init__(self, num_classes, optimizer, seed=1): 16 | 17 | # params 18 | self.num_classes = num_classes 19 | 20 | # create computation graph 21 | self.graph = tf.Graph() 22 | with self.graph.as_default(): 23 | tf.set_random_seed(123+seed) 24 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, self.loss = self.create_model(optimizer) 25 | self.saver = tf.train.Saver() 26 | self.sess = tf.Session(graph=self.graph) 27 | 28 | # find memory footprint and compute cost of the model 29 | self.size = graph_size(self.graph) 30 | with self.graph.as_default(): 31 | self.sess.run(tf.global_variables_initializer()) 32 | metadata = tf.RunMetadata() 33 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 34 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 35 | 36 | def create_model(self, optimizer): 37 | """Model function for Logistic Regression.""" 38 | features = tf.placeholder(tf.float32, shape=[None, 784], name='features') 39 | labels = tf.placeholder(tf.int64, shape=[None,], name='labels') 40 | logits = tf.layers.dense(inputs=features, units=self.num_classes, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001)) 41 | predictions = { 42 | "classes": tf.argmax(input=logits, axis=1), 43 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 44 | } 45 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 46 | 47 | grads_and_vars = optimizer.compute_gradients(loss) 48 | grads, _ = zip(*grads_and_vars) 49 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 50 | eval_metric_ops = tf.count_nonzero(tf.equal(labels, predictions["classes"])) 51 | return features, labels, train_op, grads, eval_metric_ops, loss 52 | 53 | def set_params(self, model_params=None): 54 | if model_params is not None: 55 | with 
self.graph.as_default(): 56 | all_vars = tf.trainable_variables() 57 | for variable, value in zip(all_vars, model_params): 58 | variable.load(value, self.sess) 59 | 60 | def get_params(self): 61 | with self.graph.as_default(): 62 | model_params = self.sess.run(tf.trainable_variables()) 63 | return model_params 64 | 65 | def get_gradients(self, data, model_len): 66 | 67 | grads = np.zeros(model_len) 68 | num_samples = len(data['y']) 69 | 70 | with self.graph.as_default(): 71 | model_grads = self.sess.run(self.grads, 72 | feed_dict={self.features: data['x'], self.labels: data['y']}) 73 | grads = process_grad(model_grads) 74 | 75 | return num_samples, grads 76 | 77 | def solve_inner(self, data, num_epochs=1, batch_size=32): 78 | '''Solves local optimization problem''' 79 | for _ in trange(num_epochs, desc='Epoch: ', leave=False, ncols=120): 80 | for X, y in batch_data(data, batch_size): 81 | with self.graph.as_default(): 82 | self.sess.run(self.train_op, feed_dict={self.features: X, self.labels: y}) 83 | soln = self.get_params() 84 | comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 85 | return soln, comp 86 | 87 | def solve_iters(self, data, num_iters=1, batch_size=32): 88 | '''Solves local optimization problem''' 89 | 90 | for X, y in batch_data_multiple_iters(data, batch_size, num_iters): 91 | with self.graph.as_default(): 92 | self.sess.run(self.train_op, feed_dict={self.features: X, self.labels: y}) 93 | soln = self.get_params() 94 | comp = 0 95 | return soln, comp 96 | 97 | def test(self, data): 98 | ''' 99 | Args: 100 | data: dict of the form {'x': [list], 'y': [list]} 101 | ''' 102 | with self.graph.as_default(): 103 | tot_correct, loss = self.sess.run([self.eval_metric_ops, self.loss], 104 | feed_dict={self.features: data['x'], self.labels: data['y']}) 105 | return tot_correct, loss 106 | 107 | def close(self): 108 | self.sess.close() 109 | -------------------------------------------------------------------------------- /flearn/models/sent140/get_embs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | parser = argparse.ArgumentParser() 5 | 6 | parser.add_argument('-f', 7 | help='path to .txt file containing word embedding information;', 8 | type=str, 9 | default='glove.6B.300d.txt') 10 | 11 | args = parser.parse_args() 12 | 13 | lines = [] 14 | with open(args.f, 'r') as inf: 15 | lines = inf.readlines() 16 | lines = [l.split() for l in lines] 17 | vocab = [l[0] for l in lines] 18 | emb_floats = [[float(n) for n in l[1:]] for l in lines] 19 | emb_floats.append([0.0 for _ in range(300)]) # for unknown word 20 | js = {'vocab': vocab, 'emba': emb_floats} 21 | with open('embs.json', 'w') as ouf: 22 | json.dump(js, ouf) 23 | -------------------------------------------------------------------------------- /flearn/models/sent140/get_embs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd sent140 4 | 5 | if [ ! -f 'glove.6B.300d.txt' ]; then 6 | wget http://nlp.stanford.edu/data/glove.6B.zip 7 | unzip glove.6B.zip 8 | rm glove.6B.50d.txt glove.6B.100d.txt glove.6B.200d.txt glove.6B.zip 9 | fi 10 | 11 | if [ ! 
-f embs.json ]; then 12 | python3 get_embs.py 13 | fi -------------------------------------------------------------------------------- /flearn/models/sent140/stacked_lstm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import tensorflow as tf 4 | from tqdm import trange 5 | 6 | from tensorflow.contrib import rnn 7 | 8 | from flearn.utils.model_utils import batch_data, batch_data_multiple_iters 9 | from flearn.utils.language_utils import line_to_indices 10 | from flearn.utils.tf_utils import graph_size, process_grad, process_sparse_grad 11 | 12 | with open('flearn/models/sent140/embs.json', 'r') as inf: 13 | embs = json.load(inf) 14 | id2word = embs['vocab'] 15 | word2id = {v: k for k,v in enumerate(id2word)} 16 | word_emb = np.array(embs['emba']) 17 | 18 | def process_x(raw_x_batch, max_words=25): 19 | x_batch = [e[4] for e in raw_x_batch] 20 | x_batch = [line_to_indices(e, word2id, max_words) for e in x_batch] 21 | x_batch = np.array(x_batch) 22 | return x_batch 23 | 24 | def process_y(raw_y_batch): 25 | y_batch = [1 if e=='4' else 0 for e in raw_y_batch] 26 | y_batch = np.array(y_batch) 27 | 28 | return y_batch 29 | 30 | class Model(object): 31 | 32 | def __init__(self, seq_len, num_classes, n_hidden, optimizer, seed): 33 | #params 34 | self.seq_len = seq_len 35 | self.num_classes = num_classes 36 | self.n_hidden = n_hidden 37 | self.emb_arr = word_emb 38 | 39 | # create computation graph 40 | self.graph = tf.Graph() 41 | with self.graph.as_default(): 42 | tf.set_random_seed(123+seed) 43 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, self.loss = self.create_model(optimizer) 44 | self.saver = tf.train.Saver() 45 | self.sess = tf.Session(graph=self.graph) 46 | 47 | # find memory footprint and compute cost of the model 48 | self.size = graph_size(self.graph) 49 | with self.graph.as_default(): 50 | self.sess.run(tf.global_variables_initializer()) 51 | metadata = tf.RunMetadata() 52 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 53 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 54 | 55 | def create_model(self, optimizer): 56 | features = tf.placeholder(tf.int32, [None, self.seq_len], name='features') 57 | labels = tf.placeholder(tf.int64, [None,], name='labels') 58 | 59 | embs = tf.Variable(self.emb_arr, dtype=tf.float32, trainable=False) 60 | x = tf.nn.embedding_lookup(embs, features) 61 | 62 | stacked_lstm = rnn.MultiRNNCell( 63 | [rnn.BasicLSTMCell(self.n_hidden) for _ in range(2)]) 64 | outputs, _ = tf.nn.dynamic_rnn(stacked_lstm, x, dtype=tf.float32) 65 | fc1 = tf.layers.dense(inputs=outputs[:,-1,:], units=30) 66 | pred = tf.squeeze(tf.layers.dense(inputs=fc1, units=1)) 67 | 68 | loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels, logits=pred) 69 | #optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 70 | grads_and_vars = optimizer.compute_gradients(loss) 71 | grads, _ = zip(*grads_and_vars) 72 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 73 | 74 | correct_pred = tf.equal(tf.to_int64(tf.greater(pred,0)), labels) 75 | eval_metric_ops = tf.count_nonzero(correct_pred) 76 | 77 | return features, labels, train_op, grads, eval_metric_ops, loss 78 | 79 | def set_params(self, model_params=None): 80 | if model_params is not None: 81 | with self.graph.as_default(): 82 | all_vars = tf.trainable_variables() 83 | for variable, value in 
zip(all_vars, model_params): 84 | variable.load(value, self.sess) 85 | 86 | def get_params(self): 87 | with self.graph.as_default(): 88 | model_params = self.sess.run(tf.trainable_variables()) 89 | return model_params 90 | 91 | def get_gradients(self, data, model_len): 92 | 93 | grads = np.zeros(model_len) 94 | num_samples = len(data['y']) 95 | processed_samples = 0 96 | 97 | if num_samples < 50: 98 | input_data = process_x(data['x']) 99 | target_data = process_y(data['y']) 100 | with self.graph.as_default(): 101 | model_grads = self.sess.run(self.grads, 102 | feed_dict={self.features: input_data, self.labels: target_data}) 103 | grads = process_grad(model_grads) 104 | processed_samples = num_samples 105 | 106 | else: # calculate the grads in a batch size of 50 107 | for i in range(min(int(num_samples / 50), 4)): 108 | input_data = process_x(data['x'][50*i:50*(i+1)]) 109 | target_data = process_y(data['y'][50*i:50*(i+1)]) 110 | with self.graph.as_default(): 111 | model_grads = self.sess.run(self.grads, 112 | feed_dict={self.features: input_data, self.labels: target_data}) 113 | 114 | # flat_grad = process_grad(model_grads) 115 | flat_grad = tf.nest.flatten(model_grads) 116 | # flat_grad = process_sparse_grad(model_grads) 117 | grads = np.add(grads, flat_grad) # this is the average in this batch 118 | 119 | grads = grads * 1.0 / min(int(num_samples/50), 4) 120 | processed_samples = min(int(num_samples / 50), 4) * 50 121 | 122 | return processed_samples, grads 123 | 124 | def solve_inner(self, data, num_epochs=1, batch_size=32): 125 | ''' 126 | Args: 127 | data: dict of the form {'x': [list], 'y': [list]} 128 | Return: 129 | comp: number of FLOPs computed while training given data 130 | update: list of np.ndarray weights, with each weight array 131 | corresponding to a variable in the resulting graph 132 | ''' 133 | 134 | for _ in trange(num_epochs, desc='Epoch: ', leave=False): 135 | for X,y in batch_data(data, batch_size): 136 | input_data = process_x(X, self.seq_len) 137 | target_data = process_y(y) 138 | with self.graph.as_default(): 139 | self.sess.run(self.train_op, 140 | feed_dict={self.features: input_data, self.labels: target_data}) 141 | soln = self.get_params() 142 | comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 143 | return soln, comp 144 | 145 | def solve_iters(self, data, num_iters=1, batch_size=32): 146 | '''Solves local optimization problem''' 147 | 148 | for X, y in batch_data_multiple_iters(data, batch_size, num_iters): 149 | input_data = process_x(X, self.seq_len) 150 | target_data = process_y(y) 151 | with self.graph.as_default(): 152 | self.sess.run(self.train_op, feed_dict={self.features: input_data, self.labels: target_data}) 153 | soln = self.get_params() 154 | comp = 0 155 | return soln, comp 156 | 157 | def test(self, data): 158 | ''' 159 | Args: 160 | data: dict of the form {'x': [list], 'y': [list]} 161 | ''' 162 | x_vecs = process_x(data['x'], self.seq_len) 163 | labels = process_y(data['y']) 164 | with self.graph.as_default(): 165 | tot_correct, loss = self.sess.run([self.eval_metric_ops, self.loss], 166 | feed_dict={self.features: x_vecs, self.labels: labels}) 167 | return tot_correct, loss 168 | 169 | def close(self): 170 | self.sess.close() 171 | -------------------------------------------------------------------------------- /flearn/models/shakespeare/stacked_lstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import trange 3 | import json 4 | 5 | import 
os 6 | import sys 7 | import tensorflow as tf 8 | 9 | from tensorflow.contrib import rnn 10 | 11 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 12 | utils_dir = os.path.join(utils_dir, 'utils') 13 | sys.path.append(utils_dir) 14 | 15 | from model_utils import batch_data, batch_data_multiple_iters 16 | from language_utils import letter_to_vec, word_to_indices 17 | from tf_utils import graph_size, process_grad 18 | from tf_utils import process_sparse_grad 19 | 20 | def process_x(raw_x_batch): 21 | x_batch = [word_to_indices(word) for word in raw_x_batch] 22 | x_batch = np.array(x_batch) 23 | return x_batch 24 | 25 | def process_y(raw_y_batch): 26 | y_batch = [letter_to_vec(c) for c in raw_y_batch] 27 | return y_batch 28 | 29 | class Model(object): 30 | def __init__(self, seq_len, num_classes, n_hidden, optimizer, seed): 31 | self.seq_len = seq_len 32 | self.num_classes = num_classes 33 | self.n_hidden = n_hidden 34 | 35 | self.graph = tf.Graph() 36 | with self.graph.as_default(): 37 | tf.set_random_seed(123 + seed) 38 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, self.loss = self.create_model(optimizer) 39 | self.saver = tf.train.Saver() 40 | self.sess = tf.Session(graph=self.graph) 41 | 42 | self.size = graph_size(self.graph) 43 | 44 | with self.graph.as_default(): 45 | self.sess.run(tf.global_variables_initializer()) 46 | 47 | metadata = tf.RunMetadata() 48 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 49 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 50 | 51 | def create_model(self, optimizer): 52 | features = tf.placeholder(tf.int32, [None, self.seq_len]) 53 | embedding = tf.get_variable("embedding", [self.num_classes, 8]) 54 | x = tf.nn.embedding_lookup(embedding, features) 55 | labels = tf.placeholder(tf.int32, [None, self.num_classes]) 56 | 57 | stacked_lstm = rnn.MultiRNNCell( 58 | [rnn.BasicLSTMCell(self.n_hidden) for _ in range(2)]) 59 | outputs, _ = tf.nn.dynamic_rnn(stacked_lstm, x, dtype=tf.float32) 60 | pred = tf.layers.dense(inputs=outputs[:,-1,:], units=self.num_classes) 61 | 62 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=labels)) 63 | 64 | grads_and_vars = optimizer.compute_gradients(loss) 65 | grads, _ = zip(*grads_and_vars) 66 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 67 | 68 | 69 | correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(labels, 1)) 70 | eval_metric_ops = tf.count_nonzero(correct_pred) 71 | 72 | return features, labels, train_op, grads, eval_metric_ops, loss 73 | 74 | 75 | def set_params(self, model_params=None): 76 | if model_params is not None: 77 | with self.graph.as_default(): 78 | all_vars = tf.trainable_variables() 79 | for variable, value in zip(all_vars, model_params): 80 | variable.load(value, self.sess) 81 | 82 | def get_params(self): 83 | with self.graph.as_default(): 84 | model_params = self.sess.run(tf.trainable_variables()) 85 | return model_params 86 | 87 | def get_gradients(self, data, model_len): 88 | '''in order to avoid the OOM error, we need to calculate the gradients on each 89 | client batch by batch. batch size here is set to be 100. 
90 | 91 | Return: a one-D array (after flattening all gradients) 92 | ''' 93 | grads = np.zeros(model_len) 94 | num_samples = len(data['y']) 95 | 96 | processed_samples = 0 97 | 98 | if num_samples < 50: 99 | input_data = process_x(data['x']) 100 | target_data = process_y(data['y']) 101 | with self.graph.as_default(): 102 | model_grads = self.sess.run(self.grads, 103 | feed_dict={self.features: input_data, self.labels: target_data}) 104 | grads = process_sparse_grad(model_grads) 105 | processed_samples = num_samples 106 | 107 | else: # in order to fit into memory, compute gradients in a batch of size 50, and subsample a subset of points to approximate 108 | for i in range(min(int(num_samples / 50), 4)): 109 | input_data = process_x(data['x'][50*i:50*(i+1)]) 110 | target_data = process_y(data['y'][50*i:50*(i+1)]) 111 | 112 | with self.graph.as_default(): 113 | model_grads = self.sess.run(self.grads, 114 | feed_dict={self.features: input_data, self.labels: target_data}) 115 | 116 | flat_grad = process_sparse_grad(model_grads) 117 | grads = np.add(grads, flat_grad) 118 | 119 | grads = grads * 1.0 / min(int(num_samples/50), 4) 120 | processed_samples = min(int(num_samples / 50), 4) * 50 121 | 122 | return processed_samples, grads 123 | 124 | # def get_gradients(self, data, model_len): 125 | # num_samples = len(data['y']) # Need model len 126 | # with self.graph.as_default(): 127 | # grads = self.sess.run(self.grads, 128 | # feed_dict={self.features: process_x(data['x']), self.labels: process_y(data['y'])}) 129 | # grads = process_grad(grads) 130 | 131 | # return num_samples, grads 132 | 133 | # def solve_inner(self, data, num_epochs=1, batch_size=32): 134 | # '''Solves local optimization problem''' 135 | 136 | # with self.graph.as_default(): 137 | # _, grads = self.get_gradients(data, 610) # Ignore the hardcoding, it's not used anywhere 138 | 139 | # for _ in trange(num_epochs, desc='Epoch: ', leave=False, ncols=120): 140 | # for X, y in batch_data_celeba(data, batch_size): 141 | # with self.graph.as_default(): 142 | # self.sess.run(self.train_op, 143 | # feed_dict={self.features: X, self.labels: y}) 144 | # soln = self.get_params() 145 | # comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 146 | # return soln, comp, grads 147 | 148 | def solve_inner(self, data, num_epochs=1, batch_size=32): 149 | ''' 150 | Args: 151 | data: dict of the form {'x': [list], 'y': [list]} 152 | Return: 153 | soln: trainable variables of the lstm model 154 | comp: number of FLOPs computed while training given data 155 | ''' 156 | with self.graph.as_default(): 157 | _, grads = self.get_gradients(data, 817872) # Ignore the hardcoding, it's not used anywhere 158 | 159 | for _ in trange(num_epochs, desc='Epoch: ', leave=False): 160 | for X,y in batch_data(data, batch_size): 161 | input_data = process_x(X) 162 | target_data = process_y(y) 163 | with self.graph.as_default(): 164 | self.sess.run(self.train_op, 165 | feed_dict={self.features: input_data, self.labels: target_data}) 166 | soln = self.get_params() 167 | comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 168 | return soln, comp, grads 169 | 170 | def solve_iters(self, data, num_iters=1, batch_size=32): 171 | '''Solves local optimization problem''' 172 | 173 | for X, y in batch_data_multiple_iters(data, batch_size, num_iters): 174 | input_data = process_x(X) 175 | target_data = process_y(y) 176 | with self.graph.as_default(): 177 | self.sess.run(self.train_op, feed_dict={self.features: input_data, self.labels: 
target_data}) 178 | soln = self.get_params() 179 | comp = 0 180 | return soln, comp 181 | 182 | def test(self, data): 183 | ''' 184 | Args: 185 | data: dict of the form {'x': [list], 'y': [list]} 186 | Return: 187 | tot_correct: total #samples that are predicted correctly 188 | loss: loss value on `data` 189 | ''' 190 | x_vecs = process_x(data['x']) 191 | labels = process_y(data['y']) 192 | with self.graph.as_default(): 193 | tot_correct, loss = self.sess.run([self.eval_metric_ops, self.loss], 194 | feed_dict={self.features: x_vecs, self.labels: labels}) 195 | return tot_correct, loss 196 | 197 | def close(self): 198 | self.sess.close() 199 | 200 | -------------------------------------------------------------------------------- /flearn/models/synthetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/synthetic/__init__.py -------------------------------------------------------------------------------- /flearn/models/synthetic/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/synthetic/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/models/synthetic/__pycache__/mclr.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/models/synthetic/__pycache__/mclr.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/models/synthetic/mclr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from flearn.utils.model_utils import batch_data, batch_data_multiple_iters 5 | from flearn.utils.tf_utils import graph_size 6 | from flearn.utils.tf_utils import process_grad 7 | 8 | 9 | class Model(object): 10 | ''' 11 | Assumes that images are 28px by 28px 12 | ''' 13 | 14 | def __init__(self, num_classes, optimizer, seed=1): 15 | 16 | # params 17 | self.num_classes = num_classes 18 | 19 | # create computation graph 20 | self.graph = tf.Graph() 21 | with self.graph.as_default(): 22 | tf.set_random_seed(123+seed) 23 | self.features, self.labels, self.train_op, self.grads, self.eval_metric_ops, self.loss, self.pred = self.create_model(optimizer) 24 | self.saver = tf.train.Saver() 25 | self.sess = tf.Session(graph=self.graph) 26 | 27 | # find memory footprint and compute cost of the model 28 | self.size = graph_size(self.graph) 29 | with self.graph.as_default(): 30 | self.sess.run(tf.global_variables_initializer()) 31 | metadata = tf.RunMetadata() 32 | opts = tf.profiler.ProfileOptionBuilder.float_operation() 33 | self.flops = tf.profiler.profile(self.graph, run_meta=metadata, cmd='scope', options=opts).total_float_ops 34 | 35 | def create_model(self, optimizer): 36 | """Model function for Logistic Regression.""" 37 | features = tf.placeholder(tf.float32, shape=[None, 60], name='features') 38 | labels = tf.placeholder(tf.int64, shape=[None,], name='labels') 39 | logits = tf.layers.dense(inputs=features, units=self.num_classes, kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001)) 40 | predictions = { 41 | "classes": tf.argmax(input=logits, 
axis=1), 42 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 43 | } 44 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 45 | 46 | grads_and_vars = optimizer.compute_gradients(loss) 47 | grads, _ = zip(*grads_and_vars) 48 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step()) 49 | eval_metric_ops = tf.count_nonzero(tf.equal(labels, predictions["classes"])) 50 | return features, labels, train_op, grads, eval_metric_ops, loss, predictions["classes"] 51 | 52 | def set_params(self, model_params=None): 53 | if model_params is not None: 54 | with self.graph.as_default(): 55 | all_vars = tf.trainable_variables() 56 | for variable, value in zip(all_vars, model_params): 57 | variable.load(value, self.sess) 58 | 59 | def get_params(self): 60 | with self.graph.as_default(): 61 | model_params = self.sess.run(tf.trainable_variables()) 62 | return model_params 63 | 64 | def get_gradients(self, data, model_len): 65 | 66 | grads = np.zeros(model_len) 67 | num_samples = len(data['y']) 68 | 69 | with self.graph.as_default(): 70 | model_grads = self.sess.run(self.grads, 71 | feed_dict={self.features: data['x'], self.labels: data['y']}) 72 | grads = process_grad(model_grads) 73 | 74 | return num_samples, grads 75 | 76 | def solve_inner(self, data, num_epochs=1, batch_size=32): 77 | '''Solves local optimization problem''' 78 | 79 | with self.graph.as_default(): 80 | _, grads = self.get_gradients(data, 610) # Ignore the hardcoding, it's not used anywhere 81 | 82 | for _ in range(num_epochs): 83 | for X, y in batch_data(data, batch_size): 84 | with self.graph.as_default(): 85 | self.sess.run(self.train_op, feed_dict={self.features: X, self.labels: y}) 86 | soln = self.get_params() 87 | comp = num_epochs * (len(data['y'])//batch_size) * batch_size * self.flops 88 | return soln, comp, grads 89 | 90 | def solve_iters(self, data, num_iters=1, batch_size=32): 91 | '''Solves local optimization problem''' 92 | 93 | for X, y in batch_data_multiple_iters(data, batch_size, num_iters): 94 | with self.graph.as_default(): 95 | self.sess.run(self.train_op, feed_dict={self.features: X, self.labels: y}) 96 | soln = self.get_params() 97 | comp = 0 98 | return soln, comp 99 | 100 | def test(self, data): 101 | ''' 102 | Args: 103 | data: dict of the form {'x': [list], 'y': [list]} 104 | ''' 105 | with self.graph.as_default(): 106 | tot_correct, loss, pred = self.sess.run([self.eval_metric_ops, self.loss, self.pred], 107 | feed_dict={self.features: data['x'], self.labels: data['y']}) 108 | return tot_correct, loss 109 | 110 | def close(self): 111 | self.sess.close() 112 | -------------------------------------------------------------------------------- /flearn/optimizer/__pycache__/pgd.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/optimizer/__pycache__/pgd.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/optimizer/pgd.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import control_flow_ops 2 | from tensorflow.python.ops import math_ops 3 | from tensorflow.python.ops import state_ops 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.training import optimizer 6 | import tensorflow as tf 7 | 8 | 9 | class PerturbedGradientDescent(optimizer.Optimizer): 10 | 
"""Implementation of Perturbed Gradient Descent, i.e., FedProx optimizer""" 11 | def __init__(self, learning_rate=0.001, mu=0.01, use_locking=False, name="PGD"): 12 | super(PerturbedGradientDescent, self).__init__(use_locking, name) 13 | self._lr = learning_rate 14 | self._mu = mu 15 | 16 | # Tensor versions of the constructor arguments, created in _prepare(). 17 | self._lr_t = None 18 | self._mu_t = None 19 | 20 | def _prepare(self): 21 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 22 | self._mu_t = ops.convert_to_tensor(self._mu, name="prox_mu") 23 | 24 | def _create_slots(self, var_list): 25 | # Create slots for the global solution. 26 | for v in var_list: 27 | self._zeros_slot(v, "vstar", self._name) 28 | 29 | def _apply_dense(self, grad, var): 30 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 31 | mu_t = math_ops.cast(self._mu_t, var.dtype.base_dtype) 32 | vstar = self.get_slot(var, "vstar") 33 | 34 | var_update = state_ops.assign_sub(var, lr_t*(grad + mu_t*(var-vstar))) 35 | 36 | return control_flow_ops.group(*[var_update,]) 37 | 38 | 39 | def _apply_sparse_shared(self, grad, var, indices, scatter_add): 40 | 41 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 42 | mu_t = math_ops.cast(self._mu_t, var.dtype.base_dtype) 43 | vstar = self.get_slot(var, "vstar") 44 | 45 | v_diff = state_ops.assign(vstar, mu_t * (var - vstar), use_locking=self._use_locking) 46 | 47 | with ops.control_dependencies([v_diff]): # run v_diff operation before scatter_add 48 | scaled_grad = scatter_add(vstar, indices, grad) 49 | var_update = state_ops.assign_sub(var, lr_t * scaled_grad) 50 | 51 | return control_flow_ops.group(*[var_update,]) 52 | 53 | def _apply_sparse(self, grad, var): 54 | return self._apply_sparse_shared( 55 | grad.values, var, grad.indices, 56 | lambda x, i, v: state_ops.scatter_add(x, i, v)) 57 | 58 | 59 | def set_params(self, cog, client): 60 | with client.graph.as_default(): 61 | all_vars = tf.trainable_variables() 62 | for variable, value in zip(all_vars, cog): 63 | vstar = self.get_slot(variable, "vstar") 64 | vstar.load(value, client.sess) 65 | -------------------------------------------------------------------------------- /flearn/optimizer/pggd.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import control_flow_ops 2 | from tensorflow.python.ops import math_ops 3 | from tensorflow.python.ops import state_ops 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.training import optimizer 6 | import tensorflow as tf 7 | 8 | 9 | class PerGodGradientDescent(optimizer.Optimizer): 10 | """Implementation of Perturbed gold Gradient Descent""" 11 | def __init__(self, learning_rate=0.001, mu=0.01, use_locking=False, name="PGD"): 12 | super(PerGodGradientDescent, self).__init__(use_locking, name) 13 | self._lr = learning_rate 14 | self._mu = mu 15 | 16 | # Tensor versions of the constructor arguments, created in _prepare(). 17 | self._lr_t = None 18 | self._mu_t = None 19 | 20 | def _prepare(self): 21 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 22 | self._mu_t = ops.convert_to_tensor(self._mu, name="prox_mu") 23 | 24 | def _create_slots(self, var_list): 25 | # Create slots for the global solution. 
26 | for v in var_list: 27 | self._zeros_slot(v, "vstar", self._name) 28 | self._zeros_slot(v, "gold", self._name) 29 | 30 | def _apply_dense(self, grad, var): 31 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 32 | mu_t = math_ops.cast(self._mu_t, var.dtype.base_dtype) 33 | 34 | vstar = self.get_slot(var, "vstar") 35 | gold = self.get_slot(var, "gold") 36 | 37 | var_update = state_ops.assign_sub(var, lr_t*(grad + gold + mu_t*(var-vstar))) #Update 'ref' by subtracting 'value 38 | #Create an op that groups multiple operations. 39 | #When this op finishes, all ops in input have finished 40 | return control_flow_ops.group(*[var_update,]) 41 | 42 | def _apply_sparse(self, grad, var): 43 | raise NotImplementedError("Sparse gradient updates are not supported.") 44 | 45 | def set_params(self, cog, avg_gradient, client): 46 | with client.model.graph.as_default(): 47 | all_vars = tf.trainable_variables() 48 | for variable, value in zip(all_vars, cog): 49 | vstar = self.get_slot(variable, "vstar") 50 | vstar.load(value, client.model.sess) 51 | 52 | # get old gradient 53 | gprev = client.get_grads() 54 | 55 | # Find g_t - F'(old) 56 | gdiff = [g1-g2 for g1,g2 in zip(avg_gradient, gprev)] 57 | 58 | with client.model.graph.as_default(): 59 | all_vars = tf.trainable_variables() 60 | for variable, grad in zip(all_vars, gdiff): 61 | gold = self.get_slot(variable, "gold") 62 | gold.load(grad, client.model.sess) 63 | -------------------------------------------------------------------------------- /flearn/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/trainers/__init__.py -------------------------------------------------------------------------------- /flearn/trainers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/trainers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/trainers/__pycache__/fedavg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/trainers/__pycache__/fedavg.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/trainers/__pycache__/fedbase.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/trainers/__pycache__/fedbase.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/trainers/__pycache__/fedprox.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/trainers/__pycache__/fedprox.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/trainers/fedavg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import trange, tqdm 3 | import tensorflow as tf 4 | import os 5 | os.environ['TF_CPP_MIN_LOG_LEVEL']='3' 6 | 7 | from .fedbase import BaseFedarated 8 | 
from flearn.utils.tf_utils import process_grad 9 | from sklearn.metrics import pairwise_distances 10 | 11 | 12 | class Server(BaseFedarated): 13 | def __init__(self, params, learner, dataset): 14 | print('Using Federated avg to Train') 15 | self.inner_opt = tf.train.GradientDescentOptimizer(params['learning_rate']) 16 | super(Server, self).__init__(params, learner, dataset) 17 | self.rng = np.random.default_rng() 18 | 19 | def train(self): 20 | '''Train using Federated Averaging''' 21 | print('Training with {} workers ---'.format(self.clients_per_round)) 22 | 23 | test_accuracies = [] 24 | acc_10quant = [] 25 | acc_20quant = [] 26 | test_acc_var = [] 27 | train_accuracies = [] 28 | train_losses = [] 29 | num_sampled = [] 30 | client_sets_all = np.zeros([self.num_rounds, self.clients_per_round], dtype=int) 31 | diff_grad = np.zeros([self.num_rounds, len(self.clients)]) 32 | for i in range(self.num_rounds): 33 | 34 | allcl_models = [] 35 | for cc in self.clients: 36 | clmodel = cc.get_params() 37 | allcl_models.append(clmodel) 38 | # test model 39 | if i % self.eval_every == 0: 40 | stats = self.test() # have set the latest model for all clients 41 | stats_train = self.train_error_and_loss() 42 | 43 | test_accuracies.append(np.sum(stats[3]) * 1.0 / np.sum(stats[2])) 44 | acc_10quant.append(np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.1)) 45 | acc_20quant.append(np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.2)) 46 | test_acc_var.append(np.var([i/j for i,j in zip(stats[3], stats[2])])) 47 | train_accuracies.append(np.sum(stats_train[3]) * 1.0 / np.sum(stats_train[2])) 48 | train_losses.append(np.dot(stats_train[4], stats_train[2]) * 1.0 / np.sum(stats_train[2])) 49 | 50 | tqdm.write('At round {} per-client-accuracy: {}'.format(i, [i/j for i,j in zip(stats[3], stats[2])])) 51 | tqdm.write('At round {} accuracy: {}'.format(i, np.sum(stats[3]) * 1.0 / np.sum(stats[2]))) # testing accuracy 52 | tqdm.write('At round {} acc. 10th: {}'.format(i, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.1))) # testing accuracy, 10th percentile 53 | tqdm.write('At round {} acc. 20th: {}'.format(i, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.2))) # testing accuracy, 20th percentile 54 | tqdm.write('At round {} acc. 
variance: {}'.format(i, np.var([i/j for i,j in zip(stats[3], stats[2])]))) # testing accuracy variance 55 | tqdm.write('At round {} training accuracy: {}'.format(i, np.sum(stats_train[3]) * 1.0 / np.sum(stats_train[2]))) 56 | tqdm.write('At round {} training loss: {}'.format(i, np.dot(stats_train[4], stats_train[2]) * 1.0 / np.sum(stats_train[2]))) 57 | 58 | if self.clientsel_algo == 'submodular': 59 | #if i % self.m_interval == 0: # Moved the condition inside the function 60 | if i == 0 or self.clients_per_round == 1: # at the first iteration or when m=1, collect gradients from all clients 61 | self.all_grads = np.asarray(self.show_grads()[:-1]) 62 | self.norm_diff = pairwise_distances(self.all_grads, metric="euclidean") 63 | np.fill_diagonal(self.norm_diff, 0) 64 | indices, selected_clients, all_grad = self.select_cl_submod(i, num_clients=self.clients_per_round, stochastic_greedy = False) 65 | active_clients = selected_clients # Dropping clients don't apply in this case 66 | if i == 0: 67 | diff_grad[i] = np.zeros(len(all_grad)) 68 | else: 69 | diff_grad[i] = np.linalg.norm(all_grad - old_grad, axis=1) 70 | old_grad = all_grad.copy() 71 | elif self.clientsel_algo == 'lossbased': 72 | print('Power of choice') 73 | 74 | if i % self.m_interval == 0: 75 | lprob = stats_train[2]/np.sum(stats_train[2], axis=0) 76 | #d=100 77 | subsample = 0.1 78 | #d = max(self.clients_per_round, int(subsample * len(self.clients))) 79 | d = len(self.clients) 80 | lvals = self.rng.choice(stats_train[4], size=d, replace = False, p=lprob) 81 | Mlist = [np.where(stats_train[4] == i)[0][0] for i in lvals] 82 | lossvals = np.asarray(stats_train[4]) #All loss values 83 | sel_losses = lossvals[Mlist] 84 | idx_losses = np.argpartition(sel_losses, self.clients_per_round) 85 | values = sel_losses[idx_losses[-self.clients_per_round:]] 86 | 87 | listvalues = values.tolist() 88 | listlossvals = lossvals.tolist() 89 | indices = [listlossvals.index(i) for i in listvalues] 90 | 91 | #indices = np.argsort(stats_train[4], axis=0)[::-1][:self.clients_per_round] 92 | selected_clients = np.asarray(self.clients)[indices] 93 | np.random.seed(i) 94 | active_clients = np.random.choice(selected_clients, round(self.clients_per_round * (1-self.drop_percent)), replace=False) 95 | else: 96 | indices, selected_clients = self.select_clients(i, num_clients=self.clients_per_round) # uniform sampling 97 | np.random.seed(i) 98 | active_clients = np.random.choice(selected_clients, round(self.clients_per_round * (1-self.drop_percent)), replace=False) 99 | 100 | 101 | print('Client set is ', indices) 102 | client_sets_all[i] = indices 103 | tqdm.write('At round {} num. 
clients sampled: {}'.format(i, len(indices))) 104 | num_sampled.append(len(indices)) 105 | csolns = [] # buffer for receiving client solutions 106 | 107 | glob_copy = np.append(self.latest_model[0].flatten(), self.latest_model[1]) 108 | 109 | for idx, c in enumerate(active_clients.tolist()): # simply drop the slow devices 110 | # communicate the latest model 111 | c.set_params(self.latest_model) 112 | 113 | # solve minimization locally 114 | soln, stats, grads = c.solve_inner(num_epochs=self.num_epochs, batch_size=self.batch_size) 115 | #print("Shape of grads", np.shape(grads)) 116 | 117 | # gather solutions from client 118 | csolns.append(soln) 119 | 120 | if self.clientsel_algo == 'submodular': 121 | self.all_grads[indices[idx]] = grads 122 | 123 | # Update server's view of clients' models (only for the selected clients) 124 | #c.updatevec = (glob_copy - np.append(c.get_params()[0].flatten(), c.get_params()[1]))*0.01 125 | c.updatevec = np.append(c.get_params()[0].flatten(), c.get_params()[1]) 126 | 127 | # track communication cost 128 | self.metrics.update(rnd=i, cid=c.id, stats=stats) 129 | 130 | # update models 131 | if self.clientsel_algo == 'submodular': 132 | self.norm_diff[indices] = pairwise_distances(self.all_grads[indices], self.all_grads, metric="euclidean") 133 | self.norm_diff[:, indices] = self.norm_diff[indices].T 134 | self.latest_model = self.aggregate(csolns) 135 | #self.latest_model = self.aggregate_submod(csolns, gammas) 136 | elif self.clientsel_algo == 'lossbased': 137 | self.latest_model = self.aggregate_simple(csolns) 138 | else: 139 | self.latest_model = self.aggregate(csolns) 140 | 141 | 142 | # final test model 143 | stats = self.test() 144 | stats_train = self.train_error_and_loss() 145 | self.metrics.accuracies.append(stats) 146 | self.metrics.train_accuracies.append(stats_train) 147 | tqdm.write('At round {} per-client-accuracy: {}'.format(i, [i/j for i,j in zip(stats[3], stats[2])])) 148 | tqdm.write('At round {} accuracy: {}'.format(self.num_rounds, np.sum(stats[3]) * 1.0 / np.sum(stats[2]))) 149 | tqdm.write('At round {} acc. variance: {}'.format(self.num_rounds, np.var([i/j for i,j in zip(stats[3], stats[2])]))) 150 | tqdm.write('At round {} acc. 10th: {}'.format(self.num_rounds, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.1))) 151 | tqdm.write('At round {} acc. 
20th: {}'.format(self.num_rounds, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.2))) 152 | tqdm.write('At round {} training accuracy: {}'.format(self.num_rounds, np.sum(stats_train[3]) * 1.0 / np.sum(stats_train[2]))) 153 | 154 | if self.clientsel_algo == 'submodular': 155 | np.save('./results/sent140/psubmod_select_client_sets_all_%s_epoch%d_numclient%d_m%d.npy' % (self.clientsel_algo, self.num_epochs, self.clients_per_round, self.m_interval), client_sets_all) 156 | np.save('./results/sent140/psubmod_client_diff_grad_all_%s_epoch%d_numclient%d_m%d.npy' % (self.clientsel_algo, self.num_epochs, self.clients_per_round, self.m_interval), diff_grad) 157 | elif self.clientsel_algo == 'lossbased': 158 | np.save('./results/sent140/powerofchoice_select_client_sets_all_%s_epoch%d_numclient%d_m%d.npy' % (self.clientsel_algo, self.num_epochs, self.clients_per_round, self.m_interval), client_sets_all) 159 | 160 | print('Number of samples', stats_train[2]) 161 | 162 | # save_dir = "./results/" 163 | # result_path = os.path.join(save_dir,'submodular.csv') 164 | # print('Writing Statistics to file') 165 | # with open(result_path, 'wb') as f: 166 | # np.savetxt(f, np.c_[test_accuracies, train_accuracies, train_losses, num_sampled], delimiter=",") -------------------------------------------------------------------------------- /flearn/trainers/fedbase.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tqdm import tqdm 4 | 5 | from flearn.models.client import Client 6 | from flearn.utils.model_utils import Metrics 7 | from flearn.utils.tf_utils import process_grad 8 | 9 | from sklearn.metrics import pairwise_distances 10 | 11 | class BaseFedarated(object): 12 | def __init__(self, params, learner, dataset): 13 | # transfer parameters to self 14 | for key, val in params.items(): setattr(self, key, val); 15 | 16 | # create worker nodes 17 | tf.reset_default_graph() 18 | self.client_model = learner(*params['model_params'], self.inner_opt, self.seed) 19 | self.clients = self.setup_clients(dataset, self.client_model) 20 | print('{} Clients in Total'.format(len(self.clients))) 21 | self.latest_model = self.client_model.get_params() 22 | 23 | # initialize system metrics 24 | self.metrics = Metrics(self.clients, params) 25 | 26 | self.norm_diff = np.zeros((len(self.clients), len(self.clients))) 27 | self.norm_diff2 = np.zeros((len(self.clients), len(self.clients))) 28 | 29 | def __del__(self): 30 | self.client_model.close() 31 | 32 | def setup_clients(self, dataset, model=None): 33 | '''instantiates clients based on given train and test data directories 34 | 35 | Return: 36 | list of Clients 37 | ''' 38 | users, groups, train_data, test_data = dataset 39 | if len(groups) == 0: 40 | groups = [None for _ in users] 41 | all_clients = [Client(u, g, train_data[u], test_data[u], model) for u, g in zip(users, groups)] 42 | return all_clients 43 | 44 | 45 | def train_error_and_loss(self): 46 | num_samples = [] 47 | tot_correct = [] 48 | losses = [] 49 | 50 | for c in self.clients: 51 | ct, cl, ns = c.train_error_and_loss() 52 | tot_correct.append(ct*1.0) 53 | num_samples.append(ns) 54 | losses.append(cl*1.0) 55 | 56 | ids = [c.id for c in self.clients] 57 | groups = [c.group for c in self.clients] 58 | 59 | return ids, groups, num_samples, tot_correct, losses 60 | 61 | 62 | def show_grads(self): 63 | ''' 64 | Return: 65 | gradients on all workers and the global gradient 66 | ''' 67 | 68 | model_len = 
process_grad(self.latest_model).size 69 | global_grads = np.zeros(model_len) 70 | 71 | cc = 0 72 | samples=[] 73 | 74 | self.client_model.set_params(self.latest_model) 75 | for c in self.clients: 76 | num_samples, client_grads = c.get_grads(model_len) 77 | #num_samples, client_grads = c.get_grads(self.latest_model) 78 | samples.append(num_samples) 79 | # serial_cl_grads = process_grad(client_grads) 80 | if cc == 0: 81 | intermediate_grads = np.zeros([len(self.clients) + 1, len(client_grads)]) 82 | # print(client_grads) 83 | 84 | # serial_cl_grads = client_grads 85 | global_grads = np.add(global_grads, client_grads * num_samples) 86 | intermediate_grads[cc] = client_grads 87 | # print('serial_cl_grads shape', serial_cl_grads.shape) 88 | cc += 1 89 | 90 | global_grads = global_grads * 1.0 / np.sum(np.asarray(samples)) 91 | intermediate_grads[-1] = global_grads 92 | 93 | return intermediate_grads 94 | 95 | 96 | def test(self): 97 | '''tests self.latest_model on given clients 98 | ''' 99 | num_samples = [] 100 | tot_correct = [] 101 | self.client_model.set_params(self.latest_model) 102 | for c in self.clients: 103 | ct, ns = c.test() 104 | tot_correct.append(ct*1.0) 105 | num_samples.append(ns) 106 | ids = [c.id for c in self.clients] 107 | groups = [c.group for c in self.clients] 108 | return ids, groups, num_samples, tot_correct 109 | 110 | def save(self): 111 | pass 112 | 113 | def select_cl_submod(self, round, num_clients=20, stochastic_greedy = False): 114 | # Get per-client gradients as well as global gradient 115 | #all_grads = self.show_grads() 116 | 117 | # if round == 0 or self.m_interval == 1: # at the first iteration or when m=1, collect gradients from all clients 118 | # self.all_grads = np.asarray(self.show_grads()[:-1]) 119 | 120 | #print("New shape", np.shape(self.all_grads)) 121 | # for i in range(len(self.clients)): 122 | # if all_grads[i].shape[0] > 8: 123 | # print('all_grads[i]', all_grads[i].shape) 124 | 125 | # cc = 0 126 | # # Get per-client model parameters (most recent available at client) 127 | # for c in self.clients: 128 | # #client_params = c.get_params() 129 | # #cl_params = np.append(client_params[0].flatten(), client_params[1]) 130 | # # cl_params = c.updatevec 131 | # if cc == 0: 132 | # all_cl_params = np.zeros([len(self.clients), len(c.updatevec)]) 133 | # all_cl_params[cc] = c.updatevec 134 | # cc += 1 135 | 136 | # Calculate gradient normed difference 137 | #m_interval = 1 # parameter m in the paper; (integer >=1) 138 | # if round % self.m_interval == 0: 139 | # for i in range(len(self.clients)): 140 | # for j in range(len(self.clients)): 141 | # # print(all_grads[i], all_grads[j]) 142 | # self.norm_diff[i,j] = np.linalg.norm(all_grads[i]-all_grads[j]) 143 | # self.norm_diff2[i,j] = np.linalg.norm(all_cl_params[i]-all_cl_params[j]) 144 | 145 | # if round <= 1 or self.sim_metric == "grad": 146 | #if round % self.m_interval == 0: # Update similarity matrix only after m_interval epochs 147 | #self.norm_diff = pairwise_distances(all_grads[:-1], metric="euclidean") 148 | # self.norm_diff = pairwise_distances(self.all_grads, metric="euclidean") 149 | # np.fill_diagonal(self.norm_diff, self.norm_diff.max()) 150 | # np.fill_diagonal(self.norm_diff, self.norm_diff.min(1) * 0.5) 151 | # np.fill_diagonal(self.norm_diff, 0) 152 | # else: 153 | # self.norm_diff = pairwise_distances(all_cl_params, metric="euclidean") 154 | # np.fill_diagonal(self.norm_diff, 0.1) 155 | 156 | if stochastic_greedy: 157 | SUi = self.stochastic_greedy(num_clients) 158 | else: 159 | SUi = 
self.lazy_greedy(num_clients) 160 | # print('Set Diff:', SUi0.difference(SUi), SUi.difference(SUi0)) 161 | 162 | indices = np.array(list(SUi)) 163 | selected_clients = np.asarray(self.clients)[indices] 164 | 165 | # return indices, selected_clients, gamma[indices] 166 | return indices, selected_clients, self.all_grads 167 | 168 | def stochastic_greedy(self, num_clients, subsample=0.1): 169 | # initialize the ground set and the selected set 170 | V_set = set(range(len(self.clients))) 171 | SUi = set() 172 | 173 | m = max(num_clients, int(subsample * len(self.clients))) 174 | for ni in range(num_clients): 175 | if m < len(V_set): 176 | R_set = np.random.choice(list(V_set), m, replace=False) 177 | else: 178 | R_set = list(V_set) 179 | if ni == 0: 180 | marg_util = self.norm_diff[:, R_set].sum(0) 181 | i = marg_util.argmin() 182 | client_min = self.norm_diff[:, R_set[i]] 183 | else: 184 | client_min_R = np.minimum(client_min[:,None], self.norm_diff[:,R_set]) 185 | marg_util = client_min_R.sum(0) 186 | i = marg_util.argmin() 187 | client_min = client_min_R[:, i] 188 | SUi.add(R_set[i]) 189 | V_set.remove(R_set[i]) 190 | return SUi 191 | 192 | def greedy(self, num_clients): 193 | # initialize the ground set and the selected set 194 | V_set = set(range(len(self.clients))) 195 | SUi = set() 196 | for ni in range(num_clients): 197 | R_set = list(V_set) 198 | if ni == 0: 199 | marg_util = self.norm_diff[:, R_set].sum(0) 200 | i = marg_util.argmin() 201 | client_min = self.norm_diff[:, R_set[i]] 202 | else: 203 | client_min_R = np.minimum(client_min[:,None], self.norm_diff[:,R_set]) 204 | marg_util = client_min_R.sum(0) 205 | i = marg_util.argmin() 206 | client_min = client_min_R[:, i] 207 | # print(R_set[i], marg_util[i]) 208 | SUi.add(R_set[i]) 209 | V_set.remove(R_set[i]) 210 | return SUi 211 | 212 | def lazy_greedy(self, num_clients): 213 | # initialize the ground set and the selected set 214 | V_set = set(range(len(self.clients))) 215 | SUi = set() 216 | 217 | S_util = 0 218 | marg_util = self.norm_diff.sum(0) 219 | i = marg_util.argmin() 220 | L_s0 = 2. * marg_util.max() 221 | marg_util = L_s0 - marg_util 222 | client_min = self.norm_diff[:,i] 223 | # print(i) 224 | SUi.add(i) 225 | V_set.remove(i) 226 | S_util = marg_util[i] 227 | marg_util[i] = -1. 228 | 229 | while len(SUi) < num_clients: 230 | argsort_V = np.argsort(marg_util)[len(SUi):] 231 | for ni in range(len(argsort_V)): 232 | i = argsort_V[-ni-1] 233 | SUi.add(i) 234 | client_min_i = np.minimum(client_min, self.norm_diff[:,i]) 235 | SUi_util = L_s0 - client_min_i.sum() 236 | 237 | marg_util[i] = SUi_util - S_util 238 | if ni > 0: 239 | if marg_util[i] < marg_util[pre_i]: 240 | if ni == len(argsort_V) - 1 or marg_util[pre_i] >= marg_util[argsort_V[-ni-2]]: 241 | S_util += marg_util[pre_i] 242 | # print(pre_i, L_s0 - S_util) 243 | SUi.remove(i) 244 | SUi.add(pre_i) 245 | V_set.remove(pre_i) 246 | marg_util[pre_i] = -1. 247 | client_min = client_min_pre_i.copy() 248 | break 249 | else: 250 | SUi.remove(i) 251 | else: 252 | if ni == len(argsort_V) - 1 or marg_util[i] >= marg_util[argsort_V[-ni-2]]: 253 | S_util = SUi_util 254 | # print(i, L_s0 - S_util) 255 | V_set.remove(i) 256 | marg_util[i] = -1. 257 | client_min = client_min_i.copy() 258 | break 259 | else: 260 | pre_i = i 261 | SUi.remove(i) 262 | client_min_pre_i = client_min_i.copy() 263 | else: 264 | if marg_util[i] >= marg_util[argsort_V[-ni-2]]: 265 | S_util = SUi_util 266 | # print(i, L_s0 - S_util) 267 | V_set.remove(i) 268 | marg_util[i] = -1. 
269 | client_min = client_min_i.copy() 270 | break 271 | else: 272 | pre_i = i 273 | SUi.remove(i) 274 | client_min_pre_i = client_min_i.copy() 275 | return SUi 276 | 277 | def select_clients(self, round, num_clients=20): 278 | '''selects num_clients clients weighted by number of samples from possible_clients 279 | 280 | Args: 281 | num_clients: number of clients to select; default 20 282 | note that within function, num_clients is set to 283 | min(num_clients, len(possible_clients)) 284 | 285 | Return: 286 | list of selected clients objects 287 | ''' 288 | 289 | num_clients = min(num_clients, len(self.clients)) 290 | np.random.seed(round) # make sure for each comparison, we are selecting the same clients each round 291 | indices = np.random.choice(range(len(self.clients)), num_clients, replace=False) 292 | return indices, np.asarray(self.clients)[indices] 293 | 294 | def aggregate(self, wsolns): 295 | total_weight = 0.0 296 | base = [0]*len(wsolns[0][1]) 297 | 298 | for (w, soln) in wsolns: # w is the number of local samples 299 | total_weight += w 300 | for i, v in enumerate(soln): 301 | base[i] += w*v.astype(np.float64) 302 | 303 | averaged_soln = [v / total_weight for v in base] 304 | 305 | return averaged_soln 306 | 307 | def aggregate_simple(self, wsolns): 308 | total_weight = 0.0 309 | base = [0]*len(wsolns[0][1]) 310 | 311 | for (w, soln) in wsolns: # w is the number of local samples 312 | total_weight += 1 313 | for i, v in enumerate(soln): 314 | base[i] += v.astype(np.float64) 315 | 316 | averaged_soln = [v / total_weight for v in base] 317 | 318 | return averaged_soln 319 | 320 | def aggregate_submod(self, wsolns, gammas): 321 | total_weight = 0.0 322 | total_gamma = 0.0 323 | base = [0]*len(wsolns[0][1]) 324 | 325 | gammas = list(gammas) 326 | for (wsols, gamma) in zip(wsolns, gammas): 327 | total_weight += wsols[0] 328 | for i, v in enumerate(wsols[1]): 329 | base[i] += gamma*wsols[0]*v.astype(np.float64) 330 | total_gamma +=gamma 331 | 332 | averaged_soln = [v / (total_weight*total_gamma) for v in base] 333 | 334 | return averaged_soln 335 | 336 | -------------------------------------------------------------------------------- /flearn/trainers/feddane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import trange, tqdm 3 | import tensorflow as tf 4 | 5 | from .fedbase import BaseFedarated 6 | from flearn.optimizer.pggd import PerGodGradientDescent 7 | from flearn.utils.tf_utils import process_grad, process_sparse_grad 8 | 9 | 10 | class Server(BaseFedarated): 11 | def __init__(self, params, learner, dataset): 12 | print('Using Federated Dane to Train') 13 | self.inner_opt = PerGodGradientDescent(params['learning_rate'], params['mu']) 14 | super(Server, self).__init__(params, learner, dataset) 15 | 16 | def train(self): 17 | '''Train using Federated Proximal''' 18 | print('Training with {} workers ---'.format(self.clients_per_round)) 19 | for i in trange(self.num_rounds, desc='Round: ', ncols=120): 20 | # test model 21 | if i % self.eval_every == 0: 22 | stats = self.test() # have set the latest model for all clients 23 | stats_train = self.train_error_and_loss() 24 | 25 | tqdm.write('At round {} accuracy: {}'.format(i, np.sum(stats[3])*1.0/np.sum(stats[2]))) # testing accuracy 26 | tqdm.write('At round {} training accuracy: {}'.format(i, np.sum(stats_train[3])*1.0/np.sum(stats_train[2]))) 27 | tqdm.write('At round {} training loss: {}'.format(i, np.dot(stats_train[4], 
stats_train[2])*1.0/np.sum(stats_train[2]))) 28 | 29 | # choose K clients prop to data size 30 | selected_clients = self.select_clients(i, num_clients=self.clients_per_round) 31 | 32 | cgrads = [] # buffer for receiving client solutions 33 | for c in tqdm(selected_clients, desc='Grads: ', leave=False, ncols=120): 34 | # communicate the latest model 35 | c.set_params(self.latest_model) 36 | 37 | # get the gradients 38 | grad, stats = c.solve_grad() 39 | 40 | # gather gradient from client 41 | cgrads.append(grad) 42 | 43 | # Total gradient 44 | avg_gradient = self.aggregate(cgrads) 45 | 46 | # Choose K clients prop to data size 47 | selected_clients = self.select_clients(i, num_clients=self.clients_per_round) 48 | 49 | csolns = [] # buffer for receiving client solutions 50 | for c in tqdm(selected_clients, desc='Solve: ', leave=False, ncols=120): 51 | # communicate the latest model 52 | c.set_params(self.latest_model) # w_{t-1} 53 | 54 | # setup local optimizer 55 | self.inner_opt.set_params(self.latest_model, avg_gradient, c) 56 | 57 | # solve minimization locally 58 | soln, stats = c.solve_inner(num_epochs=self.num_epochs, batch_size=self.batch_size) 59 | 60 | # gather solutions from client 61 | csolns.append(soln) 62 | 63 | # update model 64 | self.latest_model = self.aggregate(csolns) 65 | 66 | # final test model 67 | stats = self.test() 68 | stats_train = self.train_error_and_loss() 69 | tqdm.write('At round {} accuracy: {}'.format(self.num_rounds, np.sum(stats[3])*1.0/np.sum(stats[2]))) 70 | tqdm.write('At round {} training accuracy: {}'.format(self.num_rounds, np.sum(stats_train[3])*1.0/np.sum(stats_train[2]))) 71 | -------------------------------------------------------------------------------- /flearn/trainers/fedprox.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import trange, tqdm 3 | import tensorflow as tf 4 | 5 | from .fedbase import BaseFedarated 6 | from flearn.optimizer.pgd import PerturbedGradientDescent 7 | from flearn.utils.tf_utils import process_grad, process_sparse_grad 8 | 9 | 10 | class Server(BaseFedarated): 11 | def __init__(self, params, learner, dataset): 12 | print('Using Federated prox to Train') 13 | self.inner_opt = PerturbedGradientDescent(params['learning_rate'], params['mu']) 14 | super(Server, self).__init__(params, learner, dataset) 15 | 16 | def train(self): 17 | '''Train using Federated Proximal''' 18 | print('Training with {} workers ---'.format(self.clients_per_round)) 19 | 20 | for i in range(self.num_rounds): 21 | # test model 22 | if i % self.eval_every == 0: 23 | stats = self.test() # have set the latest model for all clients 24 | stats_train = self.train_error_and_loss() 25 | 26 | tqdm.write('At round {} per-client-accuracy: {}'.format(i, [i/j for i,j in zip(stats[3], stats[2])])) 27 | tqdm.write('At round {} accuracy: {}'.format(i, np.sum(stats[3])*1.0/np.sum(stats[2]))) # testing accuracy 28 | tqdm.write('At round {} acc. variance: {}'.format(i, np.var([i/j for i,j in zip(stats[3], stats[2])]))) # testing accuracy variance 29 | tqdm.write('At round {} acc. 10th: {}'.format(i, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.1))) # testing accuracy variance 30 | tqdm.write('At round {} acc. 
20th: {}'.format(i, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.2))) # testing accuracy variance 31 | tqdm.write('At round {} training accuracy: {}'.format(i, np.sum(stats_train[3])*1.0/np.sum(stats_train[2]))) 32 | tqdm.write('At round {} training loss: {}'.format(i, np.dot(stats_train[4], stats_train[2])*1.0/np.sum(stats_train[2]))) 33 | 34 | model_len = process_grad(self.latest_model).size 35 | global_grads = np.zeros(model_len) 36 | client_grads = np.zeros(model_len) 37 | num_samples = [] 38 | local_grads = [] 39 | 40 | for c in self.clients: 41 | client_grad = c.get_grads(model_len) # get client_grad and operate on it 42 | #local_grads.append(client_grad) 43 | #num_samples.append(num) 44 | #global_grads = np.add(global_grads, client_grad * num) 45 | #global_grads = global_grads * 1.0 / np.sum(np.asarray(num_samples)) 46 | 47 | #difference = 0 48 | #for idx in range(len(self.clients)): 49 | # difference += np.sum(np.square(global_grads - local_grads[idx])) 50 | #difference = difference * 1.0 / len(self.clients) 51 | #tqdm.write('gradient difference: {}'.format(difference)) 52 | 53 | indices, selected_clients = self.select_clients(i, num_clients=self.clients_per_round) # uniform sampling 54 | np.random.seed(i) # make sure that the stragglers are the same for FedProx and FedAvg 55 | active_clients = np.random.choice(selected_clients, round(self.clients_per_round * (1 - self.drop_percent)), replace=False) 56 | 57 | csolns = [] # buffer for receiving client solutions 58 | 59 | self.inner_opt.set_params(self.latest_model, self.client_model) 60 | 61 | for idx, c in enumerate(selected_clients.tolist()): 62 | # communicate the latest model 63 | c.set_params(self.latest_model) 64 | 65 | total_iters = int(self.num_epochs * c.num_samples / self.batch_size)+2 # randint(low,high)=[low,high) 66 | 67 | # solve minimization locally 68 | if c in active_clients: 69 | soln, stats, grads = c.solve_inner(num_epochs=self.num_epochs, batch_size=self.batch_size) 70 | else: 71 | #soln, stats = c.solve_iters(num_iters=np.random.randint(low=1, high=total_iters), batch_size=self.batch_size) 72 | soln, stats, grads = c.solve_inner(num_epochs=np.random.randint(low=1, high=self.num_epochs), batch_size=self.batch_size) 73 | 74 | # gather solutions from client 75 | csolns.append(soln) 76 | 77 | # track communication cost 78 | self.metrics.update(rnd=i, cid=c.id, stats=stats) 79 | 80 | # update models 81 | self.latest_model = self.aggregate(csolns) 82 | self.client_model.set_params(self.latest_model) 83 | 84 | # final test model 85 | stats = self.test() 86 | stats_train = self.train_error_and_loss() 87 | self.metrics.accuracies.append(stats) 88 | self.metrics.train_accuracies.append(stats_train) 89 | tqdm.write('At round {} per-client-accuracy: {}'.format(i, [i/j for i,j in zip(stats[3], stats[2])])) 90 | tqdm.write('At round {} accuracy: {}'.format(self.num_rounds, np.sum(stats[3])*1.0/np.sum(stats[2]))) 91 | tqdm.write('At round {} acc. variance: {}'.format(self.num_rounds, np.var([i/j for i,j in zip(stats[3], stats[2])]))) 92 | tqdm.write('At round {} acc. 10th: {}'.format(self.num_rounds, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.1))) 93 | tqdm.write('At round {} acc. 
20th: {}'.format(self.num_rounds, np.quantile([i/j for i,j in zip(stats[3], stats[2])], 0.2))) 94 | tqdm.write('At round {} training accuracy: {}'.format(self.num_rounds, np.sum(stats_train[3])*1.0/np.sum(stats_train[2]))) 95 | -------------------------------------------------------------------------------- /flearn/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/utils/__init__.py -------------------------------------------------------------------------------- /flearn/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /flearn/utils/__pycache__/model_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/utils/__pycache__/model_utils.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/utils/__pycache__/model_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/utils/__pycache__/model_utils.cpython-38.pyc -------------------------------------------------------------------------------- /flearn/utils/__pycache__/tf_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/flearn/utils/__pycache__/tf_utils.cpython-36.pyc -------------------------------------------------------------------------------- /flearn/utils/language_utils.py: -------------------------------------------------------------------------------- 1 | """Utils for language models.""" 2 | 3 | import re 4 | 5 | 6 | # ------------------------ 7 | # utils for shakespeare dataset 8 | 9 | ALL_LETTERS = "\n !\"&'(),-.0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz}" 10 | NUM_LETTERS = len(ALL_LETTERS) 11 | 12 | 13 | def _one_hot(index, size): 14 | '''returns one-hot vector with given size and value 1 at given index 15 | ''' 16 | vec = [0 for _ in range(size)] 17 | vec[int(index)] = 1 18 | return vec 19 | 20 | 21 | def letter_to_vec(letter): 22 | '''returns one-hot representation of given letter 23 | ''' 24 | index = ALL_LETTERS.find(letter) 25 | return _one_hot(index, NUM_LETTERS) 26 | 27 | 28 | def word_to_indices(word): 29 | '''returns a list of character indices 30 | 31 | Args: 32 | word: string 33 | 34 | Return: 35 | indices: int list with length len(word) 36 | ''' 37 | indices = [] 38 | for c in word: 39 | indices.append(ALL_LETTERS.find(c)) 40 | return indices 41 | 42 | 43 | # ------------------------ 44 | # utils for sent140 dataset 45 | 46 | 47 | 
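# Overview of this section: split_line tokenizes a raw tweet, line_to_indices maps its first max_words tokens to vocabulary indices (padding with the unknown index len(word2id)), and bag_of_words builds a simple count vector instead.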
def split_line(line): 48 | '''split given line/phrase into list of words 49 | 50 | Args: 51 | line: string representing phrase to be split 52 | 53 | Return: 54 | list of strings, with each string representing a word 55 | ''' 56 | return re.findall(r"[\w']+|[.,!?;]", line) 57 | 58 | 59 | def _word_to_index(word, indd): 60 | '''returns index of given word based on given lookup dictionary 61 | 62 | returns the length of the lookup dictionary if word not found 63 | 64 | Args: 65 | word: string 66 | indd: dictionary with string words as keys and int indices as values 67 | ''' 68 | if word in indd: 69 | return indd[word] 70 | else: 71 | return len(indd) 72 | 73 | 74 | def line_to_indices(line, word2id, max_words=25): 75 | '''converts given phrase into list of word indices 76 | 77 | if the phrase has more than max_words words, returns a list containing 78 | indices of the first max_words words 79 | if the phrase has less than max_words words, repeatedly appends integer 80 | representing unknown index to returned list until the list's length is 81 | max_words 82 | 83 | Args: 84 | line: string representing phrase/sequence of words 85 | word2id: dictionary with string words as keys and int indices as values 86 | max_words: maximum number of word indices in returned list 87 | 88 | Return: 89 | indl: list of word indices, one index for each word in phrase 90 | ''' 91 | unk_id = len(word2id) 92 | line_list = split_line(line) # split phrase in words 93 | indl = [word2id[w] if w in word2id else unk_id for w in line_list[:max_words]] 94 | indl += [unk_id]*(max_words-len(indl)) 95 | return indl 96 | 97 | 98 | def bag_of_words(line, vocab): 99 | '''returns bag of words representation of given phrase using given vocab 100 | 101 | Args: 102 | line: string representing phrase to be parsed 103 | vocab: dictionary with words as keys and indices as values 104 | 105 | Return: 106 | integer list 107 | ''' 108 | bag = [0]*len(vocab) 109 | words = split_line(line) 110 | for w in words: 111 | if w in vocab: 112 | bag[vocab[w]] += 1 113 | return bag 114 | -------------------------------------------------------------------------------- /flearn/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | 5 | # import Image 6 | from PIL import Image 7 | 8 | def batch_data(data, batch_size): 9 | ''' 10 | data is a dict := {'x': [numpy array], 'y': [numpy array]} (on one client) 11 | returns x, y, which are both numpy array of length: batch_size 12 | ''' 13 | data_x = data['x'] 14 | data_y = data['y'] 15 | 16 | # randomly shuffle data 17 | np.random.seed(100) 18 | rng_state = np.random.get_state() 19 | np.random.shuffle(data_x) 20 | np.random.set_state(rng_state) 21 | np.random.shuffle(data_y) 22 | 23 | # loop through mini-batches 24 | for i in range(0, len(data_x), batch_size): 25 | batched_x = data_x[i:i+batch_size] 26 | batched_y = data_y[i:i+batch_size] 27 | yield (batched_x, batched_y) 28 | 29 | 30 | def process_x(raw_x_batch): 31 | x_batch = [load_image(i) for i in raw_x_batch] 32 | x_batch = np.array(x_batch) 33 | return x_batch 34 | 35 | def process_y(raw_y_batch): 36 | return raw_y_batch 37 | 38 | def load_image(img_name): 39 | IMAGE_SIZE = 84 40 | 41 | IMAGES_DIR = os.path.join('data', 'celeba', 'data', 'raw', 'img_align_celeba') 42 | img = Image.open(os.path.join(IMAGES_DIR, img_name)) 43 | img = img.resize((IMAGE_SIZE, IMAGE_SIZE)).convert('RGB') 44 | return np.array(img) 45 | 46 | def batch_data_celeba(data, 
batch_size): 47 | 48 | data_x_name = data['x'] 49 | data_y_name = data['y'] 50 | 51 | raw_x = np.asarray(process_x(data_x_name)) 52 | raw_y = np.asarray(process_y(data_y_name)) 53 | 54 | # randomly shuffle data 55 | np.random.seed(100) 56 | rng_state = np.random.get_state() 57 | np.random.shuffle(raw_x) 58 | np.random.set_state(rng_state) 59 | np.random.shuffle(raw_y) 60 | 61 | # loop through mini-batches 62 | for i in range(0, len(raw_y), batch_size): 63 | batched_x = raw_x[i:i + batch_size] 64 | batched_y = raw_y[i:i + batch_size] 65 | yield (batched_x, batched_y) 66 | 67 | 68 | def batch_data_multiple_iters(data, batch_size, num_iters): 69 | data_x = data['x'] 70 | data_y = data['y'] 71 | 72 | np.random.seed(100) 73 | rng_state = np.random.get_state() 74 | np.random.shuffle(data_x) 75 | np.random.set_state(rng_state) 76 | np.random.shuffle(data_y) 77 | 78 | idx = 0 79 | 80 | for i in range(num_iters): 81 | if idx+batch_size >= len(data_x): 82 | idx = 0 83 | rng_state = np.random.get_state() 84 | np.random.shuffle(data_x) 85 | np.random.set_state(rng_state) 86 | np.random.shuffle(data_y) 87 | batched_x = data_x[idx: idx+batch_size] 88 | batched_y = data_y[idx: idx+batch_size] 89 | idx += batch_size 90 | yield (batched_x, batched_y) 91 | 92 | def read_data(train_data_dir, test_data_dir): 93 | '''parses data in given train and test data directories 94 | 95 | assumes: 96 | - the data in the input directories are .json files with 97 | keys 'users' and 'user_data' 98 | - the set of train set users is the same as the set of test set users 99 | 100 | Return: 101 | clients: list of client ids 102 | groups: list of group ids; empty list if none found 103 | train_data: dictionary of train data 104 | test_data: dictionary of test data 105 | ''' 106 | clients = [] 107 | groups = [] 108 | train_data = {} 109 | test_data = {} 110 | 111 | train_files = os.listdir(train_data_dir) 112 | train_files = [f for f in train_files if f.endswith('.json')] 113 | for f in train_files: 114 | file_path = os.path.join(train_data_dir,f) 115 | with open(file_path, 'r') as inf: 116 | cdata = json.load(inf) 117 | clients.extend(cdata['users']) 118 | if 'hierarchies' in cdata: 119 | groups.extend(cdata['hierarchies']) 120 | train_data.update(cdata['user_data']) 121 | 122 | test_files = os.listdir(test_data_dir) 123 | test_files = [f for f in test_files if f.endswith('.json')] 124 | for f in test_files: 125 | file_path = os.path.join(test_data_dir,f) 126 | with open(file_path, 'r') as inf: 127 | cdata = json.load(inf) 128 | test_data.update(cdata['user_data']) 129 | 130 | clients = list(sorted(train_data.keys())) 131 | 132 | return clients, groups, train_data, test_data 133 | 134 | 135 | class Metrics(object): 136 | def __init__(self, clients, params): 137 | self.params = params 138 | num_rounds = params['num_rounds'] 139 | self.bytes_written = {c.id: [0] * num_rounds for c in clients} 140 | self.client_computations = {c.id: [0] * num_rounds for c in clients} 141 | self.bytes_read = {c.id: [0] * num_rounds for c in clients} 142 | self.accuracies = [] 143 | self.train_accuracies = [] 144 | 145 | def update(self, rnd, cid, stats): 146 | bytes_w, comp, bytes_r = stats 147 | self.bytes_written[cid][rnd] += bytes_w 148 | self.client_computations[cid][rnd] += comp 149 | self.bytes_read[cid][rnd] += bytes_r 150 | 151 | def write(self): 152 | metrics = {} 153 | metrics['dataset'] = self.params['dataset'] 154 | metrics['num_rounds'] = self.params['num_rounds'] 155 | metrics['eval_every'] = self.params['eval_every'] 156 | 
metrics['learning_rate'] = self.params['learning_rate'] 157 | metrics['mu'] = self.params['mu'] 158 | metrics['num_epochs'] = self.params['num_epochs'] 159 | metrics['batch_size'] = self.params['batch_size'] 160 | metrics['accuracies'] = self.accuracies 161 | metrics['train_accuracies'] = self.train_accuracies 162 | metrics['client_computations'] = self.client_computations 163 | metrics['bytes_written'] = self.bytes_written 164 | metrics['bytes_read'] = self.bytes_read 165 | metrics_dir = os.path.join('out', self.params['dataset'], 'metrics_{}_{}_{}_{}_{}.json'.format(self.params['seed'], self.params['optimizer'], self.params['learning_rate'], self.params['num_epochs'], self.params['mu'])) 166 | #os.mkdir(os.path.join('out', self.params['dataset'])) 167 | if not os.path.exists(os.path.join('out', self.params['dataset'])): 168 | os.mkdir(os.path.join('out', self.params['dataset'])) 169 | with open(metrics_dir, 'w') as ouf: 170 | json.dump(metrics, ouf) 171 | -------------------------------------------------------------------------------- /flearn/utils/tf_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import functools 3 | import operator 4 | import tensorflow as tf 5 | 6 | def __num_elems(shape): 7 | '''Returns the number of elements in the given shape 8 | 9 | Args: 10 | shape: TensorShape 11 | 12 | Return: 13 | tot_elems: int 14 | ''' 15 | tot_elems = 1 16 | for s in shape: 17 | tot_elems *= int(s) 18 | return tot_elems 19 | 20 | def graph_size(graph): 21 | '''Returns the size of the given graph in bytes 22 | 23 | The size of the graph is calculated by summing up the sizes of each 24 | trainable variable. The sizes of variables are calculated by multiplying 25 | the number of bytes in their dtype with their number of elements, captured 26 | in their shape attribute 27 | 28 | Args: 29 | graph: TF graph 30 | Return: 31 | integer representing size of graph (in bytes) 32 | ''' 33 | tot_size = 0 34 | with graph.as_default(): 35 | vs = tf.trainable_variables() 36 | for v in vs: 37 | tot_elems = __num_elems(v.shape) 38 | dtype_size = int(v.dtype.size) 39 | var_size = tot_elems * dtype_size 40 | tot_size += var_size 41 | return tot_size 42 | 43 | def process_sparse_grad(grads): 44 | ''' 45 | Args: 46 | grads: grad returned by LSTM model (only for the shakespaere dataset) 47 | Return: 48 | a flattened grad in numpy (1-D array) 49 | ''' 50 | 51 | indices = grads[0].indices 52 | values = grads[0].values 53 | first_layer_dense = np.zeros((80,8)) 54 | for i in range(indices.shape[0]): 55 | first_layer_dense[indices[i], :] = values[i, :] 56 | 57 | client_grads = first_layer_dense 58 | for i in range(1, len(grads)): 59 | client_grads = np.append(client_grads, grads[i]) # output a flattened array 60 | 61 | 62 | return client_grads 63 | 64 | def flatten2list(object): 65 | gather = np.array([]) 66 | for item in object: 67 | if isinstance(item, (list, tuple, set)): 68 | gather = np.append(gather, flatten2list(item)) 69 | else: 70 | gather = np.append(gather, item) 71 | return gather 72 | 73 | def process_grad(grads): 74 | ''' 75 | Args: 76 | grads: grad 77 | Return: 78 | a flattened grad in numpy (1-D array) 79 | ''' 80 | 81 | # print('grads.shape', grads[0], grads[1], grads[-1]) 82 | # client_grads = grads[0] 83 | 84 | # for i in range(1, len(grads)): 85 | # client_grads = np.append(client_grads, grads[i]) # output a flattened array 86 | 87 | 88 | # return client_grads 89 | 90 | # return np.asarray(functools.reduce(operator.iconcat, 
grads, [])) 91 | 92 | # print(grads[0],grads[1]) 93 | client_grads = np.asarray(flatten2list(grads)) 94 | # print('client_grads shape', len(grads), client_grads.shape) 95 | return client_grads 96 | 97 | def cosine_sim(a, b): 98 | '''Returns the cosine similarity between two arrays a and b 99 | ''' 100 | dot_product = np.dot(a, b) 101 | norm_a = np.linalg.norm(a) 102 | norm_b = np.linalg.norm(b) 103 | return dot_product * 1.0 / (norm_a * norm_b) 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /flearn/utils/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | def save_obj(obj, name): 4 | with open(name + '.pkl', 'wb') as f: 5 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 6 | 7 | def load_obj(name): 8 | with open(name + '.pkl', 'rb') as f: 9 | return pickle.load(f) 10 | 11 | def iid_divide(l, g): 12 | ''' 13 | divide list l among g groups 14 | each group has either int(len(l)/g) or int(len(l)/g)+1 elements 15 | returns a list of groups 16 | ''' 17 | num_elems = len(l) 18 | group_size = int(len(l)/g) 19 | num_big_groups = num_elems - g * group_size 20 | num_small_groups = g - num_big_groups 21 | glist = [] 22 | for i in range(num_small_groups): 23 | glist.append(l[group_size*i:group_size*(i+1)]) 24 | bi = group_size*num_small_groups 25 | group_size += 1 26 | for i in range(num_big_groups): 27 | glist.append(l[bi+group_size*i:bi+group_size*(i+1)]) 28 | return glist -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import importlib 4 | import random 5 | import os 6 | import tensorflow as tf 7 | from flearn.utils.model_utils import read_data 8 | 9 | # GLOBAL PARAMETERS 10 | OPTIMIZERS = ['fedavg', 'fedprox', 'feddane', 'fedddane', 'fedsgd', 'fedprox_origin'] 11 | DATASETS = ['celeba', 'sent140', 'nist', 'shakespeare', 'mnist', 12 | 'synthetic_iid', 'synthetic_0_0', 'synthetic_0.5_0.5', 'synthetic_1_1', 'synthetic_cluster'] # NIST is EMNIST in the paepr 13 | 14 | 15 | MODEL_PARAMS = { 16 | 'sent140.bag_dnn': (2,), # num_classes 17 | 'sent140.stacked_lstm': (25, 2, 100), # seq_len, num_classes, num_hidden 18 | 'sent140.stacked_lstm_no_embeddings': (25, 2, 100), # seq_len, num_classes, num_hidden 19 | 'nist.mclr': (26,), # num_classes 20 | 'nist.cnn':(10,), 21 | 'mnist.mclr': (10,), # num_classes 22 | 'mnist.cnn': (10,), # num_classes 23 | 'shakespeare.stacked_lstm': (80, 80, 256), # seq_len, emb_dim, num_hidden 24 | 'synthetic.mclr': (10, ), # num_classes 25 | 'celeba.cnn': (2,) 26 | } 27 | 28 | 29 | def read_options(): 30 | ''' Parse command line arguments or load defaults ''' 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument('--optimizer', 34 | help='name of optimizer;', 35 | type=str, 36 | choices=OPTIMIZERS, 37 | default='fedavg') 38 | parser.add_argument('--dataset', 39 | help='name of dataset;', 40 | type=str, 41 | choices=DATASETS, 42 | default='nist') 43 | parser.add_argument('--model', 44 | help='name of model;', 45 | type=str, 46 | default='mclr') 47 | parser.add_argument('--num_rounds', 48 | help='number of rounds to simulate;', 49 | type=int, 50 | default=-1) 51 | parser.add_argument('--eval_every', 52 | help='evaluate every ____ rounds;', 53 | type=int, 54 | default=-1) 55 | parser.add_argument('--clients_per_round', 56 | help='number of clients trained per round;', 57 | 
type=int, 58 | default=-1) 59 | parser.add_argument('--batch_size', 60 | help='batch size when clients train on data;', 61 | type=int, 62 | default=10) 63 | parser.add_argument('--num_epochs', 64 | help='number of epochs when clients train on data;', 65 | type=int, 66 | default=1) 67 | parser.add_argument('--num_iters', 68 | help='number of iterations when clients train on data;', 69 | type=int, 70 | default=1) 71 | parser.add_argument('--learning_rate', 72 | help='learning rate for inner solver;', 73 | type=float, 74 | default=0.003) 75 | parser.add_argument('--mu', 76 | help='constant for prox;', 77 | type=float, 78 | default=0) 79 | parser.add_argument('--seed', 80 | help='seed for randomness;', 81 | type=int, 82 | default=0) 83 | parser.add_argument('--drop_percent', 84 | help='percentage of slow devices', 85 | type=float, 86 | default=0.1) 87 | parser.add_argument('--clientsel_algo', 88 | help='Client Selection Algorithm', 89 | type=str, 90 | default='random') 91 | parser.add_argument('--Ls0', 92 | help='Constant for grad. similarity', 93 | type=int, 94 | default=2) 95 | parser.add_argument('--sim_metric', 96 | help='similarity metric', 97 | type=str, 98 | default='grad') 99 | parser.add_argument('--m_interval', 100 | help='frequency of sending gradient metric for submodular', 101 | type=int, 102 | default=1) 103 | 104 | try: parsed = vars(parser.parse_args()) 105 | except IOError as msg: parser.error(str(msg)) 106 | 107 | # Set seeds 108 | random.seed(1 + parsed['seed']) 109 | np.random.seed(12 + parsed['seed']) 110 | tf.set_random_seed(123 + parsed['seed']) 111 | # tf.random.set_seed(123 + parsed['seed']) 112 | 113 | 114 | # load selected model 115 | if parsed['dataset'].startswith("synthetic"): # all synthetic datasets use the same model 116 | model_path = '%s.%s.%s.%s' % ('flearn', 'models', 'synthetic', parsed['model']) 117 | else: 118 | model_path = '%s.%s.%s.%s' % ('flearn', 'models', parsed['dataset'], parsed['model']) 119 | 120 | mod = importlib.import_module(model_path) 121 | learner = getattr(mod, 'Model') 122 | 123 | # load selected trainer 124 | opt_path = 'flearn.trainers.%s' % parsed['optimizer'] 125 | mod = importlib.import_module(opt_path) 126 | optimizer = getattr(mod, 'Server') 127 | 128 | # add selected model parameter 129 | parsed['model_params'] = MODEL_PARAMS['.'.join(model_path.split('.')[2:])] 130 | 131 | # print and return 132 | maxLen = max([len(ii) for ii in parsed.keys()]); 133 | fmtString = '\t%' + str(maxLen) + 's : %s'; 134 | print('Arguments:') 135 | for keyPair in sorted(parsed.items()): print(fmtString % keyPair) 136 | 137 | return parsed, learner, optimizer 138 | 139 | def main(): 140 | # suppress tf warnings 141 | tf.logging.set_verbosity(tf.logging.WARN) 142 | 143 | # parse command line arguments 144 | options, learner, optimizer = read_options() 145 | 146 | # read data 147 | train_path = os.path.join('data', options['dataset'], 'data', 'train') 148 | test_path = os.path.join('data', options['dataset'], 'data', 'test') 149 | dataset = read_data(train_path, test_path) 150 | 151 | # call appropriate trainer 152 | t = optimizer(options, learner, dataset) 153 | t.train() 154 | 155 | if __name__ == '__main__': 156 | main() 157 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tensorflow-gpu==1.10 4 | Pillow 5 | matplotlib 6 | jupyter 7 | tqdm 8 | 
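# Environment sketch (an assumption, not one of the pins above): tensorflow-gpu==1.10 predates Python 3.7 support, so a Python 3.6 interpreter is assumed.
#   python3.6 -m venv venv && source venv/bin/activate
#   pip install -r requirements.txt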
-------------------------------------------------------------------------------- /run_fedavg.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python3 -u main.py --dataset=$1 --optimizer='fedavg' \ 3 | --learning_rate=0.01 --num_rounds=200 --clients_per_round=5 \ 4 | --eval_every=1 --batch_size=10 \ 5 | --num_epochs=1 \ 6 | --model='mclr' \ 7 | --drop_percent=$2 \ 8 | --clientsel_algo='submodular' \ 9 | 10 | 11 | -------------------------------------------------------------------------------- /run_fedprox.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python3 -u main.py --dataset=$1 --optimizer='fedprox' \ 3 | --learning_rate=0.01 --num_rounds=200 --clients_per_round=5 \ 4 | --eval_every=1 --batch_size=10 \ 5 | --num_epochs=1 \ 6 | --model='mclr' \ 7 | --drop_percent=$2 \ 8 | --mu=$3 \ 9 | -------------------------------------------------------------------------------- /run_scripts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #datasets='synthetic_1_1 synthetic_iid synthetic_0_0 synthetic_0.5_0.5' 4 | datasets='celeba' 5 | clmodel='cnn' 6 | 7 | for dataset in $datasets 8 | do 9 | for num_clients in 10 20 #10 15 20 10 | do 11 | for epoch in 1 #5 10 12 | do 13 | for mu in 0 #1 14 | do 15 | echo $dataset $num_clients $epoch 16 | python3 -u main.py --dataset=$dataset --optimizer='fedprox' \ 17 | --learning_rate=0.1 --num_rounds=800 --clients_per_round=$num_clients \ 18 | --eval_every=1 --batch_size=10 \ 19 | --num_epochs=$epoch \ 20 | --model=$clmodel \ 21 | --drop_percent=0 \ 22 | --mu=$mu | tee results/$dataset/fedprox_numclients$num_clients"mu"$mu"epochs"$epoch"ICLR" 23 | done 24 | done 25 | done 26 | done 27 | 28 | echo All done -------------------------------------------------------------------------------- /submod_scripts.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #datasets='synthetic_1_1 synthetic_iid synthetic_0_0 synthetic_0.5_0.5 nist celeba' 4 | 5 | datasets='celeba' 6 | clmodel='cnn' 7 | 8 | for dataset in $datasets 9 | do 10 | if [ $dataset == 'synthetic_1_1' ] 11 | then 12 | L_auxs=( 35) #5 10 25 50 13 | elif [ $dataset == 'synthetic_0.5_0.5' ] 14 | then 15 | L_auxs=( 1 5 10 20) 16 | elif [ $dataset == 'synthetic_0_0' ] 17 | then 18 | L_auxs=( 1 3 7 10) 19 | else 20 | L_auxs=( 1 2 3 4) 21 | fi 22 | L_aux=1 23 | #for L_aux in "${L_auxs[@]}" 24 | for num_clients in 10 20 25 | do 26 | for epoch in 1 #20 27 | do 28 | for m in 1 10 # use any value other than 1 to run partial gradient setting 29 | do 30 | echo $L_aux 31 | python3 -u main.py --dataset=$dataset --optimizer='fedavg' \ 32 | --learning_rate=0.1 --num_rounds=800 --Ls0=$L_aux \ 33 | --eval_every=1 --batch_size=10 \ 34 | --num_epochs=$epoch \ 35 | --model=$clmodel \ 36 | --drop_percent=0 \ 37 | --clients_per_round=$num_clients \ 38 | --sim_metric='grad' --m_interval=$m \ 39 | --clientsel_algo='submodular' | tee results/$dataset/uneq_submod_numclients$num_clients"epochs"$epoch"updateevery"$m"ICLR_stochgreedy_newpartition" 40 | #--clientsel_algo='lossbased' | tee results/$dataset/uneq_PoC_numclients$num_clients"epochs"$epoch"T1" 41 | #--clientsel_algo='lossbased' | tee results/$dataset/uneq11_simpleavg_PoC_numclients$num_clients"epochs"$epoch"T1" 42 | done 43 | done 44 | done 45 | done 46 | echo All done 
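# A single-run sketch distilled from the loop above (the values are illustrative, not settings taken from the paper): select 10 celeba clients per round with the submodular (DivFL) criterion and refresh the gradient dissimilarity matrix every round (m_interval=1).
# python3 -u main.py --dataset='celeba' --model='cnn' --optimizer='fedavg' \
#     --clientsel_algo='submodular' --sim_metric='grad' --m_interval=1 \
#     --clients_per_round=10 --num_rounds=800 --learning_rate=0.1 \
#     --eval_every=1 --batch_size=10 --num_epochs=1 --drop_percent=0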
-------------------------------------------------------------------------------- /submod_scripts_sent140.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #datasets='synthetic_1_1 synthetic_iid synthetic_0_0 synthetic_0.5_0.5 nist ' 4 | 5 | datasets='sent140' 6 | clmodel='stacked_lstm' 7 | 8 | 9 | for dataset in $datasets 10 | do 11 | if [ $dataset == 'synthetic_1_1' ] 12 | then 13 | L_auxs=( 35) #5 10 25 50 14 | elif [ $dataset == 'synthetic_0.5_0.5' ] 15 | then 16 | L_auxs=( 1 5 10 20) 17 | elif [ $dataset == 'synthetic_0_0' ] 18 | then 19 | L_auxs=( 1 3 7 10) 20 | else 21 | L_auxs=( 1 2 3 4) 22 | fi 23 | L_aux=1 24 | #for L_aux in "${L_auxs[@]}" 25 | for num_clients in 10 20 30 26 | do 27 | for epoch in 5 10 28 | do 29 | for m in 5 30 | do 31 | echo $L_aux 32 | python -u main.py --dataset=$dataset --optimizer='fedavg' \ 33 | --learning_rate=0.5 --num_rounds=200 --Ls0=$L_aux \ 34 | --eval_every=1 --batch_size=10 \ 35 | --num_epochs=$epoch \ 36 | --model=$clmodel \ 37 | --drop_percent=0 \ 38 | --clients_per_round=$num_clients \ 39 | --sim_metric='grad' --m_interval=$m \ 40 | --clientsel_algo='submodular' | tee results/$dataset/uneq_psubmod_numclients$num_clients"epochs"$epoch"updateevery"$m"TESTONLY" 41 | #--clientsel_algo='lossbased' | tee results/$dataset/uneq_PoC_numclients$num_clients"epochs"$epoch"T1" 42 | #--clientsel_algo='lossbased' | tee results/$dataset/uneq11_simpleavg_PoC_numclients$num_clients"epochs"$epoch"T1" 43 | done 44 | done 45 | done 46 | done 47 | echo All done -------------------------------------------------------------------------------- /submod_scripts_shakespeare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #datasets='synthetic_1_1 synthetic_iid synthetic_0_0 synthetic_0.5_0.5 nist ' 4 | 5 | datasets='shakespeare' 6 | clmodel='stacked_lstm' 7 | 8 | 9 | for dataset in $datasets 10 | do 11 | if [ $dataset == 'synthetic_1_1' ] 12 | then 13 | L_auxs=( 35) #5 10 25 50 14 | elif [ $dataset == 'synthetic_0.5_0.5' ] 15 | then 16 | L_auxs=( 1 5 10 20) 17 | elif [ $dataset == 'synthetic_0_0' ] 18 | then 19 | L_auxs=( 1 3 7 10) 20 | else 21 | L_auxs=( 1 2 3 4) 22 | fi 23 | L_aux=1 24 | #for L_aux in "${L_auxs[@]}" 25 | for num_clients in 10 #20 30 26 | do 27 | for epoch in 5 10 28 | do 29 | for m in 5 30 | do 31 | echo $L_aux 32 | python -u main.py --dataset=$dataset --optimizer='fedavg' \ 33 | --learning_rate=0.8 --num_rounds=60 --Ls0=$L_aux \ 34 | --eval_every=1 --batch_size=10 \ 35 | --num_epochs=$epoch \ 36 | --model=$clmodel \ 37 | --drop_percent=0 \ 38 | --clients_per_round=$num_clients \ 39 | --sim_metric='grad' --m_interval=$m \ 40 | --clientsel_algo='submodular' | tee results/$dataset/uneq_psubmod_numclients$num_clients"epochs"$epoch"updateevery"$m"TESTONLY" 41 | #--clientsel_algo='lossbased' | tee results/$dataset/uneq_PoC_numclients$num_clients"epochs"$epoch"T1" 42 | #--clientsel_algo='lossbased' | tee results/$dataset/uneq11_simpleavg_PoC_numclients$num_clients"epochs"$epoch"T1" 43 | done 44 | done 45 | done 46 | done 47 | echo All done -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melodi-lab/divfl/4a5b80df85a72b97751ce1130ee5c7d76a2c1540/utils/__init__.py -------------------------------------------------------------------------------- 
/utils/language_utils.py: -------------------------------------------------------------------------------- 1 | """Utils for language models.""" 2 | 3 | import re 4 | 5 | 6 | # ------------------------ 7 | # utils for shakespeare dataset 8 | 9 | ALL_LETTERS = "\n !\"&'(),-.0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz}" 10 | NUM_LETTERS = len(ALL_LETTERS) 11 | 12 | 13 | def _one_hot(index, size): 14 | '''returns one-hot vector with given size and value 1 at given index 15 | ''' 16 | vec = [0 for _ in range(size)] 17 | vec[int(index)] = 1 18 | return vec 19 | 20 | 21 | def letter_to_vec(letter): 22 | '''returns one-hot representation of given letter 23 | ''' 24 | index = ALL_LETTERS.find(letter) 25 | return _one_hot(index, NUM_LETTERS) 26 | 27 | 28 | def word_to_indices(word): 29 | '''returns a list of character indices 30 | 31 | Args: 32 | word: string 33 | 34 | Return: 35 | indices: int list with length len(word) 36 | ''' 37 | indices = [] 38 | for c in word: 39 | indices.append(ALL_LETTERS.find(c)) 40 | return indices 41 | 42 | 43 | # ------------------------ 44 | # utils for sent140 dataset 45 | 46 | 47 | def split_line(line): 48 | '''split given line/phrase into list of words 49 | 50 | Args: 51 | line: string representing phrase to be split 52 | 53 | Return: 54 | list of strings, with each string representing a word 55 | ''' 56 | return re.findall(r"[\w']+|[.,!?;]", line) 57 | 58 | 59 | def _word_to_index(word, indd): 60 | '''returns index of given word based on given lookup dictionary 61 | 62 | returns the length of the lookup dictionary if word not found 63 | 64 | Args: 65 | word: string 66 | indd: dictionary with string words as keys and int indices as values 67 | ''' 68 | if word in indd: 69 | return indd[word] 70 | else: 71 | return len(indd) 72 | 73 | 74 | def line_to_indices(line, indd, max_words=25): 75 | '''converts given phrase into list of word indices 76 | 77 | if the phrase has more than max_words words, returns a list containing 78 | indices of the first max_words words 79 | if the phrase has less than max_words words, repeatedly appends integer 80 | representing unknown index to returned list until the list's length is 81 | max_words 82 | 83 | Args: 84 | line: string representing phrase/sequence of words 85 | indd: dictionary with string words as keys and int indices as values 86 | max_words: maximum number of word indices in returned list 87 | 88 | Return: 89 | indl: list of word indices, one index for each word in phrase 90 | ''' 91 | line_list = split_line(line) # split phrase in words 92 | indl = [] 93 | for word in line_list: 94 | cind = _word_to_index(word, indd) 95 | indl.append(cind) 96 | if (len(indl) == max_words): 97 | break 98 | for i in range(max_words - len(indl)): 99 | indl.append(len(indd)) 100 | return indl 101 | 102 | 103 | def bag_of_words(line, vocab): 104 | '''returns bag of words representation of given phrase using given vocab 105 | 106 | Args: 107 | line: string representing phrase to be parsed 108 | vocab: dictionary with words as keys and indices as values 109 | 110 | Return: 111 | integer list 112 | ''' 113 | bag = [0]*len(vocab) 114 | words = split_line(line) 115 | for w in words: 116 | if w in vocab: 117 | bag[vocab[w]] += 1 118 | return bag 119 | -------------------------------------------------------------------------------- /utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import re 5 | import sys 6 | 7 | 
models_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | models_dir = os.path.join(models_dir, 'models') 9 | sys.path.append(models_dir) 10 | 11 | from client import Client 12 | 13 | def batch_data(data, batch_size): 14 | ''' 15 | data is a dict := {'x': [list], 'y': [list]} 16 | returns x, y, which are both lists of size-batch_size lists 17 | ''' 18 | raw_x = data['x'] 19 | raw_y = data['y'] 20 | batched_x = [] 21 | batched_y = [] 22 | for i in range(0, len(raw_x), batch_size): 23 | batched_x.append(raw_x[i:i+batch_size]) 24 | batched_y.append(raw_y[i:i+batch_size]) 25 | return batched_x, batched_y 26 | 27 | def read_data(train_data_dir, test_data_dir): 28 | '''parses data in given train and test data directories 29 | 30 | assumes: 31 | - the data in the input directories are .json files with 32 | keys 'users' and 'user_data' 33 | - the set of train set users is the same as the set of test set users 34 | 35 | Return: 36 | clients: list of client ids 37 | groups: list of group ids; empty list if none found 38 | train_data: dictionary of train data 39 | test_data: dictionary of test data 40 | ''' 41 | clients = [] 42 | groups = [] 43 | train_data = {} 44 | test_data = {} 45 | 46 | train_files = os.listdir(train_data_dir) 47 | train_files = [f for f in train_files if f.endswith('.json')] 48 | for f in train_files: 49 | file_path = os.path.join(train_data_dir,f) 50 | with open(file_path, 'r') as inf: 51 | cdata = json.load(inf) 52 | clients.extend(cdata['users']) 53 | if 'hierarchies' in cdata: 54 | groups.extend(cdata['hierarchies']) 55 | train_data.update(cdata['user_data']) 56 | 57 | test_files = os.listdir(test_data_dir) 58 | test_files = [f for f in test_files if f.endswith('.json')] 59 | for f in test_files: 60 | file_path = os.path.join(test_data_dir,f) 61 | with open(file_path, 'r') as inf: 62 | cdata = json.load(inf) 63 | test_data.update(cdata['user_data']) 64 | 65 | clients = list(train_data.keys()) 66 | 67 | return clients, groups, train_data, test_data 68 | 69 | def setup_clients(train_data_dir, test_data_dir, model=None): 70 | '''instantiates clients based on given train and test data directories 71 | 72 | Return: 73 | list of Clients 74 | ''' 75 | users, groups, train_data, test_data = read_data(train_data_dir, test_data_dir) 76 | if len(groups) == 0: 77 | groups = [None for _ in users] 78 | all_clients = [Client(u, g, train_data[u], test_data[u], model) for u, g in zip(users, groups)] 79 | return all_clients 80 | 81 | -------------------------------------------------------------------------------- /utils/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # script to preprocess data 4 | 5 | # -------------------- 6 | # parse arguments 7 | 8 | NAME="sent140" # name of the dataset, equivalent to directory name 9 | SAMPLE="na" # -s tag, iid or niid 10 | IUSER="" # --iu tag, # of users if iid sampling 11 | SFRAC="" # --sf tag, fraction of data to sample 12 | MINSAMPLES="na" # -k tag, minimum allowable # of samples per user 13 | TRAIN="na" # -t tag, user or sample 14 | TFRAC="" # --tf tag, fraction of data in training set 15 | 16 | while [[ $# -gt 0 ]] 17 | do 18 | key="$1" 19 | 20 | case $key in 21 | --name) 22 | NAME="$2" 23 | shift # past argument 24 | if [ ${SAMPLE:0:1} = "-" ]; then 25 | NAME="sent140" 26 | else 27 | shift # past value 28 | fi 29 | ;; 30 | -s) 31 | SAMPLE="$2" 32 | shift # past argument 33 | if [ ${SAMPLE:0:1} = "-" ]; then 34 | SAMPLE="" 35 | else 36 | shift # 
past value 37 | fi 38 | ;; 39 | --iu) 40 | IUSER="$2" 41 | shift # past argument 42 | if [ ${IUSER:0:1} = "-" ]; then 43 | IUSER="" 44 | else 45 | shift # past value 46 | fi 47 | ;; 48 | --sf) 49 | SFRAC="$2" 50 | shift # past argument 51 | if [ ${SFRAC:0:1} = "-" ]; then 52 | SFRAC="" 53 | else 54 | shift # past value 55 | fi 56 | ;; 57 | -k) 58 | MINSAMPLES="$2" 59 | shift # past argument 60 | if [ ${MINSAMPLES:0:1} = "-" ]; then 61 | MINSAMPLES="" 62 | else 63 | shift # past value 64 | fi 65 | ;; 66 | -t) 67 | TRAIN="$2" 68 | shift # past argument 69 | if [ -z "$TRAIN" ] || [ ${TRAIN:0:1} = "-" ]; then 70 | TRAIN="" 71 | else 72 | shift # past value 73 | fi 74 | ;; 75 | --tf) 76 | TFRAC="$2" 77 | shift # past argument 78 | if [ ${TFRAC:0:1} = "-" ]; then 79 | TFRAC="" 80 | else 81 | shift # past value 82 | fi 83 | ;; 84 | *) # unknown option 85 | shift # past argument 86 | ;; 87 | esac 88 | done 89 | 90 | # -------------------- 91 | # preprocess data 92 | 93 | CONT_SCRIPT=true 94 | cd ../data/$NAME 95 | 96 | # download data and convert to .json format 97 | 98 | if [ ! -d "data/all_data" ]; then 99 | cd preprocess 100 | ./data_to_json.sh 101 | cd .. 102 | fi 103 | 104 | NAMETAG="--name $NAME" 105 | 106 | # sample data 107 | IUSERTAG="" 108 | if [ ! -z $IUSER ]; then 109 | IUSERTAG="--u $IUSER" 110 | fi 111 | SFRACTAG="" 112 | if [ ! -z $SFRAC ]; then 113 | SFRACTAG="--fraction $SFRAC" 114 | fi 115 | 116 | if [ "$CONT_SCRIPT" = true ] && [ ! $SAMPLE = "na" ]; then 117 | if [ -d "data/sampled_data" ] && [ "$(ls -A data/sampled_data)" ]; then 118 | CONT_SCRIPT=false 119 | else 120 | if [ ! -d "data/sampled_data" ]; then 121 | mkdir data/sampled_data 122 | fi 123 | 124 | cd ../../utils 125 | 126 | if [ $SAMPLE = "iid" ]; then 127 | python3 sample.py $NAMETAG --iid $IUSERTAG $SFRACTAG 128 | else 129 | python3 sample.py $NAMETAG $SFRACTAG 130 | fi 131 | 132 | cd ../data/$NAME 133 | fi 134 | fi 135 | 136 | # remove users with less then given number of samples 137 | if [ "$CONT_SCRIPT" = true ] && [ ! $MINSAMPLES = "na" ]; then 138 | if [ -d "data/rem_user_data" ] && [ "$(ls -A data/rem_user_data)" ]; then 139 | CONT_SCRIPT=false 140 | else 141 | if [ ! -d "data/rem_user_data" ]; then 142 | mkdir data/rem_user_data 143 | fi 144 | 145 | cd ../../utils 146 | 147 | if [ -z $MINSAMPLES ]; then 148 | python3 remove_users.py $NAMETAG 149 | else 150 | python3 remove_users.py $NAMETAG --min_samples $MINSAMPLES 151 | fi 152 | 153 | cd ../data/$NAME 154 | fi 155 | fi 156 | 157 | # create train-test split 158 | TFRACTAG="" 159 | if [ ! -z $TFRAC ]; then 160 | TFRACTAG="--frac $TFRAC" 161 | fi 162 | 163 | if [ "$CONT_SCRIPT" = true ] && [ ! $TRAIN = "na" ]; then 164 | if [ -d "data/train" ] && [ "$(ls -A data/train)" ]; then 165 | CONT_SCRIPT=false 166 | else 167 | if [ ! -d "data/train" ]; then 168 | mkdir data/train 169 | fi 170 | if [ ! -d "data/test" ]; then 171 | mkdir data/test 172 | fi 173 | 174 | cd ../../utils 175 | 176 | if [ -z $TRAIN ]; then 177 | python3 split_data.py $NAMETAG $TFRACTAG 178 | elif [ $TRAIN = "user" ]; then 179 | python3 split_data.py $NAMETAG --by_user $TFRACTAG 180 | elif [ $TRAIN = "sample" ]; then 181 | python3 split_data.py $NAMETAG --by_sample $TFRACTAG 182 | fi 183 | 184 | cd ../data/$NAME 185 | fi 186 | fi 187 | 188 | if [ "$CONT_SCRIPT" = false ]; then 189 | echo "Data for one of the specified preprocessing tasks has already been" 190 | echo "generated. If you would like to re-generate data for this directory," 191 | echo "please delete the existing one. 
Otherwise, please remove the" 192 | echo "respective tag(s) from the preprocessing command." 193 | fi -------------------------------------------------------------------------------- /utils/remove_users.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | removes users with less than the given number of samples 4 | ''' 5 | 6 | import argparse 7 | import json 8 | import os 9 | 10 | import numpy as np 11 | 12 | parser = argparse.ArgumentParser() 13 | 14 | parser.add_argument('--name', 15 | help='name of dataset to parse; default: sent140;', 16 | type=str, 17 | default='sent140') 18 | 19 | parser.add_argument('--min_samples', 20 | help='users with less than x samples are discarded; default: 10;', 21 | type=int, 22 | default=10) 23 | 24 | args = parser.parse_args() 25 | 26 | 27 | print('------------------------------') 28 | 29 | 30 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 31 | dir = os.path.join(parent_path, 'data', args.name, 'data') 32 | subdir = os.path.join(dir, 'sampled_data') 33 | files = [] 34 | if os.path.exists(subdir): 35 | files = os.listdir(subdir) 36 | if len(files) == 0: 37 | subdir = os.path.join(dir, 'all_data') 38 | files = os.listdir(subdir) 39 | files = [f for f in files if f.endswith('.json')] 40 | 41 | for f in files: 42 | users = [] 43 | hierarchies = [] 44 | num_samples = [] 45 | user_data = {} 46 | 47 | min_number = 0 48 | max_number = 0 49 | 50 | file_dir = os.path.join(subdir, f) 51 | with open(file_dir, 'r') as inf: 52 | data = json.load(inf) 53 | 54 | num_users = len(data['users']) 55 | for i in range(num_users): 56 | curr_user = data['users'][i] 57 | curr_hierarchy = None 58 | if 'hierarchies' in data: 59 | curr_hierarchy = data['hierarchies'][i] 60 | curr_num_samples = data['num_samples'][i] 61 | 62 | if curr_num_samples > args.min_samples: 63 | user_data[curr_user] = data['user_data'][curr_user] 64 | users.append(curr_user) 65 | max_number += 1 66 | if curr_hierarchy is not None: 67 | hierarchies.append(curr_hierarchy) 68 | num_samples.append(data['num_samples'][i]) 69 | 70 | 71 | all_data = {} 72 | all_data['users'] = users 73 | if len(hierarchies) == len(users): 74 | all_data['hierarchies'] = hierarchies 75 | all_data['num_samples'] = num_samples 76 | all_data['user_data'] = user_data 77 | 78 | file_name = '%s_keep_%d.json' % ((f[:-5]), args.min_samples) 79 | ouf_dir = os.path.join(dir, 'rem_user_data', file_name) 80 | 81 | print('writing %s' % file_name) 82 | with open(ouf_dir, 'w') as outfile: 83 | json.dump(all_data, outfile) 84 | 85 | -------------------------------------------------------------------------------- /utils/sample.py: -------------------------------------------------------------------------------- 1 | ''' 2 | samples from all raw data; 3 | by default samples in a non-iid manner; namely, randomly selects users from 4 | raw data until their cumulative amount of data exceeds the given number of 5 | datapoints to sample (specified by --fraction argument); 6 | ordering of original data points is not preserved in sampled data 7 | ''' 8 | 9 | import argparse 10 | import json 11 | import os 12 | import random 13 | 14 | from utils import iid_divide 15 | 16 | parser = argparse.ArgumentParser() 17 | 18 | parser.add_argument('--name', 19 | help='name of dataset to parse; default: sent140;', 20 | type=str, 21 | default='sent140') 22 | parser.add_argument('--iid', 23 | help='sample iid;', 24 | action="store_true") 25 | parser.add_argument('--niid', 26 | help="sample niid;", 
27 | dest='iid', action='store_false') 28 | parser.add_argument('--fraction', 29 | help='fraction of all data to sample; default: 0.1;', 30 | type=float, 31 | default=0.1) 32 | parser.add_argument('--u', 33 | help=('number of users in iid data set; ignored in niid case;' 34 | 'represented as fraction of original total number of users; ' 35 | 'default: 0.01;'), 36 | type=float, 37 | default=0.01) 38 | parser.set_defaults(iid=False) 39 | 40 | args = parser.parse_args() 41 | 42 | print('------------------------------') 43 | print('sampling data') 44 | 45 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 46 | data_dir = os.path.join(parent_path, 'data', args.name, 'data') 47 | subdir = os.path.join(data_dir, 'all_data') 48 | files = os.listdir(subdir) 49 | files = [f for f in files if f.endswith('.json')] 50 | 51 | new_user_count = 0 # for iid case 52 | for f in files: 53 | file_dir = os.path.join(subdir, f) 54 | with open(file_dir, 'r') as inf: 55 | data = json.load(inf) 56 | 57 | num_users = len(data['users']) 58 | 59 | tot_num_samples = sum(data['num_samples']) 60 | print('Fraction: ', args.fraction) 61 | num_new_samples = int(args.fraction * tot_num_samples) 62 | 63 | hierarchies = None 64 | 65 | if(args.iid): 66 | raw_list = list(data['user_data'].values()) 67 | raw_x = [elem['x'] for elem in raw_list] 68 | raw_y = [elem['y'] for elem in raw_list] 69 | x_list = [item for sublist in raw_x for item in sublist] # flatten raw_x 70 | y_list = [item for sublist in raw_y for item in sublist] # flatten raw_y 71 | 72 | num_new_users = int(round(args.u * num_users)) 73 | if num_new_users == 0: 74 | num_new_users += 1 75 | 76 | indices = [i for i in range(tot_num_samples)] 77 | new_indices = random.sample(indices, num_new_samples) 78 | # TODO: seed this random 79 | 80 | users = [str(i+new_user_count) for i in range(num_new_users)] 81 | 82 | user_data = {} 83 | for user in users: 84 | user_data[user] = {'x': [], 'y': []} 85 | all_x_samples = [x_list[i] for i in new_indices] 86 | all_y_samples = [y_list[i] for i in new_indices] 87 | x_groups = iid_divide(all_x_samples, num_new_users) 88 | y_groups = iid_divide(all_y_samples, num_new_users) 89 | for i in range(num_new_users): 90 | user_data[users[i]]['x'] = x_groups[i] 91 | user_data[users[i]]['y'] = y_groups[i] 92 | 93 | num_samples = [len(user_data[u]['y']) for u in users] 94 | 95 | new_user_count += num_new_users 96 | 97 | else: 98 | 99 | ctot_num_samples = 0 100 | 101 | users = data['users'] 102 | users_and_hiers = None 103 | if 'hierarchies' in data: 104 | users_and_hiers = list(zip(users, data['hierarchies'])) 105 | random.shuffle(users_and_hiers) 106 | else: 107 | random.shuffle(users) 108 | user_i = 0 109 | num_samples = [] 110 | user_data = {} 111 | 112 | if 'hierarchies' in data: 113 | hierarchies = [] 114 | 115 | while(ctot_num_samples < num_new_samples): 116 | hierarchy = None 117 | if users_and_hiers is not None: 118 | user, hier = users_and_hiers[user_i] 119 | else: 120 | user = users[user_i] 121 | 122 | cdata = data['user_data'][user] 123 | 124 | cnum_samples = len(data['user_data'][user]['y']) 125 | 126 | if (ctot_num_samples + cnum_samples > num_new_samples): 127 | cnum_samples = num_new_samples - ctot_num_samples 128 | indices = [i for i in range(cnum_samples)] 129 | new_indices = random.sample(indices, cnum_samples) 130 | x = [] 131 | y = [] 132 | for i in new_indices: 133 | x.append(data['user_data'][user]['x'][i]) 134 | y.append(data['user_data'][user]['y'][i]) 135 | cdata = {'x': x, 'y': y} 136 | 137 | 
if 'hierarchies' in data: 138 | hierarchies.append(hier) 139 | 140 | num_samples.append(cnum_samples) 141 | user_data[user] = cdata 142 | 143 | ctot_num_samples += cnum_samples 144 | user_i += 1 145 | 146 | if 'hierarchies' in data: 147 | users = [u for u, h in users_and_hiers][:user_i] 148 | else: 149 | users = users[:user_i] 150 | 151 | # ------------ 152 | # create .json file 153 | 154 | all_data = {} 155 | all_data['users'] = users 156 | if hierarchies is not None: 157 | all_data['hierarchies'] = hierarchies 158 | all_data['num_samples'] = num_samples 159 | all_data['user_data'] = user_data 160 | 161 | slabel = '' 162 | if(args.iid): 163 | slabel = 'iid' 164 | else: 165 | slabel = 'niid' 166 | 167 | arg_frac = str(args.fraction) 168 | arg_frac = arg_frac[2:] 169 | arg_nu = str(args.u) 170 | arg_nu = arg_nu[2:] 171 | arg_label = arg_frac 172 | if(args.iid): 173 | arg_label = '%s_%s' % (arg_nu, arg_label) 174 | file_name = '%s_%s_%s.json' % ((f[:-5]), slabel, arg_label) 175 | ouf_dir = os.path.join(data_dir, 'sampled_data', file_name) 176 | 177 | print('writing %s' % file_name) 178 | with open(ouf_dir, 'w') as outfile: 179 | json.dump(all_data, outfile) 180 | -------------------------------------------------------------------------------- /utils/split_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | splits data into train and test sets 3 | ''' 4 | 5 | import argparse 6 | import json 7 | import os 8 | import random 9 | import sys 10 | 11 | def create_jsons_for(user_files, which_set, max_users, include_hierarchy): 12 | '''used in split-by-user case''' 13 | user_count = 0 14 | json_index = 0 15 | users = [] 16 | if include_hierarchy: 17 | hierarchies = [] 18 | else: 19 | hierarchies = None 20 | num_samples = [] 21 | user_data = {} 22 | for (i, t) in enumerate(user_files): 23 | if include_hierarchy: 24 | (u, h, ns, f) = t 25 | else: 26 | (u, ns, f) = t 27 | 28 | file_dir = os.path.join(subdir, f) 29 | with open(file_dir, 'r') as inf: 30 | data = json.load(inf) 31 | 32 | users.append(u) 33 | if include_hierarchy: 34 | hierarchies.append(h) 35 | num_samples.append(ns) 36 | user_data[u] = data['user_data'][u] 37 | user_count += 1 38 | 39 | if (user_count == max_users) or (i == len(user_files) - 1): 40 | 41 | all_data = {} 42 | all_data['users'] = users 43 | if include_hierarchy: 44 | all_data['hierarchies'] = hierarchies 45 | all_data['num_samples'] = num_samples 46 | all_data['user_data'] = user_data 47 | 48 | data_i = f.find('data') 49 | num_i = data_i + 5 50 | num_to_end = f[num_i:] 51 | param_i = num_to_end.find('_') 52 | param_to_end = '.json' 53 | if param_i != -1: 54 | param_to_end = num_to_end[param_i:] 55 | nf = '%s_%d%s' % (f[:(num_i-1)], json_index, param_to_end) 56 | file_name = '%s_%s_%s.json' % ((nf[:-5]), which_set, arg_label) 57 | ouf_dir = os.path.join(dir, which_set, file_name) 58 | 59 | print('writing %s' % file_name) 60 | with open(ouf_dir, 'w') as outfile: 61 | json.dump(all_data, outfile) 62 | 63 | user_count = 0 64 | json_index += 1 65 | users = [] 66 | if include_hierarchy: 67 | hierarchies = [] 68 | num_samples = [] 69 | user_data = {} 70 | 71 | parser = argparse.ArgumentParser() 72 | 73 | parser.add_argument('--name', 74 | help='name of dataset to parse; default: sent140;', 75 | type=str, 76 | default='sent140') 77 | parser.add_argument('--by_user', 78 | help='divide users into training and test set groups;', 79 | dest='user', action='store_true') 80 | parser.add_argument('--by_sample', 81 | help="divide each user's 
samples into training and test set groups;", 82 | dest='user', action='store_false') 83 | parser.add_argument('--frac', 84 | help='fraction in training set; default: 0.9;', 85 | type=float, 86 | default=0.8) 87 | parser.set_defaults(user=False) 88 | 89 | args = parser.parse_args() 90 | 91 | print('------------------------------') 92 | print('generating training and test sets') 93 | 94 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 95 | dir = os.path.join(parent_path, 'data', args.name, 'data') 96 | subdir = os.path.join(dir, 'rem_user_data') 97 | files = [] 98 | if os.path.exists(subdir): 99 | files = os.listdir(subdir) 100 | if len(files) == 0: 101 | subdir = os.path.join(dir, 'sampled_data') 102 | if os.path.exists(subdir): 103 | files = os.listdir(subdir) 104 | if len(files) == 0: 105 | subdir = os.path.join(dir, 'all_data') 106 | files = os.listdir(subdir) 107 | files = [f for f in files if f.endswith('.json')] 108 | 109 | arg_label = str(args.frac) 110 | arg_label = arg_label[2:] 111 | 112 | # check if data contains information on hierarchies 113 | file_dir = os.path.join(subdir, files[0]) 114 | with open(file_dir, 'r') as inf: 115 | data = json.load(inf) 116 | include_hierarchy = 'hierarchies' in data 117 | 118 | if (args.user): 119 | print('splitting data by user') 120 | 121 | # 1 pass through all the json files to instantiate arr 122 | # containing all possible (user, .json file name) tuples 123 | user_files = [] 124 | for f in files: 125 | file_dir = os.path.join(subdir, f) 126 | with open(file_dir, 'r') as inf: 127 | data = json.load(inf) 128 | if include_hierarchy: 129 | user_files.extend([(u, h, ns, f) for (u, h, ns) in 130 | zip(data['users'], data['hierarchies'], data['num_samples'])]) 131 | else: 132 | user_files.extend([(u, ns, f) for (u, ns) in 133 | zip(data['users'], data['num_samples'])]) 134 | 135 | # randomly sample from user_files to pick training set users 136 | num_users = len(user_files) 137 | num_train_users = int(args.frac * num_users) 138 | indices = [i for i in range(num_users)] 139 | train_indices = random.sample(indices, num_train_users) 140 | train_blist = [False for i in range(num_users)] 141 | for i in train_indices: 142 | train_blist[i] = True 143 | train_user_files = [] 144 | test_user_files = [] 145 | for i in range(num_users): 146 | if (train_blist[i]): 147 | train_user_files.append(user_files[i]) 148 | else: 149 | test_user_files.append(user_files[i]) 150 | 151 | max_users = sys.maxsize 152 | if args.name == 'nist': 153 | max_users = 50 # max number of users per json file 154 | create_jsons_for(train_user_files, 'train', max_users, include_hierarchy) 155 | create_jsons_for(test_user_files, 'test', max_users, include_hierarchy) 156 | 157 | else: 158 | print('splitting data by sample') 159 | 160 | for f in files: 161 | file_dir = os.path.join(subdir, f) 162 | with open(file_dir, 'r') as inf: 163 | data = json.load(inf) 164 | 165 | num_samples_train = [] 166 | user_data_train = {} 167 | num_samples_test = [] 168 | user_data_test = {} 169 | 170 | user_indices = [] # indices of users in data['users'] that are not deleted 171 | 172 | for i, u in enumerate(data['users']): 173 | user_data_train[u] = {'x': [], 'y': []} 174 | user_data_test[u] = {'x': [], 'y': []} 175 | 176 | curr_num_samples = len(data['user_data'][u]['y']) 177 | if curr_num_samples >= 2: 178 | user_indices.append(i) 179 | 180 | # ensures number of train and test samples both >= 1 181 | num_train_samples = max(1, int(args.frac * curr_num_samples)) 182 | if 
curr_num_samples == 2: 183 | num_train_samples = 1 184 | 185 | num_test_samples = curr_num_samples - num_train_samples 186 | num_samples_train.append(num_train_samples) 187 | num_samples_test.append(num_test_samples) 188 | 189 | indices = [j for j in range(curr_num_samples)] 190 | train_indices = random.sample(indices, num_train_samples) 191 | train_blist = [False for _ in range(curr_num_samples)] 192 | for j in train_indices: 193 | train_blist[j] = True 194 | 195 | for j in range(curr_num_samples): 196 | if (train_blist[j]): 197 | user_data_train[u]['x'].append(data['user_data'][u]['x'][j]) 198 | user_data_train[u]['y'].append(data['user_data'][u]['y'][j]) 199 | else: 200 | user_data_test[u]['x'].append(data['user_data'][u]['x'][j]) 201 | user_data_test[u]['y'].append(data['user_data'][u]['y'][j]) 202 | 203 | users = [data['users'][i] for i in user_indices] 204 | 205 | all_data_train = {} 206 | all_data_train['users'] = users 207 | all_data_train['num_samples'] = num_samples_train 208 | all_data_train['user_data'] = user_data_train 209 | all_data_test = {} 210 | all_data_test['users'] = users 211 | all_data_test['num_samples'] = num_samples_test 212 | all_data_test['user_data'] = user_data_test 213 | 214 | if include_hierarchy: 215 | all_data_train['hierarchies'] = data['hierarchies'] 216 | all_data_test['hierarchies'] = data['hierarchies'] 217 | 218 | file_name_train = '%s_train_%s.json' % ((f[:-5]), arg_label) 219 | file_name_test = '%s_test_%s.json' % ((f[:-5]), arg_label) 220 | ouf_dir_train = os.path.join(dir, 'train', file_name_train) 221 | ouf_dir_test = os.path.join(dir, 'test', file_name_test) 222 | print('writing %s' % file_name_train) 223 | with open(ouf_dir_train, 'w') as outfile: 224 | json.dump(all_data_train, outfile) 225 | print('writing %s' % file_name_test) 226 | with open(ouf_dir_test, 'w') as outfile: 227 | json.dump(all_data_test, outfile) 228 | -------------------------------------------------------------------------------- /utils/stats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | assumes that the user has already generated .json file(s) containing data 3 | ''' 4 | 5 | import argparse 6 | import json 7 | import matplotlib.pyplot as plt 8 | import math 9 | import numpy as np 10 | import os 11 | 12 | from scipy import io 13 | from scipy import stats 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument('--name', 18 | help='name of dataset to parse; default: sent140;', 19 | type=str, 20 | default='sent140') 21 | 22 | args = parser.parse_args() 23 | 24 | 25 | def load_data(name): 26 | 27 | users = [] 28 | num_samples = [] 29 | 30 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 31 | data_dir = os.path.join(parent_path, 'data', name, 'data') 32 | subdir = os.path.join(data_dir, 'all_data') 33 | 34 | files = os.listdir(subdir) 35 | files = [f for f in files if f.endswith('.json')] 36 | 37 | for f in files: 38 | file_dir = os.path.join(subdir, f) 39 | 40 | with open(file_dir) as inf: 41 | data = json.load(inf) 42 | 43 | users.extend(data['users']) 44 | num_samples.extend(data['num_samples']) 45 | 46 | return users, num_samples 47 | 48 | def print_dataset_stats(name): 49 | users, num_samples = load_data(name) 50 | num_users = len(users) 51 | 52 | print('####################################') 53 | print('DATASET: %s' % name) 54 | print('%d users' % num_users) 55 | print('%d samples (total)' % np.sum(num_samples)) 56 | print('%.2f samples per user (mean)' % 
np.mean(num_samples)) 57 | print('num_samples (std): %.2f' % np.std(num_samples)) 58 | print('num_samples (std/mean): %.2f' % (np.std(num_samples)/np.mean(num_samples))) 59 | print('num_samples (skewness): %.2f' % stats.skew(num_samples)) 60 | 61 | bins = [0,20,40,60,80,100,120,140,160,180,200] 62 | if args.name == 'shakespeare': 63 | bins = [0,2000,4000,6000,8000,10000,12000,14000,16000,18000,20000] 64 | if args.name == 'nist': 65 | bins = [0,20,40,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340,360,380,400,420,440,460,480,500] 66 | 67 | hist, edges = np.histogram(num_samples,bins=bins) 68 | print("\nnum_sam\tnum_users") 69 | for e, h in zip(edges, hist): 70 | print(e, "\t", h) 71 | 72 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 73 | data_dir = os.path.join(parent_path, 'data', name, 'data') 74 | 75 | plt.hist(num_samples, bins = bins) 76 | fig_name = "%s_hist_nolabel.png" % name 77 | fig_dir = os.path.join(data_dir, fig_name) 78 | plt.savefig(fig_dir) 79 | plt.title(name) 80 | plt.xlabel("number of samples") 81 | plt.ylabel("number of users") 82 | fig_name = "%s_hist.png" % name 83 | fig_dir = os.path.join(data_dir, fig_name) 84 | plt.savefig(fig_dir) 85 | 86 | print_dataset_stats(args.name) -------------------------------------------------------------------------------- /utils/tf_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def __num_elems(shape): 5 | '''Returns the number of elements in the given shape 6 | 7 | Args: 8 | shape: TensorShape 9 | 10 | Return: 11 | tot_elems: int 12 | ''' 13 | tot_elems = 1 14 | for s in shape: 15 | tot_elems *= int(s) 16 | return tot_elems 17 | 18 | def graph_size(graph): 19 | '''Returns the size of the given graph in bytes 20 | 21 | The size of the graph is calculated by summing up the sizes of each 22 | trainable variable. 
The sizes of variables are calculated by multiplying 23 | the number of bytes in their dtype with their number of elements, captured 24 | in their shape attribute 25 | 26 | Args: 27 | graph: TF graph 28 | Return: 29 | integer representing size of graph (in bytes) 30 | ''' 31 | tot_size = 0 32 | with graph.as_default(): 33 | vs = tf.trainable_variables() 34 | for v in vs: 35 | tot_elems = __num_elems(v.shape) 36 | dtype_size = int(v.dtype.size) 37 | var_size = tot_elems * dtype_size 38 | tot_size += var_size 39 | return tot_size 40 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | def save_obj(obj, name): 4 | with open(name + '.pkl', 'wb') as f: 5 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 6 | 7 | def load_obj(name): 8 | with open(name + '.pkl', 'rb') as f: 9 | return pickle.load(f) 10 | 11 | def iid_divide(l, g): 12 | ''' 13 | divide list l among g groups 14 | each group has either int(len(l)/g) or int(len(l)/g)+1 elements 15 | returns a list of groups 16 | ''' 17 | num_elems = len(l) 18 | group_size = int(len(l)/g) 19 | num_big_groups = num_elems - g * group_size 20 | num_small_groups = g - num_big_groups 21 | glist = [] 22 | for i in range(num_small_groups): 23 | glist.append(l[group_size*i:group_size*(i+1)]) 24 | bi = group_size*num_small_groups 25 | group_size += 1 26 | for i in range(num_big_groups): 27 | glist.append(l[bi+group_size*i:bi+group_size*(i+1)]) 28 | return glist --------------------------------------------------------------------------------