33 |
34 |
40 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.01 --rho 1.4 --times 1
41 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.03 --rho 1.4 --times 1
42 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.05 --rho 1.4 --times 1
43 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.07 --rho 1.4 --times 1
44 |
45 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.01 --rho 2 --times 1
46 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.03 --rho 2 --times 1
47 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.05 --rho 2 --times 1
48 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.07 --rho 2 --times 1
49 |
50 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.01 --rho 5 --times 1
51 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.03 --rho 5 --times 1
52 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.05 --rho 5 --times 1
53 | python3 -u main.py --dataset Linear_synthetic --algorithm FEDL --model linear --num_global_iters 200 --clients_per_round 100 --batch_size 0 --local_epochs 20 --learning_rate 0.04 --hyper_learning_rate 0.07 --rho 5 --times 1
54 |
55 | - All training loss, test accuracy, and training accuracy values are stored as .h5 files (h5py) in the "results" folder; the snippet after this list shows one way to inspect them.
56 | - To produce the figure for linear regression, run `python3 plot_linear.py`.
57 | - Note that all users are selected for the synthetic data, so each synthetic experiment only needs to be run once.
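A minimal way to inspect one of these result files with h5py (the file name below is a placeholder; real names in "results" encode the dataset, algorithm, and hyperparameters of the run):

```
import h5py

# Placeholder path: actual file names in "results" depend on the run configuration.
with h5py.File("results/Linear_synthetic_FEDL_example.h5", "r") as f:
    print(list(f.keys()))        # names of the stored curves
    for name in f.keys():
        print(name, f[name][:])  # e.g. one loss/accuracy value per global round
```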
58 |
59 | - For MNIST, run the commands below:
60 |
61 | python3 -u main.py --dataset Mnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0.2 --rho 0 --times 10
62 | python3 -u main.py --dataset Mnist --algorithm FedAvg --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0 --rho 0 --times 10
63 |
64 | python3 -u main.py --dataset Mnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 40 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0.2 --rho 0 --times 10
65 | python3 -u main.py --dataset Mnist --algorithm FedAvg --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 40 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0 --rho 0 --times 10
66 |
67 | python3 -u main.py --dataset Mnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0.2 --rho 0 --times 10
68 | python3 -u main.py --dataset Mnist --algorithm FedAvg --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0 --rho 0 --times 10
69 |
70 | python3 -u main.py --dataset Mnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 2 --rho 0 --times 10
71 | python3 -u main.py --dataset Mnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 4 --rho 0 --times 10
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 | - To produce the figure for MNIST, run:
80 | python3 plot_mnist.py
81 |
82 | - For FEMNIST, run the commands below:
83 |
84 | python3 -u main.py --dataset Femnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 10 --learning_rate 0.003 --hyper_learning_rate 0.2 --rho 0 --times 10
85 | python3 -u main.py --dataset Femnist --algorithm FedAvg --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 10 --learning_rate 0.003 --hyper_learning_rate 0 --rho 0 --times 10
86 | python3 -u main.py --dataset Femnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 10 --learning_rate 0.015 --hyper_learning_rate 0.5 --rho 0 --times 10
87 |
88 | python3 -u main.py --dataset Femnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0.2 --rho 0 --times 10
89 | python3 -u main.py --dataset Femnist --algorithm FedAvg --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 20 --learning_rate 0.003 --hyper_learning_rate 0 --rho 0 --times 10
90 | python3 -u main.py --dataset Femnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 20 --learning_rate 0.015 --hyper_learning_rate 0.5 --rho 0 --times 10
91 |
92 | python3 -u main.py --dataset Femnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 40 --learning_rate 0.003 --hyper_learning_rate 0.2 --rho 0 --times 10
93 | python3 -u main.py --dataset Femnist --algorithm FedAvg --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 20 --local_epochs 40 --learning_rate 0.003 --hyper_learning_rate 0 --rho 0 --times 10
94 | python3 -u main.py --dataset Femnist --algorithm FEDL --model mclr --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 40 --learning_rate 0.015 --hyper_learning_rate 0.5 --rho 0 --times 10
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 | - To produce the figure for FEMNIST, run:
103 | python3 plot_femnist.py
104 |
105 | - For the non-convex experiment (DNN model on the MNIST dataset), run the commands below.
106 | Note that FEDL is unstable with small mini-batch sizes (for example 20):
107 |
108 | python3 -u main.py --dataset Mnist --algorithm FEDL --model dnn --num_global_iters 800 --clients_per_round 10 --batch_size 40 --local_epochs 20 --learning_rate 0.0015 --hyper_learning_rate 0.8 --rho 0 --times 10
109 | python3 -u main.py --dataset Mnist --algorithm FEDL --model dnn --num_global_iters 800 --clients_per_round 10 --batch_size 0 --local_epochs 20 --learning_rate 0.0015 --hyper_learning_rate 4.0 --rho 0 --times 10
110 |
111 |
--------------------------------------------------------------------------------
/data/Femnist/README.md:
--------------------------------------------------------------------------------
1 | # EMNIST Dataset
2 |
3 | ## Setup Instructions
4 | - pip3 install numpy
5 | - pip3 install pillow
6 | - Run ```./preprocess.sh``` with a choice of the following tags:
7 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. is included in the 'Notes' section
8 | - ```--iu``` := number of users, if iid sampling; expressed as a fraction of the total number of users; default is 0.01
9 | - ```--sf``` := fraction of data to sample, written as a decimal; default is 0.1
10 | - ```-k``` := minimum number of samples per user
11 | - ```-t``` := 'user' to partition users into train-test groups, or 'sample' to partition each user's samples into train-test groups
12 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9
13 | - ```--nu``` := total number of users generated
14 |
15 | Command used to generate EMNIST with 100 users:
16 |
17 | ```
18 | ./preprocess.sh -s niid --sf 1.0 -k 0 --tf 0.8 -t sample --nu 100
19 | ```
20 |
21 |
22 |
23 |
24 | (Make sure to delete the rem\_user\_data, sampled\_data, test, and train subfolders in the data directory before re-running preprocess.sh.)
25 |
26 | Or you can download the dataset [here](https://drive.google.com/open?id=1sHzD4IsgEI5xLy6cqwUjSGW0PwiduPHr), unzip it and put the `train` and `test` folder under `data`.
27 |
--------------------------------------------------------------------------------
/data/Femnist/data/my_sample.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import json
3 | import math
4 | import numpy as np
5 | import os
6 | import sys
7 | import random
8 | from tqdm import trange
9 |
10 | from PIL import Image
11 |
12 | NUM_USER = 50
13 | CLASS_PER_USER = 19
14 |
15 |
16 | def relabel_class(c):
17 | '''
18 | maps hexadecimal class value (string) to a decimal number
19 | returns:
20 | - 0 through 9 for classes representing respective numbers
21 | - 10 through 35 for classes representing respective uppercase letters
22 | - 36 through 61 for classes representing respective lowercase letters
23 | '''
24 | if c.isdigit() and int(c) < 40:
25 | return (int(c) - 30)
26 | elif int(c, 16) <= 90: # uppercase
27 | return (int(c, 16) - 55)
28 | else:
29 | return (int(c, 16) - 61)
30 |
31 | def load_image(file_name):
32 | '''read in a png
33 | Return: a flattened list representing the image
34 | '''
35 | size = (28, 28)
36 | img = Image.open(file_name)
37 | gray = img.convert('L')
38 | gray.thumbnail(size, Image.ANTIALIAS)
39 | arr = np.asarray(gray).copy()
40 | vec = arr.flatten()
41 | vec = vec / 255 # scale all pixel values to between 0 and 1
42 | vec = vec.tolist()
43 |
44 | return vec
45 |
46 |
47 | def main():
48 | file_dir = "raw_data/by_class"
49 |
50 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
51 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
52 |
53 | train_path = "train/mytrain.json"
54 | test_path = "test/mytest.json"
55 |
56 | X = [[] for _ in range(NUM_USER)]
57 | y = [[] for _ in range(NUM_USER)]
58 |
59 | nist_data = {}
60 |
61 |
62 | for class_ in os.listdir(file_dir):
63 |
64 | real_class = relabel_class(class_)
65 |
66 | if real_class >= 36 and real_class <= 61:
67 |
68 | full_img_path = file_dir + "/" + class_ + "/train_" + class_
69 | all_files_this_class = os.listdir(full_img_path)
70 | random.shuffle(all_files_this_class)
71 | sampled_files_this_class = all_files_this_class[:7000]
72 | imgs = []
73 | for img in sampled_files_this_class:
74 | imgs.append(load_image(full_img_path + "/" + img))
75 | class_ = relabel_class(class_)
76 | print(class_)
77 | nist_data[class_-36] = imgs # a list of list, key is (0, 25)
78 | print(len(imgs))
79 |
80 | # assign samples to users by power law
81 | num_samples = np.random.lognormal(4, 2, (NUM_USER)) + 5
82 |
83 | idx = np.zeros(26, dtype=np.int64)
84 |
85 | for user in range(NUM_USER):
86 | num_sample_per_class = int(num_samples[user]/CLASS_PER_USER)
87 | if num_sample_per_class < 2:
88 | num_sample_per_class = 2
89 |
90 | for j in range(CLASS_PER_USER):
91 | class_id = (user + j) % 26
92 | if idx[class_id] + num_sample_per_class >= len(nist_data[class_id]):
93 | idx[class_id] = 0  # not enough samples left in this class, wrap the cursor around
94 | X[user] += nist_data[class_id][idx[class_id] : (idx[class_id] + num_sample_per_class)]
95 | y[user] += (class_id * np.ones(num_sample_per_class)).tolist()
96 | idx[class_id] += num_sample_per_class
97 |
98 | # Create data structure
99 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
100 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
101 |
102 | for i in trange(NUM_USER, ncols=120):
103 | uname = 'f_{0:05d}'.format(i)
104 |
105 | combined = list(zip(X[i], y[i]))
106 | random.shuffle(combined)
107 | X[i][:], y[i][:] = zip(*combined)
108 | num_samples = len(X[i])
109 | train_len = int(0.9 * num_samples)
110 | test_len = num_samples - train_len
111 |
112 | train_data['users'].append(uname)
113 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
114 | train_data['num_samples'].append(train_len)
115 | test_data['users'].append(uname)
116 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
117 | test_data['num_samples'].append(test_len)
118 |
119 |
120 | with open(train_path,'w') as outfile:
121 | json.dump(train_data, outfile)
122 | with open(test_path, 'w') as outfile:
123 | json.dump(test_data, outfile)
124 |
125 |
126 | if __name__ == "__main__":
127 | main()
128 |
129 |
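A quick sanity check of the relabel_class mapping above (the NIST by_class folder names are hex character codes: '30'-'39' digits, '41'-'5a' uppercase, '61'-'7a' lowercase). Illustrative only; it assumes relabel_class from my_sample.py is in scope:

```
# Illustrative check, assuming relabel_class from my_sample.py is in scope.
for c, expected in [("30", 0), ("39", 9),     # digits    -> 0..9
                    ("41", 10), ("5a", 35),   # uppercase -> 10..35
                    ("61", 36), ("7a", 61)]:  # lowercase -> 36..61
    assert relabel_class(c) == expected, (c, relabel_class(c))
```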
--------------------------------------------------------------------------------
/data/Femnist/data/nist_generator.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import json
3 | import math
4 | import numpy as np
5 | import os
6 | import sys
7 | import random
8 | from tqdm import trange
9 |
10 | from PIL import Image
11 |
12 | NUM_USER = 50
13 | CLASS_PER_USER = 50
14 | FEMNIST = True # True: generate data with the full 62 labels; False: only the 26 lowercase labels
15 | SAMPLE_NUM_MEAN = 400
16 | SAMPLE_NUM_STD = 110
17 |
18 |
19 | def relabel_class(c):
20 | '''
21 | maps hexadecimal class value (string) to a decimal number
22 | returns:
23 | - 0 through 9 for classes representing respective numbers : total 10
24 | - 10 through 35 for classes representing respective uppercase letters : 26
25 | - 36 through 61 for classes representing respective lowercase letters : 26
26 | - in total there are 10 + 26 + 26 = 62 classes for FEMNIST, and only the 26 lowercase classes (36-61) for FEMNIST*
27 | '''
28 | if c.isdigit() and int(c) < 40:
29 | return (int(c) - 30)
30 | elif int(c, 16) <= 90: # uppercase
31 | return (int(c, 16) - 55)
32 | else:
33 | return (int(c, 16) - 61)
34 |
35 |
36 | def load_image(file_name):
37 | '''read in a png
38 | Return: a flattened list representing the image
39 | '''
40 | size = (28, 28)
41 | img = Image.open(file_name)
42 | gray = img.convert('L')
43 | gray.thumbnail(size, Image.ANTIALIAS)
44 | arr = np.asarray(gray).copy()
45 | vec = arr.flatten()
46 | vec = vec / 255 # scale all pixel values to between 0 and 1
47 | vec = vec.tolist()
48 |
49 | return vec
50 |
51 |
52 | def main():
53 | file_dir = "raw_data/by_class"
54 |
55 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
56 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
57 | if(FEMNIST):
58 | train_path = "train/nisttrain.json"
59 | test_path = "test/nisttest.json"
60 | else:
61 | train_path = "train/femnisttrain.json"
62 | test_path = "test/femnisttest.json"
63 |
64 | X = [[] for _ in range(NUM_USER)]
65 | y = [[] for _ in range(NUM_USER)]
66 |
67 | nist_data = {}
68 |
69 | for class_ in os.listdir(file_dir):
70 |
71 | real_class = relabel_class(class_)
72 |
73 | if(FEMNIST):
74 | full_img_path = file_dir + "/" + class_ + "/train_" + class_
75 | all_files_this_class = os.listdir(full_img_path)
76 | random.shuffle(all_files_this_class)
77 | sampled_files_this_class = all_files_this_class[:7000]
78 | imgs = []
79 | for img in sampled_files_this_class:
80 | imgs.append(load_image(full_img_path + "/" + img))
81 | class_ = relabel_class(class_)
82 | print("Class:", class_)
83 | nist_data[class_] = imgs # a list of lists, keys span (0, 61) in the full-FEMNIST branch
84 | print("Image len:", len(imgs))
85 |
86 | else:
87 | if real_class >= 36 and real_class <= 61:
88 | full_img_path = file_dir + "/" + class_ + "/train_" + class_
89 | all_files_this_class = os.listdir(full_img_path)
90 | random.shuffle(all_files_this_class)
91 | sampled_files_this_class = all_files_this_class[:7000]
92 | imgs = []
93 | for img in sampled_files_this_class:
94 | imgs.append(load_image(full_img_path + "/" + img))
95 | class_ = relabel_class(class_)
96 | print(class_)
97 | nist_data[class_-36] = imgs # a list of list, key is (0, 25)
98 | print(len(imgs))
99 |
100 | # assign samples to users by power law
101 | normal_std = np.sqrt(np.log(1 + (SAMPLE_NUM_STD/SAMPLE_NUM_MEAN)**2))
102 | normal_mean = np.log(SAMPLE_NUM_MEAN) - normal_std**2 / 2
103 |
104 | num_samples = np.random.lognormal(normal_mean, normal_std, (NUM_USER)) + 5
105 | #num_samples = np.random.normal(SAMPLE_NUM_MEAN,SAMPLE_NUM_STD,(NUM_USER))
106 |
107 | if(FEMNIST):
108 | idx = np.zeros(62, dtype=np.int64)
109 | else:
110 | idx = np.zeros(26, dtype=np.int64)
111 |
112 | for user in range(NUM_USER):
113 | num_sample_per_class = int(num_samples[user]/CLASS_PER_USER)
114 | if num_sample_per_class < 2:
115 | num_sample_per_class = 2
116 |
117 | for j in range(CLASS_PER_USER):
118 | if(FEMNIST):
119 | class_id = (user + j) % 62
120 | else:
121 | class_id = (user + j) % 26
122 |
123 | if idx[class_id] + num_sample_per_class >= len(nist_data[class_id]):
124 | idx[class_id] = 0  # not enough samples left in this class, wrap the cursor around
125 | X[user] += nist_data[class_id][idx[class_id]
126 | : (idx[class_id] + num_sample_per_class)]
127 | y[user] += (class_id * np.ones(num_sample_per_class)).tolist()
128 | idx[class_id] += num_sample_per_class
129 |
130 | # Create data structure
131 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
132 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
133 |
134 | for i in trange(NUM_USER, ncols=120):
135 | uname = 'f_{0:05d}'.format(i)
136 |
137 | combined = list(zip(X[i], y[i]))
138 | random.shuffle(combined)
139 | X[i][:], y[i][:] = zip(*combined)
140 | num_samples = len(X[i])
141 | train_len = int(0.9 * num_samples)
142 | test_len = num_samples - train_len
143 |
144 | train_data['users'].append(uname)
145 | train_data['user_data'][uname] = {
146 | 'x': X[i][:train_len], 'y': y[i][:train_len]}
147 | train_data['num_samples'].append(train_len)
148 | test_data['users'].append(uname)
149 | test_data['user_data'][uname] = {
150 | 'x': X[i][train_len:], 'y': y[i][train_len:]}
151 | test_data['num_samples'].append(test_len)
152 |
153 | with open(train_path, 'w') as outfile:
154 | json.dump(train_data, outfile)
155 | with open(test_path, 'w') as outfile:
156 | json.dump(test_data, outfile)
157 |
158 |
159 | if __name__ == "__main__":
160 | main()
161 |
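The normal_mean / normal_std lines above are the standard moment-matching reparameterization of a lognormal: if Y = exp(N(m, s^2)) then E[Y] = exp(m + s^2/2) and Var[Y] = (exp(s^2) - 1) * exp(2m + s^2), so s^2 = log(1 + (std/mean)^2) and m = log(mean) - s^2/2 yield samples whose mean and standard deviation are approximately SAMPLE_NUM_MEAN and SAMPLE_NUM_STD. A small standalone check (not part of the generator):

```
import numpy as np

mu, sigma = 400.0, 110.0                    # SAMPLE_NUM_MEAN, SAMPLE_NUM_STD
s = np.sqrt(np.log(1 + (sigma / mu) ** 2))  # normal_std
m = np.log(mu) - s ** 2 / 2                 # normal_mean

samples = np.random.lognormal(m, s, 200000)
print(samples.mean(), samples.std())        # ~400 and ~110, up to sampling noise
```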
--------------------------------------------------------------------------------
/data/Femnist/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #rm -rf rem_user_data sampled_data test train
4 |
5 | # download data and convert to .json format
6 |
7 | if [ ! -d "data/all_data" ] || [ ! "$(ls -A data/all_data)" ]; then
8 | cd preprocess
9 | ./data_to_json.sh
10 | cd ..
11 | fi
12 |
13 | NAME="nist" # name of the dataset, equivalent to directory name
14 |
15 | cd ../../utils
16 |
17 | # ./preprocess.sh -s niid --sf 0.05 -k 64 -t sample
18 | # ./preprocess.sh --name nist -s niid --sf 1.0 -k 0 -t sample
19 | # ./preprocess.sh --name sent140 -s niid --sf 1.0 -k 1 -t sample
20 | ./preprocess.sh --name $NAME $@
21 |
22 | cd ../data/$NAME
23 |
--------------------------------------------------------------------------------
/data/Femnist/preprocess/data_to_json.py:
--------------------------------------------------------------------------------
1 | # Converts a list of (writer, [list of (file,class)]) tuples into a json object
2 | # of the form:
3 | # {users: [bob, etc], num_samples: [124, etc.],
4 | # user_data: {bob : {x:[img1,img2,etc], y:[class1,class2,etc]}, etc}}
5 | # where 'img_' is a vectorized representation of the corresponding image
6 |
7 | from __future__ import division
8 | import json
9 | import math
10 | import numpy as np
11 | import os
12 | import sys
13 |
14 | from PIL import Image
15 |
16 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
17 | utils_dir = os.path.join(utils_dir, 'utils')
18 | sys.path.append(utils_dir)
19 |
20 | import utils
21 |
22 |
23 | MAX_WRITERS = 100 # max number of writers per json file.
24 |
25 |
26 | def relabel_class(c):
27 | '''
28 | maps hexadecimal class value (string) to a decimal number
29 | returns:
30 | - 0 through 9 for classes representing respective numbers
31 | - 10 through 35 for classes representing respective uppercase letters
32 | - 36 through 61 for classes representing respective lowercase letters
33 | '''
34 | if c.isdigit() and int(c) < 40:
35 | return (int(c) - 30)
36 | elif int(c, 16) <= 90: # uppercase
37 | return (int(c, 16) - 55)
38 | else:
39 | return (int(c, 16) - 61)
40 |
41 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
42 |
43 | ibwd = os.path.join(parent_path, 'data', 'intermediate', 'images_by_writer')
44 | writers = utils.load_obj(ibwd)
45 |
46 | num_json = int(math.ceil(len(writers) / MAX_WRITERS))
47 |
48 | users = [[] for _ in range(num_json)]
49 | num_samples = [[] for _ in range(num_json)]
50 | user_data = [{} for _ in range(num_json)]
51 |
52 | writer_count = 0
53 | json_index = 0
54 | for (w, l) in writers:
55 |
56 | users[json_index].append(w)
57 | num_samples[json_index].append(len(l))
58 | user_data[json_index][w] = {'x': [], 'y': []}
59 |
60 | size = 28, 28 # original image size is 128, 128
61 | for (f, c) in l:
62 | file_path = os.path.join(parent_path, f)
63 | img = Image.open(file_path)
64 | gray = img.convert('L')
65 | gray.thumbnail(size, Image.ANTIALIAS)
66 | arr = np.asarray(gray).copy()
67 | vec = arr.flatten()
68 | vec = vec / 255 # scale all pixel values to between 0 and 1
69 | vec = vec.tolist()
70 |
71 | nc = relabel_class(c)
72 |
73 | user_data[json_index][w]['x'].append(vec)
74 | user_data[json_index][w]['y'].append(nc)
75 |
76 | writer_count += 1
77 | if writer_count == MAX_WRITERS:
78 |
79 | all_data = {}
80 | all_data['users'] = users[json_index]
81 | all_data['num_samples'] = num_samples[json_index]
82 | all_data['user_data'] = user_data[json_index]
83 |
84 | file_name = 'all_data_%d.json' % json_index
85 | file_path = os.path.join(parent_path, 'data', 'all_data', file_name)
86 |
87 | print('writing %s' % file_name)
88 |
89 | with open(file_path, 'w') as outfile:
90 | json.dump(all_data, outfile)
91 |
92 | writer_count = 0
93 | json_index += 1
94 |
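The JSON shards written above follow the structure described in the header comment (users, num_samples, user_data). A minimal sketch for reading one back; the shard name all_data_0.json and the relative path are assumptions based on how this script builds its output paths:

```
import json

# Assumed location of the first shard, relative to data/Femnist.
with open("data/all_data/all_data_0.json") as f:
    all_data = json.load(f)

print(len(all_data["users"]))                     # writers in this shard (<= MAX_WRITERS)
print(sum(all_data["num_samples"]))               # total images in the shard
first = all_data["users"][0]
print(len(all_data["user_data"][first]["x"][0]))  # one flattened 28x28 image = 784 values
```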
--------------------------------------------------------------------------------
/data/Femnist/preprocess/data_to_json.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # assumes that the script is run in the preprocess folder
4 |
5 | if [ ! -d "../data" ]; then
6 | mkdir ../data
7 | fi
8 | if [ ! -d "../data/raw_data" ]; then
9 | echo "------------------------------"
10 | echo "downloading data"
11 | mkdir ../data/raw_data
12 | ./get_data.sh
13 | echo "finished downloading data"
14 | fi
15 |
16 | if [ ! -d "../data/intermediate" ]; then # stores .pkl files during preprocessing
17 | mkdir ../data/intermediate
18 | fi
19 |
20 | if [ ! -f ../data/intermediate/class_file_dirs.pkl ]; then
21 | echo "------------------------------"
22 | echo "extracting file directories of images"
23 | python3 get_file_dirs.py
24 | echo "finished extracting file directories of images"
25 | fi
26 |
27 | if [ ! -f ../data/intermediate/class_file_hashes.pkl ]; then
28 | echo "------------------------------"
29 | echo "calculating image hashes"
30 | python3 get_hashes.py
31 | echo "finished calculating image hashes"
32 | fi
33 |
34 | if [ ! -f ../data/intermediate/write_with_class.pkl ]; then
35 | echo "------------------------------"
36 | echo "assigning class labels to write images"
37 | python3 match_hashes.py
38 | echo "finished assigning class labels to write images"
39 | fi
40 |
41 | if [ ! -f ../data/intermediate/images_by_writer.pkl ]; then
42 | echo "------------------------------"
43 | echo "grouping images by writer"
44 | python3 group_by_writer.py
45 | echo "finished grouping images by writer"
46 | fi
47 |
48 | if [ ! -d "../data/all_data" ]; then
49 | mkdir ../data/all_data
50 | fi
51 | if [ ! "$(ls -A ../data/all_data)" ]; then
52 | echo "------------------------------"
53 | echo "converting data to .json format"
54 | python3 data_to_json.py
55 | echo "finished converting data to .json format"
56 | fi
57 |
--------------------------------------------------------------------------------
/data/Femnist/preprocess/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # assumes that the script is run in the preprocess folder
4 |
5 | cd ../data/raw_data
6 | wget https://s3.amazonaws.com/nist-srd/SD19/by_class.zip
7 | wget https://s3.amazonaws.com/nist-srd/SD19/by_write.zip
8 | unzip by_class.zip
9 | rm by_class.zip
10 | unzip by_write.zip
11 | rm by_write.zip
12 | cd ../../preprocess
13 |
--------------------------------------------------------------------------------
/data/Femnist/preprocess/get_file_dirs.py:
--------------------------------------------------------------------------------
1 | '''
2 | Creates .pkl files for:
3 | 1. list of directories of every image in 'by_class'
4 | 2. list of directories of every image in 'by_write'
5 | the hierarchical structure of the data is as follows:
6 | - by_class -> classes -> folders containing images -> images
7 | - by_write -> folders containing writers -> writer -> types of images -> images
8 | the directories written into the files are of the form 'raw_data/...'
9 | '''
10 |
11 | import os
12 | import sys
13 |
14 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
15 | utils_dir = os.path.join(utils_dir, 'utils')
16 | sys.path.append(utils_dir)
17 |
18 | import utils
19 |
20 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
21 |
22 | class_files = [] # (class, file directory)
23 | write_files = [] # (writer, file directory)
24 |
25 | class_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_class')
26 | rel_class_dir = os.path.join('data', 'raw_data', 'by_class')
27 | classes = os.listdir(class_dir)
28 |
29 | for cl in classes:
30 | cldir = os.path.join(class_dir, cl)
31 | rel_cldir = os.path.join(rel_class_dir, cl)
32 | subcls = os.listdir(cldir)
33 |
34 | subcls = [s for s in subcls if (('hsf' in s) and ('mit' not in s))]
35 |
36 | for subcl in subcls:
37 | subcldir = os.path.join(cldir, subcl)
38 | rel_subcldir = os.path.join(rel_cldir, subcl)
39 | images = os.listdir(subcldir)
40 | image_dirs = [os.path.join(rel_subcldir, i) for i in images]
41 |
42 | for image_dir in image_dirs:
43 | class_files.append((cl, image_dir))
44 |
45 | write_dir = os.path.join(parent_path, 'data', 'raw_data', 'by_write')
46 | rel_write_dir = os.path.join('data', 'raw_data', 'by_write')
47 | write_parts = os.listdir(write_dir)
48 |
49 | for write_part in write_parts:
50 | writers_dir = os.path.join(write_dir, write_part)
51 | rel_writers_dir = os.path.join(rel_write_dir, write_part)
52 | writers = os.listdir(writers_dir)
53 |
54 | for writer in writers:
55 | writer_dir = os.path.join(writers_dir, writer)
56 | rel_writer_dir = os.path.join(rel_writers_dir, writer)
57 | wtypes = os.listdir(writer_dir)
58 |
59 | for wtype in wtypes:
60 | type_dir = os.path.join(writer_dir, wtype)
61 | rel_type_dir = os.path.join(rel_writer_dir, wtype)
62 | images = os.listdir(type_dir)
63 | image_dirs = [os.path.join(rel_type_dir, i) for i in images]
64 |
65 | for image_dir in image_dirs:
66 | write_files.append((writer, image_dir))
67 |
68 | utils.save_obj(
69 | class_files,
70 | os.path.join(parent_path, 'data', 'intermediate', 'class_file_dirs'))
71 | utils.save_obj(
72 | write_files,
73 | os.path.join(parent_path, 'data', 'intermediate', 'write_file_dirs'))
74 |
--------------------------------------------------------------------------------
/data/Femnist/preprocess/get_hashes.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import sys
4 |
5 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
6 | utils_dir = os.path.join(utils_dir, 'utils')
7 | sys.path.append(utils_dir)
8 |
9 | import utils
10 |
11 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
12 |
13 | cfd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_dirs')
14 | wfd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_dirs')
15 | class_file_dirs = utils.load_obj(cfd)
16 | write_file_dirs = utils.load_obj(wfd)
17 |
18 | class_file_hashes = []
19 | write_file_hashes = []
20 |
21 | count = 0
22 | for tup in class_file_dirs:
23 | if (count%100000 == 0):
24 | print('hashed %d class images' % count)
25 |
26 | (cclass, cfile) = tup
27 | file_path = os.path.join(parent_path, cfile)
28 |
29 | chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
30 |
31 | class_file_hashes.append((cclass, cfile, chash))
32 |
33 | count += 1
34 |
35 | cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes')
36 | utils.save_obj(class_file_hashes, cfhd)
37 |
38 | count = 0
39 | for tup in write_file_dirs:
40 | if (count%100000 == 0):
41 | print('hashed %d write images' % count)
42 |
43 | (cclass, cfile) = tup
44 | file_path = os.path.join(parent_path, cfile)
45 |
46 | chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
47 |
48 | write_file_hashes.append((cclass, cfile, chash))
49 |
50 | count += 1
51 |
52 | wfhd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_hashes')
53 | utils.save_obj(write_file_hashes, wfhd)
54 |
--------------------------------------------------------------------------------
/data/Femnist/preprocess/group_by_writer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
5 | utils_dir = os.path.join(utils_dir, 'utils')
6 | sys.path.append(utils_dir)
7 |
8 | import utils
9 |
10 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
11 |
12 | wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class')
13 | write_class = utils.load_obj(wwcd)
14 |
15 | writers = [] # each entry is a (writer, [list of (file, class)]) tuple
16 | cimages = []
17 | (cw, _, _) = write_class[0]
18 | for (w, f, c) in write_class:
19 | if w != cw:
20 | writers.append((cw, cimages))
21 | cw = w
22 | cimages = [(f, c)]
23 | cimages.append((f, c))
24 | writers.append((cw, cimages))
25 |
26 | ibwd = os.path.join(parent_path, 'data', 'intermediate', 'images_by_writer')
27 | utils.save_obj(writers, ibwd)
28 |
--------------------------------------------------------------------------------
/data/Femnist/preprocess/match_hashes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | utils_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
5 | utils_dir = os.path.join(utils_dir, 'utils')
6 | sys.path.append(utils_dir)
7 |
8 | import utils
9 |
10 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
11 |
12 | cfhd = os.path.join(parent_path, 'data', 'intermediate', 'class_file_hashes')
13 | wfhd = os.path.join(parent_path, 'data', 'intermediate', 'write_file_hashes')
14 | class_file_hashes = utils.load_obj(cfhd) # each elem is (class, file dir, hash)
15 | write_file_hashes = utils.load_obj(wfhd) # each elem is (writer, file dir, hash)
16 |
17 | class_hash_dict = {}
18 | for i in range(len(class_file_hashes)):
19 | (c, f, h) = class_file_hashes[len(class_file_hashes)-i-1]
20 | class_hash_dict[h] = (c, f)
21 |
22 | write_classes = []
23 | for tup in write_file_hashes:
24 | (w, f, h) = tup
25 | write_classes.append((w, f, class_hash_dict[h][0]))
26 |
27 | wwcd = os.path.join(parent_path, 'data', 'intermediate', 'write_with_class')
28 | utils.save_obj(write_classes, wwcd)
29 |
--------------------------------------------------------------------------------
/data/Femnist/stats.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | NAME="nist"
4 |
5 | cd ../../utils
6 |
7 | python3 stats.py --name $NAME
8 |
9 | cd ../data/$NAME
--------------------------------------------------------------------------------
/data/Linear_synthetic/data/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharlieDinh/FEDL_pytorch/4db34e5b698d46e2f73b94fb9c0ce00ef9b464f4/data/Linear_synthetic/data/README.md
--------------------------------------------------------------------------------
/data/Linear_synthetic/generate_linear_regession.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import numpy as np
3 | import json
4 | import random
5 | import os
6 | np.random.seed(0)
7 |
8 | NUM_USER = 100
9 | Kappa = 1.4
10 | Dim = 40
11 | Noise = 0.05
12 |
13 | def generate_x(n_samples = 100, dim= 40, kappa= 10):
14 | '''Helper function to generate data'''
15 |
16 | powers = - np.log(kappa) / np.log(dim) / 2
17 |
18 | S = np.power(np.arange(dim)+1, powers)
19 | X = np.random.randn(n_samples, dim) # Random standard Gaussian data
20 | X *= S
21 | covariance_matrix = np.cov(X)
22 | print("Covariance matrix:", covariance_matrix) # conditioning check
23 | print("np.diag(S)", np.diag(S))
24 | return X, 1, 1/kappa, np.diag(S)
25 |
26 | def generate_linear_data(num_users=100, kappa=10, dim=40, noise_ratio=0.05):
27 |
28 | '''Helper function to generate data'''
29 | # generate power S
30 | powers = - np.log(kappa) / np.log(dim) / 2
31 | DIM = np.arange(dim)
32 |
33 | # Covariance matrix for X
34 | S = np.power(DIM+1, powers)
35 |
36 | # Create list of data for all users
37 | X_split = [[] for _ in range(num_users)] # X for each user
38 | y_split = [[] for _ in range(num_users)] # y for each user
39 | samples_per_user = np.random.lognormal(4, 2, num_users).astype(int) + 500
40 | indices_per_user = np.insert(samples_per_user.cumsum(), 0, 0, 0)
41 | num_total_samples = indices_per_user[-1]
42 |
43 | # Create mean of data for each user, each user will have different distribution
44 | mean_X = np.array([np.random.randn(dim) for _ in range(num_users)])
45 |
46 |
47 | X_total = np.zeros((num_total_samples, dim))
48 | y_total = np.zeros(num_total_samples)
49 |
50 | for n in range(num_users):
51 | # Generate data
52 | X_n = np.random.multivariate_normal(mean_X[n], np.diag(S), samples_per_user[n])
53 | X_total[indices_per_user[n]:indices_per_user[n+1], :] = X_n
54 |
55 | # Normalize X so that the largest eigenvalue of X^T X / num_total_samples is 1
56 | norm = np.sqrt(np.linalg.norm(X_total.T.dot(X_total), 2) / num_total_samples)
57 | X_total /= norm
58 |
59 | # Generate weights and labels
60 | W = np.random.rand(dim)
61 | y_total = X_total.dot(W)
62 | noise_variance = 0.01
63 | y_total = y_total + np.sqrt(noise_ratio) * np.random.randn(num_total_samples)
64 |
65 | for n in range(num_users):
66 | X_n = X_total[indices_per_user[n]:indices_per_user[n+1], :]
67 | y_n = y_total[indices_per_user[n]:indices_per_user[n+1]]
68 | X_split[n] = X_n.tolist()
69 | y_split[n] = y_n.tolist()
70 |
71 | # print("User {} has {} samples.".format(n, samples_per_user[n]))
72 |
73 | print("=" * 80)
74 | print("Generated synthetic data for logistic regression successfully.")
75 | print("Summary of the generated data:".format(kappa))
76 | print(" Total # users : {}".format(num_users))
77 | print(" Input dimension : {}".format(dim))
78 | print(" rho : {}".format(kappa))
79 | print(" Total # of samples : {}".format(num_total_samples))
80 | print(" Minimum # of samples: {}".format(np.min(samples_per_user)))
81 | print(" Maximum # of samples: {}".format(np.max(samples_per_user)))
82 | print("=" * 80)
83 |
84 | return X_split, y_split
85 |
86 |
87 | def save_total_data():
88 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
89 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
90 |
91 | train_path = os.path.join("data", "train", "mytrain.json")
92 | test_path = os.path.join("data", "test", "mytest.json")
93 | for path in [os.path.join("data", "train"), os.path.join("data", "test")]:
94 | if not os.path.exists(path):
95 | os.makedirs(path)
96 |
97 | X, y = generate_linear_data(NUM_USER, Kappa, Dim, Noise)
98 |
99 | # Create data structure
100 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
101 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
102 |
103 | for i in range(NUM_USER):
104 | uname = 'f_{0:05d}'.format(i)
105 | combined = list(zip(X[i], y[i]))
106 | random.shuffle(combined)
107 | X[i][:], y[i][:] = zip(*combined)
108 | num_samples = len(X[i])
109 | train_len = int(0.75 * num_samples)
110 | test_len = num_samples - train_len
111 | print("User: ",uname, " Num Sample: ", num_samples )
112 | train_data['users'].append(uname)
113 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
114 | train_data['num_samples'].append(train_len)
115 | test_data['users'].append(uname)
116 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
117 | test_data['num_samples'].append(test_len)
118 |
119 | with open(train_path, 'w') as outfile:
120 | json.dump(train_data, outfile)
121 | with open(test_path, 'w') as outfile:
122 | json.dump(test_data, outfile)
123 |
124 | print("=" * 80)
125 | print("Saved all users' data sucessfully.")
126 | print(" Train path:", os.path.join(os.curdir, train_path))
127 | print(" Test path :", os.path.join(os.curdir, test_path))
128 | print("=" * 80)
129 |
130 |
131 | def main():
132 | #generate_x()
133 | save_total_data()
134 |
135 |
136 | if __name__ == '__main__':
137 | main()
138 |
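For reference, the powers exponent above fixes the spread of the diagonal S: with powers = -log(kappa) / (2 * log(dim)), the largest entry is 1**powers = 1 and the smallest is dim**powers = kappa**(-1/2), so the squared entries span [1/kappa, 1]. A quick standalone check of that range (illustrative only, not part of the generator):

```
import numpy as np

kappa, dim = 1.4, 40                    # the Kappa and Dim constants used in this script
powers = -np.log(kappa) / np.log(dim) / 2
S = np.power(np.arange(dim) + 1, powers)

print(S.max(), S.min(), kappa ** -0.5)  # S runs from 1 down to kappa**-0.5
print((S.max() / S.min()) ** 2)         # ratio of the extreme S**2 values == kappa
```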
--------------------------------------------------------------------------------
/data/Linear_synthetic/generate_linear_regession_updated.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import numpy as np
3 | import json
4 | import random
5 | import os
6 | np.random.seed(0)
7 |
8 | NUM_USER = 100
9 | Kappa = 1.4
10 | Dim = 40
11 | Noise = 0.05
12 |
13 | def generate_x(n_samples = 100, dim= 40, kappa= 10):
14 | '''Helper function to generate data'''
15 |
16 | powers = - np.log(kappa) / np.log(dim) / 2
17 |
18 | S = np.power(np.arange(dim)+1, powers)
19 | X = np.random.randn(n_samples, dim) # Random standard Gaussian data
20 | X *= S
21 | covariance_matrix = np.cov(X)
22 | print("Covariance matrix:", covariance_matrix) # conditioning check
23 | print("np.diag(S)", np.diag(S))
24 | return X, 1, 1/kappa, np.diag(S)
25 |
26 | def generate_linear_data(num_users=100, kappa=10, dim=40, noise_ratio=0.05):
27 |
28 | '''Helper function to generate data'''
29 | # generate power S
30 | powers = - np.log(kappa) / np.log(dim) / 2
31 | DIM = np.arange(dim)
32 |
33 | # Covariance matrix for X
34 | S = np.power(DIM+1, powers)
35 |
36 | # Create list of data for all users
37 | X_split = [[] for _ in range(num_users)] # X for each user
38 | y_split = [[] for _ in range(num_users)] # y for each user
39 | samples_per_user = np.random.lognormal(4, 2, num_users).astype(int) + 500
40 | indices_per_user = np.insert(samples_per_user.cumsum(), 0, 0, 0)
41 | num_total_samples = indices_per_user[-1]
42 |
43 | # Create a single shared mean for the data (in this updated version all users draw from the same distribution)
44 | sig = np.random.uniform(0.1, 10)
45 | mean = np.random.uniform(low=-0.1, high=0.1)
46 | cov = np.random.uniform(low=0.0, high=0.01)
47 | #print("mean -cov", mean,cov)
48 | mean_X = np.random.normal(mean, cov, dim)
49 |
50 | X_total = np.zeros((num_total_samples, dim))
51 | y_total = np.zeros(num_total_samples)
52 |
53 | for n in range(num_users):
54 | # Generate data
55 | X_n = np.random.multivariate_normal(mean_X, sig * np.diag(S), samples_per_user[n])
56 | X_total[indices_per_user[n]:indices_per_user[n+1], :] = X_n
57 |
58 | # Normalize X so that the largest eigenvalue of X^T X / num_total_samples is 1
59 | norm = np.sqrt(np.linalg.norm(X_total.T.dot(X_total), 2) / num_total_samples)
60 | X_total /= norm
61 |
62 | # Generate weights and labels
63 | W = np.random.rand(dim)
64 | y_total = X_total.dot(W)
65 | noise_variance = 0.01
66 | y_total = y_total + np.sqrt(noise_ratio) * np.random.randn(num_total_samples)
67 |
68 | for n in range(num_users):
69 | X_n = X_total[indices_per_user[n]:indices_per_user[n+1], :]
70 | y_n = y_total[indices_per_user[n]:indices_per_user[n+1]]
71 | X_split[n] = X_n.tolist()
72 | y_split[n] = y_n.tolist()
73 |
74 | # print("User {} has {} samples.".format(n, samples_per_user[n]))
75 |
76 | print("=" * 80)
77 | print("Generated synthetic data for logistic regression successfully.")
78 | print("Summary of the generated data:".format(kappa))
79 | print(" Total # users : {}".format(num_users))
80 | print(" Input dimension : {}".format(dim))
81 | print(" rho : {}".format(kappa))
82 | print(" Total # of samples : {}".format(num_total_samples))
83 | print(" Minimum # of samples: {}".format(np.min(samples_per_user)))
84 | print(" Maximum # of samples: {}".format(np.max(samples_per_user)))
85 | print("=" * 80)
86 |
87 | return X_split, y_split
88 |
89 |
90 | def save_total_data():
91 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
92 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
93 |
94 | train_path = os.path.join("data", "train", "mytrain.json")
95 | test_path = os.path.join("data", "test", "mytest.json")
96 | for path in [os.path.join("data", "train"), os.path.join("data", "test")]:
97 | if not os.path.exists(path):
98 | os.makedirs(path)
99 |
100 | X, y = generate_linear_data(NUM_USER, Kappa, Dim, Noise)
101 |
102 | # Create data structure
103 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
104 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
105 |
106 | for i in range(NUM_USER):
107 | uname = 'f_{0:05d}'.format(i)
108 | combined = list(zip(X[i], y[i]))
109 | random.shuffle(combined)
110 | X[i][:], y[i][:] = zip(*combined)
111 | num_samples = len(X[i])
112 | train_len = int(0.75 * num_samples)
113 | test_len = num_samples - train_len
114 | print("User: ",uname, " Num Sample: ", num_samples )
115 | train_data['users'].append(uname)
116 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
117 | train_data['num_samples'].append(train_len)
118 | test_data['users'].append(uname)
119 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
120 | test_data['num_samples'].append(test_len)
121 |
122 | with open(train_path, 'w') as outfile:
123 | json.dump(train_data, outfile)
124 | with open(test_path, 'w') as outfile:
125 | json.dump(test_data, outfile)
126 |
127 | print("=" * 80)
128 | print("Saved all users' data sucessfully.")
129 | print(" Train path:", os.path.join(os.curdir, train_path))
130 | print(" Test path :", os.path.join(os.curdir, test_path))
131 | print("=" * 80)
132 |
133 |
134 | def main():
135 | #generate_x()
136 | save_total_data()
137 |
138 |
139 | if __name__ == '__main__':
140 | main()
141 |
--------------------------------------------------------------------------------
/data/Linear_synthetic/generate_linear_synthetic_backup.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | import numpy as np
4 | import os
5 | import sys
6 | import random
7 | from tqdm import trange
8 | import math
9 |
10 |
11 | NUM_USER = 100
12 | def normalize_data(X):
13 |
14 | # normalize all features of the data to the range (0, 1)
15 | normX = X - X.min()
16 | normX = normX / (X.max() - X.min())
17 | #normX = normX*2-1 between (-1 and 1)
18 |
19 | # normalize data so that -1 < X.X^T < 1
20 | temp = normX.dot(normX.T)
21 | return normX/np.sqrt(temp.max())
22 |
23 | def generate_synthetic(alpha = 0.5, beta = 0.5):
24 |
25 | # Generate parameters for controlling kappa
26 | dimension = 60
27 | NUM_CLASS = 1
28 | samples_per_user = np.random.lognormal(4, 2, (NUM_USER)).astype(int) + 100
29 | print(samples_per_user)
30 | num_samples = np.sum(samples_per_user)
31 |
32 | X_split = [[] for _ in range(NUM_USER)]
33 | y_split = [[] for _ in range(NUM_USER)]
34 |
35 | #### define some priors ####
36 | mean_W = np.random.normal(0, alpha, NUM_USER)
37 | mean_b = mean_W
38 | B = np.random.normal(0, beta, NUM_USER)
39 | mean_x = np.zeros((NUM_USER, dimension))
40 |
41 | diagonal = np.zeros(dimension)
42 | for j in range(dimension):
43 | diagonal[j] = np.power((j+1), -1.2)
44 | cov_x = np.diag(diagonal)
45 |
46 | for i in range(NUM_USER):
47 | mean_x[i] = np.random.normal(B[i], 1, dimension)
48 | print(mean_x[i])
49 |
50 | for i in range(NUM_USER):
51 |
52 | W = np.random.normal(mean_W[i], 1, (dimension, NUM_CLASS))
53 | b = np.random.normal(mean_b[i], 1, NUM_CLASS)
54 |
55 | xx = np.random.multivariate_normal(mean_x[i], cov_x, samples_per_user[i])
56 | nom_xx = normalize_data(xx)
57 | yy = np.zeros(samples_per_user[i])
58 |
59 | for j in range(samples_per_user[i]):
60 | yy[j] = np.dot(nom_xx[j], W) + b
61 |
62 | X_split[i] = nom_xx.tolist()
63 | y_split[i] = yy.tolist()
64 |
65 | print("{}-th users has {} exampls".format(i, len(y_split[i])))
66 |
67 | return X_split, y_split
68 |
69 |
70 |
71 | def main():
72 |
73 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
74 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
75 |
76 | train_path = "data/train/mytrain.json"
77 | test_path = "data/test/mytest.json"
78 |
79 | X, y = generate_synthetic(alpha=0.5, beta=0.5) # synthetic (0.5, 0.5)
80 |
81 |
82 | # Create data structure
83 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
84 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
85 |
86 | for i in trange(NUM_USER, ncols=120):
87 |
88 | uname = 'f_{0:05d}'.format(i)
89 | combined = list(zip(X[i], y[i]))
90 | random.shuffle(combined)
91 | X[i][:], y[i][:] = zip(*combined)
92 | num_samples = len(X[i])
93 | train_len = int(0.75 * num_samples)
94 | test_len = num_samples - train_len
95 |
96 | train_data['users'].append(uname)
97 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
98 | train_data['num_samples'].append(train_len)
99 | test_data['users'].append(uname)
100 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
101 | test_data['num_samples'].append(test_len)
102 |
103 |
104 | with open(train_path,'w') as outfile:
105 | json.dump(train_data, outfile)
106 | with open(test_path, 'w') as outfile:
107 | json.dump(test_data, outfile)
108 |
109 |
110 | if __name__ == "__main__":
111 | main()
112 |
113 |
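normalize_data above first rescales all entries of X into [0, 1] and then divides by sqrt(max(X.X^T)), so every entry of the resulting Gram matrix stays inside the -1 < X.X^T < 1 bound mentioned in the comment. A quick standalone check, assuming normalize_data from the script above is in scope:

```
import numpy as np

X = np.random.randn(50, 60)  # arbitrary data, just for the check
Z = normalize_data(X)        # function defined in the script above

print(Z.min(), Z.max())      # entries stay within [0, 1]
print(Z.dot(Z.T).max())      # == 1.0, so no Gram-matrix entry exceeds 1
```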
--------------------------------------------------------------------------------
/data/Linear_synthetic/optimal_solution_finding.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | import numpy as np
4 | import os
5 | import sys
6 | import random
7 | from tqdm import trange
8 | import math
9 | import numpy as np
10 | from sklearn.linear_model import LinearRegression
11 | from sklearn.metrics import mean_squared_error
12 | np.random.seed(0)
13 |
14 | NUM_USER = 100
15 |
16 | def normalize_data(X):
17 |
18 | # normalize all features of the data to the range (0, 1)
19 | normX = X - X.min()
20 | normX = normX / (X.max() - X.min())
21 |
22 | # normalize data so that -1 < X.X^T < 1
23 | temp = normX.dot(normX.T)
24 | return normX/np.sqrt(temp.max())
25 |
26 |
27 | def finding_optimal_synthetic(num_users=100, kappa=10, dim = 40, noise_ratio=0.05):
28 |
29 | powers = - np.log(kappa) / np.log(dim) / 2
30 | DIM = np.arange(dim)
31 | S = np.power(DIM+1, powers)
32 |
33 | # Create list of data for all users
34 | X_split = [[] for _ in range(num_users)] # X for each user
35 | y_split = [[] for _ in range(num_users)] # y for each user
36 | samples_per_user = np.random.lognormal(4, 2, num_users).astype(int) + 500
37 | indices_per_user = np.insert(samples_per_user.cumsum(), 0, 0, 0)
38 | num_total_samples = indices_per_user[-1]
39 |
40 | # Create mean of data for each user, each user will have different distribution
41 | mean_X = np.array([np.random.randn(dim) for _ in range(num_users)])
42 |
43 | # Covariance matrix for X
44 | X_total = np.zeros((num_total_samples, dim))
45 | y_total = np.zeros(num_total_samples)
46 |
47 | for n in range(num_users):
48 | # Generate data
49 | X_n = np.random.multivariate_normal(mean_X[n], np.diag(S), samples_per_user[n])
50 | X_total[indices_per_user[n]:indices_per_user[n+1], :] = X_n
51 |
52 | # Normalize X so that the largest eigenvalue of X^T X / num_total_samples is 1
53 | norm = np.sqrt(np.linalg.norm(X_total.T.dot(X_total), 2) / num_total_samples)
54 | X_total /= norm
55 |
56 | # Generate weights and labels
57 | W = np.random.rand(dim)
58 | y_total = X_total.dot(W)
59 | noise_variance = 0.01
60 | y_total = y_total + np.sqrt(noise_ratio) * np.random.randn(num_total_samples)
61 |
62 | for n in range(num_users):
63 | X_n = X_total[indices_per_user[n]:indices_per_user[n+1],:]
64 | y_n = y_total[indices_per_user[n]:indices_per_user[n+1]]
65 | X_split[n] = X_n.tolist()
66 | y_split[n] = y_n.tolist()
67 |
68 | # split data to get training data
69 | train_x = []
70 | train_y = []
71 | test_x = []
72 | test_y = []
73 | for i in range(NUM_USER):
74 | num_samples = len(X_split[i])
75 | train_len = int(0.75 * num_samples)
76 | test_len = num_samples - train_len
77 | train_x.append(X_split[i][:train_len])
78 | train_y.append(y_split[i][:train_len])
79 | test_x.append(X_split[i][train_len:])
80 | test_y.append(y_split[i][train_len:])
81 |
82 | train_xc = np.concatenate(train_x)
83 | train_yc = np.concatenate(train_y)
84 | test_xc = np.concatenate(test_x)
85 | test_yc = np.concatenate(test_y)
86 |
87 | # # finding optimal
88 | X_X_T = np.zeros(shape=(dim+1,dim+1))
89 | X_Y = np.zeros(shape=(dim+1,1))
90 |
91 | for n in range(num_users):
92 | X = np.array(train_x[n])
93 | y = np.array(train_y[n])
94 | one = np.ones((X.shape[0], 1))
95 | Xbar = np.concatenate((one, X), axis = 1)
96 | X_X_T += Xbar.T.dot(Xbar)*len(y)/len(train_yc)
97 | X_Y += np.array(Xbar).T.dot(y).reshape((dim+1, 1))*len(y)/len(train_yc)
98 |
99 | # get optimal point.
100 | w = np.linalg.inv(X_X_T).dot(X_Y)
101 |
102 | # calculate loss over all devices
103 | loss = 0
104 | for n in range(num_users):
105 | X = np.array(train_x[n])
106 | y = np.array(train_y[n])
107 | one = np.ones((X.shape[0], 1))
108 | Xbar = np.concatenate((one, X), axis = 1)
109 | y_predict = Xbar.dot(w)
110 | loss += mean_squared_error(y, y_predict)*len(y)/len(train_yc)
111 |
112 | return loss
113 |
114 | def main():
115 | loss = 0
116 | loss = finding_optimal_synthetic()
117 | print("loss for train data", loss)
118 |
119 | if __name__ == "__main__":
120 | main()
121 |
122 |
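One small note on the optimum computation above: np.linalg.inv(X_X_T).dot(X_Y) solves the accumulated normal equations by forming the inverse explicitly; np.linalg.solve returns the same vector without the explicit inverse and is the numerically preferred call. A hedged sketch with stand-in matrices of the same shapes as in the script:

```
import numpy as np

dim = 40
A = np.random.rand(dim + 1, dim + 1)
X_X_T = A @ A.T + np.eye(dim + 1)      # stand-in for the accumulated X_X_T (SPD)
X_Y = np.random.rand(dim + 1, 1)       # stand-in for the accumulated X_Y

w_inv = np.linalg.inv(X_X_T).dot(X_Y)  # what the script does
w_solve = np.linalg.solve(X_X_T, X_Y)  # same solution, no explicit inverse
print(np.allclose(w_inv, w_solve))     # True
```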
--------------------------------------------------------------------------------
/data/Logistic_synthetic/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharlieDinh/FEDL_pytorch/4db34e5b698d46e2f73b94fb9c0ce00ef9b464f4/data/Logistic_synthetic/README.md
--------------------------------------------------------------------------------
/data/Logistic_synthetic/logistic_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import numpy as np
3 | import json
4 | import random
5 | import os
6 |
7 |
8 | def logit(X, W):
9 | return 1 / (1 + np.exp(-np.dot(X, W)))
10 |
11 |
12 | def generate_logistic_regression_data(num_users=100, kappa=10, dim=40, noise_ratio=0.05):
13 | # For consistent results
14 | np.random.seed(0)
15 |
16 | # Sanity check
17 | assert(kappa >= 1 and num_users > 0 and dim > 0)
18 |
19 | X_split = [[] for _ in range(num_users)] # X for each user
20 | y_split = [[] for _ in range(num_users)] # y for each user
21 |
22 | # Find users' sample sizes based on the power law (heterogeneity)
23 | samples_per_user = np.random.lognormal(4, 2, num_users).astype(int) + 50 + 10000
24 | indices_per_user = np.insert(samples_per_user.cumsum(), 0, 0, 0)
25 | num_total_samples = indices_per_user[-1]
26 |
27 | # Each user's mean is drawn from N(0, 1) (non-i.i.d. data)
28 | mean_X = np.array([np.random.randn(dim) for _ in range(num_users)])
29 |
30 | # Covariance matrix for X
31 | Sigma = np.eye(dim)
32 |
33 | # L = 1, hyper_learning_rate = LAMBDA
34 | LAMBDA = 100 if kappa == 1 else 1 / (kappa - 1)
35 |
36 | # Keep all users' inputs and labels in one array,
37 | # indexed according to indices_per_user.
38 | # (e.g. X_total[indices_per_user[n]:indices_per_user[n+1], :] = X_n)
39 | # (e.g. y_total[indices_per_user[n]:indices_per_user[n+1]] = y_n)
40 | X_total = np.zeros((num_total_samples, dim))
41 | y_total = np.zeros(num_total_samples)
42 |
43 | for n in range(num_users):
44 | # Generate data
45 | X_n = np.random.multivariate_normal(mean_X[n], Sigma, samples_per_user[n])
46 | X_total[indices_per_user[n]:indices_per_user[n+1], :] = X_n
47 |
48 | # Normalize all X's using LAMBDA
49 | norm = np.sqrt(np.linalg.norm(X_total.T.dot(X_total), 2) / num_total_samples)
50 | X_total /= norm + LAMBDA
51 |
52 | # Generate weights and labels
53 | W = np.random.rand(dim)
54 | y_total = logit(X_total, W)
55 | y_total = np.where(y_total > 0.5, 1, 0)
56 |
57 | # Apply noise: randomly flip some of y_n with probability noise_ratio
58 | noise = np.random.binomial(1, noise_ratio, num_total_samples)
59 | y_total = np.multiply(noise - y_total, noise) + np.multiply(y_total, 1 - noise)
60 |
61 | # Save each user's data separately
62 | for n in range(num_users):
63 | X_n = X_total[indices_per_user[n]:indices_per_user[n+1], :]
64 | y_n = y_total[indices_per_user[n]:indices_per_user[n+1]]
65 | X_split[n] = X_n.tolist()
66 | y_split[n] = y_n.tolist()
67 |
68 | # print("User {} has {} samples.".format(n, samples_per_user[n]))
69 |
70 | print("=" * 80)
71 | print("Generated synthetic data for logistic regression successfully.")
72 | print("Summary of the generated data:".format(kappa))
73 | print(" Total # users : {}".format(num_users))
74 | print(" Input dimension : {}".format(dim))
75 | print(" rho : {}".format(kappa))
76 | print(" Total # of samples : {}".format(num_total_samples))
77 | print(" Minimum # of samples: {}".format(np.min(samples_per_user)))
78 | print(" Maximum # of samples: {}".format(np.max(samples_per_user)))
79 | print("=" * 80)
80 |
81 | return X_split, y_split
82 |
83 |
84 | def save_total_data():
85 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
86 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
87 |
88 | train_path = os.path.join("data", "train", "mytrain.json")
89 | test_path = os.path.join("data", "test", "mytest.json")
90 | for path in [os.path.join("data", "train"), os.path.join("data", "test")]:
91 | if not os.path.exists(path):
92 | os.makedirs(path)
93 |
94 | X, y = generate_logistic_regression_data(100, 2, 40, 0.05)
95 |
96 | # Create data structure
97 | train_data = {'users': [], 'user_data': {}, 'num_samples': []}
98 | test_data = {'users': [], 'user_data': {}, 'num_samples': []}
99 |
100 | for i in range(100):
101 | uname = 'f_{0:05d}'.format(i)
102 | combined = list(zip(X[i], y[i]))
103 | random.shuffle(combined)
104 | X[i][:], y[i][:] = zip(*combined)
105 | num_samples = len(X[i])
106 | train_len = int(0.75 * num_samples)
107 | test_len = num_samples - train_len
108 | print("User: ",uname, " Num Sample: ", num_samples )
109 | train_data['users'].append(uname)
110 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
111 | train_data['num_samples'].append(train_len)
112 | test_data['users'].append(uname)
113 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
114 | test_data['num_samples'].append(test_len)
115 |
116 | with open(train_path, 'w') as outfile:
117 | json.dump(train_data, outfile)
118 | with open(test_path, 'w') as outfile:
119 | json.dump(test_data, outfile)
120 |
121 | print("=" * 80)
122 | print("Saved all users' data sucessfully.")
123 | print(" Train path:", os.path.join(os.curdir, train_path))
124 | print(" Test path :", os.path.join(os.curdir, test_path))
125 | print("=" * 80)
126 |
127 |
128 | def main():
129 | save_total_data()
130 | #save_data_by_user()
131 |
132 |
133 | if __name__ == '__main__':
134 | main()
135 |
--------------------------------------------------------------------------------
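For a quick sanity check of the generated split, here is a minimal sketch (not part of the repository) that loads the JSON written by save_total_data() and prints a few per-user statistics; paths are relative to the directory the generator was run from:

import json

with open("data/train/mytrain.json") as f:   # written by save_total_data()
    train = json.load(f)
print("users:", len(train["users"]))
print("train samples of first 5 users:", train["num_samples"][:5])
first_user = train["users"][0]
print("feature dimension:", len(train["user_data"][first_user]["x"][0]))

--------------------------------------------------------------------------------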
/data/Mnist/data/mldata/mnist-original.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharlieDinh/FEDL_pytorch/4db34e5b698d46e2f73b94fb9c0ce00ef9b464f4/data/Mnist/data/mldata/mnist-original.mat
--------------------------------------------------------------------------------
/data/Mnist/generate_iid_20users.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn
2 | from tqdm import trange
3 | import numpy as np
4 | import random
5 | import json
6 | import os
7 |
8 | random.seed(1)
9 | np.random.seed(1)
10 | NUM_USERS = 20 # should be a multiple of 10
11 | NUM_LABELS = 10
12 | # Setup directory for train/test data
13 | train_path = './data/train/mnist_train.json'
14 | test_path = './data/test/mnist_test.json'
15 | dir_path = os.path.dirname(train_path)
16 | if not os.path.exists(dir_path):
17 | os.makedirs(dir_path)
18 | dir_path = os.path.dirname(test_path)
19 | if not os.path.exists(dir_path):
20 | os.makedirs(dir_path)
21 |
22 | # Get MNIST data, normalize, and divide by level
23 | mnist = fetch_openml('mnist_784', data_home='./data')
24 | mu = np.mean(mnist.data.astype(np.float32), 0)
25 | sigma = np.std(mnist.data.astype(np.float32), 0)
26 | mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001)
27 | mnist_data = []
28 | for i in trange(10):
29 | idx = mnist.target==str(i)
30 | mnist_data.append(mnist.data[idx])
31 |
32 | print("\nNumb samples of each label:\n", [len(v) for v in mnist_data])
33 | users_lables = []
34 |
35 | print("idx",idx)
36 | # divide labels among users:
37 | for user in trange(NUM_USERS):
38 | for j in range(NUM_LABELS): # NUM_LABELS labels for each user
39 | l = (user + j) % 10
40 | users_lables.append(l)
41 | unique, counts = np.unique(users_lables, return_counts=True)
42 | print("--------------")
43 | print(unique, counts)
44 |
45 | def ram_dom_gen(total, size):
46 | print(total)
47 | nums = []
48 | temp = []
49 | for i in range(size - 1):
50 | val = np.random.randint(total//(size + 1), total//(size - 8))
51 | temp.append(val)
52 | total -= val
53 | temp.append(total)
54 | print(temp)
55 | return temp
56 | number_sample = []
57 | for total_value, count in zip(mnist_data, counts):
58 | temp = ram_dom_gen(len(total_value), count)
59 | number_sample.append(temp)
60 | print("--------------")
61 | print(number_sample)
62 |
63 | i = 0
64 | number_samples = []
65 | for i in range(len(number_sample[0])):
66 | for sample in number_sample:
67 | print(sample)
68 | number_samples.append(sample[i])
69 |
70 | print("--------------")
71 | print(number_samples)
72 |
73 | ###### CREATE USER DATA SPLIT #######
74 | idx = np.zeros(10, dtype=np.int64)  # reset per-label position counters
75 | X = [[] for _ in range(NUM_USERS)]
76 | y = [[] for _ in range(NUM_USERS)]
77 | count = 0
78 | for user in trange(NUM_USERS):
79 | for j in range(NUM_LABELS): # NUM_LABELS labels for each user
80 | l = (user + j) % 10
81 | print("value of L",l)
82 | print("value of count",count)
83 | num_samples = number_samples[count] # num sample
84 | count = count + 1
85 | if idx[l] + num_samples < len(mnist_data[l]):
86 | X[user] += mnist_data[l][idx[l]:idx[l]+num_samples].tolist()
87 | y[user] += (l*np.ones(num_samples)).tolist()
88 | idx[l] += num_samples
89 | print("check len os user:", user, j,"len data", len(X[user]), num_samples)
90 |
91 | print("IDX2:", idx) # counting samples for each labels
92 |
93 | # Create data structure
94 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
95 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
96 |
97 | # Split each user's data into 75% train / 25% test
98 | # for i in trange(5, ncols=120):
99 | for i in range(NUM_USERS):
100 | uname = 'f_{0:05d}'.format(i)
101 |
102 | combined = list(zip(X[i], y[i]))
103 | random.shuffle(combined)
104 | X[i][:], y[i][:] = zip(*combined)
105 | num_samples = len(X[i])
106 | train_len = int(0.75*num_samples)
107 | test_len = num_samples - train_len
108 |
109 | train_data['users'].append(uname)
110 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
111 | train_data['num_samples'].append(train_len)
112 | test_data['users'].append(uname)
113 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
114 | test_data['num_samples'].append(test_len)
115 |
116 | print("Num_samples:", train_data['num_samples'])
117 | print("Total_samples:",sum(train_data['num_samples'] + test_data['num_samples']))
118 |
119 | with open(train_path,'w') as outfile:
120 | json.dump(train_data, outfile)
121 | with open(test_path, 'w') as outfile:
122 | json.dump(test_data, outfile)
123 |
124 | print("Finish Generating Samples")
125 |
--------------------------------------------------------------------------------
/data/Mnist/generate_niid_100users_updated.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn
2 | from tqdm import trange
3 | import numpy as np
4 | import random
5 | import json
6 | import os
7 |
8 | random.seed(1)
9 | np.random.seed(1)
10 | NUM_USERS = 100
11 | NUM_LABELS = 3
12 | # Setup directory for train/test data
13 | train_path = './data/train/mnist_train.json'
14 | test_path = './data/test/mnist_test.json'
15 | dir_path = os.path.dirname(train_path)
16 | if not os.path.exists(dir_path):
17 | os.makedirs(dir_path)
18 | dir_path = os.path.dirname(test_path)
19 | if not os.path.exists(dir_path):
20 | os.makedirs(dir_path)
21 |
22 | # Get MNIST data, normalize, and divide by level
23 | mnist = fetch_openml('mnist_784', data_home='./data')
24 | mu = np.mean(mnist.data.astype(np.float32), 0)
25 | sigma = np.std(mnist.data.astype(np.float32), 0)
26 | mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001)
27 | mnist_data = []
28 | for i in trange(10):
29 | idx = mnist.target==str(i)
30 | mnist_data.append(mnist.data[idx])
31 |
32 | print("\nNumb samples of each label:\n", [len(v) for v in mnist_data])
33 |
34 | ###### CREATE USER DATA SPLIT #######
35 | # Assign 100 samples to each user
36 | X = [[] for _ in range(NUM_USERS)]
37 | y = [[] for _ in range(NUM_USERS)]
38 | idx = np.zeros(10, dtype=np.int64)
39 | for user in range(NUM_USERS):
40 | for j in range(NUM_LABELS): # 3 labels for each user
41 | #l = (2*user+j)%10
42 | l = (user + j) % 10
43 | print("L:", l)
44 | X[user] += mnist_data[l][idx[l]:idx[l]+10].tolist()
45 | y[user] += (l*np.ones(10)).tolist()
46 | idx[l] += 10
47 |
48 | print("IDX1:", idx) # counting samples for each labels
49 |
50 | # Assign remaining sample by power law
51 | user = 0
52 | props = np.random.lognormal(
53 |     0, 2., (10, NUM_USERS, NUM_LABELS))  # lognormal proportions per (label, user, label slot)
54 | props = np.array([[[len(v)-1000]] for v in mnist_data]) * \
55 | props/np.sum(props, (1, 2), keepdims=True)
56 | # print("here:",props/np.sum(props,(1,2), keepdims=True))
57 | #props = np.array([[[len(v)-100]] for v in mnist_data]) * \
58 | # props/np.sum(props, (1, 2), keepdims=True)
59 | #idx = 1000*np.ones(10, dtype=np.int64)
60 | # print("here2:",props)
61 | for user in trange(NUM_USERS):
62 | for j in range(NUM_LABELS): # NUM_LABELS labels for each user
63 | # l = (2*user+j)%10
64 | l = (user + j) % 10
65 | num_samples = int(props[l, user//int(NUM_USERS/10), j])
66 | numran1 = random.randint(10, 200)
67 | numran2 = random.randint(1, 10)
68 | num_samples = (num_samples) * numran2 + numran1
69 | if(NUM_USERS <= 20):
70 | num_samples = num_samples * 2
71 | if idx[l] + num_samples < len(mnist_data[l]):
72 | X[user] += mnist_data[l][idx[l]:idx[l]+num_samples].tolist()
73 | y[user] += (l*np.ones(num_samples)).tolist()
74 | idx[l] += num_samples
75 | print("check len os user:", user, j,
76 | "len data", len(X[user]), num_samples)
77 |
78 | print("IDX2:", idx) # counting samples for each labels
79 |
80 | # Create data structure
81 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
82 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
83 |
84 | # Split each user's data into 75% train / 25% test
85 | # for i in trange(5, ncols=120):
86 | for i in range(NUM_USERS):
87 | uname = 'f_{0:05d}'.format(i)
88 |
89 | combined = list(zip(X[i], y[i]))
90 | random.shuffle(combined)
91 | X[i][:], y[i][:] = zip(*combined)
92 | num_samples = len(X[i])
93 | train_len = int(0.75*num_samples)
94 | test_len = num_samples - train_len
95 |
96 | train_data['users'].append(uname)
97 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
98 | train_data['num_samples'].append(train_len)
99 | test_data['users'].append(uname)
100 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
101 | test_data['num_samples'].append(test_len)
102 |
103 | print("Num_samples:", train_data['num_samples'])
104 | print("Total_samples:",sum(train_data['num_samples']))
105 |
106 | with open(train_path,'w') as outfile:
107 | json.dump(train_data, outfile)
108 | with open(test_path, 'w') as outfile:
109 | json.dump(test_data, outfile)
110 |
111 | print("Finish Generating Samples")
112 |
--------------------------------------------------------------------------------
/data/Mnist/generate_niid_20users.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import fetch_openml
2 | from tqdm import trange
3 | import numpy as np
4 | import random
5 | import json
6 | import os
7 |
8 | random.seed(1)
9 | np.random.seed(1)
10 | NUM_USERS = 20 # should be a multiple of 10
11 | NUM_LABELS = 2
12 | # Setup directory for train/test data
13 | train_path = './data/train/mnist_train.json'
14 | test_path = './data/test/mnist_test.json'
15 | dir_path = os.path.dirname(train_path)
16 | if not os.path.exists(dir_path):
17 | os.makedirs(dir_path)
18 | dir_path = os.path.dirname(test_path)
19 | if not os.path.exists(dir_path):
20 | os.makedirs(dir_path)
21 |
22 | # Get MNIST data, normalize, and divide by level
23 | mnist = fetch_openml('mnist_784', data_home='./data')
24 | mu = np.mean(mnist.data.astype(np.float32), 0)
25 | sigma = np.std(mnist.data.astype(np.float32), 0)
26 | mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001)
27 | mnist_data = []
28 | for i in trange(10):
29 | idx = mnist.target==str(i)
30 | mnist_data.append(mnist.data[idx])
31 |
32 | print("\nNumb samples of each label:\n", [len(v) for v in mnist_data])
33 | users_lables = []
34 |
35 | print("idx",idx)
36 | # divide labels among users:
37 | for user in trange(NUM_USERS):
38 | for j in range(NUM_LABELS): # NUM_LABELS labels for each user
39 | l = (user + j) % 10
40 | users_lables.append(l)
41 | unique, counts = np.unique(users_lables, return_counts=True)
42 | print("--------------")
43 | print(unique, counts)
44 |
45 | def ram_dom_gen(total, size):
46 | print(total)
47 | nums = []
48 | temp = []
49 | for i in range(size - 1):
50 | val = np.random.randint(total//(size + 1), total//2)
51 | temp.append(val)
52 | total -= val
53 | temp.append(total)
54 | print(temp)
55 | return temp
56 | number_sample = []
57 | for total_value, count in zip(mnist_data, counts):
58 | temp = ram_dom_gen(len(total_value), count)
59 | number_sample.append(temp)
60 | print("--------------")
61 | print(number_sample)
62 |
63 | i = 0
64 | number_samples = []
65 | for i in range(len(number_sample[0])):
66 | for sample in number_sample:
67 | print(sample)
68 | number_samples.append(sample[i])
69 |
70 | print("--------------")
71 | print(number_samples)
72 |
73 | ###### CREATE USER DATA SPLIT #######
74 | idx = np.zeros(10, dtype=np.int64)  # reset per-label position counters
75 | X = [[] for _ in range(NUM_USERS)]
76 | y = [[] for _ in range(NUM_USERS)]
77 | count = 0
78 | for user in trange(NUM_USERS):
79 | for j in range(NUM_LABELS): # NUM_LABELS labels for each user
80 | l = (user + j) % 10
81 | print("value of L",l)
82 | print("value of count",count)
83 | num_samples = number_samples[count] # num sample
84 | count = count + 1
85 | if idx[l] + num_samples < len(mnist_data[l]):
86 | X[user] += mnist_data[l][idx[l]:idx[l]+num_samples].tolist()
87 | y[user] += (l*np.ones(num_samples)).tolist()
88 | idx[l] += num_samples
89 | print("check len os user:", user, j,"len data", len(X[user]), num_samples)
90 |
91 | print("IDX2:", idx) # counting samples for each labels
92 |
93 | # Create data structure
94 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
95 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
96 |
97 | # Split each user's data into 75% train / 25% test
98 | # for i in trange(5, ncols=120):
99 | for i in range(NUM_USERS):
100 | uname = 'f_{0:05d}'.format(i)
101 |
102 | combined = list(zip(X[i], y[i]))
103 | random.shuffle(combined)
104 | X[i][:], y[i][:] = zip(*combined)
105 | num_samples = len(X[i])
106 | train_len = int(0.75*num_samples)
107 | test_len = num_samples - train_len
108 |
109 | train_data['users'].append(uname)
110 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
111 | train_data['num_samples'].append(train_len)
112 | test_data['users'].append(uname)
113 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
114 | test_data['num_samples'].append(test_len)
115 |
116 | print("Num_samples:", train_data['num_samples'])
117 | print("Total_samples:",sum(train_data['num_samples'] + test_data['num_samples']))
118 |
119 | with open(train_path,'w') as outfile:
120 | json.dump(train_data, outfile)
121 | with open(test_path, 'w') as outfile:
122 | json.dump(test_data, outfile)
123 |
124 | print("Finish Generating Samples")
125 |
--------------------------------------------------------------------------------
/data/Mnist/generate_niid_mnist_100users.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn
2 | from tqdm import trange
3 | import numpy as np
4 | import random
5 | import json
6 | import os
7 |
8 | random.seed(1)
9 | np.random.seed(1)
10 | NUM_USERS = 100
11 | NUM_LABELS = 3
12 | # Setup directory for train/test data
13 | train_path = './data/train/mnist_train.json'
14 | test_path = './data/test/mnist_test.json'
15 | dir_path = os.path.dirname(train_path)
16 | if not os.path.exists(dir_path):
17 | os.makedirs(dir_path)
18 | dir_path = os.path.dirname(test_path)
19 | if not os.path.exists(dir_path):
20 | os.makedirs(dir_path)
21 |
22 | # Get MNIST data, normalize, and divide by level
23 | mnist = fetch_openml('mnist_784', data_home='./data')
24 | mu = np.mean(mnist.data.astype(np.float32), 0)
25 | sigma = np.std(mnist.data.astype(np.float32), 0)
26 | mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001)
27 | mnist_data = []
28 | for i in trange(10):
29 | idx = mnist.target==str(i)
30 | mnist_data.append(mnist.data[idx])
31 |
32 | print("\nNumb samples of each label:\n", [len(v) for v in mnist_data])
33 |
34 | ###### CREATE USER DATA SPLIT #######
35 | # Assign 100 samples to each user
36 | X = [[] for _ in range(NUM_USERS)]
37 | y = [[] for _ in range(NUM_USERS)]
38 | idx = np.zeros(10, dtype=np.int64)
39 | for user in range(NUM_USERS):
40 | for j in range(NUM_LABELS): # 3 labels for each user
41 | #l = (2*user+j)%10
42 | l = (user + j) % 10
43 | print("L:", l)
44 | X[user] += mnist_data[l][idx[l]:idx[l]+10].tolist()
45 | y[user] += (l*np.ones(10)).tolist()
46 | idx[l] += 10
47 |
48 | print("IDX1:", idx) # counting samples for each labels
49 |
50 | # Assign remaining sample by power law
51 | user = 0
52 | props = np.random.lognormal(
53 |     0, 2., (10, NUM_USERS, NUM_LABELS))  # lognormal proportions per (label, user, label slot)
54 | props = np.array([[[len(v)-1000]] for v in mnist_data]) * \
55 | props/np.sum(props, (1, 2), keepdims=True)
56 | # print("here:",props/np.sum(props,(1,2), keepdims=True))
57 | #props = np.array([[[len(v)-100]] for v in mnist_data]) * \
58 | # props/np.sum(props, (1, 2), keepdims=True)
59 | #idx = 1000*np.ones(10, dtype=np.int64)
60 | # print("here2:",props)
61 | for user in trange(NUM_USERS):
62 | for j in range(NUM_LABELS): # NUM_LABELS labels for each user
63 | # l = (2*user+j)%10
64 | l = (user + j) % 10
65 | num_samples = int(props[l, user//int(NUM_USERS/10), j])
66 | numran1 = random.randint(10, 200)
67 | numran2 = random.randint(1, 10)
68 | num_samples = (num_samples) * numran2 + numran1
69 | if(NUM_USERS <= 20):
70 | num_samples = num_samples * 2
71 | if idx[l] + num_samples < len(mnist_data[l]):
72 | X[user] += mnist_data[l][idx[l]:idx[l]+num_samples].tolist()
73 | y[user] += (l*np.ones(num_samples)).tolist()
74 | idx[l] += num_samples
75 | print("check len os user:", user, j,
76 | "len data", len(X[user]), num_samples)
77 |
78 | print("IDX2:", idx) # counting samples for each labels
79 |
80 | # Create data structure
81 | train_data = {'users': [], 'user_data':{}, 'num_samples':[]}
82 | test_data = {'users': [], 'user_data':{}, 'num_samples':[]}
83 |
84 | # Split each user's data into 75% train / 25% test
85 | # for i in trange(5, ncols=120):
86 | for i in range(NUM_USERS):
87 | uname = 'f_{0:05d}'.format(i)
88 |
89 | combined = list(zip(X[i], y[i]))
90 | random.shuffle(combined)
91 | X[i][:], y[i][:] = zip(*combined)
92 | num_samples = len(X[i])
93 | train_len = int(0.75*num_samples)
94 | test_len = num_samples - train_len
95 |
96 | train_data['users'].append(uname)
97 | train_data['user_data'][uname] = {'x': X[i][:train_len], 'y': y[i][:train_len]}
98 | train_data['num_samples'].append(train_len)
99 | test_data['users'].append(uname)
100 | test_data['user_data'][uname] = {'x': X[i][train_len:], 'y': y[i][train_len:]}
101 | test_data['num_samples'].append(test_len)
102 |
103 | print("Num_samples:", train_data['num_samples'])
104 | print("Total_samples:",sum(train_data['num_samples']))
105 |
106 | with open(train_path,'w') as outfile:
107 | json.dump(train_data, outfile)
108 | with open(test_path, 'w') as outfile:
109 | json.dump(test_data, outfile)
110 |
111 | print("Finish Generating Samples")
112 |
--------------------------------------------------------------------------------
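As a rough check that the non-IID partition behaves as intended, the small sketch below (an assumption about how the output might be inspected, not repository code) counts the labels assigned to the first few users in the generated mnist_train.json; the path assumes the generator was run from data/Mnist:

import json
from collections import Counter

with open("data/Mnist/data/train/mnist_train.json") as f:
    train = json.load(f)
for uname in train["users"][:5]:
    counts = Counter(int(v) for v in train["user_data"][uname]["y"])
    print(uname, "->", dict(counts))

--------------------------------------------------------------------------------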
/flearn/optimizers/fedoptimizer.py:
--------------------------------------------------------------------------------
1 | from torch.optim import Optimizer
2 |
3 |
4 | class MySGD(Optimizer):
5 | def __init__(self, params, lr):
6 | defaults = dict(lr=lr)
7 | super(MySGD, self).__init__(params, defaults)
8 |
9 | def step(self, closure=None, hyper_learning_rate = 0):
10 | loss = None
11 | if closure is not None:
12 | loss = closure()
13 |
14 | for group in self.param_groups:
15 | # print(group)
16 | for p in group['params']:
17 | if p.grad is None:
18 | continue
19 | d_p = p.grad.data
20 | if(hyper_learning_rate != 0):
21 | p.data.add_(d_p, alpha=-hyper_learning_rate)
22 | else:
23 | p.data.add_(d_p, alpha=-group['lr'])
24 | return loss
25 |
26 |
27 | class FEDLOptimizer(Optimizer):
28 | def __init__(self, params, lr = 0.01, hyper_lr = 0.01, L = 0.1):
29 | if lr < 0.0:
30 | raise ValueError("Invalid learning rate: {}".format(lr))
31 | defaults = dict(lr=lr,hyper_lr= hyper_lr, L = L)
32 | super(FEDLOptimizer, self).__init__(params, defaults)
33 |
34 | def step(self, server_grads, pre_grads, closure=None):
35 | loss = None
36 | if closure is not None:
37 | loss = closure()
38 | for group in self.param_groups:
39 | for p, server_grad, pre_grad in zip(group['params'],server_grads, pre_grads):
40 | if(server_grad.grad != None and pre_grad.grad != None):
41 | p.data = p.data - group['lr'] * (p.grad.data + group['hyper_lr'] * server_grad.grad.data - pre_grad.grad.data)
42 | else:
43 | p.data = p.data - group['lr'] * p.grad.data
44 | return loss
45 |
46 | class pFedMeOptimizer(Optimizer):
47 | def __init__(self, params, lr=0.01, L=0.1 , mu = 0.001):
48 | #self.local_weight_updated = local_weight # w_i,K
49 | if lr < 0.0:
50 | raise ValueError("Invalid learning rate: {}".format(lr))
51 | defaults = dict(lr=lr, L=L, mu = mu)
52 | super(pFedMeOptimizer, self).__init__(params, defaults)
53 |
54 | def step(self, local_weight_updated, closure=None):
55 | loss = None
56 | if closure is not None:
57 | loss = closure()
58 | weight_update = local_weight_updated.copy()
59 | for group in self.param_groups:
60 | for p, localweight in zip( group['params'], weight_update):
61 | p.data = p.data - group['lr'] * (p.grad.data + group['L'] * (p.data - localweight.data) + group['mu']*p.data)
62 | return group['params'], loss
63 |
64 | def update_param(self, local_weight_updated, closure=None):
65 | loss = None
66 | if closure is not None:
67 | loss = closure()
68 | weight_update = local_weight_updated.copy()
69 | for group in self.param_groups:
70 | for p, localweight in zip( group['params'], weight_update):
71 | p.data = localweight.data
72 | #return p.data
73 | return group['params']
--------------------------------------------------------------------------------
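As the step() above shows, FEDLOptimizer applies w <- w - lr * (grad_Fn(w) + hyper_lr * server_grad - pre_grad): the local gradient is corrected by the aggregated server gradient and by the client's gradient at the start of the round. Below is a minimal driver sketch with a throw-away linear model and random data, run from the repository root; it mirrors how UserFEDL.train wires the optimizer up, but it is not code from the repository:

import copy
import torch
import torch.nn as nn
from flearn.optimizers.fedoptimizer import FEDLOptimizer

torch.manual_seed(0)
model = nn.Linear(10, 1)
loss_fn = nn.MSELoss()
X, y = torch.randn(32, 10), torch.randn(32, 1)

# Stand-ins for the server gradient and the round-start local gradient:
# parameter copies whose .grad fields are filled in by hand.
model.zero_grad()
loss_fn(model(X), y).backward()
server_grad = copy.deepcopy(list(model.parameters()))
pre_grad = copy.deepcopy(list(model.parameters()))
for p, s, q in zip(model.parameters(), server_grad, pre_grad):
    s.grad = p.grad.detach().clone()   # pretend the server gradient equals the local one
    q.grad = p.grad.detach().clone()

opt = FEDLOptimizer(model.parameters(), lr=0.01, hyper_lr=0.2, L=0)
for _ in range(5):                     # a few local steps
    opt.zero_grad()
    loss = loss_fn(model(X), y)
    loss.backward()
    opt.step(server_grad, pre_grad)
print("local loss after 5 steps:", loss.item())

--------------------------------------------------------------------------------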
/flearn/servers/serveravg.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 |
4 | from flearn.users.useravg import UserAVG
5 | from flearn.servers.serverbase import Server
6 | from utils.model_utils import read_data, read_user_data
7 | import numpy as np
8 |
9 | # Implementation for FedAvg Server
10 |
11 | class FedAvg(Server):
12 | def __init__(self, dataset,algorithm, model, batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters,
13 | local_epochs, optimizer, num_users, rho, times):
14 | super().__init__(dataset,algorithm, model[0], batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters,
15 | local_epochs, optimizer, num_users, rho, times)
16 |
17 | # Initialize data for all users
18 | data = read_data(dataset)
19 | total_users = len(data[0])
20 | for i in range(total_users):
21 | id, train , test = read_user_data(i, data, dataset)
22 | user = UserAVG(id, train, test, model, batch_size, learning_rate,hyper_learning_rate,L, local_epochs, optimizer)
23 | self.users.append(user)
24 | self.total_train_samples += user.train_samples
25 |
26 | print("Number of users / total users:",num_users, " / " ,total_users)
27 | print("Finished creating FedAvg server.")
28 |
29 | def send_grads(self):
30 | assert (self.users is not None and len(self.users) > 0)
31 | grads = []
32 | for param in self.model.parameters():
33 | if param.grad is None:
34 | grads.append(torch.zeros_like(param.data))
35 | else:
36 | grads.append(param.grad)
37 | for user in self.users:
38 | user.set_grads(grads)
39 |
40 | def train(self):
41 | loss = []
42 | for glob_iter in range(self.num_glob_iters):
43 | print("-------------Round number: ",glob_iter, " -------------")
44 | #loss_ = 0
45 | self.send_parameters()
46 |
47 | # Evaluate the global model each iteration
48 | self.evaluate()
49 |
50 | self.selected_users = self.select_users(glob_iter,self.num_users)
51 | for user in self.selected_users:
52 | user.train(self.local_epochs) #* user.train_samples
53 | self.aggregate_parameters()
54 | #loss_ /= self.total_train_samples
55 | #loss.append(loss_)
56 | #print(loss_)
57 | #print(loss)
58 | self.save_results()
59 | self.save_model()
--------------------------------------------------------------------------------
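FedAvg.train() above defers the actual averaging to aggregate_parameters() in serverbase.py (the next file), which weights each selected user's parameters by its share of the selected users' training samples. A toy illustration of that weighting (not repository code):

import torch

user_params = [torch.tensor([1.0, 2.0]), torch.tensor([3.0, 6.0])]   # one flat "parameter" per user
samples = [300, 100]
weights = [s / sum(samples) for s in samples]                        # -> [0.75, 0.25]
global_param = sum(w * p for w, p in zip(weights, user_params))
print(global_param)                                                  # tensor([1.5000, 3.0000])

--------------------------------------------------------------------------------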
/flearn/servers/serverbase.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | import numpy as np
4 | import h5py
5 | from utils.model_utils import Metrics
6 | import copy
7 |
8 | class Server:
9 | def __init__(self, dataset,algorithm, model, batch_size, learning_rate ,hyper_learning_rate, L,
10 | num_glob_iters, local_epochs, optimizer,num_users,rho, times):
11 |
12 | # Set up the main attributes
13 | self.dataset = dataset
14 | self.num_glob_iters = num_glob_iters
15 | self.local_epochs = local_epochs
16 | self.batch_size = batch_size
17 | self.learning_rate = learning_rate
18 | self.total_train_samples = 0
19 | self.model = copy.deepcopy(model)
20 | self.users = []
21 | self.selected_users = []
22 | self.num_users = num_users
23 | self.hyper_learning_rate = hyper_learning_rate
24 | self.L = L
25 | self.algorithm = algorithm
26 | self.rs_train_acc, self.rs_train_loss, self.rs_glob_acc= [], [], []
27 | self.rho = rho
28 | self.times = times
29 |
30 | def aggregate_grads(self):
31 | assert (self.users is not None and len(self.users) > 0)
32 | for param in self.model.parameters():
33 | param.grad = torch.zeros_like(param.data)
34 | for user in self.users:
35 | self.add_grad(user, user.train_samples / self.total_train_samples)
36 |
37 | def send_parameters(self):
38 | assert (self.users is not None and len(self.users) > 0)
39 | for user in self.users:
40 | user.set_parameters(self.model)
41 |
42 | def add_parameters(self, user, ratio):
43 | model = self.model.parameters()
44 | for server_param, user_param in zip(self.model.parameters(), user.get_parameters()):
45 | server_param.data = server_param.data + user_param.data.clone() * ratio
46 | if(user_param.grad != None):
47 | if(server_param.grad == None):
48 | server_param.grad = torch.zeros_like(user_param.grad)
49 | server_param.grad.data = server_param.grad.data + user_param.grad.data.clone() * ratio
50 |
51 | def aggregate_parameters(self):
52 | assert (self.users is not None and len(self.users) > 0)
53 | for param in self.model.parameters():
54 | param.data = torch.zeros_like(param.data)
55 | if(param.grad != None):
56 | param.grad.data = torch.zeros_like(param.grad.data)
57 | total_train = 0
58 | #if(self.num_users = self.to)
59 | for user in self.selected_users:
60 | total_train += user.train_samples
61 | for user in self.selected_users:
62 | self.add_parameters(user, user.train_samples / total_train)
63 | #self.add_grad(user, user.train_samples / total_train)
64 |
65 | def save_model(self):
66 | model_path = os.path.join("models", self.dataset)
67 | if not os.path.exists(model_path):
68 | os.makedirs(model_path)
69 | torch.save(self.model, os.path.join(model_path, "server" + ".pt"))
70 |
71 | def load_model(self):
72 | model_path = os.path.join("models", self.dataset, "server" + ".pt")
73 | assert (os.path.exists(model_path))
74 | self.model = torch.load(model_path)
75 |
76 | def model_exists(self):
77 | return os.path.exists(os.path.join("models", self.dataset, "server" + ".pt"))
78 |
79 | def select_users(self, round, num_users):
80 | if(num_users == len(self.users)):
81 | print("All users are selected")
82 | return self.users
83 |
84 | num_users = min(num_users, len(self.users))
85 | # fix the random seed so the selected users are consistent across runs
86 | np.random.seed(round * (self.times + 1))
87 | return np.random.choice(self.users, num_users, replace=False) #, p=pk)
88 |
89 |
90 | # Save loss and accuracy to an h5 file
91 | def save_results(self):
92 | alg = self.dataset + "_" + self.algorithm
93 | if(self.algorithm == "FEDL"):
94 | alg = alg + "_" + str(self.learning_rate) + "_" + str(self.hyper_learning_rate) + "_" + str(self.num_users) + "u" + "_" + str(self.batch_size) + "b" + "_" + str(self.local_epochs)
95 | else:
96 | alg = alg + "_" + str(self.learning_rate) + "_" + str(self.num_users) + "u" + "_" + str(self.batch_size) + "b" + "_" + str(self.local_epochs)
97 | if(self.L > 0):
98 | alg = alg + "_" + str(self.L) + "L"
99 |
100 | if(self.rho > 0):
101 | alg = alg + "_" + str(self.rho) + "p"
102 |
103 | alg = alg + "_" + str(self.times)
104 | if (len(self.rs_glob_acc) != 0 and len(self.rs_train_acc) != 0 and len(self.rs_train_loss) != 0):
105 | with h5py.File("./results/" + '{}.h5'.format(alg), 'w') as hf:
106 | hf.create_dataset('rs_glob_acc', data=self.rs_glob_acc)
107 | hf.create_dataset('rs_train_acc', data=self.rs_train_acc)
108 | hf.create_dataset('rs_train_loss', data=self.rs_train_loss)
109 | hf.close()
110 |
111 | def test(self):
112 | '''tests the current global model on all users
113 | '''
114 | num_samples = []
115 | tot_correct = []
116 | losses = []
117 | for c in self.users:
118 | ct, ns = c.test()
119 | tot_correct.append(ct*1.0)
120 | num_samples.append(ns)
121 | ids = [c.id for c in self.users]
122 |
123 | return ids, num_samples, tot_correct
124 |
125 | def train_error_and_loss(self):
126 | num_samples = []
127 | tot_correct = []
128 | losses = []
129 | for c in self.users:
130 | ct, cl, ns = c.train_error_and_loss()
131 | tot_correct.append(ct*1.0)
132 | num_samples.append(ns)
133 | losses.append(cl*1.0)
134 |
135 | ids = [c.id for c in self.users]
136 | #groups = [c.group for c in self.clients]
137 |
138 | return ids, num_samples, tot_correct, losses
139 |
140 | def evaluate(self):
141 | stats = self.test()
142 | stats_train = self.train_error_and_loss()
143 | glob_acc = np.sum(stats[2])*1.0/np.sum(stats[1])
144 | train_acc = np.sum(stats_train[2])*1.0/np.sum(stats_train[1])
145 | # train_loss = np.dot(stats_train[3], stats_train[1])*1.0/np.sum(stats_train[1])
146 | train_loss = sum([x * y for (x, y) in zip(stats_train[1], stats_train[3])]).item() / np.sum(stats_train[1])
147 | self.rs_glob_acc.append(glob_acc)
148 | self.rs_train_acc.append(train_acc)
149 | self.rs_train_loss.append(train_loss)
150 | #print("stats_train[1]",stats_train[3][0])
151 | print("Average Global Accurancy: ", glob_acc)
152 | print("Average Global Trainning Accurancy: ", train_acc)
153 | print("Average Global Trainning Loss: ",train_loss)
154 |
--------------------------------------------------------------------------------
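save_results() above stores three curves (rs_glob_acc, rs_train_acc, rs_train_loss) in an .h5 file under ./results whose name encodes the hyperparameters. A minimal reading sketch (the file name below is a placeholder; substitute one produced by your own run):

import h5py

with h5py.File("./results/Mnist_FEDL_0.003_0.2_10u_20b_20_0.h5", "r") as hf:   # placeholder name
    glob_acc = hf["rs_glob_acc"][:]
    train_loss = hf["rs_train_loss"][:]
print("best test accuracy:", glob_acc.max())
print("final train loss  :", train_loss[-1])

--------------------------------------------------------------------------------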
/flearn/servers/serverfedl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 |
4 | from flearn.users.userfedl import UserFEDL
5 | from flearn.servers.serverbase import Server
6 | from utils.model_utils import read_data, read_user_data
7 | import numpy as np
8 |
9 | # Implementation for FEDL Server
10 |
11 | class FEDL(Server):
12 | def __init__(self, dataset,algorithm, model, batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters,
13 | local_epochs, optimizer, num_users,rho, times):
14 | super().__init__(dataset,algorithm, model[0], batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters,
15 | local_epochs, optimizer, num_users,rho, times)
16 |
17 | # Initialize data for all users
18 | data = read_data(dataset)
19 | total_users = len(data[0])
20 | for i in range(total_users):
21 | id, train , test = read_user_data(i, data, dataset)
22 | user = UserFEDL(id, train, test, model, batch_size, learning_rate, hyper_learning_rate, L, local_epochs, optimizer)
23 | self.users.append(user)
24 | self.total_train_samples += user.train_samples
25 |
26 | print("Number of users / total users:",num_users, " / " ,total_users)
27 | print("Finished creating FEDL server.")
28 |
29 | def train(self):
30 |
31 | for glob_iter in range(self.num_glob_iters):
32 | print("-------------Round number: ",glob_iter, " -------------")
33 |
34 | self.send_parameters()
35 | self.evaluate()
36 | self.selected_users = self.select_users(glob_iter,self.num_users)
37 | for user in self.selected_users:
38 | user.train(self.local_epochs) #* user.train_samples
39 | self.aggregate_parameters()
40 |
41 | self.save_results()
42 | self.save_model()
--------------------------------------------------------------------------------
/flearn/trainmodel/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class Net(nn.Module):
6 | def __init__(self):
7 | super(Net, self).__init__()
8 | self.conv1 = nn.Conv2d(1, 16, 2, 1)
9 | self.conv2 = nn.Conv2d(16, 32, 2, 1)
10 | self.dropout1 = nn.Dropout(0.25)
11 | self.dropout2 = nn.Dropout(0.5)
12 | self.fc1 = nn.Linear(18432, 128)
13 | self.fc2 = nn.Linear(128, 10)
14 |
15 | def forward(self, x):
16 | x = self.conv1(x)
17 | x = nn.ReLU()(x)
18 | x = nn.MaxPool2d(2, 1)(x)
19 | x = self.dropout1(x)
20 | x = self.conv2(x)
21 | x = nn.ReLU()(x)
22 | x = nn.MaxPool2d(2, 1)(x)
23 | x = self.dropout2(x)
24 | x = torch.flatten(x, 1)
25 | x = self.fc1(x)
26 | x = nn.ReLU()(x)
27 | x = self.fc2(x)
28 | output = F.log_softmax(x, dim=1)
29 | return output
30 |
31 | class Mclr_Logistic(nn.Module):
32 | def __init__(self, input_dim = 784, output_dim = 10):
33 | super(Mclr_Logistic, self).__init__()
34 | self.fc1 = nn.Linear(input_dim, output_dim)
35 |
36 | def forward(self, x):
37 | x = torch.flatten(x, 1)
38 | x = self.fc1(x)
39 | output = F.log_softmax(x, dim=1)
40 | return output
41 |
42 | class Mclr_CrossEntropy(nn.Module):
43 | def __init__(self, input_dim = 784, output_dim = 10):
44 | super(Mclr_CrossEntropy, self).__init__()
45 | self.linear = torch.nn.Linear(input_dim, output_dim)
46 |
47 | def forward(self, x):
48 | x = torch.flatten(x, 1)
49 | outputs = self.linear(x)
50 | return outputs
51 |
52 | class DNN(nn.Module):
53 | def __init__(self, input_dim = 784, mid_dim = 100, output_dim = 10):
54 | super(DNN, self).__init__()
55 | # define network layers
56 | self.fc1 = nn.Linear(input_dim, mid_dim)
57 | self.fc2 = nn.Linear(mid_dim, output_dim)
58 |
59 | def forward(self, x):
60 | # define forward pass
61 | x = torch.flatten(x, 1)
62 | x = F.relu(self.fc1(x))
63 | x = self.fc2(x)
64 | x = F.log_softmax(x, dim=1)
65 | return x
66 |
67 | class Linear_Regression(nn.Module):
68 | def __init__(self, input_dim = 60, output_dim = 1):
69 | super(Linear_Regression, self).__init__()
70 | self.linear = torch.nn.Linear(input_dim, output_dim)
71 |
72 | def forward(self, x):
73 | x = torch.flatten(x, 1)
74 | outputs = self.linear(x)
75 | return outputs
76 |
--------------------------------------------------------------------------------
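A quick shape check for the MNIST logistic model (a throw-away sketch, not repository code): Mclr_Logistic flattens its input, so a batch shaped like the Mnist loaders' (N, 1, 28, 28) tensors maps to (N, 10) log-probabilities. Run from the repository root:

import torch
from flearn.trainmodel.models import Mclr_Logistic

model = Mclr_Logistic()            # defaults: 784 inputs, 10 classes
x = torch.randn(4, 1, 28, 28)      # dummy batch with the Mnist loader shape
out = model(x)
print(out.shape)                   # torch.Size([4, 10])
print(out.exp().sum(dim=1))        # log_softmax rows exponentiate to ~1

--------------------------------------------------------------------------------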
/flearn/users/useravg.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import os
5 | import json
6 | from torch.utils.data import DataLoader
7 | from flearn.users.userbase import User
8 | from flearn.optimizers.fedoptimizer import *
9 | # Implementation for FedAvg clients
10 |
11 | class UserAVG(User):
12 | def __init__(self, numeric_id, train_data, test_data, model, batch_size, learning_rate, hyper_learning_rate, L,
13 | local_epochs, optimizer):
14 | super().__init__(numeric_id, train_data, test_data, model[0], batch_size, learning_rate, hyper_learning_rate, L,
15 | local_epochs)
16 |
17 | if(model[1] == "linear"):
18 | self.loss = nn.MSELoss()
19 | else:
20 | self.loss = nn.NLLLoss()
21 |
22 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.learning_rate)
23 |
24 | def set_grads(self, new_grads):
25 | if isinstance(new_grads, nn.Parameter):
26 | for model_grad, new_grad in zip(self.model.parameters(), new_grads):
27 | model_grad.data = new_grad.data
28 | elif isinstance(new_grads, list):
29 | for idx, model_grad in enumerate(self.model.parameters()):
30 | model_grad.data = new_grads[idx]
31 |
32 |
33 | def train(self, epochs):
34 | self.model.train()
35 | for epoch in range(1, self.local_epochs + 1):
36 | self.model.train()
37 | #loss_per_epoch = 0
38 | for batch_idx, (X, y) in enumerate(self.trainloader):
39 | self.optimizer.zero_grad()
40 | output = self.model(X)
41 | loss = self.loss(output, y)
42 | loss.backward()
43 | self.optimizer.step()
44 | self.clone_model_paramenter(self.model.parameters(), self.local_model)
45 | return loss
46 |
--------------------------------------------------------------------------------
/flearn/users/userbase.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import os
5 | import json
6 | from torch.utils.data import DataLoader
7 | import numpy as np
8 | import copy
9 |
10 | class User:
11 | """
12 | Base class for users in federated learning.
13 | """
14 | def __init__(self, id, train_data, test_data, model, batch_size = 0, learning_rate = 0, hyper_learning_rate = 0 , L = 0, local_epochs = 0):
15 | # from fedprox
16 | self.model = copy.deepcopy(model)
17 | self.id = id # integer
18 | self.train_samples = len(train_data)
19 | self.test_samples = len(test_data)
20 | if(batch_size == 0):
21 | self.batch_size = len(train_data)
22 | else:
23 | self.batch_size = batch_size
24 | self.learning_rate = learning_rate
25 | self.hyper_learning_rate = hyper_learning_rate
26 | self.L = L
27 | self.local_epochs = local_epochs
28 | self.trainloader = DataLoader(train_data, self.batch_size)
29 | self.testloader = DataLoader(test_data, self.batch_size)
30 | self.testloaderfull = DataLoader(test_data, self.test_samples)
31 | self.trainloaderfull = DataLoader(train_data, self.train_samples)
32 | self.iter_trainloader = iter(self.trainloader)
33 | self.iter_testloader = iter(self.testloader)
34 |
35 | # These parameters are used by FEDL.
36 | self.local_model = copy.deepcopy(list(self.model.parameters()))
37 | self.server_grad = copy.deepcopy(list(self.model.parameters()))
38 | self.pre_local_grad = copy.deepcopy(list(self.model.parameters()))
39 |
40 | def set_parameters(self, model):
41 | for old_param, new_param, local_param in zip(self.model.parameters(), model.parameters(), self.local_model):
42 | old_param.data = new_param.data.clone()
43 | local_param.data = new_param.data.clone()
44 | if(new_param.grad != None):
45 | if(old_param.grad == None):
46 | old_param.grad = torch.zeros_like(new_param.grad)
47 |
48 | if(local_param.grad == None):
49 | local_param.grad = torch.zeros_like(new_param.grad)
50 |
51 | old_param.grad.data = new_param.grad.data.clone()
52 | local_param.grad.data = new_param.grad.data.clone()
53 | #self.local_weight_updated = copy.deepcopy(self.optimizer.param_groups[0]['params'])
54 |
55 | def get_parameters(self):
56 | for param in self.model.parameters():
57 | param.detach()
58 | return self.model.parameters()
59 |
60 | def clone_model_paramenter(self, param, clone_param):
61 | for param, clone_param in zip(param, clone_param):
62 | clone_param.data = param.data.clone()
63 | if(param.grad != None):
64 | if(clone_param.grad == None):
65 | clone_param.grad = torch.zeros_like(param.grad)
66 | clone_param.grad.data = param.grad.data.clone()
67 |
68 | return clone_param
69 |
70 | def get_updated_parameters(self):
71 | return self.local_weight_updated
72 |
73 | def update_parameters(self, new_params):
74 | for param , new_param in zip(self.model.parameters(), new_params):
75 | param.data = new_param.data.clone()
76 | param.grad.data = new_param.grad.data.clone()
77 |
78 | def get_grads(self, grads):
79 |
80 | self.optimizer.zero_grad()
81 |
82 | for x, y in self.trainloaderfull:
83 | output = self.model(x)
84 | loss = self.loss(output, y)
85 | loss.backward()
86 | self.clone_model_paramenter(self.model.parameters(), grads)
87 | #for param, grad in zip(self.model.parameters(), grads):
88 | # if(grad.grad == None):
89 | # grad.grad = torch.zeros_like(param.grad)
90 | # grad.grad.data = param.grad.data.clone()
91 | return grads
92 |
93 | def test(self):
94 | self.model.eval()
95 | test_acc = 0
96 | for x, y in self.testloaderfull:
97 | output = self.model(x)
98 | test_acc += (torch.sum(torch.argmax(output, dim=1) == y)).item()
99 | #@loss += self.loss(output, y)
100 | #print(self.id + ", Test Accuracy:", test_acc / y.shape[0] )
101 | #print(self.id + ", Test Loss:", loss)
102 | return test_acc, y.shape[0]
103 |
104 | def train_error_and_loss(self):
105 | self.model.eval()
106 | train_acc = 0
107 | loss = 0
108 | for x, y in self.trainloaderfull:
109 | output = self.model(x)
110 | train_acc += (torch.sum(torch.argmax(output, dim=1) == y)).item()
111 | loss += self.loss(output, y)
112 | #print(self.id + ", Train Accuracy:", train_acc)
113 | #print(self.id + ", Train Loss:", loss)
114 | return train_acc, loss , self.train_samples
115 |
116 |
117 | def get_next_train_batch(self):
118 | try:
119 | # Sample the next training batch
120 | (X, y) = next(self.iter_trainloader)
121 | except StopIteration:
122 | # restart the generator if the previous generator is exhausted.
123 | self.iter_trainloader = iter(self.trainloader)
124 | (X, y) = next(self.iter_trainloader)
125 | return (X, y)
126 |
127 | def get_next_test_batch(self):
128 | try:
129 | # Sample the next test batch
130 | (X, y) = next(self.iter_testloader)
131 | except StopIteration:
132 | # restart the generator if the previous generator is exhausted.
133 | self.iter_testloader = iter(self.testloader)
134 | (X, y) = next(self.iter_testloader)
135 | return (X, y)
136 |
137 | def save_model(self):
138 | model_path = os.path.join("models", self.dataset)
139 | if not os.path.exists(model_path):
140 | os.makedirs(model_path)
141 | torch.save(self.model, os.path.join(model_path, "user_" + self.id + ".pt"))
142 |
143 | def load_model(self):
144 | model_path = os.path.join("models", self.dataset)
145 | self.model = torch.load(os.path.join(model_path, "server" + ".pt"))
146 |
147 | @staticmethod
148 | def model_exists():
149 | return os.path.exists(os.path.join("models", "server" + ".pt"))
--------------------------------------------------------------------------------
/flearn/users/userfedl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import os
5 | import json
6 | from torch.utils.data import DataLoader
7 | from flearn.users.userbase import User
8 | from flearn.optimizers.fedoptimizer import *
9 | import copy
10 | # Implementation for FEDL clients
11 |
12 | class UserFEDL(User):
13 | def __init__(self, numeric_id, train_data, test_data, model, batch_size, learning_rate, hyper_learning_rate, L,
14 | local_epochs, optimizer):
15 | super().__init__(numeric_id, train_data, test_data, model[0], batch_size, learning_rate, hyper_learning_rate, L,
16 | local_epochs)
17 |
18 | if(model[1] == "linear"):
19 | self.loss = nn.MSELoss()
20 | else:
21 | self.loss = nn.NLLLoss()
22 |
23 | self.optimizer = FEDLOptimizer(self.model.parameters(), lr=self.learning_rate, hyper_lr= hyper_learning_rate, L = L)
24 |
25 | def get_full_grad(self):
26 | for X, y in self.trainloaderfull:
27 | self.model.zero_grad()
28 | output = self.model(X)
29 | loss = self.loss(output, y)
30 | loss.backward()
31 |
32 | def set_grads(self, new_grads):
33 | if isinstance(new_grads, nn.Parameter):
34 | for model_grad, new_grad in zip(self.model.parameters(), new_grads):
35 | model_grad.data = new_grad.data
36 | elif isinstance(new_grads, list):
37 | for idx, model_grad in enumerate(self.model.parameters()):
38 | model_grad.data = new_grads[idx]
39 |
40 | def train(self, epochs):
41 | self.clone_model_paramenter(self.model.parameters(), self.server_grad)
42 | self.get_grads(self.pre_local_grad)
43 | self.model.train()
44 | for epoch in range(1, self.local_epochs + 1):
45 | loss_per_epoch = 0
46 | for batch_idx, (X, y) in enumerate(self.trainloader):
47 | self.optimizer.zero_grad()
48 | output = self.model(X)
49 | loss = self.loss(output, y)
50 | loss.backward()
51 | self.optimizer.step(self.server_grad, self.pre_local_grad)
52 |
53 | self.optimizer.zero_grad()
54 | self.get_full_grad()
55 | return loss
56 |
57 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import h5py
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import argparse
6 | import importlib
7 | import random
8 | import os
9 | from flearn.servers.serveravg import FedAvg
10 | from flearn.servers.serverfedl import FEDL
11 | from flearn.trainmodel.models import *
12 | from utils.plot_utils import *
13 | import torch
14 | torch.manual_seed(0)
15 |
16 | def main(dataset, algorithm, model, batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters,
17 | local_epochs, optimizer, clients_per_round, rho, times):
18 |
19 | for i in range(times):
20 | print("---------------Running time:------------",i)
21 |
22 | # Generate model
23 | if(model == "mclr"): #for Mnist and Femnist datasets
24 | model = Mclr_Logistic(), model
25 |
26 | if(model == "linear"): # For Linear dataset
27 | model = Linear_Regression(40,1), model
28 |
29 | if(model == "dnn"): # for Mnist and Femnist datasets
30 | model = DNN(), model
31 |
32 | # select algorithm
33 | if(algorithm == "FedAvg"):
34 | server = FedAvg(dataset, algorithm, model, batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters, local_epochs, optimizer, clients_per_round, rho, i)
35 |
36 | if(algorithm == "FEDL"):
37 | server = FEDL(dataset, algorithm, model, batch_size, learning_rate, hyper_learning_rate, L, num_glob_iters, local_epochs, optimizer, clients_per_round, rho, i)
38 | server.train()
39 | server.test()
40 |
41 | # Average data
42 | average_data(num_users=clients_per_round, loc_ep1=local_epochs, Numb_Glob_Iters=num_glob_iters, lamb=L,learning_rate=learning_rate, hyper_learning_rate = hyper_learning_rate, algorithms=algorithm, batch_size=batch_size, dataset=dataset, rho = rho, times = times)
43 |
44 | if __name__ == "__main__":
45 | parser = argparse.ArgumentParser()
46 | parser.add_argument("--dataset", type=str, default="Mnist", choices=["Mnist", "Femnist", "Linear_synthetic", "Logistic_synthetic"])
47 | parser.add_argument("--model", type=str, default="mclr", choices=["linear", "mclr", "dnn"])
48 | parser.add_argument("--batch_size", type=int, default=20)
49 | parser.add_argument("--learning_rate", type=float, default=0.003, help="Local learning rate")
50 | parser.add_argument("--hyper_learning_rate", type=float, default = 0, help=" Learning rate of FEDL")
51 | parser.add_argument("--L", type=int, default=0, help="Regularization term")
52 | parser.add_argument("--num_global_iters", type=int, default=800)
53 | parser.add_argument("--local_epochs", type=int, default=20)
54 | parser.add_argument("--optimizer", type=str, default="SGD")
55 | parser.add_argument("--algorithm", type=str, default="FEDL",choices=["FEDL", "FedAvg"])
56 | parser.add_argument("--clients_per_round", type=int, default=10, help="Number of Users per round")
57 | parser.add_argument("--rho", type=float, default=0, help="Conditon Number")
58 | parser.add_argument("--times", type=int, default=1, help="running time")
59 | args = parser.parse_args()
60 |
61 | print("=" * 80)
62 | print("Summary of training process:")
63 | print("Algorithm: {}".format(args.algorithm))
64 | print("Batch size: {}".format(args.batch_size))
65 | print("Learing rate : {}".format(args.learning_rate))
66 | print("Hyper learing rate : {}".format(args.hyper_learning_rate))
67 | print("Subset of users : {}".format(args.clients_per_round))
68 | print("Number of local rounds : {}".format(args.local_epochs))
69 | print("Number of global rounds : {}".format(args.num_global_iters))
70 | print("Dataset : {}".format(args.dataset))
71 | print("Local Model : {}".format(args.model))
72 | print("=" * 80)
73 |
74 | main(
75 | dataset=args.dataset,
76 | algorithm = args.algorithm,
77 | model=args.model,
78 | batch_size=args.batch_size,
79 | learning_rate=args.learning_rate,
80 | hyper_learning_rate = args.hyper_learning_rate,
81 | L = args.L,
82 | num_glob_iters=args.num_global_iters,
83 | local_epochs=args.local_epochs,
84 | optimizer= args.optimizer,
85 | clients_per_round = args.clients_per_round,
86 | rho = args.rho,
87 | times = args.times
88 | )
89 |
--------------------------------------------------------------------------------
/plot_femnist.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import h5py
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import argparse
6 | import importlib
7 | import random
8 | import os
9 | from flearn.servers.serveravg import FedAvg
10 | from flearn.servers.serverfedl import FEDL
11 | from flearn.trainmodel.models import *
12 | from utils.plot_utils import *
13 | import torch
14 | torch.manual_seed(0)
15 |
16 | algorithms_list = ["FEDL","FedAvg","FEDL", "FEDL","FedAvg","FEDL", "FEDL","FedAvg","FEDL"]
17 | rho = [0,0,0,0,0,0,0,0,0,0,0,0]
18 | lamb_value = [0, 0, 0, 0, 0, 0, 0, 0, 0]
19 | learning_rate = [0.003, 0.003, 0.015, 0.003, 0.003, 0.015, 0.003, 0.003, 0.015]
20 | hyper_learning_rate = [0.2, 0, 0.5, 0.2, 0, 0.5, 0.2, 0, 0.5]
21 | local_ep = [10, 10, 10, 20, 20, 20, 40, 40, 40]
22 | batch_size = [20, 20, 0, 20, 20, 0, 20, 20, 0]
23 | DATA_SET = "Femnist"
24 | number_users = 10
25 |
26 | plot_summary_nist(num_users=number_users, loc_ep1=local_ep, Numb_Glob_Iters=800, lamb=lamb_value,
27 | learning_rate=learning_rate, hyper_learning_rate = hyper_learning_rate, algorithms_list=algorithms_list, batch_size=batch_size, rho = rho, dataset=DATA_SET)
28 | print("-- FINISH -- :",)
--------------------------------------------------------------------------------
/plot_linear.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import h5py
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | from utils.plot_utils import *
6 | import torch
7 | torch.manual_seed(0)
8 |
9 | algorithms_list = ["FEDL","FEDL","FEDL","FEDL","FEDL","FEDL","FEDL","FEDL","FEDL","FEDL","FEDL","FEDL"]
10 | rho = [1.4, 1.4, 1.4, 1.4, 2 ,2 , 2, 2, 5, 5, 5, 5]
11 | lamb_value = [0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0 ,0]
12 | learning_rate = [0.04,0.04,0.04,0.04, 0.04,0.04,0.04,0.04, 0.04,0.04,0.04,0.04]
13 | hyper_learning_rate = [0.01,0.03,0.05,0.07, 0.01,0.03,0.05,0.07, 0.01,0.03,0.05,0.07]
14 | local_ep = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
15 | batch_size = [0,0,0,0 ,0,0,0,0, 0,0,0,0]
16 | DATA_SET = "Linear_synthetic"
17 | number_users = 100
18 |
19 | plot_summary_linear(num_users=number_users, loc_ep1=local_ep, Numb_Glob_Iters=200, lamb=lamb_value, learning_rate=learning_rate, hyper_learning_rate = hyper_learning_rate, algorithms_list=algorithms_list, batch_size=batch_size, rho = rho, dataset=DATA_SET)
--------------------------------------------------------------------------------
/plot_mnist.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import h5py
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import argparse
6 | import importlib
7 | import random
8 | import os
9 | from flearn.servers.serveravg import FedAvg
10 | from flearn.servers.serverfedl import FEDL
11 | from flearn.trainmodel.models import *
12 | from utils.plot_utils import *
13 | import torch
14 | torch.manual_seed(0)
15 |
16 | algorithms_list = ["FEDL","FedAvg","FEDL","FedAvg","FEDL","FedAvg","FEDL","FEDL"]
17 | rho = [0,0,0,0,0,0,0,0,0,0,0,0,0]
18 | lamb_value = [0, 0, 0, 0, 0, 0,0, 0, 0, 0]
19 | learning_rate = [0.003,0.003,0.003,0.003,0.003,0.003,0.003,0.003]
20 | hyper_learning_rate = [0.2,0,0.2,0,0.2,0,2.0,4.0]
21 | local_ep = [20, 20, 20, 20, 20, 20, 20, 20]
22 | batch_size = [20,20,40,40,0,0,0,0]
23 | DATA_SET = "Mnist"
24 | number_users = 10
25 | plot_summary_mnist(num_users=number_users, loc_ep1=local_ep, Numb_Glob_Iters=800, lamb=lamb_value,learning_rate=learning_rate, hyper_learning_rate = hyper_learning_rate, algorithms_list=algorithms_list, batch_size=batch_size, rho = rho, dataset=DATA_SET)
26 | print("-- FINISH -- :",)
27 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | Pillow
4 | torch
5 | torchvision
6 | matplotlib
7 | tqdm
8 | h5py
9 | scikit-learn
--------------------------------------------------------------------------------
/results/Mnist_FedAvg_0.005_0.2_15_10u_20b_20_avg.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharlieDinh/FEDL_pytorch/4db34e5b698d46e2f73b94fb9c0ce00ef9b464f4/results/Mnist_FedAvg_0.005_0.2_15_10u_20b_20_avg.h5
--------------------------------------------------------------------------------
/results/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CharlieDinh/FEDL_pytorch/4db34e5b698d46e2f73b94fb9c0ce00ef9b464f4/results/README.md
--------------------------------------------------------------------------------
/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import os
4 | import torch
5 | import torch.nn as nn
6 |
7 | IMAGE_SIZE = 28
8 | IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE
9 | NUM_CHANNELS = 1
10 |
11 | def suffer_data(data):
12 | data_x = data['x']
13 | data_y = data['y']
14 | # randomly shuffle data
15 | np.random.seed(100)
16 | rng_state = np.random.get_state()
17 | np.random.shuffle(data_x)
18 | np.random.set_state(rng_state)
19 | np.random.shuffle(data_y)
20 | return (data_x, data_y)
21 |
22 | def batch_data(data, batch_size):
23 | '''
24 | data is a dict := {'x': [numpy array], 'y': [numpy array]} (on one client)
25 | returns x, y, which are both numpy array of length: batch_size
26 | '''
27 | data_x = data['x']
28 | data_y = data['y']
29 |
30 | # randomly shuffle data
31 | np.random.seed(100)
32 | rng_state = np.random.get_state()
33 | np.random.shuffle(data_x)
34 | np.random.set_state(rng_state)
35 | np.random.shuffle(data_y)
36 |
37 | # loop through mini-batches
38 | for i in range(0, len(data_x), batch_size):
39 | batched_x = data_x[i:i+batch_size]
40 | batched_y = data_y[i:i+batch_size]
41 | yield (batched_x, batched_y)
42 |
43 |
44 | def get_random_batch_sample(data_x, data_y, batch_size):
45 | num_parts = len(data_x)//batch_size + 1
46 | if(len(data_x) > batch_size):
47 | batch_idx = np.random.choice(list(range(num_parts +1)))
48 | sample_index = batch_idx*batch_size
49 | if(sample_index + batch_size > len(data_x)):
50 | return (data_x[sample_index:], data_y[sample_index:])
51 | else:
52 | return (data_x[sample_index: sample_index+batch_size], data_y[sample_index: sample_index+batch_size])
53 | else:
54 | return (data_x,data_y)
55 |
56 |
57 | def get_batch_sample(data, batch_size):
58 | data_x = data['x']
59 | data_y = data['y']
60 |
61 | np.random.seed(100)
62 | rng_state = np.random.get_state()
63 | np.random.shuffle(data_x)
64 | np.random.set_state(rng_state)
65 | np.random.shuffle(data_y)
66 |
67 | batched_x = data_x[0:batch_size]
68 | batched_y = data_y[0:batch_size]
69 | return (batched_x, batched_y)
70 |
71 | def read_data(dataset):
72 | '''parses data in given train and test data directories
73 |
74 | assumes:
75 | - the data in the input directories are .json files with
76 | keys 'users' and 'user_data'
77 | - the set of train set users is the same as the set of test set users
78 |
79 | Return:
80 | clients: list of client ids
81 | groups: list of group ids; empty list if none found
82 | train_data: dictionary of train data
83 | test_data: dictionary of test data
84 | '''
85 | train_data_dir = os.path.join('data',dataset,'data', 'train')
86 | test_data_dir = os.path.join('data',dataset,'data', 'test')
87 | clients = []
88 | groups = []
89 | train_data = {}
90 | test_data = {}
91 |
92 | train_files = os.listdir(train_data_dir)
93 | train_files = [f for f in train_files if f.endswith('.json')]
94 | for f in train_files:
95 | file_path = os.path.join(train_data_dir, f)
96 | with open(file_path, 'r') as inf:
97 | cdata = json.load(inf)
98 | clients.extend(cdata['users'])
99 | if 'hierarchies' in cdata:
100 | groups.extend(cdata['hierarchies'])
101 | train_data.update(cdata['user_data'])
102 |
103 | test_files = os.listdir(test_data_dir)
104 | test_files = [f for f in test_files if f.endswith('.json')]
105 | for f in test_files:
106 | file_path = os.path.join(test_data_dir, f)
107 | with open(file_path, 'r') as inf:
108 | cdata = json.load(inf)
109 | test_data.update(cdata['user_data'])
110 |
111 | clients = list(sorted(train_data.keys()))
112 |
113 | return clients, groups, train_data, test_data
114 |
115 | def read_user_data(index,data,dataset):
116 | id = data[0][index]
117 | train_data = data[2][id]
118 | test_data = data[3][id]
119 | X_train, y_train, X_test, y_test = train_data['x'], train_data['y'], test_data['x'], test_data['y']
120 | if(dataset == "Mnist"):
121 | X_train, y_train, X_test, y_test = train_data['x'], train_data['y'], test_data['x'], test_data['y']
122 | X_train = torch.Tensor(X_train).view(-1, NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).type(torch.float32)
123 | y_train = torch.Tensor(y_train).type(torch.int64)
124 | X_test = torch.Tensor(X_test).view(-1, NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).type(torch.float32)
125 | y_test = torch.Tensor(y_test).type(torch.int64)
126 | elif(dataset == "Linear_synthetic"):
127 | X_train = torch.Tensor(X_train).type(torch.float32)
128 | y_train = torch.Tensor(y_train).type(torch.float32).unsqueeze(1)
129 | X_test = torch.Tensor(X_test).type(torch.float32)
130 | y_test = torch.Tensor(y_test).type(torch.float32).unsqueeze(1)
131 | #y_train = torch.flatten(y_train, 1)
132 | #y_test = torch.flatten(y_test, 1)
133 | #print(y_test.size(),y_train.size())
134 | else:
135 | X_train = torch.Tensor(X_train).type(torch.float32)
136 | y_train = torch.Tensor(y_train).type(torch.int64)
137 | X_test = torch.Tensor(X_test).type(torch.float32)
138 | y_test = torch.Tensor(y_test).type(torch.int64)
139 | train_data = [(x, y) for x, y in zip(X_train, y_train)]
140 | test_data = [(x, y) for x, y in zip(X_test, y_test)]
141 | return id, train_data, test_data
142 |
143 | class Metrics(object):
144 | def __init__(self, clients, params):
145 | self.params = params
146 | num_rounds = params['num_rounds']
147 | self.bytes_written = {c.id: [0] * num_rounds for c in clients}
148 | self.client_computations = {c.id: [0] * num_rounds for c in clients}
149 | self.bytes_read = {c.id: [0] * num_rounds for c in clients}
150 | self.accuracies = []
151 | self.train_accuracies = []
152 |
153 | def update(self, rnd, cid, stats):
154 | bytes_w, comp, bytes_r = stats
155 | self.bytes_written[cid][rnd] += bytes_w
156 | self.client_computations[cid][rnd] += comp
157 | self.bytes_read[cid][rnd] += bytes_r
158 |
159 | def write(self):
160 | metrics = {}
161 | metrics['dataset'] = self.params['dataset']
162 | metrics['num_rounds'] = self.params['num_rounds']
163 | metrics['eval_every'] = self.params['eval_every']
164 | metrics['learning_rate'] = self.params['learning_rate']
165 | metrics['mu'] = self.params['mu']
166 | metrics['num_epochs'] = self.params['num_epochs']
167 | metrics['batch_size'] = self.params['batch_size']
168 | metrics['accuracies'] = self.accuracies
169 | metrics['train_accuracies'] = self.train_accuracies
170 | metrics['client_computations'] = self.client_computations
171 | metrics['bytes_written'] = self.bytes_written
172 | metrics['bytes_read'] = self.bytes_read
173 | metrics_dir = os.path.join('out', self.params['dataset'], 'metrics_{}_{}_{}_{}_{}.json'.format(
174 | self.params['seed'], self.params['optimizer'], self.params['learning_rate'], self.params['num_epochs'], self.params['mu']))
175 |         # create the output directory tree on first use
176 | if not os.path.exists('out'):
177 | os.mkdir('out')
178 | if not os.path.exists(os.path.join('out', self.params['dataset'])):
179 | os.mkdir(os.path.join('out', self.params['dataset']))
180 | with open(metrics_dir, 'w') as ouf:
181 | json.dump(metrics, ouf)
182 |
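The data-loading helpers above are typically combined when building per-client datasets. A minimal usage sketch (assuming the json files for the chosen dataset have already been generated under data/<dataset>/data/train and .../test, as the paths in read_data expect):

data = read_data("Mnist")            # returns (clients, groups, train_data, test_data)
for i in range(len(data[0])):        # data[0] is the sorted list of client ids
    uid, train, test = read_user_data(i, data, "Mnist")
    # train and test are lists of (x, y) tensor pairs, ready to feed a torch DataLoader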
--------------------------------------------------------------------------------
/utils/plot_utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import h5py
3 | import numpy as np
4 | from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, mark_inset
5 | plt.rcParams.update({'font.size': 14})
6 |
7 | def simple_read_data(loc_ep, alg):
8 |     with h5py.File("./results/"+'{}_{}.h5'.format(alg, loc_ep), 'r') as hf:
9 |         rs_glob_acc = np.array(hf.get('rs_glob_acc')[:])
10 |         rs_train_acc = np.array(hf.get('rs_train_acc')[:])
11 |         rs_train_loss = np.array(hf.get('rs_train_loss')[:])
12 |     return rs_train_acc, rs_train_loss, rs_glob_acc
13 |
14 | def get_training_data_value(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb=[], learning_rate=[],hyper_learning_rate=[],algorithms_list=[], batch_size=0, rho=[], dataset=""):
15 | Numb_Algs = len(algorithms_list)
16 | train_acc = np.zeros((Numb_Algs, Numb_Glob_Iters))
17 | train_loss = np.zeros((Numb_Algs, Numb_Glob_Iters))
18 | glob_acc = np.zeros((Numb_Algs, Numb_Glob_Iters))
19 | algs_lbl = algorithms_list.copy()
20 | for i in range(Numb_Algs):
21 | if(lamb[i] > 0):
22 | algorithms_list[i] = algorithms_list[i] + "_prox_" + str(lamb[i])
23 | algs_lbl[i] = algs_lbl[i] + "_prox"
24 |
25 | string_learning_rate = str(learning_rate[i])
26 |
27 | if(algorithms_list[i] == "FEDL"):
28 | string_learning_rate = string_learning_rate + "_" +str(hyper_learning_rate[i])
29 | algorithms_list[i] = algorithms_list[i] + \
30 | "_" + string_learning_rate + "_" + str(num_users) + \
31 | "u" + "_" + str(batch_size[i]) + "b" + "_" + str(loc_ep1[i])
32 | if(rho[i] > 0):
33 | algorithms_list[i] += "_" + str(rho[i])+"p"
34 |
35 | train_acc[i, :], train_loss[i, :], glob_acc[i, :] = np.array(
36 | simple_read_data("avg", dataset + "_" + algorithms_list[i]))[:, :Numb_Glob_Iters]
37 |         # algs_lbl[i] keeps the plain algorithm name for the legend
38 | return glob_acc, train_acc, train_loss
39 |
40 |
41 | def get_data_label_style(input_data = [], linestyles= [], algs_lbl = [], lamb = [], loc_ep1 = 0, batch_size =0):
42 | data, lstyles, labels = [], [], []
43 | for i in range(len(algs_lbl)):
44 | data.append(input_data[i, ::])
45 | lstyles.append(linestyles[i])
46 | labels.append(algs_lbl[i]+str(lamb[i])+"_" +
47 | str(loc_ep1[i])+"e" + "_" + str(batch_size[i]) + "b")
48 |
49 | return data, lstyles, labels
50 |
51 | def average_smooth(data, window_len=10, window='hanning'):
52 | results = []
53 | if window_len<3:
54 | return data
55 | for i in range(len(data)):
56 | x = data[i]
57 | s=np.r_[x[window_len-1:0:-1],x,x[-2:-window_len-1:-1]]
58 | #print(len(s))
59 | if window == 'flat': #moving average
60 | w=np.ones(window_len,'d')
61 | else:
62 |             w = getattr(np, window)(window_len)  # e.g. np.hanning; the module is imported as np, not numpy
63 |
64 | y=np.convolve(w/w.sum(),s,mode='valid')
65 | results.append(y[window_len-1:])
66 | return np.array(results)
67 |
68 | def plot_summary_one_figure(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb=[], learning_rate=[],hyper_learning_rate=[], algorithms_list=[], batch_size=0, rho = [], dataset = ""):
69 | Numb_Algs = len(algorithms_list)
70 | #glob_acc, train_acc, train_loss = get_training_data_value(
71 | # num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate,hyper_learning_rate, algorithms_list, batch_size, dataset)
72 |
73 | glob_acc_, train_acc_, train_loss_ = get_training_data_value(num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
74 | glob_acc = average_smooth(glob_acc_, window='flat')
75 | train_loss = average_smooth(train_loss_, window='flat')
76 | train_acc = average_smooth(train_acc_, window='flat')
77 |
78 | plt.figure(1)
79 | MIN = train_loss.min() - 0.001
80 | start = 0
81 | linestyles = ['-', '--', '-.', ':', '-', '--', '-.', ':', ':']
82 | plt.grid(True)
83 | for i in range(Numb_Algs):
84 | plt.plot(train_acc[i, 1:], linestyle=linestyles[i], label=algorithms_list[i] + str(lamb[i])+ "_"+str(loc_ep1[i])+"e" + "_" + str(batch_size[i]) + "b")
85 | plt.legend(loc='lower right')
86 | plt.ylabel('Training Accuracy')
87 | plt.xlabel('Global rounds ' + '$K_g$')
88 | plt.title(dataset.upper())
89 | #plt.ylim([0.8, glob_acc.max()])
90 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_acc.png', bbox_inches="tight")
91 | #plt.savefig(dataset + str(loc_ep1[1]) + 'train_acc.pdf')
92 | plt.figure(2)
93 |
94 | plt.grid(True)
95 | for i in range(Numb_Algs):
96 | plt.plot(train_loss[i, start:], linestyle=linestyles[i], label=algorithms_list[i] + str(lamb[i]) +
97 | "_"+str(loc_ep1[i])+"e" + "_" + str(batch_size[i]) + "b")
98 | #plt.plot(train_loss1[i, 1:], label=algs_lbl1[i])
99 | plt.legend(loc='upper right')
100 | plt.ylabel('Training Loss')
101 | plt.xlabel('Global rounds')
102 | plt.title(dataset.upper())
103 | #plt.ylim([train_loss.min(), 0.5])
104 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_loss.png', bbox_inches="tight")
105 | #plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.pdf')
106 | plt.figure(3)
107 | plt.grid(True)
108 | for i in range(Numb_Algs):
109 | plt.plot(glob_acc[i, start:], linestyle=linestyles[i],
110 | label=algorithms_list[i]+str(lamb[i])+"_"+str(loc_ep1[i])+"e" + "_" + str(batch_size[i]) + "b")
111 | #plt.plot(glob_acc1[i, 1:], label=algs_lbl1[i])
112 | plt.legend(loc='lower right')
113 | #plt.ylim([0.6, glob_acc.max()])
114 | plt.ylabel('Test Accuracy')
115 | plt.xlabel('Global rounds ')
116 | plt.title(dataset.upper())
117 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'glob_acc.png', bbox_inches="tight")
118 | #plt.savefig(dataset + str(loc_ep1[1]) + 'glob_acc.pdf')
119 |
120 | def get_max_value_index(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb=[], learning_rate=[], hyper_learning_rate=[], algorithms_list=[], batch_size=0, rho=[], dataset=""):
121 |     Numb_Algs = len(algorithms_list)
122 |     glob_acc, train_acc, train_loss = get_training_data_value(
123 |         num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
124 | for i in range(Numb_Algs):
125 |         print("Algorithm: ", algorithms_list[i], "Max testing accuracy: ", glob_acc[i].max(
126 |         ), "Index: ", np.argmax(glob_acc[i]), "local updates:", loc_ep1[i])
127 |
128 | def plot_summary_mnist(num_users=100, loc_ep1=[], Numb_Glob_Iters=10, lamb=[], learning_rate=[],hyper_learning_rate=[], algorithms_list=[], batch_size=0,rho = [], dataset=""):
129 | Numb_Algs = len(algorithms_list)
130 |
131 | #glob_acc, train_acc, train_loss = get_training_data_value(num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
132 |
133 | glob_acc_, train_acc_, train_loss_ = get_training_data_value(num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
134 | glob_acc = average_smooth(glob_acc_, window='flat')
135 | train_loss = average_smooth(train_loss_, window='flat')
136 | train_acc = average_smooth(train_acc_, window='flat')
137 |
138 | for i in range(Numb_Algs):
139 | print(algorithms_list[i], "acc:", glob_acc[i].max())
140 | print(algorithms_list[i], "loss:", train_loss[i].min())
141 |
142 | plt.figure(1)
143 |
144 | linestyles = ['-', '--', '-.', ':']
145 | algs_lbl = ["FEDL", "FedAvg",
146 | "FEDL", "FedAvg",
147 | "FEDL", "FedAvg",
148 | "FEDL", "FEDL"]
149 |
150 | fig = plt.figure(figsize=(12, 4))
151 | ax = fig.add_subplot(111) # The big subplot
152 | ax1 = fig.add_subplot(131)
153 | ax2 = fig.add_subplot(132)
154 | ax3 = fig.add_subplot(133)
155 | ax1.grid(True)
156 | ax2.grid(True)
157 | ax3.grid(True)
158 | #min = train_loss.min()
159 | min = train_loss.min() - 0.001
160 | max = 0.46
161 | #max = train_loss.max() + 0.01
162 | num_al = 2
163 | # Turn off axis lines and ticks of the big subplot
164 | ax.spines['top'].set_color('none')
165 | ax.spines['bottom'].set_color('none')
166 | ax.spines['left'].set_color('none')
167 | ax.spines['right'].set_color('none')
168 | ax.tick_params(labelcolor='w', top='off',
169 | bottom='off', left='off', right='off')
170 | for i in range(num_al):
171 | stringbatch = str(batch_size[i])
172 | if(stringbatch == '0'):
173 | stringbatch = '$\infty$'
174 | ax1.plot(train_loss[i, 1:], linestyle=linestyles[i],
175 | label=algs_lbl[i] + " : " + '$B = $' + stringbatch+ ', $\eta = $'+ str(hyper_learning_rate[i]))
176 | ax1.set_ylim([min, max])
177 | ax1.legend(loc='upper right', prop={'size': 10})
178 |
179 | for i in range(num_al):
180 | stringbatch = str(batch_size[i+2])
181 | if(stringbatch == '0'):
182 | stringbatch = '$\infty$'
183 | ax2.plot(train_loss[i+num_al, 1:], linestyle=linestyles[i],
184 | label=algs_lbl[i + num_al] + " : " + '$B = $' + stringbatch+ ', $\eta = $'+ str(hyper_learning_rate[i+num_al]))
185 | ax2.set_ylim([min, max])
186 | ax2.legend(loc='upper right', prop={'size': 10})
187 |
188 | for i in range(4):
189 | stringbatch = str(batch_size[i+4])
190 | if(stringbatch == '0'):
191 | stringbatch = '$\infty$'
192 | ax3.plot(train_loss[i+num_al*2, 1:], linestyle=linestyles[i],
193 | label=algs_lbl[i + num_al*2] + " : " + '$B = $' + stringbatch+ ', $\eta = $'+ str(hyper_learning_rate[i+num_al*2]))
194 | ax3.set_ylim([min, max])
195 | ax3.legend(loc='upper right', prop={'size': 10})
196 |
197 | ax.set_title('MNIST', y=1.02)
198 | ax.set_xlabel('Global rounds ' + '$K_g$')
199 | ax.set_ylabel('Training Loss', labelpad=15)
200 | plt.savefig(dataset + str(loc_ep1[1]) +
201 | 'train_loss.pdf', bbox_inches='tight')
202 | plt.savefig(dataset + str(loc_ep1[1]) +
203 | 'train_loss.png', bbox_inches='tight')
204 |
205 | fig = plt.figure(figsize=(12, 4))
206 | ax = fig.add_subplot(111) # The big subplot
207 | ax1 = fig.add_subplot(131)
208 | ax2 = fig.add_subplot(132)
209 | ax3 = fig.add_subplot(133)
210 | ax1.grid(True)
211 | ax2.grid(True)
212 | ax3.grid(True)
213 | #min = train_loss.min()
214 | min = 0.82
215 | max = glob_acc.max() + 0.001 # train_loss.max() + 0.01
216 | num_al = 2
217 | # Turn off axis lines and ticks of the big subplot
218 | ax.spines['top'].set_color('none')
219 | ax.spines['bottom'].set_color('none')
220 | ax.spines['left'].set_color('none')
221 | ax.spines['right'].set_color('none')
222 | ax.tick_params(labelcolor='w', top='off',
223 | bottom='off', left='off', right='off')
224 | for i in range(num_al):
225 | stringbatch = str(batch_size[i])
226 | if(stringbatch == '0'):
227 | stringbatch = '$\infty$'
228 | ax1.plot(glob_acc[i, 1:], linestyle=linestyles[i],
229 | label=algs_lbl[i] + " : " + '$B = $' + stringbatch + ', $\eta = $'+ str(hyper_learning_rate[i]))
230 | ax1.set_ylim([min, max])
231 | ax1.legend(loc='lower right', prop={'size': 10})
232 |
233 | for i in range(num_al):
234 | stringbatch = str(batch_size[i+2])
235 | if(stringbatch == '0'):
236 | stringbatch = '$\infty$'
237 | ax2.plot(glob_acc[i+num_al, 1:], linestyle=linestyles[i],
238 | label=algs_lbl[i + num_al] + " : " + '$B = $' + stringbatch+ ', $\eta = $'+ str(hyper_learning_rate[i+num_al*1]))
239 | ax2.set_ylim([min, max])
240 | ax2.legend(loc='lower right', prop={'size': 10})
241 |
242 | for i in range(4):
243 | stringbatch = str(batch_size[i+4])
244 | if(stringbatch == '0'):
245 | stringbatch = '$\infty$'
246 | ax3.plot(glob_acc[i+num_al*2, 1:], linestyle=linestyles[i],
247 | label=algs_lbl[i + num_al*2] + " : " + '$B = $' + stringbatch + ', $\eta = $'+ str(hyper_learning_rate[i+num_al*2]))
248 | ax3.set_ylim([min, max])
249 | ax3.legend(loc='lower right', prop={'size': 10})
250 |
251 | ax.set_title('MNIST', y=1.02)
252 | ax.set_xlabel('Global rounds ' + '$K_g$')
253 | ax.set_ylabel('Testing Accuracy', labelpad=15)
254 | plt.savefig(dataset + str(loc_ep1[1]) + 'test_accu.pdf', bbox_inches='tight')
255 | plt.savefig(dataset + str(loc_ep1[1]) + 'test_accu.png', bbox_inches='tight')
256 |
257 |
258 | def plot_summary_nist(num_users=100, loc_ep1=[], Numb_Glob_Iters=10, lamb=[], learning_rate=[], hyper_learning_rate=[], algorithms_list=[], batch_size=0,rho = [], dataset=""):
259 | Numb_Algs = len(algorithms_list)
260 | #glob_acc, train_acc, train_loss = get_training_data_value( num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
261 | glob_acc_, train_acc_, train_loss_ = get_training_data_value(num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
262 | glob_acc = average_smooth(glob_acc_, window='flat')
263 | train_loss = average_smooth(train_loss_, window='flat')
264 | train_acc = average_smooth(train_acc_, window='flat')
265 | for i in range(Numb_Algs):
266 | print(algorithms_list[i], "acc:", glob_acc[i].max())
267 |         print(algorithms_list[i], "loss:", train_loss[i].min())
268 | plt.figure(1)
269 |
270 | linestyles = ['-', '--', '-.', ':']
271 | algs_lbl = ["FEDL","FedAvg", "FEDL",
272 | "FEDL", "FedAvg", "FEDL",
273 | "FEDL", "FedAvg", "FEDL"]
274 | fig = plt.figure(figsize=(12, 4))
275 |
276 | ax = fig.add_subplot(111) # The big subplot
277 | ax1 = fig.add_subplot(131)
278 | ax2 = fig.add_subplot(132)
279 | ax3 = fig.add_subplot(133)
280 | ax1.grid(True)
281 | ax2.grid(True)
282 | ax3.grid(True)
283 | #min = train_loss.min()
284 | min = train_loss.min() - 0.01
285 | max = 3 # train_loss.max() + 0.01
286 | num_al = 3
287 | # Turn off axis lines and ticks of the big subplot
288 | ax.spines['top'].set_color('none')
289 | ax.spines['bottom'].set_color('none')
290 | ax.spines['left'].set_color('none')
291 | ax.spines['right'].set_color('none')
292 | ax.tick_params(labelcolor='w', top='off',
293 | bottom='off', left='off', right='off')
294 | for i in range(num_al):
295 | stringbatch = str(batch_size[i])
296 | if(stringbatch == '0'):
297 | stringbatch = '$\infty$'
298 | ax1.plot(train_loss[i, 1:], linestyle=linestyles[i],
299 | label=algs_lbl[i] + " : " + '$B = $' + stringbatch + ', $\eta = $' + str(hyper_learning_rate[i]) + ', $K_l = $' + str(loc_ep1[i]))
300 | ax1.set_ylim([min, max])
301 | ax1.legend(loc='upper right', prop={'size': 10})
302 |
303 | for i in range(num_al):
304 | stringbatch = str(batch_size[i+num_al])
305 | if(stringbatch == '0'):
306 | stringbatch = '$\infty$'
307 | ax2.plot(train_loss[i+num_al, 1:], linestyle=linestyles[i],
308 | label=algs_lbl[i + num_al] + " : " + '$B = $' + stringbatch + ', $\eta = $' + str(hyper_learning_rate[i+num_al]) + ', $K_l = $' + str(loc_ep1[i+ num_al]))
309 | ax2.set_ylim([min, max])
310 | ax2.legend(loc='upper right', prop={'size': 10})
311 |
312 | for i in range(num_al):
313 | stringbatch = str(batch_size[i+num_al*2])
314 | if(stringbatch == '0'):
315 | stringbatch = '$\infty$'
316 | ax3.plot(train_loss[i+num_al*2, 1:], linestyle=linestyles[i],
317 | label=algs_lbl[i + num_al*2] + " : " + '$B = $' + stringbatch + ', $\eta = $' + str(hyper_learning_rate[i+num_al*2]) + ', $K_l = $' + str(loc_ep1[i + num_al*2]))
318 | ax3.set_ylim([min, max])
319 | ax3.legend(loc='upper right', prop={'size': 10})
320 |
321 | ax.set_title('FEMNIST', y=1.02)
322 | ax.set_xlabel('Global rounds ' + '$K_g$')
323 | ax.set_ylabel('Training Loss', labelpad=15)
324 | plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.pdf', bbox_inches='tight')
325 | plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.png', bbox_inches='tight')
326 |
327 | fig = plt.figure(figsize=(12, 4))
328 | ax = fig.add_subplot(111) # The big subplot
329 | ax1 = fig.add_subplot(131)
330 | ax2 = fig.add_subplot(132)
331 | ax3 = fig.add_subplot(133)
332 | ax1.grid(True)
333 | ax2.grid(True)
334 | ax3.grid(True)
335 | #min = train_loss.min()
336 | num_al = 3
337 | min = 0.3
338 | max = glob_acc.max() + 0.01 # train_loss.max() + 0.01
339 | # Turn off axis lines and ticks of the big subplot
340 | ax.spines['top'].set_color('none')
341 | ax.spines['bottom'].set_color('none')
342 | ax.spines['left'].set_color('none')
343 | ax.spines['right'].set_color('none')
344 | ax.tick_params(labelcolor='w', top='off',
345 | bottom='off', left='off', right='off')
346 | for i in range(num_al):
347 | stringbatch = str(batch_size[i])
348 | if(stringbatch == '0'):
349 | stringbatch = '$\infty$'
350 | ax1.plot(glob_acc[i, 1:], linestyle=linestyles[i],
351 | label=algs_lbl[i] + " : " + '$B = $' + stringbatch + ', $\eta = $' + str(hyper_learning_rate[i]) + ', $K_l = $' + str(loc_ep1[i]))
352 | ax1.set_ylim([min, max])
353 | ax1.legend(loc='lower right', prop={'size': 10})
354 |
355 | for i in range(num_al):
356 | stringbatch = str(batch_size[i+num_al])
357 | if(stringbatch == '0'):
358 | stringbatch = '$\infty$'
359 | ax2.plot(glob_acc[i+num_al, 1:], linestyle=linestyles[i],
360 | label=algs_lbl[i + num_al] + " : " + '$B = $' + stringbatch + ', $\eta = $' + str(hyper_learning_rate[i+num_al*1]) + ', $K_l = $' + str(loc_ep1[i + num_al]))
361 | ax2.set_ylim([min, max])
362 | ax2.legend(loc='lower right', prop={'size': 10})
363 |
364 | for i in range(num_al):
365 | stringbatch = str(batch_size[i+num_al*2])
366 | if(stringbatch == '0'):
367 | stringbatch = '$\infty$'
368 | ax3.plot(glob_acc[i+num_al*2, 1:], linestyle=linestyles[i],
369 | label=algs_lbl[i + num_al*2] + " : " + '$B = $' + stringbatch + ', $\eta = $' + str(hyper_learning_rate[i+num_al*2]) + ', $K_l = $' + str(loc_ep1[i+ 2*num_al]))
370 | ax3.set_ylim([min, max])
371 | ax3.legend(loc='lower right', prop={'size': 10})
372 |
373 | ax.set_title('FEMNIST', y=1.02)
374 | ax.set_xlabel('Global rounds ' + '$K_g$')
375 | ax.set_ylabel('Testing Accuracy', labelpad=15)
376 | plt.savefig(dataset + str(loc_ep1[1]) + 'test_accu.pdf', bbox_inches='tight')
377 | plt.savefig(dataset + str(loc_ep1[1]) + 'test_accu.png', bbox_inches='tight')
378 |
379 | def plot_summary_linear(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb=[], learning_rate=[],hyper_learning_rate=[], algorithms_list=[], batch_size=0,rho = [], dataset = ""):
380 |
381 | Numb_Algs = len(algorithms_list)
382 | glob_acc, train_acc, train_loss = get_training_data_value( num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
383 | for i in range(Numb_Algs):
384 | print(algorithms_list[i], "loss:", glob_acc[i].max())
385 | plt.figure(1)
386 |
387 | linestyles = ['-', '-', '-', '-']
388 | markers = ["o","v","s","*","x","P"]
389 | algs_lbl = ["FEDL","FEDL", "FEDL","FEDL",
390 | "FEDL", "FEDL", "FEDL","FEDL",
391 | "FEDL", "FEDL", "FEDL","FEDL"]
392 | fig = plt.figure(figsize=(12, 4))
393 | ax = fig.add_subplot(111) # The big subplot
394 | ax1 = fig.add_subplot(131)
395 | ax2 = fig.add_subplot(132)
396 | ax3 = fig.add_subplot(133)
397 | #min = train_loss.min()
398 | num_al = 4
399 | # Turn off axis lines and ticks of the big subplot
400 | ax.spines['top'].set_color('none')
401 | ax.spines['bottom'].set_color('none')
402 | ax.spines['left'].set_color('none')
403 | ax.spines['right'].set_color('none')
404 | ax.tick_params(labelcolor='w', top='off',
405 | bottom='off', left='off', right='off')
406 | for i in range(num_al):
407 | ax1.plot(train_loss[i, 1:], linestyle=linestyles[i], label=algs_lbl[i] + ": "+ '$\eta = $' + str(hyper_learning_rate[i]) ,marker = markers[i],markevery=0.4, markersize=5)
408 |
409 | ax1.hlines(y=0.035,xmin=0, xmax=200, linestyle='--',label = "optimal solution", color= "m" )
410 | ax1.legend(loc='upper right', prop={'size': 10})
411 | ax1.set_ylim([0.02, 0.5])
412 | ax1.set_title('$\\rho = $' + str(rho[0]))
413 | ax1.grid(True)
414 | for i in range(num_al):
415 |         str_rho = ', $\rho = $' + str(rho[i])
416 | ax2.plot(train_loss[i+num_al, 1:], linestyle=linestyles[i], label=algs_lbl[i + num_al] + ": "+ '$\eta = $' + str(hyper_learning_rate[i+num_al]) ,marker = markers[i],markevery=0.4, markersize=5)
417 |
418 | ax2.hlines(y=0.035,xmin=0, xmax=200, linestyle='--',label = "optimal solution", color= "m" )
419 | ax2.set_ylim([0.02, 0.5])
420 | #ax2.legend(loc='upper right')
421 | ax2.set_title('$\\rho = $' + str(rho[0+ num_al]))
422 | ax2.grid(True)
423 | for i in range(num_al):
424 | str_rho = ', $\rho = $' + str(rho[i])
425 | ax3.plot(train_loss[i+num_al*2, 1:], linestyle=linestyles[i], label=algs_lbl[i + num_al*2] + ": "+ '$\eta = $' + str(hyper_learning_rate[i+num_al*2]) ,marker = markers[i], markevery=0.4, markersize=5)
426 |
427 | ax3.hlines(y=0.035, xmin=0, xmax=200, linestyle='--',
428 | label="optimal solution", color="m")
429 | ax3.set_ylim([0.02, 0.5])
430 | #ax3.legend(loc='upper right')
431 | ax3.set_title('$\\rho = $' + str(rho[0+ 2*num_al]))
432 | ax3.grid(True)
433 | ax.set_title('Synthetic dataset', y=1.1)
434 | ax.set_xlabel('Global rounds ' + '$K_g$')
435 | ax.set_ylabel('Training Loss')
436 | plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.pdf', bbox_inches='tight')
437 | plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.png', bbox_inches='tight')
438 |
439 | def get_all_training_data_value(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb = 0, learning_rate=0, hyper_learning_rate=0, algorithms="", batch_size=0, dataset="", rho= 0, times = 5):
440 | train_acc = np.zeros((times, Numb_Glob_Iters))
441 | train_loss = np.zeros((times, Numb_Glob_Iters))
442 | glob_acc = np.zeros((times, Numb_Glob_Iters))
443 | algorithms_list = [algorithms] * times
444 |
445 | for i in range(times):
446 | if(lamb > 0):
447 | algorithms_list[i] = algorithms_list[i] + "_prox_" + str(lamb)
448 |
449 | string_learning_rate = str(learning_rate)
450 |
451 | if(algorithms_list[i] == "FEDL"):
452 | string_learning_rate = string_learning_rate + "_" +str(hyper_learning_rate)
453 |
454 | algorithms_list[i] = algorithms_list[i] + "_" + string_learning_rate + "_" + str(num_users) + "u" + "_" + str(batch_size) + "b" + "_" + str(loc_ep1)
455 |
456 | if(rho > 0):
457 | algorithms_list[i] += "_" + str(rho) + "p"
458 |
459 | train_acc[i, :], train_loss[i, :], glob_acc[i, :] = np.array(
460 | simple_read_data(str(i) , dataset + "_" + algorithms_list[i]))[:, :Numb_Glob_Iters]
461 |
462 | return glob_acc, train_acc, train_loss
463 |
464 |
465 | def average_data(num_users, loc_ep1, Numb_Glob_Iters, lamb,learning_rate, hyper_learning_rate, algorithms, batch_size, dataset, rho, times):
466 | glob_acc, train_acc, train_loss = get_all_training_data_value( num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms, batch_size, dataset, rho, times)
467 | # store average value to h5 file
468 | glob_acc_data = np.average(glob_acc, axis=0)
469 | train_acc_data = np.average(train_acc, axis=0)
470 | train_loss_data = np.average(train_loss, axis=0)
471 |
472 | max_accurancy = []
473 | for i in range(times):
474 | max_accurancy.append(glob_acc[i].max())
475 | print("std:", np.std(max_accurancy))
476 | print("Mean:", np.mean(max_accurancy))
477 |
478 | alg = dataset + "_" + algorithms
479 | alg += "_" + str(learning_rate)
480 |
481 | if(algorithms == "FEDL"):
482 | alg += "_" + str(hyper_learning_rate)
483 |
484 | alg += "_" + str(num_users) + "u" + "_" + str(batch_size) + "b" + "_" + str(loc_ep1)
485 |
486 | if(lamb > 0):
487 | alg += "_" + str(lamb) + "L"
488 |
489 | if(rho > 0):
490 | alg += "_" + str(rho) + "p"
491 |
492 | #alg = alg + "_" + str(learning_rate) + "_" + str(hyper_learning_rate) + "_" + str(lamb) + "_" + str(num_users) + "u" + "_" + str(batch_size) + "b" + "_" + str(loc_ep1)
493 | alg = alg + "_" + "avg"
494 |     if len(glob_acc) != 0 and len(train_acc) != 0 and len(train_loss) != 0:
495 |         with h5py.File("./results/"+'{}.h5'.format(alg), 'w') as hf:
496 |             hf.create_dataset('rs_glob_acc', data=glob_acc_data)
497 |             hf.create_dataset('rs_train_acc', data=train_acc_data)
498 |             hf.create_dataset('rs_train_loss', data=train_loss_data)
499 |             # the with-block closes the file; no explicit hf.close() needed
500 | return 0
501 |
502 | def plot_summary_one_mnist(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb=[], learning_rate=[],hyper_learning_rate=[], algorithms_list=[], batch_size=0, rho = [], dataset = ""):
503 | Numb_Algs = len(algorithms_list)
504 | #glob_acc, train_acc, train_loss = get_training_data_value(
505 | # num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate,hyper_learning_rate, algorithms_list, batch_size, dataset)
506 |
507 | glob_acc_, train_acc_, train_loss_ = get_training_data_value(num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
508 | glob_acc = average_smooth(glob_acc_, window='flat')
509 | train_loss = average_smooth(train_loss_, window='flat')
510 | train_acc = average_smooth(train_acc_, window='flat')
511 |
512 | plt.figure(1)
513 | MIN = train_loss.min() - 0.001
514 | start = 0
515 | linestyles = ['-', '--', '-.', ':']
516 | markers = ["o","v","s","*","x","P"]
517 | algs_lbl = ["FEDL","FedAvg","FEDL","FedAvg"]
518 | plt.grid(True)
519 | for i in range(Numb_Algs):
520 | stringbatch = str(batch_size[i])
521 | if(stringbatch == '0'):
522 | stringbatch = '$\infty$'
523 | plt.plot(train_acc[i, 1:], linestyle=linestyles[i],marker = markers[i],label=algs_lbl[i] + " : " + '$B = $' + stringbatch, markevery=0.4, markersize=5)
524 |
525 | plt.legend(loc='lower right')
526 | plt.ylabel('Training Accuracy')
527 | plt.xlabel('Global rounds ' + '$K_g$')
528 | plt.title(dataset.upper())
529 | plt.ylim([0.85, train_acc.max()])
530 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_acc.png', bbox_inches="tight")
531 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_acc.pdf', bbox_inches="tight")
532 | #plt.savefig(dataset + str(loc_ep1[1]) + 'train_acc.pdf')
533 | plt.figure(2)
534 |
535 | plt.grid(True)
536 | for i in range(Numb_Algs):
537 | stringbatch = str(batch_size[i])
538 | if(stringbatch == '0'):
539 | stringbatch = '$\infty$'
540 | plt.plot(train_loss[i, 1:], linestyle=linestyles[i],marker = markers[i],label=algs_lbl[i] + " : " + '$B = $' + stringbatch, markevery=0.4, markersize=5)
541 |
542 | #plt.plot(train_loss1[i, 1:], label=algs_lbl1[i])
543 | plt.legend(loc='upper right')
544 | plt.ylabel('Training Loss')
545 | plt.xlabel('Global rounds')
546 | plt.title(dataset.upper())
547 | plt.ylim([train_loss.min() -0.01, 0.7])
548 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_loss.png', bbox_inches="tight")
549 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_loss.pdf', bbox_inches="tight")
550 | #plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.pdf')
551 | plt.figure(3)
552 | plt.grid(True)
553 | for i in range(Numb_Algs):
554 | stringbatch = str(batch_size[i])
555 | if(stringbatch == '0'):
556 | stringbatch = '$\infty$'
557 | plt.plot(glob_acc[i, 1:], linestyle=linestyles[i],marker = markers[i],label=algs_lbl[i] + " : " + '$B = $' + stringbatch, markevery=0.4, markersize=5)
558 | #plt.plot(glob_acc1[i, 1:], label=algs_lbl1[i])
559 | plt.legend(loc='lower right')
560 | plt.ylim([0.8, glob_acc.max() + 0.005])
561 | plt.ylabel('Test Accuracy')
562 | plt.xlabel('Global rounds ')
563 | plt.title(dataset.upper())
564 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'glob_acc.png', bbox_inches="tight")
565 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'glob_acc.pdf', bbox_inches="tight")
566 | #plt.savefig(dataset + str(loc_ep1[1]) + 'glob_acc.pdf')
567 |
568 |
569 | def plot_summary_one_nist(num_users=100, loc_ep1=5, Numb_Glob_Iters=10, lamb=[], learning_rate=[],hyper_learning_rate=[], algorithms_list=[], batch_size=0, rho = [], dataset = ""):
570 | Numb_Algs = len(algorithms_list)
571 | #glob_acc, train_acc, train_loss = get_training_data_value(
572 | # num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate,hyper_learning_rate, algorithms_list, batch_size, dataset)
573 |
574 | glob_acc_, train_acc_, train_loss_ = get_training_data_value(num_users, loc_ep1, Numb_Glob_Iters, lamb, learning_rate, hyper_learning_rate, algorithms_list, batch_size, rho, dataset)
575 | glob_acc = average_smooth(glob_acc_, window='flat')
576 | train_loss = average_smooth(train_loss_, window='flat')
577 | train_acc = average_smooth(train_acc_, window='flat')
578 |
579 | plt.figure(1)
580 | MIN = train_loss.min() - 0.001
581 | start = 0
582 | linestyles = ['-', '--', '-.', ':']
583 | markers = ["o","v","s","*","x","P"]
584 | algs_lbl = ["FEDL","FedAvg","FedAvg"]
585 | plt.grid(True)
586 | for i in range(Numb_Algs):
587 | stringbatch = str(batch_size[i])
588 | if(stringbatch == '0'):
589 | stringbatch = '$\infty$'
590 | plt.plot(train_acc[i, 1:], linestyle=linestyles[i],marker = markers[i],label=algs_lbl[i] + " : " + '$B = $' + stringbatch, markevery=0.4, markersize=5)
591 |
592 | plt.legend(loc='lower right')
593 | plt.ylabel('Training Accuracy')
594 | plt.xlabel('Global rounds ' + '$K_g$')
595 | plt.title('FEMNIST')
596 | #plt.ylim([0.85, train_acc.max()])
597 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_acc.png', bbox_inches="tight")
598 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_acc.pdf', bbox_inches="tight")
599 | #plt.savefig(dataset + str(loc_ep1[1]) + 'train_acc.pdf')
600 | plt.figure(2)
601 |
602 | plt.grid(True)
603 | for i in range(Numb_Algs):
604 | stringbatch = str(batch_size[i])
605 | if(stringbatch == '0'):
606 | stringbatch = '$\infty$'
607 | plt.plot(train_loss[i, 1:], linestyle=linestyles[i],marker = markers[i],label=algs_lbl[i] + " : " + '$B = $' + stringbatch, markevery=0.4, markersize=5)
608 |
609 | #plt.plot(train_loss1[i, 1:], label=algs_lbl1[i])
610 | plt.legend(loc='upper right')
611 | plt.ylabel('Training Loss')
612 | plt.xlabel('Global rounds')
613 | plt.title('FEMNIST')
614 | #plt.ylim([train_loss.min(), 0.7])
615 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_loss.png', bbox_inches="tight")
616 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'train_loss.pdf', bbox_inches="tight")
617 | #plt.savefig(dataset + str(loc_ep1[1]) + 'train_loss.pdf')
618 | plt.figure(3)
619 | plt.grid(True)
620 | for i in range(Numb_Algs):
621 | stringbatch = str(batch_size[i])
622 | if(stringbatch == '0'):
623 | stringbatch = '$\infty$'
624 | plt.plot(glob_acc[i, 1:], linestyle=linestyles[i],marker = markers[i],label=algs_lbl[i] + " : " + '$B = $' + stringbatch, markevery=0.4, markersize=5)
625 | #plt.plot(glob_acc1[i, 1:], label=algs_lbl1[i])
626 | plt.legend(loc='lower right')
627 | #plt.ylim([0.8, glob_acc.max() + 0.005])
628 | plt.ylabel('Test Accuracy')
629 | plt.xlabel('Global rounds ')
630 | plt.title('FEMNIST')
631 | #ax.set_title('FEMNIST', y=1.02)
632 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'glob_acc.png', bbox_inches="tight")
633 | plt.savefig(dataset.upper() + str(loc_ep1[1]) + 'glob_acc.pdf', bbox_inches="tight")
634 | #plt.savefig(dataset + str(loc_ep1[1]) + 'glob_acc.pdf')
635 |
--------------------------------------------------------------------------------
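After training finishes, the averaging and plotting utilities above might be driven as in the sketch below (assuming the repository root is on PYTHONPATH and the per-run .h5 files are already in results/; the hyper-parameter values are illustrative and must match the ones used for training):

from utils.plot_utils import average_data, plot_summary_one_figure

# collapse the 10 runs of each MNIST algorithm into a single *_avg.h5 file
for alg, eta in [("FEDL", 0.2), ("FedAvg", 0)]:
    average_data(num_users=10, loc_ep1=20, Numb_Glob_Iters=800, lamb=0,
                 learning_rate=0.003, hyper_learning_rate=eta, algorithms=alg,
                 batch_size=20, dataset="Mnist", rho=0, times=10)

# plot FEDL against FedAvg from the averaged results (per-algorithm settings are lists)
plot_summary_one_figure(num_users=10, loc_ep1=[20, 20], Numb_Glob_Iters=800,
                        lamb=[0, 0], learning_rate=[0.003, 0.003],
                        hyper_learning_rate=[0.2, 0], algorithms_list=["FEDL", "FedAvg"],
                        batch_size=[20, 20], rho=[0, 0], dataset="Mnist")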