├── requirements.txt ├── data ├── synthetic │ ├── README.md │ ├── generate_data.sh │ ├── split_data.py │ └── generate_data.py ├── femnist │ ├── README.md │ ├── group_by_writer.py │ ├── match_hashes.py │ ├── split_data.sh │ ├── get_hashes.py │ ├── preprocess.sh │ ├── get_file_dirs.py │ └── split_data.py ├── sent140 │ ├── README.md │ ├── combine_data.py │ ├── preprocess.sh │ ├── split_data.sh │ └── split_data.py ├── shakespeare │ ├── README.md │ ├── preprocess.sh │ ├── split_data.sh │ ├── split_data.py │ └── preprocess_shakespeare.py ├── inaturalist │ ├── README.md │ ├── preprocess.sh │ └── split_data.py └── README.md ├── utils ├── metrics.py ├── logger.py ├── optim.py └── args.py ├── graph_utils ├── generate_all_networks.sh ├── data │ ├── Read_me_gml.txt │ └── gaia.gml ├── README.md ├── show_networks.py ├── utils │ ├── mbst.py │ ├── evaluate_throughput.py │ ├── tsp_christofides.py │ ├── matcha.py │ ├── matching_decomposition.py │ └── utils.py ├── time_simulator.py └── generate_networks.py ├── .gitignore ├── loaders ├── synthetic.py ├── sent140.py ├── shakespeare.py ├── femnist.py └── inaturalist.py ├── reproduce_results.py ├── make_table3.py ├── communication_module ├── worker.py └── manager.py ├── models ├── inaturalist │ └── resnet.py ├── model.py ├── synthetic │ └── linear.py ├── femnist │ └── cnn.py ├── sent140 │ └── lstm.py └── shakespeare │ └── gru.py ├── main.py ├── make_figure2.py ├── README.md └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | cvxpy 2 | tensorboard 3 | geopy 4 | PIL 5 | scikit-learn 6 | networkx == 2.4 7 | numpy 8 | torch 9 | torchvision 10 | scipy 11 | matplotlib 12 | jupyter 13 | torchtext 14 | spacy 15 | mplleaflet -------------------------------------------------------------------------------- /data/synthetic/README.md: -------------------------------------------------------------------------------- 1 | # Synthetic Dataset 2 | 3 | ## Setup Instructions 4 | 5 | Run generate_data.sh with a choice of the following tags: 6 | 7 | - ```-nw```: number of workers, written as integer 8 | - ```-nc``` : number of classes, written as integer 9 | - ```-dim```: dimension of the data, written as integer 10 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 11 | - ```--seed``` := seed to be used before random sampling of data 12 | 13 | i.e. 
14 | - ```./generate_data.sh -nw 11 -nc 2 -dim 10 --tf 0.8 --seed 1234``` 15 | -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def binary_accuracy(preds, y): 5 | """ 6 | Compute accuracy for binary classification from raw model outputs. 7 | :param preds: raw predictions (logits), same shape as y 8 | :param y: binary ground-truth labels 9 | :return: accuracy as a scalar torch tensor in [0, 1] 10 | """ 11 | # round predictions to the closest integer 12 | rounded_preds = torch.round(torch.sigmoid(preds)) 13 | correct = (rounded_preds == y).float() 14 | acc = correct.sum() / len(correct) 15 | return acc 16 | 17 | 18 | def accuracy(preds, y): 19 | """ 20 | Compute top-1 accuracy for multi-class classification. 21 | :param preds: model outputs of shape (batch_size, n_classes) 22 | :param y: ground-truth class indices of shape (batch_size,) 23 | :return: accuracy as a scalar torch tensor in [0, 1] 24 | """ 25 | _, predicted = torch.max(preds, 1) 26 | correct = (predicted == y).float() 27 | acc = correct.sum() / len(correct) 28 | return acc -------------------------------------------------------------------------------- /data/femnist/README.md: -------------------------------------------------------------------------------- 1 | # FEMNIST Dataset 2 | 3 | ## Setup Instructions 4 | 5 | Run preprocess.sh with a choice of the following tags: 6 | 7 | - ```-nw```: number of workers, written as integer 8 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample 9 | in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. 10 | is included in the 'Notes' section 11 | - ```--sf``` := fraction of data to sample, written as a decimal; 12 | default is 0.1 13 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 14 | - ```--seed``` := seed to be used before random sampling of data 15 | 16 | e.g. 17 | - ```./preprocess.sh -s iid -nw 11 --sf 1.0``` (full-sized 18 | dataset partitioned on Gaia)
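For a quicker, non-i.i.d. experiment the same flags can be combined to subsample the dataset; the worker count, fractions and seed below are illustrative values only, not recommended settings:

```
# non-i.i.d. sampling of 10% of FEMNIST across 11 workers, 80/20 train/test split, fixed seed
./preprocess.sh -s niid -nw 11 --sf 0.1 --tf 0.8 --seed 1234
```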
-------------------------------------------------------------------------------- /data/sent140/README.md: -------------------------------------------------------------------------------- 1 | # Sentiment140 Dataset 2 | 3 | ## Setup Instructions 4 | 5 | Run preprocess.sh with a choice of the following tags: 6 | 7 | - ```-nw```: number of workers, written as integer 8 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample 9 | in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. 10 | is included in the 'Notes' section 11 | - ```--sf``` := fraction of data to sample, written as a decimal; 12 | default is 0.1 13 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 14 | - ```--seed``` := seed to be used before random sampling of data 15 | 16 | e.g. 17 | - ```./preprocess.sh -s iid -nw 11 --sf 1.0``` (full-sized 18 | dataset partitioned on Gaia)
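For a quick smoke test of the pipeline it can be convenient to sample only a small fraction of the tweets first; the values below are illustrative, not recommended settings:

```
# i.i.d. sampling of 1% of Sentiment140 across 11 workers with a fixed seed
./preprocess.sh -s iid -nw 11 --sf 0.01 --seed 1234
```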
19 | 20 | 21 | -------------------------------------------------------------------------------- /data/shakespeare/README.md: -------------------------------------------------------------------------------- 1 | # Shakespeare Dataset 2 | 3 | ## Setup Instructions 4 | 5 | Run preprocess.sh with a choice of the following tags: 6 | 7 | - ```-nw```: number of workers, written as integer 8 | - ```-s``` := 'iid' to sample in an i.i.d. manner, or 'niid' to sample 9 | in a non-i.i.d. manner; more information on i.i.d. versus non-i.i.d. 10 | is included in the 'Notes' section 11 | - ```--sf``` := fraction of data to sample, written as a decimal; 12 | default is 0.1 13 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 14 | - ```--seed``` := seed to be used before random sampling of data 15 | 16 | e.g. 17 | - ```./preprocess.sh -s iid -nw 11 --sf 1.0``` (full-sized 18 | dataset partitioned on Gaia)
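Only the ```-s``` flag differs between the i.i.d. and non-i.i.d. settings; the pair of commands below is an illustration with arbitrary worker count, fractions and seed:

```
# identical subsample, two sampling modes
./preprocess.sh -s iid -nw 11 --sf 0.2 --tf 0.9 --seed 1234
./preprocess.sh -s niid -nw 11 --sf 0.2 --tf 0.9 --seed 1234
```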
19 | 20 | 21 | -------------------------------------------------------------------------------- /graph_utils/generate_all_networks.sh: -------------------------------------------------------------------------------- 1 | echo "################" 2 | echo "gaia" 3 | python generate_networks.py gaia --experiment inaturalist --upload_capacity 1e10 --download_capacity 1e10 4 | echo "################" 5 | echo "amazon_us" 6 | python generate_networks.py amazon_us --experiment inaturalist --upload_capacity 1e10 --download_capacity 1e10 7 | echo "################" 8 | echo "geantdistance" 9 | python generate_networks.py geantdistance --experiment inaturalist --upload_capacity 1e10 --download_capacity 1e10 10 | echo "################" 11 | echo "ebone" 12 | python generate_networks.py ebone --experiment inaturalist --upload_capacity 1e10 --download_capacity 1e10 13 | echo "################" 14 | echo "exodus" 15 | python generate_networks.py exodus --experiment inaturalist --upload_capacity 1e10 --download_capacity 1e10 -------------------------------------------------------------------------------- /data/inaturalist/README.md: -------------------------------------------------------------------------------- 1 | # iNaturalist Dataset 2 | 3 | ## Setup Instructions 4 | 5 | * Download iNaturalist 6 | [here](https://storage.googleapis.com/inat_data_2018_eu/train_val2018.tar.gz), 7 | unzip it and place its content in the ``raw_data`` folder. 8 | 9 | * Run preprocess.sh with a choice of the following tags: 10 | 11 | - ```--network``` := name of the network to use, should be present in 12 | ``/graph_utils/data``; default is amazon_us 13 | - ```--sf``` := fraction of data to sample, written as a decimal; 14 | default is 0.1 15 | - ```--tf``` := fraction of data in training set, written as a decimal; default is 0.9 16 | - ```--seed``` := seed to be used before random sampling of data 17 | 18 | e.g. 19 | - ```./preprocess.sh --network gaia --sf 1.0 --tf 0.9 --seed 1234``` (full-sized 20 | dataset partitioned on Gaia)
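A faster, subsampled run can rely on the default network and use only part of the images; the fraction and seed below are illustrative values, not recommended settings:

```
# 10% subsample on the default connectivity graph, 90/10 train/test split
./preprocess.sh --sf 0.1 --tf 0.9 --seed 1234
```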
21 | -------------------------------------------------------------------------------- /graph_utils/data/Read_me_gml.txt: -------------------------------------------------------------------------------- 1 | In GML file the distance is the latency indicated in the original files (latencies.intra). 2 | The details of each instance is as follows (which is a little bit different from the statistics in the paper): 3 | num_of_nodes num_of_links 4 | 1221 108 153 5 | 1239 315 972 6 | 1755 87 161 7 | 3257 161 328 8 | 3967 79 147 9 | 6461 141 374 10 | 1755+3967 166 327 11 | 12 | 13 | The combined one "1755+3967": 19 random edges are added. The latency is calculated by distance which is normalized compared with the original maximum latency weight. 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | 5 | 6 | class Logger(object): 7 | def __init__(self, logdir): 8 | self.logdir = logdir 9 | 10 | def write_model(self, model_params, iteration=0, mode="json"): 11 | """ 12 | save model parameters as .pt file 13 | :param model_params: torch.tensor 14 | :param iteration: integer 15 | :param mode: 16 | """ 17 | if mode == "torch": 18 | file_path = os.path.join(self.logdir, 19 | "model_{}.pt".format(iteration)) 20 | torch.save(model_params, file_path) 21 | 22 | elif mode == "json": 23 | file_path = os.path.join(self.logdir, 24 | "model_{}.json".format(iteration)) 25 | 26 | with open(file_path, "w") as f: 27 | f.write(json.dumps(model_params.tolist())) -------------------------------------------------------------------------------- /data/femnist/group_by_writer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | 5 | def load_obj(name): 6 | with open(name + '.pkl', 'rb') as f: 7 | return pickle.load(f) 8 | 9 | 10 | def save_obj(obj, name): 11 | with open(name + '.pkl', 'wb') as f: 12 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 13 | 14 | 15 | parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 16 | 17 | wwcd = os.path.join('intermediate', 'write_with_class') 18 | write_class = load_obj(wwcd) 19 | 20 | writers = [] # each entry is a (writer, [list of (file, class)]) tuple 21 | cimages = [] 22 | (cw, _, _) = write_class[0] 23 | for (w, f, c) in write_class: 24 | if w != cw: 25 | writers.append((cw, cimages)) 26 | cw = w 27 | cimages = [(f, c)] 28 | cimages.append((f, c)) 29 | writers.append((cw, cimages)) 30 | 31 | ibwd = os.path.join('intermediate', 'images_by_writer') 32 | save_obj(writers, ibwd) -------------------------------------------------------------------------------- /data/sent140/combine_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | each row of created .csv file is of the form: 3 | polarity, id, date, query, user, comment, test_or_training 4 | """ 5 | 6 | import csv 7 | import os 8 | 9 | 10 | train_file_name = os.path.join('raw_data', 'training.csv') 11 | 12 | training = [] 13 | with open(train_file_name, 'rt', encoding='ISO-8859-1') as f: 14 | reader = csv.reader(f) 15 | training = list(reader) 16 | 17 | test_file_name = os.path.join('raw_data', 'test.csv') 18 | 19 | test = [] 20 | with open(test_file_name, 'rt', encoding='ISO-8859-1') as f: 21 | reader = csv.reader(f) 22 | test = list(reader) 23 | 24 | out_file_name = os.path.join('raw_data', 
'all_data.csv') 25 | 26 | with open(out_file_name, 'w') as f: 27 | writer = csv.writer(f) 28 | 29 | for row in training: 30 | row.append('training') 31 | writer.writerow(row) 32 | 33 | for row in test: 34 | row.append('test') 35 | writer.writerow(row) -------------------------------------------------------------------------------- /data/shakespeare/preprocess.sh: -------------------------------------------------------------------------------- 1 | if [ ! -d "all_data" ] || [ ! "$(ls -A all_data)" ]; then 2 | if [ ! -d "raw_data" ]; then 3 | mkdir raw_data 4 | fi 5 | 6 | if [ ! -f raw_data/raw_data.txt ]; then 7 | echo "------------------------------" 8 | echo "retrieving raw data" 9 | cd raw_data 10 | 11 | wget http://www.gutenberg.org/files/100/old/1994-01-100.zip 12 | unzip 1994-01-100.zip 13 | rm 1994-01-100.zip 14 | mv 100.txt raw_data.txt 15 | 16 | cd ../ 17 | fi 18 | if [ ! -d "raw_data/by_play_and_character" ]; then 19 | echo "dividing txt data between users" 20 | python3 preprocess_shakespeare.py raw_data/raw_data.txt raw_data/ 21 | fi 22 | fi 23 | if [ ! -f test/test.json ]; then 24 | echo "------------------------------" 25 | echo "spliting data" 26 | mkdir train 27 | mkdir test 28 | 29 | ./split_data.sh "$@" 30 | 31 | echo "finished splitting data" 32 | fi -------------------------------------------------------------------------------- /data/femnist/match_hashes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | 5 | def load_obj(name): 6 | with open(name + '.pkl', 'rb') as f: 7 | return pickle.load(f) 8 | 9 | 10 | def save_obj(obj, name): 11 | with open(name + '.pkl', 'wb') as f: 12 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 13 | 14 | 15 | cfhd = os.path.join('intermediate', 'class_file_hashes') 16 | wfhd = os.path.join('intermediate', 'write_file_hashes') 17 | class_file_hashes = load_obj(cfhd) # each elem is (class, file dir, hash) 18 | write_file_hashes = load_obj(wfhd) # each elem is (writer, file dir, hash) 19 | 20 | class_hash_dict = {} 21 | for i in range(len(class_file_hashes)): 22 | (c, f, h) = class_file_hashes[len(class_file_hashes)-i-1] 23 | class_hash_dict[h] = (c, f) 24 | 25 | write_classes = [] 26 | for tup in write_file_hashes: 27 | (w, f, h) = tup 28 | write_classes.append((w, f, class_hash_dict[h][0])) 29 | 30 | wwcd = os.path.join('intermediate', 'write_with_class') 31 | save_obj(write_classes, wwcd) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Don't track content of these folders 2 | .idea/ 3 | log/ 4 | 5 | *.ipynb 6 | *.npy 7 | *.pth 8 | *.csv 9 | *.json 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | -------------------------------------------------------------------------------- /data/sent140/preprocess.sh: -------------------------------------------------------------------------------- 1 | if [ ! -d "raw_data" ]; then 2 | mkdir raw_data 3 | fi 4 | 5 | if [ ! -f raw_data/test.csv ]; then 6 | echo "------------------------------" 7 | echo "retrieving raw data" 8 | 9 | cd raw_data 10 | 11 | if [ ! -f trainingandtestdata.zip ]; then 12 | wget --no-check-certificate http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip 13 | fi 14 | 15 | unzip trainingandtestdata.zip 16 | 17 | mv training.1600000.processed.noemoticon.csv training.csv 18 | mv testdata.manual.2009.06.14.csv test.csv 19 | 20 | rm trainingandtestdata.zip 21 | 22 | cd ../ 23 | echo "finished retrieving raw data" 24 | 25 | echo "------------------------------" 26 | echo "combining raw_data .csv files" 27 | 28 | python3 combine_data.py 29 | 30 | echo "finished combining raw_data .csv files" 31 | 32 | fi 33 | if [ ! -f test/test.json ]; then 34 | echo "------------------------------" 35 | echo "spliting data" 36 | mkdir train 37 | mkdir test 38 | 39 | ./split_data.sh "$@" 40 | 41 | echo "finished splitting data" 42 | fi -------------------------------------------------------------------------------- /loaders/synthetic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | from torch.utils.data import Dataset, DataLoader 4 | 5 | 6 | class SyntheticDataset(Dataset): 7 | def __init__(self, json_file, device): 8 | self.device = device 9 | 10 | with open(json_file, "r") as f: 11 | data = json.load(f) 12 | 13 | self.X = torch.tensor(data["x"]).to(device) 14 | self.y = torch.tensor(data["y"]).to(device) 15 | 16 | self.num_classes = data["num_classes"] 17 | if self.num_classes == 2: 18 | self.num_classes = 1 19 | self.dimension = self.X.shape[1] 20 | 21 | def __len__(self): 22 | return self.X.shape[0] 23 | 24 | def __getitem__(self, idx): 25 | return self.X[idx], torch.unsqueeze(self.y[idx], 0) 26 | 27 | 28 | def get_iterator_synthetic(file_path, device, batch_size=1): 29 | """ 30 | 31 | :param file_path: 32 | :param device: 33 | :param batch_size 34 | :return: 35 | """ 36 | dataset = SyntheticDataset(file_path, device) 37 | iterator = DataLoader(dataset, shuffle=True, batch_size=batch_size) 38 | 39 | return iterator -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | ## Leaf Datasets 4 | 1. FEMNIST 5 | 6 | * **Overview:** Image Dataset 7 | * **Details:** 62 different classes (10 digits, 26 lowercase, 26 uppercase), images are 28 by 28 pixels (with option to make them all 128 by 128 pixels), 3500 users 8 | * **Task:** Image Classification 9 | 10 | 2. Sentiment140 11 | 12 | * **Overview:** Text Dataset of Tweets 13 | * **Details** 660120 users 14 | * **Task:** Sentiment Analysis 15 | 16 | 3. 
Shakespeare 17 | 18 | * **Overview:** Text Dataset of Shakespeare Dialogues 19 | * **Details:** 1129 users 20 | * **Task:** Next-Character Prediction 21 | 22 | 23 | ## Cross-silo Datasets 24 | 1. iNaturalist Dataset 25 | 26 | * **Overview:** We preprocess the iNaturalist data released by 27 | [inaturalist.org](https://www.inaturalist.org/pages/developers). 28 | * **Details:** 859,000 samples with geo-location information. 29 | * **Task:** Image classification. 30 | 31 | ## References 32 | 33 | 34 | @misc{caldas2018leaf, title={LEAF: A Benchmark for Federated Settings}, 35 | author={Sebastian Caldas and Sai Meher Karthik Duddu and Peter Wu and Tian Li and Jakub Konečný and H. Brendan McMahan and Virginia Smith and Ameet Talwalkar}, 36 | year={2018}, 37 | eprint={1812.01097}, 38 | archivePrefix={arXiv}, 39 | primaryClass={cs.LG} 40 | } 41 | -------------------------------------------------------------------------------- /graph_utils/README.md: -------------------------------------------------------------------------------- 1 | # Graph Generator 2 | 3 | Generate different overlays given a connectivity graph. The connectivity 4 | graph should be stored in ``data`` as a ``.gml`` file. 5 | ## Setup Instructions 6 | 7 | Run ```generate_networks.py``` with a choice of the following arguments: 8 | 9 | - ```name```: name of the underlay network to use; 10 | - ```--experiment```: name of the experiment that will be run on the 11 | network; possible values are femnist, inaturalist, synthetic, shakespeare, 12 | sent140; if not specified, --model_size will be used as the model size; 13 | - ``--model_size``: size of the model that will be transmitted on the 14 | network in bits; will be ignored if --experiment is specified; default 15 | is 1e8; 16 | - ``--default_capacity``: default capacity (in bit/s) to use on links 17 | with unknown capacity; default is 1e9; 18 | - ```--centrality```: type of centrality to use in order to select the 19 | central node of the network; possible values are: "load", "distance" 20 | and "information"; default is "load"; 21 | 22 | 23 | e.g. 24 | - ```python3 generate_networks.py amazon_us --experiment inaturalist``` 25 | (generate different overlays with Amazon North America as the connectivity 26 | graph for the iNaturalist experiment)
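When ```--experiment``` is omitted, the transmitted model size is taken from ```--model_size``` instead; the command below is an illustrative sketch combining the arguments documented above (the centrality choice is arbitrary):

```
# build overlays for the Gaia graph assuming a 1e8-bit model, selecting the hub by information centrality
python3 generate_networks.py gaia --model_size 1e8 --centrality information
```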
27 | 28 | To generate all the topologies for all the networks run 29 | 30 | ``` 31 | .\generate_all_networks.sh 32 | ``` -------------------------------------------------------------------------------- /data/shakespeare/split_data.sh: -------------------------------------------------------------------------------- 1 | while [[ $# -gt 0 ]] 2 | do 3 | key="$1" 4 | 5 | case $key in 6 | -nw) 7 | NUM_WORKERS="$2" 8 | shift # past argument 9 | shift # past value 10 | ;; 11 | -s) 12 | SAMPLE="$2" 13 | shift # past argument 14 | shift # past value 15 | ;; 16 | --sf) 17 | SFRAC="$2" 18 | shift # past argument 19 | shift # past value 20 | ;; 21 | --tf) 22 | TFRAC="$2" 23 | shift # past argument 24 | shift # past value 25 | ;; 26 | --seed) 27 | SEED="$2" 28 | shift # past argument 29 | ;; 30 | --default) 31 | DEFAULT=YES 32 | shift # past argument 33 | ;; 34 | *) # unknown option 35 | POSITIONAL+=("$1") # save it in an array for later 36 | shift # past argument 37 | ;; 38 | esac 39 | done 40 | 41 | if [ ! -z $NUM_WORKERS ]; then 42 | NUM_WORKERS_TAG="--num_workers $NUM_WORKERS" 43 | fi 44 | 45 | SFRAC_TAG="" 46 | if [ ! -z $SFRAC ]; then 47 | SFRAC_TAG="--s_frac $SFRAC" 48 | fi 49 | 50 | TFRAC_TAG="" 51 | if [ ! -z $TFRAC ]; then 52 | TFRAC_TAG="--tr_frac $TFRAC" 53 | fi 54 | 55 | SEED_TAG="" 56 | if [ ! -z $SEED ]; then 57 | SEED_TAG="--seed $SEED" 58 | fi 59 | 60 | if [ $SAMPLE = "iid" ]; then 61 | python3 split_data.py --iid $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 62 | else 63 | python3 split_data.py $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 64 | fi -------------------------------------------------------------------------------- /data/femnist/split_data.sh: -------------------------------------------------------------------------------- 1 | while [[ $# -gt 0 ]] 2 | do 3 | key="$1" 4 | 5 | case $key in 6 | -nw) 7 | NUM_WORKERS="$2" 8 | shift # past argument 9 | shift # past value 10 | ;; 11 | -s) 12 | SAMPLE="$2" 13 | shift # past argument 14 | shift # past value 15 | ;; 16 | --sf) 17 | SFRAC="$2" 18 | shift # past argument 19 | shift # past value 20 | ;; 21 | --tf) 22 | TFRAC="$2" 23 | shift # past argument 24 | shift # past value 25 | ;; 26 | --seed) 27 | SEED="$2" 28 | shift # past argument 29 | ;; 30 | --default) 31 | DEFAULT=YES 32 | shift # past argument 33 | ;; 34 | *) # unknown option 35 | POSITIONAL+=("$1") # save it in an array for later 36 | shift # past argument 37 | ;; 38 | esac 39 | done 40 | 41 | NUM_WORKERS_TAG="" 42 | if [ ! -z $NUM_WORKERS ]; then 43 | NUM_WORKERS_TAG="--num_workers $NUM_WORKERS" 44 | fi 45 | 46 | SFRAC_TAG="" 47 | if [ ! -z $SFRAC ]; then 48 | SFRAC_TAG="--s_frac $SFRAC" 49 | fi 50 | 51 | TFRAC_TAG="" 52 | if [ ! -z $TFRAC ]; then 53 | TFRAC_TAG="--tr_frac $TFRAC" 54 | fi 55 | 56 | SEED_TAG="" 57 | if [ ! 
-z $SEED ]; then 58 | SEED_TAG="--seed $SEED" 59 | fi 60 | 61 | if [ $SAMPLE = "iid" ]; then 62 | python3 split_data.py --iid $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 63 | else 64 | python3 split_data.py $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 65 | fi -------------------------------------------------------------------------------- /data/sent140/split_data.sh: -------------------------------------------------------------------------------- 1 | while [[ $# -gt 0 ]] 2 | do 3 | key="$1" 4 | 5 | case $key in 6 | -nw) 7 | NUM_WORKERS="$2" 8 | shift # past argument 9 | shift # past value 10 | ;; 11 | -s) 12 | SAMPLE="$2" 13 | shift # past argument 14 | shift # past value 15 | ;; 16 | --sf) 17 | SFRAC="$2" 18 | shift # past argument 19 | shift # past value 20 | ;; 21 | --tf) 22 | TFRAC="$2" 23 | shift # past argument 24 | shift # past value 25 | ;; 26 | --seed) 27 | SEED="$2" 28 | shift # past argument 29 | ;; 30 | --default) 31 | DEFAULT=YES 32 | shift # past argument 33 | ;; 34 | *) # unknown option 35 | POSITIONAL+=("$1") # save it in an array for later 36 | shift # past argument 37 | ;; 38 | esac 39 | done 40 | 41 | NUM_WORKERS_TAG="" 42 | if [ ! -z $NUM_WORKERS ]; then 43 | NUM_WORKERS_TAG="--num_workers $NUM_WORKERS" 44 | fi 45 | 46 | SFRAC_TAG="" 47 | if [ ! -z $SFRAC ]; then 48 | SFRAC_TAG="--s_frac $SFRAC" 49 | fi 50 | 51 | TFRAC_TAG="" 52 | if [ ! -z $TFRAC ]; then 53 | TFRAC_TAG="--tr_frac $TFRAC" 54 | fi 55 | 56 | SEED_TAG="" 57 | if [ ! -z $SEED ]; then 58 | SEED_TAG="--seed $SEED" 59 | fi 60 | 61 | if [ $SAMPLE = "iid" ]; then 62 | python3 split_data.py --iid $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 63 | else 64 | python3 split_data.py $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 65 | fi -------------------------------------------------------------------------------- /utils/optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | import numpy as np 3 | 4 | 5 | def get_optimizer(optimizer_name, net, lr_initial=1e-3): 6 | """ 7 | 8 | :param optimizer_name: 9 | :param net: 10 | :param lr_initial: 11 | :return: 12 | """ 13 | if optimizer_name == "adam": 14 | return optim.Adam([param for param in net.parameters() if param.requires_grad], lr=lr_initial) 15 | 16 | elif optimizer_name == "sgd": 17 | return optim.SGD([param for param in net.parameters() if param.requires_grad], lr=lr_initial) 18 | 19 | else: 20 | raise NotImplementedError("Other optimizer are not implemented") 21 | 22 | 23 | def get_lr_scheduler(optimizer, scheduler_name, epoch_size): 24 | if scheduler_name == "sqrt": 25 | return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1/np.sqrt(x) if x > 0 else 1) 26 | 27 | elif scheduler_name == "linear": 28 | return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1 / x if x > 0 else 1) 29 | 30 | elif scheduler_name == "constant": 31 | return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1) 32 | 33 | elif scheduler_name == "cyclic": 34 | return optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-5, max_lr=0.1) 35 | 36 | elif scheduler_name == "custom": 37 | return optim.lr_scheduler.StepLR(optimizer, step_size=30*int(epoch_size), gamma=0.1) 38 | else: 39 | raise NotImplementedError("Other learning rate schedulers are not implemented") 40 | 41 | -------------------------------------------------------------------------------- /data/femnist/get_hashes.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 
| import os 3 | import pickle 4 | 5 | 6 | def load_obj(name): 7 | with open(name + '.pkl', 'rb') as f: 8 | return pickle.load(f) 9 | 10 | 11 | def save_obj(obj, name): 12 | with open(name + '.pkl', 'wb') as f: 13 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 14 | 15 | 16 | cfd = os.path.join('intermediate', 'class_file_dirs') 17 | wfd = os.path.join('intermediate', 'write_file_dirs') 18 | 19 | class_file_dirs = load_obj(cfd) 20 | write_file_dirs = load_obj(wfd) 21 | 22 | class_file_hashes = [] 23 | write_file_hashes = [] 24 | 25 | count = 0 26 | for tup in class_file_dirs: 27 | if count % 100000 == 0: 28 | print('hashed %d class images' % count) 29 | 30 | (cclass, cfile) = tup 31 | file_path = os.path.join(cfile) 32 | 33 | chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest() 34 | 35 | class_file_hashes.append((cclass, cfile, chash)) 36 | 37 | count += 1 38 | 39 | cfhd = os.path.join('intermediate', 'class_file_hashes') 40 | save_obj(class_file_hashes, cfhd) 41 | 42 | count = 0 43 | for tup in write_file_dirs: 44 | if (count % 100000 == 0): 45 | print('hashed %d write images' % count) 46 | 47 | (cclass, cfile) = tup 48 | file_path = os.path.join(cfile) 49 | 50 | chash = hashlib.md5(open(file_path, 'rb').read()).hexdigest() 51 | 52 | write_file_hashes.append((cclass, cfile, chash)) 53 | 54 | count += 1 55 | 56 | wfhd = os.path.join('intermediate', 'write_file_hashes') 57 | save_obj(write_file_hashes, wfhd) -------------------------------------------------------------------------------- /loaders/sent140.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchtext import data 3 | 4 | 5 | def get_iterator_sent140(path, all_data_path, device, max_vocab_size=25_000, batch_size=64): 6 | """ 7 | Build text iterator to be use with LSTM model, 8 | :param path: path to .json file used to build the iterator, see TorchText for .json file format. 9 | :param all_data_path: path to .json file containing all train data 10 | :param device: 11 | :param max_vocab_size: 12 | :param batch_size: 13 | :return: iterator over sent140 samples, each sample has two attributes "text" and "label" 14 | """ 15 | TEXT = data.Field(tokenize='spacy', include_lengths=True) 16 | LABEL = data.LabelField(dtype=torch.float) 17 | 18 | fields = {'text': ('text', TEXT), 'label': ('label', LABEL)} 19 | 20 | text_data = data.TabularDataset(path=path, format='json', fields=fields) 21 | 22 | text_data.sort_key = lambda x: len(x.text) 23 | 24 | # Fix the seed 25 | torch.manual_seed(0) 26 | torch.backends.cudnn.deterministic = True 27 | torch.backends.cudnn.benchmark = False 28 | 29 | all_text_data = data.TabularDataset(path=all_data_path, format='json', fields=fields) 30 | 31 | # vocab is built using all data, in order to have the same mapping from words to indexes across workers 32 | TEXT.build_vocab(all_text_data, 33 | max_size=max_vocab_size, 34 | vectors="glove.6B.100d", 35 | unk_init=torch.Tensor.normal_) 36 | LABEL.build_vocab(text_data) 37 | 38 | iterator = data.BucketIterator( 39 | text_data, 40 | batch_size=batch_size, 41 | sort_within_batch=True, 42 | device=device) 43 | 44 | return iterator 45 | 46 | -------------------------------------------------------------------------------- /data/femnist/preprocess.sh: -------------------------------------------------------------------------------- 1 | if [ ! 
-d "raw_data" ]; then 2 | echo "------------------------------" 3 | echo "downloading data" 4 | mkdir raw_data 5 | cd raw_data 6 | wget https://s3.amazonaws.com/nist-srd/SD19/by_class.zip 7 | wget https://s3.amazonaws.com/nist-srd/SD19/by_write.zip 8 | unzip by_class.zip 9 | rm by_class.zip 10 | unzip by_write.zip 11 | rm by_write.zip 12 | cd ../ 13 | echo "finished downloading data" 14 | fi 15 | if [ ! -d "intermediate" ]; then # stores .pkl files during preprocessing 16 | mkdir intermediate 17 | fi 18 | 19 | if [ ! -f ntermediate/class_file_dirs.pkl ]; then 20 | echo "------------------------------" 21 | echo "extracting file directories of images" 22 | python3 get_file_dirs.py 23 | echo "finished extracting file directories of images" 24 | fi 25 | 26 | if [ ! -f intermediate/class_file_hashes.pkl ]; then 27 | echo "------------------------------" 28 | echo "calculating image hashes" 29 | python3 get_hashes.py 30 | echo "finished calculating image hashes" 31 | fi 32 | 33 | if [ ! -f intermediate/write_with_class.pkl ]; then 34 | echo "------------------------------" 35 | echo "assigning class labels to write images" 36 | python3 match_hashes.py 37 | echo "finished assigning class labels to write images" 38 | fi 39 | 40 | if [ ! -f intermediate/images_by_writer.pkl ]; then 41 | echo "------------------------------" 42 | echo "grouping images by writer" 43 | python3 group_by_writer.py 44 | echo "finished grouping images by writer" 45 | fi 46 | if [ ! -f test/test.json ]; then 47 | echo "------------------------------" 48 | echo "spliting data" 49 | mkdir train 50 | mkdir test 51 | 52 | ./split_data.sh "$@" 53 | 54 | echo "finished splitting data" 55 | fi -------------------------------------------------------------------------------- /data/synthetic/generate_data.sh: -------------------------------------------------------------------------------- 1 | # Parse arguments 2 | while [[ $# -gt 0 ]] 3 | do 4 | key="$1" 5 | 6 | case $key in 7 | -nw) 8 | NUM_WORKERS="$2" 9 | shift # past argument 10 | shift # past value 11 | ;; 12 | -nc) 13 | NUM_CLASSES="$2" 14 | shift # past argument 15 | shift # past value 16 | ;; 17 | -dim) 18 | DIMENSION="$2" 19 | shift # past argument 20 | shift # past value 21 | ;; 22 | --tf) 23 | TFRAC="$2" 24 | shift # past argument 25 | shift # past value 26 | ;; 27 | --seed) 28 | SEED="$2" 29 | shift # past argument 30 | ;; 31 | --default) 32 | DEFAULT=YES 33 | shift # past argument 34 | ;; 35 | *) # unknown option 36 | POSITIONAL+=("$1") # save it in an array for later 37 | shift # past argument 38 | ;; 39 | esac 40 | done 41 | 42 | NUM_WORKERS_TAG="" 43 | if [ ! -z $NUM_WORKERS ]; then 44 | NUM_WORKERS_TAG="--num_workers $NUM_WORKERS" 45 | fi 46 | 47 | NUM_CLASSE_TAG="" 48 | if [ ! -z $NUM_CLASSES ]; then 49 | NUM_CLASSES_TAG="--num_classes $NUM_CLASSES" 50 | fi 51 | 52 | DIMENSION_TAG="" 53 | if [ ! -z $DIMENSION ]; then 54 | DIMENSION_TAG="--dimension $DIMENSION" 55 | fi 56 | 57 | TFRACTAG="" 58 | if [ ! -z $TFRAC ]; then 59 | TFRAC_TAG="--tr_frac $TFRAC" 60 | fi 61 | 62 | SEED_TAG="" 63 | if [ ! -z $SEED ]; then 64 | SEED_TAG="--seed $SEED" 65 | fi 66 | 67 | 68 | if [ ! -d "all_data" ]; then 69 | mkdir all_data 70 | fi 71 | 72 | 73 | if [ ! -f all_data/all_data.json ]; then 74 | echo "------------------------------" 75 | echo "generating data" 76 | 77 | python3 generate_data.py $NUM_WORKERS_TAG $NUM_CLASSES_TAG $DIMENSION_TAG $SEED_TAG 78 | 79 | echo "finished generating data" 80 | fi 81 | 82 | if [ ! 
-f test/test.json ]; then 83 | echo "------------------------------" 84 | echo "spliting data" 85 | mkdir train 86 | mkdir test 87 | 88 | python3 split_data.py $TFRACTAG $SEED_TAG 89 | 90 | echo "finished splitting data" 91 | fi -------------------------------------------------------------------------------- /loaders/shakespeare.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset, DataLoader 3 | import string 4 | 5 | 6 | class CharacterDataset(Dataset): 7 | def __init__(self, file_path, chunk_len, device): 8 | """ 9 | Dataset for next character prediction, each sample represents an input sequence of characters 10 | and a target sequence of characters representing to next sequence of the input 11 | :param file_path: path to .txt file containing the training corpus 12 | :param chunk_len: (int) the length of the input and target sequences 13 | :param device: 14 | """ 15 | self.all_characters = string.printable 16 | self.n_characters = len(self.all_characters) 17 | self.chunk_len = chunk_len 18 | self.device = device 19 | f = open(file_path, 'r') 20 | self.text = f.read() 21 | 22 | def __len__(self): 23 | return len(self.text) // (self.chunk_len + 1) 24 | 25 | def __getitem__(self, idx): 26 | input_ = torch.zeros(self.chunk_len).long() 27 | for c in range(self.chunk_len): 28 | input_[c] = self.all_characters.index(self.text[idx + c]) 29 | 30 | target = torch.zeros(self.chunk_len).long() 31 | for c in range(self.chunk_len): 32 | target[c] = self.all_characters.index(self.text[idx + c + 1]) 33 | 34 | return input_.to(self.device), target.to(self.device) 35 | 36 | 37 | def get_iterator_shakespeare(file_path, device, batch_size, chunk_len=200): 38 | """ 39 | get next character prediction DataLoader, yields `batch_size` batches of `CharacterDataset` samples 40 | :param file_path: path to .txt file containing the training corpus 41 | :param chunk_len: (int) the length of the input and target sequences 42 | :param device: 43 | :param batch_size 44 | :return: iterator over shakespeare dataset samples 45 | """ 46 | dataset = CharacterDataset(file_path, chunk_len, device) 47 | iterator = DataLoader(dataset, shuffle=True, batch_size=batch_size) 48 | 49 | return iterator 50 | -------------------------------------------------------------------------------- /reproduce_results.py: -------------------------------------------------------------------------------- 1 | from utils.utils import args_to_string, loggs_to_json 2 | from utils.args import parse_args 3 | 4 | import os 5 | import json 6 | 7 | 8 | trsh_dict = {"gaia": 0.65, 9 | "amazon_us": 0.55, 10 | "geantdistance": 0.55, 11 | "exodus": 0.5, 12 | "ebone": 0.5} 13 | 14 | lr_dict = {"gaia": "1e-3", 15 | "amazon_us": "1e-3", 16 | "geantdistance": "1e-3", 17 | "exodus": "1e-1", 18 | "ebone": "1e-1"} 19 | 20 | for network_name in ["gaia", "amazon_us", "geantdistance", "exodus", "ebone"]: 21 | print("{}:".format(network_name)) 22 | args = parse_args(["inaturalist", 23 | "--network", network_name, 24 | "--bz", "16", 25 | "--lr", lr_dict[network_name], 26 | "--decay", "sqrt", 27 | "--local_steps", "1"]) 28 | 29 | args_string = args_to_string(args) 30 | 31 | loggs_dir = os.path.join("loggs", args_to_string(args)) 32 | loggs_to_json(loggs_dir) 33 | 34 | loggs_dir_path = os.path.join("loggs", args_to_string(args)) 35 | path_to_json = os.path.join("results", "json", "{}.json".format(os.path.split(loggs_dir_path)[1])) 36 | with open(path_to_json, "r") as f: 37 | data = 
json.load(f) 38 | 39 | for architecture in ["centralized", "ring", "matcha"]: 40 | values = data['Train/Acc'][architecture] 41 | rounds = data["Round"][architecture] 42 | 43 | ii = -1 44 | for ii, value in enumerate(values): 45 | if value > trsh_dict[network_name]: 46 | break 47 | 48 | try: 49 | print("Number of steps to achieve {}% is {} on {} using {}".format(int(trsh_dict[network_name] * 100), 50 | rounds[ii], network_name, architecture)) 51 | except IndexError: 52 | print("Number of steps to achieve {}% is {} on {} using {}".format(int(trsh_dict[network_name] * 100), 53 | rounds[-1], network_name, architecture)) 54 | 55 | print("#" * 10) 56 | -------------------------------------------------------------------------------- /loaders/femnist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import torch 5 | from torch.utils.data import Dataset, DataLoader 6 | from torchvision.transforms import Compose, ToTensor, Normalize, Resize 7 | from PIL import Image 8 | 9 | 10 | class FEMNIST(Dataset): 11 | def __init__(self, pickle_file, root_path, device, transforms=None): 12 | """ 13 | FEMNIST Dataset generated from a .pkl containing a list of tuples 14 | each of them representing a path to an image and it class 15 | :param pickle_file: path to .pkl file 16 | :param root_path: path to the directory containing images 17 | :param device: 18 | :param transforms: list of transformation to apply to images 19 | """ 20 | self.root_path = root_path 21 | self.device = device 22 | with open(pickle_file, 'rb') as f: 23 | self.data = pickle.load(f) 24 | 25 | self.transforms = transforms 26 | 27 | def __getitem__(self, idx): 28 | img_path, label = self.data[idx] 29 | 30 | img = Image.open(os.path.join(self.root_path, img_path)) 31 | label = torch.tensor(label).to(self.device) 32 | 33 | if self.transforms: 34 | img = self.transforms(img).to(self.device) 35 | 36 | return img, label 37 | 38 | def __len__(self): 39 | return len(self.data) 40 | 41 | 42 | def get_iterator_femnist(file_path, device, batch_size=1): 43 | """ 44 | returns an iterator over FEMNIST dataset batches 45 | :param file_path: path to .pkl file containing a list of tuples 46 | each of them representing a path to an image and it class 47 | :param device: 48 | :param batch_size: 49 | :return: torch.utils.DataLoader object constructed from FEMNIST dataset object 50 | """ 51 | root_path = os.path.join("data", "femnist") 52 | 53 | transforms = Compose([Resize(28), 54 | ToTensor(), 55 | Normalize((0.1307,), (0.3081,)) 56 | ]) 57 | 58 | dataset = FEMNIST(file_path, device=device, root_path=root_path, transforms=transforms) 59 | iterator = DataLoader(dataset, shuffle=True, batch_size=batch_size) 60 | 61 | return iterator 62 | -------------------------------------------------------------------------------- /loaders/inaturalist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import torch 5 | from torch.utils.data import Dataset, DataLoader 6 | from torchvision.transforms import Compose, ToTensor, Normalize, CenterCrop 7 | from PIL import Image 8 | 9 | 10 | class INaturalist(Dataset): 11 | def __init__(self, pickle_file, root_path, device, transforms=None): 12 | """ 13 | iNaturalist Dataset generated from a .pkl containing a list of tuples 14 | each of them representing a path to an image and it class 15 | :param pickle_file: path to .pkl file 16 | :param root_path: path to the directory containing images 
17 | :param device: 18 | :param transforms: list of transformation to apply to images 19 | """ 20 | self.root_path = root_path 21 | self.device = device 22 | with open(pickle_file, 'rb') as f: 23 | self.data = pickle.load(f) 24 | 25 | self.transforms = transforms 26 | 27 | def __getitem__(self, idx): 28 | img_path, label = self.data[idx] 29 | 30 | img = Image.open(os.path.join(self.root_path, img_path)).convert("RGB") 31 | label = torch.tensor(label).to(self.device) 32 | 33 | if self.transforms: 34 | img = self.transforms(img).to(self.device) 35 | 36 | return img, label 37 | 38 | def __len__(self): 39 | return len(self.data) 40 | 41 | 42 | def get_iterator_inaturalist(file_path, device, batch_size=1): 43 | """ 44 | returns an iterator over iNaturalist dataset batches 45 | :param file_path: path to .pkl file containing a list of tuples 46 | each of them representing a path to an image and it class 47 | :param device: 48 | :param batch_size: 49 | :return: torch.utils.DataLoader object constructed from INaturalist dataset object 50 | """ 51 | root_path = os.path.join("data", "inaturalist") 52 | 53 | transforms = Compose([CenterCrop((224, 224)), 54 | ToTensor(), 55 | Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 56 | ]) 57 | 58 | dataset = INaturalist(file_path, device=device, root_path=root_path, transforms=transforms) 59 | iterator = DataLoader(dataset, shuffle=True, batch_size=batch_size) 60 | 61 | return iterator 62 | -------------------------------------------------------------------------------- /data/synthetic/split_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import random 5 | import time 6 | import numpy as np 7 | from sklearn.model_selection import train_test_split 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | 12 | parser.add_argument('--tr_frac', 13 | help='fraction in training set; default: 0.8;', 14 | type=float, 15 | default=0.8) 16 | parser.add_argument('--seed', 17 | help='args.seed for random partitioning of test/train data', 18 | type=int, 19 | default=None) 20 | 21 | parser.set_defaults(user=False) 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | if __name__ == "__main__": 27 | rng_seed = (args.seed if (args.seed is not None and args.seed >= 0) else int(time.time())) 28 | rng = random.Random(rng_seed) 29 | 30 | data_file = os.path.join('all_data', 'all_data.json') 31 | 32 | with open(data_file, 'r') as inf: 33 | data = json.load(inf) 34 | 35 | X_list = {"train": [], "test": []} 36 | y_list = {"train": [], "test": []} 37 | 38 | num_classes = data['num_classes'] 39 | 40 | for worker in data['users']: 41 | train_file = os.path.join("train", "{}.json".format(worker)) 42 | 43 | worker_data = data['user_data'][worker] 44 | X = np.array(worker_data['x']) 45 | y = np.array(worker_data['y']) 46 | 47 | X_train, X_test, y_train, y_test = train_test_split( 48 | X, y, train_size=args.tr_frac, random_state=args.seed) 49 | 50 | X_list["train"].append(X_train) 51 | y_list["train"].append(y_train) 52 | X_list["test"].append(X_test) 53 | y_list["test"].append(y_test) 54 | 55 | json_data_train = {"x": X_train.tolist(), "y": y_train.tolist(), "num_classes": num_classes} 56 | 57 | with open(train_file, 'w') as outfile: 58 | json.dump(json_data_train, outfile) 59 | 60 | for key in ["train", "test"]: 61 | X = np.vstack(X_list[key]) 62 | y = np.concatenate(y_list[key]) 63 | 64 | file = os.path.join(key, "{}.json".format(key)) 65 | json_data = {"x": X.tolist(), "y": y.tolist(), 
"num_classes": num_classes} 66 | with open(file, 'w') as outfile: 67 | json.dump(json_data, outfile) 68 | 69 | -------------------------------------------------------------------------------- /data/inaturalist/preprocess.sh: -------------------------------------------------------------------------------- 1 | while [[ $# -gt 0 ]] 2 | do 3 | key="$1" 4 | 5 | case $key in 6 | --network) 7 | NETWORK_NAME="$2" 8 | shift # past argument 9 | shift # past value 10 | ;; 11 | --sf) 12 | SFRAC="$2" 13 | shift # past argument 14 | shift # past value 15 | ;; 16 | --tf) 17 | TFRAC="$2" 18 | shift # past argument 19 | shift # past value 20 | ;; 21 | --seed) 22 | SEED="$2" 23 | shift # past argument 24 | ;; 25 | --default) 26 | DEFAULT=YES 27 | shift # past argument 28 | ;; 29 | *) # unknown option 30 | POSITIONAL+=("$1") # save it in an array for later 31 | shift # past argument 32 | ;; 33 | esac 34 | done 35 | 36 | NETWORK_NAME_TAG="" 37 | if [ ! -z $NETWORK_NAME ]; then 38 | NETWORK_NAME_TAG="--network $NETWORK_NAME" 39 | fi 40 | 41 | SFRAC_TAG="" 42 | if [ ! -z $SFRAC ]; then 43 | SFRAC_TAG="--s_frac $SFRAC" 44 | fi 45 | 46 | TFRAC_TAG="" 47 | if [ ! -z $TFRAC ]; then 48 | TFRAC_TAG="--tr_frac $TFRAC" 49 | fi 50 | 51 | SEED_TAG="" 52 | if [ ! -z $SEED ]; then 53 | SEED_TAG="--seed $SEED" 54 | fi 55 | 56 | if [ ! -f raw_data/train2018.json ]; then 57 | echo "------------------------------" 58 | echo "downloading annotations and locations" 59 | 60 | cd raw_data 61 | wget http://www.vision.caltech.edu/~gvanhorn/datasets/inaturalist/fgvc5_competition/val2018.json.tar.gz 62 | wget http://www.vision.caltech.edu/~gvanhorn/datasets/inaturalist/fgvc5_competition/inat2018_locations.zip 63 | wget http://www.vision.caltech.edu/~gvanhorn/datasets/inaturalist/fgvc5_competition/train2018.json.tar.gz 64 | unzip inat2018_locations.zip -d . 65 | tar -xf val2018.json.tar.gz -C . 66 | tar -xf train2018.json.tar.gz -C . 67 | 68 | rm inat2018_locations.zip 69 | rm val2018.json.tar.gz 70 | rm train2018.json.tar.gz 71 | mv inat2018_locations/* . 72 | rm -r inat2018_locations 73 | echo "finished downloading annotations and locations" 74 | cd ../ 75 | fi 76 | 77 | if [ ! 
-f test/test.json ]; then 78 | echo "------------------------------" 79 | echo "spliting data" 80 | mkdir train 81 | mkdir test 82 | 83 | python3 split_data.py $NETWORK_NAME_TAG $NUM_WORKERS_TAG $SFRAC_TAG $TFRAC_TAG $SEED_TAG 84 | 85 | echo "finished splitting data" 86 | fi -------------------------------------------------------------------------------- /graph_utils/show_networks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate .html file with world map and positions of workers and links used in the overlay 3 | """ 4 | import argparse 5 | import os 6 | import time 7 | import mplleaflet 8 | import matplotlib.pyplot as plt 9 | import networkx as nx 10 | from geopy.geocoders import Nominatim 11 | 12 | 13 | geolocator = Nominatim(user_agent="delay", timeout=20) 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | 'underlay', 18 | help='name of the underlay network; should be present in "/data"', 19 | type=str) 20 | parser.add_argument( 21 | 'architecture', 22 | help='name of the architecture; should be present in "results/$UNDERLAY"', 23 | type=str) 24 | 25 | parser.set_defaults(user=False) 26 | 27 | args = parser.parse_args() 28 | 29 | if __name__ == "__main__": 30 | underlay_path = os.path.join("data", "{}.gml".format(args.underlay)) 31 | overlay_path = os.path.join("results", args.underlay, "{}.gml".format(args.architecture)) 32 | 33 | underlay = nx.read_gml(underlay_path) 34 | 35 | pos_dict = {} 36 | for node in underlay.nodes(): 37 | try: 38 | pos_dict[node] = [underlay.nodes(data=True)[node]["Longitude"], 39 | underlay.nodes(data=True)[node]["Latitude"]] 40 | 41 | except KeyError: 42 | time.sleep(1.2) # To avoid Service time out Error 43 | 44 | geo = geolocator.geocode(node, timeout=20) 45 | pos_dict[node] = [geo.longitude, geo.latitude] 46 | 47 | overlay = nx.read_gml(overlay_path).to_undirected() 48 | 49 | mapping = {} 50 | for ii, node in enumerate(underlay.nodes()): 51 | mapping[str(ii)] = node 52 | 53 | overlay = nx.relabel_nodes(overlay, mapping).to_undirected() 54 | 55 | fig, ax = plt.subplots() 56 | 57 | nx.draw_networkx_nodes(overlay, pos=pos_dict, node_size=10, node_color='red', edge_color='k', alpha=.5, 58 | with_labels=True) 59 | nx.draw_networkx_edges(overlay, pos=pos_dict, edge_color='blue', alpha=1, width=5.0) 60 | nx.draw_networkx_labels(overlay, pos=pos_dict, label_pos=10.3) 61 | 62 | mplleaflet.display(fig=ax.figure) 63 | mplleaflet.save_html(fig=ax.figure, 64 | fileobj=os.path.join("results", args.underlay, "{}.html".format(args.architecture))) 65 | -------------------------------------------------------------------------------- /graph_utils/utils/mbst.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | from networkx.algorithms.tournament import hamiltonian_path 4 | 5 | 6 | def cube_algorithm(G_complete): 7 | """ 8 | Use cube algorithm to build an approximation for the 2-MBST problem on G: 9 | 1. Add edges to G to build complete graph G_complete 10 | 2. Build an MST T of G_complete 11 | 3. Build the the cube of T 12 | 4. 
find a Hamiltonian path in the cube of T 13 | :param G : (nx.Graph()) 14 | """ 15 | T = nx.minimum_spanning_tree(G_complete, weight="weight") 16 | 17 | T_cube = nx.Graph() 18 | T_cube.add_nodes_from(T.nodes(data=True)) 19 | 20 | shortest_paths = nx.shortest_path_length(T) 21 | for source, lengths_dict in shortest_paths: 22 | for target in lengths_dict: 23 | if lengths_dict[target] <= 3: 24 | T_cube.add_edge(source, target, 25 | weight=G_complete.get_edge_data(source, target)["weight"]) 26 | 27 | ham_path = hamiltonian_path(T_cube.to_directed()) 28 | 29 | result = nx.Graph() 30 | result.add_nodes_from(G_complete.nodes(data=True)) 31 | 32 | for idx in range(len(ham_path) - 1): 33 | result.add_edge(ham_path[idx], ham_path[idx + 1], 34 | weight=G_complete.get_edge_data(ham_path[idx], ham_path[idx + 1])['weight']) 35 | 36 | return result 37 | 38 | 39 | def delta_prim(G_complete, delta): 40 | """ 41 | implementation of delta prim algorithm from https://ieeexplore.ieee.org/document/850653 42 | :param G: (nx.Graph()) 43 | :param delta: (int) 44 | :return: a tree T with degree at most delta 45 | """ 46 | N = G_complete.number_of_nodes() 47 | T = nx.Graph() 48 | 49 | T.add_node(list(G_complete.nodes)[0]) 50 | 51 | while len(T.edges) < N - 1: 52 | smallest_weight = np.inf 53 | edge_to_add = None 54 | for u in T.nodes: 55 | for v in G_complete.nodes: 56 | if (v not in T.nodes) and (T.degree[u] < delta): 57 | weight = G_complete.get_edge_data(u, v)["weight"] 58 | if weight < smallest_weight: 59 | smallest_weight = weight 60 | edge_to_add = (u, v) 61 | 62 | T.add_edge(*edge_to_add, weight=smallest_weight) 63 | 64 | T.add_nodes_from(G_complete.nodes(data=True)) 65 | 66 | return T 67 | -------------------------------------------------------------------------------- /make_table3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from utils.args import parse_args 5 | from utils.utils import args_to_string, loggs_to_json 6 | 7 | trsh_dict = {"gaia": 0.65, 8 | "amazon_us": 0.55, 9 | "geantdistance": 0.55, 10 | "exodus": 0.5, 11 | "ebone": 0.5} 12 | 13 | lr_dict = {"gaia": "1e-3", 14 | "amazon_us": "1e-3", 15 | "geantdistance": "1e-3", 16 | "exodus": "1e-1", 17 | "ebone": "1e-1"} 18 | 19 | if __name__ == "__main__": 20 | for network_name in ["gaia", "amazon_us", "geantdistance", "exodus", "ebone"]: 21 | print("{}:".format(network_name)) 22 | args = parse_args(["inaturalist", 23 | "--network", network_name, 24 | "--bz", "16", 25 | "--lr", lr_dict[network_name], 26 | "--decay", "sqrt", 27 | "--local_steps", "1"]) 28 | 29 | args_string = args_to_string(args) 30 | 31 | loggs_dir = os.path.join("loggs", args_to_string(args)) 32 | loggs_to_json(loggs_dir) 33 | 34 | loggs_dir_path = os.path.join("loggs", args_to_string(args)) 35 | path_to_json = os.path.join("results", "json", "{}.json".format(os.path.split(loggs_dir_path)[1])) 36 | with open(path_to_json, "r") as f: 37 | data = json.load(f) 38 | 39 | for architecture in ["centralized", "ring", "matcha"]: 40 | values = data['Train/Acc'][architecture] 41 | rounds = data["Round"][architecture] 42 | 43 | for ii, value in enumerate(values): 44 | if value > trsh_dict[network_name]: 45 | break 46 | 47 | try: 48 | print("Number of steps to achieve {}% is {} on {} using {}".format(int(trsh_dict[network_name] * 100), 49 | rounds[ii], network_name, 50 | architecture)) 51 | except IndexError: 52 | print("Number of steps to achieve {}% is {} on {} using {}".format(int(trsh_dict[network_name] * 100), 53 | 
rounds[-1], network_name, 54 | architecture)) 55 | 56 | print("#" * 10) -------------------------------------------------------------------------------- /communication_module/worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | from utils.utils import get_network, get_iterator, get_model 7 | 8 | 9 | EXTENSIONS = {"synthetic": ".json", "sent140": ".json", "femnist": ".pkl", "shakespeare": ".txt"} 10 | 11 | 12 | class Worker(object): 13 | def __init__(self, args, rank): 14 | self.rank = rank 15 | self.local_steps = args.local_steps 16 | self.device = args.device 17 | self.num_gpu = torch.cuda.device_count() 18 | self.batch_size = args.bz 19 | self.network = get_network(args.network_name, args.architecture) 20 | self.world_size = self.network.number_of_nodes() + 1 # we add node representing the network manager 21 | self.fit_by_epoch = args.fit_by_epoch 22 | self.initial_lr = args.lr 23 | self.optimizer_name = args.optimizer 24 | self.lr_scheduler_name = args.decay 25 | 26 | if self.device == "cuda": 27 | if torch.cuda.is_available(): 28 | print(f"{rank} get gpu {self.rank % self.num_gpu}") 29 | self.device = "cuda:"+str(self.rank % self.num_gpu) 30 | else: 31 | print("No GPU is available on the system") 32 | raise TypeError 33 | elif self.device != "cpu": 34 | print("Please choose device be either cuda or cpu") 35 | raise TypeError 36 | 37 | self.data_dir = os.path.join("data", args.experiment, "train") 38 | self.data_path = os.path.join(self.data_dir, str(rank) + EXTENSIONS[args.experiment]) 39 | 40 | self.iterator = get_iterator(args.experiment, self.data_path, self.device, self.batch_size) 41 | 42 | self.model = get_model(args.experiment, self.device, self.iterator, 43 | optimizer_name=self.optimizer_name, lr_scheduler=self.lr_scheduler_name, 44 | initial_lr=self.initial_lr) 45 | 46 | def communicate(self): 47 | 48 | if self.fit_by_epoch: 49 | self.model.fit_iterator(train_iterator=self.iterator, n_epochs=self.local_steps) 50 | else: 51 | self.model.fit_batches(iterator=self.iterator, n_steps=self.local_steps) 52 | 53 | for ii, param in enumerate(self.model.net.parameters()): 54 | dist.gather(tensor=param.data, dst=self.world_size - 1) 55 | 56 | for ii, param in enumerate(self.model.net.parameters()): 57 | dist.scatter(tensor=param.data, src=self.world_size - 1) 58 | -------------------------------------------------------------------------------- /data/femnist/get_file_dirs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates .pkl files for: 3 | 1. list of directories of every image in 'by_class' 4 | 2. list of directories of every image in 'by_write' 5 | the hierarchal structure of the data is as follows: 6 | - by_class -> classes -> folders containing images -> images 7 | - by_write -> folders containing writers -> writer -> types of images -> images 8 | the directories written into the files are of the form 'raw_data/...' 
9 | """ 10 | import os 11 | import pickle 12 | 13 | 14 | def save_obj(obj, name): 15 | with open(name + '.pkl', 'wb') as f: 16 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 17 | 18 | 19 | class_files = [] # (class, file directory) 20 | write_files = [] # (writer, file directory) 21 | 22 | class_dir = os.path.join('raw_data', 'by_class') 23 | classes = os.listdir(class_dir) 24 | classes = [c for c in classes if len(c) == 2] 25 | 26 | for cl in classes: 27 | cldir = os.path.join(class_dir, cl) 28 | rel_cldir = os.path.join(class_dir, cl) 29 | subcls = os.listdir(cldir) 30 | 31 | subcls = [s for s in subcls if (('hsf' in s) and ('mit' not in s))] 32 | 33 | for subcl in subcls: 34 | subcldir = os.path.join(cldir, subcl) 35 | rel_subcldir = os.path.join(rel_cldir, subcl) 36 | images = os.listdir(subcldir) 37 | image_dirs = [os.path.join(rel_subcldir, i) for i in images] 38 | 39 | for image_dir in image_dirs: 40 | class_files.append((cl, image_dir)) 41 | 42 | 43 | write_dir = os.path.join('raw_data', 'by_write') 44 | write_parts = os.listdir(write_dir) 45 | 46 | for write_part in write_parts: 47 | writers_dir = os.path.join(write_dir, write_part) 48 | rel_writers_dir = os.path.join(write_dir, write_part) 49 | writers = os.listdir(writers_dir) 50 | 51 | for writer in writers: 52 | writer_dir = os.path.join(writers_dir, writer) 53 | rel_writer_dir = os.path.join(rel_writers_dir, writer) 54 | wtypes = os.listdir(writer_dir) 55 | 56 | for wtype in wtypes: 57 | type_dir = os.path.join(writer_dir, wtype) 58 | rel_type_dir = os.path.join(rel_writer_dir, wtype) 59 | images = os.listdir(type_dir) 60 | image_dirs = [os.path.join(rel_type_dir, i) for i in images] 61 | 62 | for image_dir in image_dirs: 63 | write_files.append((writer, image_dir)) 64 | 65 | save_obj( 66 | class_files, 67 | os.path.join('intermediate', 'class_file_dirs')) 68 | save_obj( 69 | write_files, 70 | os.path.join('intermediate', 'write_file_dirs')) -------------------------------------------------------------------------------- /models/inaturalist/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from utils.optim import get_optimizer, get_lr_scheduler 4 | from torchvision.models import resnet18 5 | from ..model import Model 6 | 7 | NUMBER_CLASSES = 80 8 | 9 | 10 | class INaturalistCNN(Model): 11 | def __init__(self, criterion, metric, device, 12 | optimizer_name="adam", lr_scheduler="sqrt", initial_lr=1e-3, epoch_size=1, coeff=1): 13 | super(Model, self).__init__() 14 | 15 | self.net = resnet18(pretrained=True) 16 | self.net.fc = nn.Linear(self.net.fc.in_features, NUMBER_CLASSES) 17 | self.net = self.net.to(device) 18 | self.criterion = criterion 19 | self.metric = metric 20 | self.device = device 21 | self.coeff = coeff 22 | 23 | self.optimizer = get_optimizer(optimizer_name, self.net, initial_lr) 24 | self.lr_scheduler = get_lr_scheduler(self.optimizer, lr_scheduler, epoch_size) 25 | 26 | def fit_iterator_one_epoch(self, iterator): 27 | epoch_loss = 0 28 | epoch_acc = 0 29 | 30 | self.net.train() 31 | 32 | for x, y in iterator: 33 | self.optimizer.zero_grad() 34 | 35 | predictions = self.net(x) 36 | 37 | loss = self.coeff * self.criterion(predictions, y) 38 | 39 | acc = self.metric(predictions, y) 40 | 41 | loss.backward() 42 | 43 | self.optimizer.step() 44 | self.lr_scheduler.step() 45 | 46 | epoch_loss += loss.item() 47 | epoch_acc += acc.item() 48 | 49 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 50 | 51 | def 
fit_batch(self, iterator, update=True): 52 | self.net.train() 53 | 54 | x, y = next(iter(iterator)) 55 | 56 | self.optimizer.zero_grad() 57 | 58 | predictions = self.net(x) 59 | 60 | loss = self.criterion(predictions, y) 61 | 62 | acc = self.metric(predictions, y) 63 | 64 | loss.backward() 65 | 66 | if update: 67 | self.optimizer.step() 68 | self.lr_scheduler.step() 69 | 70 | batch_loss = loss.item() 71 | batch_acc = acc.item() 72 | 73 | return batch_loss, batch_acc 74 | 75 | def evaluate_iterator(self, iterator): 76 | epoch_loss = 0 77 | epoch_acc = 0 78 | 79 | self.net.eval() 80 | 81 | with torch.no_grad(): 82 | for x, y in iterator: 83 | predictions = self.net(x) 84 | 85 | loss = self.criterion(predictions, y) 86 | 87 | acc = self.metric(predictions, y) 88 | 89 | epoch_loss += loss.item() 90 | epoch_acc += acc.item() 91 | 92 | return epoch_loss / len(iterator), epoch_acc / len(iterator) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.multiprocessing import Process 3 | import torch.distributed as dist 4 | import torch 5 | 6 | from utils.args import parse_args 7 | from utils.utils import loggs_to_json, args_to_string 8 | from communication_module.worker import Worker 9 | from communication_module.manager import Peer2PeerManager, CentralizedManager 10 | from communication import CentralizedNetwork, Peer2PeerNetwork, MATCHANetwork, RingNetwork 11 | 12 | 13 | def run(rank, size, arguments): 14 | torch.manual_seed(0) 15 | torch.backends.cudnn.deterministic = True 16 | torch.backends.cudnn.benchmark = False 17 | 18 | if rank == size - 1: 19 | if arguments.architecture == "centralized": 20 | node = CentralizedManager(arguments) 21 | else: 22 | node = Peer2PeerManager(arguments) 23 | else: 24 | node = Worker(arguments, rank) 25 | 26 | for _ in range(arguments.n_rounds): 27 | node.communicate() 28 | 29 | 30 | def init_process(rank, size, arguments, fn, backend='gloo'): 31 | """ Initialize the distributed environment. 
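    Illustrative launch, with example values only (the dataset, network name and hyper-parameters
    below are placeholders, not a prescribed configuration):

        python main.py shakespeare --network_name gaia --architecture ring --parallel --n_rounds 100 --bz 128

    With --parallel, the __main__ block below spawns num_workers + 1 processes; each process calls
    this function, joins the same 'gloo' process group through MASTER_ADDR/MASTER_PORT, and then
    executes `fn` (here `run`), where the last rank acts as the network manager and the remaining
    ranks act as Workers.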
""" 32 | os.environ['MASTER_ADDR'] = '127.0.0.1' 33 | os.environ['MASTER_PORT'] = '29500' 34 | dist.init_process_group(backend, rank=rank, world_size=size) 35 | fn(rank, size, arguments) 36 | 37 | 38 | if __name__ == "__main__": 39 | torch.manual_seed(0) 40 | torch.backends.cudnn.deterministic = True 41 | torch.backends.cudnn.benchmark = False 42 | 43 | args = parse_args() 44 | 45 | if args.parallel: 46 | print("Run experiment in parallel settings using torch.dist..") 47 | 48 | processes = [] 49 | world_size = args.num_workers + 1 # We add an extra node that plays the role of network manager 50 | for rank_ in range(world_size): 51 | p = Process(target=init_process, args=(rank_, world_size, args, run)) 52 | p.start() 53 | processes.append(p) 54 | 55 | for p in processes: 56 | p.join() 57 | 58 | else: 59 | print("Run experiment in sequential setting..") 60 | 61 | if args.architecture == "centralized": 62 | network = CentralizedNetwork(args) 63 | elif args.architecture == "matcha" or args.architecture == "matcha+" or\ 64 | args.architecture == "matcha+mst" or args.architecture == "matcha+ring" or\ 65 | args.architecture == "matcha+delta_mbst": 66 | network = MATCHANetwork(args) 67 | elif args.architecture == "dynamic_ring": 68 | network = RingNetwork(args) 69 | else: 70 | network = Peer2PeerNetwork(args) 71 | 72 | for k in range(args.n_rounds): 73 | network.mix() 74 | 75 | network.write_logs() 76 | 77 | loggs_dir = os.path.join("loggs", args_to_string(args)) 78 | loggs_to_json(loggs_dir) 79 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import torch 3 | import time 4 | 5 | 6 | def epoch_time(start_time, end_time): 7 | elapsed_time = end_time - start_time 8 | elapsed_mins = int(elapsed_time / 60) 9 | elapsed_secs = int(elapsed_time - (elapsed_mins * 60)) 10 | return elapsed_mins, elapsed_secs 11 | 12 | 13 | class Model(ABC): 14 | @abstractmethod 15 | def __init__(self): 16 | pass 17 | 18 | @abstractmethod 19 | def fit_iterator_one_epoch(self, iterator): 20 | pass 21 | 22 | @abstractmethod 23 | def fit_batch(self, iterator): 24 | pass 25 | 26 | @abstractmethod 27 | def evaluate_iterator(self, iterator): 28 | pass 29 | 30 | def update_from_model(self, model): 31 | """ 32 | update parameters using gradients from another model 33 | :param model: Model() object, gradients should be precomputed; 34 | """ 35 | for param_idx, param in enumerate(self.net.parameters()): 36 | param.grad = list(model.net.parameters())[param_idx].grad.data.clone() 37 | 38 | self.optimizer.step() 39 | self.lr_scheduler.step() 40 | 41 | def fit_batches(self, iterator, n_steps): 42 | global_loss = 0 43 | global_acc = 0 44 | 45 | for step in range(n_steps): 46 | batch_loss, batch_acc = self.fit_batch(iterator) 47 | global_loss += batch_loss 48 | global_acc += batch_acc 49 | 50 | return global_loss / n_steps, global_acc / n_steps 51 | 52 | def fit_iterator(self, train_iterator, val_iterator=None, n_epochs=1, path=None, verbose=0): 53 | best_valid_loss = float('inf') 54 | 55 | for epoch in range(n_epochs): 56 | 57 | start_time = time.time() 58 | 59 | train_loss, train_acc = self.fit_iterator_one_epoch(train_iterator) 60 | if val_iterator: 61 | valid_loss, valid_acc = self.evaluate_iterator(val_iterator) 62 | 63 | end_time = time.time() 64 | 65 | epoch_mins, epoch_secs = epoch_time(start_time, end_time) 66 | 67 | if val_iterator: 68 | if valid_loss < 
best_valid_loss: 69 | best_valid_loss = valid_loss 70 | if path: 71 | torch.save(self.net, path) 72 | 73 | if verbose: 74 | print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s') 75 | print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%') 76 | if val_iterator: 77 | print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%') 78 | 79 | def get_param_tensor(self): 80 | param_list = [] 81 | 82 | for param in self.net.parameters(): 83 | param_list.append(param.data.view(-1, )) 84 | 85 | return torch.cat(param_list) 86 | -------------------------------------------------------------------------------- /graph_utils/utils/evaluate_throughput.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | 4 | np.seterr(all="ignore") 5 | 6 | 7 | def cycle_time_decision(G, lambda_0): 8 | """ 9 | Answers the cycle time decision problem question: Is the throughput of G at most lambda ? 10 | :param G: (nx.DiGraph) Strong Weighted Digraph 11 | :param lambda_0: (numerical) 12 | """ 13 | A = nx.adjacency_matrix(G).toarray() 14 | new_A = lambda_0 - A 15 | 16 | new_G = nx.from_numpy_matrix(new_A, create_using=nx.DiGraph()) 17 | 18 | answer = True 19 | try: 20 | nx.bellman_ford_predecessor_and_distance(new_G, 0) 21 | except nx.NetworkXUnbounded: 22 | answer = False 23 | return answer 24 | 25 | 26 | def evaluate_cycle_time(G, s=0): 27 | """ 28 | Evaluate the cycle time of a strong weighted digraph. For now the implementation only supports integer delays 29 | :param G: (nx.DiGraph) strong weighted digraph 30 | :param s: starting point 31 | :return: lambda_G 32 | The cycle time of G 33 | """ 34 | n = len(G) 35 | nodes_to_indices = {node: idx for idx, node in enumerate(G.nodes)} 36 | 37 | # Head 38 | D = np.zeros((n + 1, n)) - np.inf 39 | pi = np.zeros((n + 1, n), dtype=np.int64) - 1 40 | D[0, s] = 0 41 | 42 | # Body 43 | for k in range(1, n + 1): 44 | for v in G.nodes: 45 | for u in G.predecessors(v): 46 | if D[k, nodes_to_indices[v]] < D[k - 1, nodes_to_indices[u]] + G.get_edge_data(u, v)['weight']: 47 | D[k, nodes_to_indices[v]] = D[k - 1, nodes_to_indices[u]] \ 48 | + G.get_edge_data(u, v)['weight'] 49 | 50 | pi[k, nodes_to_indices[v]] = nodes_to_indices[u] 51 | 52 | # Tail 53 | lambda_ = -np.inf 54 | M = np.zeros((n,)) + np.inf 55 | K = np.zeros((n,), dtype=np.int64) - 1 56 | for v in G.nodes: 57 | for k in range(0, n): 58 | if M[nodes_to_indices[v]] > (D[n, nodes_to_indices[v]] - D[k, nodes_to_indices[v]]) / (n - k): 59 | M[nodes_to_indices[v]] = (D[n, nodes_to_indices[v]] - D[k, nodes_to_indices[v]]) / (n - k) 60 | K[nodes_to_indices[v]] = k 61 | 62 | if lambda_ < M[nodes_to_indices[v]]: 63 | lambda_ = M[nodes_to_indices[v]] 64 | v_star = nodes_to_indices[v] 65 | 66 | # Get critical cycle 67 | path = [] 68 | actual = v_star 69 | for i in range(n, -1, -1): 70 | path.append(actual) 71 | actual = pi[i, actual] 72 | 73 | path.reverse() 74 | 75 | return lambda_, path, n - K[v_star] 76 | 77 | 78 | def evaluate_throughput(G): 79 | """ 80 | Evaluate the throughput of a strong weighted digraph. 
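    Illustrative example (hand-picked integer delays): for the two-node digraph below, the only
    cycle has mean weight (3 + 2) / 2 = 2.5, so the cycle time is 2.5 and the throughput is 1 / 2.5 = 0.4.

        >>> G = nx.DiGraph()
        >>> G.add_edge(0, 1, weight=3)
        >>> G.add_edge(1, 0, weight=2)
        >>> evaluate_throughput(G)  # maximum cycle mean is (3 + 2) / 2 = 2.5, hence 1 / 2.5 = 0.4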
For now the implementation only supports integer delays 81 | :param G: (nx.DiGraph) strong weighted digraph 82 | :return: The throughput of G 83 | """ 84 | lambda_, _, _ = evaluate_cycle_time(G) 85 | return 1 / lambda_ 86 | -------------------------------------------------------------------------------- /models/synthetic/linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from utils.optim import get_lr_scheduler, get_optimizer 4 | from ..model import Model 5 | 6 | 7 | class LinearLayer(nn.Module): 8 | def __init__(self, input_dimension, num_classes): 9 | super(LinearLayer, self).__init__() 10 | self.input_dimension = input_dimension 11 | self.num_classes = num_classes 12 | self.fc = nn.Linear(input_dimension, num_classes) 13 | 14 | def forward(self, x): 15 | return self.fc(x) 16 | 17 | 18 | class LinearModel(Model): 19 | def __init__(self, criterion, metric, device, input_dimension, num_classes, 20 | optimizer_name="adam", lr_scheduler="cyclic", initial_lr=1e-3, epoch_size=1): 21 | super(Model, self).__init__() 22 | 23 | self.criterion = criterion 24 | self.metric = metric 25 | self.device = device 26 | 27 | self.net = LinearLayer(input_dimension, num_classes).to(self.device) 28 | 29 | self.optimizer = get_optimizer(optimizer_name, self.net, initial_lr) 30 | self.lr_scheduler = get_lr_scheduler(self.optimizer, lr_scheduler, epoch_size) 31 | 32 | def fit_iterator_one_epoch(self, iterator): 33 | epoch_loss = 0 34 | epoch_acc = 0 35 | 36 | self.net.train() 37 | 38 | for x, y in iterator: 39 | self.optimizer.zero_grad() 40 | 41 | predictions = self.net(x) 42 | 43 | loss = self.criterion(predictions, y.float()) 44 | 45 | acc = self.metric(predictions, y) 46 | 47 | loss.backward() 48 | 49 | self.optimizer.step() 50 | self.lr_scheduler.step() 51 | 52 | epoch_loss += loss.item() 53 | epoch_acc += acc.item() 54 | 55 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 56 | 57 | def fit_batch(self, iterator, update=True): 58 | self.net.train() 59 | 60 | x, y = next(iter(iterator)) 61 | 62 | self.optimizer.zero_grad() 63 | 64 | predictions = self.net(x) 65 | 66 | loss = self.criterion(predictions, y.float()) 67 | 68 | acc = self.metric(predictions, y) 69 | 70 | loss.backward() 71 | 72 | if update: 73 | self.optimizer.step() 74 | self.lr_scheduler.step() 75 | 76 | batch_loss = loss.item() 77 | batch_acc = acc.item() 78 | 79 | return batch_loss, batch_acc 80 | 81 | def evaluate_iterator(self, iterator): 82 | epoch_loss = 0 83 | epoch_acc = 0 84 | 85 | self.net.eval() 86 | 87 | with torch.no_grad(): 88 | for x, y in iterator: 89 | predictions = self.net(x) 90 | 91 | loss = self.criterion(predictions, y.float()) 92 | 93 | acc = self.metric(predictions, y) 94 | 95 | epoch_loss += loss.item() 96 | epoch_acc += acc.item() 97 | 98 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 99 | 100 | -------------------------------------------------------------------------------- /models/femnist/cnn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from utils.optim import get_optimizer, get_lr_scheduler 4 | import torch 5 | from ..model import Model 6 | 7 | 8 | class CNN(nn.Module): 9 | def __init__(self): 10 | super(CNN, self).__init__() 11 | self.conv1 = nn.Conv2d(3, 32, 3, 1) 12 | self.conv2 = nn.Conv2d(32, 64, 3, 1) 13 | self.dropout1 = nn.Dropout2d(0.25) 14 | self.dropout2 = nn.Dropout2d(0.5) 15 | self.fc1 
= nn.Linear(9216, 128) 16 | self.fc2 = nn.Linear(128, 62) 17 | 18 | def forward(self, x): 19 | x = self.conv1(x) 20 | x = F.relu(x) 21 | x = self.conv2(x) 22 | x = F.relu(x) 23 | x = F.max_pool2d(x, 2) 24 | x = self.dropout1(x) 25 | x = torch.flatten(x, 1) 26 | x = self.fc1(x) 27 | x = F.relu(x) 28 | x = self.dropout2(x) 29 | x = self.fc2(x) 30 | return x 31 | 32 | 33 | class FemnistCNN(Model): 34 | def __init__(self, criterion, metric, device, 35 | optimizer_name="adam", lr_scheduler="sqrt", initial_lr=1e-3, epoch_size=1): 36 | super(Model, self).__init__() 37 | 38 | self.net = CNN().to(device) 39 | self.criterion = criterion 40 | self.metric = metric 41 | self.device = device 42 | 43 | self.optimizer = get_optimizer(optimizer_name, self.net, initial_lr) 44 | self.lr_scheduler = get_lr_scheduler(self.optimizer, lr_scheduler, epoch_size) 45 | 46 | def fit_iterator_one_epoch(self, iterator): 47 | epoch_loss = 0 48 | epoch_acc = 0 49 | 50 | self.net.train() 51 | 52 | for x, y in iterator: 53 | self.optimizer.zero_grad() 54 | 55 | predictions = self.net(x) 56 | 57 | loss = self.criterion(predictions, y) 58 | 59 | acc = self.metric(predictions, y) 60 | 61 | loss.backward() 62 | 63 | self.optimizer.step() 64 | self.lr_scheduler.step() 65 | 66 | epoch_loss += loss.item() 67 | epoch_acc += acc.item() 68 | 69 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 70 | 71 | def fit_batch(self, iterator, update=True): 72 | self.net.train() 73 | 74 | x, y = next(iter(iterator)) 75 | 76 | self.optimizer.zero_grad() 77 | 78 | predictions = self.net(x) 79 | 80 | loss = self.criterion(predictions, y) 81 | 82 | acc = self.metric(predictions, y) 83 | 84 | loss.backward() 85 | 86 | if update: 87 | self.optimizer.step() 88 | self.lr_scheduler.step() 89 | 90 | batch_loss = loss.item() 91 | batch_acc = acc.item() 92 | 93 | return batch_loss, batch_acc 94 | 95 | def evaluate_iterator(self, iterator): 96 | epoch_loss = 0 97 | epoch_acc = 0 98 | 99 | self.net.eval() 100 | 101 | with torch.no_grad(): 102 | for x, y in iterator: 103 | predictions = self.net(x) 104 | 105 | loss = self.criterion(predictions, y) 106 | 107 | acc = self.metric(predictions, y) 108 | 109 | epoch_loss += loss.item() 110 | epoch_acc += acc.item() 111 | 112 | return epoch_loss / len(iterator), epoch_acc / len(iterator) -------------------------------------------------------------------------------- /utils/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils.utils import get_network 3 | 4 | 5 | def parse_args(args_list=None): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument( 8 | 'experiment', 9 | help='name of experiment', 10 | type=str) 11 | parser.add_argument( 12 | "--use_weighted_average", 13 | help="if used the weighted average will be optimized, otherwise the average is optimized," 14 | " i,e, all the local functions are treated the same.", 15 | action='store_true' 16 | ) 17 | parser.add_argument( 18 | '--network_name', 19 | help='name of the network;', 20 | type=str 21 | ) 22 | parser.add_argument( 23 | '--architecture', 24 | help='architecture to use, possible: complete, centralized, ring, mst, original and matcha;', 25 | default='original' 26 | ) 27 | parser.add_argument( 28 | '--communication_budget', 29 | type=float, 30 | help='used to fix communication budget when architecture is matcha;', 31 | default=0.5 32 | ) 33 | parser.add_argument( 34 | "--random_ring_proba", 35 | type=float, 36 | help="the probability of using a random ring at 
each step; only used if architecture is ring", 37 | default=0.5 38 | ) 39 | parser.add_argument( 40 | '--parallel', 41 | help='if chosen the training well be run in parallel,' 42 | 'otherwise the training will be run in a sequential fashion;', 43 | action='store_true' 44 | ) 45 | parser.add_argument( 46 | '--fit_by_epoch', 47 | help='if chosen each local step corresponds to one epoch,' 48 | ' otherwise each local step corresponds to one gradient step', 49 | action='store_true' 50 | ) 51 | parser.add_argument( 52 | '--n_rounds', 53 | help='number of communication rounds;', 54 | type=int, 55 | default=1 56 | ) 57 | parser.add_argument( 58 | '--bz', 59 | help='batch_size;', 60 | type=int, 61 | default=1 62 | ) 63 | parser.add_argument( 64 | '--local_steps', 65 | help='number of local steps before communication;', 66 | type=int, 67 | default=1 68 | ) 69 | parser.add_argument( 70 | '--log_freq', 71 | help='number of local steps before communication;', 72 | type=int, 73 | default=1 74 | ) 75 | parser.add_argument( 76 | '--device', 77 | help='device to use, either cpu or gpu;', 78 | type=str, 79 | default="cpu" 80 | ) 81 | parser.add_argument( 82 | '--optimizer', 83 | help='optimizer to be used for the training;', 84 | type=str, 85 | default="adam" 86 | ) 87 | parser.add_argument( 88 | "--lr", 89 | type=float, 90 | help='learning rate', 91 | default=1e-3 92 | ) 93 | parser.add_argument( 94 | "--decay", 95 | help='learning rate decay scheme to be used;' 96 | ' possible are "cyclic", "sqrt", "linear" and "constant"(no learning rate decay);' 97 | 'default is "cyclic"', 98 | type=str, 99 | default="constant" 100 | ) 101 | 102 | if args_list: 103 | args = parser.parse_args(args_list) 104 | else: 105 | args = parser.parse_args() 106 | 107 | network = get_network(args.network_name, args.architecture) 108 | args.num_workers = network.number_of_nodes() 109 | 110 | return args 111 | -------------------------------------------------------------------------------- /graph_utils/utils/tsp_christofides.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from random import randint 3 | 4 | import numpy as np 5 | import networkx as nx 6 | 7 | from networkx.algorithms.matching import max_weight_matching 8 | from networkx.algorithms.euler import eulerian_circuit 9 | 10 | 11 | def christofides_tsp(graph, starting_node=0): 12 | """ 13 | Christofides TSP algorithm 14 | http://www.dtic.mil/dtic/tr/fulltext/u2/a025602.pdf 15 | Args: 16 | graph: 2d numpy array matrix 17 | starting_node: of the TSP 18 | Returns: 19 | tour given by christofies TSP algorithm 20 | Examples: 21 | >>> import numpy as np 22 | >>> graph = np.array([[ 0, 300, 250, 190, 230], 23 | >>> [300, 0, 230, 330, 150], 24 | >>> [250, 230, 0, 240, 120], 25 | >>> [190, 330, 240, 0, 220], 26 | >>> [230, 150, 120, 220, 0]]) 27 | >>> christofides_tsp(graph) 28 | """ 29 | 30 | mst = minimal_spanning_tree(graph, 'Prim', starting_node=0) 31 | odd_degree_nodes = list(_get_odd_degree_vertices(mst)) 32 | odd_degree_nodes_ix = np.ix_(odd_degree_nodes, odd_degree_nodes) 33 | nx_graph = nx.from_numpy_array(-1 * graph[odd_degree_nodes_ix]) 34 | matching = max_weight_matching(nx_graph, maxcardinality=True) 35 | euler_multigraph = nx.MultiGraph(mst) 36 | for edge in matching: 37 | euler_multigraph.add_edge(odd_degree_nodes[edge[0]], odd_degree_nodes[edge[1]], 38 | weight=graph[odd_degree_nodes[edge[0]]][odd_degree_nodes[edge[1]]]) 39 | euler_tour = list(eulerian_circuit(euler_multigraph, source=starting_node)) 40 | 
path = list(itertools.chain.from_iterable(euler_tour)) 41 | return _remove_repeated_vertices(path, starting_node)[:-1] 42 | 43 | 44 | def _get_odd_degree_vertices(graph): 45 | """ 46 | Finds all the odd degree vertices in graph 47 | Args: 48 | graph: 2d np array as adj. matrix 49 | Returns: 50 | Set of vertices that have odd degree 51 | """ 52 | odd_degree_vertices = set() 53 | for index, row in enumerate(graph): 54 | if len(np.nonzero(row)[0]) % 2 != 0: 55 | odd_degree_vertices.add(index) 56 | return odd_degree_vertices 57 | 58 | 59 | def _remove_repeated_vertices(path, starting_node): 60 | path = list(dict.fromkeys(path).keys()) 61 | path.append(starting_node) 62 | return path 63 | 64 | 65 | def minimal_spanning_tree(graph, mode='Prim', starting_node=None): 66 | """ 67 | Args: 68 | graph: weighted adjacency matrix as 2d np.array 69 | mode: method for calculating minimal spanning tree 70 | starting_node: node number to start construction of minimal spanning tree (Prim) 71 | Returns: 72 | minimal spanning tree as 2d array 73 | """ 74 | 75 | if mode == 'Prim': 76 | return _minimal_spanning_tree_prim(graph, starting_node) 77 | 78 | 79 | def _minimal_spanning_tree_prim(graph, starting_node): 80 | """ 81 | Args: 82 | graph: weighted adj. matrix as 2d np.array 83 | starting_node: node number to start construction of minimal spanning tree 84 | Returns: 85 | minimal spanning tree as 2d array calculted by Prim 86 | """ 87 | 88 | node_count = len(graph) 89 | all_nodes = [i for i in range(node_count)] 90 | 91 | if starting_node is None: 92 | starting_node = randint(0, node_count-1) 93 | 94 | unvisited_nodes = all_nodes 95 | visited_nodes = [starting_node] 96 | unvisited_nodes.remove(starting_node) 97 | mst = np.zeros((node_count, node_count)) 98 | 99 | while len(visited_nodes) != node_count: 100 | selected_subgraph = graph[np.array(visited_nodes)[:, None], np.array(unvisited_nodes)] 101 | # we mask non-exist edges with -- so it doesn't crash the argmin 102 | min_edge_index = np.unravel_index(np.ma.masked_equal(selected_subgraph, 0, copy=False).argmin(), 103 | selected_subgraph.shape) 104 | edge_from = visited_nodes[min_edge_index[0]] 105 | edge_to = unvisited_nodes[min_edge_index[1]] 106 | mst[edge_from, edge_to] = graph[edge_from, edge_to] 107 | mst[edge_to, edge_from] = graph[edge_from, edge_to] 108 | unvisited_nodes.remove(edge_to) 109 | visited_nodes.append(edge_to) 110 | return mst -------------------------------------------------------------------------------- /data/synthetic/generate_data.py: -------------------------------------------------------------------------------- 1 | """ From https://github.com/TalwalkarLab/leaf/blob/master/data/synthetic/""" 2 | import argparse 3 | import json 4 | import os 5 | import numpy as np 6 | from scipy.special import softmax 7 | 8 | NUM_DIM = 10 9 | PROB_CLUSTERS = [1.0] 10 | 11 | 12 | class SyntheticDataset: 13 | def __init__( 14 | self, 15 | num_classes=2, 16 | seed=931231, 17 | num_dim=NUM_DIM, 18 | prob_clusters=[0.5, 0.5]): 19 | 20 | np.random.seed(seed) 21 | 22 | self.num_classes = num_classes 23 | self.num_dim = num_dim 24 | self.num_clusters = len(prob_clusters) 25 | self.prob_clusters = prob_clusters 26 | 27 | self.side_info_dim = self.num_clusters 28 | 29 | self.Q = np.random.normal( 30 | loc=0.0, scale=1.0, size=(self.num_dim + 1, self.num_classes, self.side_info_dim)) 31 | 32 | self.Sigma = np.zeros((self.num_dim, self.num_dim)) 33 | for i in range(self.num_dim): 34 | self.Sigma[i, i] = (i + 1) ** (-1.2) 35 | 36 | self.means = 
self._generate_clusters() 37 | 38 | def get_task(self, num_samples): 39 | cluster_idx = np.random.choice( 40 | range(self.num_clusters), size=None, replace=True, p=self.prob_clusters) 41 | new_task = self._generate_task(self.means[cluster_idx], cluster_idx, num_samples) 42 | return new_task 43 | 44 | def _generate_clusters(self): 45 | means = [] 46 | for i in range(self.num_clusters): 47 | loc = np.random.normal(loc=0, scale=1., size=None) 48 | mu = np.random.normal(loc=loc, scale=1., size=self.side_info_dim) 49 | means.append(mu) 50 | return means 51 | 52 | def _generate_x(self, num_samples): 53 | B = np.random.normal(loc=0.0, scale=1.0, size=None) 54 | loc = np.random.normal(loc=B, scale=1.0, size=self.num_dim) 55 | 56 | samples = np.ones((num_samples, self.num_dim + 1)) 57 | samples[:, 1:] = np.random.multivariate_normal( 58 | mean=loc, cov=self.Sigma, size=num_samples) 59 | 60 | return samples 61 | 62 | def _generate_y(self, x, cluster_mean): 63 | model_info = np.random.normal(loc=cluster_mean, scale=0.1, size=cluster_mean.shape) 64 | w = np.matmul(self.Q, model_info) 65 | 66 | num_samples = x.shape[0] 67 | prob = softmax(np.matmul(x, w) + np.random.normal(loc=0., scale=0.1, size=(num_samples, self.num_classes)), 68 | axis=1) 69 | 70 | y = np.argmax(prob, axis=1) 71 | return y, w, model_info 72 | 73 | def _generate_task(self, cluster_mean, cluster_id, num_samples): 74 | x = self._generate_x(num_samples) 75 | y, w, model_info = self._generate_y(x, cluster_mean) 76 | 77 | # now that we have y, we can remove the bias coeff 78 | x = x[:, 1:] 79 | 80 | return {'x': x, 'y': y, 'w': w, 'model_info': model_info, 'cluster': cluster_id} 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | np.random.seed(args.seed) 86 | 87 | num_samples = get_num_samples(args.num_workers) 88 | dataset = SyntheticDataset( 89 | num_classes=args.num_classes, prob_clusters=PROB_CLUSTERS, num_dim=args.dimension, seed=args.seed) 90 | tasks = [dataset.get_task(s) for s in num_samples] 91 | users, num_samples, user_data = to_leaf_format(tasks) 92 | save_json('all_data', 'all_data.json', users, num_samples, user_data, args.num_classes) 93 | 94 | 95 | def get_num_samples(num_tasks, min_num_samples=5, max_num_samples=1000): 96 | num_samples = np.random.lognormal(3, 2, (num_tasks)).astype(int) 97 | num_samples = [min(s + min_num_samples, max_num_samples) for s in num_samples] 98 | return num_samples 99 | 100 | 101 | def to_leaf_format(tasks): 102 | users, num_samples, user_data = [], [], {} 103 | 104 | for i, t in enumerate(tasks): 105 | x, y = t['x'].tolist(), t['y'].tolist() 106 | u_id = str(i) 107 | 108 | users.append(u_id) 109 | num_samples.append(len(y)) 110 | user_data[u_id] = {'x': x, 'y': y} 111 | 112 | return users, num_samples, user_data 113 | 114 | 115 | def save_json(json_dir, json_name, users, num_samples, user_data, num_classes): 116 | if not os.path.exists(json_dir): 117 | os.makedirs(json_dir) 118 | 119 | json_file = { 120 | 'users': users, 121 | 'num_samples': num_samples, 122 | 'user_data': user_data, 123 | "num_classes": num_classes 124 | } 125 | 126 | with open(os.path.join(json_dir, json_name), 'w') as outfile: 127 | json.dump(json_file, outfile) 128 | 129 | 130 | def parse_args(): 131 | parser = argparse.ArgumentParser() 132 | 133 | parser.add_argument( 134 | '--num_workers', 135 | help='number of workers;', 136 | type=int, 137 | required=True) 138 | parser.add_argument( 139 | '--num_classes', 140 | help='number of classes;', 141 | type=int, 142 | required=True) 143 | parser.add_argument( 144 | 
'--dimension', 145 | help='data dimension;', 146 | type=int, 147 | required=True) 148 | parser.add_argument( 149 | '--seed', 150 | help='seed for the random processes;', 151 | type=int, 152 | default=931231, 153 | required=False) 154 | return parser.parse_args() 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /models/sent140/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from utils.optim import get_optimizer, get_lr_scheduler 4 | from ..model import Model 5 | 6 | 7 | class LSTM(nn.Module): 8 | def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 9 | bidirectional, dropout, pad_idx): 10 | super().__init__() 11 | 12 | self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx) 13 | 14 | self.lstm = nn.LSTM(embedding_dim, 15 | hidden_dim, 16 | num_layers=n_layers, 17 | bidirectional=bidirectional, 18 | dropout=dropout) 19 | 20 | self.fc = nn.Linear(hidden_dim * 2, output_dim) 21 | 22 | self.dropout = nn.Dropout(dropout) 23 | 24 | def forward(self, text, text_lengths): 25 | # text = [sent len, batch size] 26 | self.lstm.flatten_parameters() 27 | embedded = self.dropout(self.embedding(text)) 28 | 29 | # pack sequence 30 | packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths) 31 | 32 | packed_output, (hidden, cell) = self.lstm(packed_embedded) 33 | 34 | # unpack sequence 35 | _, _ = nn.utils.rnn.pad_packed_sequence(packed_output) 36 | 37 | hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)) 38 | 39 | return self.fc(hidden) 40 | 41 | 42 | class LSTMSentiment(Model): 43 | def __init__(self, iterator, criterion, metric, device, optimizer_name="adam", lr_scheduler="sqrt", initial_lr=1e-3, 44 | epoch_size=1, embedding_dim=100, hidden_dim=256, output_dim=1, n_layers=2, bidirectional=True, 45 | dropout=0.5): 46 | """ 47 | 48 | :param iterator: 49 | :param criterion: 50 | :param metric: 51 | :param device: 52 | :param optimizer_name: 53 | :param lr_scheduler: 54 | :param initial_lr: 55 | :param embedding_dim: 56 | :param hidden_dim: 57 | :param output_dim: 58 | :param n_layers: 59 | :param bidirectional: 60 | :param dropout: 61 | """ 62 | super(Model, self).__init__() 63 | 64 | self.device = device 65 | self.criterion = criterion 66 | self.metric = metric 67 | 68 | text_field = iterator.dataset.fields['text'] 69 | 70 | pad_idx = text_field.vocab.stoi[text_field.pad_token] 71 | unk_idx = text_field.vocab.stoi[text_field.unk_token] 72 | 73 | self.net = LSTM(vocab_size=len(text_field.vocab), 74 | embedding_dim=embedding_dim, 75 | hidden_dim=hidden_dim, 76 | output_dim=output_dim, 77 | n_layers=n_layers, 78 | bidirectional=bidirectional, 79 | dropout=dropout, 80 | pad_idx=pad_idx).to(device) 81 | 82 | # initialize embeddings 83 | pretrained_embeddings = text_field.vocab.vectors 84 | self.net.embedding.weight.data.copy_(pretrained_embeddings) 85 | 86 | self.net.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim).to(self.device) 87 | self.net.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim).to(self.device) 88 | 89 | # Freeze embedding 90 | self.net.embedding.weight.requires_grad = False 91 | 92 | self.optimizer = get_optimizer(optimizer_name, self.net, initial_lr) 93 | self.lr_scheduler = get_lr_scheduler(self.optimizer, lr_scheduler, epoch_size) 94 | 95 | def fit_iterator_one_epoch(self, iterator): 96 | epoch_loss 
= 0 97 | epoch_acc = 0 98 | 99 | self.net.train() 100 | 101 | for batch in iterator: 102 | self.optimizer.zero_grad() 103 | 104 | text, text_lengths = batch.text 105 | 106 | predictions = self.net(text, text_lengths).squeeze(1) 107 | 108 | loss = self.criterion(predictions, batch.label) 109 | 110 | acc = self.metric(predictions, batch.label) 111 | 112 | loss.backward() 113 | 114 | self.optimizer.step() 115 | 116 | self.lr_scheduler.step() 117 | 118 | epoch_loss += loss.item() 119 | epoch_acc += acc.item() 120 | 121 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 122 | 123 | def fit_batch(self, iterator, update=True): 124 | self.net.train() 125 | 126 | batch = next(iter(iterator)) 127 | self.optimizer.zero_grad() 128 | 129 | text, text_lengths = batch.text 130 | 131 | predictions = self.net(text, text_lengths).squeeze(1) 132 | 133 | loss = self.criterion(predictions, batch.label) 134 | 135 | acc = self.metric(predictions, batch.label) 136 | 137 | loss.backward() 138 | 139 | if update: 140 | self.optimizer.step() 141 | self.lr_scheduler.step() 142 | 143 | return loss.item(), acc.item() 144 | 145 | def evaluate_iterator(self, iterator): 146 | epoch_loss = 0 147 | epoch_acc = 0 148 | 149 | self.net.eval() 150 | 151 | with torch.no_grad(): 152 | for batch in iterator: 153 | text, text_lengths = batch.text 154 | 155 | predictions = self.net(text, text_lengths).squeeze(1) 156 | 157 | loss = self.criterion(predictions, batch.label) 158 | 159 | acc = self.metric(predictions, batch.label) 160 | 161 | epoch_loss += loss.item() 162 | epoch_acc += acc.item() 163 | 164 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 165 | -------------------------------------------------------------------------------- /data/shakespeare/split_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | import time 5 | 6 | 7 | def iid_divide(l, g): 8 | """ 9 | divide list l among g groups 10 | each group has either int(len(l)/g) or int(len(l)/g)+1 elements 11 | returns a list of groups 12 | 13 | """ 14 | num_elems = len(l) 15 | group_size = int(len(l)/g) 16 | num_big_groups = num_elems - g * group_size 17 | num_small_groups = g - num_big_groups 18 | glist = [] 19 | for i in range(num_small_groups): 20 | glist.append(l[group_size * i : group_size * (i + 1)]) 21 | bi = group_size*num_small_groups 22 | group_size += 1 23 | for i in range(num_big_groups): 24 | glist.append(l[bi + group_size * i:bi + group_size * (i + 1)]) 25 | return glist 26 | 27 | 28 | parser = argparse.ArgumentParser() 29 | 30 | parser.add_argument('--num_workers', 31 | help=('number of workers/users;' 32 | 'default: 1;'), 33 | type=int, 34 | default=1) 35 | parser.add_argument('--iid', 36 | help='sample iid;', 37 | action="store_true") 38 | parser.add_argument('--niid', 39 | help="sample niid;", 40 | dest='iid', action='store_false') 41 | parser.add_argument('--s_frac', 42 | help='fraction of all data to sample; default: 0.1;', 43 | type=float, 44 | default=0.1) 45 | parser.add_argument('--tr_frac', 46 | help='fraction in training set; default: 0.8;', 47 | type=float, 48 | default=0.8) 49 | parser.add_argument('--seed', 50 | help='args.seed for random partitioning of test/train data', 51 | type=int, 52 | default=None) 53 | 54 | parser.set_defaults(user=False) 55 | 56 | args = parser.parse_args() 57 | 58 | 59 | if __name__ == "__main__": 60 | print('------------------------------') 61 | print('generating training and test sets') 62 | 63 | 
rng_seed = (args.seed if (args.seed is not None and args.seed >= 0) else int(time.time())) 64 | rng = random.Random(rng_seed) 65 | 66 | train_file = os.path.join("train", "train.txt") 67 | test_file = os.path.join("test", "test.txt") 68 | 69 | data_dir = os.path.join('raw_data', 'by_play_and_character') 70 | 71 | if args.iid: 72 | # TO DO: Factorize this part 73 | all_lines = [] 74 | for file_name in os.listdir(data_dir): 75 | file_path = os.path.join(data_dir, file_name) 76 | with open(file_path, "r") as f: 77 | lines = f.readlines() 78 | all_lines += lines 79 | 80 | tot_num_samples = len(all_lines) 81 | num_new_samples = int(args.s_frac * tot_num_samples) 82 | 83 | indices = [i for i in range(tot_num_samples)] 84 | new_indices = rng.sample(indices, num_new_samples) 85 | 86 | indices_groups = iid_divide(new_indices, args.num_workers) 87 | 88 | for id_w, worker_indices in enumerate(indices_groups): 89 | curr_num_samples = len(worker_indices) 90 | 91 | num_train_samples = max(1, int(args.tr_frac * curr_num_samples)) 92 | num_test_samples = curr_num_samples - num_train_samples 93 | 94 | train_indices = rng.sample(worker_indices, num_train_samples) 95 | test_indices = list(set(worker_indices) - set(train_indices)) 96 | 97 | local_train_file = os.path.join("train", "{}.txt".format(id_w)) 98 | 99 | for (file_, indices) in [(train_file, train_indices), 100 | (local_train_file, train_indices), 101 | (test_file, test_indices)]: 102 | 103 | for sample_idx in indices: 104 | sample = all_lines[sample_idx] 105 | 106 | with open(file_, "a") as f: 107 | f.write(sample) 108 | else: 109 | writers = os.listdir(data_dir) 110 | 111 | rng.shuffle(writers) 112 | writers_by_workers = iid_divide(writers, args.num_workers) 113 | 114 | for id_w, worker_writers in enumerate(writers_by_workers): 115 | all_worker_lines = [] 116 | for writer in worker_writers: 117 | file_path = os.path.join(data_dir, writer) 118 | with open(file_path, "r") as f: 119 | lines = f.readlines() 120 | 121 | all_worker_lines += lines 122 | 123 | tot_num_samples = len(all_worker_lines) 124 | num_new_samples = int(args.s_frac * tot_num_samples) 125 | 126 | indices = [i for i in range(tot_num_samples)] 127 | new_indices = rng.sample(indices, num_new_samples) 128 | 129 | new_worker_lines = [all_worker_lines[i] for i in new_indices] 130 | 131 | num_train_samples = max(1, int(args.tr_frac * num_new_samples)) 132 | num_test_samples = num_new_samples - num_train_samples 133 | 134 | train_indices = rng.sample(new_indices, num_train_samples) 135 | test_indices = list(set(new_indices) - set(train_indices)) 136 | 137 | local_train_file = os.path.join("train", "{}.txt".format(id_w)) 138 | 139 | for (file_, indices) in [(train_file, train_indices), 140 | (local_train_file, train_indices), 141 | (test_file, test_indices)]: 142 | 143 | for sample_idx in indices: 144 | sample = all_worker_lines[sample_idx] 145 | 146 | with open(file_, "a") as f: 147 | f.write(sample) 148 | -------------------------------------------------------------------------------- /graph_utils/time_simulator.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import networkx as nx 4 | import numpy as np 5 | 6 | import geopy.distance 7 | from geopy.geocoders import Nominatim 8 | 9 | geolocator = Nominatim(user_agent="delay", timeout=20) 10 | 11 | 12 | def get_zoo_topology(file_path, 13 | bandwidth=1e9, 14 | upload_capacity_at_edge=35 * 1e6, 15 | download_capacity_at_edge=144 * 1e6): 16 | """ 17 | Read zoo_topology data into 
nx.DiGraph(); 18 | in the output graph each edge has two information: "capacity" and "distance"; 19 | each node has two information: "upload capacity" and "download capacity"; 20 | :param file_path : path to .gml file with topology information 21 | :param bandwidth: (float) represent links capacity, 22 | used when information not available in .gml file 23 | :param upload_capacity_at_edge: https://en.wikipedia.org/wiki/Bit_rate for information 24 | :param download_capacity_at_edge: https://en.wikipedia.org/wiki/Bit_rate for information 25 | :return: G_z (nx.DiGraph) 26 | """ 27 | 28 | network_data = nx.read_gml(file_path) 29 | 30 | G_z = nx.Graph() 31 | G_z.add_nodes_from(network_data) 32 | 33 | # add nodes capacity 34 | nx.set_node_attributes(G_z, upload_capacity_at_edge * 1e-3, 'upload_capacity') 35 | nx.set_node_attributes(G_z, download_capacity_at_edge * 1e-3, "download_capacity") 36 | 37 | # add edges data 38 | for u, v, data in network_data.edges.data(): 39 | # get distance 40 | try: 41 | distance = data["distance"] 42 | 43 | except AttributeError: 44 | try: 45 | coords_1 = (network_data.nodes(data=True)[u]["Latitude"], 46 | network_data.nodes(data=True)[u]["Longitude"]) 47 | 48 | coords_2 = (network_data.nodes(data=True)[v]["Latitude"], 49 | network_data.nodes(data=True)[v]["Longitude"]) 50 | 51 | except KeyError: 52 | time.sleep(1.2) # To avoid Service time out Error 53 | 54 | geo = geolocator.geocode(u, timeout=20) 55 | 56 | coords_1 = (geo.latitude, geo.longitude) 57 | 58 | time.sleep(1.2) # To avoid Service time out Error 59 | 60 | geo = geolocator.geocode(v, timeout=20) 61 | 62 | coords_2 = (geo.latitude, geo.longitude) 63 | 64 | distance = geopy.distance.distance(coords_1, coords_2).km 65 | 66 | # add_edge 67 | G_z.add_edge(u, v, capacity=bandwidth * 1e-3, distance=distance) 68 | 69 | return G_z 70 | 71 | 72 | def initialize_delays(underlay, overlay, model_size): 73 | """ 74 | compute delays between nodes ignoring download congestion effect 75 | :param underlay: (nx.Graph()) 76 | :param overlay: (nx.Graph()) 77 | :param model_size: message_size in bits, see https://keras.io/applications/ for examples 78 | :return: nxGraph() 79 | """ 80 | for u, v, data in overlay.edges(data=True): 81 | overlay.edges[u, v]["delay"] = overlay.edges[u, v]["weight"] 82 | 83 | return overlay 84 | 85 | 86 | def init_iteration_end_time(overlay, computation_time=0): 87 | """ 88 | 89 | :param overlay: 90 | :param computation_time: 91 | :return: 92 | """ 93 | nx.set_node_attributes(overlay, computation_time, "end_time") 94 | return overlay 95 | 96 | 97 | def get_iteration_end_time(underlay, overlay, model_size, computation_time): 98 | """ 99 | Compute the end times of next iteration having the end times for current iteration. 
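    Informally, and as a simplified reading of the code below rather than an exact specification:

        t_ij           = end_time_i + delay_ij     (when i's message becomes available at j)
        receive_time_j = arrival time of the last incoming message, with consecutive receptions
                         serialized at model_size / download_capacity_j each
        end_time_j     = receive_time_j + computation_time
                         + model_size * out_degree(j) / upload_capacity_j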
100 | :param underlay: 101 | :param overlay: 102 | :param model_size: 103 | :param computation_time 104 | :return: 105 | """ 106 | out_degrees = dict(overlay.out_degree()) 107 | for i, j in overlay.edges: 108 | overlay.edges[i, j]["t"] = overlay.edges[i, j]["delay"] + overlay.nodes[i]["end_time"] 109 | 110 | def get_edge_time(e): 111 | return overlay.edges[e[0], e[1]]["t"] 112 | 113 | for j in overlay.nodes: 114 | overlay.nodes[j]["end_time"] = 0 115 | 116 | # get all the input edges to "j" sorted by t_{ij} 117 | edges = [] 118 | for i in overlay.predecessors(j): 119 | edges.append((i, j)) 120 | 121 | if len(edges) > 0: 122 | edges.sort(key=get_edge_time) 123 | 124 | t_prev = get_edge_time(edges[0]) + model_size / underlay.nodes[j]["download_capacity"] 125 | 126 | for edge in edges[1:]: 127 | if get_edge_time(edge) <= t_prev + model_size / underlay.nodes[j]["download_capacity"]: 128 | t_prev = t_prev + model_size / underlay.nodes[j]["download_capacity"] 129 | else: 130 | t_prev = get_edge_time(edge) 131 | 132 | else: 133 | t_prev = 0 134 | 135 | overlay.nodes[j]["end_time"] = t_prev + computation_time + \ 136 | (model_size * out_degrees[j]) / underlay.nodes[j]["upload_capacity"] 137 | 138 | return overlay 139 | 140 | 141 | def simulate_network(underlay, overlay, n_iterations, model_size=1e8, computation_time=0): 142 | """ 143 | 144 | :param underlay: 145 | :param overlay: 146 | :param n_iterations: 147 | :param model_size: 148 | :param computation_time 149 | :return: 150 | """ 151 | time_evolution = np.zeros((overlay.number_of_nodes(), n_iterations)) 152 | 153 | overlay = initialize_delays(underlay, overlay, model_size) 154 | overlay = init_iteration_end_time(overlay, computation_time) 155 | 156 | for iteration in range(n_iterations): 157 | overlay = get_iteration_end_time(underlay, overlay, model_size, computation_time) 158 | for ii, (_, end_time) in enumerate(overlay.nodes.data("end_time")): 159 | time_evolution[ii, iteration] = end_time 160 | 161 | return time_evolution 162 | -------------------------------------------------------------------------------- /models/shakespeare/gru.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from utils.optim import get_optimizer, get_lr_scheduler 4 | from torch.autograd import Variable 5 | import string 6 | from ..model import Model 7 | 8 | 9 | class RNN(nn.Module): 10 | def __init__(self, input_size, embed_size, hidden_size, output_size, n_layers): 11 | super(RNN, self).__init__() 12 | self.input_size = input_size 13 | self.hidden_size = hidden_size 14 | self.embed_size = embed_size 15 | self.output_size = output_size 16 | self.n_layers = n_layers 17 | 18 | self.encoder = nn.Embedding(input_size, embed_size) 19 | self.gru = nn.GRU(embed_size, hidden_size, n_layers) 20 | self.decoder = nn.Linear(hidden_size, output_size) 21 | 22 | def forward(self, input_, hidden): 23 | self.gru.flatten_parameters() 24 | batch_size = input_.size(0) 25 | encoded = self.encoder(input_) 26 | output, hidden = self.gru(encoded.view(1, batch_size, -1), hidden) 27 | output = self.decoder(output.view(batch_size, -1)) 28 | return output, hidden 29 | 30 | def init_hidden(self, batch_size): 31 | return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)) 32 | 33 | 34 | class NextCharDecoder(Model): 35 | def __init__(self, criterion, metric, device, 36 | optimizer_name="adam", lr_scheduler="sqrt", initial_lr=1e-3, epoch_size=1, 37 | embed_size=16, hidden_size=256, n_layers=2): 38 | 
super(Model, self).__init__() 39 | 40 | vocab_size = len(string.printable) 41 | self.net = RNN(vocab_size, embed_size, hidden_size, vocab_size, n_layers).to(device) 42 | self.criterion = criterion 43 | self.metric = metric 44 | self.device = device 45 | 46 | self.optimizer = get_optimizer(optimizer_name, self.net, initial_lr) 47 | self.lr_scheduler = get_lr_scheduler(self.optimizer, lr_scheduler, epoch_size) 48 | 49 | def fit_iterator_one_epoch(self, iterator): 50 | self.net.train() 51 | 52 | epoch_loss = 0 53 | epoch_acc = 0 54 | 55 | for inp, target in iterator: 56 | 57 | inp = inp.to(self.device) 58 | target = target.to(self.device) 59 | 60 | hidden = self.net.init_hidden(inp.size(0)).to(self.device) 61 | self.optimizer.zero_grad() 62 | 63 | loss = 0 64 | acc = 0 65 | 66 | for c in range(iterator.dataset.chunk_len): 67 | output, hidden = self.net(inp[:, c], hidden) 68 | loss += self.criterion(output.view(inp.size(0), -1), target[:, c]) 69 | acc += self.metric(output, target[:, c]).item() 70 | 71 | loss /= iterator.dataset.chunk_len 72 | acc /= iterator.dataset.chunk_len 73 | 74 | loss.backward() 75 | 76 | self.optimizer.step() 77 | self.lr_scheduler.step() 78 | 79 | epoch_loss += loss.item() 80 | epoch_acc += acc 81 | 82 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 83 | 84 | def fit_batch(self, iterator, update=True): 85 | self.net.train() 86 | 87 | inp, target = next(iter(iterator)) 88 | inp = inp.to(self.device) 89 | target = target.to(self.device) 90 | 91 | hidden = self.net.init_hidden(inp.size(0)).to(self.device) 92 | self.optimizer.zero_grad() 93 | 94 | loss = 0 95 | acc = 0 96 | 97 | for c in range(iterator.dataset.chunk_len): 98 | output, hidden = self.net(inp[:, c], hidden) 99 | loss += self.criterion(output.view(inp.size(0), -1), target[:, c]) 100 | acc += self.metric(output, target[:, c]).item() 101 | 102 | loss /= iterator.dataset.chunk_len 103 | acc /= iterator.dataset.chunk_len 104 | 105 | loss.backward() 106 | 107 | if update: 108 | self.optimizer.step() 109 | self.lr_scheduler.step() 110 | 111 | return loss.item(), acc 112 | 113 | def evaluate_iterator(self, iterator): 114 | self.net.eval() 115 | 116 | epoch_loss = 0 117 | epoch_acc = 0 118 | 119 | for inp, target in iterator: 120 | 121 | inp = inp.to(self.device) 122 | target = target.to(self.device) 123 | 124 | hidden = self.net.init_hidden(inp.size(0)).to(self.device) 125 | 126 | loss = 0 127 | acc = 0 128 | for c in range(iterator.dataset.chunk_len): 129 | output, hidden = self.net(inp[:, c], hidden) 130 | loss += self.criterion(output.view(inp.size(0), -1), target[:, c]) 131 | acc += self.metric(output, target[:, c]).item() 132 | 133 | loss /= iterator.dataset.chunk_len 134 | acc /= iterator.dataset.chunk_len 135 | 136 | epoch_loss += loss.item() 137 | epoch_acc += acc 138 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 139 | 140 | def generate(self, prime_str="Wh", predict_len=200, temperature=0.8): 141 | all_characters = string.printable 142 | hidden = self.net.init_hidden(1).to(self.device) 143 | 144 | prime_input = torch.zeros(1, len(prime_str)).long().to(self.device) 145 | for c in range(len(prime_str)): 146 | prime_input[0, c] = all_characters.index(prime_str[c]) 147 | 148 | predicted = prime_str 149 | 150 | for p in range(len(prime_str) - 1): 151 | _, hidden = self.net(prime_input[:, p], hidden) 152 | 153 | inp = prime_input[:, -1] 154 | 155 | for p in range(predict_len): 156 | output, hidden = self.net(inp, hidden) 157 | 158 | output_dist = 
output.data.view(-1).div(temperature).exp() 159 | top_i = torch.multinomial(output_dist, 1)[0] 160 | 161 | predicted_char = all_characters[top_i] 162 | predicted += predicted_char 163 | 164 | inp = torch.zeros(1, len(predicted_char)).long().to(self.device) 165 | for c in range(len(predicted_char)): 166 | inp[0, c] = all_characters.index(predicted_char[c]) 167 | 168 | return predicted 169 | -------------------------------------------------------------------------------- /data/inaturalist/split_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import pickle 5 | import time 6 | import random 7 | from collections import Counter 8 | 9 | import networkx as nx 10 | import numpy as np 11 | 12 | import geopy.distance 13 | from geopy.geocoders import Nominatim 14 | 15 | 16 | class FileException(FileNotFoundError): 17 | def __init__(self, message): 18 | super().__init__(message) 19 | 20 | 21 | parser = argparse.ArgumentParser() 22 | 23 | parser.add_argument('--network', 24 | help="name of the network to use, should be present in /graph_utils/data; default: amazon_us", 25 | type=str, 26 | default="amazon_us") 27 | parser.add_argument('--num_categories', 28 | help="number of classes to include, default: 80", 29 | type=int, 30 | default="80") 31 | parser.add_argument('--s_frac', 32 | help='fraction of all data to sample; default: 0.1;', 33 | type=float, 34 | default=1) 35 | parser.add_argument('--tr_frac', 36 | help='fraction in training set; default: 0.8;', 37 | type=float, 38 | default=0.9) 39 | parser.add_argument('--seed', 40 | help='args.seed for random partitioning of test/train data', 41 | type=int, 42 | default=None) 43 | 44 | args = parser.parse_args() 45 | 46 | 47 | if __name__ == "__main__": 48 | network_path = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "graph_utils/data", args.network + ".gml")) 49 | 50 | if not os.path.isfile(network_path): 51 | raise FileException("The network with name {} is not found!".format(network_path)) 52 | 53 | rng_seed = (args.seed if (args.seed is not None and args.seed >= 0) else int(time.time())) 54 | rng = random.Random(rng_seed) 55 | np.random.seed(rng_seed) 56 | 57 | # Get workers locations 58 | network_path = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "graph_utils/data", args.network + ".gml")) 59 | workers_network = nx.read_gml(network_path, label="label") 60 | nodes_locs = [] 61 | geolocator = Nominatim(user_agent="delay", timeout=20) 62 | for node in workers_network.nodes(): 63 | time.sleep(1.0) # To avoid Service time out Error 64 | geo = geolocator.geocode(node, timeout=20) 65 | nodes_locs.append((geo.latitude, geo.longitude)) 66 | 67 | # Get the information for images and locations 68 | with open(os.path.join("raw_data", "train2018_locations.json")) as f: 69 | train_imgs_locations = json.load(f) 70 | 71 | with open(os.path.join("raw_data", "val2018_locations.json")) as f: 72 | val_imgs_locations = json.load(f) 73 | 74 | with open(os.path.join("raw_data", "train2018.json")) as f: 75 | train_images_data = json.load(f) 76 | 77 | with open(os.path.join("raw_data", "val2018.json")) as f: 78 | val_images_data = json.load(f) 79 | 80 | all_data = dict() 81 | for images_data in [train_images_data, val_images_data]: 82 | for img, annotation in zip(images_data["images"], images_data["annotations"]): 83 | img_id = img["id"] 84 | img_path = ["raw_data/"] + img["file_name"].split("/")[1:] 85 | img_path = "/".join(img_path) 86 | category_id = 
annotation["category_id"] 87 | 88 | all_data[img_id] = {"path": img_path, "class": category_id} 89 | 90 | for imgs_locations in [train_imgs_locations, val_imgs_locations]: 91 | for location in imgs_locations: 92 | img_id = location["id"] 93 | all_data[img_id]["lat"] = location["lat"] 94 | all_data[img_id]["lon"] = location["lon"] 95 | 96 | # Get most common categories 97 | all_categories = [] 98 | for img_id in all_data: 99 | all_categories.append(all_data[img_id]['class']) 100 | 101 | c = Counter(all_categories) 102 | most_common_categories = c.most_common(args.num_categories) 103 | most_common_categories = [i for (i, j) in most_common_categories] 104 | 105 | relabel_categories = {category: idx for idx, category in enumerate(most_common_categories)} 106 | most_common_categories = set(most_common_categories) 107 | 108 | # Assign images to closest workers 109 | imgs_by_workers = {worker_id: [] for worker_id in range(workers_network.number_of_nodes())} 110 | 111 | for img_id in all_data: 112 | category = all_data[img_id]['class'] 113 | if category in most_common_categories: 114 | # Get closest worker to node 115 | coord_img = (all_data[img_id]['lat'], all_data[img_id]['lon']) 116 | distances = np.array([geopy.distance.distance(coord_img, coord_node).km for coord_node in nodes_locs]) 117 | worker_id = np.argmin(distances) 118 | 119 | img_data = (all_data[img_id]["path"], relabel_categories[category]) 120 | 121 | imgs_by_workers[worker_id].append(img_data) 122 | 123 | # Split data to train and test 124 | train_data = [] 125 | test_data = [] 126 | 127 | for worker_id in imgs_by_workers.keys(): 128 | all_worker_data = imgs_by_workers[worker_id] 129 | 130 | tot_num_samples = len(all_worker_data) 131 | num_new_samples = int(args.s_frac * tot_num_samples) 132 | 133 | indices = [i for i in range(tot_num_samples)] 134 | new_indices = rng.sample(indices, num_new_samples) 135 | 136 | num_train_samples = max(1, int(args.tr_frac * num_new_samples)) 137 | num_test_samples = num_new_samples - num_train_samples 138 | 139 | train_indices = rng.sample(new_indices, num_train_samples) 140 | test_indices = list(set(new_indices) - set(train_indices)) 141 | 142 | worker_data = [all_worker_data[ii] for ii in train_indices] 143 | train_data += [all_worker_data[ii] for ii in train_indices] 144 | test_data += [all_worker_data[ii] for ii in test_indices] 145 | 146 | with open('train/{}.pkl'.format(worker_id), 'wb') as f: 147 | pickle.dump(worker_data, f, pickle.HIGHEST_PROTOCOL) 148 | 149 | with open('train/train.pkl', 'wb') as f: 150 | pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL) 151 | 152 | with open('test/test.pkl', 'wb') as f: 153 | pickle.dump(test_data, f, pickle.HIGHEST_PROTOCOL) 154 | -------------------------------------------------------------------------------- /graph_utils/utils/matcha.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | 4 | import cvxpy as cp 5 | import numpy as np 6 | import networkx as nx 7 | 8 | from .matching_decomposition import matching_decomposition 9 | 10 | 11 | class RandomTopologyGenerator(object): 12 | """ 13 | Attributes: 14 | - laplacian_matrices: List of numpy arrays; each array represents the laplacian matrix of a matching; 15 | - communication_budget: Constraint controlling the sum of the weights, 16 | and equivalently controlling the expected communication time; 17 | - path_to_history_file: path to .csv file used to save the history of selected matching at each step 18 | - activation_probas: np.array 
of the same size as "laplacian_matrices"; 19 | - current_matching_activations: list of booleans, each of them represent if a matching is used; 20 | - matching_list: list of nx.Graph() objects; 21 | - alpha: float to be use in generating mixing matrix 22 | """ 23 | def __init__(self, network, communication_budget, network_save_path=None, path_to_history_file=None): 24 | self.network = network 25 | self.communication_budget = communication_budget 26 | self.path_to_history_file = path_to_history_file 27 | self.network_save_path = network_save_path 28 | 29 | # eliminate self loops 30 | self.network.remove_edges_from(nx.selfloop_edges(self.network)) 31 | 32 | self.matching_list, self.laplacian_matrices = matching_decomposition(self.network) 33 | 34 | self.number_workers = self.laplacian_matrices[0].shape[0] 35 | self.number_matching = len(self.laplacian_matrices) 36 | 37 | # Initialize generator parameters 38 | self.activation_probas = self.get_matching_activation_probabilities() 39 | self.activation_probas = np.clip(self.activation_probas, 0., 1.) 40 | 41 | self.alpha, self.spectral_norm = self.get_mixing_matrix_parameter() 42 | 43 | # Initialize 44 | self.current_step = -1 45 | self.current_matching_activations = np.ones(self.number_workers) 46 | self.current_topology = self.network 47 | 48 | if self.network_save_path: 49 | nx.write_gml(self.network, self.network_save_path) 50 | 51 | def get_matching_activation_probabilities(self): 52 | """ 53 | Computes a set of activation probabilities that maximize the connectivity of the expected graph 54 | given a communication time constraint; 55 | For given Laplacian matrices, it computes optimal weights to sum them, in order to maximize 56 | the second largest eigenvalue of their weighted sum; 57 | See https://arxiv.org/pdf/1905.09435.pdf (Formula 5) for details; 58 | and equivalently controlling the expected communication time; 59 | :return: np.array of the same size as "laplacian_matrices"; each entry represents the probability 60 | of activating a sub-graph; 61 | """ 62 | p = cp.Variable(self.number_matching) 63 | gamma = cp.Variable() 64 | beta = cp.Variable() 65 | constraints = [p <= 1, p >= 0, 66 | p.T @ np.ones(self.number_matching) <= self.communication_budget * self.number_matching, 67 | gamma * np.eye(self.number_workers) - beta * np.ones((self.number_workers, self.number_workers)) 68 | << cp.sum([p[i] * self.laplacian_matrices[i] for i in range(self.number_matching)])] 69 | objective = cp.Maximize(gamma) 70 | problem = cp.Problem(objective, constraints) 71 | 72 | problem.solve() 73 | 74 | return p.value 75 | 76 | def get_mixing_matrix_parameter(self): 77 | """ 78 | Computes optimal equal weight mixing matrix parameter; 79 | i.e. 
computes alpha in order to optimize the spectral gap of the mixing matrix W, where 80 | W = I - alpha * L_bar, with being identity matrix and L_bar is the expected Laplacian matrix; 81 | See https://arxiv.org/pdf/1905.09435.pdf (Formula 6 and 7) for details; 82 | each entry represents the probability of activating a sub-graph; 83 | :return: alpha (float) 84 | """ 85 | L_bar = np.zeros((self.number_workers, self.number_workers)) 86 | L_tilde = np.zeros((self.number_workers, self.number_workers)) 87 | 88 | for idx in range(self.number_matching): 89 | L_bar += self.activation_probas[idx] * self.laplacian_matrices[idx] 90 | L_tilde += self.activation_probas[idx] * (1 - self.activation_probas[idx]) * self.laplacian_matrices[idx] 91 | 92 | rho = cp.Variable() 93 | alpha = cp.Variable() 94 | beta = cp.Variable() 95 | 96 | objective = cp.Minimize(rho) 97 | 98 | constraints = [alpha ** 2 - beta <= 0, 99 | np.eye(self.number_workers) - 2 * alpha * L_bar + beta * (L_bar @ L_bar + 2 * L_tilde) 100 | - (1 / self.number_workers) * np.ones((self.number_workers, self.number_workers)) 101 | << rho * np.eye(self.number_workers)] 102 | 103 | prob = cp.Problem(objective, constraints) 104 | prob.solve() 105 | 106 | return alpha.value, rho.value 107 | 108 | def step(self): 109 | """ 110 | Generating random topology at any iteration: given activation probabilities, generates an independent 111 | Bernoulli random variable Bj for each matching in "matching_list", 112 | the activated topology is the concatenation of the activated matching. 113 | The mixing matrix is then computed as W = I - alpha * L, where L is the Laplacian matrix 114 | of the activated topology; 115 | """ 116 | self.current_topology = nx.Graph() 117 | laplacian_matrix = np.zeros((self.number_workers, self.number_workers)) 118 | 119 | self.current_matching_activations = np.random.binomial(n=1, p=self.activation_probas) 120 | while self.current_matching_activations.sum() == 0: 121 | self.current_matching_activations = np.random.binomial(n=1, p=self.activation_probas) 122 | 123 | for idx, matching_activation in enumerate(self.current_matching_activations): 124 | if matching_activation: 125 | self.current_topology = nx.compose(self.current_topology, self.matching_list[idx]) 126 | laplacian_matrix += self.laplacian_matrices[idx] 127 | 128 | mixing_matrix = np.eye(self.number_workers) - self.alpha * laplacian_matrix 129 | 130 | self.current_topology = nx.from_numpy_matrix(mixing_matrix) 131 | 132 | self.current_step += 1 133 | 134 | if self.path_to_history_file: 135 | with open(self.path_to_history_file, "a") as csvfile: 136 | writer = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) 137 | writer.writerow(self.current_matching_activations.tolist()) 138 | -------------------------------------------------------------------------------- /data/sent140/split_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import argparse 4 | import json 5 | import random 6 | import time 7 | import numpy as np 8 | 9 | 10 | def iid_divide(l, g): 11 | """ 12 | divide list l among g groups 13 | each group has either int(len(l)/g) or int(len(l)/g)+1 elements 14 | returns a list of groups 15 | 16 | """ 17 | num_elems = len(l) 18 | group_size = int(len(l)/g) 19 | num_big_groups = num_elems - g * group_size 20 | num_small_groups = g - num_big_groups 21 | glist = [] 22 | for i in range(num_small_groups): 23 | glist.append(l[group_size * i : group_size * (i + 1)]) 24 | bi = 
group_size*num_small_groups 25 | group_size += 1 26 | for i in range(num_big_groups): 27 | glist.append(l[bi + group_size * i:bi + group_size * (i + 1)]) 28 | return glist 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument('--num_workers', 34 | help=('number of workers/users;' 35 | 'default: 1;'), 36 | type=int, 37 | default=1) 38 | parser.add_argument('--iid', 39 | help='sample iid;', 40 | action="store_true") 41 | parser.add_argument('--niid', 42 | help="sample niid;", 43 | dest='iid', action='store_false') 44 | parser.add_argument('--s_frac', 45 | help='fraction of all data to sample; default: 0.1;', 46 | type=float, 47 | default=0.01) 48 | parser.add_argument('--tr_frac', 49 | help='fraction in training set; default: 0.8;', 50 | type=float, 51 | default=0.8) 52 | parser.add_argument('--seed', 53 | help='args.seed for random partitioning of test/train data', 54 | type=int, 55 | default=None) 56 | 57 | parser.set_defaults(user=False) 58 | 59 | args = parser.parse_args() 60 | 61 | if __name__ == "__main__": 62 | print('------------------------------') 63 | print('generating training and test sets') 64 | 65 | rng_seed = (args.seed if (args.seed is not None and args.seed >= 0) else int(time.time())) 66 | rng = random.Random(rng_seed) 67 | np.random.seed(rng_seed) 68 | 69 | train_file = os.path.join("train", "train.json") 70 | test_file = os.path.join("test", "test.json") 71 | 72 | data_dir = os.path.join('raw_data', 'all_data.csv') 73 | with open(data_dir, 'rt', encoding='ISO-8859-1') as f: 74 | reader = csv.reader(f) 75 | data = list(reader) 76 | 77 | data = sorted(data, key=lambda x: x[4]) 78 | 79 | if args.iid: 80 | tot_num_samples = len(data) 81 | num_new_samples = int(args.s_frac * tot_num_samples) 82 | 83 | indices = [i for i in range(tot_num_samples)] 84 | new_indices = rng.sample(indices, num_new_samples) 85 | 86 | indices_groups = iid_divide(new_indices, args.num_workers) 87 | 88 | for id_w, worker_indices in enumerate(indices_groups): 89 | curr_num_samples = len(worker_indices) 90 | 91 | num_train_samples = max(1, int(args.tr_frac * curr_num_samples)) 92 | num_test_samples = curr_num_samples - num_train_samples 93 | 94 | train_indices = rng.sample(worker_indices, num_train_samples) 95 | test_indices = list(set(worker_indices) - set(train_indices)) 96 | 97 | local_train_file = os.path.join("train", "{}.json".format(id_w)) 98 | 99 | for (file_, indices) in [(local_train_file, train_indices), 100 | (train_file, train_indices), 101 | (test_file, test_indices)]: 102 | 103 | for sample_idx in indices: 104 | sample = data[sample_idx] 105 | row = dict() 106 | 107 | row['idx'] = sample[1] 108 | row["time"] = sample[2] 109 | row['query'] = sample[3] 110 | row["user"] = sample[4] 111 | row["text"] = sample[5] 112 | row["label"] = 1 if sample[0] == "4" else 0 113 | 114 | with open(file_, "a") as f: 115 | json.dump(row, f) 116 | f.write("\n") 117 | 118 | else: 119 | all_writers = set() 120 | 121 | for i in range(len(data)): 122 | row = data[i] 123 | all_writers.add(row[4]) 124 | 125 | all_writers = list(all_writers) 126 | 127 | data_by_writers = {k: [] for k in all_writers} 128 | 129 | for i in range(len(data)): 130 | row = data[i] 131 | data_by_writers[row[4]].append(row) 132 | 133 | num_writers_by_user = np.random.lognormal(5, 1.5, args.num_workers) + 5 134 | num_writers_by_user *= (len(all_writers) / num_writers_by_user.sum()) 135 | num_samples = np.floor(num_writers_by_user).astype(np.int64) 136 | 137 | writers_by_workers = [] 138 | current_idx = 0 139 | for 
worker_id in range(args.num_workers): 140 | writers_by_workers.append(all_writers[current_idx: current_idx + num_samples[worker_id]]) 141 | current_idx = num_samples[worker_id] 142 | 143 | for id_w, writers in enumerate(writers_by_workers): 144 | all_worker_data = [] 145 | for writer in writers: 146 | all_worker_data += data_by_writers[writer] 147 | 148 | tot_num_samples = len(all_worker_data) 149 | curr_num_samples = int(args.s_frac * tot_num_samples) 150 | 151 | indices = [i for i in range(tot_num_samples)] 152 | worker_indices = rng.sample(indices, curr_num_samples) 153 | 154 | num_train_samples = max(1, int(args.tr_frac * curr_num_samples)) 155 | num_test_samples = curr_num_samples - num_train_samples 156 | 157 | train_indices = rng.sample(worker_indices, num_train_samples) 158 | test_indices = list(set(worker_indices) - set(train_indices)) 159 | 160 | local_train_file = os.path.join("train", "{}.json".format(id_w)) 161 | 162 | for (file_, indices) in [(local_train_file, train_indices), 163 | (train_file, train_indices), 164 | (test_file, test_indices)]: 165 | 166 | for sample_idx in indices: 167 | sample = data[sample_idx] 168 | row = dict() 169 | 170 | row['idx'] = sample[1] 171 | row["time"] = sample[2] 172 | row['query'] = sample[3] 173 | row["user"] = sample[4] 174 | row["text"] = sample[5] 175 | row["label"] = 1 if sample[0] == "4" else 0 176 | 177 | with open(file_, "a") as f: 178 | json.dump(row, f) 179 | f.write("\n") 180 | 181 | 182 | -------------------------------------------------------------------------------- /data/femnist/split_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import argparse 4 | import random 5 | import time 6 | import numpy as np 7 | 8 | 9 | def relabel_class(c): 10 | """ 11 | maps hexadecimal class value (string) to a decimal number 12 | returns: 13 | - 0 through 9 for classes representing respective numbers 14 | - 10 through 35 for classes representing respective uppercase letters 15 | - 36 through 61 for classes representing respective lowercase letters 16 | """ 17 | if c.isdigit() and int(c) < 40: 18 | return int(c) - 30 19 | elif int(c, 16) <= 90: # uppercase 20 | return int(c, 16) - 55 21 | else: 22 | return int(c, 16) - 61 23 | 24 | 25 | def iid_divide(l, g): 26 | """ 27 | divide list l among g groups 28 | each group has either int(len(l)/g) or int(len(l)/g)+1 elements 29 | returns a list of groups 30 | 31 | """ 32 | num_elems = len(l) 33 | group_size = int(len(l)/g) 34 | num_big_groups = num_elems - g * group_size 35 | num_small_groups = g - num_big_groups 36 | glist = [] 37 | for i in range(num_small_groups): 38 | glist.append(l[group_size * i : group_size * (i + 1)]) 39 | bi = group_size*num_small_groups 40 | group_size += 1 41 | for i in range(num_big_groups): 42 | glist.append(l[bi + group_size * i:bi + group_size * (i + 1)]) 43 | return glist 44 | 45 | 46 | parser = argparse.ArgumentParser() 47 | 48 | parser.add_argument('--num_workers', 49 | help=('number of workers/users;' 50 | 'default: 1;'), 51 | type=int, 52 | default=1) 53 | parser.add_argument('--iid', 54 | help='sample iid;', 55 | action="store_true") 56 | parser.add_argument('--niid', 57 | help="sample niid;", 58 | dest='iid', action='store_false') 59 | parser.add_argument('--s_frac', 60 | help='fraction of all data to sample; default: 0.1;', 61 | type=float, 62 | default=0.01) 63 | parser.add_argument('--tr_frac', 64 | help='fraction in training set; default: 0.8;', 65 | type=float, 66 | default=0.8) 67 
| parser.add_argument('--seed', 68 | help='args.seed for random partitioning of test/train data', 69 | type=int, 70 | default=None) 71 | 72 | parser.set_defaults(user=False) 73 | 74 | args = parser.parse_args() 75 | 76 | if __name__ == "__main__": 77 | print('------------------------------') 78 | print('generating training and test sets') 79 | 80 | rng_seed = (args.seed if (args.seed is not None and args.seed >= 0) else int(time.time())) 81 | rng = random.Random(rng_seed) 82 | np.random.seed(rng_seed) 83 | 84 | train_file = os.path.join("train", "train.json") 85 | test_file = os.path.join("test", "test.json") 86 | 87 | data_dir = os.path.join('intermediate', 'images_by_writer.pkl') 88 | with open(data_dir, 'rb') as f: 89 | all_data = pickle.load(f) 90 | 91 | if args.iid: 92 | combined_data = [] 93 | 94 | for (writer_id, l) in all_data: 95 | combined_data += l 96 | 97 | for ii, (path, c) in enumerate(combined_data): 98 | combined_data[ii] = (path, relabel_class(c)) 99 | 100 | tot_num_samples = len(combined_data) 101 | num_new_samples = int(args.s_frac * tot_num_samples) 102 | 103 | indices = [i for i in range(tot_num_samples)] 104 | new_indices = rng.sample(indices, num_new_samples) 105 | 106 | indices_groups = iid_divide(new_indices, args.num_workers) 107 | 108 | train_data = [] 109 | test_data = [] 110 | 111 | for id_w, worker_indices in enumerate(indices_groups): 112 | curr_num_samples = len(worker_indices) 113 | 114 | num_train_samples = max(1, int(args.tr_frac * curr_num_samples)) 115 | num_test_samples = curr_num_samples - num_train_samples 116 | 117 | train_indices = rng.sample(worker_indices, num_train_samples) 118 | test_indices = list(set(indices) - set(train_indices)) 119 | 120 | worker_data = [combined_data[ii] for ii in train_indices] 121 | train_data += [combined_data[ii] for ii in train_indices] 122 | test_data += [combined_data[ii] for ii in test_indices] 123 | 124 | with open('train/{}.pkl'.format(id_w), 'wb') as f: 125 | pickle.dump(worker_data, f, pickle.HIGHEST_PROTOCOL) 126 | 127 | with open('train/train.pkl', 'wb') as f: 128 | pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL) 129 | 130 | with open('test/test.pkl', 'wb') as f: 131 | pickle.dump(test_data, f, pickle.HIGHEST_PROTOCOL) 132 | 133 | else: 134 | writer_ids = [i for i in range(len(all_data))] 135 | rng.shuffle(writer_ids) 136 | 137 | num_writers_by_user = np.random.lognormal(5, 1.5, args.num_workers) + 5 138 | num_writers_by_user *= (len(writer_ids) / num_writers_by_user.sum()) 139 | num_samples = np.floor(num_writers_by_user).astype(np.int64) 140 | 141 | writers_by_workers = [] 142 | current_idx = 0 143 | for worker_id in range(args.num_workers): 144 | writers_by_workers.append(writer_ids[current_idx: current_idx + num_samples[worker_id]]) 145 | current_idx = num_samples[worker_id] 146 | 147 | train_data = [] 148 | test_data = [] 149 | 150 | for id_w, writer_indices in enumerate(writers_by_workers): 151 | all_worker_data = [] 152 | for writer_id in writer_indices: 153 | all_worker_data += all_data[writer_id][1] 154 | 155 | for ii, (path, c) in enumerate(all_worker_data): 156 | all_worker_data[ii] = (path, relabel_class(c)) 157 | 158 | tot_num_samples = len(all_worker_data) 159 | curr_num_samples = int(args.s_frac * tot_num_samples) 160 | 161 | indices = [i for i in range(tot_num_samples)] 162 | worker_indices = rng.sample(indices, curr_num_samples) 163 | 164 | num_train_samples = max(1, int(args.tr_frac * curr_num_samples)) 165 | num_test_samples = curr_num_samples - num_train_samples 166 | 167 | 
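# per-worker train/test split (illustrative annotation): draw num_train_samples of this
# worker's sampled indices at random for training; the remaining local samples form its test set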
train_indices = rng.sample(worker_indices, num_train_samples) 168 | test_indices = list(set(worker_indices) - set(train_indices)) 169 | 170 | worker_data = [all_worker_data[ii] for ii in train_indices] 171 | train_data += [all_worker_data[ii] for ii in train_indices] 172 | test_data += [all_worker_data[ii] for ii in test_indices] 173 | 174 | with open('train/{}.pkl'.format(id_w), 'wb') as f: 175 | pickle.dump(worker_data, f, pickle.HIGHEST_PROTOCOL) 176 | 177 | with open('train/train.pkl', 'wb') as f: 178 | pickle.dump(train_data, f, pickle.HIGHEST_PROTOCOL) 179 | 180 | with open('test/test.pkl', 'wb') as f: 181 | pickle.dump(test_data, f, pickle.HIGHEST_PROTOCOL) 182 | -------------------------------------------------------------------------------- /graph_utils/generate_networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import networkx as nx 5 | 6 | from utils.evaluate_throughput import evaluate_cycle_time 7 | from utils.utils import get_connectivity_graph, add_upload_download_delays, get_delta_mbst_overlay,\ 8 | get_star_overlay, get_ring_overlay, get_matcha_cycle_time 9 | 10 | # Model size in bit 11 | MODEL_SIZE_DICT = {"synthetic": 4354, 12 | "shakespeare": 3385747, 13 | "femnist": 4843243, 14 | "sent140": 19269416, 15 | "inaturalist": 44961717} 16 | 17 | # Model computation time in ms 18 | COMPUTATION_TIME_DICT = {"synthetic": 1.5, 19 | "shakespeare": 389.6, 20 | "femnist": 4.6, 21 | "sent140": 9.8, 22 | "inaturalist": 25.4} 23 | 24 | 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument('name', 28 | help='name of the network to use;') 29 | parser.add_argument("--experiment", 30 | type=str, 31 | help="name of the experiment that will be run on the network;" 32 | "possible are femnist, inaturalist, synthetic, shakespeare, sent140;" 33 | "if not precised --model_size will be used as model size;", 34 | default=None) 35 | parser.add_argument('--model_size', 36 | type=float, 37 | help="size of the model that will be transmitted on the network in bit;" 38 | "ignored if --experiment is precised;", 39 | default=1e8) 40 | parser.add_argument("--local_steps", 41 | type=int, 42 | help="number of local steps, used to get computation time", 43 | default=1) 44 | parser.add_argument("--upload_capacity", 45 | type=float, 46 | help="upload capacity at edge in bit/s; default=1e32", 47 | default=1e32) 48 | parser.add_argument("--download_capacity", 49 | type=float, 50 | help="download capacity at edge in bit/s; default=1e32", 51 | default=1e32) 52 | parser.add_argument("--communication_budget", 53 | type=float, 54 | help="communication budget to use with matcha; will be ignored if name is not matcha", 55 | default=0.5) 56 | parser.add_argument("--default_capacity", 57 | type=float, 58 | help="default capacity (in bit/s) to use on links with unknown capacity", 59 | default=1e9) 60 | parser.add_argument('--centrality', 61 | help="centrality type; default: load;", 62 | default="load") 63 | 64 | parser.set_defaults(user=False) 65 | 66 | args = parser.parse_args() 67 | args.default_capacity *= 1e-3 68 | 69 | if __name__ == "__main__": 70 | if args.experiment is not None: 71 | args.model_size = MODEL_SIZE_DICT[args.experiment] 72 | args.computation_time = args.local_steps * COMPUTATION_TIME_DICT[args.experiment] 73 | 74 | upload_delay = (args.model_size / args.upload_capacity) * 1e3 75 | download_delay = (args.model_size / args.download_capacity) * 1e3 76 | 77 | result_dir = "./results/{}".format(args.name) 78 
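# outputs for this underlay are collected under ./results/<name>/:
# cycle_time.txt (one line per architecture) plus one GML file per generated
# overlay (original, mst, mct_congest, centralized, ring)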
| if not os.path.exists(result_dir): 79 | os.makedirs(result_dir) 80 | 81 | results_txt_path = os.path.join(result_dir, "cycle_time.txt") 82 | results_file = open(results_txt_path, "w") 83 | 84 | path_to_graph = "./data/{}.gml".format(args.name) 85 | 86 | underlay = nx.read_gml(path_to_graph) 87 | 88 | print("Number of Workers: {}".format(underlay.number_of_nodes())) 89 | print("Number of links: {}".format(underlay.number_of_edges())) 90 | 91 | nx.set_node_attributes(underlay, upload_delay, 'uploadDelay') 92 | nx.set_node_attributes(underlay, download_delay, "downloadDelay") 93 | 94 | nx.write_gml(underlay.copy(), os.path.join(result_dir, "original.gml")) 95 | 96 | connectivity_graph = get_connectivity_graph(underlay, args.default_capacity) 97 | 98 | # MST 99 | for u, v, data in connectivity_graph.edges(data=True): 100 | weight = args.computation_time + data["latency"] + args.model_size / data["availableBandwidth"] 101 | connectivity_graph.add_edge(u, v, weight=weight) 102 | 103 | MST = nx.minimum_spanning_tree(connectivity_graph.copy(), weight="weight") 104 | 105 | MST = MST.to_directed() 106 | 107 | cycle_time, _, _ = evaluate_cycle_time(add_upload_download_delays(MST, args.computation_time, args.model_size)) 108 | 109 | nx.write_gml(MST, os.path.join(result_dir, "mst.gml")) 110 | print("Cycle time for MST architecture: {0:.1f}".format(cycle_time)) 111 | results_file.write("MST {}\n".format(cycle_time)) 112 | 113 | # delta-MBST 114 | delta_mbst, best_cycle_time, best_delta = \ 115 | get_delta_mbst_overlay(connectivity_graph.copy(), args.computation_time, args.model_size) 116 | 117 | delta_mbst = add_upload_download_delays(delta_mbst, args.computation_time, args.model_size) 118 | cycle_time, _, _ = evaluate_cycle_time(delta_mbst) 119 | 120 | nx.write_gml(delta_mbst, os.path.join(result_dir, "mct_congest.gml")) 121 | print("Cycle time for delta-MBST architecture: {0:.1f} ms".format(cycle_time)) 122 | results_file.write("MCT_congest {}\n".format(cycle_time)) 123 | 124 | # Star 125 | star = get_star_overlay(connectivity_graph.copy(), args.centrality) 126 | 127 | cycle_time, _, _ = evaluate_cycle_time(add_upload_download_delays(star, args.computation_time, args.model_size)) 128 | 129 | cycle_time = (cycle_time - args.computation_time) * 2 + args.computation_time 130 | 131 | nx.write_gml(star, os.path.join(result_dir, "centralized.gml")) 132 | print("Cycle time for STAR architecture: {0:.1f} ms".format(cycle_time)) 133 | results_file.write("Server {}\n".format(cycle_time)) 134 | 135 | # Ring 136 | ring = get_ring_overlay(connectivity_graph.copy(), args.computation_time, args.model_size) 137 | 138 | cycle_time, _, _ = evaluate_cycle_time(add_upload_download_delays(ring, args.computation_time, args.model_size)) 139 | 140 | nx.write_gml(ring, os.path.join(result_dir, "ring.gml")) 141 | print("Cycle time for RING architecture: {0:.1f} ms".format(cycle_time)) 142 | results_file.write("Ring graph {}\n".format(cycle_time)) 143 | 144 | # MATCHA 145 | cycle_time = get_matcha_cycle_time(underlay.copy(), connectivity_graph.copy(), 146 | args.computation_time, args.model_size, args.communication_budget) 147 | 148 | print("Cycle time for MATCHA architecture: {0:.1f} ms".format(cycle_time)) 149 | results_file.write("MATCHA {}\n".format(cycle_time)) 150 | 151 | # MATCHA+ 152 | cycle_time = get_matcha_cycle_time(connectivity_graph.copy(), connectivity_graph.copy(), 153 | args.computation_time, args.model_size, args.communication_budget) 154 | 155 | print("Cycle time for MATCHA+ architecture: {0:.1f} 
ms".format(cycle_time)) 156 | results_file.write("MATCHA {}\n".format(cycle_time)) 157 | -------------------------------------------------------------------------------- /make_figure2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | from utils.utils import args_to_string, loggs_to_json 8 | from utils.args import parse_args 9 | 10 | 11 | cycle_time_dict = {"gaia": {"ring": 522.8, 12 | "centralized": 9293.3, 13 | "mst": 1442.0, 14 | "mct_congest": 1018.8, 15 | "matcha": 2612.8}, 16 | "amazon_us": {"ring": 485.9, 17 | "centralized": 18983.2, 18 | "mst": 1385.7, 19 | "mct_congest": 952.8, 20 | "matcha": 5036.7}, 21 | "geantdistance": {"ring": 491.1, 22 | "centralized": 35188.4, 23 | "mst": 2753.8, 24 | "mct_congest": 984.7, 25 | "matcha": 2658.9}, 26 | "exodus": {"ring": 488.1, 27 | "centralized": 70350.7, 28 | "mst": 3176.9, 29 | "mct_congest": 1023.5, 30 | "matcha": 2874.3}, 31 | "ebone": {"ring": 482.2, 32 | "centralized": 77462.5, 33 | "mst": 4123.4, 34 | "mct_congest": 984.8, 35 | "matcha": 2660.3}} 36 | 37 | EXTENSIONS = {"synthetic": ".json", 38 | "sent140": ".json", 39 | "femnist": ".pkl", 40 | "shakespeare": ".txt", 41 | "inaturalist": ".pkl"} 42 | 43 | # Model size in bit 44 | MODEL_SIZE_DICT = {"synthetic": 4354, 45 | "shakespeare": 3385747, 46 | "femnist": 4843243, 47 | "sent140": 19269416, 48 | "inaturalist": 44961717} 49 | 50 | # Model computation time in ms 51 | COMPUTATION_TIME_DICT = {"synthetic": 1.5, 52 | "shakespeare": 389.6, 53 | "femnist": 4.6, 54 | "sent140": 9.8, 55 | "inaturalist": 25.4} 56 | 57 | # Tags list 58 | TAGS = ["Train/Loss", "Train/Acc", "Test/Loss", "Test/Acc", "Consensus"] 59 | 60 | labels_dict = {"matcha": "MATCHA$^{+}$", 61 | "mst": "MST", 62 | "centralized": "STAR", 63 | 'mct_congest': "$\delta$-MBST", 64 | "ring": "RING"} 65 | 66 | tag_dict = {"Train/Loss": "Train loss", 67 | "Train/Acc": "Train acc", 68 | "Test/Loss": "Test loss", 69 | "Test/Acc": "Test acc", 70 | "Consensus": "Consensus"} 71 | 72 | path_dict = {"Train/Loss": "Train_loss", 73 | "Train/Acc": "Train_acc", 74 | "Test/Loss": "Test_loss", 75 | "Test/Acc": "Test_acc", 76 | "Consensus": "Consensus"} 77 | 78 | trsh_dict = {"gaia": 0.65, 79 | "amazon_us": 0.55, 80 | "geantdistance": 0.55, 81 | "exodus": 0.5, 82 | "ebone": 0.5} 83 | 84 | lr_dict = {"gaia": "1e-3", 85 | "amazon_us": "1e-3", 86 | "geantdistance": "1e-3", 87 | "exodus": "1e-1", 88 | "ebone": "1e-1"} 89 | 90 | bz_dict = {"shakespeare": 512, 91 | "femnist": 128, 92 | "sent140": 512, 93 | "inaturalist": 16} 94 | 95 | 96 | def make_plots(args, mode=0): 97 | os.makedirs(os.path.join("results", "plots", args.experiment), exist_ok=True) 98 | 99 | loggs_dir_path = os.path.join("loggs", args_to_string(args)) 100 | path_to_json = os.path.join("results", "json", "{}.json".format(os.path.split(loggs_dir_path)[1])) 101 | with open(path_to_json, "r") as f: 102 | data = json.load(f) 103 | 104 | # fig, axs = plt.subplots(2, 5, figsize=(20, 8)) 105 | x_lim = np.inf 106 | for idx, tag in enumerate(TAGS): 107 | fig = plt.figure(figsize=(12, 10)) 108 | for architecture in ["centralized", "matcha", "mst", "mct_congest", "ring"]: 109 | try: 110 | values = data[tag][architecture] 111 | rounds = data["Round"][architecture] 112 | except: 113 | continue 114 | 115 | if mode == 0: 116 | min_len = min(len(values), len(rounds)) 117 | 118 | if rounds[-1] * cycle_time_dict[network_name][architecture] < x_lim: 119 | x_lim = 
rounds[-1] * cycle_time_dict[network_name][architecture] 120 | 121 | plt.plot(cycle_time_dict[network_name][architecture] * np.array(rounds) / 1000, 122 | values[:min_len], label=labels_dict[architecture], 123 | linewidth=5.0) 124 | plt.grid(True, linewidth=2) 125 | plt.xlim(0, x_lim / 1000) 126 | plt.ylabel("{}".format(tag_dict[tag]), fontsize=50) 127 | plt.xlabel("time (s)", fontsize=50) 128 | plt.tick_params(axis='both', labelsize=40) 129 | plt.tick_params(axis='x') 130 | plt.legend(fontsize=35) 131 | 132 | else: 133 | min_len = min(len(values), len(rounds)) 134 | 135 | if rounds[:min_len][-1] < x_lim: 136 | x_lim = rounds[:min_len][-1] 137 | 138 | plt.plot(rounds[:min_len], 139 | values[:min_len], label=labels_dict[architecture], 140 | linewidth=5.0) 141 | plt.ylabel("{}".format(tag_dict[tag]), fontsize=50) 142 | plt.xlabel("Rounds", fontsize=50) 143 | plt.tick_params(axis='both', labelsize=40) 144 | plt.legend(fontsize=35) 145 | plt.grid(True, linewidth=2) 146 | plt.xlim(0, x_lim) 147 | 148 | if mode == 0: 149 | fig_path = os.path.join("results", "plots", args.experiment, 150 | "{}_{}_vs_time.png".format(args.network_name, path_dict[tag])) 151 | plt.savefig(fig_path, bbox_inches='tight') 152 | else: 153 | fig_path = os.path.join("results", "plots", args.experiment, 154 | "{}_{}_vs_iteration.png".format(args.network_name, path_dict[tag])) 155 | plt.savefig(fig_path, bbox_inches='tight') 156 | 157 | 158 | if __name__ == "__main__": 159 | network_name = "amazon_us" 160 | 161 | for experiment in [ "inaturalist", "shakespeare", "sent140", "femnist"]: 162 | args = parse_args([experiment, 163 | "--network", network_name, 164 | "--bz", str(bz_dict[experiment]), 165 | "--lr", str(lr_dict[network_name]), 166 | "--decay", "sqrt", 167 | "--local_steps", "1"]) 168 | 169 | args_string = args_to_string(args) 170 | 171 | loggs_dir = os.path.join("loggs", args_to_string(args)) 172 | loggs_to_json(loggs_dir) 173 | 174 | print("{}:".format(experiment)) 175 | 176 | make_plots(args, mode=0) 177 | make_plots(args, mode=1) 178 | 179 | print("#" * 10) 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /communication_module/manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | 4 | import torch 5 | import torch.distributed as dist 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from graph_utils.utils.matcha import RandomTopologyGenerator 9 | from utils.utils import get_network, get_iterator, get_model, args_to_string 10 | 11 | 12 | EXTENSIONS = {"synthetic": ".json", "sent140": ".json", "femnist": ".pkl", "shakespeare": ".txt"} 13 | 14 | 15 | class Manager(ABC): 16 | def __init__(self, args): 17 | self.device = args.device 18 | self.batch_size = args.bz 19 | self.network = get_network(args.network_name, args.architecture) 20 | self.world_size = self.network.number_of_nodes() + 1 # we add node representing the network manager 21 | self.log_freq = args.log_freq 22 | 23 | # create logger 24 | logger_path = os.path.join("loggs", args_to_string(args), args.architecture) 25 | self.logger = SummaryWriter(logger_path) 26 | 27 | self.round_idx = 0 # index of the current communication round 28 | 29 | self.train_dir = os.path.join("data", args.experiment, "train") 30 | self.test_dir = os.path.join("data", args.experiment, "test") 31 | 32 | self.train_path = os.path.join(self.train_dir, "train" + EXTENSIONS[args.experiment]) 33 | self.test_path = 
os.path.join(self.test_dir, "test" + EXTENSIONS[args.experiment]) 34 | 35 | self.train_iterator = get_iterator(args.experiment, self.train_path, self.device, self.batch_size) 36 | self.test_iterator = get_iterator(args.experiment, self.test_path, self.device, self.batch_size) 37 | 38 | self.gather_list = [get_model(args.experiment, self.device, self.train_iterator) 39 | for _ in range(self.world_size)] 40 | 41 | self.scatter_list = [get_model(args.experiment, self.device, self.train_iterator) 42 | for _ in range(self.world_size)] 43 | 44 | # print initial logs 45 | self.write_logs() 46 | 47 | def communicate(self): 48 | for ii, param in enumerate(self.gather_list[-1].net.parameters()): 49 | param_list = [list(self.gather_list[idx].net.parameters())[ii].data 50 | for idx in range(self.world_size)] 51 | 52 | dist.gather(tensor=param.data, dst=self.world_size - 1, gather_list=param_list) 53 | 54 | self.mix() 55 | 56 | if (self.round_idx - 1) % self.log_freq == 0: 57 | self.write_logs() 58 | 59 | for ii, param in enumerate(self.scatter_list[-1].net.parameters()): 60 | param_list = [list(self.scatter_list[idx].net.parameters())[ii].data 61 | for idx in range(self.world_size)] 62 | 63 | dist.scatter(tensor=param.data, src=self.world_size - 1, scatter_list=param_list) 64 | 65 | @abstractmethod 66 | def mix(self): 67 | pass 68 | 69 | def write_logs(self): 70 | """ 71 | write train/test loss, train/tet accuracy for average model and local models 72 | and intra-workers parameters variance (consensus) adn save average model 73 | """ 74 | train_loss, train_acc = self.scatter_list[-1].evaluate_iterator(self.train_iterator) 75 | test_loss, test_acc = self.scatter_list[-1].evaluate_iterator(self.train_iterator) 76 | 77 | self.logger.add_scalar("Train/Loss", train_loss, self.round_idx) 78 | self.logger.add_scalar("Train/Acc", train_acc, self.round_idx) 79 | self.logger.add_scalar("Test/Loss", test_loss, self.round_idx) 80 | self.logger.add_scalar("Test/Acc", test_acc, self.round_idx) 81 | 82 | # write parameter variance 83 | average_parameter = self.scatter_list[-1].get_param_tensor() 84 | 85 | param_tensors_by_workers = torch.zeros((average_parameter.shape[0], self.world_size - 1)) 86 | 87 | for ii, model in enumerate(self.scatter_list[:-1]): 88 | param_tensors_by_workers[:, ii] = model.get_param_tensor() - average_parameter 89 | 90 | consensus = (param_tensors_by_workers ** 2).sum() 91 | self.logger.add_scalar("Consensus", consensus, self.round_idx) 92 | 93 | print(f'\t Round: {self.round_idx} |Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%') 94 | 95 | 96 | class Peer2PeerManager(Manager): 97 | def mix(self): 98 | for ii, model in enumerate(self.scatter_list): 99 | if ii == self.world_size - 1: 100 | for param_idx, param in enumerate(model.net.parameters()): 101 | param.data.fill_(0.) 102 | for local_model in self.scatter_list[:-1]: 103 | param.data += (1 / (self.world_size - 1)) * list(local_model.net.parameters())[param_idx] 104 | else: 105 | for param_idx, param in enumerate(model.net.parameters()): 106 | param.data.fill_(0.) 
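# decentralized (gossip) mixing step: worker ii's parameters are rebuilt as the
# weighted combination of the gathered parameters of its overlay neighbours,
#     param_i <- sum_j w_ij * param_j,
# where w_ij is the "weight" attribute of edge (i, j) in the overlay; if the
# overlay carries a self-loop at ii, the worker's own w_ii term is included as well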
107 | for neighbour in self.network.neighbors(ii): 108 | coeff = self.network.get_edge_data(ii, neighbour)["weight"] 109 | param.data += coeff * list(self.gather_list[neighbour].net.parameters())[param_idx] 110 | 111 | self.round_idx += 1 112 | 113 | 114 | class MATCHAManager(Manager): 115 | def __init__(self, args): 116 | super(MATCHAManager, self).__init__(args) 117 | path_to_save_network = os.path.join("loggs", args_to_string(args), "matcha", "colored_network.gml") 118 | path_to_matching_history_file = os.path.join("loggs", args_to_string(args), "matcha", "matching_history.csv") 119 | self.topology_generator = RandomTopologyGenerator(self.network, 120 | args.communication_budget, 121 | network_save_path=path_to_save_network, 122 | path_to_history_file=path_to_matching_history_file) 123 | 124 | def mix(self): 125 | # update topology 126 | self.topology_generator.step() 127 | 128 | for ii, model in enumerate(self.scatter_list): 129 | if ii == self.world_size - 1: 130 | for param_idx, param in enumerate(model.net.parameters()): 131 | param.data.fill_(0.) 132 | for local_model in self.scatter_list[:-1]: 133 | param.data += (1 / (self.world_size - 1)) * list(local_model.net.parameters())[param_idx] 134 | else: 135 | for param_idx, param in enumerate(model.net.parameters()): 136 | param.data.fill_(0.) 137 | for neighbour in self.topology_generator.current_topology.neighbors(ii): 138 | coeff = self.topology_generator.current_topology.get_edge_data(ii, neighbour)["weight"] 139 | param.data += coeff * list(self.gather_list[neighbour].net.parameters())[param_idx] 140 | 141 | self.round_idx += 1 142 | 143 | 144 | class CentralizedManager(Manager): 145 | def mix(self): 146 | for param_idx, param in enumerate(self.scatter_list[-1].net.parameters()): 147 | param.data.fill_(0.) 
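# star (server) aggregation: the last entry of scatter_list acts as the central
# server; its parameters are rebuilt below as the uniform average
# (1 / number of workers) of all gathered worker models, and the following loop
# over scatter_list[:-1] copies this average back into every worker's model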
148 | for local_model in self.gather_list[:-1]: 149 | param.data += (1 / (self.world_size - 1)) * list(local_model.net.parameters())[param_idx] 150 | 151 | for ii, model in enumerate(self.scatter_list[:-1]): 152 | for param_idx, param in enumerate(model.net.parameters()): 153 | param.data = list(self.scatter_list[-1].net.parameters())[param_idx] 154 | 155 | self.round_idx += 1 156 | -------------------------------------------------------------------------------- /graph_utils/data/gaia.gml: -------------------------------------------------------------------------------- 1 | graph [ 2 | node [ 3 | id 0 4 | label "Virginia" 5 | ] 6 | node [ 7 | id 1 8 | label "California" 9 | ] 10 | node [ 11 | id 2 12 | label "Oregon" 13 | ] 14 | node [ 15 | id 3 16 | label "Dublin" 17 | ] 18 | node [ 19 | id 4 20 | label "Frankfurt" 21 | ] 22 | node [ 23 | id 5 24 | label "Tokyo" 25 | ] 26 | node [ 27 | id 6 28 | label "Seoul" 29 | ] 30 | node [ 31 | id 7 32 | label "Singapore" 33 | ] 34 | node [ 35 | id 8 36 | label "Sydney" 37 | ] 38 | node [ 39 | id 9 40 | label "Mumbai" 41 | ] 42 | node [ 43 | id 10 44 | label "Sao Paulo" 45 | ] 46 | edge [ 47 | source 0 48 | target 1 49 | distance 3560.859824767453 50 | capacity 1000000000.0 51 | ] 52 | edge [ 53 | source 0 54 | target 2 55 | distance 3617.1058525455455 56 | capacity 1000000000.0 57 | ] 58 | edge [ 59 | source 0 60 | target 3 61 | distance 5683.746538162422 62 | capacity 100000000.0 63 | ] 64 | edge [ 65 | source 0 66 | target 4 67 | distance 6774.62010172149 68 | capacity 100000000.0 69 | ] 70 | edge [ 71 | source 0 72 | target 5 73 | distance 11032.403521116341 74 | capacity 100000000.0 75 | ] 76 | edge [ 77 | source 0 78 | target 6 79 | distance 11331.528778910633 80 | capacity 100000000.0 81 | ] 82 | edge [ 83 | source 0 84 | target 7 85 | distance 15737.083172377323 86 | capacity 100000000.0 87 | ] 88 | edge [ 89 | source 0 90 | target 8 91 | distance 15550.74835546916 92 | capacity 100000000.0 93 | ] 94 | edge [ 95 | source 0 96 | target 9 97 | distance 13113.161300492078 98 | capacity 100000000.0 99 | ] 100 | edge [ 101 | source 0 102 | target 10 103 | distance 7500.898168816753 104 | capacity 500000000.0 105 | ] 106 | edge [ 107 | source 1 108 | target 2 109 | distance 825.4130940774442 110 | capacity 1500000000.0 111 | ] 112 | edge [ 113 | source 1 114 | target 3 115 | distance 8111.218768362535 116 | capacity 100000000.0 117 | ] 118 | edge [ 119 | source 1 120 | target 4 121 | distance 9096.865584257743 122 | capacity 100000000.0 123 | ] 124 | edge [ 125 | source 1 126 | target 5 127 | distance 8620.547632468602 128 | capacity 100000000.0 129 | ] 130 | edge [ 131 | source 1 132 | target 6 133 | distance 9370.063077937788 134 | capacity 300000000.0 135 | ] 136 | edge [ 137 | source 1 138 | target 7 139 | distance 13930.612571776204 140 | capacity 100000000.0 141 | ] 142 | edge [ 143 | source 1 144 | target 8 145 | distance 12160.544494528913 146 | capacity 300000000.0 147 | ] 148 | edge [ 149 | source 1 150 | target 9 151 | distance 13727.28776621854 152 | capacity 100000000.0 153 | ] 154 | edge [ 155 | source 1 156 | target 10 157 | distance 10079.072989313954 158 | capacity 100000000.0 159 | ] 160 | edge [ 161 | source 2 162 | target 3 163 | distance 7551.970732123231 164 | capacity 300000000.0 165 | ] 166 | edge [ 167 | source 2 168 | target 4 169 | distance 8488.896086335717 170 | capacity 100000000.0 171 | ] 172 | edge [ 173 | source 2 174 | target 5 175 | distance 8028.469388699873 176 | capacity 100000000.0 177 | ] 178 | edge [ 179 | 
source 2 180 | target 6 181 | distance 8700.031091458462 182 | capacity 300000000.0 183 | ] 184 | edge [ 185 | source 2 186 | target 7 187 | distance 13325.366070623815 188 | capacity 100000000.0 189 | ] 190 | edge [ 191 | source 2 192 | target 8 193 | distance 12383.076161347562 194 | capacity 100000000.0 195 | ] 196 | edge [ 197 | source 2 198 | target 9 199 | distance 12902.319229980723 200 | capacity 100000000.0 201 | ] 202 | edge [ 203 | source 2 204 | target 10 205 | distance 10610.577959918295 206 | capacity 100000000.0 207 | ] 208 | edge [ 209 | source 3 210 | target 4 211 | distance 1091.0035398064083 212 | capacity 1500000000.0 213 | ] 214 | edge [ 215 | source 3 216 | target 5 217 | distance 9611.133798789571 218 | capacity 100000000.0 219 | ] 220 | edge [ 221 | source 3 222 | target 6 223 | distance 8974.589549377932 224 | capacity 100000000.0 225 | ] 226 | edge [ 227 | source 3 228 | target 7 229 | distance 11203.776982156216 230 | capacity 100000000.0 231 | ] 232 | edge [ 233 | source 3 234 | target 8 235 | distance 17207.312372624874 236 | capacity 100000000.0 237 | ] 238 | edge [ 239 | source 3 240 | target 9 241 | distance 7620.843594967312 242 | capacity 300000000.0 243 | ] 244 | edge [ 245 | source 3 246 | target 10 247 | distance 9366.555606476215 248 | capacity 100000000.0 249 | ] 250 | edge [ 251 | source 4 252 | target 5 253 | distance 9358.521215366647 254 | capacity 100000000.0 255 | ] 256 | edge [ 257 | source 4 258 | target 6 259 | distance 8571.5714609335 260 | capacity 100000000.0 261 | ] 262 | edge [ 263 | source 4 264 | target 7 265 | distance 10260.83044153216 266 | capacity 100000000.0 267 | ] 268 | edge [ 269 | source 4 270 | target 8 271 | distance 16478.1341044152 272 | capacity 100000000.0 273 | ] 274 | edge [ 275 | source 4 276 | target 9 277 | distance 6578.168093870104 278 | capacity 500000000.0 279 | ] 280 | edge [ 281 | source 4 282 | target 10 283 | distance 9807.409376220698 284 | capacity 100000000.0 285 | ] 286 | edge [ 287 | source 5 288 | target 6 289 | distance 1161.2277477992284 290 | capacity 1000000000.0 291 | ] 292 | edge [ 293 | source 5 294 | target 7 295 | distance 5311.118309037953 296 | capacity 1000000000.0 297 | ] 298 | edge [ 299 | source 5 300 | target 8 301 | distance 7789.739742827469 302 | capacity 300000000.0 303 | ] 304 | edge [ 305 | source 5 306 | target 9 307 | distance 6751.3514540143415 308 | capacity 300000000.0 309 | ] 310 | edge [ 311 | source 5 312 | target 10 313 | distance 18528.65557840507 314 | capacity 100000000.0 315 | ] 316 | edge [ 317 | source 6 318 | target 7 319 | distance 4658.7421490548095 320 | capacity 1000000000.0 321 | ] 322 | edge [ 323 | source 6 324 | target 8 325 | distance 8296.033168577038 326 | capacity 100000000.0 327 | ] 328 | edge [ 329 | source 6 330 | target 9 331 | distance 5613.893433078432 332 | capacity 500000000.0 333 | ] 334 | edge [ 335 | source 6 336 | target 10 337 | distance 18337.930813275976 338 | capacity 100000000.0 339 | ] 340 | edge [ 341 | source 7 342 | target 8 343 | distance 6301.111688839916 344 | capacity 300000000.0 345 | ] 346 | edge [ 347 | source 7 348 | target 9 349 | distance 3899.1833741194805 350 | capacity 500000000.0 351 | ] 352 | edge [ 353 | source 7 354 | target 10 355 | distance 16000.059238393498 356 | capacity 100000000.0 357 | ] 358 | edge [ 359 | source 8 360 | target 9 361 | distance 10144.778814121693 362 | capacity 100000000.0 363 | ] 364 | edge [ 365 | source 8 366 | target 10 367 | distance 13377.864263189238 368 | capacity 100000000.0 369 | 
] 370 | edge [ 371 | source 9 372 | target 10 373 | distance 13772.602629233716 374 | capacity 100000000.0 375 | ] 376 | ] 377 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Throughput-Optimal Topology Design for Cross-Silo Federated Learning 2 | 3 | This repository is the official implementation of [Throughput-Optimal Topology 4 | Design for Cross-Silo Federated Learning](https://arxiv.org/abs/2010.12229). 5 | 6 | Federated learning usually employs a master-slave architecture where an 7 | orchestrator iteratively aggregates model updates from remote clients 8 | and pushes them back a refined model. This approach may be inefficient 9 | in cross-silo settings, as close-by data silos with high-speed access 10 | links may exchange information faster than with the orchestrator, and 11 | the orchestrator may become a communication bottleneck. In this paper we 12 | define the problem of topology design for cross-silo federated learning 13 | using the theory of max-plus linear systems to compute the system 14 | throughput---number of communication rounds per time unit. We also 15 | propose practical algorithms that, under the knowledge of measurable 16 | network characteristics, find a topology with the largest throughput or 17 | with provable throughput guarantees. In realistic Internet networks with 18 | 10 Gbps access links for silos, our algorithms speed up training by a 19 | factor 9 and 1.5 in comparison to the master-slave architecture and to 20 | state-of-the-art MATCHA, respectively. Speedups are even larger with 21 | slower access links. 22 | 23 | ## Requirements 24 | 25 | To install requirements: 26 | 27 | ```setup 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | ## Datasets 32 | 33 | We provide four datasets that are used in the paper under corresponding 34 | folders. For all datasets, see the README files in separate 35 | data/$dataset folders for instructions on preprocessing and/or sampling 36 | data. 37 | 38 | ## Networks and Topologies 39 | 40 | A main part of the paper is related to topology design. In 41 | `graph_utils/` details on generating different topologies for each 42 | network are provided. Scripts to compute the cycle time of each topology 43 | are also provided in `graph_utils/` 44 | 45 | ## Training 46 | 47 | Run on one dataset, with a specific topology choice on on network. 48 | Specify the name of the dataset (experiment), the name of the network 49 | and the used architecture, and configure all other hyper-parameters (see 50 | all hyper-parameters values in the appendix of the paper) 51 | 52 | ```train 53 | python3 main.py experiment ----network_name \ 54 | --architecture=original (--parallel) (--fit_by_epoch) \ 55 | --n_rounds=1 --bz=1 56 | --local_steps=1 --log_freq=1 \ 57 | --device="cpu" --lr=1e-3\ 58 | --optimizer='adam' --decay="constant" 59 | ``` 60 | 61 | And the test and training accuracy and loss will be saved in the log files. 
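Concretely, the logs are TensorBoard summaries written under `loggs/` (one sub-directory per run configuration and architecture), so the training curves can also be browsed interactively, for example with:

```
tensorboard --logdir loggs
```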
62 | 63 | ## Evaluation 64 | 65 | ### iNaturalist Speed-ups 66 | To evaluate the speed-ups obtained when training iNaturalist on the proposed topology architectures (generate Table 3) fora given network, run 67 | 68 | ```eval 69 | python3 main.py inaturalist --network_name gaia --architecture $ARCHITECTURE --n_rounds 5600 --bz 16 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 70 | python3 main.py inaturalist --network_name amazon_us --architecture $ARCHITECTURE --n_rounds 1600 --bz 16 --device cuda --log_freq 40 --local_steps 1 --lr 0.001 --decay sqrt 71 | python3 main.py inaturalist --network_name geantdistance --architecture $ARCHITECTURE --n_rounds 4000 --bz 16 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 72 | python3 main.py inaturalist --network_name exodus --architecture $ARCHITECTURE --n_rounds 4800 --bz 16 --device cuda --log_freq 100 --local_steps 1 --lr 0.1 --decay sqrt --optimizer sgd 73 | python3 main.py inaturalist --network_name ebone --architecture $ARCHITECTURE --n_rounds 6000 --bz 16 --device cuda --log_freq 100 --local_steps 1 --lr 0.1 --decay sqrt --optimizer sgd 74 | ``` 75 | 76 | And the test and training accuracy and loss for the corresponding experiment will be saved in the log files. 77 | 78 | Do this operation for all architectures ($ARCHITECTURE=ring, centralized, matcha, exodus, ebone). 79 | Remind that for every network, a new generation of dataset (data/$dataset folders) is required to distribute data into silos. 80 | 81 | Then run 82 | 83 | ```eval 84 | python3 make_table3.py 85 | ``` 86 | 87 | To generate the values from Table 3. 88 | 89 | ### Effect of the topology on the convergence 90 | 91 | To evaluate the influence of topology on the training evolution for the different datasets when trained on AWS-NA network, run 92 | 93 | ```eval 94 | python main.py inaturalist --network_name amazon_us --architecture ring --n_rounds 1600 --bz 16 --device cuda --log_freq 40 --local_steps 1 --lr 0.001 --decay sqrt 95 | python main.py inaturalist --network_name amazon_us --architecture centralized --n_rounds 1600 --bz 16 --device cuda --log_freq 40 --local_steps 1 --lr 0.001 --decay sqrt 96 | python main.py inaturalist --network_name amazon_us --architecture matcha --n_rounds 1600 --bz 16 --device cuda --log_freq 40 --local_steps 1 --lr 0.001 --decay sqrt 97 | python main.py inaturalist --network_name amazon_us --architecture mst --n_rounds 1600 --bz 16 --device cuda --log_freq 40 --local_steps 1 --lr 0.001 --decay sqrt 98 | python main.py inaturalist --network_name amazon_us --architecture mct_congest --n_rounds 1600 --bz 16 --device cuda --log_freq 40 --local_steps 1 --lr 0.001 --decay sqrt 99 | 100 | python main.py femnist --network_name amazon_us --architecture ring --n_rounds 6400 --bz 128 --device cuda --log_freq 80 --local_steps 1 --lr 0.001 --decay sqrt 101 | python main.py femnist --network_name amazon_us --architecture centralized --n_rounds 6400 --bz 128 --device cuda --log_freq 80 --local_steps 1 --lr 0.001 --decay sqrt 102 | python main.py femnist --network_name amazon_us --architecture matcha --n_rounds 6400 --bz 128 --device cuda --log_freq 80 --local_steps 1 --lr 0.001 --decay sqrt 103 | python main.py femnist --network_name amazon_us --architecture mst --n_rounds 6400 --bz 128 --device cuda --log_freq 80 --local_steps 1 --lr 0.001 --decay sqrt 104 | python main.py femnist --network_name amazon_us --architecture mct_congest --n_rounds 6400 --bz 128 --device cuda --log_freq 80 --local_steps 1 --lr 0.001 --decay 
sqrt 105 | 106 | python main.py sent140 --network_name amazon_us --architecture ring --n_rounds 20000 --bz 512 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 107 | python main.py sent140 --network_name amazon_us --architecture centralized --n_rounds 20000 --bz 512 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 108 | python main.py sent140 --network_name amazon_us --architecture matcha --n_rounds 20000 --bz 512 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 109 | python main.py sent140 --network_name amazon_us --architecture mst --n_rounds 20000 --bz 512 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 110 | python main.py sent140 --network_name amazon_us --architecture mct_congest --n_rounds 20000 --bz 512 --device cuda --log_freq 100 --local_steps 1 --lr 0.001 --decay sqrt 111 | 112 | python main.py shakespeare --network_name amazon_us --architecture ring --n_rounds 1500 --bz 512 --decay sqrt --lr 1e-3 --device cuda --local_steps 1 --log_freq 30 113 | python main.py shakespeare --network_name amazon_us --architecture centralized --n_rounds 1500 --bz 512 --decay sqrt --lr 1e-3 --device cuda --local_steps 1 --log_freq 30 114 | python main.py shakespeare --network_name amazon_us --architecture matcha --n_rounds 1500 --bz 512 --decay sqrt --lr 1e-3 --device cuda --local_steps 1 --log_freq 30 115 | python main.py shakespeare --network_name amazon_us --architecture mst --n_rounds 1500 --bz 512 --decay sqrt --lr 1e-3 --device cuda --local_steps 1 --log_freq 30 116 | python main.py shakespeare --network_name amazon_us --architecture mct_congest --n_rounds 1500 --bz 512 --decay sqrt --lr 1e-3 --device cuda --local_steps 1 --log_freq 30 117 | ``` 118 | 119 | to generate the log files for each experiment. Tne run 120 | 121 | ```eval 122 | python3 make_figure2.py 123 | ``` 124 | 125 | to generate Figure 2. (Figures will be found in `results/plots`) 126 | 127 | ## Results 128 | 129 | ### iNaturalist Speed-ups 130 | Our topology design achieves the following speed-ups when training 131 | iNaturalist dataset over different networks: 132 | 133 | 134 | |Network Name | Silos | Links | Ring vs Star speed-up | Ring vs MATCHA speed-up| 135 | | ------------------ | ------|-------|---------------- | -------------- | 136 | | Gaia | 11 | 55 |2.65 | 1.54 | 137 | | AWS NA | 22 | 321 |3.41 |1.47| 138 | | Géant | 40 | 61 |4.85 |0.81| 139 | | Exodus | 79 | 147 |8.78 |1.37| 140 | | Ebone | 87 | 161 |8.83 |1.29| 141 | 142 | ### Effect of the topology on the convergence 143 | 144 | Effect of overlays on the convergence w.r.t. communication rounds (top row) 145 | and wall-clock time(bottom row) when training four different datasets on 146 | AWS North America underlay.1Gbps core links capacities, 100Mbps access 147 | links capacities,s= 1. 148 | 149 | ![](https://user-images.githubusercontent.com/42912620/84382812-7e215780-abeb-11ea-94f5-e08e506ace89.PNG) 150 | -------------------------------------------------------------------------------- /data/shakespeare/preprocess_shakespeare.py: -------------------------------------------------------------------------------- 1 | """Preprocesses the Shakespeare dataset for federated training. 2 | Copyright 2017 Google Inc. 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | https://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 12 | To run: 13 | python preprocess_shakespeare.py path/to/raw/shakespeare.txt output_directory/ 14 | The raw data can be downloaded from: 15 | http://www.gutenberg.org/cache/epub/100/pg100.txt 16 | (The Plain Text UTF-8 file format, md5sum: 036d0f9cf7296f41165c2e6da1e52a0e) 17 | Note that The Comedy of Errors has a incorrect indentation compared to all the 18 | other plays in the file. The code below reflects that issue. To make the code 19 | cleaner, you could fix the indentation in the raw shakespeare file and remove 20 | the special casing for that play in the code below. 21 | Authors: loeki@google.com, mcmahan@google.com 22 | Disclaimer: This is not an official Google product. 23 | """ 24 | import collections 25 | import json 26 | import os 27 | import random 28 | import re 29 | import sys 30 | RANDOM_SEED = 1234 31 | # Regular expression to capture an actors name, and line continuation 32 | CHARACTER_RE = re.compile(r'^ ([a-zA-Z][a-zA-Z ]*)\. (.*)') 33 | CONT_RE = re.compile(r'^ (.*)') 34 | # The Comedy of Errors has errors in its indentation so we need to use 35 | # different regular expressions. 36 | COE_CHARACTER_RE = re.compile(r'^([a-zA-Z][a-zA-Z ]*)\. (.*)') 37 | COE_CONT_RE = re.compile(r'^(.*)') 38 | 39 | 40 | def _match_character_regex(line, comedy_of_errors=False): 41 | return (COE_CHARACTER_RE.match(line) if comedy_of_errors 42 | else CHARACTER_RE.match(line)) 43 | 44 | 45 | def _match_continuation_regex(line, comedy_of_errors=False): 46 | return ( 47 | COE_CONT_RE.match(line) if comedy_of_errors else CONT_RE.match(line)) 48 | 49 | 50 | def _split_into_plays(shakespeare_full): 51 | """Splits the full data by play.""" 52 | # List of tuples (play_name, dict from character to list of lines) 53 | plays = [] 54 | discarded_lines = [] # Track discarded lines. 55 | slines = shakespeare_full.splitlines(True)[1:] 56 | 57 | # skip contents, the sonnets, and all's well that ends well 58 | author_count = 0 59 | start_i = 0 60 | for i, l in enumerate(slines): 61 | if 'by William Shakespeare' in l: 62 | author_count += 1 63 | if author_count == 2: 64 | start_i = i - 5 65 | break 66 | slines = slines[start_i:] 67 | 68 | current_character = None 69 | comedy_of_errors = False 70 | for i, line in enumerate(slines): 71 | # This marks the end of the plays in the file. 72 | if i > 124195 - start_i: 73 | break 74 | # This is a pretty good heuristic for detecting the start of a new play: 75 | if 'by William Shakespeare' in line: 76 | current_character = None 77 | characters = collections.defaultdict(list) 78 | # The title will be 2, 3, 4, 5, 6, or 7 lines above "by William Shakespeare". 79 | if slines[i - 2].strip(): 80 | title = slines[i - 2] 81 | elif slines[i - 3].strip(): 82 | title = slines[i - 3] 83 | elif slines[i - 4].strip(): 84 | title = slines[i - 4] 85 | elif slines[i - 5].strip(): 86 | title = slines[i - 5] 87 | elif slines[i - 6].strip(): 88 | title = slines[i - 6] 89 | else: 90 | title = slines[i - 7] 91 | title = title.strip() 92 | 93 | assert title, ( 94 | 'Parsing error on line %d. Expecting title 2 or 3 lines above.' 
% 95 | i) 96 | comedy_of_errors = (title == 'THE COMEDY OF ERRORS') 97 | # Degenerate plays are removed at the end of the method. 98 | plays.append((title, characters)) 99 | continue 100 | match = _match_character_regex(line, comedy_of_errors) 101 | if match: 102 | character, snippet = match.group(1), match.group(2) 103 | # Some character names are written with multiple casings, e.g., SIR_Toby 104 | # and SIR_TOBY. To normalize the character names, we uppercase each name. 105 | # Note that this was not done in the original preprocessing and is a 106 | # recent fix. 107 | character = character.upper() 108 | if not (comedy_of_errors and character.startswith('ACT ')): 109 | characters[character].append(snippet) 110 | current_character = character 111 | continue 112 | else: 113 | current_character = None 114 | continue 115 | elif current_character: 116 | match = _match_continuation_regex(line, comedy_of_errors) 117 | if match: 118 | if comedy_of_errors and match.group(1).startswith('<'): 119 | current_character = None 120 | continue 121 | else: 122 | characters[current_character].append(match.group(1)) 123 | continue 124 | # Didn't consume the line. 125 | line = line.strip() 126 | if line and i > 2646: 127 | # Before 2646 are the sonnets, which we expect to discard. 128 | discarded_lines.append('%d:%s' % (i, line)) 129 | # Remove degenerate "plays". 130 | return [play for play in plays if len(play[1]) > 1], discarded_lines 131 | 132 | 133 | def _remove_nonalphanumerics(filename): 134 | return re.sub('\\W+', '_', filename) 135 | 136 | 137 | def play_and_character(play, character): 138 | return _remove_nonalphanumerics((play + '_' + character).replace(' ', '_')) 139 | 140 | 141 | def _get_train_test_by_character(plays, test_fraction=0.2): 142 | """ 143 | Splits character data into train and test sets. 144 | if test_fraction <= 0, returns {} for all_test_examples 145 | plays := list of (play, dict) tuples where play is a string and dict 146 | is a dictionary with character names as keys 147 | """ 148 | skipped_characters = 0 149 | all_train_examples = collections.defaultdict(list) 150 | all_test_examples = collections.defaultdict(list) 151 | 152 | def add_examples(example_dict, example_tuple_list): 153 | for play, character, sound_bite in example_tuple_list: 154 | example_dict[play_and_character( 155 | play, character)].append(sound_bite) 156 | 157 | users_and_plays = {} 158 | for play, characters in plays: 159 | curr_characters = list(characters.keys()) 160 | for c in curr_characters: 161 | users_and_plays[play_and_character(play, c)] = play 162 | for character, sound_bites in characters.items(): 163 | examples = [(play, character, sound_bite) 164 | for sound_bite in sound_bites] 165 | if len(examples) <= 2: 166 | skipped_characters += 1 167 | # Skip characters with fewer than 2 lines since we need at least one 168 | # train and one test line. 
169 | continue 170 | train_examples = examples 171 | if test_fraction > 0: 172 | num_test = max(int(len(examples) * test_fraction), 1) 173 | train_examples = examples[:-num_test] 174 | test_examples = examples[-num_test:] 175 | assert len(test_examples) == num_test 176 | assert len(train_examples) >= len(test_examples) 177 | add_examples(all_test_examples, test_examples) 178 | add_examples(all_train_examples, train_examples) 179 | return users_and_plays, all_train_examples, all_test_examples 180 | 181 | 182 | def _write_data_by_character(examples, output_directory): 183 | """Writes a collection of data files by play & character.""" 184 | if not os.path.exists(output_directory): 185 | os.makedirs(output_directory) 186 | for character_name, sound_bites in examples.items(): 187 | filename = os.path.join(output_directory, character_name + '.txt') 188 | with open(filename, 'w') as output: 189 | for sound_bite in sound_bites: 190 | output.write(sound_bite + '\n') 191 | 192 | 193 | def main(argv): 194 | print('Splitting .txt data between users') 195 | input_filename = argv[0] 196 | with open(input_filename, 'r') as input_file: 197 | shakespeare_full = input_file.read() 198 | plays, discarded_lines = _split_into_plays(shakespeare_full) 199 | print('Discarded %d lines' % len(discarded_lines)) 200 | users_and_plays, all_examples, _ = _get_train_test_by_character(plays, test_fraction=-1.0) 201 | output_directory = argv[1] 202 | with open(os.path.join(output_directory, 'users_and_plays.json'), 'w') as ouf: 203 | json.dump(users_and_plays, ouf) 204 | _write_data_by_character(all_examples, 205 | os.path.join(output_directory, 206 | 'by_play_and_character/')) 207 | 208 | 209 | if __name__ == '__main__': 210 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /graph_utils/utils/matching_decomposition.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | 4 | def matching_decomposition(graph): 5 | """ 6 | Implementing Misra & Gries edge coloring algorithm; 7 | The coloring produces uses at most Delta +1 colors, where Delta is the maximum degree of the graph; 8 | By Vizing's theorem it uses at most one color more than the optimal for all others; 9 | See http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.4452 for details 10 | :param graph: nx.Graph() 11 | :return: - List of matching; each matching is an nx.Graph() representing a sub-graph of "graph" 12 | - list of laplacian matrices, a laplacian matrix for each matching 13 | """ 14 | # Initialize the graph with a greedy coloring of less then degree + 1 colors 15 | nx.set_edge_attributes(graph, None, 'color') 16 | 17 | # edge coloring 18 | for u, v in graph.edges: 19 | if u != v: 20 | graph = color_edge(graph, u, v) 21 | 22 | # matching decomposition 23 | matching_list = get_matching_list_from_graph(graph) 24 | 25 | # compute laplacian matrices 26 | laplacian_matrices = [nx.laplacian_matrix(matching, nodelist=graph.nodes(), weight=None).toarray() 27 | for matching in matching_list] 28 | 29 | return matching_list, laplacian_matrices 30 | 31 | 32 | def get_matching_list_from_graph(graph): 33 | """ 34 | 35 | :param graph: nx.Graph(); each edge should have an attribute "color" 36 | :return: List of matching; each matching is an nx.Graph() representing a sub-graph of "graph" 37 | """ 38 | degree = get_graph_degree(graph) 39 | colors = [i for i in range(degree + 1)] 40 | 41 | matching_list = [nx.Graph() for _ in colors] 42 | 43 | for (u, v, 
data) in graph.edges(data=True): 44 | color = data["color"] 45 | idx = colors.index(color) 46 | matching_list[idx].add_edges_from([(u, v, data)]) 47 | 48 | return matching_list 49 | 50 | 51 | def color_edge(graph, u, v): 52 | """ 53 | color edge (u, v) if uncolored following Misra & Gries procedure; 54 | :param graph: nx.Graph(); each edge should have an attribute "color" 55 | :param u: node in "graph" 56 | :param v: node in "graph" 57 | :return: nx.Graph() where edge (u, v) has an attribute "color", the generated coloring is valid 58 | """ 59 | degree = get_graph_degree(graph) 60 | colors = [i for i in range(degree + 1)] 61 | 62 | if graph.get_edge_data(u, v)["color"] is not None: 63 | return graph 64 | 65 | else: 66 | maximal_fan = get_maximal_fan(graph, u, v) 67 | 68 | for color in colors: 69 | if is_color_free(graph, u, color): 70 | c = color 71 | break 72 | 73 | for color in colors: 74 | if is_color_free(graph, maximal_fan[-1], color): 75 | d = color 76 | break 77 | 78 | cd_path = get_cd_path(graph, u, c, d) 79 | 80 | sub_fan = get_sub_fan(graph, maximal_fan, u, v, cd_path, d) 81 | 82 | graph = invert_cd_path(graph, cd_path, c, d) 83 | 84 | graph = rotate_fan(graph, sub_fan, u) 85 | 86 | graph.add_edge(u, sub_fan[-1], color=d) 87 | 88 | return graph 89 | 90 | 91 | def get_maximal_fan(graph, u, v): 92 | """ 93 | constructs a maximal fan starting from v; 94 | A fan of a vertex u is a sequence of vertices F[1:k] that satisfies the following conditions: 95 | 1) F[1:k] is a non-empty sequence of distinct neighbors of u 96 | 2) (F[1],u) in E(G) is uncolored 97 | 3) The color of (F[i+1],u) is free on F[i] for 1 ≤ i < k 98 | A fan is maximal if it can't be extended; 99 | :param graph: nx.Graph(); each edge should have an attribute "color" 100 | :param u: node in "graph" 101 | :param v: node in "graph" 102 | :return: list of nodes of "graph" representing a maximal fan starting from "v" 103 | """ 104 | maximal_fan = [v] 105 | 106 | is_maximal = False 107 | 108 | while not is_maximal: 109 | is_maximal = True 110 | for node in graph.neighbors(u): 111 | edge_color = graph.get_edge_data(u, node)["color"] 112 | if (node not in maximal_fan) and \ 113 | is_color_free(graph, maximal_fan[-1], edge_color) and \ 114 | (edge_color is not None): 115 | maximal_fan.append(node) 116 | is_maximal = False 117 | break 118 | 119 | return maximal_fan 120 | 121 | 122 | def get_sub_fan(graph, maximal_fan, u, v, cd_path, d): 123 | """ 124 | constructs a sub-fan of "maximal_fan" such that color `d` is free on its last node; 125 | :param graph: nx.Graph(); each edge should have an attribute "color" 126 | :param maximal_fan: maxmial resulting from `get_maximal_fan` 127 | :param u: node in "graph" 128 | :param v: node in "graph" 129 | :param cd_path: nx.Graph() representing a path with edges colored only with c and d 130 | :param d: integer representing a color 131 | :return: sub-list of maximal fan such that its last node is free on d 132 | """ 133 | sub_fan = [v] 134 | for node in maximal_fan[1:]: 135 | if graph.get_edge_data(u, node)['color'] == d: 136 | break 137 | else: 138 | sub_fan.append(node) 139 | 140 | if cd_path.has_node(sub_fan[-1]): 141 | sub_fan = maximal_fan 142 | 143 | return sub_fan 144 | 145 | 146 | def rotate_fan(graph, fan, u): 147 | """ 148 | 149 | :param graph: nx.Graph(); each edge should have an attribute "color" 150 | :param fan: list of nodes of "graph" representing a fan 151 | :param u: node in "graph" 152 | :return: 153 | """ 154 | for idx in range(len(fan)-1): 155 | current_edge = (u, 
fan[idx]) 156 | next_edge = (u, fan[idx+1]) 157 | color = graph.get_edge_data(*next_edge)["color"] 158 | graph.add_edge(*current_edge, color=color) 159 | 160 | graph.add_edge(u, fan[-1], color=None) 161 | 162 | return graph 163 | 164 | 165 | def is_color_free(graph, node, color): 166 | """ 167 | check if the color is free on a vertex; 168 | a color is said to be incident on a vertex if an edge incident on that vertex has that color; 169 | otherwise, the color is free on that vertex 170 | :param graph: nx.Graph(); each edge should have an attribute "color" 171 | :param node: node of "graph" 172 | :param color: integer no larger than the degree of "graph", or None 173 | :return: boolean True if "color" is free on "node" and False otherwise 174 | """ 175 | for neighbor in graph.neighbors(node): 176 | current_color = graph.get_edge_data(node, neighbor)["color"] 177 | 178 | if current_color == color: 179 | return False 180 | 181 | return True 182 | 183 | 184 | def get_cd_path(graph, u, c, d): 185 | """ 186 | Construct a cd-path: a maximal path that includes vertex u and has edges colored only c or d 187 | :param graph: nx.Graph(); each edge should have an attribute "color" 188 | :param u: node of "graph" 189 | :param c: integer no larger than the degree of "graph", or None; represents a color 190 | :param d: integer no larger than the degree of "graph", or None; represents a color 191 | :return: nx.Graph() representing a cd-path 192 | """ 193 | path = nx.Graph() 194 | 195 | current_color = d 196 | current_node = u 197 | is_maximal = False 198 | 199 | while not is_maximal: 200 | is_maximal = True 201 | for neighbor in graph.neighbors(current_node): 202 | 203 | try: 204 | color = graph.get_edge_data(current_node, neighbor)["color"] 205 | except (TypeError, KeyError): 206 | color = None 207 | 208 | if color == current_color: 209 | path.add_edge(current_node, neighbor) 210 | current_node = neighbor 211 | is_maximal = False 212 | if current_color == c: 213 | current_color = d 214 | else: 215 | current_color = c 216 | break 217 | 218 | return path 219 | 220 | 221 | def invert_cd_path(graph, path, c, d): 222 | """ 223 | Switch the colors of the edges on the cd-path: c to d and d to c. 
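For example, if the consecutive edges of "path" are colored (c, d, c), they are recolored (d, c, d) after the inversion; the colors of all edges outside the path are left unchanged. 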
224 | :param graph: nx.Graph(); each edge should have an attribute "color" 225 | :param path: nx.Graph() representing a cd-path 226 | :param c: integer no larger than the degree of "graph", or None; represents a color 227 | :param d: integer no larger than the degree of "graph", or None; represents a color 228 | :return: graph with switched colors 229 | """ 230 | for edge in path.edges: 231 | current_color = graph.get_edge_data(*edge)["color"] 232 | if current_color == c: 233 | graph.add_edge(*edge, color=d) 234 | if current_color == d: 235 | graph.add_edge(*edge, color=c) 236 | 237 | return graph 238 | 239 | 240 | def get_graph_degree(graph): 241 | """ 242 | get maximal degree of nodes of "graph" 243 | :param graph: nx.Graph() 244 | :return: integer representing the degree of the graph 245 | """ 246 | degrees = graph.degree() 247 | 248 | graph_degree = 0 249 | for _, degree in degrees: 250 | if degree > graph_degree: 251 | graph_degree = degree 252 | 253 | return graph_degree 254 | 255 | 256 | def is_coloring_valid(graph): 257 | """ 258 | check if the coloring of a graph is valid, 259 | i.e., two adjacent edges shouldn't have the same color; 260 | :param graph: nx.Graph() each edge should have an attribute 'color' 261 | """ 262 | for u, v, data in graph.edges(data=True): 263 | color = data['color'] 264 | 265 | if color is None: continue 266 | 267 | for _, v_, data_ in graph.edges(u, data=True): 268 | if v_ != v and data_['color'] == color: 269 | return False 270 | 271 | for _, u_, data_ in graph.edges(v, data=True): 272 | if u_ != u and data_['color'] == color: 273 | return False 274 | 275 | return True 276 | 277 | 278 | def is_coloring_correct(graph): 279 | """ 280 | check if the coloring of a graph is correct, 281 | i.e., two adjacent edges shouldn't have the same color and all edges are colored; 282 | :param graph: nx.Graph() each edge should have an attribute 'color' 283 | """ 284 | if is_coloring_valid(graph): 285 | for u, v, data in graph.edges(data=True): 286 | color = data['color'] 287 | 288 | if color is None: return False # a correct coloring leaves no edge uncolored 289 | 290 | for _, v_, data_ in graph.edges(u, data=True): 291 | if v_ != v and data_['color'] == color: 292 | return False 293 | 294 | for _, u_, data_ in graph.edges(v, data=True): 295 | if u_ != u and data_['color'] == color: 296 | return False 297 | 298 | return True 299 | else: return False 300 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /graph_utils/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | import random 5 | 6 | import networkx as nx 7 | import numpy as np 8 | 9 | from .evaluate_throughput import evaluate_cycle_time 10 | from .mbst import cube_algorithm, delta_prim 11 | from .tsp_christofides import christofides_tsp 12 | from .matcha import RandomTopologyGenerator 13 | from .matching_decomposition import get_matching_list_from_graph 14 | 15 | 16 | def get_connectivity_graph(underlay, default_capacity=1e9): 17 | """ 18 | Build the connectivity graph of "underlay": every pair of distinct nodes is connected by an edge whose latency and available bandwidth are derived from the shortest (by "distance") underlay path between them 19 | :param underlay: nx.Graph(); each edge should have an attribute "distance" 20 | :param default_capacity: capacity assumed for underlay links; the available bandwidth of a path is default_capacity divided by its number of hops 21 | :return: nx.Graph(); each edge has attributes "availableBandwidth" and "latency" 22 | """ 23 | connectivity_graph = nx.Graph() 24 | connectivity_graph.add_nodes_from(underlay.nodes(data=True)) 25 | 26 | dijkstra_result = nx.all_pairs_dijkstra(underlay.copy(), weight="distance") 27 | 28 | for node, (weights_dict, paths_dict) in dijkstra_result: 29 | for neighbour in paths_dict.keys(): 30 | if node != neighbour: 31 | path = paths_dict[neighbour] 32 | 33 | distance = 0. 34 | for idx in range(len(path) - 1): 35 | u = path[idx] 36 | v = path[idx + 1] 37 | 38 | data = underlay.get_edge_data(u, v) 39 | distance += data["distance"] 40 | 41 | available_bandwidth = default_capacity / (len(path) - 1) 42 | 43 | latency = 0.0085 * distance + 4 44 | 45 | connectivity_graph.add_edge(node, neighbour, availableBandwidth=available_bandwidth, latency=latency) 46 | 47 | return connectivity_graph 48 | 49 | 50 | def add_upload_download_delays(overlay, computation_time, model_size): 51 | """ 52 | Takes as input an nx.Graph(); each edge should have attributes "latency" and "availableBandwidth", 53 | and each node should have attributes "uploadDelay" and "downloadDelay"; 54 | The weight (delay) of edge (i, j) is computed as: 55 | d(i, j) = computation_time + latency(i, j) + max(out_degree(i) * uploadDelay(i), in_degree(j) * downloadDelay(j), M / availableBandwidth(i, j)), where M is the model size 56 | :param overlay: nx.Graph() 57 | :param computation_time: time spent on local computation in one round 58 | :param model_size: size M of the exchanged model 59 | :return: directed copy of "overlay" with an attribute "weight" on each edge 60 | """ 61 | overlay = overlay.to_directed() 62 | 63 | out_degree_dict = dict(overlay.out_degree) 64 | in_degree_dict = dict(overlay.in_degree) 65 | 66 | for u, v, data in overlay.edges(data=True): 67 | upload_delay = out_degree_dict[u] * overlay.nodes[u]["uploadDelay"] 68 | download_delay = in_degree_dict[v] * overlay.nodes[v]["downloadDelay"] 69 | 70 | weight = computation_time + data["latency"] +\ 71 | max(upload_delay, download_delay, model_size/data["availableBandwidth"]) 72 | 73 | overlay.add_edge(u, v, weight=weight) 74 | 75 | return overlay 76 | 77 | 78 | def get_star_overlay(connectivity_graph, centrality): 79 | """ 80 | Generate a star overlay, centered at the most central node (the "server"), given a connectivity graph 81 | :param connectivity_graph: nx.Graph() object; each edge should have attributes 82 | "latency", "availableBandwidth" and "weight"; 83 | :param centrality: mode of centrality to use, possible: "load", "distance", "information", default="load" 84 | :return: nx.Graph() 85 | """ 86 | if centrality == "distance": 87 | centrality_dict = nx.algorithms.centrality.closeness_centrality(connectivity_graph, distance="latency") 88 | server_node = max(centrality_dict, key=centrality_dict.get) 89 | 90 | elif centrality == "information": 91 | centrality_dict = nx.algorithms.centrality.information_centrality(connectivity_graph, weight="latency") 92 | server_node = max(centrality_dict, key=centrality_dict.get) 93 | 94 | else: 95 
| # centrality = load_centrality 96 | centrality_dict = nx.algorithms.centrality.load_centrality(connectivity_graph, weight="latency") 97 | server_node = max(centrality_dict, key=centrality_dict.get) 98 | 99 | weights, paths = nx.single_source_dijkstra(connectivity_graph, source=server_node, weight="weight") 100 | 101 | star = nx.Graph() 102 | star.add_nodes_from(connectivity_graph.nodes(data=True)) 103 | 104 | for node in paths.keys(): 105 | if node != server_node: 106 | 107 | latency = 0. 108 | available_bandwidth = 1e32 109 | for idx in range(len(paths[node]) - 1): 110 | u = paths[node][idx] 111 | v = paths[node][idx + 1] 112 | 113 | data = connectivity_graph.get_edge_data(u, v) 114 | latency += data["latency"] 115 | available_bandwidth = data["availableBandwidth"] 116 | 117 | star.add_edge(server_node, node, availableBandwidth=available_bandwidth, latency=latency) 118 | 119 | return star 120 | 121 | 122 | def get_ring_overlay(connectivity_graph, computation_time, model_size): 123 | """ 124 | 125 | :param connectivity_graph: 126 | :param computation_time: 127 | :param model_size: 128 | :return: 129 | """ 130 | for u, v, data in connectivity_graph.edges(data=True): 131 | upload_delay = connectivity_graph.nodes[u]["uploadDelay"] 132 | download_delay = connectivity_graph.nodes[v]["downloadDelay"] 133 | 134 | weight = computation_time + data["latency"] + max(upload_delay, 135 | download_delay, 136 | model_size / data["availableBandwidth"]) 137 | 138 | connectivity_graph.add_edge(u, v, weight=weight) 139 | 140 | adjacency_matrix = nx.adjacency_matrix(connectivity_graph, weight="weight").toarray() 141 | tsp_nodes = christofides_tsp(adjacency_matrix) 142 | 143 | ring = nx.DiGraph() 144 | ring.add_nodes_from(connectivity_graph.nodes(data=True)) 145 | 146 | for idx in range(len(tsp_nodes) - 1): 147 | # get the label of source and sink nodes from the original graph 148 | source_node = list(connectivity_graph.nodes())[tsp_nodes[idx]] 149 | sink_node = list(connectivity_graph.nodes())[tsp_nodes[idx + 1]] 150 | 151 | ring.add_edge(source_node, sink_node, 152 | latency=connectivity_graph.get_edge_data(source_node, sink_node)['latency'], 153 | availableBandwidth=connectivity_graph.get_edge_data(source_node, sink_node)['availableBandwidth'], 154 | weight=connectivity_graph.get_edge_data(source_node, sink_node)['weight']) 155 | 156 | # add final link to close the circuit 157 | source_node = list(connectivity_graph.nodes())[tsp_nodes[-1]] 158 | sink_node = list(connectivity_graph.nodes())[tsp_nodes[0]] 159 | ring.add_edge(source_node, sink_node, 160 | latency=connectivity_graph.get_edge_data(source_node, sink_node)['latency'], 161 | availableBandwidth=connectivity_graph.get_edge_data(source_node, sink_node)['availableBandwidth'], 162 | weight=connectivity_graph.get_edge_data(source_node, sink_node)['weight']) 163 | 164 | return ring 165 | 166 | 167 | def generate_random_ring(list_of_nodes): 168 | """ 169 | Generate a random ring graph connecting a list of nodes 170 | :param list_of_nodes: 171 | :return: nx.DiGraph() 172 | """ 173 | ring = nx.DiGraph() 174 | 175 | ring.add_nodes_from(list_of_nodes) 176 | 177 | random.shuffle(list_of_nodes) 178 | 179 | for idx in range(len(list_of_nodes) - 1): 180 | # get the label of source and sink nodes from the original graph 181 | source_node = list_of_nodes[idx] 182 | sink_node = list_of_nodes[idx + 1] 183 | 184 | ring.add_edge(source_node, sink_node) 185 | 186 | # add final link to close the circuit 187 | source_node = list_of_nodes[-1] 188 | sink_node = 
list_of_nodes[0] 189 | ring.add_edge(source_node, sink_node) 190 | 191 | mixing_matrix = nx.adjacency_matrix(ring, weight=None).todense().astype(np.float64) 192 | 193 | mixing_matrix += np.eye(mixing_matrix.shape[0]) 194 | mixing_matrix *= 0.5 195 | 196 | return nx.from_numpy_matrix(mixing_matrix, create_using=nx.DiGraph()) 197 | 198 | 199 | def get_delta_mbst_overlay(connectivity_graph, computation_time, model_size): 200 | """ 201 | 202 | :param connectivity_graph: 203 | :param computation_time: 204 | :param model_size: 205 | :return: 206 | """ 207 | for u, v, data in connectivity_graph.edges(data=True): 208 | weight = computation_time + data["latency"] + \ 209 | max(connectivity_graph.nodes[u]["uploadDelay"], connectivity_graph.nodes[v]["downloadDelay"], 210 | model_size / data["availableBandwidth"]) + \ 211 | max(connectivity_graph.nodes[v]["uploadDelay"], connectivity_graph.nodes[u]["downloadDelay"], 212 | model_size / data["availableBandwidth"]) 213 | 214 | connectivity_graph.add_edge(u, v, weight=weight, latency=data["latency"], 215 | availableBandwidth=data["availableBandwidth"]) 216 | 217 | for u in connectivity_graph.nodes: 218 | connectivity_graph.add_edge(u, u, weight=0, latency=0, availableBandwidth=1e32) 219 | 220 | best_result = cube_algorithm(connectivity_graph.copy()).to_directed() 221 | 222 | for u, v in best_result.edges: 223 | best_result.add_edge(u, v, 224 | latency=connectivity_graph.get_edge_data(u, v)['latency'], 225 | availableBandwidth=connectivity_graph.get_edge_data(u, v)['availableBandwidth']) 226 | 227 | best_cycle_time, _, _ = evaluate_cycle_time(add_upload_download_delays(best_result, computation_time, model_size)) 228 | best_delta = 2 229 | 230 | n_nodes = connectivity_graph.number_of_nodes() 231 | for delta in range(2, n_nodes): 232 | result = delta_prim(connectivity_graph.copy(), delta).to_directed() 233 | 234 | for u, v, data in result.edges(data=True): 235 | weight = data["weight"] - (result.nodes[u]["uploadDelay"] + result.nodes[v]["downloadDelay"]) 236 | 237 | result.add_edge(u, v, weight=weight, 238 | latency=connectivity_graph.get_edge_data(u, v)['latency'], 239 | availableBandwidth=connectivity_graph.get_edge_data(u, v)['availableBandwidth']) 240 | 241 | cycle_time, _, _ = evaluate_cycle_time(add_upload_download_delays(result, computation_time, model_size)) 242 | 243 | if cycle_time < best_cycle_time: 244 | best_result = result 245 | best_cycle_time = cycle_time 246 | best_delta = delta 247 | 248 | return best_result, best_cycle_time, best_delta 249 | 250 | 251 | def get_matcha_cycle_time(underlay, connectivity_graph, computation_time, model_size, communication_budget): 252 | """ 253 | 254 | :param underlay: 255 | :param connectivity_graph: 256 | :param computation_time: 257 | :param model_size: 258 | :param communication_budget: 259 | :return: 260 | """ 261 | path_to_save_network = os.path.join("temp", "colored_network.gml") 262 | path_to_matching_history_file = os.path.join("temp", "matching_history.csv") 263 | 264 | try: 265 | shutil.rmtree("temp") 266 | except FileNotFoundError: 267 | pass 268 | 269 | os.makedirs("temp", exist_ok=True) 270 | 271 | topology_generator = RandomTopologyGenerator(underlay.copy(), 272 | communication_budget, 273 | network_save_path=path_to_save_network, 274 | path_to_history_file=path_to_matching_history_file) 275 | 276 | n_rounds = 1000 277 | np.random.seed(0) 278 | for _ in range(n_rounds): 279 | topology_generator.step() 280 | 281 | path_to_colored_network = os.path.join("temp", "colored_network.gml") 282 | 
path_to_matching_history_file = os.path.join("temp", "matching_history.csv") 283 | 284 | colored_network = nx.read_gml(path_to_colored_network) 285 | matching_list = get_matching_list_from_graph(colored_network) 286 | 287 | simulated_time = np.zeros(n_rounds) 288 | with open(path_to_matching_history_file) as csv_file: 289 | csv_reader = csv.reader(csv_file, delimiter=' ') 290 | 291 | for ii, row in enumerate(csv_reader): 292 | overlay = nx.Graph() 293 | overlay.add_nodes_from(connectivity_graph.nodes(data=True)) 294 | 295 | current_matching_activations = row 296 | for matching_idx, matching_activation in enumerate(current_matching_activations): 297 | if int(matching_activation): 298 | overlay = nx.compose(overlay, matching_list[matching_idx]) 299 | 300 | for u, v in overlay.edges(): 301 | overlay.add_edge(u, v, 302 | latency=connectivity_graph.get_edge_data(u, v)["latency"], 303 | availableBandwidth=connectivity_graph.get_edge_data(u, v)['availableBandwidth'] 304 | ) 305 | 306 | if nx.is_empty(overlay): 307 | # If overlay is empty, then no communication cost is added 308 | simulated_time[ii] = computation_time 309 | 310 | else: 311 | overlay = add_upload_download_delays(overlay, computation_time, model_size) 312 | 313 | cycle_time = 0 314 | for u, v, data in overlay.edges(data=True): 315 | if data["weight"] > cycle_time: 316 | cycle_time = data["weight"] 317 | 318 | simulated_time[ii] = cycle_time 319 | 320 | simulated_time = simulated_time.cumsum() 321 | 322 | try: 323 | shutil.rmtree("temp") 324 | except FileNotFoundError: 325 | pass 326 | 327 | return simulated_time[-1] / (n_rounds - 1) 328 | --------------------------------------------------------------------------------
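The files above are dumped without usage notes, so here is a minimal, hypothetical sketch (not part of the repository) of how the matching decomposition in graph_utils/utils/matching_decomposition.py might be exercised on a toy topology. The toy graph and the import path are illustrative assumptions; the import assumes the repository root is on PYTHONPATH and that graph_utils.utils is importable as a package.

```python
import networkx as nx

# assumed import path; adjust to how graph_utils is located on your system
from graph_utils.utils.matching_decomposition import (
    matching_decomposition,
    is_coloring_correct,
)

# toy topology: a 5-node ring with one chord (maximum degree Delta = 3)
graph = nx.cycle_graph(5)
graph.add_edge(0, 2)

# decompose the edges into matchings; at most Delta + 1 matchings are returned,
# together with one Laplacian matrix per matching
matchings, laplacians = matching_decomposition(graph)

for idx, matching in enumerate(matchings):
    print(idx, sorted(matching.edges()))

# the call colors `graph` in place; the coloring should be valid and complete
print("coloring correct:", is_coloring_correct(graph))
```

Note that some of the returned matchings may be empty, since one matching is created per color even when fewer than Delta + 1 colors end up being used.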