├── eval ├── data │ └── .gitkeep ├── graph.adj ├── 6_star.edges ├── ip_addr_3Machines.json ├── 4_node_fullyConnected.edges ├── ip_addr_4Machines.json ├── ip_addr_5Machines.json ├── ip_addr_6Machines.json ├── ip_addr_7Machines.json ├── step_configs │ ├── config_movielens_sharing.ini │ ├── config_cifar_sharing.ini │ ├── config_movielens_subsampling.ini │ ├── config_femnist_sharing.ini │ ├── config_shakespeare_sharing.ini │ ├── config_reddit_sharing.ini │ ├── config_cifar_partialmodel.ini │ ├── config_cifar_subsampling.ini │ ├── config_femnist_partialmodel.ini │ ├── config_reddit_partialmodel.ini │ ├── config_reddit_subsampling.ini │ ├── config_celeba_sharing.ini │ ├── config_shakespeare_subsampling.ini │ ├── config_shakespeare_partialmodel.ini │ ├── config_femnist_subsampling.ini │ ├── config_celeba_partialmodel.ini │ ├── config_celeba_subsampling.ini │ ├── config_movielens_jwins.ini │ ├── config_cifar_jwins.ini │ ├── config_shakespeare_jwins.ini │ ├── config_femnist_jwins.ini │ └── config_celeba_jwins.ini ├── run.sh ├── run_all.sh ├── plot_shared.py ├── testingKNN.py ├── testingKFN.py ├── testing.py ├── 36_nodes.edges ├── testingManual.py ├── testingPeerSampler.py ├── testingPeerSamplerDynamic.py ├── testingPeerSamplerDynamicManual.py ├── testingSTC.py ├── testingFederated.py ├── plot_model.py ├── 96_regular.edges ├── plot_percentile.py ├── 80_nodes.edges ├── run_grid.sh ├── 96_nodes_smallworld.edges ├── run_xtimes_cifar.sh └── 96_nodes_random2.edges ├── requirements.txt ├── src └── decentralizepy │ ├── __init__.py │ ├── models │ ├── __init__.py │ └── Model.py │ ├── node │ ├── __init__.py │ ├── STC │ │ └── __init__.py │ ├── EpidemicLearning │ │ ├── __init__.py │ │ ├── EL_Oracle_TopologyBuilder.py │ │ └── EL_Oracle_Client.py │ ├── PeerSamplerDynamic.py │ └── DPSGDWithPeerSampler.py │ ├── sharing │ ├── __init__.py │ ├── JWINS │ │ ├── __init__.py │ │ └── JWINS.py │ └── PlainAverageSharing.py │ ├── training │ ├── __init__.py │ ├── text │ │ ├── __init__.py │ │ └── LLMTraining.py │ └── Training.py │ ├── communication │ ├── __init__.py │ └── Communication.py │ ├── datasets │ ├── text │ │ ├── __init__.py │ │ └── LLMData.py │ ├── __init__.py │ ├── Data.py │ └── Dataset.py │ ├── mappings │ ├── __init__.py │ ├── Mapping.py │ ├── Linear.py │ └── Manual.py │ ├── graphs │ ├── __init__.py │ ├── Ring.py │ ├── FullyConnected.py │ ├── Star.py │ ├── Regular.py │ ├── SmallWorld.py │ └── Graph.py │ ├── compression │ ├── EliasQuantization.py │ ├── EliasFpzip.py │ ├── EliasFpzipLossy.py │ ├── Compression.py │ ├── Lz4Wrapper.py │ ├── Elias.py │ └── Quantization.py │ └── utils.py ├── tutorial ├── ip.json ├── JWINS │ ├── ip.json │ ├── regular_16.txt │ ├── run_decentralized.sh │ └── config.ini ├── EpidemicLearning │ ├── ip.json │ ├── run_el-local.sh │ ├── run_el-oracle.sh │ ├── config_EL.ini │ ├── testingEL_Local.py │ ├── fullyConnected_16.edges │ └── testingEL_Oracle.py ├── regular_16.txt ├── run_decentralized.sh ├── run_federated.sh └── config.ini ├── pyproject.toml ├── .isort.cfg ├── .gitignore ├── setup.py ├── download_dataset.py ├── split_into_files.py ├── install_nMachines.sh ├── LICENSE ├── setup.cfg ├── generate_graph.py └── README.rst /eval/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/src/decentralizepy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/node/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/sharing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/node/STC/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/communication/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/datasets/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/sharing/JWINS/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/decentralizepy/training/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorial/ip.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "localhost" 3 | } -------------------------------------------------------------------------------- /src/decentralizepy/node/EpidemicLearning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorial/JWINS/ip.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "localhost" 3 | } -------------------------------------------------------------------------------- /eval/graph.adj: -------------------------------------------------------------------------------- 1 | 6 2 | 1 3 | 0 3 4 4 | 3 5 5 | 1 2 5 6 | 1 7 | 2 3 --------------------------------------------------------------------------------
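The .adj file above is the adjacency-list graph format used by the framework: the first line gives the number of nodes n, and each of the following n lines lists the neighbours of node 0, 1, ..., n-1 in order (the sample is symmetric, e.g. node 0 lists neighbour 1 and node 1 lists neighbour 0 back). A minimal parser sketch for this format; illustrative only, since the framework's own reader lives in Graph.py, which is not part of this listing:

def read_adj(path):
    """Return {node_id: set(neighbour_ids)} parsed from a .adj file."""
    with open(path) as f:
        n = int(f.readline())
        return {i: {int(v) for v in f.readline().split()} for i in range(n)}

adj = read_adj("eval/graph.adj")
assert len(adj) == 6 and 1 in adj[0]  # matches the sample above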
/tutorial/EpidemicLearning/ip.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "localhost" 3 | } -------------------------------------------------------------------------------- /src/decentralizepy/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .Femnist import Femnist 2 | -------------------------------------------------------------------------------- /src/decentralizepy/mappings/__init__.py: -------------------------------------------------------------------------------- 1 | from .Linear import Linear 2 | from .Mapping import Mapping 3 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | from .Graph import Graph 2 | from .SmallWorld import SmallWorld 3 | -------------------------------------------------------------------------------- /eval/6_star.edges: -------------------------------------------------------------------------------- 1 | 6 2 | 0 1 3 | 0 2 4 | 0 3 5 | 0 4 6 | 0 5 7 | 1 0 8 | 2 0 9 | 3 0 10 | 4 0 11 | 5 0 -------------------------------------------------------------------------------- /eval/ip_addr_3Machines.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "10.90.41.131", 3 | "1": "10.90.41.132", 4 | "2": "10.90.41.133" 5 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /eval/4_node_fullyConnected.edges: -------------------------------------------------------------------------------- 1 | 4 2 | 0 1 3 | 0 2 4 | 0 3 5 | 1 0 6 | 1 2 7 | 1 3 8 | 2 0 9 | 2 1 10 | 2 3 11 | 3 0 12 | 3 1 13 | 3 2 14 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | combine_as_imports=True 6 | line_length=88 7 | -------------------------------------------------------------------------------- /eval/ip_addr_4Machines.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "10.90.41.130", 3 | "1": "10.90.41.131", 4 | "2": "10.90.41.132", 5 | "3": "10.90.41.133" 6 | } -------------------------------------------------------------------------------- /eval/ip_addr_5Machines.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "10.90.41.129", 3 | "1": "10.90.41.130", 4 | "2": "10.90.41.131", 5 | "3": "10.90.41.132", 6 | "4": "10.90.41.133" 7 | } -------------------------------------------------------------------------------- /eval/ip_addr_6Machines.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "10.90.41.128", 3 | "1": "10.90.41.129", 4 | "2": "10.90.41.130", 5 | "3": "10.90.41.131", 6 | "4": "10.90.41.132", 7 | "5": "10.90.41.133" 8 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.idea 2 | **/__pycache__/ 3 | **/data/ 4 | **/.DS_Store 5 | **/results/ 6 | **/experiment_results/ 7 | **/.vscode 8 | **/leaf/ 9 | **.egg-info 10 | 202** 11 | eval/data** 12 | **/massif.out* 13 | *swp 14 | build -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!$CONDA_PREFIX/python 2 | from setuptools import setup 3 | 4 | # 
https://packaging.python.org/guides/single-sourcing-package-version/ 5 | # http://blog.ionelmc.ro/2014/05/25/python-packaging/ 6 | setup(setup_cfg=True) 7 | -------------------------------------------------------------------------------- /eval/ip_addr_7Machines.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": "10.90.41.127", 3 | "1": "10.90.41.128", 4 | "2": "10.90.41.129", 5 | "3": "10.90.41.130", 6 | "4": "10.90.41.131", 7 | "5": "10.90.41.132", 8 | "6": "10.90.41.133" 9 | } -------------------------------------------------------------------------------- /download_dataset.py: -------------------------------------------------------------------------------- 1 | import torchvision 2 | 3 | if __name__ == "__main__": 4 | torchvision.datasets.CIFAR10(root="./eval/data/", train=True, download=True) 5 | torchvision.datasets.CIFAR10(root="./eval/data/", train=False, download=True) 6 | 7 | # TODO: download the other datasets 8 | -------------------------------------------------------------------------------- /split_into_files.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from decentralizepy.datasets.Reddit import Reddit 4 | from decentralizepy.mappings import Linear 5 | 6 | if __name__ == "__main__": 7 | mapping = Linear(6, 16) 8 | f = Reddit(0, 0, mapping) 9 | assert len(sys.argv) == 3 10 | frm = sys.argv[1] 11 | to = sys.argv[2] 12 | f.file_per_user(frm, to) 13 | -------------------------------------------------------------------------------- /src/decentralizepy/node/EpidemicLearning/EL_Oracle_TopologyBuilder.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.node.PeerSamplerDynamic import PeerSamplerDynamic 2 | 3 | 4 | class EL_Oracle_TopologyBuilder(PeerSamplerDynamic): 5 | """ 6 | This class defines the topology builder that responds to neighbor requests from the clients. 7 | 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | -------------------------------------------------------------------------------- /src/decentralizepy/node/EpidemicLearning/EL_Oracle_Client.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.node.DPSGDWithPeerSampler import DPSGDWithPeerSampler 2 | 3 | 4 | class EL_Oracle_Client(DPSGDWithPeerSampler): 5 | """ 6 | This class defines the client class for Epidemic Learning with Oracle. 7 | The client requests the peer sampler for neighbors each round. 
8 | 9 | """ 10 | 11 | def __init__(self, *args, **kwargs): 12 | super().__init__(*args, **kwargs) 13 | -------------------------------------------------------------------------------- /tutorial/regular_16.txt: -------------------------------------------------------------------------------- 1 | 16 2 | 0 12 3 | 0 14 4 | 0 15 5 | 1 8 6 | 1 3 7 | 1 6 8 | 2 9 9 | 2 10 10 | 2 5 11 | 3 1 12 | 3 11 13 | 3 9 14 | 4 9 15 | 4 12 16 | 4 13 17 | 5 2 18 | 5 6 19 | 5 7 20 | 6 1 21 | 6 5 22 | 6 7 23 | 7 5 24 | 7 6 25 | 7 14 26 | 8 1 27 | 8 13 28 | 8 14 29 | 9 2 30 | 9 3 31 | 9 4 32 | 10 2 33 | 10 11 34 | 10 13 35 | 11 10 36 | 11 3 37 | 11 15 38 | 12 0 39 | 12 4 40 | 12 15 41 | 13 8 42 | 13 10 43 | 13 4 44 | 14 0 45 | 14 8 46 | 14 7 47 | 15 0 48 | 15 11 49 | 15 12 50 | -------------------------------------------------------------------------------- /tutorial/JWINS/regular_16.txt: -------------------------------------------------------------------------------- 1 | 16 2 | 0 12 3 | 0 14 4 | 0 15 5 | 1 8 6 | 1 3 7 | 1 6 8 | 2 9 9 | 2 10 10 | 2 5 11 | 3 1 12 | 3 11 13 | 3 9 14 | 4 9 15 | 4 12 16 | 4 13 17 | 5 2 18 | 5 6 19 | 5 7 20 | 6 1 21 | 6 5 22 | 6 7 23 | 7 5 24 | 7 6 25 | 7 14 26 | 8 1 27 | 8 13 28 | 8 14 29 | 9 2 30 | 9 3 31 | 9 4 32 | 10 2 33 | 10 11 34 | 10 13 35 | 11 10 36 | 11 3 37 | 11 15 38 | 12 0 39 | 12 4 40 | 12 15 41 | 13 8 42 | 13 10 43 | 13 4 44 | 14 0 45 | 14 8 46 | 14 7 47 | 15 0 48 | 15 11 49 | 15 12 50 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/Ring.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.graphs.Graph import Graph 2 | 3 | 4 | class Ring(Graph): 5 | """ 6 | The class for generating a Ring topology 7 | 8 | """ 9 | 10 | def __init__(self, n_procs): 11 | """ 12 | Constructor. Generates a Ring graph 13 | 14 | Parameters 15 | ---------- 16 | n_procs : int 17 | total number of nodes in the graph 18 | 19 | """ 20 | super().__init__(n_procs) 21 | self.connect_graph() 22 | -------------------------------------------------------------------------------- /install_nMachines.sh: -------------------------------------------------------------------------------- 1 | #!\bin\bash 2 | 3 | cd 4 | mkdir -p Gitlab 5 | cd Gitlab 6 | git clone git@gitlab.epfl.ch:risharma/decentralizepy.git 7 | cd decentralizepy 8 | mkdir -p leaf/data/femnist/data/train 9 | mkdir -p leaf/data/femnist/data/test 10 | mkdir -p leaf/data/femnist/per_user_data/train 11 | ~/miniconda3/bin/conda remove --name decpy --all 12 | ~/miniconda3/bin/conda create -n decpy python=3.9 13 | ~/miniconda3/envs/decpy/bin/pip install --upgrade pip --quiet 14 | ~/miniconda3/envs/decpy/bin/pip install --editable .\[dev\] 15 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/FullyConnected.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.graphs.Graph import Graph 2 | 3 | 4 | class FullyConnected(Graph): 5 | """ 6 | The class for generating a Fully Connected Graph Topology 7 | 8 | """ 9 | 10 | def __init__(self, n_procs): 11 | """ 12 | Constructor. 
/install_nMachines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd 4 | mkdir -p Gitlab 5 | cd Gitlab 6 | git clone git@gitlab.epfl.ch:risharma/decentralizepy.git 7 | cd decentralizepy 8 | mkdir -p leaf/data/femnist/data/train 9 | mkdir -p leaf/data/femnist/data/test 10 | mkdir -p leaf/data/femnist/per_user_data/train 11 | ~/miniconda3/bin/conda remove --name decpy --all 12 | ~/miniconda3/bin/conda create -n decpy python=3.9 13 | ~/miniconda3/envs/decpy/bin/pip install --upgrade pip --quiet 14 | ~/miniconda3/envs/decpy/bin/pip install --editable .\[dev\] 15 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/FullyConnected.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.graphs.Graph import Graph 2 | 3 | 4 | class FullyConnected(Graph): 5 | """ 6 | The class for generating a Fully Connected Graph Topology 7 | 8 | """ 9 | 10 | def __init__(self, n_procs): 11 | """ 12 | Constructor. Generates a Fully Connected graph 13 | 14 | Parameters 15 | ---------- 16 | n_procs : int 17 | total number of nodes in the graph 18 | 19 | """ 20 | super().__init__(n_procs) 21 | for node in range(n_procs): 22 | neighbors = set([x for x in range(n_procs) if x != node]) 23 | self.adj_list[node] = neighbors 24 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/EliasQuantization.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.compression.Elias import Elias 2 | from decentralizepy.compression.Quantization import Quantization 3 | 4 | 5 | class EliasQuantization(Elias, Quantization): 6 | """ 7 | Compress metadata and quantize parameters 8 | 9 | """ 10 | 11 | def __init__(self, float_precision: int = 2**15 - 1, *args, **kwargs): 12 | """ 13 | Constructor 14 | 15 | Parameters 16 | ---------- 17 | float_precision : int, optional 18 | Quantization parameter 19 | """ 20 | super().__init__(float_precision=float_precision, *args, **kwargs) 21 | self.k = float_precision 22 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/Star.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | from decentralizepy.graphs.Graph import Graph 4 | 5 | 6 | class Star(Graph): 7 | """ 8 | The class for generating a Star topology 9 | Adapted from ./Regular.py 10 | 11 | """ 12 | 13 | def __init__(self, n_procs): 14 | """ 15 | Constructor. Generates a Star graph 16 | 17 | Parameters 18 | ---------- 19 | n_procs : int 20 | total number of nodes in the graph 21 | 22 | """ 23 | super().__init__(n_procs) 24 | G = nx.star_graph(n_procs - 1) 25 | adj = G.adjacency() 26 | for i, l in adj: 27 | self.adj_list[i] = set() # new set 28 | for k in l: 29 | self.adj_list[i].add(k) 30 | if not nx.is_connected(G): 31 | self.connect_graph() 32 | -------------------------------------------------------------------------------- /eval/step_configs/config_movielens_sharing.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.MovieLens 3 | dataset_class = MovieLens 4 | model_class = MatrixFactorization 5 | train_dir = /mnt/nfs/shared/leaf/data/movielens 6 | test_dir = /mnt/nfs/shared/leaf/data/movielens 7 | ; python list of fractions below 8 | sizes = 9 | 10 | [OPTIMIZER_PARAMS] 11 | optimizer_package = torch.optim 12 | optimizer_class = SGD 13 | lr = 0.1 14 | 15 | [TRAIN_PARAMS] 16 | training_package = decentralizepy.training.Training 17 | training_class = Training 18 | rounds = 10 19 | full_epochs = False 20 | batch_size = 16 21 | shuffle = True 22 | loss_package = torch.nn 23 | loss_class = MSELoss 24 | 25 | [COMMUNICATION] 26 | comm_package = decentralizepy.communication.TCP 27 | comm_class = TCP 28 | addresses_filepath = ip_addr_6Machines.json 29 | 30 | [SHARING] 31 | sharing_package = decentralizepy.sharing.Sharing 32 | sharing_class = Sharing 33 | --------------------------------------------------------------------------------
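Every file in eval/step_configs follows this same five-section layout, and each *_package / *_class pair names a Python module and a class inside it. A sketch of how such a pair can be resolved with the standard library; this is a hypothetical loader written for illustration, not the framework's actual config handling:

import configparser
import importlib

config = configparser.ConfigParser()
config.read("eval/step_configs/config_movielens_sharing.ini")

section = config["SHARING"]
module = importlib.import_module(section["sharing_package"])  # decentralizepy.sharing.Sharing
sharing_cls = getattr(module, section["sharing_class"])       # the Sharing class inside that module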
/eval/step_configs/config_cifar_sharing.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | train_dir = /mnt/nfs/shared/CIFAR 6 | test_dir = /mnt/nfs/shared/CIFAR 7 | ; python list of fractions below 8 | sizes = 9 | random_seed = 99 10 | partition_niid = iid 11 | 12 | [OPTIMIZER_PARAMS] 13 | optimizer_package = torch.optim 14 | optimizer_class = SGD 15 | lr = 0.001 16 | 17 | [TRAIN_PARAMS] 18 | training_package = decentralizepy.training.Training 19 | training_class = Training 20 | rounds = 65 21 | full_epochs = False 22 | batch_size = 8 23 | shuffle = True 24 | loss_package = torch.nn 25 | loss_class = CrossEntropyLoss 26 | 27 | [COMMUNICATION] 28 | comm_package = decentralizepy.communication.TCP 29 | comm_class = TCP 30 | addresses_filepath = ip_addr_6Machines.json 31 | 32 | [SHARING] 33 | sharing_package = decentralizepy.sharing.Sharing 34 | sharing_class = Sharing 35 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/Regular.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | from decentralizepy.graphs.Graph import Graph 4 | 5 | 6 | class Regular(Graph): 7 | """ 8 | The class for generating a Regular topology 9 | 10 | """ 11 | 12 | def __init__(self, n_procs, degree, seed=None): 13 | """ 14 | Constructor. Generates a Regular graph 15 | 16 | Parameters 17 | ---------- 18 | n_procs : int 19 | total number of nodes in the graph 20 | degree : int 21 | Number of neighbors of each node 22 | 23 | """ 24 | super().__init__(n_procs) 25 | G = nx.random_regular_graph(degree, n_procs, seed) 26 | adj = G.adjacency() 27 | for i, l in adj: 28 | self.adj_list[i] = set() # new set 29 | for k in l: 30 | self.adj_list[i].add(k) 31 | if not nx.is_connected(G): 32 | self.connect_graph() 33 | -------------------------------------------------------------------------------- /eval/step_configs/config_movielens_subsampling.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.MovieLens 3 | dataset_class = MovieLens 4 | model_class = MatrixFactorization 5 | train_dir = /mnt/nfs/shared/leaf/data/movielens 6 | test_dir = /mnt/nfs/shared/leaf/data/movielens 7 | ; python list of fractions below 8 | sizes = 9 | 10 | [OPTIMIZER_PARAMS] 11 | optimizer_package = torch.optim 12 | optimizer_class = SGD 13 | lr = 0.1 14 | 15 | [TRAIN_PARAMS] 16 | training_package = decentralizepy.training.Training 17 | training_class = Training 18 | rounds = 10 19 | full_epochs = False 20 | batch_size = 16 21 | shuffle = True 22 | loss_package = torch.nn 23 | loss_class = MSELoss 24 | 25 | [COMMUNICATION] 26 | comm_package = decentralizepy.communication.TCP 27 | comm_class = TCP 28 | addresses_filepath = ip_addr_6Machines.json 29 | 30 | [SHARING] 31 | sharing_package = decentralizepy.sharing.SubSampling 32 | sharing_class = SubSampling 33 | alpha = 0.3 34 | -------------------------------------------------------------------------------- /eval/step_configs/config_femnist_sharing.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Femnist 3 | dataset_class = Femnist 4 | random_seed = 97 5 | model_class = CNN 6 | train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 47 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle 
= True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.Sharing 33 | sharing_class = Sharing 34 | -------------------------------------------------------------------------------- /eval/step_configs/config_shakespeare_sharing.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Shakespeare 3 | dataset_class = Shakespeare 4 | model_class = LSTM 5 | train_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/per_user_data/train 6 | test_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/data/test 7 | ; python list of fractions below 8 | sizes = 9 | 10 | [OPTIMIZER_PARAMS] 11 | optimizer_package = torch.optim 12 | optimizer_class = SGD 13 | lr = 0.1 14 | 15 | [TRAIN_PARAMS] 16 | training_package = decentralizepy.training.Training 17 | training_class = Training 18 | rounds = 10 19 | full_epochs = False 20 | batch_size = 16 21 | shuffle = True 22 | loss_package = torch.nn 23 | loss_class = CrossEntropyLoss 24 | 25 | [COMMUNICATION] 26 | comm_package = decentralizepy.communication.TCP 27 | comm_class = TCP 28 | addresses_filepath = ip_addr_6Machines.json 29 | 30 | [SHARING] 31 | sharing_package = decentralizepy.sharing.Sharing 32 | sharing_class = Sharing 33 | -------------------------------------------------------------------------------- /eval/step_configs/config_reddit_sharing.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Reddit 3 | dataset_class = Reddit 4 | random_seed = 97 5 | model_class = RNN 6 | train_dir = /mnt/nfs/shared/leaf/data/reddit_new/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/reddit_new/new_small_data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 47 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.Sharing 33 | sharing_class = Sharing 34 | -------------------------------------------------------------------------------- /eval/step_configs/config_cifar_partialmodel.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | train_dir = /mnt/nfs/shared/CIFAR 6 | test_dir = /mnt/nfs/shared/CIFAR 7 | ; python list of fractions below 8 | sizes = 9 | random_seed = 99 10 | partition_niid = kshard 11 | shard = 2 12 | 13 | [OPTIMIZER_PARAMS] 14 | optimizer_package = torch.optim 15 | optimizer_class = SGD 16 | lr = 0.001 17 | 18 | [TRAIN_PARAMS] 19 | training_package = decentralizepy.training.Training 20 | training_class = Training 21 | rounds = 65 22 | full_epochs = False 23 | batch_size = 8 24 | shuffle = True 25 | loss_package = torch.nn 26 | loss_class = CrossEntropyLoss 27 | 28 
| [COMMUNICATION] 29 | comm_package = decentralizepy.communication.TCP 30 | comm_class = TCP 31 | addresses_filepath = ip_addr_6Machines.json 32 | 33 | [SHARING] 34 | sharing_package = decentralizepy.sharing.PartialModel 35 | sharing_class = PartialModel 36 | alpha=0.5 37 | -------------------------------------------------------------------------------- /eval/step_configs/config_cifar_subsampling.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | train_dir = /mnt/nfs/shared/CIFAR 6 | test_dir = /mnt/nfs/shared/CIFAR 7 | ; python list of fractions below 8 | sizes = 9 | random_seed = 99 10 | partition_niid = dirichlet 11 | alpha = 1 12 | 13 | [OPTIMIZER_PARAMS] 14 | optimizer_package = torch.optim 15 | optimizer_class = SGD 16 | lr = 0.001 17 | 18 | [TRAIN_PARAMS] 19 | training_package = decentralizepy.training.Training 20 | training_class = Training 21 | rounds = 65 22 | full_epochs = False 23 | batch_size = 8 24 | shuffle = True 25 | loss_package = torch.nn 26 | loss_class = CrossEntropyLoss 27 | 28 | [COMMUNICATION] 29 | comm_package = decentralizepy.communication.TCP 30 | comm_class = TCP 31 | addresses_filepath = ip_addr_6Machines.json 32 | 33 | [SHARING] 34 | sharing_package = decentralizepy.sharing.SubSampling 35 | sharing_class = SubSampling 36 | alpha = 0.5 37 | -------------------------------------------------------------------------------- /eval/step_configs/config_femnist_partialmodel.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Femnist 3 | dataset_class = Femnist 4 | random_seed = 97 5 | model_class = CNN 6 | train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 47 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.PartialModel 33 | sharing_class = PartialModel 34 | alpha=0.1 35 | -------------------------------------------------------------------------------- /eval/step_configs/config_reddit_partialmodel.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Reddit 3 | dataset_class = Reddit 4 | random_seed = 97 5 | model_class = RNN 6 | train_dir = /mnt/nfs/shared/leaf/data/reddit_new/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/reddit_new/new_small_data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 47 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | 
[COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.PartialModel 33 | sharing_class = PartialModel 34 | alpha = 0.1 35 | -------------------------------------------------------------------------------- /eval/step_configs/config_reddit_subsampling.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Reddit 3 | dataset_class = Reddit 4 | random_seed = 97 5 | model_class = RNN 6 | train_dir = /mnt/nfs/shared/leaf/data/reddit_new/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/reddit_new/new_small_data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 4 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.SubSampling 33 | sharing_class = SubSampling 34 | alpha = 0.1 35 | -------------------------------------------------------------------------------- /eval/step_configs/config_celeba_sharing.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Celeba 3 | dataset_class = Celeba 4 | model_class = CNN 5 | images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba 6 | train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 4 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.Sharing 33 | sharing_class = Sharing 34 | -------------------------------------------------------------------------------- /eval/step_configs/config_shakespeare_subsampling.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Shakespeare 3 | dataset_class = Shakespeare 4 | random_seed = 97 5 | model_class = LSTM 6 | train_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.1 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 10 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package 
= torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.SubSampling 33 | sharing_class = SubSampling 34 | alpha = 0.1 35 | -------------------------------------------------------------------------------- /eval/step_configs/config_shakespeare_partialmodel.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Shakespeare 3 | dataset_class = Shakespeare 4 | random_seed = 97 5 | model_class = LSTM 6 | train_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.1 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 10 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.PartialModel 33 | sharing_class = PartialModel 34 | alpha = 0.1 35 | -------------------------------------------------------------------------------- /eval/step_configs/config_femnist_subsampling.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Femnist 3 | dataset_class = Femnist 4 | random_seed = 97 5 | model_class = CNN 6 | train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | # There are 734463 femnist samples 17 | [TRAIN_PARAMS] 18 | training_package = decentralizepy.training.Training 19 | training_class = Training 20 | rounds = 47 21 | full_epochs = False 22 | batch_size = 16 23 | shuffle = True 24 | loss_package = torch.nn 25 | loss_class = CrossEntropyLoss 26 | 27 | [COMMUNICATION] 28 | comm_package = decentralizepy.communication.TCP 29 | comm_class = TCP 30 | addresses_filepath = ip_addr_6Machines.json 31 | 32 | [SHARING] 33 | sharing_package = decentralizepy.sharing.SubSampling 34 | sharing_class = SubSampling 35 | alpha = 0.1 36 | -------------------------------------------------------------------------------- /eval/step_configs/config_celeba_partialmodel.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Celeba 3 | dataset_class = Celeba 4 | model_class = CNN 5 | images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba 6 | train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | 
training_class = Training 19 | rounds = 4 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.PartialModel 33 | sharing_class = PartialModel 34 | alpha = 0.1 -------------------------------------------------------------------------------- /eval/step_configs/config_celeba_subsampling.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Celeba 3 | dataset_class = Celeba 4 | model_class = CNN 5 | images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba 6 | train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 4 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.SubSampling 33 | sharing_class = SubSampling 34 | alpha = 0.1 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2022] [DecentralizePy] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /src/decentralizepy/datasets/Data.py: -------------------------------------------------------------------------------- 1 | class Data: 2 | """ 3 | This class defines the API for Data. 
4 | 5 | """ 6 | 7 | def __init__(self, x, y): 8 | """ 9 | Constructor 10 | 11 | Parameters 12 | ---------- 13 | x : numpy array 14 | A numpy array of data samples 15 | y : numpy array 16 | A numpy array of outputs corresponding to the sample 17 | 18 | """ 19 | self.x = x 20 | self.y = y 21 | 22 | def __len__(self): 23 | """ 24 | Return the number of samples in the dataset 25 | 26 | Returns 27 | ------- 28 | int 29 | Number of samples 30 | 31 | """ 32 | return self.y.shape[0] 33 | 34 | def __getitem__(self, i): 35 | """ 36 | Function to get the item with index i. 37 | 38 | Parameters 39 | ---------- 40 | i : int 41 | Index 42 | 43 | Returns 44 | ------- 45 | 2-tuple 46 | A tuple of the ith data sample and it's corresponding label 47 | 48 | """ 49 | return self.x[i], self.y[i] 50 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/SmallWorld.py: -------------------------------------------------------------------------------- 1 | import smallworld 2 | 3 | from decentralizepy.graphs.Graph import Graph 4 | 5 | 6 | class SmallWorld(Graph): 7 | """ 8 | The class for generating a SmallWorld topology Graph 9 | 10 | Adapted from https://gitlab.epfl.ch/sacs/ml-rawdatasharing/dnn-recommender/-/blob/master/topologies.py 11 | 12 | """ 13 | 14 | def __init__(self, n_procs, k_over_2, beta): 15 | """ 16 | Constructor. Generates a random connected SmallWorld graph 17 | 18 | Parameters 19 | ---------- 20 | n_procs : int 21 | total number of nodes in the graph 22 | k_over_2 : int 23 | k_over_2 config for smallworld 24 | beta : float 25 | beta config for smallworld. β = 1 is truly equal to the Erdős-Rényi network model 26 | 27 | """ 28 | super().__init__(n_procs) 29 | G = smallworld.get_smallworld_graph(self.n_procs, k_over_2, beta) 30 | for edge in list(G.edges): 31 | node1 = edge[0] 32 | node2 = edge[1] 33 | self.adj_list[node1].add(node2) 34 | self.adj_list[node2].add(node1) 35 | 36 | self.connect_graph() 37 | -------------------------------------------------------------------------------- /tutorial/run_decentralized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decpy_path=../eval # Path to eval folder 4 | graph=regular_16.txt # Absolute path of the graph file generated using the generate_graph.py script 5 | run_path=../eval/data # Path to the folder where the graph and config file will be copied and the results will be stored 6 | config_file=config.ini 7 | cp $graph $config_file $run_path 8 | 9 | env_python=~/miniconda3/envs/decpy/bin/python3 # Path to python executable of the environment | conda recommended 10 | machines=1 # number of machines in the runtime 11 | iterations=80 12 | test_after=20 13 | eval_file=$decpy_path/testing.py # decentralized driver code (run on each machine) 14 | log_level=INFO # DEBUG | INFO | WARN | CRITICAL 15 | 16 | m=0 # machine id corresponding consistent with ip.json 17 | echo M is $m 18 | 19 | procs_per_machine=16 # 16 processes on 1 machine 20 | echo procs per machine is $procs_per_machine 21 | 22 | log_dir=$run_path/$(date '+%Y-%m-%dT%H:%M')/machine$m # in the eval folder 23 | mkdir -p $log_dir 24 | 25 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $run_path/$graph -ta $test_after -cf $run_path/$config_file -ll $log_level -wsd $log_dir -------------------------------------------------------------------------------- /tutorial/JWINS/run_decentralized.sh: 
/src/decentralizepy/graphs/SmallWorld.py: -------------------------------------------------------------------------------- 1 | import smallworld 2 | 3 | from decentralizepy.graphs.Graph import Graph 4 | 5 | 6 | class SmallWorld(Graph): 7 | """ 8 | The class for generating a SmallWorld topology Graph 9 | 10 | Adapted from https://gitlab.epfl.ch/sacs/ml-rawdatasharing/dnn-recommender/-/blob/master/topologies.py 11 | 12 | """ 13 | 14 | def __init__(self, n_procs, k_over_2, beta): 15 | """ 16 | Constructor. Generates a random connected SmallWorld graph 17 | 18 | Parameters 19 | ---------- 20 | n_procs : int 21 | total number of nodes in the graph 22 | k_over_2 : int 23 | k_over_2 config for smallworld 24 | beta : float 25 | beta config for smallworld. With β = 1 the graph approaches the Erdős-Rényi random graph model 26 | 27 | """ 28 | super().__init__(n_procs) 29 | G = smallworld.get_smallworld_graph(self.n_procs, k_over_2, beta) 30 | for edge in list(G.edges): 31 | node1 = edge[0] 32 | node2 = edge[1] 33 | self.adj_list[node1].add(node2) 34 | self.adj_list[node2].add(node1) 35 | 36 | self.connect_graph() 37 | -------------------------------------------------------------------------------- /tutorial/run_decentralized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decpy_path=../eval # Path to eval folder 4 | graph=regular_16.txt # Path of the graph file generated using the generate_graph.py script 5 | run_path=../eval/data # Path to the folder where the graph and config file will be copied and the results will be stored 6 | config_file=config.ini 7 | cp $graph $config_file $run_path 8 | 9 | env_python=~/miniconda3/envs/decpy/bin/python3 # Path to python executable of the environment | conda recommended 10 | machines=1 # number of machines in the runtime 11 | iterations=80 12 | test_after=20 13 | eval_file=$decpy_path/testing.py # decentralized driver code (run on each machine) 14 | log_level=INFO # DEBUG | INFO | WARN | CRITICAL 15 | 16 | m=0 # machine id consistent with ip.json 17 | echo M is $m 18 | 19 | procs_per_machine=16 # 16 processes on 1 machine 20 | echo procs per machine is $procs_per_machine 21 | 22 | log_dir=$run_path/$(date '+%Y-%m-%dT%H:%M')/machine$m # in the eval folder 23 | mkdir -p $log_dir 24 | 25 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $run_path/$graph -ta $test_after -cf $run_path/$config_file -ll $log_level -wsd $log_dir -------------------------------------------------------------------------------- /tutorial/JWINS/run_decentralized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decpy_path=../../eval # Path to eval folder 4 | graph=regular_16.txt # Path of the graph file generated using the generate_graph.py script 5 | run_path=../../eval/data # Path to the folder where the graph and config file will be copied and the results will be stored 6 | config_file=config.ini 7 | cp $graph $config_file $run_path 8 | 9 | env_python=~/miniconda3/envs/decpy/bin/python3 # Path to python executable of the environment | conda recommended 10 | machines=1 # number of machines in the runtime 11 | iterations=80 12 | test_after=20 13 | eval_file=$decpy_path/testing.py # decentralized driver code (run on each machine) 14 | log_level=INFO # DEBUG | INFO | WARN | CRITICAL 15 | 16 | m=0 # machine id consistent with ip.json 17 | echo M is $m 18 | 19 | procs_per_machine=16 # 16 processes on 1 machine 20 | echo procs per machine is $procs_per_machine 21 | 22 | log_dir=$run_path/$(date '+%Y-%m-%dT%H:%M')/machine$m # in the eval folder 23 | mkdir -p $log_dir 24 | 25 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $run_path/$graph -ta $test_after -cf $run_path/$config_file -ll $log_level -wsd $log_dir -------------------------------------------------------------------------------- /eval/step_configs/config_movielens_jwins.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.MovieLens 3 | dataset_class = MovieLens 4 | model_class = MatrixFactorization 5 | train_dir = /mnt/nfs/shared/leaf/data/movielens 6 | test_dir = /mnt/nfs/shared/leaf/data/movielens 7 | ; python list of fractions below 8 | sizes = 9 | 10 | [OPTIMIZER_PARAMS] 11 | optimizer_package = torch.optim 12 | optimizer_class = SGD 13 | lr = 0.1 14 | 15 | [TRAIN_PARAMS] 16 | training_package = decentralizepy.training.Training 17 | training_class = Training 18 | rounds = 10 19 | full_epochs = False 20 | batch_size = 16 21 | shuffle = True 22 | loss_package = torch.nn 23 | loss_class = MSELoss 24 | 25 | [COMMUNICATION] 26 | comm_package = decentralizepy.communication.TCP 27 | comm_class = TCP 28 | addresses_filepath = ip_addr_6Machines.json 29 | 30 | [SHARING] 31 | sharing_package = decentralizepy.sharing.JWINS.JWINS 32 | sharing_class = JWINS 33 | change_based_selection = True 34 | alpha_list = [0.1,0.15,0.2,0.25,0.3,0.4,1.0] 35 | wavelet=sym2 36 | level= 4 37 | accumulation = True 38 | accumulate_averaging_changes = True 39 | metadata_cap = 0.5 40 | compression_package = decentralizepy.compression.EliasFpzip 41 | compression_class = EliasFpzip 42 | compress = True 43 | --------------------------------------------------------------------------------
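The JWINS sharing configured above communicates a sparsified wavelet decomposition of the model update: alpha_list gives the candidate fractions of coefficients to keep, while wavelet and level select the transform. A toy sketch of that idea with PyWavelets; this is illustrative only, not the JWINS implementation, and topk_wavelet is a made-up helper name:

import numpy as np
import pywt

def topk_wavelet(update, alpha, wavelet="sym2", level=4):
    # decompose, then keep only the alpha fraction of coefficients largest in magnitude
    coeffs = pywt.wavedec(update, wavelet, level=level)
    flat, slices = pywt.coeffs_to_array(coeffs)
    k = max(1, int(alpha * flat.size))
    threshold = np.partition(np.abs(flat), flat.size - k)[flat.size - k]
    flat[np.abs(flat) < threshold] = 0.0
    return pywt.waverec(pywt.array_to_coeffs(flat, slices, output_format="wavedec"), wavelet)

approximation = topk_wavelet(np.random.randn(256), alpha=0.2)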
/tutorial/EpidemicLearning/run_el-local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decpy_path=../../eval # Path to eval folder 4 | graph=fullyConnected_16.edges # Path of the graph file generated using the generate_graph.py script 5 | run_path=../../eval/data # Path to the folder where the graph and config file will be copied and the results will be stored 6 | config_file=config_EL.ini 7 | cp $graph $config_file $run_path 8 | 9 | env_python=~/miniconda3/envs/decpy/bin/python3 # Path to python executable of the environment | conda recommended 10 | machines=1 # number of machines in the runtime 11 | iterations=80 12 | test_after=20 13 | eval_file=testingEL_Local.py # decentralized driver code (run on each machine) 14 | log_level=INFO # DEBUG | INFO | WARN | CRITICAL 15 | 16 | m=0 # machine id consistent with ip.json 17 | echo M is $m 18 | 19 | procs_per_machine=16 # 16 processes on 1 machine 20 | echo procs per machine is $procs_per_machine 21 | 22 | log_dir=$run_path/$(date '+%Y-%m-%dT%H:%M')/machine$m # in the eval folder 23 | mkdir -p $log_dir 24 | 25 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $run_path/$graph -ta $test_after -cf $run_path/$config_file -ll $log_level -wsd $log_dir -------------------------------------------------------------------------------- /tutorial/EpidemicLearning/run_el-oracle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decpy_path=../../eval # Path to eval folder 4 | graph=fullyConnected_16.edges # Path of the graph file generated using the generate_graph.py script 5 | run_path=../../eval/data # Path to the folder where the graph and config file will be copied and the results will be stored 6 | config_file=config_EL.ini 7 | cp $graph $config_file $run_path 8 | 9 | env_python=~/miniconda3/envs/decpy/bin/python3 # Path to python executable of the environment | conda recommended 10 | machines=1 # number of machines in the runtime 11 | iterations=80 12 | test_after=20 13 | eval_file=testingEL_Oracle.py # decentralized driver code (run on each machine) 14 | log_level=INFO # DEBUG | INFO | WARN | CRITICAL 15 | 16 | m=0 # machine id consistent with ip.json 17 | echo M is $m 18 | 19 | procs_per_machine=16 # 16 processes on 1 machine 20 | echo procs per machine is $procs_per_machine 21 | 22 | log_dir=$run_path/$(date '+%Y-%m-%dT%H:%M')/machine$m # in the eval folder 23 | mkdir -p $log_dir 24 | 25 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $run_path/$graph -ta $test_after -cf $run_path/$config_file -ll $log_level -wsd $log_dir -------------------------------------------------------------------------------- /eval/step_configs/config_cifar_jwins.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | train_dir = /mnt/nfs/shared/CIFAR 6 | test_dir = /mnt/nfs/shared/CIFAR 7 | ; python list of fractions below 8 | sizes = 9 | random_seed = 99 10 | partition_niid = dirichlet 11 | alpha = 1 12 | 13 | [OPTIMIZER_PARAMS] 14 | optimizer_package = torch.optim 15 | optimizer_class = SGD 16 | lr = 0.01 17 | 18 | [TRAIN_PARAMS] 19 | training_package = decentralizepy.training.Training 20 | training_class = Training 21 | rounds = 3 22 | full_epochs = False 23 | batch_size = 8 24 | shuffle = True 25 | loss_package = torch.nn 26 | loss_class = CrossEntropyLoss 27 | 28 | [COMMUNICATION] 29 | comm_package = decentralizepy.communication.TCP 30 | comm_class = TCP 31 | addresses_filepath = ip_addr_6Machines.json 32 | 33 | [SHARING] 34 | sharing_package = decentralizepy.sharing.JWINS.JWINS 35 | sharing_class = JWINS 36 | change_based_selection = True 37 | alpha_list = [0.1,0.15,0.2,0.25,0.3,0.4,1.0] 38 | wavelet=sym2 39 | level= 4 40 | accumulation = True 41 | accumulate_averaging_changes = True 42 | metadata_cap = 0.5 43 | compression_package = decentralizepy.compression.EliasFpzip 44 | 
compression_class = EliasFpzip 45 | compress = True 46 | -------------------------------------------------------------------------------- /tutorial/run_federated.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | decpy_path=../eval # Path to eval folder 4 | graph=regular_16.txt # Path of the graph file 5 | run_path=../eval/data # Path to the folder where the graph and config file will be copied and the results will be stored 6 | config_file=config.ini 7 | cp $graph $config_file $run_path 8 | 9 | env_python=~/miniconda3/envs/decpy/bin/python3 # Path to python executable of the environment | conda recommended 10 | machines=1 # number of machines in the runtime 11 | iterations=80 12 | test_after=20 13 | eval_file=$decpy_path/testingFederated.py # decentralized driver code (run on each machine) 14 | log_level=INFO # DEBUG | INFO | WARN | CRITICAL 15 | 16 | server_rank=-1 17 | server_machine=0 18 | working_rate=0.5 19 | 20 | m=0 # machine id consistent with ip.json 21 | echo M is $m 22 | 23 | procs_per_machine=16 # 16 processes on 1 machine 24 | echo procs per machine is $procs_per_machine 25 | 26 | log_dir=$run_path/$(date '+%Y-%m-%dT%H:%M')/machine$m # in the eval folder 27 | mkdir -p $log_dir 28 | 29 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $run_path/$graph -ta $test_after -cf $run_path/$config_file -ll $log_level -sm $server_machine -sr $server_rank -wr $working_rate -------------------------------------------------------------------------------- /eval/step_configs/config_shakespeare_jwins.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Shakespeare 3 | dataset_class = Shakespeare 4 | random_seed = 97 5 | model_class = LSTM 6 | train_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/shakespeare_sub96/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.1 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 10 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.JWINS.JWINS 33 | sharing_class = JWINS 34 | change_based_selection = True 35 | alpha_list = [0.1,0.15,0.2,0.25,0.3,0.4,1.0] 36 | wavelet=sym2 37 | level= 4 38 | accumulation = True 39 | accumulate_averaging_changes = True 40 | metadata_cap = 0.5 41 | compression_package = decentralizepy.compression.EliasFpzip 42 | compression_class = EliasFpzip 43 | compress = True 44 | -------------------------------------------------------------------------------- /eval/step_configs/config_femnist_jwins.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Femnist 3 | dataset_class = Femnist 4 | random_seed = 97 5 | model_class = CNN 6 | train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test 8 | ; 
python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | # There are 734463 femnist samples 17 | [TRAIN_PARAMS] 18 | training_package = decentralizepy.training.Training 19 | training_class = Training 20 | rounds = 47 21 | full_epochs = False 22 | batch_size = 16 23 | shuffle = True 24 | loss_package = torch.nn 25 | loss_class = CrossEntropyLoss 26 | 27 | [COMMUNICATION] 28 | comm_package = decentralizepy.communication.TCP 29 | comm_class = TCP 30 | addresses_filepath = ip_addr_6Machines.json 31 | 32 | [SHARING] 33 | sharing_package = decentralizepy.sharing.JWINS.JWINS 34 | sharing_class = JWINS 35 | change_based_selection = True 36 | alpha_list = [0.1,0.15,0.2,0.25,0.3,0.4,1.0] 37 | wavelet=sym2 38 | level= 4 39 | accumulation = True 40 | accumulate_averaging_changes = True 41 | metadata_cap = 0.5 42 | compression_package = decentralizepy.compression.EliasFpzip 43 | compression_class = EliasFpzip 44 | compress = True 45 | -------------------------------------------------------------------------------- /eval/step_configs/config_celeba_jwins.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.Celeba 3 | dataset_class = Celeba 4 | model_class = CNN 5 | images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba 6 | train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train 7 | test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test 8 | ; python list of fractions below 9 | sizes = 10 | 11 | [OPTIMIZER_PARAMS] 12 | optimizer_package = torch.optim 13 | optimizer_class = SGD 14 | lr = 0.001 15 | 16 | [TRAIN_PARAMS] 17 | training_package = decentralizepy.training.Training 18 | training_class = Training 19 | rounds = 4 20 | full_epochs = False 21 | batch_size = 16 22 | shuffle = True 23 | loss_package = torch.nn 24 | loss_class = CrossEntropyLoss 25 | 26 | [COMMUNICATION] 27 | comm_package = decentralizepy.communication.TCP 28 | comm_class = TCP 29 | addresses_filepath = ip_addr_6Machines.json 30 | 31 | [SHARING] 32 | sharing_package = decentralizepy.sharing.JWINS.JWINS 33 | sharing_class = JWINS 34 | change_based_selection = True 35 | alpha_list = [0.1,0.15,0.2,0.25,0.3,0.4,1.0] 36 | wavelet=sym2 37 | level= 4 38 | accumulation = True 39 | accumulate_averaging_changes = True 40 | metadata_cap = 0.5 41 | compression_package = decentralizepy.compression.EliasFpzip 42 | compression_class = EliasFpzip 43 | compress = True 44 | 45 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/EliasFpzip.py: -------------------------------------------------------------------------------- 1 | # elias implementation: taken from this stack overflow post: 2 | # https://stackoverflow.com/questions/62843156/python-fast-compression-of-large-amount-of-numbers-with-elias-gamma 3 | import fpzip 4 | 5 | from decentralizepy.compression.Elias import Elias 6 | 7 | 8 | class EliasFpzip(Elias): 9 | """ 10 | Compression API 11 | 12 | """ 13 | 14 | def __init__(self, *args, **kwargs): 15 | """ 16 | Constructor 17 | """ 18 | 19 | def compress_float(self, arr): 20 | """ 21 | compression function for float arrays 22 | 23 | Parameters 24 | ---------- 25 | arr : np.ndarray 26 | Data to compress 27 | 28 | Returns 29 | ------- 30 | bytearray 31 | encoded data as bytes 32 | 33 | """ 34 | return fpzip.compress(arr, precision=0, order="C") 35 | 36 | def 
decompress_float(self, bytes): 37 | """ 38 | decompression function for compressed float arrays 39 | 40 | Parameters 41 | ---------- 42 | bytes :bytearray 43 | compressed data 44 | 45 | Returns 46 | ------- 47 | arr : np.ndarray 48 | decompressed data as array 49 | 50 | """ 51 | return fpzip.decompress(bytes, order="C").squeeze() 52 | -------------------------------------------------------------------------------- /src/decentralizepy/datasets/text/LLMData.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from decentralizepy.datasets.Data import Data 4 | 5 | 6 | class LLMData(Data): 7 | """ 8 | This class defines the API for Data. 9 | 10 | """ 11 | 12 | def __init__(self, x, y): 13 | """ 14 | Constructor 15 | 16 | Parameters 17 | ---------- 18 | x : numpy array 19 | A numpy array of data samples 20 | y : numpy array 21 | A numpy array of outputs corresponding to the sample 22 | 23 | """ 24 | self.x = x 25 | self.y = y 26 | 27 | def __len__(self): 28 | """ 29 | Return the number of samples in the dataset 30 | 31 | Returns 32 | ------- 33 | int 34 | Number of samples 35 | 36 | """ 37 | return len(self.y) 38 | 39 | def __getitem__(self, idx): 40 | """ 41 | Function to get the item with index i. 42 | 43 | Parameters 44 | ---------- 45 | idx : int 46 | Index 47 | 48 | Returns 49 | ------- 50 | dict 51 | A dict of the ith data sample, its attention_mask and label 52 | 53 | """ 54 | item = {key: torch.tensor(val[idx]) for key, val in self.x.items()} 55 | item["labels"] = torch.tensor(self.y[idx]) 56 | return item 57 | -------------------------------------------------------------------------------- /tutorial/EpidemicLearning/config_EL.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | ; provide directory containing "cifar-10-batches-py" folder | Pre-download recommended 6 | ; New download does not work with multiple processes | Crashes the first time, just retry 7 | train_dir = ../../eval/data/ 8 | test_dir = ../../eval/data/ 9 | ; python list of fractions below 10 | sizes = 11 | random_seed = 90 12 | partition_niid = dirichlet 13 | alpha = 0.1 ; alpha (dirichlet parameter) 14 | 15 | [OPTIMIZER_PARAMS] 16 | optimizer_package = torch.optim 17 | optimizer_class = SGD 18 | lr = 0.05 ; gamma 19 | 20 | [TRAIN_PARAMS] 21 | training_package = decentralizepy.training.Training 22 | training_class = Training 23 | rounds = 10 ; r 24 | full_epochs = False 25 | batch_size = 5 ; b 26 | shuffle = True 27 | loss_package = torch.nn 28 | loss_class = CrossEntropyLoss 29 | 30 | [COMMUNICATION] 31 | comm_package = decentralizepy.communication.TCP 32 | comm_class = TCP 33 | addresses_filepath = ip.json 34 | 35 | [SHARING] 36 | sharing_package = decentralizepy.sharing.PlainAverageSharing ; Does not use Metropolis-Hastings 37 | sharing_class = PlainAverageSharing 38 | compress = False 39 | 40 | [NODE] 41 | graph_degree = 7 ; s (number of neighbors in EL-Oracle and number of random neighbors picked to send message to in EL-Local) -------------------------------------------------------------------------------- /tutorial/config.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | ; provide directory containing "cifar-10-batches-py" folder | Pre-download 
recommended 6 | ; New download does not work with multiple processes | Crashes the first time, just retry 7 | train_dir = ../eval/data/ 8 | test_dir = ../eval/data/ 9 | ; python list of fractions below 10 | sizes = 11 | random_seed = 90 12 | partition_niid = False 13 | shards = 4 14 | validation_source = Test 15 | ; Train or Test: the set from which the validation set is extracted (only on CIFAR-10 and FEMNIST) 16 | ; On FEMNIST, if the validation set is extracted from the test set, it is the same for all clients 17 | validation_size = 0.1 18 | ; fraction of the train or test set used as the validation set; implemented only for the CIFAR-10 and FEMNIST datasets 19 | 20 | 21 | [OPTIMIZER_PARAMS] 22 | optimizer_package = torch.optim 23 | optimizer_class = SGD 24 | lr = 0.01 25 | 26 | [TRAIN_PARAMS] 27 | training_package = decentralizepy.training.Training 28 | training_class = Training 29 | rounds = 3 30 | full_epochs = False 31 | batch_size = 8 32 | shuffle = True 33 | loss_package = torch.nn 34 | loss_class = CrossEntropyLoss 35 | 36 | [COMMUNICATION] 37 | comm_package = decentralizepy.communication.TCP 38 | comm_class = TCP 39 | addresses_filepath = ../tutorial/ip.json 40 | 41 | [SHARING] 42 | sharing_package = decentralizepy.sharing.Sharing 43 | sharing_class = Sharing 44 | -------------------------------------------------------------------------------- /eval/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | script_path=$(realpath $(dirname $0)) 3 | 4 | # Working directory, where config files are read from and logs are written. 5 | decpy_path=/mnt/nfs/$(whoami)/decpy_workingdir 6 | cd $decpy_path 7 | 8 | # Python interpreter 9 | env_python=python3 10 | 11 | # File regular_16.txt is available in /tutorial 12 | graph=$decpy_path/regular_16.txt 13 | 14 | # File config_celeba_sharing.ini is available in /tutorial 15 | # In this config file, change addresses_filepath to correspond to your list of machines (example in /tutorial/ip.json) 16 | original_config=$decpy_path/config_celeba_sharing.ini 17 | 18 | # Local config file 19 | config_file=/tmp/$(basename $original_config) 20 | 21 | # Python script to be executed 22 | eval_file=$script_path/testingPeerSampler.py 23 | 24 | # General parameters 25 | procs_per_machine=8 26 | machines=2 27 | iterations=5 28 | test_after=2 29 | log_level=INFO 30 | 31 | m=`cat $(grep addresses_filepath $original_config | awk '{print $3}') | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` 32 | echo M is $m 33 | log_dir=$(date '+%Y-%m-%dT%H:%M')/machine$m 34 | mkdir -p $log_dir 35 | 36 | # Copy and manipulate the local config file 37 | cp $original_config $config_file 38 | # echo "alpha = 0.10" >> $config_file 39 | 40 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level -wsd $log_dir 41 | -------------------------------------------------------------------------------- /tutorial/JWINS/config.ini: -------------------------------------------------------------------------------- 1 | [DATASET] 2 | dataset_package = decentralizepy.datasets.CIFAR10 3 | dataset_class = CIFAR10 4 | model_class = LeNet 5 | ; provide directory containing "cifar-10-batches-py" folder | Pre-download recommended 6 | ; New download does not work with multiple processes | Crashes the first time, just retry 7 | train_dir = ../../eval/data/ 8 | test_dir = ../../eval/data/ 9 | ; python list of fractions below 10
| sizes = 11 | random_seed = 90 12 | partition_niid = kshard 13 | shards = 4 14 | 15 | [OPTIMIZER_PARAMS] 16 | optimizer_package = torch.optim 17 | optimizer_class = SGD 18 | lr = 0.01 19 | 20 | [TRAIN_PARAMS] 21 | training_package = decentralizepy.training.Training 22 | training_class = Training 23 | rounds = 3 24 | full_epochs = False 25 | batch_size = 8 26 | shuffle = True 27 | loss_package = torch.nn 28 | loss_class = CrossEntropyLoss 29 | 30 | [COMMUNICATION] 31 | comm_package = decentralizepy.communication.TCP 32 | comm_class = TCP 33 | addresses_filepath = ip.json 34 | 35 | ; [SHARING] 36 | ; sharing_package = decentralizepy.sharing.Sharing 37 | ; sharing_class = Sharing 38 | 39 | [SHARING] 40 | sharing_package = decentralizepy.sharing.JWINS.JWINS 41 | sharing_class = JWINS 42 | change_based_selection = True 43 | alpha_list = [0.1,0.15,0.2,0.25,0.3,0.4,1.0] 44 | wavelet=sym2 45 | level= 4 46 | accumulation = True 47 | accumulate_averaging_changes = True 48 | metadata_cap = 0.5 49 | compression_package = decentralizepy.compression.EliasFpzip 50 | compression_class = EliasFpzip 51 | compress = True 52 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/EliasFpzipLossy.py: -------------------------------------------------------------------------------- 1 | # elias implementation: taken from this stack overflow post: 2 | # https://stackoverflow.com/questions/62843156/python-fast-compression-of-large-amount-of-numbers-with-elias-gamma 3 | import fpzip 4 | 5 | from decentralizepy.compression.Elias import Elias 6 | 7 | 8 | class EliasFpzipLossy(Elias): 9 | """ 10 | Compression API 11 | 12 | """ 13 | 14 | def __init__(self, float_precision=16, *args, **kwargs): 15 | """ 16 | Constructor 17 | 18 | Parameters 19 | ---------- 20 | float_precision : int, optional 21 | Precision of the compression, by default 16 22 | 23 | """ 24 | self.float_precision = float_precision 25 | 26 | def compress_float(self, arr): 27 | """ 28 | compression function for float arrays 29 | 30 | Parameters 31 | ---------- 32 | arr : np.ndarray 33 | Data to compress 34 | 35 | Returns 36 | ------- 37 | bytearray 38 | encoded data as bytes 39 | 40 | """ 41 | return fpzip.compress(arr, precision=self.float_precision, order="C") 42 | 43 | def decompress_float(self, bytes): 44 | """ 45 | decompression function for compressed float arrays 46 | 47 | Parameters 48 | ---------- 49 | bytes :bytearray 50 | compressed data 51 | 52 | Returns 53 | ------- 54 | arr : np.ndarray 55 | decompressed data as array 56 | 57 | """ 58 | return fpzip.decompress(bytes, order="C").squeeze() 59 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/Compression.py: -------------------------------------------------------------------------------- 1 | class Compression: 2 | """ 3 | Compression API 4 | 5 | """ 6 | 7 | def __init__(self, *args, **kwargs): 8 | """ 9 | Constructor 10 | """ 11 | 12 | def compress(self, arr): 13 | """ 14 | compression function 15 | 16 | Parameters 17 | ---------- 18 | arr : np.ndarray 19 | Data to compress 20 | 21 | Returns 22 | ------- 23 | bytearray 24 | encoded data as bytes 25 | 26 | """ 27 | return arr 28 | 29 | def decompress(self, bytes): 30 | """ 31 | decompression function 32 | 33 | Parameters 34 | ---------- 35 | bytes :bytearray 36 | compressed data 37 | 38 | Returns 39 | ------- 40 | arr : np.ndarray 41 | decompressed data as array 42 | 43 | """ 44 | return bytes 45 | 46 | def 
compress_float(self, arr): 47 | """ 48 | compression function for float arrays 49 | 50 | Parameters 51 | ---------- 52 | arr : np.ndarray 53 | Data to compress 54 | 55 | Returns 56 | ------- 57 | bytearray 58 | encoded data as bytes 59 | 60 | """ 61 | return arr 62 | 63 | def decompress_float(self, bytes): 64 | """ 65 | decompression function for compressed float arrays 66 | 67 | Parameters 68 | ---------- 69 | bytes :bytearray 70 | compressed data 71 | 72 | Returns 73 | ------- 74 | arr : np.ndarray 75 | decompressed data as array 76 | 77 | """ 78 | return bytes 79 | -------------------------------------------------------------------------------- /eval/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nfs_home=$1 3 | python_bin=$2 4 | decpy_path=$nfs_home/decentralizepy/eval 5 | cd $decpy_path 6 | 7 | env_python=$python_bin/python3 8 | graph=96_regular.edges #4_node_fullyConnected.edges 9 | config_file=~/tmp/config.ini 10 | procs_per_machine=16 11 | machines=6 12 | iterations=5 13 | train_evaluate_after=5 14 | test_after=21 # we do not test 15 | eval_file=testing.py 16 | log_level=INFO 17 | 18 | ip_machines=$nfs_home/configs/ip_addr_6Machines.json 19 | 20 | m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` 21 | export PYTHONFAULTHANDLER=1 22 | tests=("step_configs/config_celeba_partialmodel.ini" "step_configs/config_celeba_sharing.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini" 23 | "step_configs/config_celeba_grow.ini" "step_configs/config_celeba_manualadapt.ini" "step_configs/config_celeba_randomalpha.ini" 24 | "step_configs/config_celeba_randomalphainc.ini" "step_configs/config_celeba_roundrobin.ini" "step_configs/config_celeba_subsampling.ini" 25 | "step_configs/config_celeba_topkrandom.ini" "step_configs/config_celeba_topkacc.ini" "step_configs/config_celeba_topkparam.ini") 26 | 27 | for i in "${tests[@]}" 28 | do 29 | echo $i 30 | IFS='_' read -ra NAMES <<< $i 31 | IFS='.' 
read -ra NAME <<< ${NAMES[-1]} 32 | log_dir=$nfs_home/logs/testing/${NAME[0]}$(date '+%Y-%m-%dT%H:%M')/machine$m 33 | mkdir -p $log_dir 34 | cp $i $config_file 35 | $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines 36 | $env_python $eval_file -ro 0 -tea $train_evaluate_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level 37 | echo $i is done 38 | sleep 3 39 | echo end of sleep 40 | done 41 | -------------------------------------------------------------------------------- /eval/plot_shared.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | 9 | 10 | def plot(x, y, label, *args): 11 | plt.plot(x, y, *args, label=label) 12 | plt.legend() 13 | 14 | 15 | def plot_shared(path, title): 16 | model_path = os.path.join(path, "plots") 17 | Path(model_path).mkdir(parents=True, exist_ok=True) 18 | files = [f for f in os.listdir(path) if f.endswith("json")] 19 | assert len(files) > 0 20 | for i, file in enumerate(files): 21 | filepath = os.path.join(path, file) 22 | with open(filepath, "r") as inf: 23 | model_vec = json.load(inf) 24 | del model_vec["order"] 25 | if i == 0: 26 | total_params = 0 27 | for l in model_vec["shapes"].values(): 28 | current_params = 1 29 | for v in l: 30 | current_params *= v 31 | total_params += current_params 32 | print("Total Params: ", str(total_params)) 33 | shared_count = np.zeros(total_params, dtype=int) 34 | del model_vec["shapes"] 35 | model_vec = np.array(model_vec[list(model_vec.keys())[0]]) 36 | shared_count[model_vec] += 1 37 | print("sum: ", np.sum(shared_count)) 38 | num_elements = shared_count.shape[0] 39 | x_axis = np.arange(1, num_elements + 1) 40 | plt.clf() 41 | plt.title(title) 42 | plot(x_axis, shared_count, "unsorted", ".") 43 | shared_count = np.sort(shared_count) 44 | plot(x_axis, shared_count, "sorted") 45 | plt.savefig(os.path.join(model_path, "shared_plot.png")) 46 | 47 | 48 | if __name__ == "__main__": 49 | assert len(sys.argv) == 2 50 | plot_shared(sys.argv[1], "Shared Parameters") 51 | -------------------------------------------------------------------------------- /src/decentralizepy/mappings/Mapping.py: -------------------------------------------------------------------------------- 1 | class Mapping: 2 | """ 3 | This class defines the bidirectional mapping between: 4 | 1. The unique identifier 5 | 2. 
machine_id and rank 6 | 7 | """ 8 | 9 | def __init__(self, n_procs): 10 | """ 11 | Constructor 12 | 13 | Parameters 14 | ---------- 15 | n_procs : int 16 | Total number of processes 17 | 18 | """ 19 | self.n_procs = n_procs 20 | 21 | def get_n_procs(self): 22 | """ 23 | Gives the global sum of all processes that are spawned on the machines 24 | 25 | Returns 26 | ------- 27 | int 28 | the number of global processes 29 | 30 | """ 31 | 32 | return self.n_procs 33 | 34 | def get_uid(self, rank: int, machine_id: int): 35 | """ 36 | Gives the global unique identifier of the node 37 | 38 | Parameters 39 | ---------- 40 | rank : int 41 | Node's rank on its machine 42 | machine_id : int 43 | node's machine in the cluster 44 | 45 | Returns 46 | ------- 47 | int 48 | the unique identifier 49 | 50 | """ 51 | 52 | raise NotImplementedError 53 | 54 | def get_machine_and_rank(self, uid: int): 55 | """ 56 | Gives the rank and machine_id of the node 57 | 58 | Parameters 59 | ---------- 60 | uid : int 61 | globally unique identifier of the node 62 | 63 | Returns 64 | ------- 65 | 2-tuple 66 | a tuple of rank and machine_id 67 | 68 | """ 69 | 70 | raise NotImplementedError 71 | 72 | def get_local_procs_count(self): 73 | """ 74 | Gives number of processes that run on the node 75 | 76 | Returns 77 | ------- 78 | int 79 | the number of local processes 80 | 81 | """ 82 | 83 | raise NotImplementedError 84 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = decentralizepy 3 | version = 0.1.dev0 4 | author = Rishi Sharma 5 | author_email = rishi.sharma@epfl.ch 6 | license = MIT 7 | description = A framework to write decentralized machine learning applications 8 | keywords = 9 | python 10 | decentralized 11 | ml 12 | learning 13 | sacs 14 | url = https://rishisharma.netlify.app 15 | download_url = https://gitlab.epfl.ch/risharma/decentralizepy 16 | long_description = file: README.rst 17 | classifiers = 18 | Development Status :: 4 - Beta 19 | Intended Audience :: Education 20 | Intended Audience :: Science/Research 21 | License :: OSI Approved :: MIT License 22 | Operating System :: OS Independent 23 | Programming Language :: Python 24 | Programming Language :: Python :: 3 25 | Programming Language :: Python :: 3.6 26 | Programming Language :: Python :: 3.7 27 | Programming Language :: Python :: 3.8 28 | Topic :: Scientific/Engineering 29 | [options] 30 | package_dir = 31 | = src 32 | packages = find: 33 | zip_safe = False 34 | install_requires = 35 | numpy 36 | torch 37 | torchvision 38 | matplotlib 39 | networkx 40 | zmq 41 | jsonlines 42 | pillow 43 | smallworld 44 | localconfig 45 | PyWavelets 46 | pandas 47 | crudini 48 | scikit-learn 49 | lz4 50 | fpzip 51 | include_package_data = True 52 | python_requires = >=3.6 53 | [options.packages.find] 54 | where = src 55 | [options.extras_require] 56 | dev = 57 | black>22.3.0 58 | coverage 59 | isort 60 | pytest 61 | pytest-xdist 62 | pytest-cov<2.6.0 63 | pycodestyle 64 | sphinx 65 | alabaster 66 | tox 67 | [tool:pytest] 68 | norecursedirs = 69 | .git 70 | dist 71 | build 72 | python_files = 73 | test_*.py 74 | doctest_plus = disabled 75 | addopts = --strict 76 | markers = 77 | slow 78 | remote_data 79 | filterwarnings 80 | mpl_image_compare 81 | [flake8] 82 | ignore = E203, E266, E501, W503 83 | max-line-length = 80 84 | max-complexity = 18 85 | select = B,C,E,F,W,T4,B9 86 | 
-------------------------------------------------------------------------------- /eval/testingKNN.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.KNN import KNN 12 | 13 | 14 | def read_ini(file_path): 15 | config = LocalConfig(file_path) 16 | for section in config: 17 | print("Section: ", section) 18 | for key, value in config.items(section): 19 | print((key, value)) 20 | print(dict(config.items("DATASET"))) 21 | return config 22 | 23 | 24 | if __name__ == "__main__": 25 | args = utils.get_args() 26 | 27 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 28 | 29 | log_level = { 30 | "INFO": logging.INFO, 31 | "DEBUG": logging.DEBUG, 32 | "WARNING": logging.WARNING, 33 | "ERROR": logging.ERROR, 34 | "CRITICAL": logging.CRITICAL, 35 | } 36 | 37 | config = read_ini(args.config_file) 38 | my_config = dict() 39 | for section in config: 40 | my_config[section] = dict(config.items(section)) 41 | 42 | copy(args.config_file, args.log_dir) 43 | copy(args.graph_file, args.log_dir) 44 | utils.write_args(args, args.log_dir) 45 | 46 | g = Graph() 47 | g.read_graph_from_file(args.graph_file, args.graph_type) 48 | n_machines = args.machines 49 | procs_per_machine = args.procs_per_machine[0] 50 | l = Linear(n_machines, procs_per_machine) 51 | m_id = args.machine_id 52 | 53 | processes = [] 54 | for r in range(procs_per_machine): 55 | processes.append( 56 | mp.Process( 57 | target=KNN, 58 | args=[ 59 | r, 60 | m_id, 61 | l, 62 | g, 63 | my_config, 64 | args.iterations, 65 | args.log_dir, 66 | args.weights_store_dir, 67 | log_level[args.log_level], 68 | args.test_after, 69 | args.train_evaluate_after, 70 | args.reset_optimizer, 71 | ], 72 | ) 73 | ) 74 | 75 | for p in processes: 76 | p.start() 77 | 78 | for p in processes: 79 | p.join() 80 | -------------------------------------------------------------------------------- /eval/testingKFN.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.KFNNode import KFNNode 12 | 13 | 14 | def read_ini(file_path): 15 | config = LocalConfig(file_path) 16 | for section in config: 17 | print("Section: ", section) 18 | for key, value in config.items(section): 19 | print((key, value)) 20 | print(dict(config.items("DATASET"))) 21 | return config 22 | 23 | 24 | if __name__ == "__main__": 25 | args = utils.get_args() 26 | 27 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 28 | 29 | log_level = { 30 | "INFO": logging.INFO, 31 | "DEBUG": logging.DEBUG, 32 | "WARNING": logging.WARNING, 33 | "ERROR": logging.ERROR, 34 | "CRITICAL": logging.CRITICAL, 35 | } 36 | 37 | config = read_ini(args.config_file) 38 | my_config = dict() 39 | for section in config: 40 | my_config[section] = dict(config.items(section)) 41 | 42 | copy(args.config_file, args.log_dir) 43 | copy(args.graph_file, args.log_dir) 44 | utils.write_args(args, args.log_dir) 45 
| 46 | g = Graph() 47 | g.read_graph_from_file(args.graph_file, args.graph_type) 48 | n_machines = args.machines 49 | procs_per_machine = args.procs_per_machine[0] 50 | l = Linear(n_machines, procs_per_machine) 51 | m_id = args.machine_id 52 | 53 | processes = [] 54 | for r in range(procs_per_machine): 55 | processes.append( 56 | mp.Process( 57 | target=KFNNode, 58 | args=[ 59 | r, 60 | m_id, 61 | l, 62 | g, 63 | my_config, 64 | args.iterations, 65 | args.log_dir, 66 | args.weights_store_dir, 67 | log_level[args.log_level], 68 | args.test_after, 69 | args.train_evaluate_after, 70 | args.reset_optimizer, 71 | ], 72 | ) 73 | ) 74 | 75 | for p in processes: 76 | p.start() 77 | 78 | for p in processes: 79 | p.join() 80 | -------------------------------------------------------------------------------- /eval/testing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.DPSGDNode import DPSGDNode 12 | 13 | 14 | def read_ini(file_path): 15 | config = LocalConfig(file_path) 16 | for section in config: 17 | print("Section: ", section) 18 | for key, value in config.items(section): 19 | print((key, value)) 20 | print(dict(config.items("DATASET"))) 21 | return config 22 | 23 | 24 | if __name__ == "__main__": 25 | args = utils.get_args() 26 | 27 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 28 | 29 | log_level = { 30 | "INFO": logging.INFO, 31 | "DEBUG": logging.DEBUG, 32 | "WARNING": logging.WARNING, 33 | "ERROR": logging.ERROR, 34 | "CRITICAL": logging.CRITICAL, 35 | } 36 | 37 | config = read_ini(args.config_file) 38 | my_config = dict() 39 | for section in config: 40 | my_config[section] = dict(config.items(section)) 41 | 42 | copy(args.config_file, args.log_dir) 43 | copy(args.graph_file, args.log_dir) 44 | utils.write_args(args, args.log_dir) 45 | 46 | g = Graph() 47 | g.read_graph_from_file(args.graph_file, args.graph_type) 48 | n_machines = args.machines 49 | procs_per_machine = args.procs_per_machine[0] 50 | 51 | l = Linear(n_machines, procs_per_machine) 52 | m_id = args.machine_id 53 | 54 | processes = [] 55 | for r in range(procs_per_machine): 56 | processes.append( 57 | mp.Process( 58 | target=DPSGDNode, 59 | args=[ 60 | r, 61 | m_id, 62 | l, 63 | g, 64 | my_config, 65 | args.iterations, 66 | args.log_dir, 67 | args.weights_store_dir, 68 | log_level[args.log_level], 69 | args.test_after, 70 | args.train_evaluate_after, 71 | args.reset_optimizer, 72 | ], 73 | ) 74 | ) 75 | 76 | for p in processes: 77 | p.start() 78 | 79 | for p in processes: 80 | p.join() 81 | -------------------------------------------------------------------------------- /eval/36_nodes.edges: -------------------------------------------------------------------------------- 1 | 36 2 | 0 1 3 | 0 2 4 | 0 35 5 | 0 6 6 | 1 0 7 | 1 2 8 | 1 17 9 | 1 28 10 | 1 30 11 | 2 0 12 | 2 1 13 | 2 3 14 | 2 7 15 | 2 8 16 | 2 19 17 | 2 31 18 | 3 2 19 | 3 4 20 | 3 5 21 | 3 23 22 | 3 25 23 | 3 26 24 | 4 34 25 | 4 3 26 | 4 5 27 | 4 16 28 | 4 18 29 | 5 3 30 | 5 4 31 | 5 6 32 | 5 10 33 | 5 23 34 | 6 0 35 | 6 33 36 | 6 5 37 | 6 7 38 | 6 9 39 | 6 20 40 | 6 26 41 | 7 8 42 | 7 2 43 | 7 6 44 | 8 32 45 | 8 2 46 | 8 34 47 | 8 7 48 | 8 9 49 | 9 35 50 | 9 6 51 | 9 8 52 | 9 10 53 | 9 11 
54 | 9 18 55 | 9 23 56 | 9 31 57 | 10 34 58 | 10 5 59 | 10 9 60 | 10 11 61 | 10 17 62 | 10 18 63 | 10 22 64 | 10 23 65 | 11 34 66 | 11 9 67 | 11 10 68 | 11 12 69 | 11 19 70 | 11 25 71 | 11 27 72 | 11 29 73 | 11 30 74 | 12 32 75 | 12 11 76 | 12 13 77 | 12 15 78 | 12 16 79 | 12 23 80 | 13 12 81 | 13 14 82 | 13 15 83 | 13 18 84 | 13 25 85 | 14 35 86 | 14 13 87 | 14 15 88 | 14 16 89 | 14 25 90 | 15 33 91 | 15 12 92 | 15 13 93 | 15 14 94 | 15 16 95 | 15 18 96 | 15 27 97 | 15 30 98 | 16 35 99 | 16 4 100 | 16 12 101 | 16 14 102 | 16 15 103 | 16 17 104 | 17 1 105 | 17 10 106 | 17 16 107 | 17 18 108 | 17 19 109 | 18 32 110 | 18 4 111 | 18 9 112 | 18 10 113 | 18 13 114 | 18 15 115 | 18 17 116 | 18 19 117 | 18 20 118 | 19 2 119 | 19 11 120 | 19 17 121 | 19 18 122 | 19 20 123 | 19 30 124 | 20 35 125 | 20 6 126 | 20 18 127 | 20 19 128 | 20 21 129 | 20 22 130 | 20 27 131 | 21 20 132 | 21 22 133 | 21 23 134 | 21 29 135 | 21 30 136 | 22 10 137 | 22 20 138 | 22 21 139 | 22 23 140 | 22 25 141 | 23 3 142 | 23 5 143 | 23 9 144 | 23 10 145 | 23 12 146 | 23 21 147 | 23 22 148 | 23 24 149 | 23 29 150 | 24 25 151 | 24 23 152 | 25 33 153 | 25 3 154 | 25 35 155 | 25 11 156 | 25 13 157 | 25 14 158 | 25 22 159 | 25 24 160 | 25 26 161 | 25 29 162 | 25 31 163 | 26 27 164 | 26 25 165 | 26 3 166 | 26 6 167 | 27 35 168 | 27 11 169 | 27 15 170 | 27 20 171 | 27 26 172 | 27 28 173 | 28 1 174 | 28 27 175 | 28 29 176 | 29 11 177 | 29 21 178 | 29 23 179 | 29 25 180 | 29 28 181 | 29 30 182 | 30 32 183 | 30 1 184 | 30 11 185 | 30 15 186 | 30 19 187 | 30 21 188 | 30 29 189 | 30 31 190 | 31 32 191 | 31 2 192 | 31 9 193 | 31 25 194 | 31 30 195 | 32 33 196 | 32 8 197 | 32 12 198 | 32 18 199 | 32 30 200 | 32 31 201 | 33 32 202 | 33 34 203 | 33 6 204 | 33 15 205 | 33 25 206 | 34 33 207 | 34 35 208 | 34 4 209 | 34 8 210 | 34 10 211 | 34 11 212 | 35 0 213 | 35 34 214 | 35 9 215 | 35 14 216 | 35 16 217 | 35 20 218 | 35 25 219 | 35 27 220 | -------------------------------------------------------------------------------- /eval/testingManual.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Manual import Manual 11 | from decentralizepy.node.DPSGDNode import DPSGDNode 12 | 13 | 14 | def read_ini(file_path): 15 | config = LocalConfig(file_path) 16 | for section in config: 17 | print("Section: ", section) 18 | for key, value in config.items(section): 19 | print((key, value)) 20 | print(dict(config.items("DATASET"))) 21 | return config 22 | 23 | 24 | if __name__ == "__main__": 25 | args = utils.get_args() 26 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 27 | 28 | log_level = { 29 | "INFO": logging.INFO, 30 | "DEBUG": logging.DEBUG, 31 | "WARNING": logging.WARNING, 32 | "ERROR": logging.ERROR, 33 | "CRITICAL": logging.CRITICAL, 34 | } 35 | 36 | config = read_ini(args.config_file) 37 | my_config = dict() 38 | for section in config: 39 | my_config[section] = dict(config.items(section)) 40 | 41 | copy(args.config_file, args.log_dir) 42 | copy(args.graph_file, args.log_dir) 43 | utils.write_args(args, args.log_dir) 44 | 45 | g = Graph() 46 | g.read_graph_from_file(args.graph_file, args.graph_type) 47 | n_machines = args.machines 48 | procs_per_machine = args.procs_per_machine 49 | m_id = args.machine_id 50 | 51 | l = 
Manual(n_machines, procs_per_machine, current_machine=m_id) 52 | 53 | processes = [] 54 | for r in range(procs_per_machine[m_id]): 55 | processes.append( 56 | mp.Process( 57 | target=DPSGDNode, 58 | args=[ 59 | r, 60 | m_id, 61 | l, 62 | g, 63 | my_config, 64 | args.iterations, 65 | args.log_dir, 66 | args.weights_store_dir, 67 | log_level[args.log_level], 68 | args.test_after, 69 | args.train_evaluate_after, 70 | args.reset_optimizer, 71 | ], 72 | ) 73 | ) 74 | 75 | for p in processes: 76 | p.start() 77 | 78 | for p in processes: 79 | p.join() 80 | -------------------------------------------------------------------------------- /tutorial/EpidemicLearning/testingEL_Local.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.EpidemicLearning.EL_Local import EL_Local 12 | 13 | 14 | def read_ini(file_path): 15 | config = LocalConfig(file_path) 16 | for section in config: 17 | print("Section: ", section) 18 | for key, value in config.items(section): 19 | print((key, value)) 20 | print(dict(config.items("DATASET"))) 21 | return config 22 | 23 | 24 | if __name__ == "__main__": 25 | args = utils.get_args() 26 | 27 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 28 | 29 | log_level = { 30 | "INFO": logging.INFO, 31 | "DEBUG": logging.DEBUG, 32 | "WARNING": logging.WARNING, 33 | "ERROR": logging.ERROR, 34 | "CRITICAL": logging.CRITICAL, 35 | } 36 | 37 | config = read_ini(args.config_file) 38 | my_config = dict() 39 | for section in config: 40 | my_config[section] = dict(config.items(section)) 41 | 42 | copy(args.config_file, args.log_dir) 43 | copy(args.graph_file, args.log_dir) 44 | utils.write_args(args, args.log_dir) 45 | 46 | g = Graph() 47 | g.read_graph_from_file(args.graph_file, args.graph_type) 48 | n_machines = args.machines 49 | procs_per_machine = args.procs_per_machine[0] 50 | 51 | l = Linear(n_machines, procs_per_machine) 52 | m_id = args.machine_id 53 | 54 | processes = [] 55 | for r in range(procs_per_machine): 56 | processes.append( 57 | mp.Process( 58 | target=EL_Local, 59 | args=[ 60 | r, 61 | m_id, 62 | l, 63 | g, 64 | my_config, 65 | args.iterations, 66 | args.log_dir, 67 | args.weights_store_dir, 68 | log_level[args.log_level], 69 | args.test_after, 70 | args.train_evaluate_after, 71 | args.reset_optimizer, 72 | ], 73 | ) 74 | ) 75 | 76 | for p in processes: 77 | p.start() 78 | 79 | for p in processes: 80 | p.join() 81 | -------------------------------------------------------------------------------- /tutorial/EpidemicLearning/fullyConnected_16.edges: -------------------------------------------------------------------------------- 1 | 16 2 | 0 1 3 | 0 2 4 | 0 3 5 | 0 4 6 | 0 5 7 | 0 6 8 | 0 7 9 | 0 8 10 | 0 9 11 | 0 10 12 | 0 11 13 | 0 12 14 | 0 13 15 | 0 14 16 | 0 15 17 | 1 0 18 | 1 2 19 | 1 3 20 | 1 4 21 | 1 5 22 | 1 6 23 | 1 7 24 | 1 8 25 | 1 9 26 | 1 10 27 | 1 11 28 | 1 12 29 | 1 13 30 | 1 14 31 | 1 15 32 | 2 0 33 | 2 1 34 | 2 3 35 | 2 4 36 | 2 5 37 | 2 6 38 | 2 7 39 | 2 8 40 | 2 9 41 | 2 10 42 | 2 11 43 | 2 12 44 | 2 13 45 | 2 14 46 | 2 15 47 | 3 0 48 | 3 1 49 | 3 2 50 | 3 4 51 | 3 5 52 | 3 6 53 | 3 7 54 | 3 8 55 | 3 9 56 | 3 10 57 | 3 11 58 | 3 12 59 | 3 13 60 | 3 14 61 | 3 15 62 | 4 0 63 | 4 1 64 | 
4 2 65 | 4 3 66 | 4 5 67 | 4 6 68 | 4 7 69 | 4 8 70 | 4 9 71 | 4 10 72 | 4 11 73 | 4 12 74 | 4 13 75 | 4 14 76 | 4 15 77 | 5 0 78 | 5 1 79 | 5 2 80 | 5 3 81 | 5 4 82 | 5 6 83 | 5 7 84 | 5 8 85 | 5 9 86 | 5 10 87 | 5 11 88 | 5 12 89 | 5 13 90 | 5 14 91 | 5 15 92 | 6 0 93 | 6 1 94 | 6 2 95 | 6 3 96 | 6 4 97 | 6 5 98 | 6 7 99 | 6 8 100 | 6 9 101 | 6 10 102 | 6 11 103 | 6 12 104 | 6 13 105 | 6 14 106 | 6 15 107 | 7 0 108 | 7 1 109 | 7 2 110 | 7 3 111 | 7 4 112 | 7 5 113 | 7 6 114 | 7 8 115 | 7 9 116 | 7 10 117 | 7 11 118 | 7 12 119 | 7 13 120 | 7 14 121 | 7 15 122 | 8 0 123 | 8 1 124 | 8 2 125 | 8 3 126 | 8 4 127 | 8 5 128 | 8 6 129 | 8 7 130 | 8 9 131 | 8 10 132 | 8 11 133 | 8 12 134 | 8 13 135 | 8 14 136 | 8 15 137 | 9 0 138 | 9 1 139 | 9 2 140 | 9 3 141 | 9 4 142 | 9 5 143 | 9 6 144 | 9 7 145 | 9 8 146 | 9 10 147 | 9 11 148 | 9 12 149 | 9 13 150 | 9 14 151 | 9 15 152 | 10 0 153 | 10 1 154 | 10 2 155 | 10 3 156 | 10 4 157 | 10 5 158 | 10 6 159 | 10 7 160 | 10 8 161 | 10 9 162 | 10 11 163 | 10 12 164 | 10 13 165 | 10 14 166 | 10 15 167 | 11 0 168 | 11 1 169 | 11 2 170 | 11 3 171 | 11 4 172 | 11 5 173 | 11 6 174 | 11 7 175 | 11 8 176 | 11 9 177 | 11 10 178 | 11 12 179 | 11 13 180 | 11 14 181 | 11 15 182 | 12 0 183 | 12 1 184 | 12 2 185 | 12 3 186 | 12 4 187 | 12 5 188 | 12 6 189 | 12 7 190 | 12 8 191 | 12 9 192 | 12 10 193 | 12 11 194 | 12 13 195 | 12 14 196 | 12 15 197 | 13 0 198 | 13 1 199 | 13 2 200 | 13 3 201 | 13 4 202 | 13 5 203 | 13 6 204 | 13 7 205 | 13 8 206 | 13 9 207 | 13 10 208 | 13 11 209 | 13 12 210 | 13 14 211 | 13 15 212 | 14 0 213 | 14 1 214 | 14 2 215 | 14 3 216 | 14 4 217 | 14 5 218 | 14 6 219 | 14 7 220 | 14 8 221 | 14 9 222 | 14 10 223 | 14 11 224 | 14 12 225 | 14 13 226 | 14 15 227 | 15 0 228 | 15 1 229 | 15 2 230 | 15 3 231 | 15 4 232 | 15 5 233 | 15 6 234 | 15 7 235 | 15 8 236 | 15 9 237 | 15 10 238 | 15 11 239 | 15 12 240 | 15 13 241 | 15 14 242 | -------------------------------------------------------------------------------- /src/decentralizepy/mappings/Linear.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.mappings.Mapping import Mapping 2 | 3 | 4 | class Linear(Mapping): 5 | """ 6 | This class defines the mapping: 7 | uid = machine_id * procs_per_machine + rank 8 | 9 | """ 10 | 11 | def __init__( 12 | self, n_machines, procs_per_machine, global_service_machine=0, current_machine=0 13 | ): 14 | """ 15 | Constructor 16 | 17 | Parameters 18 | ---------- 19 | n_machines : int 20 | Number of machines involved in learning 21 | procs_per_machine : int 22 | Number of processes spawned per machine 23 | global_service_machine: int, optional 24 | Machine ID on which the server/services are hosted 25 | current_machine: int, optional 26 | Machine ID of local machine 27 | 28 | """ 29 | super().__init__(n_machines * procs_per_machine) 30 | self.n_machines = n_machines 31 | self.procs_per_machine = procs_per_machine 32 | self.global_service_machine = global_service_machine 33 | self.current_machine = current_machine 34 | 35 | def get_uid(self, rank: int, machine_id: int): 36 | """ 37 | Gives the global unique identifier of the node 38 | 39 | Parameters 40 | ---------- 41 | rank : int 42 | Node's rank on its machine 43 | machine_id : int 44 | node's machine in the cluster 45 | 46 | Returns 47 | ------- 48 | int 49 | the unique identifier 50 | 51 | """ 52 | if rank < 0: 53 | return rank 54 | return machine_id * self.procs_per_machine + rank 55 | 56 | def get_machine_and_rank(self, uid: int): 57 | """ 58 | Gives the rank 
and machine_id of the node 59 | 60 | Parameters 61 | ---------- 62 | uid : int 63 | globally unique identifier of the node 64 | 65 | Returns 66 | ------- 67 | 2-tuple 68 | a tuple of rank and machine_id 69 | 70 | """ 71 | if uid < 0: 72 | return uid, self.global_service_machine 73 | return (uid % self.procs_per_machine), (uid // self.procs_per_machine) 74 | 75 | def get_local_procs_count(self): 76 | """ 77 | Gives number of processes that run on the node 78 | 79 | Returns 80 | ------- 81 | int 82 | the number of local processes 83 | 84 | """ 85 | 86 | return self.procs_per_machine 87 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/Lz4Wrapper.py: -------------------------------------------------------------------------------- 1 | import lz4.frame 2 | import numpy as np 3 | 4 | from decentralizepy.compression.Compression import Compression 5 | 6 | 7 | class Lz4Wrapper(Compression): 8 | """ 9 | Compression API 10 | 11 | """ 12 | 13 | def __init__(self, compress_metadata=True, compress_data=False, *args, **kwargs): 14 | """ 15 | Constructor 16 | """ 17 | self.compress_metadata = compress_metadata 18 | self.compress_data = compress_data 19 | 20 | def compress(self, arr): 21 | """ 22 | compression function 23 | 24 | Parameters 25 | ---------- 26 | arr : np.ndarray 27 | Data to compress 28 | 29 | Returns 30 | ------- 31 | bytearray 32 | encoded data as bytes 33 | 34 | """ 35 | if self.compress_metadata: 36 | arr.sort() 37 | diff = np.diff(arr, prepend=0).astype(np.int32) 38 | to_compress = diff.tobytes("C") 39 | return lz4.frame.compress(to_compress) 40 | return arr 41 | 42 | def decompress(self, bytes): 43 | """ 44 | decompression function 45 | 46 | Parameters 47 | ---------- 48 | bytes :bytearray 49 | compressed data 50 | 51 | Returns 52 | ------- 53 | arr : np.ndarray 54 | decompressed data as array 55 | 56 | """ 57 | if self.compress_metadata: 58 | decomp = lz4.frame.decompress(bytes) 59 | return np.cumsum(np.frombuffer(decomp, dtype=np.int32)) 60 | return bytes 61 | 62 | def compress_float(self, arr): 63 | """ 64 | compression function for float arrays 65 | 66 | Parameters 67 | ---------- 68 | arr : np.ndarray 69 | Data to compress 70 | 71 | Returns 72 | ------- 73 | bytearray 74 | encoded data as bytes 75 | 76 | """ 77 | if self.compress_data: 78 | to_compress = arr.tobytes("C") 79 | return lz4.frame.compress(to_compress) 80 | return arr 81 | 82 | def decompress_float(self, bytes): 83 | """ 84 | decompression function for compressed float arrays 85 | 86 | Parameters 87 | ---------- 88 | bytes :bytearray 89 | compressed data 90 | 91 | Returns 92 | ------- 93 | arr : np.ndarray 94 | decompressed data as array 95 | 96 | """ 97 | if self.compress_data: 98 | decomp = lz4.frame.decompress(bytes) 99 | return np.frombuffer(decomp, dtype=np.float32) 100 | return bytes 101 | -------------------------------------------------------------------------------- /src/decentralizepy/models/Model.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from pathlib import Path 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class Model(nn.Module): 9 | """ 10 | This class wraps the torch model 11 | More fields can be added here 12 | 13 | """ 14 | 15 | def __init__(self): 16 | """ 17 | Constructor 18 | 19 | """ 20 | super().__init__() 21 | self.model_change = None 22 | self._param_count_ot = None 23 | self._param_count_total = None 24 | self.accumulated_changes = None 25 | 
self.shared_parameters_counter = None 26 | 27 | def count_params(self, only_trainable=False): 28 | """ 29 | Counts the total number of params 30 | 31 | Parameters 32 | ---------- 33 | only_trainable : bool 34 | Counts only parameters with gradients when True 35 | 36 | Returns 37 | ------- 38 | int 39 | Total number of parameters 40 | 41 | """ 42 | if only_trainable: 43 | if not self._param_count_ot: 44 | self._param_count_ot = sum( 45 | p.numel() for p in self.parameters() if p.requires_grad 46 | ) 47 | return self._param_count_ot 48 | else: 49 | if not self._param_count_total: 50 | self._param_count_total = sum(p.numel() for p in self.parameters()) 51 | return self._param_count_total 52 | 53 | def rewind_accumulation(self, indices): 54 | """ 55 | resets accumulated_changes at the given indices 56 | 57 | Parameters 58 | ---------- 59 | indices : torch.Tensor 60 | Tensor that contains indices corresponding to the flattened model 61 | 62 | """ 63 | if self.accumulated_changes is not None: 64 | self.accumulated_changes[indices] = 0.0 65 | 66 | def dump_weights(self, directory, uid, round): 67 | """ 68 | saves the current model as a pt file into the specified directory 69 | 70 | Parameters 71 | ---------- 72 | directory : str 73 | directory in which the weights are dumped 74 | uid : int 75 | uid of the node, will be used to give the weight a unique name 76 | round : int 77 | current round, will be used to give the weight a unique name 78 | 79 | """ 80 | torch.save(self.state_dict(), Path(directory) / f"{round}_weight_{uid}.pt") 81 | 82 | def get_weights(self): 83 | """ 84 | flattens the current weights 85 | 86 | """ 87 | with torch.no_grad(): 88 | tensors_to_cat = [] 89 | for _, v in self.state_dict().items(): 90 | tensors_to_cat.append(v.flatten()) 91 | flat = torch.cat(tensors_to_cat) 92 | 93 | return flat 94 | -------------------------------------------------------------------------------- /src/decentralizepy/communication/Communication.py: -------------------------------------------------------------------------------- 1 | class Communication: 2 | """ 3 | Communication API 4 | 5 | """ 6 | 7 | def __init__(self, rank, machine_id, mapping, total_procs): 8 | """ 9 | Constructor 10 | 11 | Parameters 12 | ---------- 13 | rank : int 14 | Local rank of the process 15 | machine_id : int 16 | Machine id of the process 17 | mapping : decentralizepy.mappings.Mapping 18 | uid, rank, machine_id invertible mapping 19 | total_procs : int 20 | Total number of processes 21 | 22 | """ 23 | self.total_procs = total_procs 24 | self.rank = rank 25 | self.machine_id = machine_id 26 | self.mapping = mapping 27 | self.uid = mapping.get_uid(rank, machine_id) 28 | self.total_bytes = 0 29 | 30 | def encrypt(self, data): 31 | """ 32 | Encode/Encrypt data. 33 | 34 | Parameters 35 | ---------- 36 | data : dict 37 | Data dict to send 38 | 39 | Returns 40 | ------- 41 | byte 42 | Encoded data 43 | 44 | """ 45 | raise NotImplementedError 46 | 47 | def decrypt(self, sender, data): 48 | """ 49 | Decodes received data. 50 | 51 | Parameters 52 | ---------- 53 | sender : byte 54 | sender of the data 55 | data : byte 56 | Data received 57 | 58 | Returns 59 | ------- 60 | tuple 61 | (sender: int, data: dict) 62 | 63 | """ 64 | raise NotImplementedError 65 | 66 | def connect_neighbors(self, neighbors): 67 | """ 68 | Connects all neighbors.
69 | 70 | Parameters 71 | ---------- 72 | neighbors : list(int) 73 | List of neighbors 74 | 75 | """ 76 | raise NotImplementedError 77 | 78 | def receive(self): 79 | """ 80 | Returns ONE message received. 81 | 82 | Returns 83 | ---------- 84 | dict 85 | Received and decrypted data 86 | 87 | """ 88 | raise NotImplementedError 89 | 90 | def send(self, uid, data): 91 | """ 92 | Send a message to a process. 93 | 94 | Parameters 95 | ---------- 96 | uid : int 97 | Neighbor's unique ID 98 | data : dict 99 | Message as a Python dictionary 100 | 101 | """ 102 | raise NotImplementedError 103 | 104 | def disconnect_neighbors(self): 105 | """ 106 | Disconnects all neighbors. 107 | 108 | """ 109 | raise NotImplementedError 110 | 111 | def terminate(self): 112 | """ 113 | Terminate the communication sockets. 114 | 115 | """ 116 | return 117 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/Elias.py: -------------------------------------------------------------------------------- 1 | # elias implementation: taken from this stack overflow post: 2 | # https://stackoverflow.com/questions/62843156/python-fast-compression-of-large-amount-of-numbers-with-elias-gamma 3 | import numpy as np 4 | 5 | from decentralizepy.compression.Compression import Compression 6 | 7 | 8 | class Elias(Compression): 9 | """ 10 | Compression API 11 | 12 | """ 13 | 14 | def __init__(self, *args, **kwargs): 15 | """ 16 | Constructor 17 | """ 18 | super().__init__() 19 | 20 | def compress(self, arr): 21 | """ 22 | compression function 23 | 24 | Parameters 25 | ---------- 26 | arr : np.ndarray 27 | Data to compress 28 | 29 | Returns 30 | ------- 31 | bytearray 32 | encoded data as bytes 33 | 34 | """ 35 | arr.sort() 36 | first = arr[0] 37 | arr = np.diff(arr).astype(np.int32) 38 | arr = arr.view(f"u{arr.itemsize}") 39 | l = np.log2(arr).astype("u1") 40 | L = ((l << 1) + 1).cumsum() 41 | out = np.zeros(int(L[-1] + 128), "u1") 42 | for i in range(l.max() + 1): 43 | out[L - i - 1] += (arr >> i) & 1 44 | 45 | s = np.array([out.size], dtype=np.int64) 46 | size = np.ndarray(8, dtype="u1", buffer=s.data) 47 | packed = np.packbits(out) 48 | packed[-8:] = size 49 | s = np.array([first], dtype=np.int64) 50 | size = np.ndarray(8, dtype="u1", buffer=s.data) 51 | packed[-16:-8] = size 52 | return packed 53 | 54 | def decompress(self, bytes): 55 | """ 56 | decompression function 57 | 58 | Parameters 59 | ---------- 60 | bytes :bytearray 61 | compressed data 62 | 63 | Returns 64 | ------- 65 | arr : np.ndarray 66 | decompressed data as array 67 | 68 | """ 69 | n_arr = bytes[-8:] 70 | n = np.ndarray(1, dtype=np.int64, buffer=n_arr.data)[0] 71 | first = bytes[-16:-8] 72 | first = np.ndarray(1, dtype=np.int64, buffer=first.data)[0] 73 | b = bytes[:-16] 74 | b = np.unpackbits(b, count=n).view(bool) 75 | s = b.nonzero()[0] 76 | s = (s << 1).repeat(np.diff(s, prepend=-1)) 77 | s -= np.arange(-1, len(s) - 1) 78 | s = s.tolist() # list has faster __getitem__ 79 | ns = len(s) 80 | 81 | def gen(): 82 | idx = 0 83 | yield idx 84 | while idx < ns: 85 | idx = s[idx] 86 | yield idx 87 | 88 | offs = np.fromiter(gen(), int) 89 | sz = np.diff(offs) >> 1 90 | mx = sz.max() + 1 91 | out_fin = np.zeros(offs.size, int) 92 | out_fin[0] = first 93 | out = out_fin[1:] 94 | for i in range(mx): 95 | out[b[offs[1:] - i - 1] & (sz >= i)] += 1 << i 96 | out = np.cumsum(out_fin) 97 | return out 98 | -------------------------------------------------------------------------------- 
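A minimal round-trip sketch of the Elias codec above (not a file of the repository: it assumes numpy >= 1.17, whose unpackbits zero-pads when count exceeds the available bits, and an array of distinct indices, since a gap of zero cannot be gamma-coded):

import numpy as np

from decentralizepy.compression.Elias import Elias

codec = Elias()

# Distinct parameter indices: compress() sorts its argument in place and
# gamma-codes the gaps between consecutive entries, so pass a copy if the
# caller still needs the original order.
indices = np.array([42, 3, 1000, 17], dtype=np.int64)

packed = codec.compress(indices.copy())  # np.uint8 array of packed bits
restored = codec.decompress(packed)      # the sorted index set

assert (restored == np.sort(indices)).all()

EliasFpzip and EliasFpzipLossy above reuse this integer coding unchanged and only swap in fpzip for the float path.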
/src/decentralizepy/mappings/Manual.py: -------------------------------------------------------------------------------- 1 | from decentralizepy.mappings.Mapping import Mapping 2 | 3 | 4 | class Manual(Mapping): 5 | """ 6 | This class defines the manual mapping 7 | 8 | """ 9 | 10 | def __init__( 11 | self, n_machines, procs_per_machine, global_service_machine=0, current_machine=0 12 | ): 13 | """ 14 | Constructor 15 | 16 | Parameters 17 | ---------- 18 | n_machines : int 19 | Number of machines involved in learning 20 | procs_per_machine : list(int) 21 | A list of number of processes spawned per machine 22 | global_service_machine: int, optional 23 | Machine ID on which the server/services are hosted 24 | current_machine: int, optional 25 | Machine ID of local machine 26 | 27 | """ 28 | 29 | self.n_procs = 0 30 | for i in procs_per_machine: 31 | self.n_procs += i 32 | super().__init__(self.n_procs) 33 | self.n_machines = n_machines 34 | self.procs_per_machine = procs_per_machine 35 | self.global_service_machine = global_service_machine 36 | self.current_machine = current_machine 37 | 38 | def get_uid(self, rank: int, machine_id: int): 39 | """ 40 | Gives the global unique identifier of the node 41 | 42 | Parameters 43 | ---------- 44 | rank : int 45 | Node's rank on its machine 46 | machine_id : int 47 | node's machine in the cluster 48 | 49 | Returns 50 | ------- 51 | int 52 | the unique identifier 53 | 54 | """ 55 | if rank < 0: 56 | return rank 57 | cur_uid = 0 58 | for i in range(machine_id): 59 | cur_uid += self.procs_per_machine[i] 60 | return cur_uid + rank 61 | 62 | def get_machine_and_rank(self, uid: int): 63 | """ 64 | Gives the rank and machine_id of the node 65 | 66 | Parameters 67 | ---------- 68 | uid : int 69 | globally unique identifier of the node 70 | 71 | Returns 72 | ------- 73 | 2-tuple 74 | a tuple of rank and machine_id 75 | 76 | """ 77 | if uid < 0: 78 | return uid, self.global_service_machine 79 | 80 | machine, rank = 0, 0 81 | for procs in self.procs_per_machine: 82 | if uid < procs: 83 | rank = uid 84 | break 85 | else: 86 | machine += 1 87 | uid -= procs 88 | return rank, machine 89 | 90 | def get_local_procs_count(self): 91 | """ 92 | Gives number of processes that run on the node 93 | 94 | Returns 95 | ------- 96 | int 97 | the number of local processes 98 | 99 | """ 100 | 101 | return self.procs_per_machine[self.current_machine] 102 | -------------------------------------------------------------------------------- /src/decentralizepy/sharing/JWINS/JWINS.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from decentralizepy.sharing.JWINS.Wavelet import Wavelet 4 | 5 | 6 | class JWINS(Wavelet): 7 | """ 8 | This class implements the JWINS sharing algorithm. 
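At each sharing step, a subset of the wavelet coefficients of the model update is selected and shared: the fraction alpha is drawn uniformly at random from alpha_list (seeded with the node's uid), and the full model is sent whenever the drawn alpha exceeds metadata_cap.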
9 | 10 | """ 11 | 12 | def __init__( 13 | self, 14 | rank, 15 | machine_id, 16 | communication, 17 | mapping, 18 | graph, 19 | model, 20 | dataset, 21 | log_dir, 22 | alpha_list="[0.1, 0.2, 0.3, 0.4, 1.0]", 23 | dict_ordered=True, 24 | save_shared=False, 25 | metadata_cap=1.0, 26 | wavelet="haar", 27 | level=4, 28 | change_based_selection=True, 29 | save_accumulated="", 30 | accumulation=False, 31 | accumulate_averaging_changes=False, 32 | compress=False, 33 | compression_package=None, 34 | compression_class=None, 35 | ): 36 | """ 37 | Constructor 38 | 39 | Parameters 40 | ---------- 41 | rank : int 42 | Local rank 43 | machine_id : int 44 | Global machine id 45 | communication : decentralizepy.communication.Communication 46 | Communication module used to send and receive messages 47 | mapping : decentralizepy.mappings.Mapping 48 | Mapping (rank, machine_id) -> uid 49 | graph : decentralizepy.graphs.Graph 50 | Graph representing neighbors 51 | model : decentralizepy.models.Model 52 | Model to train 53 | dataset : decentralizepy.datasets.Dataset 54 | Dataset for sharing data. Not implemented yet! TODO 55 | log_dir : str 56 | Location to write shared_params (only writing for 2 procs per machine) 57 | dict_ordered : bool 58 | Specifies if the python dict maintains the order of insertion 59 | save_shared : bool 60 | Specifies if the indices of shared parameters should be logged 61 | metadata_cap : float 62 | Share full model when self.alpha > metadata_cap 63 | 64 | """ 65 | super().__init__( 66 | rank, 67 | machine_id, 68 | communication, 69 | mapping, 70 | graph, 71 | model, 72 | dataset, 73 | log_dir, 74 | 1.0, 75 | dict_ordered, 76 | save_shared, 77 | metadata_cap, 78 | wavelet, 79 | level, 80 | change_based_selection, 81 | save_accumulated, 82 | accumulation, 83 | accumulate_averaging_changes, 84 | compress, 85 | compression_package, 86 | compression_class, 87 | ) 88 | self.alpha_list = eval(alpha_list) 89 | random.seed(self.mapping.get_uid(self.rank, self.machine_id)) 90 | 91 | def get_data_to_send(self, degree=None): 92 | """ 93 | Perform a sharing step. Implements D-PSGD with alpha randomly chosen.
94 | 95 | """ 96 | self.alpha = random.choice(self.alpha_list) 97 | return super().get_data_to_send() 98 | -------------------------------------------------------------------------------- /eval/testingPeerSampler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.DPSGDWithPeerSampler import DPSGDWithPeerSampler 12 | from decentralizepy.node.PeerSampler import PeerSampler 13 | 14 | 15 | def read_ini(file_path): 16 | config = LocalConfig(file_path) 17 | for section in config: 18 | print("Section: ", section) 19 | for key, value in config.items(section): 20 | print((key, value)) 21 | print(dict(config.items("DATASET"))) 22 | return config 23 | 24 | 25 | if __name__ == "__main__": 26 | args = utils.get_args() 27 | 28 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 29 | 30 | log_level = { 31 | "INFO": logging.INFO, 32 | "DEBUG": logging.DEBUG, 33 | "WARNING": logging.WARNING, 34 | "ERROR": logging.ERROR, 35 | "CRITICAL": logging.CRITICAL, 36 | } 37 | 38 | config = read_ini(args.config_file) 39 | my_config = dict() 40 | for section in config: 41 | my_config[section] = dict(config.items(section)) 42 | 43 | copy(args.config_file, args.log_dir) 44 | copy(args.graph_file, args.log_dir) 45 | utils.write_args(args, args.log_dir) 46 | 47 | g = Graph() 48 | g.read_graph_from_file(args.graph_file, args.graph_type) 49 | n_machines = args.machines 50 | procs_per_machine = args.procs_per_machine[0] 51 | l = Linear(n_machines, procs_per_machine) 52 | m_id = args.machine_id 53 | 54 | sm = args.server_machine 55 | sr = args.server_rank 56 | 57 | processes = [] 58 | if sm == m_id: 59 | processes.append( 60 | mp.Process( 61 | target=PeerSampler, 62 | args=[ 63 | sr, 64 | m_id, 65 | l, 66 | g, 67 | my_config, 68 | args.iterations, 69 | args.log_dir, 70 | log_level[args.log_level], 71 | ], 72 | ) 73 | ) 74 | 75 | for r in range(0, procs_per_machine): 76 | processes.append( 77 | mp.Process( 78 | target=DPSGDWithPeerSampler, 79 | args=[ 80 | r, 81 | m_id, 82 | l, 83 | g, 84 | my_config, 85 | args.iterations, 86 | args.log_dir, 87 | args.weights_store_dir, 88 | log_level[args.log_level], 89 | args.test_after, 90 | args.train_evaluate_after, 91 | args.reset_optimizer, 92 | ], 93 | ) 94 | ) 95 | 96 | for p in processes: 97 | p.start() 98 | 99 | for p in processes: 100 | p.join() 101 | -------------------------------------------------------------------------------- /eval/testingPeerSamplerDynamic.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.DPSGDWithPeerSampler import DPSGDWithPeerSampler 12 | from decentralizepy.node.PeerSamplerDynamic import PeerSamplerDynamic 13 | 14 | 15 | def read_ini(file_path): 16 | config = LocalConfig(file_path) 17 | for section in config: 18 | print("Section: ", section) 19 | for key, value in config.items(section): 20 | print((key, value)) 21 | 
print(dict(config.items("DATASET"))) 22 | return config 23 | 24 | 25 | if __name__ == "__main__": 26 | args = utils.get_args() 27 | 28 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 29 | 30 | log_level = { 31 | "INFO": logging.INFO, 32 | "DEBUG": logging.DEBUG, 33 | "WARNING": logging.WARNING, 34 | "ERROR": logging.ERROR, 35 | "CRITICAL": logging.CRITICAL, 36 | } 37 | 38 | config = read_ini(args.config_file) 39 | my_config = dict() 40 | for section in config: 41 | my_config[section] = dict(config.items(section)) 42 | 43 | copy(args.config_file, args.log_dir) 44 | copy(args.graph_file, args.log_dir) 45 | utils.write_args(args, args.log_dir) 46 | 47 | g = Graph() 48 | g.read_graph_from_file(args.graph_file, args.graph_type) 49 | n_machines = args.machines 50 | procs_per_machine = args.procs_per_machine[0] 51 | m_id = args.machine_id 52 | 53 | sm = args.server_machine 54 | sr = args.server_rank 55 | 56 | l = Linear( 57 | n_machines, procs_per_machine, global_service_machine=sm, current_machine=m_id 58 | ) 59 | 60 | processes = [] 61 | if sm == m_id: 62 | processes.append( 63 | mp.Process( 64 | target=PeerSamplerDynamic, 65 | args=[ 66 | sr, 67 | m_id, 68 | l, 69 | g, 70 | my_config, 71 | args.iterations, 72 | args.log_dir, 73 | log_level[args.log_level], 74 | ], 75 | ) 76 | ) 77 | 78 | for r in range(0, procs_per_machine): 79 | processes.append( 80 | mp.Process( 81 | target=DPSGDWithPeerSampler, 82 | args=[ 83 | r, 84 | m_id, 85 | l, 86 | g, 87 | my_config, 88 | args.iterations, 89 | args.log_dir, 90 | args.weights_store_dir, 91 | log_level[args.log_level], 92 | args.test_after, 93 | args.train_evaluate_after, 94 | args.reset_optimizer, 95 | ], 96 | ) 97 | ) 98 | 99 | for p in processes: 100 | p.start() 101 | 102 | for p in processes: 103 | p.join() 104 | -------------------------------------------------------------------------------- /eval/testingPeerSamplerDynamicManual.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Manual import Manual 11 | from decentralizepy.node.DPSGDWithPeerSampler import DPSGDWithPeerSampler 12 | from decentralizepy.node.PeerSamplerDynamic import PeerSamplerDynamic 13 | 14 | 15 | def read_ini(file_path): 16 | config = LocalConfig(file_path) 17 | for section in config: 18 | print("Section: ", section) 19 | for key, value in config.items(section): 20 | print((key, value)) 21 | print(dict(config.items("DATASET"))) 22 | return config 23 | 24 | 25 | if __name__ == "__main__": 26 | args = utils.get_args() 27 | 28 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 29 | 30 | log_level = { 31 | "INFO": logging.INFO, 32 | "DEBUG": logging.DEBUG, 33 | "WARNING": logging.WARNING, 34 | "ERROR": logging.ERROR, 35 | "CRITICAL": logging.CRITICAL, 36 | } 37 | 38 | config = read_ini(args.config_file) 39 | my_config = dict() 40 | for section in config: 41 | my_config[section] = dict(config.items(section)) 42 | 43 | copy(args.config_file, args.log_dir) 44 | copy(args.graph_file, args.log_dir) 45 | utils.write_args(args, args.log_dir) 46 | 47 | g = Graph() 48 | g.read_graph_from_file(args.graph_file, args.graph_type) 49 | n_machines = args.machines 50 | procs_per_machine = args.procs_per_machine 51 | m_id = args.machine_id 52 | 53 | sm = 
args.server_machine 54 | sr = args.server_rank 55 | 56 | l = Manual( 57 | n_machines, procs_per_machine, global_service_machine=sm, current_machine=m_id 58 | ) 59 | 60 | processes = [] 61 | if sm == m_id: 62 | processes.append( 63 | mp.Process( 64 | target=PeerSamplerDynamic, 65 | args=[ 66 | sr, 67 | m_id, 68 | l, 69 | g, 70 | my_config, 71 | args.iterations, 72 | args.log_dir, 73 | log_level[args.log_level], 74 | ], 75 | ) 76 | ) 77 | 78 | for r in range(0, procs_per_machine[m_id]): 79 | processes.append( 80 | mp.Process( 81 | target=DPSGDWithPeerSampler, 82 | args=[ 83 | r, 84 | m_id, 85 | l, 86 | g, 87 | my_config, 88 | args.iterations, 89 | args.log_dir, 90 | args.weights_store_dir, 91 | log_level[args.log_level], 92 | args.test_after, 93 | args.train_evaluate_after, 94 | args.reset_optimizer, 95 | ], 96 | ) 97 | ) 98 | 99 | for p in processes: 100 | p.start() 101 | 102 | for p in processes: 103 | p.join() 104 | -------------------------------------------------------------------------------- /eval/testingSTC.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.STC.STCClient import STCClient 12 | from decentralizepy.node.STC.STCServer import STCServer 13 | 14 | 15 | def read_ini(file_path): 16 | config = LocalConfig(file_path) 17 | for section in config: 18 | print("Section: ", section) 19 | for key, value in config.items(section): 20 | print((key, value)) 21 | print(dict(config.items("DATASET"))) 22 | return config 23 | 24 | 25 | if __name__ == "__main__": 26 | args = utils.get_args() 27 | 28 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 29 | 30 | log_level = { 31 | "INFO": logging.INFO, 32 | "DEBUG": logging.DEBUG, 33 | "WARNING": logging.WARNING, 34 | "ERROR": logging.ERROR, 35 | "CRITICAL": logging.CRITICAL, 36 | } 37 | 38 | config = read_ini(args.config_file) 39 | my_config = dict() 40 | for section in config: 41 | my_config[section] = dict(config.items(section)) 42 | 43 | copy(args.config_file, args.log_dir) 44 | copy(args.graph_file, args.log_dir) 45 | utils.write_args(args, args.log_dir) 46 | 47 | g = Graph() 48 | g.read_graph_from_file(args.graph_file, args.graph_type) 49 | n_machines = args.machines 50 | procs_per_machine = args.procs_per_machine[0] 51 | l = Linear(n_machines, procs_per_machine) 52 | m_id = args.machine_id 53 | 54 | sm = args.server_machine 55 | sr = args.server_rank 56 | 57 | processes = [] 58 | if sm == m_id: 59 | processes.append( 60 | mp.Process( 61 | target=STCServer, 62 | args=[ 63 | sr, 64 | m_id, 65 | l, 66 | g, 67 | my_config, 68 | args.iterations, 69 | args.log_dir, 70 | args.weights_store_dir, 71 | log_level[args.log_level], 72 | args.test_after, 73 | args.train_evaluate_after, 74 | args.working_rate, 75 | ], 76 | ) 77 | ) 78 | 79 | for r in range(0, procs_per_machine): 80 | processes.append( 81 | mp.Process( 82 | target=STCClient, 83 | args=[ 84 | r, 85 | m_id, 86 | l, 87 | g, 88 | my_config, 89 | args.iterations, 90 | args.log_dir, 91 | args.weights_store_dir, 92 | log_level[args.log_level], 93 | args.test_after, 94 | args.train_evaluate_after, 95 | args.reset_optimizer, 96 | ], 97 | ) 98 | ) 99 | 100 | for p in processes: 101 | p.start() 102 | 103 | for p in processes: 
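# An illustrative invocation of this launcher (a sketch; every flag below is
# defined in decentralizepy.utils.get_args(), but the config and graph paths
# are placeholders that must exist):
#
#   python testingSTC.py -mid 0 -ms 1 -ps 8 -is 100 \
#       -cf step_configs/config_cifar_sharing.ini -gf 36_nodes.edges -gt edges \
#       -ld ./logs -wsd ./weights -ll INFO -sm 0 -sr -1 -ta 10 -tea 10 -wr 1.0
#
# The machine whose -mid matches -sm spawns the STCServer process with rank
# -sr in addition to its -ps STCClient processes.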
104 | p.join() 105 | -------------------------------------------------------------------------------- /tutorial/EpidemicLearning/testingEL_Oracle.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.EpidemicLearning.EL_Oracle_Client import EL_Oracle_Client 12 | from decentralizepy.node.EpidemicLearning.EL_Oracle_TopologyBuilder import ( 13 | EL_Oracle_TopologyBuilder, 14 | ) 15 | 16 | 17 | def read_ini(file_path): 18 | config = LocalConfig(file_path) 19 | for section in config: 20 | print("Section: ", section) 21 | for key, value in config.items(section): 22 | print((key, value)) 23 | print(dict(config.items("DATASET"))) 24 | return config 25 | 26 | 27 | if __name__ == "__main__": 28 | args = utils.get_args() 29 | 30 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 31 | 32 | log_level = { 33 | "INFO": logging.INFO, 34 | "DEBUG": logging.DEBUG, 35 | "WARNING": logging.WARNING, 36 | "ERROR": logging.ERROR, 37 | "CRITICAL": logging.CRITICAL, 38 | } 39 | 40 | config = read_ini(args.config_file) 41 | my_config = dict() 42 | for section in config: 43 | my_config[section] = dict(config.items(section)) 44 | 45 | copy(args.config_file, args.log_dir) 46 | copy(args.graph_file, args.log_dir) 47 | utils.write_args(args, args.log_dir) 48 | 49 | g = Graph() 50 | g.read_graph_from_file(args.graph_file, args.graph_type) 51 | n_machines = args.machines 52 | procs_per_machine = args.procs_per_machine[0] 53 | m_id = args.machine_id 54 | 55 | sm = args.server_machine 56 | sr = args.server_rank 57 | 58 | l = Linear( 59 | n_machines, procs_per_machine, global_service_machine=sm, current_machine=m_id 60 | ) 61 | 62 | processes = [] 63 | if sm == m_id: 64 | processes.append( 65 | mp.Process( 66 | target=EL_Oracle_TopologyBuilder, 67 | args=[ 68 | sr, 69 | m_id, 70 | l, 71 | g, 72 | my_config, 73 | args.iterations, 74 | args.log_dir, 75 | log_level[args.log_level], 76 | ], 77 | ) 78 | ) 79 | 80 | for r in range(0, procs_per_machine): 81 | processes.append( 82 | mp.Process( 83 | target=EL_Oracle_Client, 84 | args=[ 85 | r, 86 | m_id, 87 | l, 88 | g, 89 | my_config, 90 | args.iterations, 91 | args.log_dir, 92 | args.weights_store_dir, 93 | log_level[args.log_level], 94 | args.test_after, 95 | args.train_evaluate_after, 96 | args.reset_optimizer, 97 | ], 98 | ) 99 | ) 100 | 101 | for p in processes: 102 | p.start() 103 | 104 | for p in processes: 105 | p.join() 106 | -------------------------------------------------------------------------------- /eval/testingFederated.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from shutil import copy 4 | 5 | from localconfig import LocalConfig 6 | from torch import multiprocessing as mp 7 | 8 | from decentralizepy import utils 9 | from decentralizepy.graphs.Graph import Graph 10 | from decentralizepy.mappings.Linear import Linear 11 | from decentralizepy.node.DPSGDNodeFederated import DPSGDNodeFederated 12 | from decentralizepy.node.FederatedParameterServer import FederatedParameterServer 13 | 14 | 15 | def read_ini(file_path): 16 | config = LocalConfig(file_path) 17 | for section in config: 18 | print("Section: ", 
section) 19 | for key, value in config.items(section): 20 | print((key, value)) 21 | print(dict(config.items("DATASET"))) 22 | return config 23 | 24 | 25 | if __name__ == "__main__": 26 | args = utils.get_args() 27 | 28 | Path(args.log_dir).mkdir(parents=True, exist_ok=True) 29 | 30 | log_level = { 31 | "INFO": logging.INFO, 32 | "DEBUG": logging.DEBUG, 33 | "WARNING": logging.WARNING, 34 | "ERROR": logging.ERROR, 35 | "CRITICAL": logging.CRITICAL, 36 | } 37 | 38 | config = read_ini(args.config_file) 39 | my_config = dict() 40 | for section in config: 41 | my_config[section] = dict(config.items(section)) 42 | 43 | copy(args.config_file, args.log_dir) 44 | copy(args.graph_file, args.log_dir) 45 | utils.write_args(args, args.log_dir) 46 | 47 | g = Graph() 48 | g.read_graph_from_file(args.graph_file, args.graph_type) 49 | n_machines = args.machines 50 | procs_per_machine = args.procs_per_machine[0] 51 | l = Linear(n_machines, procs_per_machine) 52 | m_id = args.machine_id 53 | 54 | sm = args.server_machine 55 | sr = args.server_rank 56 | 57 | processes = [] 58 | if sm == m_id: 59 | processes.append( 60 | mp.Process( 61 | target=FederatedParameterServer, 62 | args=[ 63 | sr, 64 | m_id, 65 | l, 66 | g, 67 | my_config, 68 | args.iterations, 69 | args.log_dir, 70 | args.weights_store_dir, 71 | log_level[args.log_level], 72 | args.test_after, 73 | args.train_evaluate_after, 74 | args.working_rate, 75 | ], 76 | ) 77 | ) 78 | 79 | for r in range(0, procs_per_machine): 80 | processes.append( 81 | mp.Process( 82 | target=DPSGDNodeFederated, 83 | args=[ 84 | r, 85 | m_id, 86 | l, 87 | g, 88 | my_config, 89 | args.iterations, 90 | args.log_dir, 91 | args.weights_store_dir, 92 | log_level[args.log_level], 93 | args.test_after, 94 | args.train_evaluate_after, 95 | args.reset_optimizer, 96 | ], 97 | ) 98 | ) 99 | 100 | for p in processes: 101 | p.start() 102 | 103 | for p in processes: 104 | p.join() 105 | -------------------------------------------------------------------------------- /eval/plot_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | 9 | 10 | 11 | def plot(x, y, label, *args): 12 | plt.plot(x, y, *args, label=label) 13 | plt.legend() 14 | 15 | 16 | def reject_outliers(data, m=2.0): 17 | d = np.abs(data - np.median(data)) 18 | mdev = np.median(d) 19 | s = d / (mdev if mdev else 1.0) 20 | return data[s < m] 21 | 22 | 23 | def plot_model(path, title): 24 | model_path = os.path.join(path, "plots") 25 | Path(model_path).mkdir(parents=True, exist_ok=True) 26 | files = [f for f in os.listdir(path) if f.endswith("json")] 27 | for file in files: 28 | filepath = os.path.join(path, file) 29 | with open(filepath, "r") as inf: 30 | model_vec = json.load(inf) 31 | del model_vec["order"] 32 | del model_vec["shapes"] 33 | model_vec = np.array(model_vec[list(model_vec.keys())[0]]) 34 | num_elements = model_vec.shape[0] 35 | x_axis = np.arange(1, num_elements + 1) 36 | plt.clf() 37 | plt.title(title) 38 | plot(x_axis, model_vec, "unsorted", ".") 39 | model_vec = np.sort(model_vec) 40 | plot(x_axis, model_vec, "sorted") 41 | plt.savefig(os.path.join(model_path, file[0:-5])) 42 | 43 | 44 | def plot_ratio(path_change, path_val, title): 45 | model_path = os.path.join(path_change, "plots_ratio") 46 | Path(model_path).mkdir(parents=True, exist_ok=True) 47 | files_change = [f for f in os.listdir(path_change)
if f.endswith("json")] 48 | files_val = [f for f in os.listdir(path_val) if f.endswith("json")] 49 | for i, file in enumerate(files_change): 50 | print("Processed ", file) 51 | filepath_change = os.path.join(path_change, file) 52 | filepath_val = os.path.join(path_val, files_val[i]) 53 | with open(filepath_change, "r") as inf: 54 | model_change = json.load(inf) 55 | del model_change["order"] 56 | del model_change["shapes"] 57 | model_change = np.array(model_change[list(model_change.keys())[0]]) 58 | with open(filepath_val, "r") as inf: 59 | model_val = json.load(inf) 60 | del model_val["order"] 61 | del model_val["shapes"] 62 | model_val = np.array(model_val[list(model_val.keys())[0]]) 63 | num_elements = model_val.shape[0] 64 | x_axis = np.arange(1, num_elements + 1) 65 | plt.clf() 66 | plt.title(title) 67 | model_vec = np.divide( 68 | model_change, 69 | model_val, 70 | out=np.zeros_like(model_change), 71 | where=model_val != 0.0, 72 | ) 73 | model_vec = reject_outliers(model_vec) 74 | num_elements = model_vec.shape[0] 75 | x_axis = np.arange(1, num_elements + 1) 76 | plot(x_axis, model_vec, "unsorted", ".") 77 | model_vec = np.sort(model_vec) 78 | plot(x_axis, model_vec, "sorted") 79 | plt.savefig(os.path.join(model_path, file[0:-5])) 80 | 81 | 82 | if __name__ == "__main__": 83 | assert len(sys.argv) == 3 84 | plot_model( 85 | os.path.join(sys.argv[1], "model_change", sys.argv[2]), "Change in Weights" 86 | ) 87 | plot_model(os.path.join(sys.argv[1], "model_val", sys.argv[2]), "Model Parameters") 88 | plot_ratio( 89 | os.path.join(sys.argv[1], "model_change", sys.argv[2]), 90 | os.path.join(sys.argv[1], "model_val", sys.argv[2]), 91 | "Ratio", 92 | ) 93 | -------------------------------------------------------------------------------- /src/decentralizepy/node/PeerSamplerDynamic.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from decentralizepy.graphs.Graph import Graph 4 | from decentralizepy.graphs.Regular import Regular 5 | from decentralizepy.mappings.Mapping import Mapping 6 | from decentralizepy.node.PeerSampler import PeerSampler 7 | 8 | 9 | class PeerSamplerDynamic(PeerSampler): 10 | """ 11 | This class defines the peer sampling service 12 | 13 | """ 14 | 15 | def get_neighbors(self, node, iteration=None): 16 | if iteration is not None: 17 | if iteration > self.iteration: 18 | logging.debug( 19 | "iteration, self.iteration: {}, {}".format( 20 | iteration, self.iteration 21 | ) 22 | ) 23 | assert iteration == self.iteration + 1 24 | self.iteration = iteration 25 | self.graphs.append( 26 | Regular( 27 | self.graph.n_procs, 28 | self.graph_degree, 29 | seed=self.random_seed * 100000 + iteration, 30 | ) 31 | ) 32 | return self.graphs[iteration].neighbors(node) 33 | else: 34 | return self.graph.neighbors(node) 35 | 36 | def __init__( 37 | self, 38 | rank: int, 39 | machine_id: int, 40 | mapping: Mapping, 41 | graph: Graph, 42 | config, 43 | iterations=1, 44 | log_dir=".", 45 | log_level=logging.INFO, 46 | *args 47 | ): 48 | """ 49 | Constructor 50 | 51 | Parameters 52 | ---------- 53 | rank : int 54 | Rank of process local to the machine 55 | machine_id : int 56 | Machine ID on which the process is running 57 | mapping : decentralizepy.mappings 58 | The object containing the mapping rank <--> uid 59 | graph : decentralizepy.graphs 60 | The object containing the global graph 61 | config : dict 62 | A dictionary of configurations.
Must contain the following: 63 | [DATASET] 64 | dataset_package 65 | dataset_class 66 | model_class 67 | [OPTIMIZER_PARAMS] 68 | optimizer_package 69 | optimizer_class 70 | [TRAIN_PARAMS] 71 | training_package = decentralizepy.training.Training 72 | training_class = Training 73 | epochs_per_round = 25 74 | batch_size = 64 75 | iterations : int 76 | Number of iterations (communication steps) for which the model should be trained 77 | log_dir : str 78 | Logging directory 79 | log_level : logging.Level 80 | One of DEBUG, INFO, WARNING, ERROR, CRITICAL 81 | args : optional 82 | Other arguments 83 | 84 | """ 85 | 86 | self.iteration = -1 87 | self.graphs = [] 88 | 89 | nodeConfigs = config["NODE"] 90 | self.graph_degree = nodeConfigs["graph_degree"] 91 | 92 | self.instantiate( 93 | rank, 94 | machine_id, 95 | mapping, 96 | graph, 97 | config, 98 | iterations, 99 | log_dir, 100 | log_level, 101 | *args 102 | ) 103 | 104 | self.run() 105 | 106 | logging.info("Peer Sampler exiting") 107 | -------------------------------------------------------------------------------- /src/decentralizepy/sharing/PlainAverageSharing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from decentralizepy.sharing.Sharing import Sharing 6 | 7 | 8 | class PlainAverageSharing(Sharing): 9 | """ 10 | Class to do plain averaging instead of Metropolis Hastings 11 | 12 | """ 13 | 14 | def __init__( 15 | self, 16 | rank, 17 | machine_id, 18 | communication, 19 | mapping, 20 | graph, 21 | model, 22 | dataset, 23 | log_dir, 24 | compress=False, 25 | compression_package=None, 26 | compression_class=None, 27 | float_precision=None, 28 | ): 29 | """ 30 | Constructor 31 | 32 | Parameters 33 | ---------- 34 | rank : int 35 | Local rank 36 | machine_id : int 37 | Global machine id 38 | communication : decentralizepy.communication.Communication 39 | Communication module used to send and receive messages 40 | mapping : decentralizepy.mappings.Mapping 41 | Mapping (rank, machine_id) -> uid 42 | graph : decentralizepy.graphs.Graph 43 | Graph representing neighbors 44 | model : decentralizepy.models.Model 45 | Model to train 46 | dataset : decentralizepy.datasets.Dataset 47 | Dataset for sharing data. 48 | log_dir : str 49 | Location to write shared_params (only writing for 2 procs per machine) 50 | 51 | """ 52 | super().__init__( 53 | rank, 54 | machine_id, 55 | communication, 56 | mapping, 57 | graph, 58 | model, 59 | dataset, 60 | log_dir, 61 | compress, 62 | compression_package, 63 | compression_class, 64 | float_precision, 65 | ) 66 | self.received_this_round = 0 67 | 68 | def _pre_step(self): 69 | """ 70 | Called at the beginning of step. 71 | 72 | """ 73 | pass 74 | 75 | def _post_step(self): 76 | """ 77 | Called at the end of step.
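Note: the plain average computed in `_averaging` below gives every model the same weight, 1 / (len(peer_deques) + 1); for example, with three received peer models in a round, each of them and the local model contributes 0.25.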
78 | 79 | """ 80 | pass 81 | 82 | def _averaging(self, peer_deques): 83 | """ 84 | Averages the received model with the local model 85 | 86 | """ 87 | self.received_this_round = 0 88 | with torch.no_grad(): 89 | total = dict() 90 | weight = 1 / (len(peer_deques) + 1) 91 | for i, n in enumerate(peer_deques): 92 | self.received_this_round += 1 93 | data = peer_deques[n].popleft() 94 | iteration = data["iteration"] 95 | del data["iteration"] 96 | del data["CHANNEL"] 97 | logging.debug( 98 | "Averaging model from neighbor {} of iteration {}".format( 99 | n, iteration 100 | ) 101 | ) 102 | data = self.deserialized_model(data) 103 | for key, value in data.items(): 104 | if key in total: 105 | total[key] += value * weight 106 | else: 107 | total[key] = value * weight 108 | 109 | for key, value in self.model.state_dict().items(): 110 | total[key] += value * weight 111 | 112 | self.model.load_state_dict(total) 113 | self._post_step() 114 | self.communication_round += 1 115 | 116 | def get_data_to_send(self, *args, **kwargs): 117 | self._pre_step() 118 | data = self.serialized_model() 119 | data["iteration"] = self.communication_round 120 | return data 121 | -------------------------------------------------------------------------------- /generate_graph.py: -------------------------------------------------------------------------------- 1 | import getopt 2 | import sys 3 | 4 | from decentralizepy.graphs.FullyConnected import FullyConnected 5 | from decentralizepy.graphs.Regular import Regular 6 | from decentralizepy.graphs.Ring import Ring 7 | from decentralizepy.graphs.SmallWorld import SmallWorld 8 | from decentralizepy.graphs.Star import Star 9 | 10 | if __name__ == "__main__": 11 | """ 12 | Script to generate a graph file. 13 | 14 | Usage 15 | ----- 16 | python generate_graph.py -g <graph_type> -n <num_nodes> -s <seed> -d <degree> -k <k_over_2> -b <beta> -f <file_name> -a 17 | 18 | Parameters 19 | ---------- 20 | graph_type : str 21 | One of {"Regular", "FullyConnected", "Ring", "SmallWorld", "Star"} 22 | num_nodes : int 23 | Number of nodes in the graph 24 | seed : int, optional 25 | Seed for random number generator 26 | degree : int, optional 27 | Degree of the graph 28 | k_over_2 : int, optional 29 | Parameter for smallworld 30 | beta : float, optional 31 | Parameter for smallworld 32 | file_name : str, optional 33 | Name of the file to write the graph to 34 | a : flag, optional 35 | If set, the graph is written in adjacency list format, otherwise in edge list format 36 | h : flag, optional 37 | Prints this help message 38 | 39 | """ 40 | __doc__ = "Usage: python3 generate_graph.py -g <graph_type> -n <num_nodes> -s <seed> -d <degree> -k <k_over_2> -b <beta> -f <file_name> -a -h" 41 | assert len(sys.argv) >= 2, __doc__ 42 | argumentList = sys.argv[1:] 43 | 44 | options = "hg:n:s:d:k:b:f:a" 45 | 46 | long_options = [ 47 | "graph=", 48 | "nodes=", 49 | "seed=", 50 | "degree=", 51 | "kover2=", 52 | "beta=", 53 | "file=", 54 | "adjacency", 55 | "help", 56 | ] 57 | 58 | try: 59 | arguments, values = getopt.getopt(argumentList, options, long_options) 60 | 61 | graph_type = None 62 | num_nodes = None 63 | seed = None 64 | degree = None 65 | k_over_2 = None 66 | beta = None 67 | file_name = None 68 | type_adjacency = "edges" 69 | 70 | for currentArgument, currentValue in arguments: 71 | if currentArgument in ("-h", "--help"): 72 | print(__doc__) 73 | exit(0) 74 | elif currentArgument in ("-g", "--graph"): 75 | graph_type = currentValue 76 | elif currentArgument in ("-n", "--nodes"): 77 | num_nodes = int(currentValue) 78 | elif currentArgument in ("-s", "--seed"): 79 | seed = int(currentValue) 80 | elif currentArgument in ("-d", "--degree"):
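# Illustrative invocations (a sketch; the output file names are placeholders):
#
#   python generate_graph.py -g Regular -n 96 -d 4 -s 90 -f 96_regular.edges
#   python generate_graph.py -g SmallWorld -n 96 -k 2 -b 0.5 -f smallworld_96.edges
#   python generate_graph.py -g Ring -n 16 -f ring_16.adj -a
#
# Passing -a writes the adjacency-list format instead of the default
# edge-list format (see Graph.write_graph_to_file).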
81 | degree = int(currentValue) 82 | elif currentArgument in ("-k", "--kover2"): 83 | k_over_2 = int(currentValue) 84 | elif currentArgument in ("-b", "--beta"): 85 | beta = float(currentValue) 86 | elif currentArgument in ("-f", "--file"): 87 | file_name = currentValue 88 | elif currentArgument in ("-a", "--adjacency"): 89 | type_adjacency = "adjacency" 90 | 91 | if graph_type == "Regular": 92 | g = Regular(num_nodes, degree, seed) 93 | elif graph_type == "FullyConnected": 94 | g = FullyConnected(num_nodes) 95 | elif graph_type == "Ring": 96 | g = Ring(num_nodes) 97 | elif graph_type == "SmallWorld": 98 | g = SmallWorld(num_nodes, k_over_2, beta) 99 | elif graph_type == "Star": 100 | g = Star(num_nodes) 101 | else: 102 | raise ValueError("Invalid graph type: " + graph_type) 103 | 104 | if file_name is not None: 105 | g.write_graph_to_file(file_name, type=type_adjacency) 106 | else: 107 | raise ValueError("No file name. " + __doc__) 108 | except getopt.error as err: 109 | print(str(err)) 110 | sys.exit(2) 111 | -------------------------------------------------------------------------------- /src/decentralizepy/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | 6 | 7 | def conditional_value(var, nul, default): 8 | """ 9 | Set the value to default if nul. 10 | 11 | Parameters 12 | ---------- 13 | var : any 14 | The value 15 | nul : any 16 | The null value. Assigns default if var == nul 17 | default : any 18 | The default value 19 | 20 | Returns 21 | ------- 22 | type(var) 23 | The final value 24 | 25 | """ 26 | if var != nul: 27 | return var 28 | else: 29 | return default 30 | 31 | 32 | def remove_keys(d, keys_to_remove): 33 | """ 34 | Removes given keys from the dict. Returns a new list. 35 | 36 | Parameters 37 | ---------- 38 | d : dict 39 | The initial dictionary 40 | keys_to_remove : list 41 | List of keys to remove from dict 42 | 43 | Returns 44 | ------- 45 | dict 46 | A new dictionary with the given keys removed. 47 | 48 | """ 49 | return {key: d[key] for key in d if key not in keys_to_remove} 50 | 51 | 52 | def get_args(): 53 | """ 54 | Utility to parse arguments. 
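Note that -ps/--procs_per_machine is declared with nargs="+" below, so it parses to a list of ints: launchers built on the Linear mapping read args.procs_per_machine[0], while Manual-mapping launchers such as eval/testingPeerSamplerDynamicManual.py use the whole per-machine list.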
55 | 56 | Returns 57 | ------- 58 | args 59 | Command line arguments 60 | 61 | """ 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("-mid", "--machine_id", type=int, default=0) 64 | parser.add_argument("-ps", "--procs_per_machine", type=int, default=1, nargs="+") 65 | parser.add_argument("-ms", "--machines", type=int, default=1) 66 | parser.add_argument( 67 | "-ld", 68 | "--log_dir", 69 | type=str, 70 | default="./{}".format(datetime.datetime.now().isoformat(timespec="minutes")), 71 | ) 72 | parser.add_argument( 73 | "-wsd", 74 | "--weights_store_dir", 75 | type=str, 76 | default="./{}_ws".format(datetime.datetime.now().isoformat(timespec="minutes")), 77 | ) 78 | parser.add_argument("-is", "--iterations", type=int, default=1) 79 | parser.add_argument("-cf", "--config_file", type=str, default="config.ini") 80 | parser.add_argument("-ll", "--log_level", type=str, default="INFO") 81 | parser.add_argument("-gf", "--graph_file", type=str, default="36_nodes.edges") 82 | parser.add_argument("-gt", "--graph_type", type=str, default="edges") 83 | parser.add_argument("-ta", "--test_after", type=int, default=5) 84 | parser.add_argument("-tea", "--train_evaluate_after", type=int, default=1) 85 | parser.add_argument("-ro", "--reset_optimizer", type=int, default=1) 86 | parser.add_argument("-sm", "--server_machine", type=int, default=0) 87 | parser.add_argument("-sr", "--server_rank", type=int, default=-1) 88 | parser.add_argument("-wr", "--working_rate", type=float, default=1.0) 89 | 90 | args = parser.parse_args() 91 | return args 92 | 93 | 94 | def write_args(args, path): 95 | """ 96 | Write arguments to a json file 97 | 98 | Parameters 99 | ---------- 100 | args : args 101 | Command line args 102 | path : str 103 | Location of the file to write to 104 | 105 | """ 106 | data = { 107 | "machine_id": args.machine_id, 108 | "procs_per_machine": args.procs_per_machine, 109 | "machines": args.machines, 110 | "log_dir": args.log_dir, 111 | "weights_store_dir": args.weights_store_dir, 112 | "iterations": args.iterations, 113 | "config_file": args.config_file, 114 | "log_level": args.log_level, 115 | "graph_file": args.graph_file, 116 | "graph_type": args.graph_type, 117 | "test_after": args.test_after, 118 | "train_evaluate_after": args.train_evaluate_after, 119 | "reset_optimizer": args.reset_optimizer, 120 | "working_rate": args.working_rate, 121 | } 122 | with open(os.path.join(path, "args.json"), "w") as of: 123 | json.dump(data, of) 124 | 125 | 126 | def identity(obj): 127 | """ 128 | Identity function 129 | Parameters 130 | ---------- 131 | obj 132 | Some object 133 | Returns 134 | ------- 135 | obj 136 | The same object 137 | """ 138 | return obj 139 | -------------------------------------------------------------------------------- /eval/96_regular.edges: -------------------------------------------------------------------------------- 1 | 96 2 | 0 24 3 | 0 1 4 | 0 26 5 | 0 95 6 | 1 2 7 | 1 0 8 | 1 82 9 | 1 83 10 | 2 33 11 | 2 90 12 | 2 3 13 | 2 1 14 | 3 2 15 | 3 4 16 | 3 14 17 | 3 79 18 | 4 3 19 | 4 12 20 | 4 5 21 | 4 86 22 | 5 64 23 | 5 42 24 | 5 4 25 | 5 6 26 | 6 9 27 | 6 5 28 | 6 62 29 | 6 7 30 | 7 24 31 | 7 8 32 | 7 45 33 | 7 6 34 | 8 81 35 | 8 17 36 | 8 9 37 | 8 7 38 | 9 8 39 | 9 10 40 | 9 53 41 | 9 6 42 | 10 9 43 | 10 11 44 | 10 29 45 | 10 31 46 | 11 80 47 | 11 10 48 | 11 36 49 | 11 12 50 | 12 11 51 | 12 4 52 | 12 13 53 | 12 70 54 | 13 12 55 | 13 53 56 | 13 30 57 | 13 14 58 | 14 3 59 | 14 15 60 | 14 13 61 | 14 47 62 | 15 16 63 | 15 26 64 | 15 14 65 | 16 41 66 | 16 17 67 | 16 15 
68 | 17 8 69 | 17 16 70 | 17 18 71 | 17 83 72 | 18 17 73 | 18 19 74 | 18 95 75 | 18 63 76 | 19 82 77 | 19 18 78 | 19 20 79 | 19 22 80 | 20 19 81 | 20 59 82 | 20 21 83 | 20 22 84 | 21 72 85 | 21 58 86 | 21 20 87 | 21 22 88 | 22 19 89 | 22 20 90 | 22 21 91 | 22 23 92 | 23 24 93 | 23 65 94 | 23 85 95 | 23 22 96 | 24 0 97 | 24 25 98 | 24 23 99 | 24 7 100 | 25 32 101 | 25 24 102 | 25 26 103 | 25 38 104 | 26 0 105 | 26 25 106 | 26 27 107 | 26 15 108 | 27 32 109 | 27 26 110 | 27 28 111 | 27 63 112 | 28 27 113 | 28 92 114 | 28 29 115 | 28 39 116 | 29 10 117 | 29 52 118 | 29 28 119 | 29 30 120 | 30 66 121 | 30 29 122 | 30 13 123 | 30 31 124 | 31 32 125 | 31 10 126 | 31 36 127 | 31 30 128 | 32 25 129 | 32 27 130 | 32 31 131 | 32 33 132 | 33 32 133 | 33 2 134 | 33 84 135 | 33 34 136 | 34 33 137 | 34 50 138 | 34 35 139 | 34 93 140 | 35 57 141 | 35 34 142 | 35 43 143 | 35 36 144 | 36 35 145 | 36 11 146 | 36 37 147 | 36 31 148 | 37 88 149 | 37 36 150 | 37 38 151 | 37 79 152 | 38 25 153 | 38 37 154 | 38 39 155 | 38 49 156 | 39 40 157 | 39 28 158 | 39 77 159 | 39 38 160 | 40 41 161 | 40 91 162 | 40 39 163 | 40 87 164 | 41 16 165 | 41 40 166 | 41 42 167 | 41 51 168 | 42 41 169 | 42 43 170 | 42 5 171 | 43 42 172 | 43 35 173 | 43 44 174 | 44 72 175 | 44 43 176 | 44 75 177 | 44 45 178 | 45 67 179 | 45 44 180 | 45 46 181 | 45 7 182 | 46 76 183 | 46 45 184 | 46 54 185 | 46 47 186 | 47 48 187 | 47 65 188 | 47 14 189 | 47 46 190 | 48 56 191 | 48 49 192 | 48 61 193 | 48 47 194 | 49 48 195 | 49 50 196 | 49 38 197 | 49 71 198 | 50 49 199 | 50 34 200 | 50 51 201 | 50 93 202 | 51 41 203 | 51 50 204 | 51 52 205 | 51 95 206 | 52 51 207 | 52 74 208 | 52 53 209 | 52 29 210 | 53 9 211 | 53 52 212 | 53 13 213 | 53 54 214 | 54 75 215 | 54 53 216 | 54 46 217 | 54 55 218 | 55 56 219 | 55 69 220 | 55 85 221 | 55 54 222 | 56 48 223 | 56 57 224 | 56 69 225 | 56 55 226 | 57 56 227 | 57 89 228 | 57 58 229 | 57 35 230 | 58 57 231 | 58 59 232 | 58 21 233 | 58 86 234 | 59 73 235 | 59 58 236 | 59 20 237 | 59 60 238 | 60 62 239 | 60 59 240 | 60 61 241 | 60 78 242 | 61 48 243 | 61 62 244 | 61 60 245 | 61 94 246 | 62 60 247 | 62 61 248 | 62 6 249 | 62 63 250 | 63 64 251 | 63 18 252 | 63 27 253 | 63 62 254 | 64 65 255 | 64 84 256 | 64 5 257 | 64 63 258 | 65 64 259 | 65 66 260 | 65 23 261 | 65 47 262 | 66 65 263 | 66 89 264 | 66 67 265 | 66 30 266 | 67 80 267 | 67 66 268 | 67 68 269 | 67 45 270 | 68 67 271 | 68 92 272 | 68 69 273 | 68 94 274 | 69 56 275 | 69 68 276 | 69 70 277 | 69 55 278 | 70 90 279 | 70 12 280 | 70 69 281 | 70 71 282 | 71 72 283 | 71 49 284 | 71 70 285 | 71 87 286 | 72 73 287 | 72 44 288 | 72 21 289 | 72 71 290 | 73 72 291 | 73 91 292 | 73 59 293 | 73 74 294 | 74 73 295 | 74 75 296 | 74 52 297 | 74 76 298 | 75 74 299 | 75 44 300 | 75 54 301 | 75 76 302 | 76 74 303 | 76 75 304 | 76 77 305 | 76 46 306 | 77 81 307 | 77 76 308 | 77 78 309 | 77 39 310 | 78 88 311 | 78 60 312 | 78 77 313 | 78 79 314 | 79 80 315 | 79 3 316 | 79 37 317 | 79 78 318 | 80 81 319 | 80 67 320 | 80 11 321 | 80 79 322 | 81 8 323 | 81 82 324 | 81 80 325 | 81 77 326 | 82 81 327 | 82 1 328 | 82 83 329 | 82 19 330 | 83 1 331 | 83 82 332 | 83 84 333 | 83 17 334 | 84 64 335 | 84 33 336 | 84 83 337 | 84 85 338 | 85 84 339 | 85 55 340 | 85 86 341 | 85 23 342 | 86 58 343 | 86 4 344 | 86 85 345 | 86 87 346 | 87 40 347 | 87 88 348 | 87 86 349 | 87 71 350 | 88 89 351 | 88 37 352 | 88 78 353 | 88 87 354 | 89 88 355 | 89 57 356 | 89 66 357 | 89 90 358 | 90 89 359 | 90 2 360 | 90 91 361 | 90 70 362 | 91 40 363 | 91 73 364 | 91 90 365 | 91 92 366 | 92 93 367 | 92 91 
368 | 92 68 369 | 92 28 370 | 93 50 371 | 93 34 372 | 93 94 373 | 93 92 374 | 94 93 375 | 94 68 376 | 94 61 377 | 94 95 378 | 95 0 379 | 95 18 380 | 95 51 381 | 95 94 382 | -------------------------------------------------------------------------------- /src/decentralizepy/compression/Quantization.py: -------------------------------------------------------------------------------- 1 | # Quantize to [-k, k] 2 | 3 | import pickle 4 | 5 | import numpy as np 6 | 7 | from decentralizepy.compression.Compression import Compression 8 | 9 | 10 | class Quantization(Compression): 11 | """ 12 | Compress metadata and quantize parameters 13 | 14 | """ 15 | 16 | def __init__(self, float_precision: int = 2**15 - 1, *args, **kwargs): 17 | """ 18 | Constructor 19 | 20 | Parameters 21 | ---------- 22 | float_precision : int, optional 23 | Quantization parameter 24 | """ 25 | super().__init__(*args, **kwargs) 26 | self.k = float_precision 27 | 28 | def compress_float(self, x): 29 | """ 30 | compression function for float arrays 31 | 32 | Parameters 33 | ---------- 34 | x : np.ndarray 35 | Data to compress 36 | 37 | Returns 38 | ------- 39 | bytearray 40 | encoded data as bytes 41 | 42 | """ 43 | 44 | # Compute scale factor; it must equal the normalization factor below, since decompress_float multiplies by it to denormalize 45 | scale_factor = np.max(np.abs(x)) / self.k 46 | # scale_factor = np.mean(np.abs(x)) / self.k 47 | 48 | # Normalize x to [-k, k] 49 | norm_factor = np.max(np.abs(x)) / self.k 50 | x = x / norm_factor 51 | x = x.round().astype(np.int32) 52 | 53 | # Get the maximum absolute value from the input array 54 | max_abs = np.max(np.abs(x)) 55 | 56 | # Get the nearest power of 2 greater than or equal to max_abs 57 | nearest_pow_2 = 2 ** np.ceil(np.log2(max_abs)) 58 | 59 | # Check if nearest_pow_2 is the same as max_abs 60 | if nearest_pow_2 == max_abs: 61 | nearest_pow_2 = nearest_pow_2 * 2 62 | 63 | # Calculate the number of bits required to represent the nearest power of 2 64 | num_bits = int(np.ceil(np.log2(nearest_pow_2))) + 1 65 | 66 | # Make all numbers of x positive 67 | x = x + nearest_pow_2 - 1 68 | 69 | x = np.asarray(x, dtype=np.uint32) 70 | 71 | # Create a numpy array of shape (x.shape, num_bits) and fill it with zeros 72 | bit_rep = np.zeros((x.shape[0], num_bits), dtype=np.uint8) 73 | 74 | # Iterate over x and convert each number to binary 75 | for i in range(len(x)): 76 | str_bit = np.binary_repr(x[i], width=num_bits) 77 | array_bit = np.array(list(str_bit), dtype=np.uint8) 78 | indices_with_1 = np.where(array_bit == 1)[0] 79 | bit_rep[i][indices_with_1] = 1 80 | 81 | bit_rep = bit_rep.reshape(-1) 82 | 83 | # Pack the bits into minimum number of bytes 84 | intermediate_rep = np.packbits(bit_rep, bitorder="little") 85 | padding = np.array([0], dtype=np.uint8) 86 | if bit_rep.shape[0] % 8: 87 | padding = np.array([8 - (bit_rep.shape[0] % 8)], dtype=np.uint8) 88 | num_bits = np.array([num_bits], dtype=np.uint8) 89 | to_send = np.concatenate((padding, num_bits, intermediate_rep), dtype=np.uint8) 90 | 91 | return pickle.dumps((scale_factor, to_send)) 92 | 93 | def decompress_float(self, bytes): 94 | """ 95 | decompression function for compressed float arrays 96 | 97 | Parameters 98 | ---------- 99 | bytes : bytearray 100 | compressed data 101 | 102 | Returns 103 | ------- 104 | np.ndarray 105 | decompressed data as array 106 | 107 | """ 108 | # Extract scale_factor and x from bytes 109 | scale_factor, x = pickle.loads(bytes) 110 | 111 | # Extract padding and num_bits from x 112 | padding = -x[0].item() if x[0].item() else None 113 | num_bits = x[1].item() 114 | rest_of_x =
x[2:].astype(np.uint8) 115 | 116 | # Unpack rest_of_x and reshape it 117 | received_x = np.unpackbits(rest_of_x, bitorder="little", count=padding) 118 | received_x = received_x.reshape((-1, num_bits)).astype(np.uint8) 119 | 120 | # Initialize an int32 array with the same number of rows as received_x 121 | output = np.zeros(received_x.shape[0], dtype=np.int32) 122 | 123 | # Convert each row into an integer 124 | for i in range(received_x.shape[0]): 125 | output[i] = ( 126 | int("".join(received_x[i].astype(str)), 2) - (2 ** (num_bits - 1)) + 1 127 | ) 128 | 129 | # Denormalize the output 130 | output = output * scale_factor 131 | 132 | return output.astype(np.float32) 133 | -------------------------------------------------------------------------------- /eval/plot_percentile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from matplotlib import pyplot as plt 9 | 10 | 11 | def get_stats(l): 12 | assert len(l) > 0 13 | mean_dict, stdev_dict, min_dict, max_dict = {}, {}, {}, {} 14 | for key in l[0].keys(): 15 | all_nodes = [i[key] for i in l] 16 | all_nodes = np.array(all_nodes) 17 | mean = np.mean(all_nodes) 18 | std = np.std(all_nodes) 19 | min = np.min(all_nodes) 20 | max = np.max(all_nodes) 21 | mean_dict[int(key)] = mean 22 | stdev_dict[int(key)] = std 23 | min_dict[int(key)] = min 24 | max_dict[int(key)] = max 25 | return mean_dict, stdev_dict, min_dict, max_dict 26 | 27 | 28 | def plot(means, stdevs, mins, maxs, title, label, loc): 29 | plt.title(title) 30 | plt.xlabel("communication rounds") 31 | x_axis = list(means.keys()) 32 | y_axis = list(means.values()) 33 | err = list(stdevs.values()) 34 | plt.errorbar(x_axis, y_axis, yerr=err, label=label) 35 | plt.legend(loc=loc) 36 | 37 | 38 | def plot_results(path): 39 | """ 40 | Plots the percentiles. 41 | Based on plot.py 42 | Parameters 43 | ---------- 44 | path 45 | path to the folders from which to create the percentiles plots 46 | 47 | """ 48 | folders = os.listdir(path) 49 | folders.sort() 50 | print("Reading folders from: ", path) 51 | print("Folders: ", folders) 52 | for folder in folders: 53 | folder_path = os.path.join(path, folder) 54 | if not os.path.isdir(folder_path): 55 | continue 56 | results = [] 57 | all_shared_params = [] 58 | machine_folders = os.listdir(folder_path) 59 | for machine_folder in machine_folders: 60 | mf_path = os.path.join(folder_path, machine_folder) 61 | if not os.path.isdir(mf_path): 62 | continue 63 | files = os.listdir(mf_path) 64 | shared_params = [f for f in files if f.endswith("_shared_parameters.json")] 65 | files = [f for f in files if f.endswith("_results.json")] 66 | for f in files: 67 | filepath = os.path.join(mf_path, f) 68 | with open(filepath, "r") as inf: 69 | results.append(json.load(inf)) 70 | for sp in shared_params: 71 | filepath = os.path.join(mf_path, sp) 72 | with open(filepath, "r") as spf: 73 | all_shared_params.append(np.array(json.load(spf), dtype=np.int32)) 74 | 75 | # Figure 1: percentiles of the shared parameters 76 | plt.figure(1) 77 | # Average of the shared parameters 78 | mean = np.mean(all_shared_params, axis=0) 79 | std = np.std(all_shared_params, axis=0) 80 | with open( 81 | os.path.join(path, "shared_params_avg_" + folder + ".json"), "w" 82 | ) as mf: 83 | json.dump(mean.tolist(), mf) 84 | 85 | with open( 86 | os.path.join(path, "shared_params_std_" + folder + ".json"), "w" 87 | ) as sf: 88 | json.dump(std.tolist(), sf) 89 | 90 | # copy jupyter
notebook code 91 | percentile = np.percentile(mean, np.arange(0, 100, 1)) 92 | plt.plot(np.arange(0, 100, 1), percentile, label=folder) 93 | plt.title("Shared parameters Percentiles") 94 | # plt.ylabel("Absolute frequency value") 95 | plt.xlabel("Percentiles") 96 | plt.xticks(np.arange(0, 110, 10)) 97 | plt.legend(loc="lower right") 98 | 99 | plt.figure(2) 100 | sort = torch.sort(torch.tensor(mean)).values 101 | print(sort) 102 | length = sort.shape[0] 103 | length = int(length / 20) 104 | bins = [ 105 | torch.sum(sort[length * i : length * (i + 1)]).item() for i in range(20) 106 | ] 107 | total = np.sum(bins) 108 | perc = bins / total # np.divide(bins, total) 109 | print(perc) 110 | plt.bar(np.arange(0, 97.5, 5), perc, width=5, align="edge", label=folder) 111 | 112 | plt.title("Shared parameters Percentiles") 113 | # plt.ylabel("Absolute frequency value") 114 | plt.xlabel("Percentiles") 115 | plt.legend(loc="lower right") 116 | plt.savefig(os.path.join(path, f"percentiles_histogram_{folder}.png"), dpi=300) 117 | plt.clf() 118 | plt.cla() 119 | 120 | plt.figure(1) 121 | plt.savefig(os.path.join(path, "percentiles.png"), dpi=300) 122 | 123 | 124 | if __name__ == "__main__": 125 | assert len(sys.argv) == 2 126 | plot_results(sys.argv[1]) 127 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://upload.wikimedia.org/wikipedia/commons/f/f4/Logo_EPFL.svg 2 | :alt: EPFL logo 3 | :width: 75px 4 | :align: right 5 | 6 | ============== 7 | decentralizepy 8 | ============== 9 | 10 | decentralizepy is a framework for running distributed applications (particularly ML) on top of arbitrary topologies (decentralized, federated, parameter server). 11 | It was primarily conceived for assessing scientific ideas on several aspects of distributed learning (communication efficiency, privacy, data heterogeneity etc.). 12 | 13 | ------------------------- 14 | Setting up decentralizepy 15 | ------------------------- 16 | 17 | * Fork the repository. 18 | * Clone and enter your local repository. 19 | * Check if you have ``python>=3.8``. :: 20 | 21 | python --version 22 | 23 | * (Optional) Create and activate a virtual environment. :: 24 | 25 | python3 -m venv [venv-name] 26 | source [venv-name]/bin/activate 27 | 28 | * Update pip. :: 29 | 30 | pip3 install --upgrade pip 31 | pip install --upgrade pip 32 | 33 | * On Mac M1, installing ``pyzmq`` fails with `pip`. Use `conda `_. 34 | * Install decentralizepy for development. (zsh) :: 35 | 36 | pip3 install --editable .\[dev\] 37 | 38 | * Install decentralizepy for development. (bash) :: 39 | 40 | pip3 install --editable .[dev] 41 | 42 | * Download CIFAR-10 using ``download_dataset.py``. :: 43 | 44 | python download_dataset.py 45 | 46 | * (Optional) Download other datasets from LEAF and place them in ``eval/data/``. 47 | 48 | ---------------- 49 | Running the code 50 | ---------------- 51 | 52 | * Follow the tutorial in ``tutorial/``. OR, 53 | * Generate a new graph file with the required topology using ``generate_graph.py``. :: 54 | 55 | python generate_graph.py --help 56 | 57 | * Choose and modify one of the config files in ``eval/{step,epoch}_configs``. 58 | * Modify the dataset paths and ``addresses_filepath`` in the config file. 59 | * In eval/run.sh, modify arguments as required. 60 | * Execute eval/run.sh on all the machines simultaneously. 
There is a synchronization barrier mechanism at the start so that all processes start training together. 61 | 62 | ------ 63 | Citing 64 | ------ 65 | 66 | Cite us as :: 67 | 68 | @inproceedings{decentralizepy, 69 | author = {Dhasade, Akash and Kermarrec, Anne-Marie and Pires, Rafael and Sharma, Rishi and Vujasinovic, Milos}, 70 | title = {Decentralized Learning Made Easy with DecentralizePy}, 71 | year = {2023}, 72 | isbn = {9798400700842}, 73 | publisher = {Association for Computing Machinery}, 74 | address = {New York, NY, USA}, 75 | url = {https://doi.org/10.1145/3578356.3592587}, 76 | doi = {10.1145/3578356.3592587}, 77 | booktitle = {Proceedings of the 3rd Workshop on Machine Learning and Systems}, 78 | pages = {34–41}, 79 | numpages = {8}, 80 | keywords = {peer-to-peer, distributed systems, machine learning, middleware, decentralized learning, network topology}, 81 | location = {Rome, Italy}, 82 | series = {EuroMLSys '23} 83 | } 84 | 85 | ------------------------- 86 | Built with DecentralizePy 87 | ------------------------- 88 | 89 | .. _`Epidemic Learning`: https://arxiv.org/abs/2310.01972/ 90 | 91 | `Epidemic Learning`_ 92 | -------------------- 93 | 94 | Tutorial 95 | ``tutorial/EpidemicLearning`` 96 | Source files 97 | ``src/node/EpidemicLearning/`` 98 | Cite 99 | ``Martijn de Vos, Sadegh Farhadkhani, Rachid Guerraoui, Anne-Marie Kermarrec, Rafael Pires, and Rishi Sharma. Epidemic Learning: Boosting Decentralized Learning with Randomized Communication. In Thirty-seventh Conference on Neural Information Processing Systems (NeurIPS), 2023.`` 100 | 101 | .. _`Get More for Less in Decentralized Learning Systems`: https://ieeexplore.ieee.org/document/10272515/ 102 | 103 | `Get More for Less in Decentralized Learning Systems`_ 104 | ------------------------------------------------------ 105 | 106 | Tutorial 107 | ``tutorial/JWINS`` 108 | Source files 109 | ``src/sharing/JWINS/`` 110 | Cite 111 | ``Akash Dhasade, Anne-Marie Kermarrec, Rafael Pires, Rishi Sharma, Jeffrey Wigger, and Milos Vujasinovic. Get More for Less in Decentralized Learning Systems. In IEEE 43rd International Conference on Distributed Computing Systems (ICDCS), 2023.`` 112 | 113 | 114 | ------------ 115 | Contributing 116 | ------------ 117 | 118 | * ``isort`` and ``black`` are installed along with the package for code linting. 119 | * While in the root directory of the repository, before committing the changes, please run :: 120 | 121 | black . 122 | isort . 123 | 124 | ------- 125 | Modules 126 | ------- 127 | 128 | Following are the modules of decentralizepy: 129 | 130 | Node 131 | ---- 132 | * The Manager. Optimizations at process level. 133 | 134 | Dataset 135 | ------- 136 | * Static 137 | 138 | Training 139 | -------- 140 | * Heterogeneity. How much do I want to work? 141 | 142 | Graph 143 | ----- 144 | * Static. Who are my neighbours? Topologies. 145 | 146 | Mapping 147 | ------- 148 | * Naming. The globally unique ids of the ``processes <-> machine_id, local_rank`` 149 | 150 | Sharing 151 | ------- 152 | * Leverage Redundancy. Privacy. Optimizations in model and data sharing. 153 | 154 | Communication 155 | ------------- 156 | * IPC/Network level. Compression. Privacy. 
Reliability 157 | 158 | Model 159 | ----- 160 | * Learning Model 161 | -------------------------------------------------------------------------------- /src/decentralizepy/node/DPSGDWithPeerSampler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import os 4 | from collections import deque 5 | 6 | import torch 7 | 8 | from decentralizepy.graphs.Graph import Graph 9 | from decentralizepy.mappings.Mapping import Mapping 10 | from decentralizepy.node.DPSGDNode import DPSGDNode 11 | 12 | 13 | class DPSGDWithPeerSampler(DPSGDNode): 14 | """ 15 | This class defines the node for DPSGD 16 | 17 | """ 18 | 19 | def receive_neighbors(self): 20 | return self.receive_channel("PEERS")[1]["NEIGHBORS"] 21 | 22 | def get_neighbors(self, node=None): 23 | logging.debug("Requesting neighbors from the peer sampler.") 24 | self.communication.send( 25 | self.peer_sampler_uid, 26 | { 27 | "REQUEST_NEIGHBORS": self.uid, 28 | "iteration": self.iteration, 29 | "CHANNEL": "SERVER_REQUEST", 30 | }, 31 | ) 32 | my_neighbors = self.receive_neighbors() 33 | logging.debug("Neighbors this round: {}".format(my_neighbors)) 34 | return my_neighbors 35 | 36 | def __init__( 37 | self, 38 | rank: int, 39 | machine_id: int, 40 | mapping: Mapping, 41 | graph: Graph, 42 | config, 43 | iterations=1, 44 | log_dir=".", 45 | weights_store_dir=".", 46 | log_level=logging.INFO, 47 | test_after=5, 48 | train_evaluate_after=1, 49 | reset_optimizer=1, 50 | peer_sampler_uid=-1, 51 | *args 52 | ): 53 | """ 54 | Constructor 55 | 56 | Parameters 57 | ---------- 58 | rank : int 59 | Rank of process local to the machine 60 | machine_id : int 61 | Machine ID on which the process is running 62 | mapping : decentralizepy.mappings 63 | The object containing the mapping rank <--> uid 64 | graph : decentralizepy.graphs 65 | The object containing the global graph 66 | config : dict 67 | A dictionary of configurations.
Must contain the following: 68 | [DATASET] 69 | dataset_package 70 | dataset_class 71 | model_class 72 | [OPTIMIZER_PARAMS] 73 | optimizer_package 74 | optimizer_class 75 | [TRAIN_PARAMS] 76 | training_package = decentralizepy.training.Training 77 | training_class = Training 78 | epochs_per_round = 25 79 | batch_size = 64 80 | iterations : int 81 | Number of iterations (communication steps) for which the model should be trained 82 | log_dir : str 83 | Logging directory 84 | weights_store_dir : str 85 | Directory in which to store model weights 86 | log_level : logging.Level 87 | One of DEBUG, INFO, WARNING, ERROR, CRITICAL 88 | test_after : int 89 | Number of iterations after which the test loss and accuracy are calculated 90 | train_evaluate_after : int 91 | Number of iterations after which the train loss is calculated 92 | reset_optimizer : int 93 | 1 if optimizer should be reset every communication round, else 0 94 | args : optional 95 | Other arguments 96 | 97 | """ 98 | 99 | total_threads = os.cpu_count() 100 | self.threads_per_proc = max( 101 | math.floor(total_threads / mapping.get_local_procs_count()), 1 102 | ) 103 | torch.set_num_threads(self.threads_per_proc) 104 | torch.set_num_interop_threads(1) 105 | self.instantiate( 106 | rank, 107 | machine_id, 108 | mapping, 109 | graph, 110 | config, 111 | iterations, 112 | log_dir, 113 | weights_store_dir, 114 | log_level, 115 | test_after, 116 | train_evaluate_after, 117 | reset_optimizer, 118 | *args 119 | ) 120 | logging.info( 121 | "Each proc uses %d threads out of %d.", self.threads_per_proc, total_threads 122 | ) 123 | 124 | self.message_queue["PEERS"] = deque() 125 | 126 | self.peer_sampler_uid = peer_sampler_uid 127 | self.connect_neighbor(self.peer_sampler_uid) 128 | self.wait_for_hello(self.peer_sampler_uid) 129 | 130 | self.run() 131 | 132 | def disconnect_neighbors(self): 133 | """ 134 | Disconnects all neighbors. 135 | 136 | Raises 137 | ------ 138 | RuntimeError 139 | If received another message while waiting for BYEs 140 | 141 | """ 142 | if not self.sent_disconnections: 143 | logging.info("Disconnecting neighbors") 144 | 145 | if self.peer_sampler_uid in self.barrier: 146 | self.communication.send( 147 | self.peer_sampler_uid, 148 | {"BYE": self.uid, "CHANNEL": "SERVER_REQUEST"}, 149 | ) 150 | self.barrier.remove(self.peer_sampler_uid) 151 | 152 | for uid in self.barrier: 153 | self.communication.send(uid, {"BYE": self.uid, "CHANNEL": "DISCONNECT"}) 154 | self.sent_disconnections = True 155 | 156 | while len(self.barrier): 157 | sender, _ = self.receive_disconnect() 158 | self.barrier.remove(sender) 159 | -------------------------------------------------------------------------------- /src/decentralizepy/datasets/Dataset.py: -------------------------------------------------------------------------------- 1 | from decentralizepy import utils 2 | from decentralizepy.mappings.Mapping import Mapping 3 | 4 | 5 | class Dataset: 6 | """ 7 | This class defines the Dataset API. 8 | All datasets must follow this API. 9 | 10 | """ 11 | 12 | def __init__( 13 | self, 14 | rank: int, 15 | machine_id: int, 16 | mapping: Mapping, 17 | random_seed: int = 1234, 18 | only_local=False, 19 | train_dir="", 20 | test_dir="", 21 | sizes="", 22 | test_batch_size="", 23 | validation_source="", 24 | validation_size="", 25 | ): 26 | """ 27 | Constructor which reads the data files, instantiates and partitions the dataset 28 | 29 | Parameters 30 | ---------- 31 | rank : int 32 | Rank of the current process (to get the partition).
33 | machine_id : int 34 | Machine ID 35 | mapping : decentralizepy.mappings.Mapping 36 | Mapping to convert rank, machine_id -> uid for data partitioning 37 | It also provides the total number of global processes 38 | random_seed : int, optional 39 | Random seed for the dataset 40 | only_local : bool, optional 41 | True if the dataset needs to be partitioned only among local procs, False otherwise 42 | train_dir : str, optional 43 | Path to the training data files. Required to instantiate the training set 44 | The training set is partitioned according to the number of global processes and sizes 45 | test_dir : str, optional 46 | Path to the testing data files. Required to instantiate the testing set 47 | sizes : list(float), optional 48 | A list of fractions specifying how much data to allot each process. Sum of fractions should be 1.0 49 | By default, each process gets an equal amount. 50 | test_batch_size : int, optional 51 | Batch size during testing. Default value is 64 52 | validation_source : str, optional 53 | Source of the validation set. Can be one of 'train' or 'test' 54 | validation_size : int, optional 55 | Size of the test set used as the validation set 56 | """ 57 | self.rank = rank 58 | self.machine_id = machine_id 59 | self.mapping = mapping 60 | self.random_seed = random_seed 61 | self.uid = self.mapping.get_uid(rank, machine_id) 62 | self.only_local = only_local 63 | self.dataset_id = self.rank if self.only_local else self.uid 64 | self.num_partitions = ( 65 | self.mapping.get_local_procs_count() 66 | if self.only_local 67 | else self.mapping.get_n_procs() 68 | ) 69 | self.train_dir = utils.conditional_value(train_dir, "", None) 70 | self.test_dir = utils.conditional_value(test_dir, "", None) 71 | self.sizes = utils.conditional_value(sizes, "", None) 72 | self.test_batch_size = utils.conditional_value(test_batch_size, "", 64) 73 | self.num_classes = None 74 | self.validation_size = utils.conditional_value(validation_size, "", None) 75 | self.validation_source = utils.conditional_value(validation_source, "", None) 76 | 77 | if self.sizes: 78 | if isinstance(self.sizes, str): 79 | self.sizes = eval(self.sizes) 80 | 81 | if train_dir: 82 | self.__training__ = True 83 | else: 84 | self.__training__ = False 85 | 86 | if test_dir: 87 | self.__testing__ = True 88 | else: 89 | self.__testing__ = False 90 | 91 | if self.validation_size and self.validation_source: 92 | self.__validating__ = True 93 | else: 94 | self.__validating__ = False 95 | 96 | self.label_distribution = None 97 | 98 | def get_label_distribution(self): 99 | # Only supported for classification 100 | if self.label_distribution is None: 101 | self.label_distribution = [0 for _ in range(self.num_classes)] 102 | tr_set = self.get_trainset() 103 | for _, ys in tr_set: 104 | for y in ys: 105 | y_val = y.item() 106 | self.label_distribution[y_val] += 1 107 | 108 | return self.label_distribution 109 | 110 | def get_trainset(self): 111 | """ 112 | Function to get the training set 113 | 114 | Returns 115 | ------- 116 | torch.utils.Dataset(decentralizepy.datasets.Data) 117 | 118 | Raises 119 | ------ 120 | RuntimeError 121 | If the training set was not initialized 122 | 123 | """ 124 | raise NotImplementedError 125 | 126 | def get_testset(self): 127 | """ 128 | Function to get the test set 129 | 130 | Returns 131 | ------- 132 | torch.utils.Dataset(decentralizepy.datasets.Data) 133 | 134 | Raises 135 | ------ 136 | RuntimeError 137 | If the test set was not initialized 138 | 139 | """ 140 | raise NotImplementedError 141 | 142 | def
get_validationset(self): 143 | """ 144 | Function to get the validation set 145 | 146 | Returns 147 | ------- 148 | torch.utils.Dataset(decentralizepy.datasets.Data) 149 | 150 | Raises 151 | ------ 152 | RuntimeError 153 | If the validation set was not initialized 154 | 155 | """ 156 | raise NotImplementedError 157 | -------------------------------------------------------------------------------- /src/decentralizepy/graphs/Graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | 4 | 5 | class Graph: 6 | """ 7 | This class defines the graph topology. 8 | Adapted from https://gitlab.epfl.ch/sacs/ml-rawdatasharing/dnn-recommender/-/blob/master/api.py 9 | """ 10 | 11 | def __init__(self, n_procs=None): 12 | """ 13 | Constructor 14 | 15 | Parameters 16 | ---------- 17 | n_procs : int, optional 18 | Number of processes in the graph, if already known 19 | 20 | """ 21 | if n_procs is not None: 22 | self.n_procs = n_procs 23 | self.adj_list = [set() for i in range(self.n_procs)] 24 | 25 | def get_all_nodes(self): 26 | return [i for i in range(self.n_procs)] 27 | 28 | def __insert_adj__(self, node, neighbours): 29 | """ 30 | Inserts `neighbours` into the adjacency list of `node` 31 | 32 | Parameters 33 | ---------- 34 | node : int 35 | The vertex in question 36 | neighbours : list(int) 37 | A list of neighbours of the `node` 38 | 39 | """ 40 | self.adj_list[node].update(neighbours) 41 | 42 | def __insert_edge__(self, x, y): 43 | """ 44 | Inserts the undirected edge between `x` and `y` into the graph 45 | 46 | Parameters 47 | ---------- 48 | x : int 49 | The source vertex 50 | y : int 51 | The destination vertex 52 | 53 | """ 54 | self.adj_list[x].add(y) 55 | self.adj_list[y].add(x) 56 | 57 | def read_graph_from_file(self, file, type="edges", force_connect=False): 58 | """ 59 | Reads the graph from a given file 60 | 61 | Parameters 62 | ---------- 63 | file : str 64 | path to the file 65 | type : str 66 | `edges` or `adjacency` 67 | force_connect : bool, optional 68 | Should the graph be force-connected using a ring 69 | 70 | Returns 71 | ------- 72 | int 73 | Number of processes, read from the first line of the file 74 | 75 | Raises 76 | ------ 77 | ValueError 78 | If the type is not either `edges` or `adjacency` 79 | 80 | """ 81 | 82 | with open(file, "r") as inf: 83 | self.n_procs = int(inf.readline().strip()) 84 | self.adj_list = [set() for i in range(self.n_procs)] 85 | 86 | lines = inf.readlines() 87 | if type == "edges": 88 | for line in lines: 89 | x, y = map(int, line.strip().split()) 90 | self.__insert_edge__(x, y) 91 | elif type == "adjacency": 92 | node_id = 0 93 | for line in lines: 94 | neighbours = map(int, line.strip().split()) 95 | self.__insert_adj__(node_id, neighbours) 96 | node_id += 1 97 | else: 98 | raise ValueError("type must be from {edges, adjacency}!") 99 | 100 | if force_connect: 101 | self.connect_graph() 102 | 103 | return self.n_procs 104 | 105 | def write_graph_to_file(self, file, type="edges"): 106 | """ 107 | Writes graph to file 108 | 109 | Parameters 110 | ---------- 111 | file : str 112 | File path 113 | type : str 114 | One of {"edges", "adjacency"}. Writes the corresponding format.
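For example, a 3-node ring (adjacency sets {1, 2}, {0, 2}, {0, 1}) is written in the "edges" format as the line "3" followed by one line per ordered pair: "0 1", "0 2", "1 0", "1 2", "2 0", "2 1"; in the "adjacency" format it is "3" followed by one space-separated neighbour line per node: "1 2", "0 2", "0 1".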
115 | 116 | """ 117 | with open(file, "w") as of: 118 | of.write(str(self.n_procs) + "\n") 119 | if type == "edges": 120 | for node, adj in enumerate(self.adj_list): 121 | for neighbor in adj: 122 | of.write("{} {}".format(node, neighbor) + "\n") 123 | elif type == "adjacency": 124 | for adj in self.adj_list: 125 | of.write(" ".join(map(str, adj)) + "\n") 126 | else: 127 | raise ValueError("type must be from {edges, adjacency}!") 128 | 129 | def connect_graph(self): 130 | """ 131 | Connects the graph using a Ring 132 | 133 | """ 134 | for node in range(self.n_procs): 135 | self.adj_list[node].add((node + 1) % self.n_procs) 136 | self.adj_list[node].add((node - 1) % self.n_procs) 137 | 138 | def neighbors(self, uid): 139 | """ 140 | Gives the neighbors of a node 141 | 142 | Parameters 143 | ---------- 144 | uid : int 145 | globally unique identifier of the node 146 | 147 | Returns 148 | ------- 149 | set(int) 150 | a set of neighbours 151 | 152 | """ 153 | return self.adj_list[uid] 154 | 155 | def centr(self): 156 | my_adj = {x: list(adj) for x, adj in enumerate(self.adj_list)} 157 | nxGraph = nx.Graph(my_adj) 158 | a = nx.to_numpy_matrix(nxGraph) 159 | self.averaging_weights = np.ones((self.n_procs, self.n_procs), dtype=float) 160 | centrality = nx.betweenness_centrality(nxGraph) 161 | for i in range(len(centrality)): 162 | centrality[i] += 0.01 163 | for i in range(self.averaging_weights.shape[0]): 164 | s = 0 165 | for j in range(self.averaging_weights.shape[0]): 166 | self.averaging_weights[i, j] = 1.0 / centrality[j] 167 | s += self.averaging_weights[i, j] 168 | for j in range(self.averaging_weights.shape[0]): 169 | self.averaging_weights[i, j] = self.averaging_weights[i, j] / s 170 | return self.averaging_weights 171 | -------------------------------------------------------------------------------- /eval/80_nodes.edges: -------------------------------------------------------------------------------- 1 | 80 2 | 0 1 3 | 0 3 4 | 0 60 5 | 0 47 6 | 0 79 7 | 0 21 8 | 0 53 9 | 0 28 10 | 1 0 11 | 1 2 12 | 1 70 13 | 1 20 14 | 1 61 15 | 1 30 16 | 2 33 17 | 2 1 18 | 2 3 19 | 2 4 20 | 2 68 21 | 3 0 22 | 3 2 23 | 3 4 24 | 3 14 25 | 3 19 26 | 3 20 27 | 3 25 28 | 3 59 29 | 4 2 30 | 4 3 31 | 4 5 32 | 4 78 33 | 4 16 34 | 5 4 35 | 5 6 36 | 5 7 37 | 5 10 38 | 5 76 39 | 5 79 40 | 5 24 41 | 5 60 42 | 6 17 43 | 6 26 44 | 6 5 45 | 6 7 46 | 7 5 47 | 7 6 48 | 7 8 49 | 7 54 50 | 7 61 51 | 8 7 52 | 8 9 53 | 8 16 54 | 8 19 55 | 8 57 56 | 8 63 57 | 9 68 58 | 9 8 59 | 9 10 60 | 9 77 61 | 9 54 62 | 9 24 63 | 9 27 64 | 9 30 65 | 10 69 66 | 10 5 67 | 10 9 68 | 10 11 69 | 10 45 70 | 10 47 71 | 10 55 72 | 11 10 73 | 11 19 74 | 11 12 75 | 11 71 76 | 12 37 77 | 12 74 78 | 12 11 79 | 12 13 80 | 12 63 81 | 13 65 82 | 13 71 83 | 13 39 84 | 13 76 85 | 13 45 86 | 13 14 87 | 13 12 88 | 13 51 89 | 13 53 90 | 13 54 91 | 13 63 92 | 14 32 93 | 14 64 94 | 14 3 95 | 14 70 96 | 14 13 97 | 14 15 98 | 14 51 99 | 14 23 100 | 14 60 101 | 15 69 102 | 15 40 103 | 15 44 104 | 15 14 105 | 15 16 106 | 15 19 107 | 15 53 108 | 15 22 109 | 15 27 110 | 16 4 111 | 16 70 112 | 16 8 113 | 16 77 114 | 16 15 115 | 16 17 116 | 16 60 117 | 16 31 118 | 17 32 119 | 17 6 120 | 17 40 121 | 17 16 122 | 17 18 123 | 18 32 124 | 18 46 125 | 18 17 126 | 18 19 127 | 18 20 128 | 18 24 129 | 19 3 130 | 19 8 131 | 19 11 132 | 19 75 133 | 19 78 134 | 19 15 135 | 19 18 136 | 19 20 137 | 19 21 138 | 19 55 139 | 19 58 140 | 20 1 141 | 20 65 142 | 20 3 143 | 20 70 144 | 20 18 145 | 20 19 146 | 20 21 147 | 20 22 148 | 21 0 149 | 21 38 150 | 21 46 151 | 21 19 152 | 21 20 153 | 21 22 154
| 21 24 155 | 22 15 156 | 22 51 157 | 22 20 158 | 22 21 159 | 22 23 160 | 22 63 161 | 23 36 162 | 23 14 163 | 23 79 164 | 23 22 165 | 23 24 166 | 24 5 167 | 24 38 168 | 24 37 169 | 24 9 170 | 24 18 171 | 24 21 172 | 24 23 173 | 24 25 174 | 24 61 175 | 25 66 176 | 25 3 177 | 25 58 178 | 25 40 179 | 25 24 180 | 25 26 181 | 26 6 182 | 26 53 183 | 26 25 184 | 26 27 185 | 26 29 186 | 27 71 187 | 27 9 188 | 27 15 189 | 27 48 190 | 27 52 191 | 27 54 192 | 27 26 193 | 27 28 194 | 27 62 195 | 28 0 196 | 28 40 197 | 28 46 198 | 28 56 199 | 28 27 200 | 28 29 201 | 29 48 202 | 29 26 203 | 29 28 204 | 29 30 205 | 30 65 206 | 30 1 207 | 30 34 208 | 30 68 209 | 30 9 210 | 30 29 211 | 30 31 212 | 31 16 213 | 31 32 214 | 31 43 215 | 31 30 216 | 32 33 217 | 32 38 218 | 32 41 219 | 32 44 220 | 32 14 221 | 32 17 222 | 32 18 223 | 32 57 224 | 32 31 225 | 33 32 226 | 33 2 227 | 33 34 228 | 33 69 229 | 33 54 230 | 33 63 231 | 34 38 232 | 34 33 233 | 34 35 234 | 34 30 235 | 35 34 236 | 35 36 237 | 35 38 238 | 35 39 239 | 35 46 240 | 36 35 241 | 36 37 242 | 36 42 243 | 36 46 244 | 36 54 245 | 36 23 246 | 37 36 247 | 37 38 248 | 37 12 249 | 37 76 250 | 37 24 251 | 38 32 252 | 38 34 253 | 38 35 254 | 38 37 255 | 38 39 256 | 38 21 257 | 38 24 258 | 39 40 259 | 39 35 260 | 39 13 261 | 39 38 262 | 40 39 263 | 40 41 264 | 40 15 265 | 40 17 266 | 40 55 267 | 40 25 268 | 40 28 269 | 41 32 270 | 41 42 271 | 41 40 272 | 41 79 273 | 42 64 274 | 42 36 275 | 42 41 276 | 42 43 277 | 42 50 278 | 42 53 279 | 42 55 280 | 42 58 281 | 43 64 282 | 43 68 283 | 43 42 284 | 43 44 285 | 43 31 286 | 44 32 287 | 44 43 288 | 44 45 289 | 44 15 290 | 44 51 291 | 44 62 292 | 45 72 293 | 45 10 294 | 45 44 295 | 45 13 296 | 45 46 297 | 45 50 298 | 46 35 299 | 46 36 300 | 46 76 301 | 46 45 302 | 46 77 303 | 46 47 304 | 46 18 305 | 46 50 306 | 46 21 307 | 46 28 308 | 47 0 309 | 47 10 310 | 47 74 311 | 47 46 312 | 47 48 313 | 48 49 314 | 48 27 315 | 48 29 316 | 48 47 317 | 49 64 318 | 49 48 319 | 49 50 320 | 49 52 321 | 49 54 322 | 50 64 323 | 50 42 324 | 50 76 325 | 50 45 326 | 50 46 327 | 50 49 328 | 50 51 329 | 51 67 330 | 51 72 331 | 51 44 332 | 51 13 333 | 51 14 334 | 51 50 335 | 51 52 336 | 51 22 337 | 51 55 338 | 52 76 339 | 52 49 340 | 52 51 341 | 52 53 342 | 52 27 343 | 53 0 344 | 53 68 345 | 53 42 346 | 53 13 347 | 53 15 348 | 53 52 349 | 53 54 350 | 53 55 351 | 53 26 352 | 54 33 353 | 54 36 354 | 54 7 355 | 54 9 356 | 54 13 357 | 54 49 358 | 54 53 359 | 54 55 360 | 54 57 361 | 54 27 362 | 55 40 363 | 55 10 364 | 55 42 365 | 55 51 366 | 55 19 367 | 55 53 368 | 55 54 369 | 55 56 370 | 56 57 371 | 56 28 372 | 56 55 373 | 57 32 374 | 57 68 375 | 57 8 376 | 57 54 377 | 57 56 378 | 57 58 379 | 58 42 380 | 58 19 381 | 58 25 382 | 58 59 383 | 58 57 384 | 59 64 385 | 59 58 386 | 59 3 387 | 59 60 388 | 60 0 389 | 60 5 390 | 60 74 391 | 60 14 392 | 60 16 393 | 60 59 394 | 60 61 395 | 61 1 396 | 61 68 397 | 61 7 398 | 61 76 399 | 61 24 400 | 61 60 401 | 61 62 402 | 62 70 403 | 62 44 404 | 62 27 405 | 62 61 406 | 62 63 407 | 63 64 408 | 63 33 409 | 63 66 410 | 63 68 411 | 63 8 412 | 63 74 413 | 63 12 414 | 63 13 415 | 63 78 416 | 63 22 417 | 63 62 418 | 64 65 419 | 64 70 420 | 64 42 421 | 64 43 422 | 64 14 423 | 64 49 424 | 64 50 425 | 64 59 426 | 64 63 427 | 65 64 428 | 65 66 429 | 65 13 430 | 65 20 431 | 65 30 432 | 66 65 433 | 66 25 434 | 66 67 435 | 66 63 436 | 67 66 437 | 67 51 438 | 67 68 439 | 68 2 440 | 68 67 441 | 68 69 442 | 68 9 443 | 68 73 444 | 68 43 445 | 68 53 446 | 68 57 447 | 68 61 448 | 68 30 449 | 68 63 450 | 69 33 451 | 69 68 452 
| 69 70 453 | 69 10 454 | 69 15 455 | 70 64 456 | 70 1 457 | 70 69 458 | 70 71 459 | 70 14 460 | 70 16 461 | 70 20 462 | 70 62 463 | 71 70 464 | 71 72 465 | 71 11 466 | 71 13 467 | 71 27 468 | 72 73 469 | 72 51 470 | 72 45 471 | 72 71 472 | 73 72 473 | 73 74 474 | 73 68 475 | 74 73 476 | 74 75 477 | 74 12 478 | 74 47 479 | 74 60 480 | 74 63 481 | 75 74 482 | 75 19 483 | 75 76 484 | 76 37 485 | 76 5 486 | 76 75 487 | 76 13 488 | 76 46 489 | 76 77 490 | 76 50 491 | 76 52 492 | 76 61 493 | 77 9 494 | 77 76 495 | 77 46 496 | 77 78 497 | 77 16 498 | 78 4 499 | 78 77 500 | 78 79 501 | 78 19 502 | 78 63 503 | 79 0 504 | 79 5 505 | 79 41 506 | 79 78 507 | 79 23 508 | -------------------------------------------------------------------------------- /src/decentralizepy/training/text/LLMTraining.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from decentralizepy import utils 6 | from decentralizepy.training.Training import Training 7 | 8 | 9 | class LLMTraining(Training): 10 | """ 11 | This class implements the training module for a single node. 12 | 13 | """ 14 | 15 | def __init__( 16 | self, 17 | rank, 18 | machine_id, 19 | mapping, 20 | model, 21 | optimizer, 22 | loss=None, 23 | log_dir=".", 24 | rounds="", 25 | full_epochs="", 26 | batch_size="", 27 | shuffle="", 28 | ): 29 | """ 30 | Constructor 31 | 32 | Parameters 33 | ---------- 34 | rank : int 35 | Rank of process local to the machine 36 | machine_id : int 37 | Machine ID on which the process is running 38 | mapping : decentralizepy.mappings 39 | The object containing the mapping rank <--> uid 40 | model : torch.nn.Module 41 | Neural Network for training 42 | optimizer : torch.optim 43 | Optimizer to learn parameters 44 | loss : function 45 | Loss function 46 | log_dir : str 47 | Directory to log the model change. 48 | rounds : int, optional 49 | Number of steps/epochs per training call 50 | full_epochs : bool, optional 51 | True if 1 round = 1 epoch. False if 1 round = 1 minibatch 52 | batch_size : int, optional 53 | Number of items to learn over, in one batch 54 | shuffle : bool 55 | True if the dataset should be shuffled before training. 56 | 57 | """ 58 | super().__init__( 59 | rank, 60 | machine_id, 61 | mapping, 62 | model, 63 | optimizer, 64 | loss, 65 | log_dir, 66 | rounds, 67 | full_epochs, 68 | batch_size, 69 | shuffle, 70 | ) 71 | 72 | def eval_loss(self, dataset): 73 | """ 74 | Evaluate the loss on the training set 75 | 76 | Parameters 77 | ---------- 78 | dataset : decentralizepy.datasets.Dataset 79 | The training dataset. Should implement get_trainset(batch_size, shuffle) 80 | 81 | """ 82 | trainset = dataset.get_trainset(self.batch_size, self.shuffle) 83 | epoch_loss = 0.0 84 | count = 0 85 | with torch.no_grad(): 86 | for batch in trainset: 87 | input_ids = batch["input_ids"] 88 | attention_mask = batch["attention_mask"] 89 | labels = batch["labels"] 90 | outputs = self.model( 91 | input_ids, attention_mask=attention_mask, labels=labels 92 | ) 93 | loss = outputs[0] 94 | epoch_loss += loss.item() 95 | count += 1 96 | loss = epoch_loss / count 97 | logging.info("Loss after iteration: {}".format(loss)) 98 | return loss 99 | 100 | def trainstep(self, batch): 101 | """ 102 | One training step on a minibatch.
103 | 104 | Parameters 105 | ---------- 106 | batch : any 107 | Data item 108 | 109 | Returns 110 | ------- 111 | float 112 | Loss Value for the step 113 | 114 | """ 115 | self.optimizer.zero_grad() 116 | input_ids = batch["input_ids"] 117 | attention_mask = batch["attention_mask"] 118 | labels = batch["labels"] 119 | outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels) 120 | loss = outputs[0] 121 | loss.backward() 122 | self.optimizer.step() 123 | return loss.item() 124 | 125 | def train_full(self, dataset): 126 | """ 127 | One training iteration, goes through the entire dataset 128 | 129 | Parameters 130 | ---------- 131 | dataset : decentralizepy.datasets.Dataset 132 | The training dataset. Should implement get_trainset(batch_size, shuffle) 133 | 134 | """ 135 | trainset = dataset.get_trainset(self.batch_size, self.shuffle) 136 | for epoch in range(self.rounds): 137 | epoch_loss = 0.0 138 | count = 0 139 | for batch in trainset: 140 | logging.debug( 141 | "Starting minibatch {} with num_samples: {}".format( 142 | count, len(batch["input_ids"]) 143 | ) 144 | ) 145 | epoch_loss += self.trainstep(batch) 146 | count += 1 147 | logging.debug("Epoch: {} loss: {}".format(epoch, epoch_loss / count)) 148 | 149 | def train(self, dataset): 150 | """ 151 | One training iteration 152 | 153 | Parameters 154 | ---------- 155 | dataset : decentralizepy.datasets.Dataset 156 | The training dataset. Should implement get_trainset(batch_size, shuffle) 157 | 158 | """ 159 | self.model.train() 160 | 161 | if self.full_epochs: 162 | self.train_full(dataset) 163 | else: 164 | iter_loss = 0.0 165 | count = 0 166 | trainset = dataset.get_trainset(self.batch_size, self.shuffle) 167 | while count < self.rounds: 168 | for data in trainset: 169 | iter_loss += self.trainstep(data) 170 | count += 1 171 | logging.debug("Round: {} loss: {}".format(count, iter_loss / count)) 172 | if count >= self.rounds: 173 | break 174 | -------------------------------------------------------------------------------- /eval/run_grid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Documentation 3 | # This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory. 4 | # The second one (python_bin) is the path to the python bin folder. 5 | # The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory. 6 | # 7 | # The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder 8 | # called configs which contains the file 'ip_addr_6Machines.json' 9 | # The python bin folder needs to include all the dependencies of this project including crudini. 10 | # The results will be stored in $nfs_home/$logs_subfolder 11 | # Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment 12 | # starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc. 13 | # The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment 14 | # was run. 15 | # Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba 16 | # 17 | # Additional requirements: 18 | # Each node needs a folder called 'tmp' in the user's home directory 19 | # 20 | # Note: 21 | # - The script does not change the optimizer. All configs are written to use SGD.
22 | # - The script will set '--test_after' and '--train_evaluate_after' such that it happens at the end of a global epoch. 23 | # - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only 24 | # relevant for Adam and other optimizers with internal state) 25 | # 26 | # Adapting the script to other datasets: 27 | # Change the variable 'dataset_size' to reflect the dataset's size. 28 | # 29 | # Known issues: 30 | # - If the script is started at the very end of a minute then there is a chance that two folders are created as not all 31 | # machines may start running the script at the exact same moment. 32 | 33 | nfs_home=$1 34 | python_bin=$2 35 | logs_subfolder=$3 36 | decpy_path=$nfs_home/decentralizepy/eval 37 | cd $decpy_path 38 | 39 | env_python=$python_bin/python3 40 | graph=96_regular.edges 41 | config_file=~/tmp/config.ini 42 | procs_per_machine=16 43 | machines=6 44 | global_epochs=25 45 | eval_file=testing.py 46 | log_level=INFO 47 | 48 | ip_machines=$nfs_home/configs/ip_addr_6Machines.json 49 | 50 | m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` 51 | export PYTHONFAULTHANDLER=1 52 | 53 | # Base configs for which the grid search is done 54 | tests=("step_configs/config_celeba_sharing.ini") 55 | # Learning rates to test 56 | lrs=( "0.001" "0.0001" "0.0001") 57 | # Batch sizes to test 58 | batchsize=("8" "16") 59 | # The number of communication rounds per global epoch to test 60 | comm_rounds_per_global_epoch=("1" "5" "10") 61 | procs=`expr $procs_per_machine \* $machines` 62 | echo procs: $procs 63 | dataset_size=63741 64 | # Calculating the number of samples that each user/proc will have on average 65 | samples_per_user=`expr $dataset_size / $procs` 66 | echo samples per user: $samples_per_user 67 | 68 | for b in "${batchsize[@]}" 69 | do 70 | echo batchsize: $b 71 | for r in "${comm_rounds_per_global_epoch[@]}" 72 | do 73 | echo communication rounds per global epoch: $r 74 | # calculating how many batches there are in a global epoch for each user/proc 75 | batches_per_epoch=$(($samples_per_user / $b)) 76 | echo batches per global epoch: $batches_per_epoch 77 | # the number of iterations in 25 global epochs 78 | iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)") 79 | echo iterations: $iterations 80 | # calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down) 81 | batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)") 82 | # since the batches per communication round were rounded down we need to change the number of iterations to reflect that 83 | new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)") 84 | echo batches per communication round: $batches_per_comm_round 85 | echo corrected iterations: $new_iterations 86 | test_after=$(($new_iterations / $global_epochs)) 87 | echo test after: $test_after 88 | for lr in "${lrs[@]}" 89 | do 90 | for i in "${tests[@]}" 91 | do 92 | echo $i 93 | IFS='_' read -ra NAMES <<< $i 94 | IFS='.'
read -ra NAME <<< ${NAMES[-1]} 95 | log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m 96 | echo results are stored in: $log_dir 97 | mkdir -p $log_dir 98 | cp $i $config_file 99 | # changing the config files to reflect the values of the current grid search state 100 | $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines 101 | $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr 102 | $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round 103 | $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b 104 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level 105 | echo $i is done 106 | sleep 1 107 | echo end of sleep 108 | done 109 | done 110 | done 111 | done 112 | # 113 | 114 | -------------------------------------------------------------------------------- /src/decentralizepy/training/Training.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from decentralizepy import utils 6 | 7 | 8 | class Training: 9 | """ 10 | This class implements the training module for a single node. 11 | 12 | """ 13 | 14 | def __init__( 15 | self, 16 | rank, 17 | machine_id, 18 | mapping, 19 | model, 20 | optimizer, 21 | loss, 22 | log_dir, 23 | rounds="", 24 | full_epochs="", 25 | batch_size="", 26 | shuffle="", 27 | ): 28 | """ 29 | Constructor 30 | 31 | Parameters 32 | ---------- 33 | rank : int 34 | Rank of process local to the machine 35 | machine_id : int 36 | Machine ID on which the process is running 37 | mapping : decentralizepy.mappings 38 | The object containing the mapping rank <--> uid 39 | model : torch.nn.Module 40 | Neural Network for training 41 | optimizer : torch.optim 42 | Optimizer to learn parameters 43 | loss : function 44 | Loss function 45 | log_dir : str 46 | Directory to log the model change. 47 | rounds : int, optional 48 | Number of steps/epochs per training call 49 | full_epochs : bool, optional 50 | True if 1 round = 1 epoch. False if 1 round = 1 minibatch 51 | batch_size : int, optional 52 | Number of items to learn over, in one batch 53 | shuffle : bool 54 | True if the dataset should be shuffled before training. 55 | 56 | """ 57 | self.model = model 58 | self.optimizer = optimizer 59 | self.loss = loss 60 | self.log_dir = log_dir 61 | self.rank = rank 62 | self.machine_id = machine_id 63 | self.mapping = mapping 64 | self.rounds = utils.conditional_value(rounds, "", int(1)) 65 | self.full_epochs = utils.conditional_value(full_epochs, "", False) 66 | self.batch_size = utils.conditional_value(batch_size, "", int(1)) 67 | self.shuffle = utils.conditional_value(shuffle, "", False) 68 | 69 | def reset_optimizer(self, optimizer): 70 | """ 71 | Replace the current optimizer with a new one 72 | 73 | Parameters 74 | ---------- 75 | optimizer : torch.optim 76 | A new optimizer 77 | 78 | """ 79 | self.optimizer = optimizer 80 | 81 | def eval_loss(self, dataset): 82 | """ 83 | Evaluate the loss on the training set 84 | 85 | Parameters 86 | ---------- 87 | dataset : decentralizepy.datasets.Dataset 88 | The training dataset.
Should implement get_trainset(batch_size, shuffle) 89 | 90 | """ 91 | trainset = dataset.get_trainset(self.batch_size, self.shuffle) 92 | epoch_loss = 0.0 93 | count = 0 94 | with torch.no_grad(): 95 | for data, target in trainset: 96 | output = self.model(data) 97 | loss_val = self.loss(output, target) 98 | epoch_loss += loss_val.item() 99 | count += 1 100 | loss = epoch_loss / count 101 | logging.info("Loss after iteration: {}".format(loss)) 102 | return loss 103 | 104 | def trainstep(self, data, target): 105 | """ 106 | One training step on a minibatch. 107 | 108 | Parameters 109 | ---------- 110 | data : any 111 | Data item 112 | target : any 113 | Label 114 | 115 | Returns 116 | ------- 117 | float 118 | Loss Value for the step 119 | 120 | """ 121 | self.model.zero_grad() 122 | output = self.model(data) 123 | loss_val = self.loss(output, target) 124 | loss_val.backward() 125 | self.optimizer.step() 126 | return loss_val.item() 127 | 128 | def train_full(self, dataset): 129 | """ 130 | One training iteration, goes through the entire dataset 131 | 132 | Parameters 133 | ---------- 134 | dataset : decentralizepy.datasets.Dataset 135 | The training dataset. 136 | 137 | """ 138 | for epoch in range(self.rounds): 139 | trainset = dataset.get_trainset(self.batch_size, self.shuffle) 140 | epoch_loss = 0.0 141 | count = 0 142 | for data, target in trainset: 143 | logging.debug( 144 | "Starting minibatch {} with num_samples: {}".format( 145 | count, len(data) 146 | ) 147 | ) 148 | logging.debug("Classes: {}".format(target)) 149 | epoch_loss += self.trainstep(data, target) 150 | count += 1 151 | logging.debug("Epoch: {} loss: {}".format(epoch, epoch_loss / count)) 152 | 153 | def train(self, dataset): 154 | """ 155 | One training iteration 156 | 157 | Parameters 158 | ---------- 159 | dataset : decentralizepy.datasets.Dataset 160 | The training dataset.
Should implement get_trainset(batch_size, shuffle) 161 | 162 | """ 163 | self.model.train() 164 | 165 | if self.full_epochs: 166 | self.train_full(dataset) 167 | else: 168 | iter_loss = 0.0 169 | count = 0 170 | trainset = dataset.get_trainset(self.batch_size, self.shuffle) 171 | while count < self.rounds: 172 | for data, target in trainset: 173 | iter_loss += self.trainstep(data, target) 174 | count += 1 175 | logging.debug("Round: {} loss: {}".format(count, iter_loss / count)) 176 | if count >= self.rounds: 177 | break 178 | -------------------------------------------------------------------------------- /eval/96_nodes_smallworld.edges: -------------------------------------------------------------------------------- 1 | 96 2 | 0 1 3 | 0 66 4 | 0 8 5 | 0 43 6 | 0 19 7 | 0 58 8 | 0 95 9 | 1 0 10 | 1 2 11 | 1 35 12 | 1 4 13 | 1 80 14 | 1 50 15 | 1 90 16 | 2 56 17 | 2 3 18 | 2 35 19 | 2 1 20 | 3 2 21 | 3 4 22 | 3 5 23 | 3 72 24 | 3 15 25 | 3 86 26 | 3 55 27 | 4 1 28 | 4 3 29 | 4 36 30 | 4 37 31 | 4 38 32 | 4 5 33 | 4 76 34 | 5 3 35 | 5 4 36 | 5 6 37 | 5 49 38 | 5 53 39 | 5 92 40 | 6 67 41 | 6 36 42 | 6 5 43 | 6 7 44 | 6 78 45 | 6 86 46 | 7 64 47 | 7 6 48 | 7 8 49 | 7 41 50 | 7 47 51 | 7 17 52 | 7 87 53 | 8 0 54 | 8 7 55 | 8 9 56 | 8 56 57 | 8 26 58 | 9 8 59 | 9 10 60 | 9 75 61 | 9 77 62 | 9 15 63 | 10 32 64 | 10 36 65 | 10 9 66 | 10 11 67 | 10 12 68 | 10 81 69 | 10 82 70 | 11 32 71 | 11 34 72 | 11 10 73 | 11 12 74 | 11 59 75 | 11 92 76 | 11 61 77 | 12 13 78 | 12 10 79 | 12 11 80 | 12 29 81 | 13 18 82 | 13 12 83 | 13 14 84 | 14 73 85 | 14 91 86 | 14 13 87 | 14 15 88 | 15 3 89 | 15 9 90 | 15 75 91 | 15 14 92 | 15 47 93 | 15 16 94 | 15 27 95 | 15 31 96 | 16 17 97 | 16 66 98 | 16 46 99 | 16 15 100 | 17 16 101 | 17 18 102 | 17 20 103 | 17 7 104 | 18 32 105 | 18 13 106 | 18 79 107 | 18 17 108 | 18 19 109 | 18 93 110 | 19 0 111 | 19 18 112 | 19 20 113 | 19 86 114 | 20 46 115 | 20 80 116 | 20 17 117 | 20 19 118 | 20 21 119 | 20 88 120 | 20 90 121 | 21 20 122 | 21 69 123 | 21 22 124 | 21 23 125 | 22 35 126 | 22 69 127 | 22 79 128 | 22 21 129 | 22 23 130 | 22 58 131 | 23 38 132 | 23 77 133 | 23 21 134 | 23 22 135 | 23 24 136 | 23 89 137 | 23 58 138 | 24 25 139 | 24 58 140 | 24 23 141 | 24 79 142 | 25 36 143 | 25 69 144 | 25 41 145 | 25 42 146 | 25 24 147 | 25 26 148 | 26 8 149 | 26 25 150 | 26 27 151 | 26 87 152 | 27 34 153 | 27 26 154 | 27 28 155 | 27 15 156 | 28 27 157 | 28 46 158 | 28 82 159 | 28 91 160 | 28 29 161 | 28 95 162 | 29 12 163 | 29 28 164 | 29 53 165 | 29 56 166 | 29 60 167 | 29 30 168 | 30 35 169 | 30 45 170 | 30 92 171 | 30 29 172 | 30 31 173 | 31 64 174 | 31 33 175 | 31 32 176 | 31 76 177 | 31 78 178 | 31 15 179 | 31 50 180 | 31 30 181 | 32 33 182 | 32 37 183 | 32 10 184 | 32 11 185 | 32 42 186 | 32 18 187 | 32 31 188 | 33 32 189 | 33 34 190 | 33 31 191 | 34 33 192 | 34 35 193 | 34 75 194 | 34 11 195 | 34 55 196 | 34 27 197 | 35 1 198 | 35 2 199 | 35 34 200 | 35 36 201 | 35 53 202 | 35 22 203 | 35 56 204 | 35 30 205 | 36 89 206 | 36 35 207 | 36 4 208 | 36 37 209 | 36 6 210 | 36 72 211 | 36 10 212 | 36 75 213 | 36 85 214 | 36 25 215 | 37 32 216 | 37 4 217 | 37 36 218 | 37 38 219 | 37 51 220 | 38 4 221 | 38 37 222 | 38 39 223 | 38 43 224 | 38 23 225 | 39 38 226 | 39 40 227 | 39 42 228 | 39 75 229 | 39 94 230 | 40 70 231 | 40 39 232 | 40 41 233 | 40 48 234 | 40 49 235 | 40 54 236 | 40 95 237 | 41 68 238 | 41 7 239 | 41 40 240 | 41 42 241 | 41 43 242 | 41 25 243 | 41 91 244 | 42 32 245 | 42 70 246 | 42 39 247 | 42 41 248 | 42 43 249 | 42 56 250 | 42 25 251 | 42 60 252 | 43 0 253 | 43 38 
254 | 43 41 255 | 43 42 256 | 43 44 257 | 44 64 258 | 44 71 259 | 44 43 260 | 44 45 261 | 44 58 262 | 45 46 263 | 45 44 264 | 45 30 265 | 46 45 266 | 46 47 267 | 46 16 268 | 46 48 269 | 46 20 270 | 46 28 271 | 47 48 272 | 47 15 273 | 47 46 274 | 47 7 275 | 48 40 276 | 48 46 277 | 48 47 278 | 48 49 279 | 48 89 280 | 48 62 281 | 48 63 282 | 49 5 283 | 49 71 284 | 49 40 285 | 49 78 286 | 49 48 287 | 49 50 288 | 50 1 289 | 50 51 290 | 50 49 291 | 50 31 292 | 51 67 293 | 51 37 294 | 51 73 295 | 51 50 296 | 51 52 297 | 52 53 298 | 52 51 299 | 52 59 300 | 52 69 301 | 53 35 302 | 53 5 303 | 53 52 304 | 53 54 305 | 53 55 306 | 53 89 307 | 53 29 308 | 53 94 309 | 54 40 310 | 54 82 311 | 54 84 312 | 54 53 313 | 54 55 314 | 55 34 315 | 55 3 316 | 55 53 317 | 55 54 318 | 55 56 319 | 56 2 320 | 56 35 321 | 56 8 322 | 56 42 323 | 56 55 324 | 56 57 325 | 56 29 326 | 57 75 327 | 57 56 328 | 57 58 329 | 57 91 330 | 58 0 331 | 58 44 332 | 58 22 333 | 58 23 334 | 58 24 335 | 58 57 336 | 58 59 337 | 59 58 338 | 59 11 339 | 59 52 340 | 59 60 341 | 60 59 342 | 60 42 343 | 60 61 344 | 60 29 345 | 61 70 346 | 61 11 347 | 61 60 348 | 61 62 349 | 62 79 350 | 62 48 351 | 62 81 352 | 62 86 353 | 62 90 354 | 62 61 355 | 62 63 356 | 63 48 357 | 63 88 358 | 63 64 359 | 63 62 360 | 64 65 361 | 64 7 362 | 64 44 363 | 64 63 364 | 64 31 365 | 65 72 366 | 65 66 367 | 65 64 368 | 66 0 369 | 66 16 370 | 66 67 371 | 66 65 372 | 67 66 373 | 67 68 374 | 67 6 375 | 67 76 376 | 67 51 377 | 68 41 378 | 68 67 379 | 68 69 380 | 68 86 381 | 69 68 382 | 69 70 383 | 69 52 384 | 69 21 385 | 69 22 386 | 69 25 387 | 69 92 388 | 70 69 389 | 70 71 390 | 70 40 391 | 70 42 392 | 70 75 393 | 70 61 394 | 71 70 395 | 71 72 396 | 71 44 397 | 71 49 398 | 71 89 399 | 71 94 400 | 72 65 401 | 72 3 402 | 72 36 403 | 72 71 404 | 72 73 405 | 73 72 406 | 73 74 407 | 73 51 408 | 73 14 409 | 74 73 410 | 74 75 411 | 74 86 412 | 75 34 413 | 75 36 414 | 75 70 415 | 75 39 416 | 75 9 417 | 75 74 418 | 75 76 419 | 75 15 420 | 75 57 421 | 76 67 422 | 76 4 423 | 76 75 424 | 76 77 425 | 76 92 426 | 76 31 427 | 77 9 428 | 77 76 429 | 77 78 430 | 77 23 431 | 78 6 432 | 78 77 433 | 78 79 434 | 78 49 435 | 78 31 436 | 79 78 437 | 79 80 438 | 79 18 439 | 79 22 440 | 79 24 441 | 79 62 442 | 80 1 443 | 80 79 444 | 80 81 445 | 80 82 446 | 80 20 447 | 80 87 448 | 81 80 449 | 81 10 450 | 81 82 451 | 81 62 452 | 82 10 453 | 82 80 454 | 82 81 455 | 82 83 456 | 82 84 457 | 82 54 458 | 82 28 459 | 82 93 460 | 83 82 461 | 83 84 462 | 83 93 463 | 84 82 464 | 84 83 465 | 84 85 466 | 84 54 467 | 84 88 468 | 84 90 469 | 85 36 470 | 85 86 471 | 85 84 472 | 86 3 473 | 86 68 474 | 86 6 475 | 86 74 476 | 86 19 477 | 86 85 478 | 86 87 479 | 86 93 480 | 86 62 481 | 87 7 482 | 87 80 483 | 87 86 484 | 87 88 485 | 87 26 486 | 88 20 487 | 88 84 488 | 88 87 489 | 88 89 490 | 88 63 491 | 89 36 492 | 89 71 493 | 89 48 494 | 89 53 495 | 89 23 496 | 89 88 497 | 89 90 498 | 90 1 499 | 90 20 500 | 90 84 501 | 90 89 502 | 90 91 503 | 90 62 504 | 91 41 505 | 91 14 506 | 91 92 507 | 91 57 508 | 91 90 509 | 91 28 510 | 92 69 511 | 92 5 512 | 92 11 513 | 92 76 514 | 92 91 515 | 92 93 516 | 92 30 517 | 93 18 518 | 93 83 519 | 93 82 520 | 93 86 521 | 93 92 522 | 93 94 523 | 94 71 524 | 94 39 525 | 94 53 526 | 94 93 527 | 94 95 528 | 95 40 529 | 95 0 530 | 95 28 531 | 95 94 532 | -------------------------------------------------------------------------------- /eval/run_xtimes_cifar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Documentation 3 | # This 
bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory. 4 | # The second one (python_bin) is the path to the python bin folder. 5 | # The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory. 6 | # 7 | # The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder 8 | # called configs which contains the file 'ip_addr_6Machines.json' 9 | # The python bin folder needs to include all the dependencies of this project including crudini. 10 | # The results will be stored in $nfs_home/$logs_subfolder 11 | # Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment 12 | # starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc. 13 | # The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment 14 | # was run. 15 | # Example: ./run_xtimes_cifar.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba 16 | # 17 | # Additional requirements: 18 | # Each node needs a folder called 'tmp' in the user's home directory 19 | # 20 | # Note: 21 | # - The script does not change the optimizer. All configs are written to use SGD. 22 | # - The script will set '--test_after' and '--train_evaluate_after' such that it happens at the end of a global epoch. 23 | # - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only 24 | # relevant for Adam and other optimizers with internal state) 25 | # 26 | # Adapting the script to other datasets: 27 | # Change the variable 'dataset_size' to reflect the dataset's size. 28 | # 29 | # Known issues: 30 | # - If the script is started at the very end of a minute then there is a chance that two folders are created as not all 31 | # machines may start running the script at the exact same moment.
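# For concreteness, here is how the iteration arithmetic below works out with the values
# set in this script (floor/integer division throughout): procs = 16 * 6 = 96, so
# samples_per_user = 50000 / 96 = 520 and batches_per_epoch = 520 / 8 = 65. Since
# comm_rounds_per_global_epoch = 20 < 65, iterations = 100 * 20 = 2000 and
# batches_per_comm_round = floor(65 / 20) = 3. The correction step then yields
# new_iterations = floor(((65 / 20) / 3) * 2000) = 2166 (2166 rounds * 3 batches
# ~= 65 batches * 100 epochs of data), and test_after = 2166 / 100 = 21.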
32 | 33 | nfs_home=$1 34 | python_bin=$2 35 | logs_subfolder=$3 36 | decpy_path=$nfs_home/decentralizepy/eval 37 | cd $decpy_path 38 | 39 | env_python=$python_bin/python3 40 | graph=96_regular.edges 41 | config_file=~/tmp/config.ini 42 | procs_per_machine=16 43 | machines=6 44 | global_epochs=100 45 | eval_file=testingFederated.py 46 | log_level=INFO 47 | working_rate=0.1 48 | 49 | ip_machines=$nfs_home/configs/ip_addr_6Machines.json 50 | 51 | m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` 52 | export PYTHONFAULTHANDLER=1 53 | 54 | # Base configs for which the experiments are run 55 | tests=("step_configs/config_cifar_sharing.ini") 56 | # Learning rate 57 | lr="0.01" 58 | # Batch size 59 | batchsize="8" 60 | # The number of communication rounds per global epoch 61 | comm_rounds_per_global_epoch="20" 62 | procs=`expr $procs_per_machine \* $machines` 63 | echo procs: $procs 64 | dataset_size=50000 65 | # Calculating the number of samples that each user/proc will have on average 66 | samples_per_user=`expr $dataset_size / $procs` 67 | echo samples per user: $samples_per_user 68 | 69 | # random_seeds for which to rerun the experiments 70 | random_seeds=("90") 71 | # random_seed = 97 72 | echo batchsize: $batchsize 73 | echo communication rounds per global epoch: $comm_rounds_per_global_epoch 74 | # calculating how many batches there are in a global epoch for each user/proc 75 | batches_per_epoch=$(($samples_per_user / $batchsize)) 76 | echo batches per global epoch: $batches_per_epoch 77 | # the number of iterations in $global_epochs global epochs 78 | iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)") 79 | echo iterations: $iterations 80 | # calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down) 81 | batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)") 82 | # since the batches per communication round were rounded down we need to change the number of iterations to reflect that 83 | new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)") 84 | echo batches per communication round: $batches_per_comm_round 85 | echo corrected iterations: $new_iterations 86 | test_after=$(($new_iterations / $global_epochs)) 87 | echo test after: $test_after 88 | for i in "${tests[@]}" 89 | do 90 | for seed in "${random_seeds[@]}" 91 | do 92 | echo $i 93 | IFS='_' read -ra NAMES <<< $i 94 | IFS='.'
read -ra NAME <<< ${NAMES[-1]} 95 | log_dir_base=$nfs_home/$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M') 96 | echo results are stored in: $log_dir_base 97 | log_dir=$log_dir_base/machine$m 98 | mkdir -p $log_dir 99 | weight_store_dir=$log_dir_base/weights 100 | mkdir -p $weight_store_dir 101 | cp $i $config_file 102 | # changing the config files to reflect the values of the current grid search state 103 | $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines 104 | $python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr 105 | $python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round 106 | $python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize 107 | $python_bin/crudini --set $config_file DATASET random_seed $seed 108 | $env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -wsd $weight_store_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level -wr $working_rate 109 | echo $i is done 110 | sleep 200 111 | echo end of sleep 112 | done 113 | done 114 | # 115 | -------------------------------------------------------------------------------- /eval/96_nodes_random2.edges: -------------------------------------------------------------------------------- 1 | 96 2 | 0 1 3 | 0 36 4 | 0 13 5 | 0 46 6 | 0 28 7 | 0 95 8 | 1 0 9 | 1 33 10 | 1 2 11 | 1 36 12 | 1 4 13 | 1 43 14 | 1 14 15 | 1 21 16 | 1 91 17 | 1 95 18 | 2 1 19 | 2 3 20 | 2 5 21 | 2 9 22 | 2 23 23 | 2 89 24 | 3 2 25 | 3 4 26 | 3 13 27 | 3 18 28 | 3 90 29 | 4 1 30 | 4 34 31 | 4 3 32 | 4 5 33 | 4 73 34 | 4 10 35 | 4 88 36 | 4 95 37 | 5 2 38 | 5 66 39 | 5 4 40 | 5 6 41 | 5 74 42 | 5 54 43 | 5 90 44 | 6 5 45 | 6 7 46 | 6 74 47 | 6 16 48 | 6 49 49 | 6 80 50 | 6 31 51 | 7 6 52 | 7 8 53 | 7 80 54 | 7 53 55 | 7 21 56 | 7 92 57 | 8 64 58 | 8 68 59 | 8 7 60 | 8 41 61 | 8 9 62 | 8 11 63 | 8 45 64 | 8 54 65 | 8 88 66 | 9 32 67 | 9 2 68 | 9 35 69 | 9 8 70 | 9 10 71 | 9 76 72 | 9 17 73 | 9 85 74 | 9 55 75 | 10 34 76 | 10 4 77 | 10 38 78 | 10 9 79 | 10 11 80 | 11 8 81 | 11 42 82 | 11 10 83 | 11 76 84 | 11 12 85 | 12 73 86 | 12 11 87 | 12 13 88 | 12 56 89 | 12 58 90 | 12 88 91 | 13 0 92 | 13 3 93 | 13 74 94 | 13 12 95 | 13 14 96 | 13 80 97 | 13 25 98 | 14 1 99 | 14 42 100 | 14 13 101 | 14 15 102 | 14 63 103 | 15 39 104 | 15 14 105 | 15 47 106 | 15 16 107 | 15 25 108 | 16 34 109 | 16 36 110 | 16 6 111 | 16 15 112 | 16 17 113 | 17 9 114 | 17 45 115 | 17 79 116 | 17 16 117 | 17 18 118 | 17 24 119 | 17 26 120 | 17 59 121 | 18 3 122 | 18 17 123 | 18 19 124 | 18 84 125 | 18 91 126 | 19 39 127 | 19 41 128 | 19 48 129 | 19 18 130 | 19 20 131 | 19 91 132 | 20 90 133 | 20 19 134 | 20 21 135 | 20 22 136 | 20 26 137 | 21 32 138 | 21 1 139 | 21 7 140 | 21 74 141 | 21 20 142 | 21 22 143 | 21 90 144 | 21 95 145 | 22 74 146 | 22 50 147 | 22 20 148 | 22 21 149 | 22 23 150 | 23 2 151 | 23 66 152 | 23 40 153 | 23 46 154 | 23 48 155 | 23 22 156 | 23 24 157 | 23 95 158 | 24 17 159 | 24 27 160 | 24 25 161 | 24 23 162 | 25 13 163 | 25 15 164 | 25 88 165 | 25 24 166 | 25 26 167 | 25 94 168 | 26 17 169 | 26 20 170 | 26 25 171 | 26 27 172 | 26 61 173 | 27 34 174 | 27 69 175 | 27 45 176 | 27 28 177 | 27 24 178 | 27 26 179 | 27 60 180 | 28 0 181 | 28 64 182 | 28 85 183 | 28 57 184 | 28 27 185 | 28 29 186 | 29 65 187 | 29 78 188 | 29 50 189 | 29 28 190 | 29 61 191 | 29 30 192 | 30 38 193 | 30 43 194 | 30 93 195 | 30 29 196 | 30 31 197 | 31 32 198 | 31 67 199 | 31 6 200 | 
31 48 201 | 31 93 202 | 31 30 203 | 32 33 204 | 32 35 205 | 32 37 206 | 32 9 207 | 32 43 208 | 32 21 209 | 32 91 210 | 32 92 211 | 32 93 212 | 32 31 213 | 33 32 214 | 33 1 215 | 33 34 216 | 33 71 217 | 34 33 218 | 34 35 219 | 34 4 220 | 34 10 221 | 34 16 222 | 34 81 223 | 34 27 224 | 35 32 225 | 35 34 226 | 35 36 227 | 35 9 228 | 35 51 229 | 36 0 230 | 36 1 231 | 36 35 232 | 36 37 233 | 36 16 234 | 36 56 235 | 37 32 236 | 37 60 237 | 37 38 238 | 37 36 239 | 38 37 240 | 38 39 241 | 38 10 242 | 38 45 243 | 38 30 244 | 39 40 245 | 39 19 246 | 39 38 247 | 39 15 248 | 40 39 249 | 40 41 250 | 40 48 251 | 40 23 252 | 40 91 253 | 40 63 254 | 41 8 255 | 41 40 256 | 41 42 257 | 41 19 258 | 41 85 259 | 42 41 260 | 42 43 261 | 42 11 262 | 42 14 263 | 42 53 264 | 43 32 265 | 43 1 266 | 43 42 267 | 43 44 268 | 43 45 269 | 43 30 270 | 44 43 271 | 44 67 272 | 44 45 273 | 44 46 274 | 45 38 275 | 45 8 276 | 45 43 277 | 45 44 278 | 45 46 279 | 45 17 280 | 45 87 281 | 45 27 282 | 46 0 283 | 46 44 284 | 46 77 285 | 46 45 286 | 46 47 287 | 46 23 288 | 46 61 289 | 46 95 290 | 47 48 291 | 47 65 292 | 47 46 293 | 47 15 294 | 48 40 295 | 48 47 296 | 48 49 297 | 48 19 298 | 48 86 299 | 48 23 300 | 48 60 301 | 48 31 302 | 49 6 303 | 49 79 304 | 49 48 305 | 49 50 306 | 49 89 307 | 50 81 308 | 50 49 309 | 50 51 310 | 50 22 311 | 50 29 312 | 51 35 313 | 51 50 314 | 51 52 315 | 51 86 316 | 51 90 317 | 51 94 318 | 52 66 319 | 52 51 320 | 52 53 321 | 53 7 322 | 53 42 323 | 53 52 324 | 53 54 325 | 53 56 326 | 53 90 327 | 54 8 328 | 54 53 329 | 54 5 330 | 54 55 331 | 55 65 332 | 55 9 333 | 55 56 334 | 55 54 335 | 56 36 336 | 56 74 337 | 56 12 338 | 56 53 339 | 56 55 340 | 56 57 341 | 57 56 342 | 57 58 343 | 57 28 344 | 58 57 345 | 58 59 346 | 58 12 347 | 59 70 348 | 59 75 349 | 59 17 350 | 59 58 351 | 59 60 352 | 60 37 353 | 60 59 354 | 60 48 355 | 60 27 356 | 60 61 357 | 61 46 358 | 61 29 359 | 61 26 360 | 61 60 361 | 61 93 362 | 61 62 363 | 62 68 364 | 62 93 365 | 62 85 366 | 62 61 367 | 62 63 368 | 63 64 369 | 63 40 370 | 63 14 371 | 63 93 372 | 63 62 373 | 64 8 374 | 64 65 375 | 64 28 376 | 64 63 377 | 65 64 378 | 65 66 379 | 65 69 380 | 65 74 381 | 65 47 382 | 65 55 383 | 65 29 384 | 66 65 385 | 66 67 386 | 66 69 387 | 66 5 388 | 66 52 389 | 66 23 390 | 67 66 391 | 67 68 392 | 67 44 393 | 67 86 394 | 67 31 395 | 68 8 396 | 68 67 397 | 68 69 398 | 68 62 399 | 69 65 400 | 69 66 401 | 69 68 402 | 69 70 403 | 69 77 404 | 69 83 405 | 69 27 406 | 70 59 407 | 70 69 408 | 70 78 409 | 70 71 410 | 71 33 411 | 71 70 412 | 71 72 413 | 71 87 414 | 71 90 415 | 72 73 416 | 72 90 417 | 72 71 418 | 73 72 419 | 73 74 420 | 73 4 421 | 73 12 422 | 74 65 423 | 74 5 424 | 74 6 425 | 74 73 426 | 74 75 427 | 74 13 428 | 74 21 429 | 74 22 430 | 74 56 431 | 75 74 432 | 75 59 433 | 75 76 434 | 76 9 435 | 76 75 436 | 76 11 437 | 76 77 438 | 77 69 439 | 77 76 440 | 77 78 441 | 77 46 442 | 77 93 443 | 78 70 444 | 78 77 445 | 78 79 446 | 78 87 447 | 78 29 448 | 79 80 449 | 79 17 450 | 79 78 451 | 79 49 452 | 80 6 453 | 80 7 454 | 80 13 455 | 80 79 456 | 80 81 457 | 80 85 458 | 81 34 459 | 81 80 460 | 81 50 461 | 81 82 462 | 81 88 463 | 82 81 464 | 82 83 465 | 83 82 466 | 83 84 467 | 83 69 468 | 84 18 469 | 84 83 470 | 84 85 471 | 84 95 472 | 85 9 473 | 85 41 474 | 85 80 475 | 85 84 476 | 85 86 477 | 85 88 478 | 85 28 479 | 85 62 480 | 86 67 481 | 86 48 482 | 86 51 483 | 86 85 484 | 86 87 485 | 86 88 486 | 87 71 487 | 87 45 488 | 87 78 489 | 87 86 490 | 87 88 491 | 88 89 492 | 88 4 493 | 88 8 494 | 88 12 495 | 88 81 496 | 88 85 497 | 88 86 498 | 88 87 
499 | 88 25 500 | 89 88 501 | 89 49 502 | 89 2 503 | 89 90 504 | 90 3 505 | 90 5 506 | 90 71 507 | 90 72 508 | 90 51 509 | 90 20 510 | 90 21 511 | 90 53 512 | 90 89 513 | 90 91 514 | 91 32 515 | 91 1 516 | 91 40 517 | 91 18 518 | 91 19 519 | 91 90 520 | 91 92 521 | 92 32 522 | 92 91 523 | 92 93 524 | 92 7 525 | 93 32 526 | 93 77 527 | 93 63 528 | 93 30 529 | 93 94 530 | 93 92 531 | 93 61 532 | 93 62 533 | 93 31 534 | 94 25 535 | 94 51 536 | 94 93 537 | 94 95 538 | 95 0 539 | 95 1 540 | 95 4 541 | 95 46 542 | 95 84 543 | 95 21 544 | 95 23 545 | 95 94 546 | --------------------------------------------------------------------------------
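All of the .edges files above share the format produced by Graph.write(..., type="edges") in src/decentralizepy/graphs/Graph.py: the first line holds the number of nodes, and every following line holds one directed "node neighbor" pair. The snippet below is a minimal standalone sketch of a reader for this format; it is not part of the repository, and the helper name read_edges is hypothetical.

import sys

def read_edges(path):
    """Parse an .edges file into (n_procs, list of neighbor sets)."""
    with open(path) as f:
        n_procs = int(f.readline())  # first line: number of nodes
        adj_list = [set() for _ in range(n_procs)]
        for line in f:
            if not line.strip():
                continue  # tolerate trailing blank lines
            node, neighbor = map(int, line.split())  # one "node neighbor" pair per line
            adj_list[node].add(neighbor)
    return n_procs, adj_list

if __name__ == "__main__":
    # e.g. python read_edges.py eval/80_nodes.edges
    n, adj = read_edges(sys.argv[1])
    print(n, sorted(adj[0]))  # for eval/80_nodes.edges: 80 [1, 3, 21, 28, 47, 53, 60, 79]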