├── .gitignore ├── README.md ├── experiments ├── convex │ ├── linear_regression.py │ └── logistic_regression.py └── papers │ └── destress.py ├── nda ├── __init__.py ├── datasets │ ├── __init__.py │ ├── dataset.py │ ├── gisette.py │ ├── libsvm.py │ └── mnist.py ├── experiment_utils │ ├── __init__.py │ └── utils.py ├── log.py ├── optimizers │ ├── __init__.py │ ├── centralized │ │ ├── GD.py │ │ ├── NAG.py │ │ ├── SARAH.py │ │ ├── SGD.py │ │ ├── SVRG.py │ │ └── __init__.py │ ├── centralized_distributed │ │ ├── ADMM.py │ │ ├── DANE.py │ │ └── __init__.py │ ├── compressor.py │ ├── decentralized_distributed │ │ ├── CHOCO_SGD.py │ │ ├── D2.py │ │ ├── DGD.py │ │ ├── DGD_tracking.py │ │ ├── DSGD.py │ │ ├── EXTRA.py │ │ ├── GT_SARAH.py │ │ ├── NIDS.py │ │ └── __init__.py │ ├── network │ │ ├── Destress.py │ │ ├── NetworkDANE.py │ │ ├── NetworkDANE_quadratic.py │ │ ├── NetworkSARAH.py │ │ ├── NetworkSVRG.py │ │ ├── __init__.py │ │ └── network_optimizer.py │ ├── optimizer.py │ └── utils.py └── problems │ ├── __init__.py │ ├── linear_regression.py │ ├── logistic_regression.py │ ├── neural_network.py │ └── problem.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | figs 3 | data 4 | build 5 | problems/MNIST 6 | *npz 7 | *pt 8 | *egg-info 9 | *data 10 | .DS_Store 11 | *egg-info 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network-Distributed Algorithm Experiments 2 | 3 | This repository contains a set of optimization algorithms and objective functions, and all code needed to reproduce experiments in: 4 | 5 | 1. "DESTRESS: Computation-Optimal and Communication-Efficient Decentralized Nonconvex Finite-Sum Optimization" [[PDF](https://arxiv.org/abs/2110.01165)]. (code is in this file [[link](https://github.com/liboyue/Network-Distributed-Algorithm/blob/master/experiments/papers/destress.py)]) 6 | 7 | 2. "Communication-Efficient Distributed Optimization in Networks with Gradient Tracking and Variance Reduction" [[PDF](https://arxiv.org/abs/1909.05844v2)]. (code is in the previous version of this repo [[link](https://github.com/liboyue/Network-Distributed-Algorithm/tree/08abe14f2a2d5929fc401ff99961ca3bae40ff60)]) 8 | 9 | Due to the random data generation procedure, 10 | results may be slightly different from those appeared in papers, 11 | but conclusions remain the same. 12 | 13 | If you find this code useful, please cite our papers: 14 | 15 | ``` 16 | @article{li2022destress, 17 | title={DESTRESS: Computation-Optimal and Communication-Efficient Decentralized Nonconvex Finite-Sum Optimization}, 18 | author={Li, Boyue and Li, Zhize and Chi, Yuejie}, 19 | journal={SIAM Journal on Mathematics of Data Science}, 20 | volume = {4}, 21 | number = {3}, 22 | pages = {1031-1051}, 23 | year={2022} 24 | } 25 | ``` 26 | 27 | ``` 28 | @article{li2020communication, 29 | title={Communication-Efficient Distributed Optimization in Networks with Gradient Tracking and Variance Reduction}, 30 | author={Li, Boyue and Cen, Shicong and Chen, Yuxin and Chi, Yuejie}, 31 | journal={Journal of Machine Learning Research}, 32 | volume={21}, 33 | pages={1--51}, 34 | year={2020} 35 | } 36 | ``` 37 | 38 | 39 | ## 1. 
Features 40 | - Easy to use: come with several popular objective functions with optional regularization and compression, essential optimization algorithms, utilities to run experiments and plot results 41 | - Extendability: easy to implement your own objective functions / optimization algorithms / datasets 42 | - Correctness: numerically verified gradient implementation 43 | - Performance: can run on both CPU and GPU 44 | - Data preprocessing: shuffling, normalizing, splitting 45 | 46 | 47 | ## 2. Installation and usage 48 | ### 2.1 Installation 49 | 50 | `pip install git+https://github.com/liboyue/Network-Distributed-Algorithm.git` 51 | 52 | If you have Nvidia GPUs, please also install `cupy`. 53 | 54 | ### 2.2 Implementing your own objective function 55 | ### 2.3 Implementing your own optimizer 56 | 57 | 58 | ## 3. Objective functions 59 | The gradient implementations of all objective functions are checked numerically. 60 | 61 | ### 3.1 Linear regression 62 | Linear regression with random generated data. 63 | The objective function is 64 | 65 | 66 | ### 3.2 Logistic regression 67 | Logistic regression with l-2 or nonconvex regularization with random generated data or the Gisette dataset or datasets from `libsvmtools`. 68 | The objective function is 69 | 70 | 71 | ### 3.3 One-hidden-layer fully-connected neural netowrk 72 | One-hidden-layer fully-connected neural network with softmax loss on the MNIST dataset. 73 | 74 | 75 | ## 4. Datasets 76 | - MNIST 77 | - Gisette 78 | - LibSVM data 79 | - Random generated data 80 | 81 | 82 | ## 5. Optimization algorithms 83 | 84 | ### 5.1 Centralized optimization algorithms 85 | - Gradient descent 86 | - Stochastic gradient descent 87 | - Nesterov's accelerated gradient descent 88 | - SVRG 89 | - SARAH 90 | 91 | ### 5.2 Distributed optimization algorithms (i.e. with parameter server) 92 | - ADMM 93 | - DANE 94 | 95 | ### 5.3 Decentralized optimization algorithms 96 | - Decentralized gradient descent 97 | - Decentralized stochastic gradient descent 98 | - Decentralized gradient descent with gradient tracking 99 | - EXTRA 100 | - NIDS 101 | - D2 102 | - CHOCO-SGD 103 | - Network-DANE/SARAH/SVRG 104 | - GT-SARAH 105 | - DESTRESS 106 | 107 | 108 | ## 6. 
Change log 109 | 110 | - Mar-03-2022: Add GPU support, refactor code 111 | -------------------------------------------------------------------------------- /experiments/convex/linear_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from nda import log 7 | from nda.problems import LinearRegression 8 | from nda.optimizers import * 9 | from nda.optimizers.utils import generate_mixing_matrix 10 | from nda.experiment_utils import run_exp 11 | 12 | 13 | if __name__ == '__main__': 14 | 15 | n_agent = 20 16 | m = 1000 17 | dim = 40 18 | 19 | kappa = 10 20 | mu = 5e-10 21 | n_iters = 10 22 | 23 | p = LinearRegression(n_agent=n_agent, m=m, dim=dim, noise_variance=1, kappa=kappa, graph_type='er', graph_params=0.3) 24 | W, alpha = generate_mixing_matrix(p) 25 | 26 | log.info('m = %d, n = %d, alpha = %.4f' % (m, n_agent, alpha)) 27 | 28 | x_0 = np.random.rand(dim, n_agent) 29 | x_0_mean = x_0.mean(axis=1) 30 | 31 | eta_2 = 2 / (p.L + p.sigma) 32 | eta_1 = 1 / p.L 33 | 34 | n_inner_iters = 100 35 | n_sarah_iters = n_iters * 20 36 | n_dgd_iters = n_iters * 20 37 | batch_size = int(m / 100) 38 | n_dsgd_iters = int(n_iters * m / batch_size) 39 | 40 | centralized = [ 41 | GD(p, n_iters=n_iters, eta=eta_2, x_0=x_0_mean), 42 | SGD(p, n_iters=n_dsgd_iters, eta=eta_2 * 3, batch_size=batch_size, x_0=x_0_mean, diminishing_step_size=True), 43 | NAG(p, n_iters=n_iters, x_0=x_0_mean), 44 | SARAH(p, n_iters=n_sarah_iters, n_inner_iters=n_inner_iters, eta=eta_2 / 20, x_0=x_0_mean), 45 | ] 46 | 47 | distributed = [ 48 | DGD_tracking(p, n_iters=n_dgd_iters, eta=eta_2 / 20, x_0=x_0, W=W), 49 | DANE(p, n_iters=n_iters, mu=mu, x_0=x_0.mean(axis=1)), 50 | NetworkDANE(p, n_iters=n_iters, mu=mu, x_0=x_0, W=W) 51 | ] 52 | 53 | exps = centralized + distributed 54 | 55 | res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='linear_regression', n_cpu_processes=4, save=True) 56 | 57 | plt.show() 58 | -------------------------------------------------------------------------------- /experiments/convex/logistic_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from nda.problems import LogisticRegression 7 | from nda.optimizers import * 8 | from nda.optimizers.utils import generate_mixing_matrix 9 | 10 | from nda.experiment_utils import run_exp 11 | 12 | if __name__ == '__main__': 13 | n_agent = 20 14 | m = 1000 15 | dim = 40 16 | 17 | 18 | kappa = 10000 19 | mu = 5e-3 20 | 21 | kappa = 100 22 | mu = 5e-8 23 | 24 | n_iters = 10 25 | 26 | p = LogisticRegression(n_agent=n_agent, m=m, dim=dim, noise_ratio=0.05, graph_type='er', kappa=kappa, graph_params=0.3) 27 | print(p.n_edges) 28 | 29 | 30 | x_0 = np.random.rand(dim, n_agent) 31 | x_0_mean = x_0.mean(axis=1) 32 | W, alpha = generate_mixing_matrix(p) 33 | print('alpha = ' + str(alpha)) 34 | 35 | 36 | eta = 2/(p.L + p.sigma) 37 | n_inner_iters = int(m * 0.05) 38 | batch_size = int(m / 10) 39 | batch_size = 10 40 | n_dgd_iters = n_iters * 20 41 | n_svrg_iters = n_iters * 20 42 | n_dsgd_iters = int(n_iters * m / batch_size) 43 | 44 | 45 | single_machine = [ 46 | GD(p, n_iters=n_iters, eta=eta, x_0=x_0_mean), 47 | SGD(p, n_iters=n_dsgd_iters, eta=eta*3, batch_size=batch_size, x_0=x_0_mean, diminishing_step_size=True), 48 | NAG(p, n_iters=n_iters, x_0=x_0_mean), 49 | SVRG(p, 
n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, x_0=x_0_mean), 50 | SARAH(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, x_0=x_0_mean), 51 | ] 52 | 53 | 54 | distributed = [ 55 | DGD_tracking(p, n_iters=n_dgd_iters, eta=eta/10, x_0=x_0, W=W), 56 | DSGD(p, n_iters=n_dsgd_iters, eta=eta*2, batch_size=batch_size, x_0=x_0, W=W, diminishing_step_size=True), 57 | EXTRA(p, n_iters=n_dgd_iters, eta=eta/2, x_0=x_0, W=W), 58 | NIDS(p, n_iters=n_dgd_iters, eta=eta, x_0=x_0, W=W), 59 | 60 | ADMM(p, n_iters=n_iters, rho=1, x_0=x_0_mean), 61 | DANE(p, n_iters=n_iters, mu=mu, x_0=x_0_mean) 62 | ] 63 | 64 | network = [ 65 | NetworkSVRG(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, mu=mu, x_0=x_0, W=W, batch_size=batch_size), 66 | NetworkSARAH(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, mu=mu, x_0=x_0, W=W, batch_size=batch_size), 67 | NetworkDANE(p, n_iters=n_iters, mu=mu, x_0=x_0, W=W), 68 | ] 69 | 70 | exps = single_machine + distributed + network 71 | 72 | res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='logistic_regression', n_cpu_processes=4, save=True) 73 | 74 | 75 | plt.show() 76 | -------------------------------------------------------------------------------- /experiments/papers/destress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import pickle 6 | import os 7 | 8 | from nda import log 9 | from nda.problems import LogisticRegression 10 | from nda.optimizers import * 11 | from nda.optimizers.utils import generate_mixing_matrix 12 | from nda.experiment_utils import run_exp 13 | 14 | import time 15 | 16 | def plot_exp(exps, configs, filename, dim, n_agent, logx=False, logy=False): 17 | 18 | colors = ['k', 'r', 'g', 'b', 'c', 'm', 'y'] 19 | line_styles = ['-', '--', ':'] 20 | # log.info("Initial accuracy = " + str(p.accuracy(x_0))) 21 | results = [[exp.get_name()] + list(exp.get_metrics()) for exp in exps] 22 | 23 | with open(f"data/{filename}", 'wb') as f: 24 | pickle.dump(results, f) 25 | 26 | row_index = {'t': 1, 'comm_rounds': 0} 27 | column_index = {'f': 0} 28 | 29 | all_columns = {column for (_, columns, _) in results for column in columns} 30 | if 'grad_norm' in all_columns: 31 | column_index.update({'grad_norm': len(column_index)}) 32 | if 'test_accuracy' in all_columns: 33 | column_index.update({'test_accuracy': len(column_index)}) 34 | if 'var_error' in all_columns: 35 | column_index.update({'var_error': len(column_index)}) 36 | if 'f_test' in all_columns: 37 | column_index.update({'f_test': len(column_index)}) 38 | 39 | 40 | fig, axs = plt.subplots(2, len(column_index)) 41 | 42 | 43 | legends = [] 44 | for (name, columns, data), config, (line_style, color) in zip(results, configs, product(line_styles, colors)): 45 | 46 | tmp = get_bits_per_round_per_agent(config, dim) * n_agent 47 | n_skip = max(int(len(data) / 1000), 1) 48 | # log.info(name) 49 | # log.info('n_skip = %d', n_skip) 50 | # log.info('len = %d', len(data)) 51 | 52 | def _plot_iter(y, logx=False, logy=False): 53 | iter_ax = axs[row_index['t'], column_index[y]] 54 | iter_ax.plot( 55 | data[:, columns.index('t')][::n_skip], 56 | data[:, columns.index(y)][::n_skip], 57 | line_style + color 58 | ) 59 | iter_ax.set(xlabel='Iterations', ylabel=y) 60 | if logy: 61 | iter_ax.set_yscale('log') 62 | if logx: 63 | iter_ax.set_xscale('log') 64 | 65 | def _plot_comm(y, logx=False, logy=False): 66 | comm_ax = 
axs[row_index['comm_rounds'], column_index[y]] 67 | comm_ax.plot( 68 | data[:, columns.index('comm_rounds')][::n_skip] * tmp, 69 | data[:, columns.index(y)][::n_skip], 70 | line_style + color 71 | ) 72 | comm_ax.set(xlabel='Bits transferred', ylabel=y) 73 | if logy: 74 | comm_ax.set_yscale('log') 75 | if logx: 76 | comm_ax.set_xscale('log') 77 | 78 | for column in column_index.keys(): 79 | if column in columns: 80 | _plot_iter(column, logx=logx, logy=logy) 81 | if 'comm_rounds' in columns: 82 | _plot_comm(column, logx=logx, logy=logy) 83 | 84 | legends.append(name + ','.join([k + '=' + str(v) for k, v in config.items() if k in ['gamma', 'compressor_param', 'compressor_type', 'eta', 'batch_size']])) 85 | 86 | plt.legend(legends) 87 | 88 | 89 | def plot_gisette_exp(exps, topology, total_samples): 90 | # print("Initial accuracy = " + str(p.accuracy(x_0.mean(axis=1)))) 91 | results = [[exp.get_name()] + list(exp.get_metrics()) for exp in exps] 92 | with open('data/gisette_%s_res.data' % topology, 'wb') as f: 93 | pickle.dump(results, f) 94 | max_comm = min([results[i][2][-1, results[i][1].index('comm_rounds')] for i in range(len(results)) if 'comm_rounds' in results[i][1]]) 95 | min_comm = 0 96 | min_grad = p.m_total / 2 97 | max_grad = min([results[i][2][-1, results[i][1].index('n_grads')] for i in range(len(results))]) 98 | fig, axs = plt.subplots(1, 4) 99 | for (name, columns, data) in results: 100 | comm_idx = columns.index('comm_rounds') 101 | grad_idx = columns.index('n_grads') 102 | acc_idx = columns.index('test_accuracy') 103 | loss_idx = columns.index('f') 104 | if len(data) > 1000: 105 | skip = max(int(len(data) / 1000), 1) 106 | data = data[::skip] 107 | comm_mask = (data[:, comm_idx] <= max_comm) & (data[:, comm_idx] > min_comm) 108 | grad_mask = (data[:, grad_idx] <= max_grad) & (data[:, grad_idx] > min_grad) 109 | axs[0].loglog(data[:, comm_idx][comm_mask], data[:, loss_idx][comm_mask]) 110 | axs[0].set(xlabel='\#communication rounds', ylabel='Loss') 111 | axs[1].semilogx(data[:, comm_idx][comm_mask], data[:, acc_idx][comm_mask]) 112 | axs[1].set(xlabel='\#communication rounds', ylabel='Testing accuracy') 113 | axs[2].loglog(data[:, grad_idx][grad_mask] / total_samples, data[:, loss_idx][grad_mask]) 114 | axs[2].set(xlabel='\#grads/\#samples', ylabel='Loss') 115 | axs[3].semilogx(data[:, grad_idx][grad_mask] / total_samples, data[:, acc_idx][grad_mask]) 116 | axs[3].set(xlabel='\#grads/\#samples', ylabel='Testing accuracy') 117 | axs[3].legend([result[0].replace('_', '-') for result in results]) 118 | # plt.show() 119 | tikzplotlib.save("data/gisette-%s.tex" % topology, standalone=True, externalize_tables=True, override_externals=True) 120 | 121 | if __name__ == '__main__': 122 | n_agent = 20 123 | 124 | # Experiment for Gisette classification 125 | p = LogisticRegression(n_agent, graph_type='er', graph_params=0.3, dataset='gisette', alpha=0.001) 126 | dim = p.dim 127 | 128 | os.system('mkdir data figs') 129 | if os.path.exists('data/gisette_initialization.npz'): 130 | x_0 = np.load('data/gisette_initialization.npz').get('x_0') 131 | else: 132 | x_0 = np.random.rand(dim, n_agent) 133 | np.savez('data/gisette_initialization.npz', x_0=x_0) 134 | x_0_mean = x_0.mean(axis=1) 135 | 136 | extra_metrics = ['grad_norm', 'test_accuracy'] 137 | # Experiment 1: er topology 138 | W, alpha = generate_mixing_matrix(p) 139 | log.info('alpha = %.4f', alpha) 140 | 141 | exps = [ 142 | DSGD(p, n_iters=20000, eta=10, x_0=x_0, W=W, diminishing_step_size=True, extra_metrics=extra_metrics), 143 | 
Destress(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=1, K_in=2, K_out=2, x_0=x_0, W=W, extra_metrics=extra_metrics), 144 | GT_SARAH(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=0.1, x_0=x_0, W=W, extra_metrics=extra_metrics), 145 | ] 146 | 147 | begin = time.time() 148 | exps = run_exp(exps, name='gisette-er', n_process=1, plot=False, save=True) 149 | end = time.time() 150 | log.info('Total %.2fs', end - begin) 151 | 152 | plot_gisette_exp(exps, 'er', p.m_total) 153 | 154 | # Experiment 2: grid topology 155 | p.generate_graph('grid', (4, 5)) 156 | W, alpha = generate_mixing_matrix(p) 157 | log.info('alpha = %.4f', alpha) 158 | 159 | exps = [ 160 | DSGD(p, n_iters=20000, eta=1, x_0=x_0, W=W, diminishing_step_size=True, extra_metrics=extra_metrics), 161 | Destress(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=1, K_in=2, K_out=2, x_0=x_0, W=W, extra_metrics=extra_metrics), 162 | GT_SARAH(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=0.01, x_0=x_0, W=W, extra_metrics=extra_metrics), 163 | ] 164 | 165 | begin = time.time() 166 | exps = run_exp(exps, name='gisette-grid', n_process=1, plot=False, save=True) 167 | end = time.time() 168 | log.info('Total %.2fs', end - begin) 169 | 170 | plot_gisette_exp(exps, 'grid', p.m_total) 171 | 172 | 173 | # Experiment 3: path topology 174 | p.generate_graph(graph_type='path') 175 | W, alpha = generate_mixing_matrix(p) 176 | log.info('alpha = %.4f', alpha) 177 | 178 | exps = [ 179 | DSGD(p, n_iters=20000, eta=1, x_0=x_0, W=W, diminishing_step_size=True, extra_metrics=extra_metrics), 180 | Destress(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=1, K_in=8, K_out=8, x_0=x_0, W=W, extra_metrics=extra_metrics), 181 | GT_SARAH(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=0.01, x_0=x_0, W=W, extra_metrics=extra_metrics), 182 | ] 183 | 184 | 185 | begin = time.time() 186 | exps = run_exp(exps, name='gisette_path', n_process=1, plot=False, save=True) 187 | end = time.time() 188 | log.info('Total %.2fs', end - begin) 189 | 190 | plot_gisette_exp(exps, 'path', p.m_total) 191 | 192 | 193 | # Experiment for MNIST classification 194 | p = NN(n_agent, graph_type='er', graph_params=0.3) 195 | dim = p.dim 196 | 197 | if os.path.exists('data/mnist_initialization.npz'): 198 | x_0 = np.load('data/mnist_initialization.npz').get('x_0') 199 | else: 200 | x_0 = np.random.rand(dim, n_agent) 201 | np.savez('data/mnist_initialization.npz', x_0=x_0) 202 | x_0_mean = x_0.mean(axis=1) 203 | 204 | # Experiment 1: er topology 205 | W, alpha = generate_mixing_matrix(p) 206 | log.info('alpha = %.4f', alpha) 207 | 208 | exps = [ 209 | DSGD(p, n_iters=20000, eta=10, x_0=x_0, W=W, diminishing_step_size=True, extra_metrics=extra_metrics), 210 | Destress(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=1, K_in=2, K_out=2, x_0=x_0, W=W, extra_metrics=extra_metrics), 211 | GT_SARAH(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=0.1, x_0=x_0, W=W, extra_metrics=extra_metrics), 212 | ] 213 | 214 | begin = time.time() 215 | exps = run_exp(exps, name='gisette-er', n_process=1, plot=False, save=True) 216 | end = time.time() 217 | log.info('Total %.2fs', end - begin) 218 | 219 | plot_gisette_exp(exps, 'er', p.m_total) 220 | 221 | # Experiment 2: grid topology 222 | p.generate_graph('grid', (4, 5)) 223 | W, alpha = generate_mixing_matrix(p) 224 | log.info('alpha = %.4f', alpha) 225 | 226 | exps = [ 227 | DSGD(p, n_iters=20000, eta=1, x_0=x_0, W=W, diminishing_step_size=True, extra_metrics=extra_metrics), 228 | Destress(p, n_iters=40, batch_size=10, 
n_inner_iters=10, eta=1, K_in=2, K_out=2, x_0=x_0, W=W, extra_metrics=extra_metrics), 229 | GT_SARAH(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=0.01, x_0=x_0, W=W, extra_metrics=extra_metrics), 230 | ] 231 | 232 | begin = time.time() 233 | exps = run_exp(exps, name='gisette-grid', n_process=1, plot=False, save=True) 234 | end = time.time() 235 | log.info('Total %.2fs', end - begin) 236 | 237 | plot_gisette_exp(exps, 'grid', p.m_total) 238 | 239 | 240 | # Experiment 3: path topology 241 | p.generate_graph(graph_type='path') 242 | W, alpha = generate_mixing_matrix(p) 243 | log.info('alpha = %.4f', alpha) 244 | 245 | exps = [ 246 | DSGD(p, n_iters=20000, eta=1, x_0=x_0, W=W, diminishing_step_size=True, extra_metrics=extra_metrics), 247 | Destress(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=1, K_in=8, K_out=8, x_0=x_0, W=W, extra_metrics=extra_metrics), 248 | GT_SARAH(p, n_iters=40, batch_size=10, n_inner_iters=10, eta=0.01, x_0=x_0, W=W, extra_metrics=extra_metrics), 249 | ] 250 | 251 | 252 | begin = time.time() 253 | exps = run_exp(exps, name='gisette_path', n_process=1, plot=False, save=True) 254 | end = time.time() 255 | log.info('Total %.2fs', end - begin) 256 | 257 | plot_gisette_exp(exps, 'path', p.m_total) 258 | -------------------------------------------------------------------------------- /nda/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda import optimizers 5 | from nda import problems 6 | from nda import experiment_utils 7 | from nda import datasets 8 | from nda import log 9 | 10 | __all__ = ['problems', 'optimizers', 'experiment_utils', 'log', 'datasets'] 11 | -------------------------------------------------------------------------------- /nda/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.datasets.dataset import Dataset 5 | from nda.datasets.gisette import Gisette 6 | from nda.datasets.libsvm import LibSVM 7 | from nda.datasets.mnist import MNIST 8 | -------------------------------------------------------------------------------- /nda/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import os 4 | import numpy as np 5 | 6 | from nda import log 7 | 8 | 9 | class Dataset: 10 | def __init__(self, data_urls=None, name=None, normalize=False, root='~/data'): 11 | self.name = self.__class__.__name__ if name is None else name 12 | self.data_root = os.path.expanduser(root) 13 | self.data_dir = os.path.join(self.data_root, self.name) 14 | self.cache_path = os.path.join(self.data_dir, '%s.npz' % self.name) 15 | self.data_urls = data_urls 16 | self.normalize = normalize 17 | 18 | def load(self): 19 | if not os.path.exists(self.cache_path): 20 | log.info('Downloading %s dataset' % self.name) 21 | os.system('mkdir -p %s' % self.data_dir) 22 | self.download() 23 | self.load_raw() 24 | np.savez_compressed( 25 | self.cache_path, 26 | X_train=self.X_train, Y_train=self.Y_train, 27 | X_test=self.X_test, Y_test=self.Y_test 28 | ) 29 | 30 | else: 31 | log.info('Loading %s dataset from cached file' % self.name) 32 | data = np.load(self.cache_path, allow_pickle=True) 33 | self.X_train = data['X_train'] 34 | self.Y_train = data['Y_train'] 35 | self.X_test = data['X_test'] 36 | self.Y_test = data['Y_test'] 37 | 38 | if self.normalize: 39 | self.normalize_data() 40 
| 41 | return self.X_train, self.Y_train, self.X_test, self.Y_test 42 | 43 | def load_raw(self): 44 | raise NotImplementedError 45 | 46 | def normalize_data(self): 47 | mean = self.X_train.mean() 48 | std = self.X_train.std() 49 | self.X_train = (self.X_train - mean) / std 50 | self.X_test = (self.X_test - mean) / std 51 | 52 | def download(self): 53 | os.system("mkdir -p %s" % self.data_dir) 54 | for url in self.data_urls: 55 | os.system("wget -nc -P %s %s" % (self.data_dir, url)) 56 | -------------------------------------------------------------------------------- /nda/datasets/gisette.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import os 4 | import numpy as np 5 | from nda.datasets import Dataset 6 | from nda import log 7 | 8 | class Gisette(Dataset): 9 | def __init__(self, **kwargs): 10 | data_urls = [ 11 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_train.data', 12 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_train.labels', 13 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_valid.data', 14 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/gisette_valid.labels' 15 | ] 16 | 17 | super().__init__(data_urls=data_urls, **kwargs) 18 | 19 | def load_raw(self): 20 | def _load_raw(split): 21 | data_path = os.path.join(self.data_dir, 'gisette_%s.data' % split) 22 | with open(data_path) as f: 23 | data = f.readlines() 24 | data = np.array([[int(x) for x in line.split()] for line in data], dtype=float) 25 | 26 | label_path = os.path.join(self.data_dir, 'gisette_%s.labels' % split) 27 | with open(label_path) as f: 28 | labels = np.array([int(x) for x in f.read().split()], dtype=float) 29 | 30 | labels[labels < 0] = 0 31 | return data, labels 32 | 33 | self.X_train, self.Y_train = _load_raw('train') 34 | self.X_test, self.Y_test = _load_raw('valid') 35 | 36 | def normalize_data(self): 37 | self.X_train /= 999 38 | self.X_test /= 999 39 | super().normalize_data() 40 | -------------------------------------------------------------------------------- /nda/datasets/libsvm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import os 4 | import numpy as np 5 | 6 | from sklearn.datasets import load_svmlight_file 7 | from nda.datasets import Dataset 8 | 9 | from nda import log 10 | 11 | class LibSVM(Dataset): 12 | def __init__(self, name='a9a', **kwargs): 13 | data_urls = [ 14 | 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/%s' % name, 15 | 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/%s.t' %name 16 | ] 17 | super().__init__(root='~/data/LibSVM', name=name, data_urls=data_urls, **kwargs) 18 | 19 | def load_raw(self): 20 | def _load_raw(name, n_features=None): 21 | data_path = os.path.join(self.data_dir, name) 22 | X, Y = load_svmlight_file(data_path, n_features=n_features) 23 | X = X.toarray() 24 | Y[Y < 0] = 0 25 | return X, Y 26 | 27 | if self.name == 'a9a': 28 | n_features = 123 29 | elif self.name == 'a5a': 30 | n_features = 123 31 | else: 32 | n_features = None 33 | 34 | self.X_train, self.Y_train = _load_raw(self.name, n_features=n_features) 35 | self.X_test, self.Y_test = _load_raw('%s.t' % self.name, n_features=n_features) 36 | -------------------------------------------------------------------------------- /nda/datasets/mnist.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import os 4 | import numpy as np 5 | from sklearn.datasets import fetch_openml 6 | from nda.datasets import Dataset 7 | 8 | 9 | class MNIST(Dataset): 10 | 11 | def download(self): 12 | pass 13 | 14 | def load_raw(self): 15 | X, y = fetch_openml('mnist_784', version=1, return_X_y=True) 16 | # One-hot encode labels 17 | y = y.astype('int') 18 | Y = np.eye(y.max() + 1)[y] 19 | 20 | # Split to train & test 21 | n_train = 60000 22 | 23 | self.X_train, self.X_test = X[:n_train], X[n_train:] 24 | self.Y_train, self.Y_test = Y[:n_train], Y[n_train:] 25 | 26 | def normalize_data(self): 27 | self.X_train /= 255 28 | self.X_test /= 255 29 | super().normalize_data() 30 | -------------------------------------------------------------------------------- /nda/experiment_utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from .utils import run_exp, plot_results 5 | -------------------------------------------------------------------------------- /nda/experiment_utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import matplotlib.pyplot as plt 5 | import multiprocessing as mp 6 | import itertools, os, random, time 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from nda import log 11 | 12 | 13 | def LINE_STYLES(): 14 | return itertools.cycle([ 15 | line + color for line in ['-', '--', ':'] for color in ['k', 'r', 'b', 'c', 'y', 'g'] 16 | ]) 17 | 18 | 19 | def multi_process_helper(device_id, task_id, opt, res_queue): 20 | start = time.time() 21 | log.info(f'{opt.get_name()} started') 22 | log.debug(f'task {task_id} started on device {device_id}') 23 | np.random.seed(task_id) 24 | random.seed(task_id) 25 | 26 | try: 27 | import cupy as cp 28 | cp.cuda.Device(device=device_id).use() 29 | cp.random.seed(task_id) 30 | opt.cuda() 31 | opt.optimize() 32 | columns, metrics = opt.get_metrics() 33 | name = opt.get_name() 34 | except ModuleNotFoundError: 35 | opt.optimize() 36 | columns, metrics = opt.get_metrics() 37 | name = opt.get_name() 38 | 39 | end = time.time() 40 | log.info('%s done, total %.2fs', name, end - start) 41 | log.debug(f'task {task_id} on device {device_id} exited') 42 | 43 | res_queue.put([task_id, name, pd.DataFrame(metrics, columns=columns)]) 44 | 45 | 46 | def run_exp(exps, kappa=None, max_iter=None, name=None, save=False, plot=True, n_cpu_processes=None, n_gpus=None, processes_per_gpu=1, verbose=False): 47 | 48 | try: 49 | mp.set_start_method('spawn') 50 | except RuntimeError: 51 | pass 52 | 53 | use_gpu = n_gpus is not None 54 | if use_gpu: 55 | pool = {n: [] for n in range(n_gpus)} 56 | else: 57 | pool = {n: [] for n in range(n_cpu_processes)} 58 | 59 | _exps = list(enumerate(exps)) 60 | q = mp.Queue(len(_exps)) 61 | res = [] 62 | 63 | def _pop_queue(): 64 | while q.empty() is False: 65 | res.append(q.get()) 66 | # print(f'{res[-1][0]} stopped') 67 | 68 | def _remove_dead_process(): 69 | for device_id in pool.keys(): 70 | pool[device_id] = [process for process in pool[device_id] if process.is_alive()] 71 | 72 | while len(_exps) > 0: 73 | 74 | _pop_queue() 75 | _remove_dead_process() 76 | 77 | availabel_device_id = -1 78 | min_processes = processes_per_gpu if use_gpu else 1 79 | 80 | for device_id, processes in pool.items(): 81 | n_processes = len(processes) 
82 | if n_processes < min_processes: 83 | availabel_device_id = device_id 84 | min_processes = n_processes 85 | 86 | if availabel_device_id > -1: 87 | task_id, exp = _exps.pop(0) 88 | # print(f'{task_id} launched') 89 | pp = mp.Process(target=multi_process_helper, args=(availabel_device_id, task_id, exp, q,)) 90 | pp.start() 91 | pool[availabel_device_id].append(pp) 92 | pp, task_id, exp = None, None, None 93 | else: 94 | time.sleep(0.1) 95 | 96 | while len(pool) > 0: 97 | _remove_dead_process() 98 | pool = {k: v for k, v in pool.items() if len(v) > 0} 99 | _pop_queue() 100 | time.sleep(0.1) 101 | 102 | res = [_[1:] for _ in sorted(res, key=lambda x: x[0])] 103 | 104 | if save is True: 105 | os.system('mkdir -p data figs') 106 | 107 | if plot is True: 108 | plot_results( 109 | res, 110 | exps[0].p.m_total, 111 | kappa=kappa, 112 | max_iter=max_iter, 113 | name=name, 114 | save=save, 115 | ) 116 | 117 | if save is True: # Save data files too 118 | 119 | # Save to txt file 120 | for (name, data), exp in zip(res, exps): 121 | 122 | if kappa is not None: 123 | fname = r'data/' + str(name) + '_kappa_' + str(int(kappa)) 124 | else: 125 | fname = r'data/' + str(name) 126 | 127 | if hasattr(exp, 'n_mix'): 128 | fname += '_mix_' + str(exp.n_mix) + '_' + name + '.txt' 129 | else: 130 | fname += '_mix_1_' + name + '.txt' 131 | 132 | data.to_csv(fname, index=False) 133 | 134 | return res 135 | 136 | 137 | def plot_results(results, m_total, kappa=None, max_iter=None, name=None, save=False): 138 | 139 | if kappa is not None: 140 | fig_path = r'figs/' + str(name) + '_kappa_' + str(int(kappa)) 141 | else: 142 | fig_path = r'figs/' + str(name) 143 | 144 | plot_iters(results, fig_path, kappa=kappa, max_iter=max_iter, save=save) 145 | plot_grads(results, fig_path, m_total, kappa=kappa, max_iter=max_iter, save=save) 146 | 147 | if any(['comm_rounds' in res[1].columns for res in results]): 148 | plot_comms(results, fig_path, kappa=kappa, max_iter=max_iter, save=save) 149 | 150 | 151 | def plot_iters(results, path=None, kappa=None, max_iter=None, save=False): 152 | 153 | # iters vs. var_error 154 | legends = [] 155 | 156 | if any(['var_error' in result.columns for name, result in results]): 157 | plt.figure() 158 | for (name, result), style in zip(results, LINE_STYLES()): 159 | if 'var_error' in result.columns: 160 | legends.append(name) 161 | plt.loglog(result.t, result.var_error, style) 162 | plt.ylabel(r"$\frac{f({\bar{\mathbf{x}}}^{(t)}) - f({\mathbf{x}}^\star)}{f({\mathbf{x}}^\star)}$") 163 | plt.xlabel('#outer iterations') 164 | 165 | if kappa is not None: 166 | plt.title(r"$\kappa$ = " + str(int(kappa))) 167 | plt.legend(legends) 168 | if save is True: 169 | plt.savefig(path + '_var_iter.eps', format='eps') 170 | 171 | # iters vs. f 172 | legends = [] 173 | if max_iter is None: 174 | max_iter = min([res[1].t.iloc[-1] for res in results]) 175 | 176 | plt.figure() 177 | for (name, result), style in zip(results, LINE_STYLES()): 178 | legends.append(name) 179 | mask = result.t <= max_iter 180 | result = result.loc[mask] 181 | plt.loglog(result.t, result.f, style) 182 | # plt.title('Function value error vs. 
#outer iterations') 183 | plt.ylabel(r"Loss") 184 | plt.xlabel('#outer iterations') 185 | if kappa is not None: 186 | plt.title(r"$\kappa$ = " + str(int(kappa))) 187 | plt.legend(legends) 188 | if save is True: 189 | plt.savefig(path + '_fval_iter.eps', format='eps') 190 | 191 | 192 | def plot_comms(results, path, kappa=None, max_iter=None, save=False): 193 | 194 | if any(['var_error' in result.columns for name, result in results]): 195 | legends = [] 196 | plt.figure() 197 | for (name, data), style in zip(results, LINE_STYLES()): 198 | if 'var_error' in data.columns and 'comm_rounds' in data.columns: 199 | legends.append(name) 200 | plt.loglog(data.comm_rounds, data.var_error, style) 201 | 202 | plt.ylabel(r"$\frac{\Vert {\bar{\mathbf{x}}}^{(t)} - {\mathbf{x}}^\star \Vert}{\Vert {\mathbf{x}}^\star \Vert}$") 203 | plt.xlabel('#communication rounds') 204 | if kappa is not None: 205 | plt.title(r"$\kappa$ = " + str(int(kappa))) 206 | plt.legend(legends) 207 | if save is True: 208 | plt.savefig(path + '_var_comm.eps', format='eps') 209 | 210 | legends = [] 211 | plt.figure() 212 | for (name, data), style in zip(results, LINE_STYLES()): 213 | if 'comm_rounds' in data.columns: 214 | legends.append(name) 215 | plt.semilogy(data.comm_rounds, data.f, style) 216 | # plt.title('Function value error vs. #communications') 217 | plt.ylabel(r"Loss") 218 | plt.xlabel('#communication rounds') 219 | if kappa is not None: 220 | plt.title(r"$\kappa$ = " + str(int(kappa))) 221 | plt.legend(legends) 222 | if save is True: 223 | plt.savefig(path + '_fval_comm.eps', format='eps') 224 | 225 | 226 | def plot_grads(results, path, m, kappa=None, max_iter=None, save=False): 227 | 228 | # n_grads vs. var_error 229 | if any(['var_error' in result.columns for name, result in results]): 230 | legends = [] 231 | plt.figure() 232 | for (name, data), style in zip(results, LINE_STYLES()): 233 | if 'var_error' in data.columns: 234 | plt.loglog(data.n_grads / m, data.var_error, style) 235 | legends.append(name) 236 | # plt.title('Variable error vs. #gradient evaluations') 237 | plt.ylabel(r"$\frac{\Vert {\bar{\mathbf{x}}}^{(t)} - {\mathbf{x}}^\star \Vert}{\Vert {\mathbf{x}}^\star \Vert}$") 238 | plt.xlabel('#gradient evaluations / #total samples') 239 | if kappa is not None: 240 | plt.title(r"$\kappa$ = " + str(int(kappa))) 241 | plt.legend(legends) 242 | if save is True: 243 | plt.savefig(path + '_var_grads.eps', format='eps') 244 | 245 | # n_grads vs. f 246 | legends = [] 247 | plt.figure() 248 | for (name, data), style in zip(results, LINE_STYLES()): 249 | plt.loglog(data.n_grads / m, data.f, style) 250 | legends.append(name) 251 | # plt.title('Function value error vs. 
#gradient evaluations') 252 | plt.ylabel(r"Loss") 253 | plt.xlabel('#gradient evaluations / #total samples') 254 | if kappa is not None: 255 | plt.title(r"$\kappa$ = " + str(int(kappa))) 256 | plt.legend(legends) 257 | if save is True: 258 | plt.savefig(path + '_fval_grads.eps', format='eps') 259 | -------------------------------------------------------------------------------- /nda/log.py: -------------------------------------------------------------------------------- 1 | # Cloned from https://github.com/benley/python-glog 2 | """A simple Google-style logging wrapper.""" 3 | 4 | import os 5 | import sys 6 | import time 7 | import traceback 8 | import logging 9 | import colorlog 10 | 11 | 12 | def format_message(record): 13 | try: 14 | record_message = '%s' % (record.msg % record.args) 15 | except TypeError: 16 | record_message = record.msg 17 | return record_message 18 | 19 | 20 | class MyLogFormatter(colorlog.ColoredFormatter): 21 | LEVEL_MAP = { 22 | logging.FATAL: 'F', # FATAL is alias of CRITICAL 23 | logging.ERROR: 'E', 24 | logging.WARN: 'W', 25 | logging.INFO: 'I', 26 | logging.DEBUG: 'D' 27 | } 28 | 29 | def __init__(self): 30 | colorlog.ColoredFormatter.__init__(self, '%(log_color)s%(levelname)s %(message)s%(reset)s') 31 | 32 | def format(self, record): 33 | date = time.localtime(record.created) 34 | date_usec = (record.created - int(record.created)) * 1e4 35 | record_message = '%02d:%02d:%02d.%04d %s %s:%d] %s' % ( 36 | date.tm_hour, date.tm_min, 37 | date.tm_sec, date_usec, 38 | record.process if record.process is not None else '?????', 39 | record.filename, 40 | record.lineno, 41 | format_message(record)) 42 | record.getMessage = lambda: record_message 43 | return colorlog.ColoredFormatter.format(self, record) 44 | 45 | 46 | def set_level(new_level): 47 | logger.setLevel(new_level) 48 | logger.debug('Log level set to %s', new_level) 49 | 50 | 51 | debug = logging.debug 52 | info = logging.info 53 | warning = logging.warning 54 | warn = logging.warning 55 | error = logging.error 56 | exception = logging.exception 57 | fatal = logging.fatal 58 | log = logging.log 59 | 60 | DEBUG = logging.DEBUG 61 | INFO = logging.INFO 62 | WARNING = logging.WARNING 63 | WARN = logging.WARN 64 | ERROR = logging.ERROR 65 | FATAL = logging.FATAL 66 | 67 | handler = logging.StreamHandler() 68 | handler.setFormatter(MyLogFormatter()) 69 | 70 | glog = logger = logging.getLogger() 71 | logger.addHandler(handler) 72 | set_level('INFO') 73 | 74 | 75 | def _critical(self, message, *args, **kws): 76 | self._log(50, message, args, **kws) 77 | sys.exit(-1) 78 | 79 | 80 | logging.Logger.critical = _critical 81 | 82 | # Define functions emulating C++ glog check-macros 83 | # https://htmlpreview.github.io/?https://github.com/google/glog/master/doc/glog.html#check 84 | 85 | 86 | def format_stacktrace(stack): 87 | """Print a stack trace that is easier to read. 
88 | 89 | * Reduce paths to basename component 90 | * Truncates the part of the stack after the check failure 91 | """ 92 | lines = [] 93 | for _, f in enumerate(stack): 94 | fname = os.path.basename(f[0]) 95 | line = "\t%s:%d\t%s" % (fname + "::" + f[2], f[1], f[3]) 96 | lines.append(line) 97 | return lines 98 | 99 | 100 | class FailedCheckException(AssertionError): 101 | """Exception with message indicating check-failure location and values.""" 102 | 103 | 104 | def check_failed(message): 105 | stack = traceback.extract_stack() 106 | stack = stack[0:-2] 107 | stacktrace_lines = format_stacktrace(stack) 108 | filename, line_num, _, _ = stack[-1] 109 | 110 | try: 111 | raise FailedCheckException(message) 112 | except FailedCheckException: 113 | log_record = logger.makeRecord('CRITICAL', 50, filename, line_num, 114 | message, None, None) 115 | handler.handle(log_record) 116 | 117 | log_record = logger.makeRecord('DEBUG', 10, filename, line_num, 118 | 'Check failed here:', None, None) 119 | handler.handle(log_record) 120 | for line in stacktrace_lines: 121 | log_record = logger.makeRecord('DEBUG', 10, filename, line_num, 122 | line, None, None) 123 | handler.handle(log_record) 124 | raise 125 | 126 | 127 | def check(condition, message=None): 128 | """Raise exception with message if condition is False.""" 129 | if not condition: 130 | if message is None: 131 | message = "Check failed." 132 | check_failed(message) 133 | 134 | 135 | def check_eq(obj1, obj2, message=None): 136 | """Raise exception with message if obj1 != obj2.""" 137 | if obj1 != obj2: 138 | if message is None: 139 | message = "Check failed: %s != %s" % (str(obj1), str(obj2)) 140 | check_failed(message) 141 | 142 | 143 | def check_ne(obj1, obj2, message=None): 144 | """Raise exception with message if obj1 == obj2.""" 145 | if obj1 == obj2: 146 | if message is None: 147 | message = "Check failed: %s == %s" % (str(obj1), str(obj2)) 148 | check_failed(message) 149 | 150 | 151 | def check_le(obj1, obj2, message=None): 152 | """Raise exception with message if not (obj1 <= obj2).""" 153 | if obj1 > obj2: 154 | if message is None: 155 | message = "Check failed: %s > %s" % (str(obj1), str(obj2)) 156 | check_failed(message) 157 | 158 | 159 | def check_ge(obj1, obj2, message=None): 160 | """Raise exception with message unless (obj1 >= obj2).""" 161 | if obj1 < obj2: 162 | if message is None: 163 | message = "Check failed: %s < %s" % (str(obj1), str(obj2)) 164 | check_failed(message) 165 | 166 | 167 | def check_lt(obj1, obj2, message=None): 168 | """Raise exception with message unless (obj1 < obj2).""" 169 | if obj1 >= obj2: 170 | if message is None: 171 | message = "Check failed: %s >= %s" % (str(obj1), str(obj2)) 172 | check_failed(message) 173 | 174 | 175 | def check_gt(obj1, obj2, message=None): 176 | """Raise exception with message unless (obj1 > obj2).""" 177 | if obj1 <= obj2: 178 | if message is None: 179 | message = "Check failed: %s <= %s" % (str(obj1), str(obj2)) 180 | check_failed(message) 181 | 182 | 183 | def check_notnone(obj, message=None): 184 | """Raise exception with message if obj is None.""" 185 | if obj is None: 186 | if message is None: 187 | message = "Check failed: Object is None." 
188 | check_failed(message) 189 | -------------------------------------------------------------------------------- /nda/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.optimizers.optimizer import Optimizer 5 | from nda.optimizers.centralized import GD, SGD, NAG, SARAH, SVRG 6 | from nda.optimizers.centralized_distributed import ADMM, DANE 7 | 8 | from nda.optimizers.decentralized_distributed import * 9 | from nda.optimizers.network import * 10 | 11 | from nda.optimizers import compressor 12 | -------------------------------------------------------------------------------- /nda/optimizers/centralized/GD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | from nda import log 9 | from nda.optimizers import Optimizer 10 | 11 | 12 | class GD(Optimizer): 13 | '''The vanilla GD''' 14 | 15 | def __init__(self, p, eta=0.1, **kwargs): 16 | super().__init__(p, is_distributed=False, **kwargs) 17 | self.eta = eta 18 | if self.p.is_smooth is False: 19 | log.info('Nonsmooth problem, running sub-gradient descent instead') 20 | self.update = self.subgd_update 21 | self.name = 'SubGD' 22 | 23 | def update(self): 24 | self.x -= self.eta * self.grad(self.x) 25 | 26 | def subgd_update(self): 27 | self.x -= self.eta / xp.sqrt(self.t) * self.grad(self.x) 28 | -------------------------------------------------------------------------------- /nda/optimizers/centralized/NAG.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | from nda import log 5 | from nda.optimizers import Optimizer 6 | 7 | 8 | class NAG(Optimizer): 9 | '''The Nesterov's Accelerated GD''' 10 | 11 | def __init__(self, p, **kwargs): 12 | super().__init__(p, is_distributed=False, **kwargs) 13 | 14 | if self.p.sigma > 0: 15 | self.update = self.update_strongly_convex 16 | else: 17 | log.error('NAG only supports strongly convex') 18 | 19 | if self.p.sigma > 0: 20 | self.x = self.y = self.x_0 21 | root_kappa = np.sqrt(self.p.L / self.p.sigma) 22 | r = (root_kappa - 1) / (root_kappa + 1) 23 | self.r_1 = 1 + r 24 | self.r_2 = r 25 | 26 | def update_convex(self): 27 | pass 28 | 29 | def update_strongly_convex(self): 30 | x_last = self.x.copy() 31 | self.x = self.y - self.grad(self.x) / self.p.L 32 | self.y = self.r_1 * self.x - self.r_2 * x_last 33 | -------------------------------------------------------------------------------- /nda/optimizers/centralized/SARAH.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class SARAH(Optimizer): 12 | '''The SARAH algorithm''' 13 | 14 | def __init__(self, p, n_inner_iters=20, batch_size=1, eta=0.01, opt=1, **kwargs): 15 | super().__init__(p, is_distributed=False, **kwargs) 16 | self.eta = eta 17 | self.n_inner_iters = n_inner_iters 18 | self.opt = opt 19 | self.batch_size = batch_size 20 | 21 | def update(self): 22 | v = self.grad(self.x) 23 | x_last = self.x.copy() 24 | self.x -= self.eta * v 25 | 26 | if self.opt == 1: 27 | n_inner_iters = self.n_inner_iters 28 | else: 29 | # Choose random stopping point from [1, 
n_inner_iters] 30 | n_inner_iters = xp.random.randint(1, self.n_inner_iters + 1) 31 | if type(n_inner_iters) is xp.ndarray: 32 | n_inner_iters = n_inner_iters.item() 33 | 34 | sample_list = xp.random.randint(0, self.p.m_total, (n_inner_iters, self.batch_size)) 35 | for i in range(n_inner_iters - 1): 36 | v += self.grad(self.x, j=sample_list[i]) - self.grad(x_last, j=sample_list[i]) 37 | x_last = self.x.copy() 38 | self.x -= self.eta * v 39 | -------------------------------------------------------------------------------- /nda/optimizers/centralized/SGD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class SGD(Optimizer): 12 | '''Stochastic Gradient Descent''' 13 | 14 | def __init__(self, p, batch_size=1, eta=0.1, diminishing_step_size=False, **kwargs): 15 | super().__init__(p, is_distributed=False, **kwargs) 16 | self.eta = eta 17 | self.batch_size = batch_size 18 | self.diminishing_step_size = diminishing_step_size 19 | 20 | def update(self): 21 | 22 | sample_list = xp.random.randint(0, self.p.m_total, self.batch_size) 23 | grad = self.grad(self.x, j=sample_list) 24 | 25 | if self.diminishing_step_size is True: 26 | self.x -= self.eta / self.t * grad 27 | else: 28 | self.x -= self.eta * grad 29 | -------------------------------------------------------------------------------- /nda/optimizers/centralized/SVRG.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class SVRG(Optimizer): 12 | '''The SVRG algorithm''' 13 | 14 | def __init__(self, p, n_inner_iters=20, batch_size=1, eta=0.01, opt=1, **kwargs): 15 | super().__init__(p, is_distributed=False, **kwargs) 16 | self.eta = eta 17 | self.n_inner_iters = n_inner_iters 18 | self.opt = opt 19 | self.batch_size = batch_size 20 | 21 | def update(self): 22 | mu = self.grad(self.x) 23 | u = self.x.copy() 24 | 25 | if self.opt == 1: 26 | n_inner_iters = self.n_inner_iters 27 | else: 28 | # Choose random stopping point from [1, n_inner_iters] 29 | n_inner_iters = xp.random.randint(1, self.n_inner_iters + 1) 30 | if type(n_inner_iters) is xp.ndarray: 31 | n_inner_iters = n_inner_iters.item() 32 | 33 | sample_list = xp.random.randint(0, self.p.m_total, (n_inner_iters, self.batch_size)) 34 | for i in range(n_inner_iters - 1): 35 | v = self.grad(u, j=sample_list[i]) - self.grad(self.x, j=sample_list[i]) + mu 36 | u -= self.eta * v 37 | 38 | self.x = u 39 | -------------------------------------------------------------------------------- /nda/optimizers/centralized/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.optimizers.centralized.GD import GD 5 | from nda.optimizers.centralized.SGD import SGD 6 | from nda.optimizers.centralized.NAG import NAG 7 | from nda.optimizers.centralized.SARAH import SARAH 8 | from nda.optimizers.centralized.SVRG import SVRG 9 | -------------------------------------------------------------------------------- /nda/optimizers/centralized_distributed/ADMM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | 
import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | import numpy as np 9 | 10 | from nda.optimizers.utils import NAG, GD, FISTA 11 | from nda.optimizers import Optimizer 12 | 13 | 14 | class ADMM(Optimizer): 15 | '''ADMM for consensus optimization described in http://www.princeton.edu/~yc5/ele522_optimization/lectures/ADMM.pdf''' 16 | 17 | def __init__(self, p, rho=0.1, local_n_iters=100, delta=None, local_optimizer='NAG', **kwargs): 18 | super().__init__(p, **kwargs) 19 | self.rho = rho 20 | self.local_optimizer = local_optimizer 21 | self.local_n_iters = local_n_iters 22 | self.delta = delta 23 | self.Lambda = np.random.rand(self.p.dim, self.p.n_agent) 24 | 25 | def update(self): 26 | self.comm_rounds += 1 27 | 28 | x = xp.random.rand(self.p.dim, self.p.n_agent) 29 | z = self.x.copy() # Using notations from the tutorial 30 | 31 | for i in range(self.p.n_agent): 32 | 33 | # Non-smooth problems 34 | if self.p.is_smooth is not True: 35 | 36 | def _grad(tmp): 37 | return self.grad_f(tmp, i) + self.rho / 2 * (tmp - z) + self.Lambda[:, i] / 2 38 | 39 | x[:, i], count = FISTA(_grad, self.x.copy(), self.p.L + self.rho, self.p.r, n_iters=self.local_n_iters, eps=1e-10) 40 | else: 41 | 42 | def _grad(tmp): 43 | return self.grad(tmp, i) + self.rho / 2 * (tmp - z) + self.Lambda[:, i] / 2 44 | 45 | if self.local_optimizer == "NAG": 46 | x[:, i], _ = NAG(_grad, self.x.copy(), self.p.L + self.rho, self.p.sigma + self.rho, self.local_n_iters) 47 | else: 48 | if self.delta is not None: 49 | x[:, i], _ = GD(_grad, self.x.copy(), self.delta, self.local_n_iters) 50 | else: 51 | x[:, i], _ = GD(_grad, self.x.copy(), 2 / (self.p.L + self.rho + self.p.sigma + self.rho), self.local_n_iters) 52 | 53 | z = (x + self.Lambda).mean(axis=1) 54 | 55 | for i in range(self.p.n_agent): 56 | self.Lambda[:, i] += self.rho * (x[:, i] - self.x) 57 | 58 | self.x = z 59 | -------------------------------------------------------------------------------- /nda/optimizers/centralized_distributed/DANE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from nda.optimizers.utils import NAG, GD, FISTA 4 | from nda.optimizers import Optimizer 5 | 6 | 7 | class DANE(Optimizer): 8 | '''The (inexact) DANE algorithm described in Communication Efficient Distributed Optimization using an Approximate Newton-type Method, https://arxiv.org/abs/1312.7853''' 9 | 10 | def __init__(self, p, mu=0.1, local_n_iters=100, local_optimizer='NAG', delta=None, **kwargs): 11 | super().__init__(p, **kwargs) 12 | self.mu = mu 13 | self.local_optimizer = local_optimizer 14 | self.local_n_iters = local_n_iters 15 | self.delta = delta 16 | 17 | def update(self): 18 | self.comm_rounds += 2 19 | 20 | grad_x = self.grad_h(self.x) 21 | 22 | x_next = 0 23 | for i in range(self.p.n_agent): 24 | 25 | if self.p.is_smooth is False: 26 | grad_x_i = self.grad_h(self.x, i) 27 | 28 | def _grad(tmp): 29 | return self.grad_h(tmp, i) - grad_x_i + grad_x + self.mu * (tmp - self.x) 30 | tmp, count = FISTA(_grad, self.x.copy(), self.mu + 1, self.p.r, n_iters=self.local_n_iters, eps=1e-10) 31 | 32 | else: 33 | grad_x_i = self.grad_h(self.x, i) 34 | 35 | def _grad(tmp): 36 | return self.grad_h(tmp, i) - grad_x_i + grad_x + self.mu * (tmp - self.x) 37 | 38 | if self.local_optimizer == "NAG": 39 | if self.delta is not None: 40 | tmp, count_ = NAG(_grad, self.x.copy(), self.delta, self.local_n_iters) 41 | else: 42 | tmp, count_ = NAG(_grad, self.x.copy(), self.p.L + self.mu, 
self.p.sigma + self.mu, self.local_n_iters) 43 | 44 | else: 45 | if self.delta is not None: 46 | tmp, count_ = GD(_grad, self.x.copy(), self.delta, self.local_n_iters) 47 | else: 48 | tmp, count_ = GD(_grad, self.x.copy(), 2 / (self.p.L + self.mu + self.p.sigma + self.mu), self.local_n_iters) 49 | 50 | x_next += tmp 51 | 52 | self.x = x_next / self.p.n_agent 53 | -------------------------------------------------------------------------------- /nda/optimizers/centralized_distributed/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.optimizers.centralized_distributed.ADMM import ADMM 5 | from nda.optimizers.centralized_distributed.DANE import DANE 6 | -------------------------------------------------------------------------------- /nda/optimizers/compressor.py: -------------------------------------------------------------------------------- 1 | try: 2 | import cupy as xp 3 | except ImportError: 4 | import numpy as xp 5 | 6 | # import numpy as np 7 | 8 | def identity(x, *args, **kwargs): 9 | return x 10 | 11 | # top_a 12 | def top(x, a): 13 | dim = x.shape[0] 14 | if a == 0: 15 | return 0 16 | if a >= dim: 17 | return x 18 | index_array = xp.argpartition(x, kth=a, axis=0)[a:] 19 | xp.put_along_axis(x, index_array, 0, axis=0) 20 | return x 21 | 22 | # x = np.random.randint(0, 100, 24).reshape(6, 4) 23 | # x 24 | # top(x, 2) 25 | 26 | # Random_a compressor, keep a values 27 | def random(x, a): 28 | dim = x.shape[0] 29 | if a == 0: 30 | return 0 31 | if a == dim: 32 | return x 33 | if x.ndim == 2: 34 | for i in range(x.shape[1]): 35 | zero_mask = xp.random.choice(dim, dim - a, replace=False) 36 | x[zero_mask, i] = 0 37 | else: 38 | zero_mask = xp.random.choice(dim, dim - a, replace=False) 39 | x[zero_mask] = 0 40 | return x 41 | 42 | 43 | # gsgd_b 44 | def gsgd(x, b): 45 | norm = xp.linalg.norm(x, axis=0) 46 | return norm / (2 ** (b - 1)) * xp.sign(x) * xp.floor( 47 | (2 ** (b - 1)) / norm * xp.abs(x) + xp.random.uniform(0, 1, x.shape) 48 | ) 49 | 50 | 51 | # random quantization 2-norm with level s 52 | def random_quantization(x, s): 53 | dim = x.shape[0] 54 | xnorm = xp.linalg.norm(x) 55 | if s == 0 or xnorm == 0: 56 | return xp.zeros(dim, dtype=int) 57 | noise = xp.random.uniform(0, 1, dim) 58 | rounded = xp.floor(s * xp.abs(x) / xnorm + noise) 59 | compressed = (xnorm / s) * xp.sign(x) * rounded 60 | return compressed 61 | 62 | 63 | # natural compression (power of 2 for each coordinate) 64 | def natural_compression(x): 65 | dim = x.shape[0] 66 | logx = xp.ma.log2(xp.abs(x)).filled(-15) 67 | logx_floor = xp.floor(logx) 68 | noise = xp.random.uniform(0.0, 1.0, dim) 69 | leftx = xp.exp2(logx_floor) 70 | rounded = xp.floor(xp.ma.log2(xp.abs(x) + leftx * noise).filled(-15)) 71 | compressed = xp.sign(x) * xp.exp2(rounded) 72 | return compressed 73 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/CHOCO_SGD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as np 5 | except ImportError: 6 | import numpy as np 7 | 8 | from nda.optimizers import Optimizer 9 | from nda.optimizers import compressor 10 | 11 | 12 | class CHOCO_SGD(Optimizer): 13 | '''Decentralized Stochastic Optimization and Gossip Algorithms with Compressed Communication''' 14 | 15 | def __init__(self, p, eta=0.1, gamma=0.1, batch_size=1, 
compressor_type=None, compressor_param=None, **kwargs): 16 | 17 | super().__init__(p, **kwargs) 18 | self.eta = eta 19 | self.gamma = gamma 20 | self.batch_size = batch_size 21 | 22 | # Compressor 23 | self.compressor_param = compressor_param 24 | if compressor_type == 'top': 25 | self.Q = compressor.top 26 | elif compressor_type == 'random': 27 | self.Q = compressor.random 28 | elif compressor_type == 'gsgd': 29 | self.Q = compressor.gsgd 30 | else: 31 | self.Q = compressor.identity 32 | 33 | self.x_hat = np.zeros_like(self.x) 34 | self.W_shifted = self.W - np.eye(self.p.n_agent) 35 | 36 | def update(self): 37 | self.comm_rounds += 1 38 | 39 | samples = np.random.randint(0, self.p.m, (self.p.n_agent, self.batch_size)) 40 | grad = self.grad(self.x, j=samples) 41 | 42 | self.x -= self.eta * grad 43 | self.x_hat += self.Q(self.x - self.x_hat, self.compressor_param) 44 | self.x += self.gamma * self.x_hat.dot(self.W_shifted) 45 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/D2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | import numpy as np 9 | 10 | from nda.optimizers import Optimizer 11 | 12 | class D2(Optimizer): 13 | '''D2: Decentralized Training over Decentralized Data, https://arxiv.org/abs/1803.07068''' 14 | 15 | def __init__(self, p, eta=0.1, batch_size=1, **kwargs): 16 | super().__init__(p, **kwargs) 17 | self.eta = eta 18 | self.grad_last = None 19 | self.batch_size = batch_size 20 | self.tilde_W = (self.W + np.eye(self.p.n_agent)) / 2 21 | 22 | def update(self): 23 | self.comm_rounds += 1 24 | 25 | samples = xp.random.randint(0, self.p.m, (self.p.n_agent, self.batch_size)) 26 | 27 | if self.t == 1: 28 | self.grad_last = self.grad(self.x, j=samples) 29 | tmp = self.x - self.eta * self.grad_last 30 | 31 | else: 32 | tmp = 2 * self.x - self.x_last 33 | tmp += self.eta * self.grad_last 34 | self.grad_last = self.grad(self.x, j=samples) 35 | tmp -= self.eta * self.grad_last 36 | tmp = tmp.dot(self.tilde_W) 37 | 38 | # Update variables 39 | self.x, self.x_last = tmp, self.x 40 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/DGD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from nda.optimizers import Optimizer 4 | 5 | 6 | class DGD(Optimizer): 7 | 8 | def __init__(self, p, eta=0.1, **kwargs): 9 | 10 | super().__init__(p, **kwargs) 11 | self.eta = eta 12 | 13 | def update(self): 14 | self.comm_rounds += 1 15 | self.x = self.x.dot(self.W) - self.eta * self.grad(self.x) 16 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/DGD_tracking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from nda.optimizers import Optimizer 4 | 5 | 6 | class DGD_tracking(Optimizer): 7 | '''The distributed gradient descent algorithm with gradient tracking, described in 'Harnessing Smoothness to Accelerate Distributed Optimization', Guannan Qu, Na Li''' 8 | 9 | def __init__(self, p, eta=0.1, **kwargs): 10 | super().__init__(p, **kwargs) 11 | self.eta = eta 12 | self.grad_last = None 13 | 14 | def init(self): 15 | super().init() 16 | self.s = 
self.grad(self.x) 17 | self.grad_last = self.s.copy() 18 | 19 | def update(self): 20 | self.comm_rounds += 2 21 | 22 | self.x = self.x.dot(self.W) - self.eta * self.s 23 | grad_current = self.grad(self.x) 24 | 25 | self.s = self.s.dot(self.W) + grad_current - self.grad_last 26 | self.grad_last = grad_current 27 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/DSGD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class DSGD(Optimizer): 12 | '''The Decentralized SGD (D-PSGD) described in https://arxiv.org/pdf/1808.07576.pdf''' 13 | 14 | def __init__(self, p, batch_size=1, eta=0.1, diminishing_step_size=False, **kwargs): 15 | 16 | super().__init__(p, **kwargs) 17 | self.eta = eta 18 | self.batch_size = batch_size 19 | self.diminishing_step_size = diminishing_step_size 20 | 21 | def update(self): 22 | self.comm_rounds += 1 23 | 24 | if self.diminishing_step_size is True: 25 | delta_t = self.eta / self.t 26 | else: 27 | delta_t = self.eta 28 | 29 | samples = xp.random.randint(0, self.p.m, (self.p.n_agent, self.batch_size)) 30 | grad = self.grad(self.x, j=samples) 31 | self.x = self.x.dot(self.W) - delta_t * grad 32 | # self.x -= delta_t * grad 33 | # self.x = self.x.dot(self.W) 34 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/EXTRA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | import numpy as np 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class EXTRA(Optimizer): 12 | '''EXTRA: AN EXACT FIRST-ORDER ALGORITHM FOR DECENTRALIZED CONSENSUS OPTIMIZATION, https://arxiv.org/pdf/1404.6264.pdf''' 13 | 14 | def __init__(self, p, eta=0.1, **kwargs): 15 | super().__init__(p, **kwargs) 16 | self.eta = eta 17 | self.grad_last = None 18 | W_min_diag = min(np.diag(self.W)) 19 | tmp = (1 - 1e-1) / (1 - W_min_diag) 20 | self.W_s = self.W * tmp + np.eye(self.p.n_agent) * (1 - tmp) 21 | 22 | def update(self): 23 | self.comm_rounds += 1 24 | 25 | if self.t == 1: 26 | self.grad_last = self.grad(self.x) 27 | tmp = self.x.dot(self.W) - self.eta * self.grad_last 28 | else: 29 | tmp = self.x.dot(self.W) + self.x - self.x_last.dot(self.W_s) 30 | tmp += self.eta * self.grad_last 31 | self.grad_last = self.grad(self.x) 32 | tmp -= self.eta * self.grad_last 33 | 34 | self.x, self.x_last = tmp, self.x 35 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/GT_SARAH.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as xp 5 | except ImportError: 6 | import numpy as xp 7 | import numpy as np 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class GT_SARAH(Optimizer): 12 | '''A near-optimal stochastic gradient method for decentralized non-convex finite-sum optimization, https://arxiv.org/abs/2008.07428''' 13 | 14 | def __init__(self, p, n_inner_iters=100, eta=0.1, batch_size=1, **kwargs): 15 | super().__init__(p, **kwargs) 16 | self.eta = eta 17 | self.n_inner_iters = n_inner_iters 18 | self.batch_size = batch_size 19 | 20 | self.v = 
np.zeros((self.p.dim, self.p.n_agent)) 21 | self.y = np.zeros((self.p.dim, self.p.n_agent)) 22 | 23 | def update(self): 24 | 25 | self.v_last = self.v 26 | self.x_last = self.x 27 | 28 | self.v = self.grad(self.x) 29 | self.y = self.y.dot(self.W) + self.v - self.v_last 30 | self.x = self.x.dot(self.W) - self.eta * self.y 31 | self.comm_rounds += 1 32 | 33 | samples = xp.random.randint(0, self.p.m, (self.n_inner_iters, self.p.n_agent, self.batch_size)) 34 | for inner_iter in range(self.n_inner_iters): 35 | 36 | self.v_last = self.v 37 | self.x_last = self.x 38 | 39 | self.v = self.v + self.grad(self.x, j=samples[inner_iter]) - self.grad(self.x_last, j=samples[inner_iter]) 40 | 41 | self.y = self.y.dot(self.W) + self.v - self.v_last 42 | self.x = self.x.dot(self.W) - self.eta * self.y 43 | self.comm_rounds += 1 44 | 45 | if inner_iter < self.n_inner_iters - 1: 46 | self.save_metrics() 47 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/NIDS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as np 5 | except ImportError: 6 | import numpy as np 7 | 8 | from nda.optimizers import Optimizer 9 | 10 | 11 | class NIDS(Optimizer): 12 | '''A Decentralized Proximal-Gradient Method with Network Independent Step-sizes and Separated Convergence Rates, https://arxiv.org/abs/1704.07807''' 13 | 14 | def __init__(self, p, eta=0.1, **kwargs): 15 | super().__init__(p, **kwargs) 16 | self.eta = eta 17 | self.tilde_W = (self.W + np.eye(self.p.n_agent)) / 2 18 | self.grad_last = None 19 | 20 | def update(self): 21 | self.comm_rounds += 1 22 | 23 | if self.t == 1: 24 | self.grad_last = self.grad(self.x) 25 | tmp = self.x - self.eta * self.grad_last 26 | 27 | else: 28 | tmp = 2 * self.x - self.x_last 29 | tmp += self.eta * self.grad_last 30 | self.grad_last = self.grad(self.x) 31 | tmp -= self.eta * self.grad_last 32 | tmp = tmp.dot(self.tilde_W) 33 | 34 | # Update variables 35 | self.x, self.x_last = tmp, self.x 36 | -------------------------------------------------------------------------------- /nda/optimizers/decentralized_distributed/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.optimizers.decentralized_distributed.DGD import DGD 5 | from nda.optimizers.decentralized_distributed.DSGD import DSGD 6 | from nda.optimizers.decentralized_distributed.DGD_tracking import DGD_tracking 7 | from nda.optimizers.decentralized_distributed.EXTRA import EXTRA 8 | from nda.optimizers.decentralized_distributed.NIDS import NIDS 9 | from nda.optimizers.decentralized_distributed.GT_SARAH import GT_SARAH 10 | from nda.optimizers.decentralized_distributed.CHOCO_SGD import CHOCO_SGD 11 | from nda.optimizers.decentralized_distributed.D2 import D2 12 | -------------------------------------------------------------------------------- /nda/optimizers/network/Destress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | import numpy as xp 9 | 10 | from nda.optimizers import Optimizer 11 | 12 | 13 | def T(x, k): 14 | 15 | if k == 0: 16 | if type(x) is np.ndarray: 17 | return np.eye(x.shape[0]) 18 | else: 19 | return 1 20 | 21 | if type(x) is np.ndarray: 22 | prev = np.eye(x.shape[0]) 23 | else: 
24 | prev = 1 25 | 26 | current = x 27 | for _ in range(k - 1): 28 | current, prev = 2 * np.dot(x, current) - prev, current 29 | 30 | return current 31 | 32 | 33 | class Destress(Optimizer): 34 | def __init__(self, p, n_mix=1, n_inner_iters=100, eta=0.1, K_in=1, K_out=1, batch_size=1, opt=0, perturbation_threshould=None, perturbation_radius=None, perturbation_variance=None, **kwargs): 35 | super().__init__(p, **kwargs) 36 | 37 | self.K_in = K_in 38 | self.K_out = K_out 39 | 40 | self.eta = eta 41 | self.opt = opt 42 | self.n_inner_iters = n_inner_iters 43 | self.batch_size = batch_size 44 | 45 | average_matrix = np.ones((self.p.n_agent, self.p.n_agent)) / self.p.n_agent 46 | alpha = np.linalg.norm(self.W - average_matrix, 2) 47 | self.W_in = T(self.W / alpha, self.K_in) / T(1 / alpha, self.K_in) 48 | self.W_out = T(self.W / alpha, self.K_out) / T(1 / alpha, self.K_out) 49 | 50 | def init(self): 51 | 52 | super().init() 53 | 54 | # Equivalent mixing matrices after n_mix rounds of mixng 55 | # W_min_diag = min(np.diag(self.W)) 56 | # tmp = (1 - 1e-1) / (1 - W_min_diag) 57 | # self.W_s = self.W * tmp + np.eye(self.p.n_agent) * (1 - tmp) 58 | 59 | if len(self.x_0.shape) == 2: 60 | self.x = xp.tile(self.x_0.mean(axis=1), (self.p.n_agent, 1)).T 61 | else: 62 | self.x = self.x_0.copy() 63 | 64 | self.grad_last = self.grad(self.x) 65 | self.s = self.grad_last.copy() 66 | self.s = xp.tile(self.s.mean(axis=1), (self.p.n_agent, 1)).T 67 | 68 | def local_update(self): 69 | if self.opt == 1: 70 | n_inner_iters = self.n_inner_iters 71 | else: 72 | # Choose random x^{(t)} from n_inner_iters 73 | n_inner_iters = xp.random.randint(1, self.n_inner_iters + 1) 74 | if type(n_inner_iters) is xp.ndarray: 75 | n_inner_iters = n_inner_iters.item() 76 | 77 | samples = xp.random.randint(0, self.p.m, (n_inner_iters, self.p.n_agent, self.batch_size)) 78 | 79 | u = self.x.copy() 80 | v = self.s.copy() 81 | for inner_iter in range(n_inner_iters): 82 | 83 | u_last, u = u, (u - self.eta * v).dot(self.W_in) 84 | self.comm_rounds += self.K_in 85 | 86 | v += self.grad(u, j=samples[inner_iter]) - self.grad(u_last, j=samples[inner_iter]) 87 | v = v.dot(self.W_in) 88 | self.comm_rounds += self.K_in 89 | 90 | if inner_iter < n_inner_iters - 1: 91 | self.save_metrics(x=u) 92 | 93 | self.x = u 94 | 95 | def update(self): 96 | 97 | self.local_update() 98 | 99 | self.s -= self.grad_last 100 | self.grad_last = self.grad(self.x) 101 | self.s += self.grad_last 102 | self.s = self.s.dot(self.W_out) 103 | self.comm_rounds += self.K_out 104 | -------------------------------------------------------------------------------- /nda/optimizers/network/NetworkDANE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | from nda.optimizers.utils import NAG, GD, FISTA 4 | from .network_optimizer import NetworkOptimizer 5 | 6 | 7 | class NetworkDANE(NetworkOptimizer): 8 | 9 | def __init__(self, p, mu=0.1, local_n_iters=100, local_optimizer='NAG', delta=None, **kwargs): 10 | super().__init__(p, **kwargs) 11 | self.mu = mu 12 | self.local_optimizer = local_optimizer 13 | self.local_n_iters = local_n_iters 14 | self.delta = delta 15 | 16 | def local_update(self): 17 | 18 | for i in range(self.p.n_agent): 19 | 20 | if self.p.is_smooth is False: 21 | grad_y = self.p.grad_f(self.y[:, i], i) 22 | 23 | def _grad(tmp): 24 | return self.grad_f(tmp, i) - grad_y + self.s[:, i] + self.mu * (tmp - self.y[:, i]) 25 | self.x[:, i], count = FISTA(_grad, self.y[:, i].copy(), self.p.L + 
self.mu, self.p.r, n_iters=self.local_n_iters, eps=1e-10) 26 | else: 27 | grad_y = self.p.grad(self.y[:, i], i) 28 | 29 | def _grad(tmp): 30 | return self.grad(tmp, i) - grad_y + self.s[:, i] + self.mu * (tmp - self.y[:, i]) 31 | 32 | if self.local_optimizer == 'NAG': 33 | self.x[:, i], count = NAG(_grad, self.y[:, i].copy(), self.p.L + self.mu, self.p.sigma + self.mu, self.local_n_iters) 34 | else: 35 | if self.delta is not None: 36 | self.x[:, i], count = GD(_grad, self.y[:, i].copy(), self.delta, self.local_n_iters) 37 | else: 38 | self.x[:, i], count = GD(_grad, self.y[:, i].copy(), 2 / (self.p.L + self.mu + self.p.sigma + self.mu), self.local_n_iters) 39 | -------------------------------------------------------------------------------- /nda/optimizers/network/NetworkDANE_quadratic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as np 5 | except ImportError: 6 | import numpy as np 7 | 8 | from .network_optimizer import NetworkOptimizer 9 | 10 | 11 | class NetworkDANE_quadratic(NetworkOptimizer): 12 | '''The Network DANE algorithm for qudratic objectives.''' 13 | 14 | def __init__(self, p, mu=0.1, **kwargs): 15 | super().__init__(p, **kwargs) 16 | self.mu = mu 17 | 18 | def local_update(self): 19 | 20 | for i in range(self.p.n_agent): 21 | self.x[:, i] = self.y[:, i] - np.linalg.solve(self.hessian(self.y[:, i], i) + self.mu * np.eye(self.p.dim), self.s[:, i]) 22 | -------------------------------------------------------------------------------- /nda/optimizers/network/NetworkSARAH.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as np 5 | except ImportError: 6 | import numpy as np 7 | 8 | from .network_optimizer import NetworkOptimizer 9 | 10 | 11 | class NetworkSARAH(NetworkOptimizer): 12 | def __init__(self, p, n_inner_iters=100, eta=0.1, mu=0, opt=1, batch_size=1, **kwargs): 13 | super().__init__(p, **kwargs) 14 | self.eta = eta 15 | self.opt = opt 16 | self.mu = mu 17 | self.n_inner_iters = n_inner_iters 18 | self.batch_size = batch_size 19 | 20 | def local_update(self): 21 | u = self.y.copy() 22 | v = self.s.copy() 23 | 24 | if self.opt == 1: 25 | n_inner_iters = self.n_inner_iters 26 | else: 27 | # Choose random x^{(t)} from n_inner_iters 28 | n_inner_iters = np.random.randint(1, self.n_inner_iters + 1) 29 | if type(n_inner_iters) is np.ndarray: 30 | n_inner_iters = n_inner_iters.item() 31 | 32 | for _ in range(n_inner_iters): 33 | u_last = u.copy() 34 | u -= self.eta * v 35 | 36 | sample_list = np.random.randint(0, self.p.m, (self.p.n_agent, self.batch_size)) 37 | 38 | v += self.grad(u, j=sample_list) - self.grad(u_last, j=sample_list) \ 39 | + self.mu * (u - self.y) 40 | 41 | self.x = u 42 | -------------------------------------------------------------------------------- /nda/optimizers/network/NetworkSVRG.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | try: 4 | import cupy as np 5 | except ImportError: 6 | import numpy as np 7 | 8 | from .network_optimizer import NetworkOptimizer 9 | 10 | 11 | class NetworkSVRG(NetworkOptimizer): 12 | def __init__(self, p, n_inner_iters=100, eta=0.1, mu=0, opt=1, batch_size=1, **kwargs): 13 | super().__init__(p, **kwargs) 14 | self.eta = eta 15 | self.opt = opt 16 | self.mu = mu 17 | self.n_inner_iters = n_inner_iters 18 | self.batch_size = batch_size 
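# Descriptive sketch of the inner loop that local_update below implements; the symbols
# (k for the inner iteration, xi_k for the sampled mini-batch) are illustrative notation only:
#     u_{k+1} = u_k - eta * v_k
#     v_{k+1} = grad_i(u_{k+1}; xi_k) - grad_i(y_i; xi_k) + mu * (u_{k+1} - y_i) + s_i
# starting from u_0 = y_i and v_0 = s_i, where y_i is agent i's mixed iterate and s_i its
# gradient-tracking estimate of the global gradient.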
19 | 20 | def local_update(self): 21 | u = self.y.copy() 22 | v = self.s 23 | 24 | if self.opt == 1: 25 | n_inner_iters = self.n_inner_iters 26 | else: 27 | # Choose random x^{(t)} from n_inner_iters 28 | n_inner_iters = np.random.randint(1, self.n_inner_iters + 1) 29 | if type(n_inner_iters) is np.ndarray: 30 | n_inner_iters = n_inner_iters.item() 31 | 32 | for _ in range(n_inner_iters): 33 | u -= self.eta * v 34 | sample_list = np.random.randint(0, self.p.m, (self.p.n_agent, self.batch_size)) 35 | v = self.grad(u, j=sample_list) - self.grad(self.y, j=sample_list) \ 36 | + self.mu * (u - self.y) + self.s 37 | 38 | self.x = u 39 | -------------------------------------------------------------------------------- /nda/optimizers/network/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.optimizers.network.network_optimizer import NetworkOptimizer 5 | 6 | from nda.optimizers.network.NetworkDANE import NetworkDANE 7 | from nda.optimizers.network.NetworkDANE_quadratic import NetworkDANE_quadratic 8 | 9 | from nda.optimizers.network.NetworkSARAH import NetworkSARAH 10 | from nda.optimizers.network.NetworkSVRG import NetworkSVRG 11 | from nda.optimizers.network.Destress import Destress 12 | -------------------------------------------------------------------------------- /nda/optimizers/network/network_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | import numpy as xp 9 | 10 | import matplotlib.pyplot as plt 11 | from nda.optimizers import Optimizer 12 | 13 | norm = xp.linalg.norm 14 | 15 | 16 | class NetworkOptimizer(Optimizer): 17 | '''The base network optimizer class, which handles saving/plotting history.''' 18 | 19 | def __init__(self, p, n_mix=1, grad_tracking_batch_size=None, **kwargs): 20 | super().__init__(p, **kwargs) 21 | self.n_mix = n_mix 22 | self.grad_tracking_batch_size = grad_tracking_batch_size 23 | 24 | W_min_diag = min(np.diag(self.W)) 25 | tmp = (1 - 1e-1) / (1 - W_min_diag) 26 | self.W_s = self.W * tmp + np.eye(self.p.n_agent) * (1 - tmp) 27 | 28 | # Equivalent mixing matrices after n_mix rounds of mixng 29 | self.W = np.linalg.matrix_power(self.W, self.n_mix) 30 | self.W_s = np.linalg.matrix_power(self.W_s, self.n_mix) 31 | 32 | def init(self): 33 | 34 | super().init() 35 | 36 | self.y = self.x_0.copy() 37 | self.s = xp.zeros((self.p.dim, self.p.n_agent)) 38 | for i in range(self.p.n_agent): 39 | self.s[:, i] = self.grad_h(self.y[:, i], i) 40 | 41 | self.grad_last = self.s.copy() 42 | 43 | def update(self): 44 | self.comm_rounds += self.n_mix 45 | 46 | y_last = self.y.copy() 47 | self.y = self.x.dot(self.W) 48 | self.s = self.s.dot(self.W_s) 49 | if self.grad_tracking_batch_size is None: 50 | # We can store the last gradient, so don't need to compute again 51 | self.s -= self.grad_last 52 | if self.p.is_smooth is True: 53 | self.grad_last = self.grad(self.y) 54 | else: 55 | self.grad_last = self.grad_h(self.y) 56 | self.s += self.grad_last 57 | 58 | else: 59 | if self.p.is_smooth is True: 60 | for i in range(self.p.n_agent): 61 | # We need to compute the stochastic gradient everytime 62 | j_list = xp.random.randint(0, self.p.m, self.grad_tracking_batch_size) 63 | self.s[:, i] += self.grad_h(self.y[:, i], i, j_list) - self.grad(y_last[:, i], i, j_list) 64 | else: 65 | for i in range(self.p.n_agent): 
66 | # We need to compute the stochastic gradient everytime 67 | j_list = xp.random.randint(0, self.p.m, self.grad_tracking_batch_size) 68 | self.s[:, i] += self.grad_h(self.y[:, i], i, j_list) - self.grad_h(y_last[:, i], i, j_list) 69 | 70 | self.local_update() 71 | 72 | def plot_history(self): 73 | 74 | if ~hasattr(self, 'x_min'): 75 | return 76 | 77 | p = self.p 78 | hist = self.history 79 | 80 | x_min = p.x_min 81 | f_min = p.f_min 82 | 83 | plt.figure() 84 | legends = [] 85 | 86 | # | f_0(x_0^{(t)}) - f(x^\star) / f(x^\star) 87 | tmp = [(p.f(h['x'][:, 0], 0) - f_min) / f_min for h in hist] 88 | plt.semilogy(tmp) 89 | legends.append(r"$\frac{ f_0(\mathbf{x}_0^{(t)}) - f(\mathbf{x}^\star) } {f(\mathbf{x}^\star) }$") 90 | 91 | # | f_0(x_0^{(t)}) - f_0(x^\star) / f_0(x^\star) 92 | tmp = [(p.f(h['x'][:, 0], 0) - p.f(x_min, 0)) / p.f(x_min, 0) for h in hist] 93 | plt.semilogy(tmp) 94 | legends.append(r"$\frac{ f_0(\mathbf{x}_0^{(t)}) - f_0(\mathbf{x}^\star) } {f_0(\mathbf{x}^\star) }$") 95 | 96 | # | f_0(\bar x^{(t)}) - f(x^\star) / f(x^\star) 97 | tmp = [(p.f(h['x'].mean(axis=1), 0) - f_min) / f_min for h in hist] 98 | plt.semilogy(tmp) 99 | legends.append(r"$\frac{ f_0(\bar{\mathbf{x}}^{(t)}) - f(\mathbf{x}^\star) } {f(\mathbf{x}^\star) }$") 100 | 101 | # | f_0(\bar x^{(t)}) - f_0(x^\star) / f_0(x^\star) 102 | tmp = [(p.f(h['x'].mean(axis=1), 0) - p.f(x_min, 0)) / p.f(x_min, 0) for h in hist] 103 | plt.semilogy(tmp) 104 | legends.append(r"$\frac{f_0(\bar{\mathbf{x}}^{(t)}) - f_0(\mathbf{x}^\star) } {f_0(\mathbf{x}^\star) }$") 105 | 106 | # | f(x_0^{(t)}) - f(x^\star) / f(x^\star) 107 | tmp = [(p.f(h['x'][:, 0]) - f_min) / f_min for h in hist] 108 | plt.semilogy(tmp) 109 | legends.append(r"$\frac{ f(\mathbf{x}_0^{(t)}) - f(\mathbf{x}^\star) } {f(\mathbf{x}^\star) }$") 110 | 111 | # | f(\bar x^{(t)}) - f(x^\star) / f(x^\star) 112 | tmp = [(p.f(h['x'].mean(axis=1)) - f_min) / f_min for h in hist] 113 | plt.semilogy(tmp) 114 | legends.append(r"$\frac{ f(\bar{\mathbf{x}}^{(t)}) - f(\mathbf{x}^\star) } {f(\mathbf{x}^\star) }$") 115 | 116 | # | \frac 1n \sum f_i(x_i^{(t)}) - f(x^\star) / f(x^\star) 117 | tmp = [(np.mean([p.f(h['x'][:, i], i) for i in range(p.n_agent)]) - f_min) / f_min for h in hist] 118 | plt.semilogy(tmp) 119 | legends.append(r"$\frac{ \frac{1}{n} \sum f_i (\mathbf{x}_i^{(t)}) - f(\mathbf{x}^\star) } {f(\mathbf{x}^\star) }$") 120 | 121 | # | \frac 1n \sum f(x_i^{(t)}) - f(x^\star) / f(x^\star) 122 | tmp = [(np.mean([p.f(h['x'][:, i]) for i in range(p.n_agent)]) - f_min) / f_min for h in hist] 123 | plt.semilogy(tmp) 124 | legends.append(r"$\frac{\frac{1}{n} \sum f(\mathbf{x}_i^{(t)}) - f(\mathbf{x}^\star) } {f(\mathbf{x}^\star) }$") 125 | 126 | plt.ylabel('Distance') 127 | plt.xlabel('#iters') 128 | plt.legend(legends) 129 | 130 | plt.figure() 131 | legends = [] 132 | 133 | # \Vert \nabla f(\bar x^{(t)}) \Vert 134 | tmp = [norm(p.grad_f(h['x'].mean(axis=1))) for h in hist] 135 | plt.semilogy(tmp) 136 | legends.append(r"$\Vert \nabla f(\bar{\mathbf{x}}^{(t)}) \Vert$") 137 | 138 | # \Vert \nabla f_0(x_0^{(t)}) \Vert 139 | tmp = [norm(p.grad_f(h['x'][:, 0]), 0) for h in hist] 140 | plt.semilogy(tmp) 141 | legends.append(r"$\Vert \nabla f_0({\mathbf{x}_0}^{(t)}) \Vert$") 142 | 143 | # \Vert \frac 1n \sum \nabla f_i(x_i({(t)}) \Vert 144 | tmp = [norm(np.mean([p.grad_f(h['x'][:, i], i) for i in range(p.n_agent)])) for h in hist] 145 | plt.semilogy(tmp) 146 | legends.append(r"$\Vert \frac{1}{n} \sum_i \nabla f_i({\mathbf{x}_i}^{(t)}) \Vert$") 147 | 148 | # \frac 1n \sum \Vert \nabla 
f_i(x_i({(t)}) \Vert 149 | tmp = [np.mean([norm(p.grad_f(h['x'][:, i], i)) for i in range(p.n_agent)]) for h in hist] 150 | plt.semilogy(tmp) 151 | legends.append(r"$\frac{1}{n} \sum_i \Vert \nabla f_i({\mathbf{x}_i}^{(t)}) \Vert$") 152 | 153 | plt.ylabel('Distance') 154 | plt.xlabel('#iters') 155 | plt.legend(legends) 156 | 157 | plt.figure() 158 | legends = [] 159 | 160 | # \Vert \bar x^{(t)} - x^\star \Vert 161 | tmp = [norm(h['x'].mean(axis=1) - x_min) for h in hist] 162 | k = np.exp(np.log(tmp[-1] / tmp[0]) / len(hist)) 163 | print("Actual convergence rate of " + self.name + " is k = " + str(k)) 164 | plt.semilogy(tmp) 165 | legends.append(r"$\Vert \bar{\mathbf{x}}^{(t)} - \mathbf{x}^\star \Vert$") 166 | 167 | # \Vert x^{(t)} - \mathbf 1 \otimes x^\star \Vert 168 | plt.semilogy([norm(h['x'].T - x_min, 'fro') for h in hist]) 169 | legends.append(r"$\Vert \mathbf{x}^{(t)} - \mathbf{1} \otimes \mathbf{x}^\star \Vert$") 170 | 171 | # \Vert x^{(t)} - \mathbf 1 \otimes \bar x^{(t)} \Vert 172 | plt.semilogy([norm(h['x'].T - h['x'].mean(axis=1), 'fro') for h in hist]) 173 | legends.append(r"$\Vert \mathbf{x}^{(t)} - \mathbf{1} \otimes \bar{\mathbf{x}}^{(t)} \Vert$") 174 | 175 | # \Vert s^{(t)} \Vert 176 | plt.semilogy([norm(h['s'], 'fro') for h in hist]) 177 | legends.append(r"$\Vert \mathbf{s}^{(t)} \Vert$") 178 | 179 | # \Vert s^{(t)} - \mathbf 1 \otimes \bar g^{(t)} \Vert 180 | tmp = [] 181 | for h in hist: 182 | g = np.array([p.grad_f(h['y'][:, i], i) for i in range(p.n_agent)]) 183 | g = g.T.mean(axis=1) 184 | tmp.append(norm(h['s'].T - g, 'fro')) 185 | 186 | plt.semilogy(tmp) 187 | legends.append(r"$\Vert \mathbf{s}^{(t)} - \mathbf{1} \otimes \bar{\mathbf{g}}^{(t)} \Vert$") 188 | 189 | # \Vert \bar g^{(t)} - \nabla f(\bar y^{(t)}) \Vert 190 | tmp = [] 191 | for h in hist: 192 | g = np.array([p.grad_f(h['y'][:, i], i) for i in range(p.n_agent)]) 193 | g = g.T.mean(axis=1) 194 | tmp.append(norm(p.grad_f(h['y'].mean(axis=1)) - g, 2)) 195 | 196 | plt.semilogy(tmp) 197 | legends.append(r"$\Vert \bar{\mathbf{g}}^{(t)} - \nabla f(\bar{\mathbf{y}}^{(t)}) \Vert$") 198 | 199 | # \Vert s^{(t)} - \mathbf 1 \otimes \nabla f(\bar y^{(t)}) \Vert 200 | tmp = [] 201 | for h in hist: 202 | g = p.grad_f(h['y'].mean(axis=1)) 203 | tmp.append(norm(h['s'].T - g, 'fro')) 204 | 205 | plt.semilogy(tmp) 206 | legends.append(r"$\Vert \mathbf{s}^{(t)} - \mathbf{1} \otimes \nabla f(\bar{\mathbf{y}}^{(t)}) \Vert$") 207 | 208 | # \Vert \nabla f(\bar y^{(t)}) \Vert 209 | tmp = [] 210 | for h in hist: 211 | g = p.grad_f(h['y'].mean(axis=1)) 212 | tmp.append(norm(g)) 213 | 214 | plt.semilogy(tmp) 215 | legends.append(r"$\Vert \nabla f(\bar{\mathbf{y}}^{(t)}) \Vert$") 216 | 217 | # \Vert s^{(t)} - \nabla(y^{(t)}) \Vert 218 | tmp = [] 219 | for h in hist: 220 | # print(h['y']) 221 | # print(p.grad_f(h['y'][:, 0], 0)) 222 | g = np.array([p.grad_f(h['y'][:, i], i) for i in range(p.n_agent)]) 223 | tmp.append(norm(h['s'] - g.T, 'fro')) 224 | 225 | plt.semilogy(tmp) 226 | legends.append(r"$\Vert \mathbf{s}^{(t)} - \nabla(\mathbf{y}^{(t)}) \Vert$") 227 | 228 | # \Vert s^{(t)} - (\nabla(y^{(t)} - \nabla f_i(x^\star)) \Vert 229 | tmp = [] 230 | for h in hist: 231 | g = np.array([p.grad_f(h['y'][:, i], i) - p.grad_f(p.x_min, i) for i in range(p.n_agent)]) 232 | tmp.append(norm(h['s'] - g.T, 'fro')) 233 | 234 | plt.semilogy(tmp) 235 | legends.append(r"$\Vert \mathbf{s}^{(t)} - (\nabla(\mathbf{y}^{(t)}) - \nabla f_i(\mathbf{x}^\star)) \Vert$") 236 | 237 | # Optimal first order method bound starting from x_0 = 0 238 | # kappa = L / 
sigma 239 | # r = (kappa - 1) / (kappa + 1) 240 | # tmp = r ** np.arange(len(hist)) * norm(x_min) 241 | # plt.semilogy(tmp) 242 | # legends.append("Optimal 1st order bound") 243 | 244 | plt.xlabel('#iters') 245 | plt.ylabel('Distance') 246 | plt.title('Details of ' + self.name + ', L=' + str(self.p.L) + r', $\sigma$=' + str(self.p.sigma)) 247 | plt.legend(legends) 248 | -------------------------------------------------------------------------------- /nda/optimizers/optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | import numpy as xp 9 | 10 | from nda.optimizers.utils import eps 11 | from nda import log 12 | 13 | norm = xp.linalg.norm 14 | 15 | 16 | def relative_error(w, w_0): 17 | return norm(w - w_0) / norm(w_0) 18 | 19 | 20 | class Optimizer(object): 21 | '''The base optimizer class, which handles logging, convergence/divergence checking.''' 22 | 23 | def __init__(self, p, n_iters=100, x_0=None, W=None, save_metric_frequency=1, is_distributed=True, extra_metrics=[], early_stopping=True, grad_eps=eps, var_eps=eps, f_eps=eps*eps): 24 | 25 | self.name = self.__class__.__name__ 26 | self.p = p 27 | self.n_iters = n_iters 28 | self.save_metric_frequency = save_metric_frequency 29 | self.save_metric_counter = 0 30 | self.is_distributed = is_distributed 31 | self.early_stopping = early_stopping 32 | self.grad_eps = grad_eps 33 | self.var_eps = var_eps 34 | self.f_eps = f_eps 35 | self.is_initialized = False 36 | 37 | if W is not None: 38 | self.W = np.array(W) 39 | 40 | if x_0 is not None: 41 | self.x_0 = np.array(x_0) 42 | else: 43 | if self.is_distributed: 44 | self.x_0 = np.random.rand(p.dim, p.n_agent) 45 | else: 46 | self.x_0 = np.random.rand(self.p.dim) 47 | 48 | self.x = self.x_0.copy() 49 | 50 | self.t = 0 51 | self.comm_rounds = 0 52 | self.n_grads = 0 53 | self.metrics = [] 54 | self.history = [] 55 | self.metric_names = ['t', 'n_grads', 'f'] 56 | if self.is_distributed: 57 | self.metric_names += ['comm_rounds'] 58 | self.metric_names += extra_metrics 59 | 60 | if self.p.x_min is not None: 61 | self.metric_names += ['var_error'] 62 | 63 | def cuda(self): 64 | 65 | log.debug("Copying data to GPU") 66 | 67 | self.p.cuda() 68 | 69 | for k in self.__dict__: 70 | if type(self.__dict__[k]) == np.ndarray: 71 | self.__dict__[k] = xp.array(self.__dict__[k]) 72 | 73 | def f(self, *args, **kwargs): 74 | return self.p.f(*args, **kwargs) 75 | 76 | def grad(self, w, i=None, j=None): 77 | '''Gradient wrapper. Provide logging function.''' 78 | 79 | return self.grad_h(w, i=i, j=j) + self.grad_g(w) 80 | 81 | def hessian(self, *args, **kwargs): 82 | return self.p.hessian(*args, **kwargs) 83 | 84 | def grad_h(self, w, i=None, j=None): 85 | '''Gradient wrapper. 
Provide logging function.''' 86 | 87 | if w.ndim == 1: 88 | if i is None and j is None: 89 | self.n_grads += self.p.m_total # Works for agents is list or integer 90 | elif i is not None and j is None: 91 | self.n_grads += self.p.m 92 | elif j is not None: 93 | if type(j) is int: 94 | j = [j] 95 | self.n_grads += len(j) 96 | elif w.ndim == 2: 97 | if j is None: 98 | self.n_grads += self.p.m_total # Works for agents is list or integer 99 | elif j is not None: 100 | if type(j) is xp.ndarray: 101 | self.n_grads += j.size 102 | elif type(j) is list: 103 | self.n_grads += sum([1 if type(j[i]) is int else len(j[i]) for i in range(self.p.n_agent)]) 104 | else: 105 | raise NotImplementedError 106 | else: 107 | raise NotImplementedError 108 | 109 | return self.p.grad_h(w, i=i, j=j) 110 | 111 | def grad_g(self, w): 112 | '''Gradient wrapper. Provide logging function.''' 113 | 114 | return self.p.grad_g(w) 115 | 116 | def compute_metric(self, metric_name, x): 117 | 118 | if metric_name == 't': 119 | res = self.t 120 | elif metric_name == 'comm_rounds': 121 | res = self.comm_rounds 122 | elif metric_name == 'n_grads': 123 | res = self.n_grads 124 | elif metric_name == 'f': 125 | res = self.f(x) 126 | elif metric_name == 'f_test': 127 | res = self.f(x, split='test') 128 | elif metric_name == 'var_error': 129 | res = relative_error(x, self.p.x_min) 130 | elif metric_name == 'train_accuracy': 131 | acc = self.p.accuracy(x, split='train') 132 | if type(acc) is tuple: 133 | acc = acc[0] 134 | res = acc 135 | elif metric_name == 'test_accuracy': 136 | acc = self.p.accuracy(x, split='test') 137 | if type(acc) is tuple: 138 | acc = acc[0] 139 | res = acc 140 | elif metric_name == 'grad_norm': 141 | res = norm(self.p.grad(x)) 142 | else: 143 | raise NotImplementedError(f'Metric {metric_name} is not implemented') 144 | 145 | return res 146 | 147 | def save_metrics(self, x=None): 148 | 149 | self.save_metric_counter %= self.save_metric_frequency 150 | self.save_metric_counter += 1 151 | 152 | if x is None: 153 | x = self.x 154 | 155 | if x.ndim > 1: 156 | x = x.mean(axis=1) 157 | 158 | self.metrics.append( 159 | [self.compute_metric(name, x) for name in self.metric_names] 160 | ) 161 | 162 | def get_metrics(self): 163 | self.metrics = [[_metric.item() if type(_metric) is xp.ndarray else _metric for _metric in _metrics] for _metrics in self.metrics] 164 | return self.metric_names, np.array(self.metrics) 165 | 166 | def get_name(self): 167 | return self.name 168 | 169 | def optimize(self): 170 | 171 | self.init() 172 | 173 | # Initial value 174 | self.save_metrics() 175 | 176 | for self.t in range(1, self.n_iters + 1): 177 | 178 | # The actual update step for optimization variable 179 | self.update() 180 | 181 | self.save_metrics() 182 | 183 | if self.early_stopping is True and self.check_stopping_conditions() is True: 184 | break 185 | 186 | # endfor 187 | 188 | return self.get_metrics() 189 | 190 | def check_stopping_conditions(self): 191 | '''Check stopping conditions''' 192 | 193 | if self.x.ndim > 1: 194 | x = self.x.mean(axis=1) 195 | else: 196 | x = self.x 197 | 198 | if self.grad_eps is not None: 199 | grad_norm = norm(self.p.grad(x)) 200 | if grad_norm < self.grad_eps: 201 | log.info('Gradient norm converged') 202 | return True 203 | elif grad_norm > 100 * self.p.dim: 204 | log.info('Gradient norm diverged') 205 | return True 206 | 207 | if self.p.x_min is not None and self.var_eps is not None: 208 | distance = norm(x - self.p.x_min) / norm(self.p.x_min) 209 | if distance < self.var_eps: 210 | 
log.info('Variable converged') 211 | return True 212 | 213 | if distance > 100: 214 | log.info('Variable diverged') 215 | return True 216 | 217 | if self.p.f_min is not None and self.f_eps is not None: 218 | distance = np.abs(self.p.f(x) / self.p.f_min - 1) 219 | if distance < self.f_eps: 220 | log.info('Function value converged') 221 | return True 222 | 223 | if distance > 100: 224 | log.info('Function value diverged') 225 | return True 226 | 227 | return False 228 | 229 | def init(self): 230 | pass 231 | 232 | def update(self): 233 | pass 234 | -------------------------------------------------------------------------------- /nda/optimizers/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | import numpy as xp 9 | 10 | import networkx as nx 11 | import cvxpy as cvx 12 | import os 13 | 14 | eps = 1e-6 15 | 16 | def NAG(grad, x_0, L, sigma, n_iters=100, eps=eps): 17 | '''Nesterov's Accelerated Gradient Descent for strongly convex functions''' 18 | 19 | x = y = x_0 20 | root_kappa = xp.sqrt(L / sigma) 21 | r = (root_kappa - 1) / (root_kappa + 1) 22 | r_1 = 1 + r 23 | r_2 = r 24 | 25 | for t in range(n_iters): 26 | y_last = y 27 | 28 | _grad = grad(y) 29 | if xp.linalg.norm(_grad) < eps: 30 | break 31 | 32 | y = x - _grad / L 33 | x = r_1*y - r_2*y_last 34 | 35 | return y, t 36 | 37 | 38 | def GD(grad, x_0, eta, n_iters=100, eps=eps): 39 | '''Gradient Descent''' 40 | 41 | x = x_0 42 | for t in range(n_iters): 43 | 44 | _grad = grad(x) 45 | if xp.linalg.norm(_grad) < eps: 46 | break 47 | 48 | x -= eta * _grad 49 | 50 | return x, t + 1 51 | 52 | 53 | def Sub_GD(grad, x_0, n_iters=100, eps=eps): 54 | '''Sub-gradient Descent''' 55 | 56 | R = xp.linalg.norm(x_0) 57 | x = x_0 58 | for t in range(n_iters): 59 | eta_t = R / xp.sqrt(t) 60 | 61 | g_t = grad(x) 62 | if xp.linalg.norm(g_t) < eps: 63 | break 64 | 65 | g_t /= xp.linalg.norm(g_t) 66 | x -= eta_t * g_t 67 | 68 | return x, t + 1 69 | 70 | 71 | def FISTA(grad_f, x_0, L, LAMBDA, n_iters=100, eps=1e-10): 72 | '''FISTA''' 73 | r = xp.zeros(n_iters+1) 74 | 75 | for t in range(1, n_iters + 1): 76 | r[t] = 0.5 + xp.sqrt(1 + 4 * r[t - 1] ** 2) / 2 77 | 78 | gamma = (1 - r[:n_iters]) / r[1:] 79 | 80 | x = x_0.copy() 81 | y = x_0.copy() 82 | 83 | for t in range(1, n_iters): 84 | 85 | _grad = grad_f(x) 86 | if xp.linalg.norm(_grad) < eps: 87 | break 88 | 89 | x -= _grad / L 90 | y_new = xp.sign(x) * xp.maximum(xp.abs(x) - LAMBDA/L, 0) 91 | x = (1 - gamma[t]) * y_new + gamma[t] * y 92 | y = y_new 93 | 94 | return y, t + 1 95 | 96 | def generate_mixing_matrix(p): 97 | return symmetric_fdla_matrix(p.G) 98 | 99 | def asymmetric_fdla_matrix(G, m): 100 | n = G.number_of_nodes() 101 | 102 | ind = nx.adjacency_matrix(G).toarray() + np.eye(n) 103 | ind = ~ind.astype(bool) 104 | 105 | average_vec = m / m.sum() 106 | average_matrix = np.ones((n, 1)).dot(average_vec[np.newaxis, :]).T 107 | one_vec = np.ones(n) 108 | 109 | W = cvx.Variable((n, n)) 110 | 111 | if ind.sum() == 0: 112 | prob = cvx.Problem(cvx.Minimize(cvx.norm(W - average_matrix)), 113 | [ 114 | cvx.sum(W, axis=1) == one_vec, 115 | cvx.sum(W, axis=0) == one_vec 116 | ]) 117 | else: 118 | prob = cvx.Problem(cvx.Minimize(cvx.norm(W - average_matrix)), 119 | [ 120 | W[ind] == 0, 121 | cvx.sum(W, axis=1) == one_vec, 122 | cvx.sum(W, axis=0) == one_vec 123 | ]) 124 | prob.solve() 125 | 126 | W = W.value 127 | # W = (W + W.T) / 2 128 | 
W[ind] = 0 129 | W -= np.diag(W.sum(axis=1) - 1) 130 | alpha = np.linalg.norm(W - average_matrix, 2) 131 | 132 | return W, alpha 133 | 134 | 135 | 136 | def symmetric_fdla_matrix(G): 137 | 138 | n = G.number_of_nodes() 139 | 140 | ind = nx.adjacency_matrix(G).toarray() + np.eye(n) 141 | ind = ~ind.astype(bool) 142 | 143 | average_matrix = np.ones((n, n)) / n 144 | one_vec = np.ones(n) 145 | 146 | W = cvx.Variable((n, n)) 147 | 148 | if ind.sum() == 0: 149 | prob = cvx.Problem(cvx.Minimize(cvx.norm(W - average_matrix)), 150 | [ 151 | W == W.T, 152 | cvx.sum(W, axis=1) == one_vec 153 | ]) 154 | else: 155 | prob = cvx.Problem(cvx.Minimize(cvx.norm(W - average_matrix)), 156 | [ 157 | W[ind] == 0, 158 | W == W.T, 159 | cvx.sum(W, axis=1) == one_vec 160 | ]) 161 | prob.solve() 162 | 163 | W = W.value 164 | W = (W + W.T) / 2 165 | W[ind] = 0 166 | W -= np.diag(W.sum(axis=1) - 1) 167 | alpha = np.linalg.norm(W - average_matrix, 2) 168 | 169 | return np.array(W), alpha 170 | 171 | def relative_error(x, y): 172 | return xp.linalg.norm(x - y) / xp.linalg.norm(y) 173 | -------------------------------------------------------------------------------- /nda/problems/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from nda.problems.problem import Problem 5 | from nda.problems.linear_regression import LinearRegression 6 | from nda.problems.logistic_regression import LogisticRegression 7 | from nda.problems.neural_network import NN 8 | -------------------------------------------------------------------------------- /nda/problems/linear_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | import numpy as xp 9 | 10 | import multiprocessing as mp 11 | 12 | from nda import log 13 | from nda.problems import Problem 14 | 15 | 16 | class LinearRegression(Problem): 17 | '''f(w) = 1/n \sum f_i(w) + r * g(w) = 1/n \sum 1/2m || Y_i - X_i w ||^2 + r * g(w)''' 18 | 19 | def __init__(self, noise_variance=0.1, kappa=10, **kwargs): 20 | 21 | self.noise_variance = noise_variance 22 | self.kappa = kappa 23 | 24 | super().__init__(**kwargs) 25 | 26 | # Pre-calculate matrix products to accelerate gradient and function value evaluations 27 | self.H = self.X_train.T.dot(self.X_train) / self.m_total 28 | self.X_T_Y = self.X_train.T.dot(self.Y_train) / self.m_total 29 | 30 | if xp.__name__ == 'cupy': 31 | log.info('Initializing using GPU') 32 | q = mp.Queue(2) 33 | pp = mp.Process(target=self._init, args=(q,)) 34 | pp.start() 35 | pp.join() 36 | self.x_min = self.w_min = q.get() 37 | self.f_min = q.get() 38 | else: 39 | log.info('Initializing using CPU') 40 | self.x_min, self.f_min = self._init() 41 | 42 | # Pre-calculate matrix products to accelerate gradient and function value evaluations 43 | # After computing minimum to reduce memory copy 44 | self.H_list = np.einsum('ikj,ikl->ijl', self.X, self.X) / self.m 45 | self.X_T_Y_list = np.einsum('ikj,ik->ij', self.X, self.Y) / self.m 46 | log.info('beta = %.4f', np.linalg.norm(self.H_list - self.H, ord=2, axis=(1, 2)).max()) 47 | log.info('Initialization done') 48 | 49 | def _init(self, result_queue=None): 50 | 51 | if xp.__name__ == 'cupy': 52 | self.cuda() 53 | 54 | if self.is_smooth is True: 55 | x_min = xp.linalg.solve(self.X_train.T.dot(self.X_train) + 2 * self.m_total * self.r * xp.eye(self.dim), 
self.X_train.T.dot(self.Y_train)) 56 | 57 | else: 58 | from nda.optimizers.utils import FISTA 59 | x_min, _ = FISTA(self.grad_h, xp.random.randn(self.dim), self.L, self.r, n_iters=100000) 60 | 61 | f_min = self.f(x_min) 62 | log.info(f'f_min = {f_min}') 63 | 64 | if xp.__name__ == 'cupy': 65 | f_min = f_min.item() 66 | x_min = x_min.get() 67 | 68 | if result_queue is not None: 69 | result_queue.put(x_min) 70 | result_queue.put(f_min) 71 | 72 | return x_min, f_min 73 | 74 | def generate_data(self): 75 | 76 | def _generate_x(n_samples, dim, kappa): 77 | '''Helper function to generate data''' 78 | 79 | powers = - np.log(kappa) / np.log(dim) / 2 80 | 81 | S = np.power(np.arange(dim) + 1, powers) 82 | X = np.random.randn(n_samples, dim) # Random standard Gaussian data 83 | X *= S # Conditioning 84 | X_list = self.split_data(X) 85 | 86 | max_norm = max([np.linalg.norm(X_list[i].T.dot(X_list[i]), 2) / X_list[i].shape[0] for i in range(self.n_agent)]) 87 | X /= max_norm 88 | 89 | return X, 1, 1 / kappa, np.diag(S) 90 | 91 | # Generate X 92 | self.X_train, self.L, self.sigma, self.S = _generate_x(self.m_total, self.dim, self.kappa) 93 | 94 | # Generate Y and the optimal solution 95 | self.x_0 = self.w_0 = np.random.rand(self.dim) 96 | self.Y_0_train = self.X_train.dot(self.w_0) 97 | self.Y_train = self.Y_0_train + np.sqrt(self.noise_variance) * np.random.randn(self.m_total) 98 | 99 | def grad_h(self, w, i=None, j=None, split='train'): 100 | '''Gradient of h(x) at w. Depending on the shape of w and parameters i and j, this function behaves differently: 101 | 1. If w is a vector of shape (dim,) 102 | 1.1 If i is None and j is None 103 | returns the full gradient. 104 | 1.2 If i is not None and j is None 105 | returns the gradient at the i-th agent. 106 | 1.3 If i is None and j is not None 107 | returns the i-th gradient of all training data. 108 | 1.4 If i is not None and j is not None 109 | returns the gradient of the j-th data sample at the i-th agent. 110 | Note i, j can be integers, lists or vectors. 111 | 2. If w is a matrix of shape (dim, n_agent) 112 | 2.1 if j is None 113 | returns the gradient of each parameter at the corresponding agent 114 | 2.2 if j is not None 115 | returns the gradient of each parameter of the j-th sample at the corresponding agent. 116 | Note j can be lists of lists or vectors. 
117 | ''' 118 | 119 | if w.ndim == 1: 120 | if type(j) is int: 121 | j = [j] 122 | 123 | if i is None and j is None: # Return the full gradient 124 | return self.H.dot(w) - self.X_T_Y 125 | elif i is not None and j is None: # Return the local gradient 126 | return self.H_list[i].dot(w) - self.X_T_Y_list[i] 127 | elif i is None and j is not None: # Return the stochastic gradient 128 | return (self.X_train[j].dot(w) - self.Y_train[j]).dot(self.X_train[j]) / len(j) 129 | else: # Return the stochastic gradient 130 | return (self.X[i][j].dot(w) - self.Y[i][j]).dot(self.X[i][j]) / len(j) 131 | 132 | elif w.ndim == 2: 133 | if i is None and j is None: # Return the distributed gradient 134 | return xp.einsum('ijk,ki->ji', self.H_list, w) - self.X_T_Y_list.T 135 | elif i is None and j is not None: # Return the stochastic gradient 136 | res = [] 137 | for i in range(self.n_agent): 138 | if type(j[i]) is int: 139 | samples = [j[i]] 140 | else: 141 | samples = j[i] 142 | res.append((self.X[i][samples].dot(w[:, i]) - self.Y[i][samples]).dot(self.X[i][samples]) / len(samples)) 143 | return xp.array(res).T 144 | else: 145 | log.fatal('For distributed gradients j must be None') 146 | else: 147 | log.fatal('Parameter dimension should only be 1 or 2') 148 | 149 | def h(self, w, i=None, j=None, split='train'): 150 | '''Function value of h(x) at w. If i is None, returns h(x); if i is not None but j is, returns the function value at the i-th machine; otherwise,return the function value of j-th sample at the i-th machine.''' 151 | 152 | if i is None and j is None: # Return the function value 153 | Z = xp.sqrt(2 * self.m_total) 154 | return xp.sum((self.Y_train / Z - (self.X_train / Z).dot(w)) ** 2) 155 | elif i is not None and j is None: # Return the function value at machine i 156 | return xp.sum((self.Y[i] - self.X[i].dot(w)) ** 2) / 2 / self.m 157 | elif i is not None and j is not None: # Return the function value of sample j at machine i 158 | return xp.sum((self.Y[i][j] - self.X[i][j].dot(w)) ** 2) / 2 159 | else: 160 | log.fatal('When i is None, j mush be None') 161 | 162 | def hessian(self, w=None, i=None, j=None): 163 | '''Hessian matrix at w. 
If i is None, returns the full Hessian matrix; if i is not None but j is, returns the hessian matrix at the i-th machine; otherwise,return the hessian matrix of j-th sample at the i-th machine.''' 164 | 165 | if i is None: # Return the full hessian matrix 166 | return self.H 167 | elif j is None: # Return the hessian matrix at machine i 168 | return self.H_list[i] 169 | else: # Return the hessian matrix of sample j at machine i 170 | return self.X[i][xp.newaxis, j, :].T.dot(self.X[i][xp.newaxis, j, :]) 171 | -------------------------------------------------------------------------------- /nda/problems/logistic_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | import numpy as xp 9 | 10 | import multiprocessing as mp 11 | 12 | from nda import log 13 | from nda.problems import Problem 14 | from nda.optimizers.utils import NAG 15 | 16 | 17 | def logit_1d(X, w): 18 | return 1 / (1 + xp.exp(-X.dot(w))) 19 | 20 | 21 | def logit_1d_np(X, w): 22 | return 1 / (1 + np.exp(-X.dot(w))) 23 | 24 | 25 | def logit_2d(X, w): 26 | tmp = xp.einsum('ijk,ki->ij', X, w) 27 | return 1 / (1 + xp.exp(-tmp)) 28 | 29 | 30 | class LogisticRegression(Problem): 31 | '''f(w) = - 1 / N * (\sum y_i log(1/(1 + exp(w^T x_i))) + (1 - y_i) log (1 - 1/(1 + exp(w^T x_i)))) + \frac{\lambda}{2} \| w \|^2 + \alpha \sum w_i^2 / (1 + w_i^2)''' 32 | def grad_g(self, w): 33 | if self.alpha == 0: 34 | return 0 35 | return 2 * self.alpha * w / ((1 + w**2)**2) 36 | 37 | def g(self, w): 38 | if self.alpha == 0: 39 | return 0 40 | return (1 - 1 / (1 + w ** 2)).sum() * self.alpha 41 | 42 | def __init__(self, kappa=None, noise_ratio=None, LAMBDA=0, alpha=0, **kwargs): 43 | 44 | self.noise_ratio = noise_ratio 45 | self.kappa = kappa 46 | self.alpha = alpha 47 | self.LAMBDA = LAMBDA 48 | 49 | super().__init__(**kwargs) 50 | 51 | if alpha == 0: 52 | if kappa == 1: 53 | self.LAMBDA = 100 54 | elif kappa is not None: 55 | self.LAMBDA = 1 / (self.kappa - 1) 56 | self.L = 1 + self.LAMBDA 57 | self.sigma = self.LAMBDA if self.LAMBDA != 0 else None 58 | else: 59 | self.L = 1 + self.LAMBDA + 6 * self.alpha 60 | self.sigma = self.LAMBDA + 2 * self.alpha 61 | 62 | if xp.__name__ == 'cupy': 63 | log.info('Initializing using GPU') 64 | q = mp.Queue(3) 65 | pp = mp.Process(target=self._init, args=(q,)) 66 | pp.start() 67 | pp.join() 68 | norm = q.get() 69 | if self.kappa is not None: 70 | self.x_min = self.w_min = q.get() 71 | self.f_min = q.get() 72 | else: 73 | log.info('Initializing using CPU') 74 | norm, self.x_min, self.f_min = self._init() 75 | 76 | self.X_train /= norm 77 | self.X_test /= norm 78 | 79 | log.info('Initialization done') 80 | 81 | 82 | def _init(self, result_queue=None): 83 | 84 | if xp.__name__ == 'cupy': 85 | self.cuda() 86 | 87 | log.info('Computing norm') 88 | norm = xp.linalg.norm(self.X_train, 2) / (2 * xp.sqrt(self.m_total)) # Upper bound of the hessian 89 | self.X_train /= norm 90 | self.X /= norm 91 | 92 | if self.kappa is not None: 93 | log.info('Computing min') 94 | x_min, count = NAG(self.grad, xp.random.randn(self.dim), self.L, self.sigma, n_iters=5000, eps=1e-10) 95 | log.info(f'NAG ran for {count} iterations') 96 | f_min = self.f(x_min) 97 | log.info(f'f_min = {f_min}') 98 | log.info(f'grad_f(x_min) = {xp.linalg.norm(self.grad(x_min))}') 99 | 100 | if xp.__name__ == 'cupy': 101 | norm = norm.item() 102 | if self.kappa is not None: 103 | x_min = 
x_min.get() 104 | f_min = f_min.item() 105 | else: 106 | x_min = f_min = None 107 | 108 | if result_queue is not None: 109 | result_queue.put(norm) 110 | if self.kappa is not None: 111 | result_queue.put(x_min) 112 | result_queue.put(f_min) 113 | 114 | if self.kappa is not None: 115 | return norm, x_min, f_min 116 | 117 | return norm, None, None 118 | 119 | def generate_data(self): 120 | def _generate_data(m_total, dim, noise_ratio, m_test=None): 121 | if m_test is None: 122 | m_test = int(m_total / 10) 123 | 124 | # Generate data 125 | X = np.random.randn(m_total + m_test, dim) 126 | 127 | # Generate labels 128 | w_0 = np.random.rand(dim) 129 | Y = logit_1d_np(X, w_0) 130 | Y[Y > 0.5] = 1 131 | Y[Y <= 0.5] = 0 132 | 133 | X_train, X_test = X[:m_total], X[m_total:] 134 | Y_train, Y_test = Y[:m_total], Y[m_total:] 135 | 136 | noise = np.random.binomial(1, noise_ratio, m_total) 137 | Y_train = (noise - Y_train) * noise + Y_train * (1 - noise) 138 | return X_train, Y_train, X_test, Y_test, w_0 139 | 140 | self.X_train, self.Y_train, self.X_test, self.Y_test, self.w_0 = _generate_data(self.n_agent * self.m, self.dim, self.noise_ratio) 141 | 142 | def grad_h(self, w, i=None, j=None): 143 | '''Gradient of h(x) at w. Depending on the shape of w and parameters i and j, this function behaves differently: 144 | 1. If w is a vector of shape (dim,) 145 | 1.1 If i is None and j is None 146 | returns the full gradient. 147 | 1.2 If i is not None and j is None 148 | returns the gradient at the i-th agent. 149 | 1.3 If i is None and j is not None 150 | returns the i-th gradient of all training data. 151 | 1.4 If i is not None and j is not None 152 | returns the gradient of the j-th data sample at the i-th agent. 153 | Note i, j can be integers, lists or vectors. 154 | 2. If w is a matrix of shape (dim, n_agent) 155 | 2.1 if j is None 156 | returns the gradient of each parameter at the corresponding agent 157 | 2.2 if j is not None 158 | returns the gradient of each parameter of the j-th sample at the corresponding agent. 159 | Note j can be lists of lists or vectors. 
160 | ''' 161 | 162 | if w.ndim == 1: 163 | if type(j) is int: 164 | j = [j] 165 | if i is None and j is None: # Return the full gradient 166 | return self.X_train.T.dot(logit_1d(self.X_train, w) - self.Y_train) / self.m_total + w * self.LAMBDA 167 | elif i is not None and j is None: 168 | return self.X[i].T.dot(logit_1d(self.X[i], w) - self.Y[i]) / self.m + w * self.LAMBDA 169 | elif i is None and j is not None: # Return the full gradient 170 | return self.X_train[j].T.dot(logit_1d(self.X_train[j], w) - self.Y_train[j]) / len(j) + w * self.LAMBDA 171 | else: # Return the gradient of sample j at machine i 172 | return (logit_1d(self.X[i][j], w) - self.Y[i][j]).dot(self.X[i][j]) / len(j) + w * self.LAMBDA 173 | 174 | elif w.ndim == 2: 175 | if i is None and j is None: # Return the distributed gradient 176 | tmp = logit_2d(self.X, w) - self.Y 177 | return xp.einsum('ikj,ik->ji', self.X, tmp) / self.m + w * self.LAMBDA 178 | elif i is None and j is not None: # Return the stochastic gradient 179 | res = [] 180 | for i in range(self.n_agent): 181 | if type(j[i]) is int: 182 | samples = [j[i]] 183 | else: 184 | samples = j[i] 185 | res.append(self.X[i][samples].T.dot(logit_1d(self.X[i][samples], w[:, i]) - self.Y[i][samples]) / len(samples) + w[:, i] * self.LAMBDA) 186 | return xp.array(res).T 187 | else: 188 | log.fatal('For distributed gradients j must be None') 189 | else: 190 | log.fatal('Parameter dimension should only be 1 or 2') 191 | 192 | def h(self, w, i=None, j=None, split='train'): 193 | '''Function value at w. If i is None, returns f(x); if i is not None but j is, returns the function value in the i-th machine; otherwise,return the function value of j-th sample in i-th machine.''' 194 | 195 | if split == 'train': 196 | X = self.X_train 197 | Y = self.Y_train 198 | elif split == 'test': 199 | if w.ndim > 1 or i is not None or j is not None: 200 | log.fatal("Function value on test set only applies to one parameter vector") 201 | X = self.X_test 202 | Y = self.Y_test 203 | 204 | if i is None: # Return the function value 205 | tmp = X.dot(w) 206 | return -xp.sum( 207 | (Y - 1) * tmp - xp.log1p(xp.exp(-tmp)) 208 | ) / X.shape[0] + xp.sum(w**2) * self.LAMBDA / 2 209 | 210 | elif j is None: # Return the function value in machine i 211 | tmp = self.X[i].dot(w) 212 | return -xp.sum((self.Y[i] - 1) * tmp - xp.log1p(xp.exp(-tmp))) / self.m + xp.sum(w**2) * self.LAMBDA / 2 213 | else: # Return the gradient of sample j in machine i 214 | tmp = self.X[i][j].dot(w) 215 | return -((self.Y[i][j] - 1) * tmp - xp.log1p(xp.exp(-tmp))) + xp.sum(w**2) * self.LAMBDA / 2 216 | 217 | def accuracy(self, w, split='train'): 218 | 219 | if len(w.shape) > 1: 220 | w = w.mean(axis=1) 221 | if split == 'train': 222 | X = self.X_train 223 | Y = self.Y_train 224 | elif split == 'test': 225 | X = self.X_test 226 | Y = self.Y_test 227 | else: 228 | log.fatal('Data split %s is not supported' % split) 229 | 230 | Y_hat = X.dot(w) 231 | Y_hat[Y_hat > 0] = 1 232 | Y_hat[Y_hat <= 0] = 0 233 | return xp.mean(Y_hat == Y) 234 | -------------------------------------------------------------------------------- /nda/problems/neural_network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | xp = np 9 | 10 | from nda.problems import Problem 11 | from nda.datasets import MNIST 12 | from nda import log 13 | 14 | 15 | def sigmoid(x): 16 | return 1 / (1 + xp.exp(-x)) 17 | 
18 | 19 | def softmax(x): 20 | tmp = xp.exp(x) 21 | return tmp / tmp.sum(axis=1, keepdims=True) 22 | 23 | 24 | def softmax_loss(Y, score): 25 | return - xp.sum(xp.log(score[Y != 0])) / Y.shape[0] 26 | # return - xp.sum(Y * xp.log(score)) / Y.shape[0] 27 | 28 | 29 | class NN(Problem): 30 | '''f(w) = 1/n \sum l_i(w), where l_i(w) is the logistic loss''' 31 | 32 | def __init__(self, n_hidden=64, dataset='mnist', **kwargs): 33 | 34 | super().__init__(dataset=dataset, **kwargs) 35 | 36 | self.n_hidden = n_hidden # Number of neurons in hidden layer 37 | self.n_class = self.Y_train.shape[1] 38 | self.img_dim = self.X_train.shape[1] 39 | self.dim = (n_hidden + 1) * (self.img_dim + self.n_class) 40 | 41 | self.Y_train_labels = self.Y_train.argmax(axis=1) 42 | self.Y_test_labels = self.Y_test.argmax(axis=1) 43 | 44 | # Internal buffers 45 | self._dw = np.zeros(self.dim) 46 | self._dw1, self._dw2 = self.unpack_w(self._dw) # Reference to the internal buffer 47 | 48 | log.info('Initialization done') 49 | 50 | def cuda(self): 51 | super().cuda() 52 | self._dw1, self._dw2 = self.unpack_w(self._dw) # Renew the reference 53 | 54 | def unpack_w(self, W): 55 | # This function returns references 56 | return W[: self.img_dim * (self.n_hidden + 1)].reshape(self.img_dim, self.n_hidden + 1), \ 57 | W[self.img_dim * (self.n_hidden + 1):].reshape(self.n_hidden + 1, self.n_class) 58 | 59 | def pack_w(self, W_1, W_2): 60 | # This function returns a new array 61 | return xp.append(W_1.reshape(-1), W_2.reshape(-1)) 62 | 63 | def grad_h(self, w, i=None, j=None): 64 | '''Gradient at w. If i is None, returns the full gradient; if i is not None but j is, returns the gradient in the i-th machine; otherwise,return the gradient of j-th sample in i-th machine. ''' 65 | 66 | if not(self._dw1.base is self._dw and self._dw2.base is self._dw): 67 | self._dw1, self._dw2 = self.unpack_w(self._dw) 68 | 69 | if w.ndim == 1: 70 | if type(j) is int: 71 | j = [j] 72 | 73 | if i is None and j is None: # Return the full gradient 74 | return self.forward_backward(self.X_train, self.Y_train, w)[0] 75 | elif i is not None and j is None: # Return the local gradient 76 | return self.forward_backward(self.X[i], self.Y[i], w)[0] 77 | elif i is None and j is not None: # Return the stochastic gradient 78 | return self.forward_backward(self.X_train[j], self.Y_train[j], w)[0] 79 | else: # Return the stochastic gradient 80 | return self.forward_backward(self.X[i][j], self.Y[i][j], w)[0] 81 | 82 | elif w.ndim == 2: 83 | if i is None and j is None: # Return the distributed gradient 84 | return xp.array([self.forward_backward(self.X[i], self.Y[i], w[:, i])[0].copy() for i in range(self.n_agent)]).T 85 | elif i is None and j is not None: # Return the stochastic gradient 86 | return xp.array([self.forward_backward(self.X[i][j[i]], self.Y[i][j[i]], w[:, i])[0].copy() for i in range(self.n_agent)]).T 87 | else: 88 | log.fatal('For distributed gradients j must be None') 89 | 90 | else: 91 | log.fatal('Parameter dimension should only be 1 or 2') 92 | 93 | def h(self, w, i=None, j=None, split='train'): 94 | '''Function value at w. 
If i is None, returns f(x); if i is not None but j is, returns the function value in the i-th machine; otherwise,return the function value of j-th sample in i-th machine.''' 95 | 96 | if split == 'train': 97 | X = self.X_train 98 | Y = self.Y_train 99 | elif split == 'test': 100 | if w.ndim > 1 or i is not None or j is not None: 101 | log.fatal("Function value on test set only applies to one parameter vector") 102 | X = self.X_test 103 | Y = self.Y_test 104 | 105 | if i is None and j is None: # Return the function value 106 | return self.forward(X, Y, w)[0] 107 | elif i is not None and j is None: # Return the function value at machine i 108 | return self.forward(self.X[i], self.Y[i], w)[0] 109 | else: # Return the function value at machine i 110 | if type(j) is int: 111 | j = [j] 112 | return self.forward(self.X[i][j], self.Y[i][j], w)[0] 113 | 114 | def forward(self, X, Y, w): 115 | w1, w2 = self.unpack_w(w) 116 | A1 = sigmoid(X.dot(w1)) 117 | A1[:, -1] = 1 118 | A2 = softmax(A1.dot(w2)) 119 | 120 | return softmax_loss(Y, A2), A1, A2 121 | 122 | def forward_backward(self, X, Y, w): 123 | w1, w2 = self.unpack_w(w) 124 | loss, A1, A2 = self.forward(X, Y, w) 125 | 126 | dZ2 = A2 - Y 127 | xp.dot(A1.T, dZ2, out=self._dw2) 128 | dA1 = dZ2.dot(w2.T) 129 | dZ1 = dA1 * A1 * (1 - A1) 130 | xp.dot(X.T, dZ1, out=self._dw1) 131 | self._dw /= X.shape[0] 132 | 133 | return self._dw, loss 134 | 135 | def accuracy(self, w, split='test'): 136 | if w.ndim > 1: 137 | w = w.mean(axis=1) 138 | if split == 'train': 139 | X = self.X_train 140 | Y = self.Y_train 141 | labels = self.Y_train_labels 142 | elif split == 'test': 143 | X = self.X_test 144 | Y = self.Y_test 145 | labels = self.Y_test_labels 146 | else: 147 | log.fatal('Data split %s is not supported' % split) 148 | 149 | loss, _, A2 = self.forward(X, Y, w) 150 | pred = A2.argmax(axis=1) 151 | 152 | return sum(pred == labels) / len(pred), loss 153 | 154 | 155 | if __name__ == '__main__': 156 | 157 | p = NN() 158 | -------------------------------------------------------------------------------- /nda/problems/problem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | import numpy as np 4 | 5 | try: 6 | import cupy as xp 7 | except ImportError: 8 | xp = np 9 | 10 | import networkx as nx 11 | import matplotlib.pyplot as plt 12 | from nda import log 13 | 14 | 15 | class Problem(object): 16 | '''The base problem class, which generates the random problem and supports function value and gradient evaluation''' 17 | 18 | def __init__(self, n_agent=20, m=1000, dim=40, graph_type='er', graph_params=None, regularization=None, r=0, dataset='random', sort=False, shuffle=False, normalize_data=False, gpu=False): 19 | 20 | self.n_agent = n_agent # Number of agents 21 | self.m = m # Number of samples per agent 22 | self.dim = dim # Dimension of the variable 23 | self.X_total = None # All data 24 | self.Y_total = None # All labels 25 | self.X = [] # Distributed data 26 | self.Y = [] # Distributed labels 27 | self.x_0 = None # The true varibal value 28 | self.x_min = None # The minimizer varibal value 29 | self.f_min = None # The optimal function value 30 | self.L = None # The smoothness constant 31 | self.sigma = 0 # The strong convexity constant 32 | self.is_smooth = True # If the problem is smooth or not 33 | self.r = r 34 | self.graph_params = graph_params 35 | self.graph_type = graph_type 36 | self.dataset = dataset 37 | 38 | if dataset == 'random': 39 | self.m_total = m * n_agent # 
40 |             self.generate_data()
41 |         else:
42 |             from nda import datasets
43 | 
44 |             if dataset == 'gisette':
45 |                 self.X_train, self.Y_train, self.X_test, self.Y_test = datasets.Gisette(normalize=normalize_data).load()
46 |             elif dataset == 'mnist':
47 |                 self.X_train, self.Y_train, self.X_test, self.Y_test = datasets.MNIST(normalize=normalize_data).load()
48 | 
49 |             else:
50 |                 self.X_train, self.Y_train, self.X_test, self.Y_test = datasets.LibSVM(name=dataset, normalize=normalize_data).load()
51 | 
52 |             self.X_train = np.append(self.X_train, np.ones((self.X_train.shape[0], 1)), axis=1)
53 |             self.X_test = np.append(self.X_test, np.ones((self.X_test.shape[0], 1)), axis=1)
54 |             self.m = self.X_train.shape[0] // n_agent
55 |             self.m_total = self.m * n_agent
56 | 
57 |             self.X_train = self.X_train[:self.m_total]
58 |             self.Y_train = self.Y_train[:self.m_total]
59 |             self.dim = self.X_train.shape[1]
60 | 
61 | 
62 |         if sort or shuffle:
63 |             if sort:
64 |                 if self.Y_train.ndim > 1:
65 |                     order = self.Y_train.argmax(axis=1).argsort()
66 |                 else:
67 |                     order = self.Y_train.argsort()
68 |             elif shuffle:
69 |                 order = np.random.permutation(len(self.X_train))
70 | 
71 |             self.X_train = self.X_train[order].copy()
72 |             self.Y_train = self.Y_train[order].copy()
73 | 
74 |         # Split data
75 |         self.X = self.split_data(self.X_train)
76 |         self.Y = self.split_data(self.Y_train)
77 | 
78 |         self.generate_graph(graph_type=graph_type, params=graph_params)
79 | 
80 |         if regularization == 'l1':
81 |             self.grad_g = self._grad_regularization_l1
82 |             self.is_smooth = False
83 | 
84 |         elif regularization == 'l2':
85 |             self.grad_g = self._grad_regularization_l2
86 | 
87 |     def cuda(self):
88 |         log.debug("Copying data to GPU")
89 | 
90 |         # Copy every np.array to GPU if needed
91 |         for k in self.__dict__:
92 |             if type(self.__dict__[k]) == np.ndarray:
93 |                 self.__dict__[k] = xp.array(self.__dict__[k])
94 | 
95 |     def split_data(self, X):
96 |         '''Helper function to split data according to the number of training samples per agent.'''
97 |         if self.m * self.n_agent != len(X):
98 |             log.fatal('Data cannot be distributed equally to %d agents' % self.n_agent)
99 |         if X.ndim == 1:
100 |             return X.reshape(self.n_agent, -1)
101 |         else:
102 |             return X.reshape(self.n_agent, self.m, -1)
103 | 
104 |     def grad(self, w, i=None, j=None):
105 |         '''(sub-)Gradient of f(x) = h(x) + g(x) at w. Depending on the shape of w and parameters i and j, this function behaves differently:
106 |             1. If w is a vector of shape (dim,)
107 |                 1.1 If i is None and j is None
108 |                     returns the full gradient.
109 |                 1.2 If i is not None and j is None
110 |                     returns the gradient at the i-th agent.
111 |                 1.3 If i is None and j is not None
112 |                     returns the gradient of the j-th sample(s) over all training data.
113 |                 1.4 If i is not None and j is not None
114 |                     returns the gradient of the j-th data sample at the i-th agent.
115 |                 Note i, j can be integers, lists or vectors.
116 |             2. If w is a matrix of shape (dim, n_agent)
117 |                 2.1 if j is None
118 |                     returns the gradient of each parameter at the corresponding agent
119 |                 2.2 if j is not None
120 |                     returns the gradient of each parameter of the j-th sample at the corresponding agent.
121 |                 Note j can be lists of lists or vectors.
122 |         '''
123 |         return self.grad_h(w, i=i, j=j) + self.grad_g(w)
124 | 
125 |     def grad_h(self, w, i=None, j=None):
126 |         '''Gradient of h(x) at w. Depending on the shape of w and parameters i and j, this function behaves differently:
127 |             1. If w is a vector of shape (dim,)
128 |                 1.1 If i is None and j is None
129 |                     returns the full gradient.
130 |                 1.2 If i is not None and j is None
131 |                     returns the gradient at the i-th agent.
132 |                 1.3 If i is None and j is not None
133 |                     returns the gradient of the j-th sample(s) over all training data.
134 |                 1.4 If i is not None and j is not None
135 |                     returns the gradient of the j-th data sample at the i-th agent.
136 |                 Note i, j can be integers, lists or vectors.
137 |             2. If w is a matrix of shape (dim, n_agent)
138 |                 2.1 if j is None
139 |                     returns the gradient of each parameter at the corresponding agent
140 |                 2.2 if j is not None
141 |                     returns the gradient of each parameter of the j-th sample at the corresponding agent.
142 |                 Note j can be lists of lists or vectors.
143 |         '''
144 |         pass
145 | 
146 |     def grad_g(self, w):
147 |         '''Sub-gradient of g(x) at w. Returns the sub-gradient for the corresponding parameters. w can be a vector of shape (dim,) or a matrix of shape (dim, n_agent).
148 |         '''
149 |         return 0
150 | 
151 |     def f(self, w, i=None, j=None, split='train'):
152 |         '''Function value of f(x) = h(x) + g(x) at w. If i is None, returns the global function value; if i is not None but j is None, returns the function value at the i-th machine; otherwise, returns the function value of the j-th sample(s) at the i-th machine.'''
153 |         return self.h(w, i=i, j=j, split=split) + self.g(w)
154 | 
155 |     def hessian(self, *args, **kwargs):
156 |         raise NotImplementedError
157 | 
158 |     def h(self, w, i=None, j=None, split='train'):
159 |         '''Function value of h(x) at w. If i is None, returns h(x); if i is not None but j is None, returns the function value at the i-th machine; otherwise, returns the function value of the j-th sample(s) at the i-th machine.'''
160 |         raise NotImplementedError
161 | 
162 |     def g(self, w):
163 |         '''Function value of g(x) at w. Returns 0 if there is no regularization.'''
164 |         return 0
165 | 
166 |     def _regularization_l1(self, w):
167 |         return self.r * xp.abs(w).sum(axis=0)
168 | 
169 |     def _regularization_l2(self, w):
170 |         return self.r * (w * w).sum(axis=0)
171 | 
172 |     def _grad_regularization_l1(self, w):
173 |         g = xp.zeros(w.shape)
174 |         g[w > 1e-5] = 1
175 |         g[w < -1e-5] = -1
176 |         return self.r * g
177 | 
178 |     def _grad_regularization_l2(self, w):
179 |         return 2 * self.r * w
180 | 
181 |     def grad_check(self):
182 |         '''Check whether the full gradient equals the gradient computed by finite differences at a random point.'''
183 |         w = xp.random.randn(self.dim)
184 |         delta = xp.zeros(self.dim)
185 |         grad = xp.zeros(self.dim)
186 |         eps = 1e-4
187 | 
188 |         for i in range(self.dim):
189 |             delta[i] = eps
190 |             grad[i] = (self.f(w + delta) - self.f(w - delta)) / 2 / eps
191 |             delta[i] = 0
192 | 
193 |         error = xp.linalg.norm(grad - self.grad(w))
194 |         if error > eps:
195 |             log.warn('Gradient implementation check failed with difference %.4f!' % error)
196 |             return False
197 |         else:
198 |             log.info('Gradient implementation check succeeded!')
199 |             return True
200 | 
201 |     def distributed_check(self):
202 |         '''Check that the distributed function value and gradient implementations are correct.'''
203 | 
204 |         def _check_1d_gradient():
205 | 
206 |             w = xp.random.randn(self.dim)
207 |             g = self.grad(w)
208 |             g_i = g_ij = 0
209 |             res = True
210 | 
211 |             for i in range(self.n_agent):
212 |                 _tmp_g_i = self.grad(w, i)
213 |                 _tmp_g_ij = 0
214 |                 for j in range(self.m):
215 |                     _tmp_g_ij += self.grad(w, i, j)
216 | 
217 |                 if xp.linalg.norm(_tmp_g_i - _tmp_g_ij / self.m) > 1e-5:
218 |                     log.warn('Distributed gradient check failed! Difference between local gradient at agent %d and average of all local sample gradients is %.4f' % (i, xp.linalg.norm(_tmp_g_i - _tmp_g_ij / self.m)))
219 |                     res = False
220 | 
221 |                 g_i += _tmp_g_i
222 |                 g_ij += _tmp_g_ij
223 | 
224 |             g_i /= self.n_agent
225 |             g_ij /= self.m_total
226 | 
227 |             if xp.linalg.norm(g - g_i) > 1e-5:
228 |                 log.warn('Distributed gradient check failed! Difference between global gradient and average of local gradients is %.4f', xp.linalg.norm(g - g_i))
229 |                 res = False
230 | 
231 |             if xp.linalg.norm(g - g_ij) > 1e-5:
232 |                 log.warn('Distributed gradient check failed! Difference between global gradient and average of all sample gradients is %.4f' % xp.linalg.norm(g - g_ij))
233 |                 res = False
234 | 
235 |             return res
236 | 
237 |         def _check_2d_gradient():
238 | 
239 |             res = True
240 |             w_2d = xp.random.randn(self.dim, self.n_agent)
241 | 
242 |             g_1d = 0
243 |             for i in range(self.n_agent):
244 |                 g_1d += self.grad(w_2d[:, i], i=i)
245 | 
246 |             g_1d /= self.n_agent
247 |             g_2d = self.grad(w_2d).mean(axis=1)
248 | 
249 |             if xp.linalg.norm(g_1d - g_2d) > 1e-5:
250 |                 log.warn('Distributed gradient check failed! Difference between global gradient and average of distributed gradients is %.4f' % xp.linalg.norm(g_1d - g_2d))
251 |                 res = False
252 | 
253 |             g_2d_sample = self.grad(w_2d, j=xp.arange(self.m).reshape(-1, 1).repeat(self.n_agent, axis=1).T).mean(axis=1)
254 | 
255 |             if xp.linalg.norm(g_1d - g_2d_sample) > 1e-5:
256 |                 log.warn('Distributed gradient check failed! Difference between global gradient and average of all sample gradients is %.4f' % xp.linalg.norm(g_1d - g_2d_sample))
257 |                 res = False
258 | 
259 |             samples = xp.random.randint(0, self.m, (self.n_agent, 10))
260 |             g_2d_stochastic = self.grad(w_2d, j=samples)
261 |             for i in range(self.n_agent):
262 |                 g_1d_stochastic = self.grad(w_2d[:, i], i=i, j=samples[i])
263 |                 if xp.linalg.norm(g_1d_stochastic - g_2d_stochastic[:, i]) > 1e-5:
264 |                     log.warn('Distributed gradient check failed! Difference between distributed stochastic gradient at agent %d and average of sample gradients is %.4f' % (i, xp.linalg.norm(g_1d_stochastic - g_2d_stochastic[:, i])))
265 |                     res = False
266 | 
267 |             return res
268 | 
269 |         def _check_function_value():
270 |             w = xp.random.randn(self.dim)
271 |             f = self.f(w)
272 |             f_i = f_ij = 0
273 |             res = True
274 | 
275 |             for i in range(self.n_agent):
276 |                 _tmp_f_i = self.f(w, i)
277 |                 _tmp_f_ij = 0
278 |                 for j in range(self.m):
279 |                     _tmp_f_ij += self.f(w, i, j)
280 | 
281 |                 if xp.abs(_tmp_f_i - _tmp_f_ij / self.m) > 1e-10:
282 |                     log.warn('Distributed function value check failed! Difference between local function value at agent %d and average of all local sample function values at agent %d is %.4f' % (i, i, xp.abs(_tmp_f_i - _tmp_f_ij / self.m)))
283 |                     res = False
284 | 
285 |                 f_i += _tmp_f_i
286 |                 f_ij += _tmp_f_ij
287 | 
288 |             f_i /= self.n_agent
289 |             f_ij /= self.m_total
290 | 
291 |             if xp.abs(f - f_i) > 1e-10:
292 |                 log.warn('Distributed function value check failed! Difference between the global function value and average of local function values is %.4f' % xp.abs(f - f_i))
293 |                 res = False
294 | 
295 |             if xp.abs(f - f_ij) > 1e-10:
296 |                 log.warn('Distributed function value check failed! Difference between the global function value and average of all sample function values is %.4f' % xp.abs(f - f_ij))
297 |                 res = False
298 | 
299 |             return res
300 | 
301 |         res = _check_function_value() & _check_1d_gradient() & _check_2d_gradient()
302 |         if res:
303 |             log.info('Distributed check succeeded!')
304 |             return True
305 |         else:
306 |             return False
307 | 
308 |     def generate_graph(self, graph_type='expander', params=None):
309 |         '''Generate a connected connectivity graph according to the given parameters.'''
310 | 
311 |         if graph_type == 'expander':
312 |             G = nx.paley_graph(self.n_agent).to_undirected()
313 |         elif graph_type == 'grid':
314 |             G = nx.grid_2d_graph(*params)
315 |         elif graph_type == 'cycle':
316 |             G = nx.cycle_graph(self.n_agent)
317 |         elif graph_type == 'path':
318 |             G = nx.path_graph(self.n_agent)
319 |         elif graph_type == 'star':
320 |             G = nx.star_graph(self.n_agent - 1)
321 |         elif graph_type == 'er':
322 |             if params < 2 / (self.n_agent - 1):
323 |                 log.fatal("Need higher probability to create a connected E-R graph!")
324 |             G = None
325 |             while G is None or nx.is_connected(G) is False:
326 |                 G = nx.erdos_renyi_graph(self.n_agent, params)
327 |         else:
328 |             log.fatal('Graph type %s not supported' % graph_type)
329 | 
330 |         self.n_edges = G.number_of_edges()
331 |         self.G = G
332 | 
333 |     def plot_graph(self):
334 |         '''Plot the generated connectivity graph.'''
335 | 
336 |         plt.figure()
337 |         nx.draw(self.G)
338 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # From https://github.com/navdeep-G/setup.py
5 | 
6 | import io, os, sys
7 | from setuptools import find_packages, setup
8 | 
9 | NAME = 'Network Distributed Algorithms'
10 | DESCRIPTION = 'Network distributed algorithms and experiments'
11 | URL = 'https://github.com/liboyue/Network-Distributed-Algorithm'
12 | EMAIL = 'boyuel@andrew.cmu.edu'
13 | AUTHOR = 'Boyue Li'
14 | VERSION = '0.2'
15 | 
16 | # What packages are required for this module to be executed?
17 | REQUIRED = ['matplotlib', 'numpy', 'networkx', 'cvxpy', 'pandas', 'scipy', 'scikit-learn', 'colorlog']
18 | EXTRAS = {'GPU': ['cupy']}
19 | 
20 | # The rest you shouldn't have to touch too much :)
21 | # ------------------------------------------------
22 | # Except, perhaps the License and Trove Classifiers!
23 | # If you do change the License, remember to change the Trove Classifier for that!
24 | 
25 | here = os.path.abspath(os.path.dirname(__file__))
26 | 
27 | # Import the README and use it as the long-description.
28 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
29 | try:
30 |     with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
31 |         long_description = '\n' + f.read()
32 | except FileNotFoundError:
33 |     long_description = DESCRIPTION
34 | 
35 | # Load the package's __version__.py module as a dictionary.
36 | about = {}
37 | if not VERSION:
38 |     project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
39 |     with open(os.path.join(here, project_slug, '__version__.py')) as f:
40 |         exec(f.read(), about)
41 | else:
42 |     about['__version__'] = VERSION
43 | 
44 | 
45 | # Where the magic happens:
46 | setup(
47 |     name=NAME,
48 |     version=about['__version__'],
49 |     description=DESCRIPTION,
50 |     long_description=long_description,
51 |     long_description_content_type='text/markdown',
52 |     author=AUTHOR,
53 |     author_email=EMAIL,
54 |     url=URL,
55 |     packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*", "experiments"]),
56 | 
57 |     install_requires=REQUIRED,
58 |     extras_require=EXTRAS,
59 |     include_package_data=True,
60 |     license='MIT',
61 |     classifiers=[
62 |         # Trove classifiers
63 |         # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
64 |         'License :: OSI Approved :: MIT License',
65 |         'Programming Language :: Python',
66 |         'Programming Language :: Python :: 3',
67 |         'Programming Language :: Python :: 3.8',
68 |     ],
69 | )
70 | 
--------------------------------------------------------------------------------
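
A minimal usage sketch of the problem classes above. It assumes `NN` is importable from `nda.problems` (adjust the import to match the package's `__init__.py`); the constructor arguments are the ones accepted by `NN.__init__` and `Problem.__init__` in the files shown earlier:

```
from nda.problems import NN  # assumed export; see nda/problems/__init__.py

# One-hidden-layer network on MNIST, split across 20 agents connected by a
# random Erdos-Renyi graph with edge probability 0.3.
p = NN(n_hidden=64, dataset='mnist', n_agent=20, graph_type='er', graph_params=0.3)

p.grad_check()         # compare the analytic gradient against finite differences
p.distributed_check()  # verify per-agent / per-sample gradients and function values
```

Note that `grad_check` evaluates the objective twice per coordinate, so it can take a while on the full MNIST problem; the randomly generated linear-regression problem is a cheaper target for a quick sanity check.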