├── .gitignore ├── AGN ├── AGN.py ├── README.md └── run.sh ├── Basics-Tutorial ├── Local-then-Global-Variables.ipynb ├── Multiple-Workers │ ├── Local-then-Global-Variables-Worker1.ipynb │ ├── Local-then-Global-Variables-Worker2.ipynb │ └── Parameter-Server.ipynb ├── Parameter-Server.ipynb ├── README.md └── Servers.ipynb ├── DOWNPOUR-Easy ├── DOWNPOUR.py ├── README.md └── run.sh ├── DOWNPOUR ├── DOWNPOUR.py ├── README.md └── run.sh ├── Distributed-Setup ├── README.md ├── dist_setup.py ├── dist_setup_sup.py ├── run.sh └── run_sup.sh ├── Hogwild ├── Hogwild.py ├── README.md └── run.sh ├── LICENSE ├── Multiple-GPUs-Single-Machine ├── README.md ├── dist_mult_gpu_sing_mach.py └── dist_mult_gpu_sing_mach.sh ├── Non-Distributed_Setup.py ├── README.md ├── SAGN ├── README.md ├── SAGN.py └── run.sh ├── Synchronous-SGD-different-learning-rates ├── README.md ├── run.sh └── ssgd.py ├── Synchronous-SGD ├── README.md ├── run.sh └── ssgd.py └── imgs └── data-parallelism.png /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints* 2 | Basics-Tutorial/Multiple-Workers/.ipynb_checkpoints/* 3 | Basics-Tutorial/Beginner\ Tutorial\ Variables.ipynb 4 | Basics-Tutorial/.ipynb_checkpoints/* 5 | -------------------------------------------------------------------------------- /AGN/AGN.py: -------------------------------------------------------------------------------- 1 | """Asynchronous Distributed Adaptive Gradients (ADAG) 2 | 3 | Formerly known as ADAG. 4 | Performs asynchronous updates with update window. 
5 | 6 | Author: Tommy Mulc 7 | """ 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Configure 19 | config=tf.ConfigProto(log_device_placement=False) 20 | 21 | #Server Setup 22 | cluster_spec = { 23 | 'ps':['localhost:2222'], 24 | 'worker':['localhost:2223','localhost:2224'] 25 | } #allows this node know about all other nodes 26 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 27 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 28 | cluster = tf.train.ClusterSpec(cluster_spec) 29 | 30 | if FLAGS.job_name == 'ps': #checks if parameter server 31 | server = tf.train.Server(cluster, 32 | job_name="ps", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | server.join() 36 | else: #it must be a worker server 37 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 38 | server = tf.train.Server(cluster, 39 | job_name="worker", 40 | task_index=FLAGS.task_index, 41 | config=config) 42 | 43 | # Graph 44 | # We must not use train.replicate_device_setter for normal operations 45 | # Local operations 46 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 47 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 48 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 49 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 50 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 51 | c=a+b 52 | 53 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 54 | loss = tf.reduce_mean(tf.square(c-target)) 55 | 56 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False, 57 | name='local_step',collections=['local_non_trainable']) 58 | 59 | lr = .0001 60 | # loptimizer = tf.train.GradientDescentOptimizer(lr) #local optimizer 61 | loptimizer = tf.train.AdamOptimizer(lr) #local optimizer 62 | 63 | # ADAG (simplest case since all batches are the same) 64 | 
update_window = 3 # T: update/communication window 65 | grad_list = [] # the array to store the gradients through the communication window 66 | for t in range(update_window): 67 | if t != 0: 68 | with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run 69 | grads, varss = zip(*loptimizer.compute_gradients(loss, 70 | var_list=tf.local_variables())) 71 | else: 72 | grads, varss = zip(*loptimizer.compute_gradients(loss, 73 | var_list=tf.local_variables())) 74 | grad_list.append(grads) #add gradients to the list 75 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 76 | global_step=local_step) #update local parameters 77 | grads = tf.reduce_mean(grad_list,axis=0) 78 | grads = tuple([grads[i]for i in range(len(varss))]) 79 | 80 | # add these variables created by local optimizer to local collection 81 | lopt_vars = add_global_variables_to_local_collection() 82 | 83 | # delete the variables from the global collection 84 | clear_global_collection() 85 | 86 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 87 | worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 88 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 89 | 90 | # optimizer for central variables 91 | optimizer = tf.train.AdamOptimizer(lr) 92 | # optimizer = tf.train.GradientDescentOptimizer(lr) 93 | 94 | #create global variables and/or references 95 | local_to_global, global_to_local = create_global_variables(lopt_vars) 96 | 97 | opt = optimizer.apply_gradients( 98 | zip(grads,[ local_to_global[v] for v in varss]) 99 | ,global_step=global_step) #apply the gradients to variables on ps 100 | 101 | # Pull param from global server 102 | with tf.control_dependencies([opt]): 103 | assign_locals = assign_global_to_local(global_to_local) 104 | 105 | # Init ops 106 | init_local = tf.variables_initializer(tf.local_variables() \ 107 | +tf.get_collection('local_non_trainable'))#for local variables 108 | init = 
tf.global_variables_initializer() # for global variables 109 | 110 | # Grab global state before training so all workers have same initialization 111 | grab_global_init = assign_global_to_local(global_to_local) 112 | 113 | # Assigns local values to global ones for chief to execute 114 | assign_global = assign_local_to_global(local_to_global) 115 | 116 | # Session 117 | stop_hook = tf.train.StopAtStepHook(last_step=40) 118 | hooks = [stop_hook] 119 | scaff = tf.train.Scaffold(init_op=init,local_init_op=init_local) 120 | 121 | #Monitored Training Session 122 | sess = tf.train.MonitoredTrainingSession(master=server.target, 123 | is_chief=is_chief, 124 | config=config, 125 | scaffold=scaff, 126 | hooks=hooks, 127 | save_checkpoint_secs=1, 128 | checkpoint_dir='logdir') 129 | if is_chief: 130 | sess.run(assign_global) #Assigns chief's initial values to ps 131 | time.sleep(10) #grace period to wait on other workers before starting training 132 | 133 | # Train until hook stops session 134 | print('Starting training on worker %d'%FLAGS.task_index) 135 | sess.run(grab_global_init) 136 | while not sess.should_stop(): 137 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 138 | print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls)) 139 | time.sleep(1) 140 | print('Done',FLAGS.task_index) 141 | 142 | time.sleep(10) #grace period to wait before closing session 143 | sess.close() 144 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 145 | 146 | 147 | def assign_global_to_local(global_to_local): 148 | """ 149 | global_to_local : dictionary with corresponding local variable for global key 150 | 151 | Assigns global variable value to local variables 152 | """ 153 | r = [] 154 | for v in global_to_local.keys(): 155 | r.append(tf.assign(global_to_local[v],v)) 156 | with tf.control_dependencies(r): 157 | a = tf.no_op() 158 | return a 159 | 160 | 161 | def assign_local_to_global(local_to_global): 162 | """Assigns 
global variable value to local variables. 163 | 164 | local_to_global : dictionary with corresponding global variable for local key 165 | """ 166 | r= [] 167 | for v in local_to_global.keys(): 168 | r.append(tf.assign(local_to_global[v],v)) 169 | with tf.control_dependencies(r): 170 | a = tf.no_op() 171 | return a 172 | 173 | 174 | def get_global_variable_by_name(name): 175 | """Returns the global variable of given name. 176 | 177 | name : the name of the global variable 178 | """ 179 | return [v for v in tf.global_variables() if v.name == name][0] 180 | 181 | 182 | def create_global_variables(local_optimizer_vars = []): 183 | """Creates global variables for local variables on the graph. 184 | Skips variables local variables that are created for 185 | local optimization. 186 | 187 | Returns dictionarys for local-to-global and global-to-local 188 | variable mappings. 189 | """ 190 | local_to_global = {} 191 | global_to_local = {} 192 | with tf.device('/job:ps/task:0'): 193 | for v in tf.local_variables(): 194 | if v not in local_optimizer_vars: 195 | v_g = tf.get_variable('g/'+v.op.name, 196 | shape = v.shape, 197 | dtype = v.dtype, 198 | trainable=True, 199 | collections=[tf.GraphKeys.GLOBAL_VARIABLES, 200 | tf.GraphKeys.TRAINABLE_VARIABLES]) 201 | local_to_global[v] = v_g 202 | global_to_local[v_g] = v 203 | return local_to_global,global_to_local 204 | 205 | 206 | def add_global_variables_to_local_collection(): 207 | """Adds all variables from the global collection 208 | to the local collection. 209 | 210 | Returns the list of variables added. 
211 | """ 212 | r =[] 213 | for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]: 214 | tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var) 215 | r.append(var) 216 | return r 217 | 218 | 219 | def clear_global_collection(): 220 | """Removes all variables from global collection.""" 221 | g = tf.get_default_graph() 222 | for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])): 223 | del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0] 224 | 225 | 226 | if __name__ == '__main__': 227 | parser = argparse.ArgumentParser() 228 | # Flags for defining the tf.train.ClusterSpec 229 | parser.add_argument( 230 | "--job_name", 231 | type=str, 232 | default="", 233 | help="One of 'ps', 'worker'" 234 | ) 235 | # Flags for defining the tf.train.Server 236 | parser.add_argument( 237 | "--task_index", 238 | type=int, 239 | default=0, 240 | help="Index of task within the job" 241 | ) 242 | FLAGS, unparsed = parser.parse_known_args() 243 | print(FLAGS.task_index) 244 | main() 245 | -------------------------------------------------------------------------------- /AGN/README.md: -------------------------------------------------------------------------------- 1 | ## AGN (Accumulated Gradient Normalization) 2 | 3 | This method was formerly known as ADAG (Asynchronous Distributed Adaptive Gradients). 4 | 5 | Similar to DOWNPOUR expect that it uses a communications window *T* and accumulates gradients for *T* steps before sending updates to the parameter server. 
6 | -------------------------------------------------------------------------------- /AGN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python AGN.py --job_name "ps" --task_index 0 & 3 | python AGN.py --job_name "worker" --task_index 0 & 4 | python AGN.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Basics-Tutorial/Local-then-Global-Variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "cluster_spec = tf.train.ClusterSpec({'worker' : ['localhost:2223'], 'ps' : ['localhost:2222']})\n", 67 | "server = tf.train.Server(cluster_spec,job_name='worker')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "**Now launch run all the cells in the parameter server notebook**" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Create variables locally then makes global copy. One worker scenario" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "tf.reset_default_graph()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#create local graph like normal specifying the local device\n", 104 | "with tf.device('/job:worker/task:0'):\n", 105 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 106 | " b = tf.constant([100.])\n", 107 | " loss = tf.abs(a-b)\n", 108 | " \n", 109 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 110 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 111 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 112 | " \n", 113 | " \n", 114 | " init_local = tf.local_variables_initializer()\n", 115 | "\n", 116 | "#create the globabl copies on the ps\n", 117 | "with tf.device('/job:ps/task:0'):\n", 118 | " for v in tf.local_variables():\n", 119 | " v_g = tf.get_variable('g/'+v.op.name,\n", 120 | " shape = v.shape,\n", 121 | " dtype = v.dtype,\n", 122 | " trainable=True,\n", 123 | " 
collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 124 | "\n", 125 | "\n", 126 | "#gloabl updates\n", 127 | "with tf.device('/job:worker/task:0'):\n", 128 | " #this needs to be updated. Clearly not robust for any graph more complext\n", 129 | " global_vars = tf.global_variables()\n", 130 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 131 | "\n", 132 | "#create init op on the chief node\n", 133 | "with tf.device('/job:worker/task:0'):\n", 134 | " init_global = tf.global_variables_initializer()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "View the device placement of ops and variables" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "a_global = tf.global_variables()[0]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 10, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "/job:worker/task:0\n", 165 | "/job:worker/task:0\n", 166 | "/job:worker/task:0\n", 167 | "/job:worker/task:0\n", 168 | "/job:ps/task:0\n", 169 | "/job:ps/task:0\n", 170 | "/job:worker/task:0\n", 171 | "/job:ps/task:0\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(a.device)\n", 177 | "print(b.device)\n", 178 | "print(loss.device)\n", 179 | "print(local_update.device)\n", 180 | "print(global_update.device)\n", 181 | "print(init_global.device)\n", 182 | "print(init_local.device)\n", 183 | "print(a_global.device)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Now, let's view the states of local and global variables as we do local then global updates" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 11, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": 
{ 200 | "text/plain": [ 201 | "[None, None]" 202 | ] 203 | }, 204 | "execution_count": 11, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "sess = tf.Session(target=server.target)\n", 211 | "sess.run([init_local,init_global])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "[array([ 0.], dtype=float32), array([-1.26032162], dtype=float32)]" 223 | ] 224 | }, 225 | "execution_count": 12, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "sess.run([a,a_global])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "sess.run(local_update)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 14, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "[array([ 0.1], dtype=float32), array([-1.26032162], dtype=float32)]" 254 | ] 255 | }, 256 | "execution_count": 14, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "sess.run([a,a_global])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Notice that the state of the global variable hasn't changed" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 15, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "sess.run(global_update)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 16, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "[array([ 0.1], dtype=float32), array([-1.16032159], dtype=float32)]" 292 | ] 293 | }, 294 | "execution_count": 16, 295 | "metadata": {}, 296 
| "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "sess.run([a,a_global])" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python [conda env:tensorflow13]", 316 | "language": "python", 317 | "name": "conda-env-tensorflow13-py" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.13" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 2 334 | } 335 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Local-then-Global-Variables-Worker1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a 
TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "task_index=0" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 78 | "server = tf.train.Server(cluster_spec,job_name='worker',task_index=task_index)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Launch and run all the cells in the parameter server notebook**" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Create variables locally then makes global copy on ps." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tf.reset_default_graph()\n", 104 | "\n", 105 | "#create local graph like normal specifying the local device\n", 106 | "with tf.device('/job:worker/task:0'):\n", 107 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 108 | " b = tf.constant([100.])\n", 109 | " loss = tf.abs(a-b)\n", 110 | " \n", 111 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 112 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 113 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 114 | " \n", 115 | " \n", 116 | " init_local = tf.local_variables_initializer()\n", 117 | "\n", 118 | "#create the globabl copies on the ps\n", 119 | "with tf.device('/job:ps/task:0'):\n", 120 | " for v in tf.local_variables():\n", 121 | " v_g = tf.get_variable('g/'+v.op.name,\n", 122 | " shape = v.shape,\n", 123 | " dtype = 
v.dtype,\n", 124 | " trainable=True,\n", 125 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 126 | "\n", 127 | "\n", 128 | "#gloabl updates\n", 129 | "with tf.device('/job:worker/task:%d'%task_index):\n", 130 | " #this needs to be updated. Clearly not robust for any graph more complext\n", 131 | " global_vars = tf.global_variables()\n", 132 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 133 | "\n", 134 | "#create init op on the chief node\n", 135 | "with tf.device('/job:worker/task:%d'%task_index):\n", 136 | " init_global = tf.global_variables_initializer()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "View device placements" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "a_global = tf.global_variables()[0]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "/job:worker/task:0\n", 167 | "/job:worker/task:0\n", 168 | "/job:worker/task:0\n", 169 | "/job:worker/task:0\n", 170 | "/job:ps/task:0\n", 171 | "/job:ps/task:0\n", 172 | "/job:worker/task:0\n", 173 | "/job:ps/task:0\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(a.device)\n", 179 | "print(b.device)\n", 180 | "print(loss.device)\n", 181 | "print(local_update.device)\n", 182 | "print(global_update.device)\n", 183 | "print(init_global.device)\n", 184 | "print(init_local.device)\n", 185 | "print(a_global.device)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "[None, None]" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": 
"execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "sess = tf.Session(target=server.target)\n", 206 | "sess.run([init_local,init_global])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "Make sure you have also run all cells in the worker 2 notebook up to this point before continuing. The above cell should hang until you initialize the worker 2 session." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "[array([ 0.], dtype=float32), array([-1.17584229], dtype=float32)]" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "sess.run([a,a_global])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "sess.run(local_update)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 12, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "[array([ 0.1], dtype=float32), array([-1.17584229], dtype=float32)]" 256 | ] 257 | }, 258 | "execution_count": 12, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "sess.run([a,a_global])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 13, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "sess.run(global_update)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 14, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[array([ 0.1], dtype=float32), array([-1.07584226], dtype=float32)]" 287 | ] 288 | }, 289 | "execution_count": 14, 290 | "metadata": {}, 291 | "output_type": 
"execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "sess.run([a,a_global])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Pause here. Run the last cell in this notebook after you have done a global update in the worker 2 notebook." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 15, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "[array([-0.97584224], dtype=float32)]" 314 | ] 315 | }, 316 | "execution_count": 15, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "sess.run([a_global])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python [conda env:tensorflow13]", 338 | "language": "python", 339 | "name": "conda-env-tensorflow13-py" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 2 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython2", 351 | "version": "2.7.13" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 2 356 | } 357 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Local-then-Global-Variables-Worker2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | 
"text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "task_index=1" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 78 | "server = tf.train.Server(cluster_spec,job_name='worker',task_index=task_index)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Launch and run all the cells in the parameter server notebook (if you haven't already)**" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Create variables locally then makes a global copy on ps." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tf.reset_default_graph()\n", 104 | "\n", 105 | "#create local graph like normal specifying the local device\n", 106 | "with tf.device('/job:worker/task:%d'%task_index):\n", 107 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 108 | " b = tf.constant([100.])\n", 109 | " loss = tf.abs(a-b)\n", 110 | " \n", 111 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 112 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 113 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 114 | " \n", 115 | " \n", 116 | " init_local = tf.local_variables_initializer()\n", 117 | "\n", 118 | "#create the globabl copies on the ps\n", 119 | "with tf.device('/job:ps/task:0'):\n", 120 | " for v in tf.local_variables():\n", 121 | " v_g = tf.get_variable('g/'+v.op.name,\n", 122 | " shape = v.shape,\n", 123 | " dtype = v.dtype,\n", 124 | " trainable=True,\n", 125 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 126 | "\n", 127 | "\n", 128 | "#gloabl updates\n", 129 | "with tf.device('/job:worker/task:%d'%task_index):\n", 130 | " #this needs to be updated. 
Clearly not robust for any graph more complex
[ 217 | "[array([ 0.], dtype=float32), array([-1.07584226], dtype=float32)]" 218 | ] 219 | }, 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "sess.run([a,a_global])" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Wait for a global update from worker 1, then continue." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 12, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "sess.run(local_update)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "[array([ 0.1], dtype=float32), array([-1.07584226], dtype=float32)]" 256 | ] 257 | }, 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "sess.run([a,a_global])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "sess.run(global_update)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 15, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[array([ 0.1], dtype=float32), array([-0.97584224], dtype=float32)]" 287 | ] 288 | }, 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "sess.run([a,a_global])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": { 309 | "kernelspec": { 310 | "display_name": "Python [conda env:tensorflow13]", 311 | "language": "python", 312 | "name": 
"conda-env-tensorflow13-py" 313 | }, 314 | "language_info": { 315 | "codemirror_mode": { 316 | "name": "ipython", 317 | "version": 2 318 | }, 319 | "file_extension": ".py", 320 | "mimetype": "text/x-python", 321 | "name": "python", 322 | "nbconvert_exporter": "python", 323 | "pygments_lexer": "ipython2", 324 | "version": "2.7.13" 325 | } 326 | }, 327 | "nbformat": 4, 328 | "nbformat_minor": 2 329 | } 330 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Parameter-Server.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Running the below cell will cause this kernel to stall on the cell until the notebook is shutdown." 
Running the below cell will cause this kernel to stall on the cell until the notebook is shut down.
You should work through the content in the following order
14 | 15 | -------------------------------------------------------------------------------- /Basics-Tutorial/Servers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# TensorFlow Servers\n", 56 | "\n", 57 | "Create a TensorFlow cluster with one node. 
Let this node be responsible for a job that has the name \"worker\" and that will operate one task at localhost:2222
Launch a TensorFlow session with the execution engine being the server.
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "sess = tf.Session(target=server.target)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Use TensorFlow to create a local server and use `lsof` to find out the location of the server." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "server = tf.train.Server.create_local_server()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "python2.7 66001 tmulc 3u IPv6 0x358037b03a6c7799 0t0 TCP [::1]:8888 (LISTEN)\n", 211 | "python2.7 66001 tmulc 4u IPv4 0x358037b038251061 0t0 TCP 127.0.0.1:8888 (LISTEN)\n", 212 | "python2.7 66017 tmulc 25u IPv4 0x358037b0381ff769 0t0 TCP 127.0.0.1:60322 (LISTEN)\n", 213 | "python2.7 66017 tmulc 28u IPv4 0x358037b0381fd251 0t0 TCP 127.0.0.1:60324 (LISTEN)\n", 214 | "python2.7 66017 tmulc 31u IPv4 0x358037b038285251 0t0 TCP 127.0.0.1:60325 (LISTEN)\n", 215 | "python2.7 66017 tmulc 34u IPv4 0x358037b038485b49 0t0 TCP 127.0.0.1:60323 (LISTEN)\n", 216 | "python2.7 66017 tmulc 39u IPv4 0x358037b039c18769 0t0 TCP 127.0.0.1:60339 (LISTEN)\n", 217 | "python2.7 66017 tmulc 52u IPv4 0x358037b038282579 0t0 TCP 127.0.0.1:60326 (LISTEN)\n", 218 | "python2.7 66017 tmulc 69u IPv6 0x358037b03a6c8259 0t0 TCP *:2222 (LISTEN)\n", 219 | "python2.7 66017 tmulc 75u IPv6 0x358037b031e27239 0t0 TCP *:60371 (LISTEN)\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "%%bash\n", 225 | "lsof -i -P -n | grep LISTEN | grep python" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "source": [ 
234 | "View devices available in this session." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 11, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "/job:worker/replica:0/task:0/device:CPU:0\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "devices = sess.list_devices()\n", 252 | "for d in devices:\n", 253 | " print(d.name)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 12, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "sess.close()" 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "Python [conda env:tensorflow13]", 271 | "language": "python", 272 | "name": "conda-env-tensorflow13-py" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 2 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython2", 284 | "version": "2.7.13" 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | -------------------------------------------------------------------------------- /DOWNPOUR-Easy/DOWNPOUR.py: -------------------------------------------------------------------------------- 1 | """DOWNPOUR Easy 2 | 3 | Performs asynchronous updates with update window. 4 | Uses SGD on the local level for updates instead of Adagrad. 
"""DOWNPOUR Easy

Performs asynchronous updates with an update (communication) window.
Uses SGD on the local level for updates instead of Adagrad.

Author: Tommy Mulc
"""

from __future__ import print_function
import tensorflow as tf
import argparse
import time
import os

FLAGS = None  # populated by argparse in the __main__ block below
log_dir = '/logdir'

def main():
    """Builds the DOWNPOUR graph and runs one cluster role.

    Reads FLAGS.job_name ('ps' or 'worker') and FLAGS.task_index to decide
    which server this process runs.  A parameter server blocks forever in
    server.join(); a worker builds a private (local-variable) replica of the
    model, accumulates `update_window` gradients locally with SGD, and pushes
    the summed gradients to the global, ps-hosted copies of the variables.
    """
    # Configure
    config=tf.ConfigProto(log_device_placement=False)

    # Server Setup
    cluster_spec = {'ps':['localhost:2222'],
        'worker':['localhost:2223','localhost:2224']}
    n_pss = len(cluster_spec['ps']) #the number of parameter servers
    n_workers = len(cluster_spec['worker']) #the number of worker nodes
    cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes

    if FLAGS.job_name == 'ps': #checks if parameter server
        server = tf.train.Server(cluster,
            job_name="ps",
            task_index=FLAGS.task_index,
            config=config)
        server.join()  # ps never returns; it only serves variables
    else: #it must be a worker server
        is_chief = (FLAGS.task_index == 0) #checks if this is the chief node
        server = tf.train.Server(cluster,
            job_name="worker",
            task_index=FLAGS.task_index,
            config=config)

        # Graph
        # Local operations: pinned to this worker so each worker owns a
        # private copy of the model variables (LOCAL_VARIABLES collection).
        with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index):
            a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            c=a+b

            local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step',
                collections=['local_non_trainable'])
            lr = .0001
            # NOTE(review): the local learning rate is scaled by the task
            # index, so worker 0 (the chief) trains locally with lr == 0 and
            # its local SGD steps change nothing.  The non-easy DOWNPOUR
            # example uses a constant lr here -- confirm this is intentional.
            loptimizer = tf.train.GradientDescentOptimizer(lr*FLAGS.task_index) #local optimizer

            target = tf.constant(100.,shape=[2],dtype=tf.float32)
            loss = tf.reduce_mean(tf.square(c-target))

            # DOWNPOUR
            update_window = 3 # T: communication window
            grad_list = [] # array to store the gradients through the communication window
            for t in range(update_window):
                if t != 0:
                    # Chain each step after the previous local update so the
                    # window's gradient computations happen sequentially.
                    with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run
                        grads, varss = zip(*loptimizer.compute_gradients(
                            loss,var_list=tf.local_variables()))
                else:
                    grads, varss = zip(*loptimizer.compute_gradients(
                        loss,var_list=tf.local_variables()))
                grad_list.append(grads) #add gradients to the list
                opt_local = loptimizer.apply_gradients(zip(grads,varss),
                    global_step=local_step) #update local parameters

            grads = tf.reduce_sum(grad_list,axis=0) #sum updates before applying globally
            grads = tuple([grads[i]for i in range(len(varss))])

        # Global operations: variables placed on the parameter servers.
        with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss,
                worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):

            global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')

            # all workers use the same learning rate and it is decided on by the task 0
            # or maybe the from the graph of the chief worker
            optimizer = tf.train.AdagradOptimizer(lr) #global optimizer

            # create global variables and/or references
            local_to_global, global_to_local = create_global_variables()
            opt = optimizer.apply_gradients(
                zip(grads,[local_to_global[v] for v in varss])
                ,global_step=global_step) #apply the gradients to variables on ps

            # Pull params from global server (only after the push `opt` ran)
            with tf.control_dependencies([opt]):
                assign_locals = assign_global_to_local(global_to_local)

        # Grab global state before training so all workers have same initialization
        grab_global_init = assign_global_to_local(global_to_local)

        # Assigns local values to global ones for chief to execute
        assign_global = assign_local_to_global(local_to_global)

        # Init ops
        init = tf.global_variables_initializer() # for global variables
        init_local = tf.variables_initializer(tf.local_variables() \
            +tf.get_collection('local_non_trainable')) #for local variables

        # Session
        stop_hook = tf.train.StopAtStepHook(last_step=60)
        hooks = [stop_hook]
        scaff = tf.train.Scaffold(init_op=init,local_init_op=[init_local])

        # Monitored Training Session
        sess = tf.train.MonitoredTrainingSession(master=server.target,
            is_chief=is_chief,
            config=config,
            scaffold=scaff,
            hooks=hooks,
            save_checkpoint_secs=1,
            checkpoint_dir='logdir')

        if is_chief:
            sess.run(assign_global) #Assigns chief's initial values to ps
            time.sleep(10) #grace period to wait on other workers before starting training

        # Train until hook stops session
        print('Starting training on worker %d'%FLAGS.task_index)
        sess.run(grab_global_init)
        while not sess.should_stop():
            _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step])

            print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls))

            time.sleep(1) # so we can observe training
        print('Done',FLAGS.task_index)

        time.sleep(10) #grace period to wait before closing session
        sess.close()
        print('Session from worker %d closed cleanly'%FLAGS.task_index)


def assign_global_to_local(global_to_local):
    """Assigns global variable values to local variables.

    global_to_local : dictionary with corresponding local variable for global key
    """
    r = []
    for v in global_to_local.keys():
        r.append(tf.assign(global_to_local[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def assign_local_to_global(local_to_global):
    """Assigns local variable values to global variables.

    local_to_global : dictionary with corresponding global variable for local key
    """
    r= []
    for v in local_to_global.keys():
        r.append(tf.assign(local_to_global[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def get_variable_by_name(name):
    """Returns the variable of given name.

    name : the name of the variable (looked up in the 'variables' collection)
    """
    return [v for v in tf.get_collection('variables') if v.name == name][0]


def get_global_variable_by_name(name):
    """Returns the global variable of given name.

    name : the name of the global variable
    """
    # return [v for v in tf.variables() if v.name == name][0]
    return [v for v in tf.global_variables() if v.name == name][0]


def create_global_variables():
    """Creates global variables for local variables on the graph.

    Returns dictionaries for local-to-global and global-to-local
    variable mappings.
    """
    local_to_global = {}
    global_to_local = {}
    with tf.device('/job:ps/task:0'):
        for v in tf.local_variables():
            # Mirror each local variable with a ps-hosted twin named 'g/<name>'.
            v_g = tf.get_variable('g/'+v.op.name,
                shape = v.shape,
                dtype = v.dtype,
                trainable=True,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])
            local_to_global[v] = v_g
            global_to_local[v_g] = v
    return local_to_global,global_to_local


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Flags for defining the tf.train.ClusterSpec
    parser.add_argument(
        "--job_name",
        type=str,
        default="",
        help="One of 'ps', 'worker'"
    )
    # Flags for defining the tf.train.Server
    parser.add_argument(
        "--task_index",
        type=int,
        default=0,
        help="Index of task within the job"
    )
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS.task_index)
    main()
"""DOWNPOUR

Performs asynchronous updates with update window.

Author: Tommy Mulc
"""


from __future__ import print_function
import tensorflow as tf
import argparse
import time
import os


FLAGS = None  # populated by argparse in the __main__ block below
log_dir = '/logdir'

def main():
    """Builds the DOWNPOUR graph and runs one cluster role.

    Reads FLAGS.job_name ('ps' or 'worker') and FLAGS.task_index to decide
    which server this process runs.  A parameter server blocks forever in
    server.join(); a worker builds a private (local-variable) replica of the
    model, accumulates `update_window` gradients locally with Adagrad, and
    pushes the summed gradients to the global, ps-hosted variable copies.
    """
    # Configure
    config=tf.ConfigProto(log_device_placement=False)

    #Server Setup
    cluster_spec = {'ps':['localhost:2222'],
        'worker':['localhost:2223','localhost:2224']}
    n_pss = len(cluster_spec['ps']) #the number of parameter servers
    n_workers = len(cluster_spec['worker']) #the number of worker nodes
    cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes

    if FLAGS.job_name == 'ps': #checks if parameter server
        server = tf.train.Server(cluster,
            job_name="ps",
            task_index=FLAGS.task_index,
            config=config)
        server.join()  # ps never returns; it only serves variables
    else: #it must be a worker server
        is_chief = (FLAGS.task_index == 0) #checks if this is the chief node
        server = tf.train.Server(cluster,
            job_name="worker",
            task_index=FLAGS.task_index,
            config=config)

        # Graph
        # Local operations: pinned to this worker so each worker owns a
        # private copy of the model variables (LOCAL_VARIABLES collection).
        with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index):
            a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            c=a+b

            local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step',
                collections=['local_non_trainable'])
            lr = .0001

            #loptimizer = tf.train.GradientDescentOptimizer(lr*FLAGS.task_index) #local optimizer
            loptimizer = tf.train.AdagradOptimizer(lr) #local optimizer

            target = tf.constant(100.,shape=[2],dtype=tf.float32)
            loss = tf.reduce_mean(tf.square(c-target))

            # DOWNPOUR
            update_window = 3 # T: communication window
            grad_list = [] # the array to store the gradients through the communication window
            for t in range(update_window):
                if t != 0:
                    # Chain each step after the previous local update so the
                    # window's gradient computations happen sequentially.
                    with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run
                        grads, varss = zip(*loptimizer.compute_gradients( \
                            loss,var_list=tf.local_variables()))
                else:
                    grads, varss = zip(*loptimizer.compute_gradients( \
                        loss,var_list=tf.local_variables()))
                grad_list.append(grads) #add gradients to the list
                opt_local = loptimizer.apply_gradients(zip(grads,varss),
                    global_step=local_step) #update local parameters

            grads = tf.reduce_sum(grad_list,axis=0) #sum updates before applying globally
            grads = tuple([grads[i]for i in range(len(varss))])

        # Adagrad created accumulator slots in the GLOBAL collection;
        # add these variables created by local optimizer to local collection
        lopt_vars = add_global_variables_to_local_collection()

        # delete the variables from the global collection
        clear_global_collection()

        # Global operations: variables placed on the parameter servers.
        with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss,
                worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):
            global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')

            # all workers use the same learning rate and it is decided on by the task 0
            # or maybe the from the graph of the chief worker
            optimizer = tf.train.AdagradOptimizer(lr) #global optimizer

            # create global variables and/or references
            local_to_global, global_to_local = create_global_variables(lopt_vars)
            opt = optimizer.apply_gradients(
                zip(grads,[local_to_global[v] for v in varss])
                ,global_step=global_step) #apply the gradients to variables on ps

            # Pull params from global server (only after the push `opt` ran)
            with tf.control_dependencies([opt]):
                assign_locals = assign_global_to_local(global_to_local)

        # Grab global state before training so all workers have same initialization
        grab_global_init = assign_global_to_local(global_to_local)

        # Assigns local values to global ones for chief to execute
        assign_global = assign_local_to_global(local_to_global)

        # Init ops
        init = tf.global_variables_initializer() # for global variables
        init_local = tf.variables_initializer(tf.local_variables() \
            +tf.get_collection('local_non_trainable')) #for local variables

        # Session
        stop_hook = tf.train.StopAtStepHook(last_step=60)
        hooks = [stop_hook]
        scaff = tf.train.Scaffold(init_op=init,local_init_op=[init_local])

        # Monitored Training Session
        sess = tf.train.MonitoredTrainingSession(master=server.target,
            is_chief=is_chief,
            config=config,
            scaffold=scaff,
            hooks=hooks,
            save_checkpoint_secs=1,
            checkpoint_dir='logdir')

        if is_chief:
            sess.run(assign_global) #Assigns chief's initial values to ps
            time.sleep(10) #grace period to wait on other workers before starting training

        # Train until hook stops session
        print('Starting training on worker %d'%FLAGS.task_index)
        sess.run(grab_global_init)
        while not sess.should_stop():
            _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step])

            print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls))

            time.sleep(1) # so we can observe training
        print('Done',FLAGS.task_index)

        time.sleep(10) #grace period to wait before closing session
        sess.close()
        print('Session from worker %d closed cleanly'%FLAGS.task_index)


def assign_global_to_local(global_to_local):
    """Assigns global variable values to local variables.

    global_to_local : dictionary with corresponding local variable for global key
    """
    r = []
    for v in global_to_local.keys():
        r.append(tf.assign(global_to_local[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def assign_local_to_global(local_to_global):
    """Assigns local variable values to global variables.

    local_to_global : dictionary with corresponding global variable for local key
    """
    r= []
    for v in local_to_global.keys():
        r.append(tf.assign(local_to_global[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def get_variable_by_name(name):
    """Returns the variable of given name.

    name : the name of the variable (looked up in the 'variables' collection)
    """
    return [v for v in tf.get_collection('variables') if v.name == name][0]


def get_global_variable_by_name(name):
    """Returns the global variable of given name.

    name : the name of the global variable
    """
    # return [v for v in tf.variables() if v.name == name][0]
    return [v for v in tf.global_variables() if v.name == name][0]


def create_global_variables(local_optimizer_vars = []):
    """Creates global variables for local variables on the graph.
    Skips local variables that were created by the local optimizer
    (passed in via local_optimizer_vars).

    Returns dictionaries for local-to-global and global-to-local
    variable mappings.
    """
    local_to_global = {}
    global_to_local = {}
    with tf.device('/job:ps/task:0'):
        for v in tf.local_variables():
            if v not in local_optimizer_vars:
                # Mirror each local variable with a ps-hosted twin 'g/<name>'.
                v_g = tf.get_variable('g/'+v.op.name,
                    shape = v.shape,
                    dtype = v.dtype,
                    trainable=True,
                    collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                        tf.GraphKeys.TRAINABLE_VARIABLES])
                local_to_global[v] = v_g
                global_to_local[v_g] = v
    return local_to_global,global_to_local


def add_global_variables_to_local_collection():
    """Adds all variables from the global collection
    to the local collection.

    Returns the list of variables added.

    NOTE: reads TensorFlow's private Graph._collections attribute.
    """
    r =[]
    for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]:
        tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var)
        r.append(var)
    return r


def clear_global_collection():
    """Removes all variables from global collection.

    NOTE: mutates TensorFlow's private Graph._collections attribute.
    """
    g = tf.get_default_graph()
    for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])):
        del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Flags for defining the tf.train.ClusterSpec
    parser.add_argument(
        "--job_name",
        type=str,
        default="",
        help="One of 'ps', 'worker'"
    )
    # Flags for defining the tf.train.Server
    parser.add_argument(
        "--task_index",
        type=int,
        default=0,
        help="Index of task within the job"
    )
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS.task_index)
    main()
Additionally, there is a communication window which serves as a time buffer for updates to the parameter server (although the original paper set the communication window to one, which voided the need for this buffer). 4 | -------------------------------------------------------------------------------- /DOWNPOUR/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python DOWNPOUR.py --job_name "ps" --task_index 0 & 3 | python DOWNPOUR.py --job_name "worker" --task_index 0 & 4 | python DOWNPOUR.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Distributed-Setup/README.md: -------------------------------------------------------------------------------- 1 | There are many ways to set up a session in a distributed setting but we demonstrate two in this example: 2 | 3 | 1. Monitored Training Session 4 | 2. Supervisor Session 5 | 6 | The Monitored Training Session is the best option because it can handle many hooks and can be used for synchronous training. The Supervisor Session offers support for handling threads and can be used for some distributed training, but overall offers less than the Monitored Training Session. The schema for this directory is as follows 7 | 8 | * `dist_setup.py` -- python code for Monitored Training Session 9 | * `dist_setup_sup.py` -- python code for Supervisor Session 10 | * `run.sh` -- bash script for Monitored Training Session 11 | * `run_sup.sh` -- bash script for Supervisor Session -------------------------------------------------------------------------------- /Distributed-Setup/dist_setup.py: -------------------------------------------------------------------------------- 1 | """Simple example with one parameter server and one worker. 
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import argparse 10 | import time 11 | import os 12 | 13 | 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Distributed Baggage 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223'] 22 | }) #lets this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster, 25 | job_name="ps", 26 | task_index=FLAGS.task_index) 27 | server.join() 28 | else: 29 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 30 | server = tf.train.Server(cluster, 31 | job_name="worker", 32 | task_index=FLAGS.task_index) 33 | 34 | # Graph 35 | with tf.device('/cpu:0'): 36 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 37 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | c=a+b 39 | 40 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 41 | loss = tf.reduce_mean(tf.square(c-target)) 42 | 43 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 44 | 45 | # Session 46 | # Monitored Training Session 47 | sess = tf.train.MonitoredTrainingSession( 48 | master=server.target, 49 | is_chief=is_chief) 50 | for i in range(1000): 51 | if sess.should_stop(): break 52 | sess.run(opt) 53 | if i % 10 == 0: 54 | r = sess.run(c) 55 | print(r) 56 | time.sleep(.1) 57 | sess.close() 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | 
-------------------------------------------------------------------------------- /Distributed-Setup/dist_setup_sup.py: -------------------------------------------------------------------------------- 1 | """Simple example with one parameter server and one worker. 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import argparse 10 | import time 11 | import os 12 | 13 | 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Distributed Baggage 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223'] 22 | }) #lets this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster,job_name="ps",task_index=FLAGS.task_index) 25 | server.join() 26 | else: 27 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 28 | server = tf.train.Server(cluster,job_name="worker",task_index=FLAGS.task_index) 29 | 30 | # Graph 31 | with tf.device('/cpu:0'): 32 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 33 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 34 | c=a+b 35 | 36 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 37 | loss = tf.reduce_mean(tf.square(c-target)) 38 | 39 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 40 | 41 | # Session 42 | # Supervisor 43 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir,is_chief=is_chief,save_model_secs=30) 44 | sess = sv.prepare_or_wait_for_session(server.target) 45 | for i in range(1000): 46 | if sv.should_stop(): break 47 | sess.run(opt) 48 | if i % 10 == 0: 49 | r = sess.run(c) 50 | print(r) 51 | time.sleep(.1) 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | # Flags for defining the tf.train.ClusterSpec 56 | parser.add_argument( 57 | "--job_name", 58 | type=str, 59 | default="", 60 | help="One of 'ps', 'worker'" 61 | ) 62 | # 
Flags for defining the tf.train.Server 63 | parser.add_argument( 64 | "--task_index", 65 | type=int, 66 | default=0, 67 | help="Index of task within the job" 68 | ) 69 | FLAGS, unparsed = parser.parse_known_args() 70 | main() 71 | -------------------------------------------------------------------------------- /Distributed-Setup/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python dist_setup.py --job_name "ps" --task_index 0 & 3 | python dist_setup.py --job_name "worker" --task_index 0 & 4 | -------------------------------------------------------------------------------- /Distributed-Setup/run_sup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python dist_setup_sup.py --job_name "ps" --task_index 0 & 3 | python dist_setup_sup.py --job_name "worker" --task_index 0 & 4 | -------------------------------------------------------------------------------- /Hogwild/Hogwild.py: -------------------------------------------------------------------------------- 1 | """Hogwild! 2 | 3 | Asynchronous updates with 1 parameter server and 2 workers. 4 | The updates happen 'hogwild' style so the parameters are 5 | never locked. 
6 | 7 | Author: Tommy Mulc 8 | """ 9 | 10 | from __future__ import print_function 11 | import tensorflow as tf 12 | import argparse 13 | import time 14 | import os 15 | FLAGS = None 16 | log_dir = '/logdir' 17 | 18 | def main(): 19 | # Server Setup 20 | cluster = tf.train.ClusterSpec({ 21 | 'ps':['localhost:2222'], 22 | 'worker':['localhost:2223','localhost:2224'] 23 | }) #allows this node know about all other nodes 24 | if FLAGS.job_name == 'ps': #checks if parameter server 25 | server = tf.train.Server(cluster, 26 | job_name="ps", 27 | task_index=FLAGS.task_index) 28 | server.join() 29 | else: 30 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 31 | server = tf.train.Server(cluster, 32 | job_name="worker", 33 | task_index=FLAGS.task_index) 34 | 35 | # Graph 36 | with tf.device('/cpu:0'): 37 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 39 | c=a+b 40 | 41 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 42 | loss = tf.reduce_mean(tf.square(c-target)) 43 | 44 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 45 | 46 | # Session 47 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir, 48 | is_chief=is_chief, 49 | save_model_secs=30) 50 | sess = sv.prepare_or_wait_for_session(server.target) 51 | for i in range(1000): 52 | if sv.should_stop(): break 53 | sess.run(opt) 54 | if i % 10 == 0: 55 | r = sess.run(c) 56 | print(r) 57 | time.sleep(.1) 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | 
-------------------------------------------------------------------------------- /Hogwild/README.md: -------------------------------------------------------------------------------- 1 | ## HogWild! 2 | 3 | The famous, lock-free approach to SGD. Have a bunch of workers and parameter server, then let the workers update the variables whenever they want. 4 | -------------------------------------------------------------------------------- /Hogwild/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python Hogwild.py --job_name "ps" --task_index 0 & 3 | python Hogwild.py --job_name "worker" --task_index 0 & 4 | python Hogwild.py --job_name "worker" --task_index 1 & 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Tommy Mulc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/README.md: -------------------------------------------------------------------------------- 1 | ## Multiple GPUs Single Machine 2 | 3 | Use environment variables to manually override the available GPUs in a TensorFlow process. There is a way to do this without using environment variables, but it's not worth the effort (if you really need this, you can remap the available devices so the GPU you want to use is labeled as device 0, then set visible devices to 0). 4 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/dist_mult_gpu_sing_mach.py: -------------------------------------------------------------------------------- 1 | """Asynchronous training on multiple GPUs on the same machine. 
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | 14 | def main(): 15 | # Server Setup 16 | cluster = tf.train.ClusterSpec({ 17 | 'ps':['localhost:2222'], 18 | 'worker':['localhost:2223','localhost:2224'] 19 | }) #allows this node know about all other nodes 20 | if FLAGS.job_name == 'ps': #checks if parameter server 21 | with tf.device('/cpu:0'): 22 | server = tf.train.Server(cluster, 23 | job_name="ps", 24 | task_index=FLAGS.task_index) 25 | server.join() 26 | else: 27 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 28 | server = tf.train.Server(cluster,job_name="worker", 29 | task_index=FLAGS.task_index,config=config) 30 | # Graph 31 | with tf.device('/gpu:0'): 32 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 33 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 34 | c=a+b 35 | 36 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 37 | loss = tf.reduce_mean(tf.square(c-target)) 38 | 39 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 40 | 41 | # Session 42 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir, 43 | is_chief=is_chief, 44 | save_model_secs=30) 45 | gpu_options = tf.GPUOptions(allow_growth=True, 46 | allocator_type="BFC", 47 | visible_device_list="%d"%FLAGS.task_index) 48 | config = tf.ConfigProto(gpu_options=gpu_options, 49 | allow_soft_placement=True) 50 | sess = sv.prepare_or_wait_for_session(server.target,config=config) 51 | for i in range(1000): 52 | if sv.should_stop(): break 53 | sess.run(opt) 54 | if i % 10 == 0: 55 | r = sess.run(c) 56 | print(r) 57 | time.sleep(.1) 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 
68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/dist_mult_gpu_sing_mach.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=-1 3 | python dist_mult_gpu_sing_mach.py --job_name "ps" --task_index 0 & 4 | export CUDA_VISIBLE_DEVICES=0 5 | python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 0 & 6 | export CUDA_VISIBLE_DEVICES=1 7 | python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 1 & 8 | -------------------------------------------------------------------------------- /Non-Distributed_Setup.py: -------------------------------------------------------------------------------- 1 | """The non-distributed solution to the problem. 
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import time 9 | 10 | def main(): 11 | # Graph 12 | with tf.device('/cpu:0'): 13 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 14 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 15 | c=a+b 16 | 17 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 18 | loss = tf.reduce_mean(tf.square(c-target)) 19 | 20 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 21 | 22 | # Session 23 | sv = tf.train.Supervisor() 24 | sess = sv.prepare_or_wait_for_session() 25 | for i in range(1000): 26 | sess.run(opt) 27 | if i % 10 == 0: 28 | r = sess.run(c) 29 | print(r) 30 | time.sleep(.1) 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed TensorFlow Guide 2 | 3 | 4 | This guide is a collection of distributed training examples (that can act as boilerplate code) and a tutorial of basic distributed TensorFlow. Many of the examples focus on implementing well-known distributed training schemes, such as those available in [*dist-keras*](https://github.com/cerndb/dist-keras) which were discussed in the author's [blog post](http://joerihermans.com/ramblings/distributed-deep-learning-part-1-an-introduction/). 5 | 6 |
7 | 8 |
9 | 10 | Almost all the examples can be run on a single machine with a CPU, and all the examples only use data-parallelism (i.e., between-graph replication). 11 | 12 | The motivation for this guide stems from the current state of distributed deep learning. Deep learning papers typical demonstrate successful new architectures on some benchmark, but rarely show how these models can be trained with 1000x the data which is usually the requirement in industy. Furthermore, most successful distributed cases use state-of-the-art hardware to bruteforce massive effective minibatches in a synchronous fashion across high-bandwidth networks; there has been little research showing the potential of asynchronous training (which is why there are a lot of those examples in this guide). Finally, the lack of documenation for distributed TF was the largest motivator for this project. TF is a great tool that prides itself on its scalability, but unfortunately there are few examples that show how to make your model scale with data size. 13 | 14 | The aim of this guide is to aid all interested in distributed deep learning, from beginners to researchers. 15 | 16 | ## Basics Tutorial 17 | 18 | See the Basics-Tutorial folder for notebooks demonstrating core concepts used in distributed TensorFlow. The rest of the examples assume understanding of the basics tutorial. 19 | 20 | * [`Servers.ipynb`](Basics-Tutorial/Servers.ipynb) -- basics of TensorFlow servers 21 | * [`Parameter-Server.ipynb`](Basics-Tutorial/Parameter-Server.ipynb) -- everything about parameter servers 22 | * [`Local-then-Global-Variables.ipynb`](Basics-Tutorial/Local-then-Global-Variables.ipynb) -- creates a graph locally then make global copies of the variables Useful for graphs that do local updates before pushing global updates (e.g., DOWNPOUR, ADAG, etc.) 
23 | * [`Multiple-Workers`](Basics-Tutorial/Multiple-Workers/) -- contains three notebooks: one parameter server notebook and two worker notebooks The exercise shows how global variables are communicated via the parameter server and how local updates can be made by explicitly placing ops on local devices 24 | 25 | 26 | ## Training Algorithm Examples 27 | 28 | The complete list of examples is below. The first example, [`Non-Distributed-Setup`](Non-Distributed_Setup.py), shows the basic learning problem we want to solve distributively; this example should be familiar to all since it doesn't use any distributed code. The second example, [`Distributed-Setup`](Distributed-Setup/), shows the same problem being solved with distributed code (i.e., with one parameter server and one worker). The remaining examples are a mix of synchronous and non-synchronous training schemes. 29 | 30 | * [`Non-Distributed-Setup`](Non-Distributed_Setup.py) 31 | * [`Distributed-Setup`](Distributed-Setup) 32 | * [`HogWild`](Hogwild) (Asychronous SGD) 33 | * [`DOWNPOUR`](DOWNPOUR) 34 | * [`DOWNPOUR-Easy`](DOWNPOUR-Easy/)1 35 | * [`AGN`](AGN) (Accumulated Gradient Normalization) 36 | * [`Synchronous-SGD`](Synchronous-SGD/) 37 | * [`Synchronous-SGD-different-learning-rates`](Synchronous-SGD-different-learning-rates/) 38 | * [`SAGN`](SAGN) (Synchronous Accumulated Gradients Normalization) 39 | * [`Multiple-GPUs-Single-Machine`](Multiple-GPUs-Single-Machine/) 40 | * `Dynamic SGD` **TODO** 41 | * `Asynchronous Elastic Averaging SGD` (AEASGD) **TODO** 42 | * `Asynchronous Elastic Averaging Momentum SGD` (AEAMSGD) **TODO** 43 | 44 | 45 | 1This is the same as the DOWNPOUR example except that is uses SGD on the workers instead of Adagrad. 46 | 47 | ## Running Training Algorithm Examples 48 | All the training examples (except the non-distributed example) live in a folder. To run them, move to the example directory and run the bash script. 
49 | 50 | ```bash 51 | cd / 52 | bash run.sh 53 | ``` 54 | 55 | In order to completely stop the example, you'll need to kill the python processes associated with it. If you want to stop training early, then there will be python processes for each of the workers in addition to the parameter server processes. Unfortunately, the parameter server processes continue to run even after the workers are finished--these will always need to be killed manually. To kill all python processes, run pkill. 56 | 57 | ```bash 58 | sudo pkill python 59 | ``` 60 | 61 | ## Requirements 62 | 63 | * Python 2.7 64 | * TensorFlow >= 1.2 65 | 66 | 67 | ## Links 68 | * [Official Documentation](https://www.tensorflow.org/deploy/distributed) 69 | * [Threads and Queues](https://www.tensorflow.org/programmers_guide/threading_and_queues) 70 | * [More TensorFlow Documentation](https://www.tensorflow.org/api_guides/python/train#Distributedexecution) 71 | 72 | ## Glossary 73 | * [Server](https://www.tensorflow.org/api_docs/python/tf/train/Server) -- encapsulates a Session target and belongs to a cluster 74 | * [Coordinator](https://www.tensorflow.org/api_docs/python/tf/train/Coordinator) -- coordinates threads 75 | * [Session Manager](https://www.tensorflow.org/api_docs/python/tf/train/SessionManager) -- restores session and initialized variables and coordinates threads 76 | * [Supervisor](https://www.tensorflow.org/api_docs/python/tf/train/Supervisor) -- good for threads. Coordinator, Saver, and Session Manager. > Session Manager 77 | * [Session Creator](https://www.tensorflow.org/api_docs/python/tf/train/SessionCreator) -- Factory for creating a session? 78 | * [Monitored Session](https://www.tensorflow.org/api_docs/python/tf/train/MonitoredSession) -- Session. initialization, hooks, recovery. 
79 | * [Monitored Training Session](https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession) -- only distributed solution for sync optimization 80 | * [Sync Replicas](https://www.tensorflow.org/api_docs/python/tf/train/SyncReplicasOptimizer) -- wrapper of optimizer for synchronous optimization 81 | * [Scaffold](https://www.tensorflow.org/api_docs/python/tf/train/Scaffold) -- holds lots of meta training settings and passed to Session creator 82 | 83 | ### Hooks 84 | * [LoggingTensorHook](https://www.tensorflow.org/api_docs/python/tf/train/LoggingTensorHook) -- prints tensors every *N* steps or seconds 85 | * [StopAtStepHook](https://www.tensorflow.org/api_docs/python/tf/train/StopAtStepHook) -- requests to stop training at a certain step 86 | * [StepCounterHook](https://www.tensorflow.org/api_docs/python/tf/train/StepCounterHook) -- counts steps per second 87 | * [CheckpointSaverHook](https://www.tensorflow.org/api_docs/python/tf/train/CheckpointSaverHook) -- saves new checkpoint every *N* steps or seconds 88 | * [NanTensorHook](https://www.tensorflow.org/api_docs/python/tf/train/NanTensorHook) -- stops training if loss is NaN 89 | * [SummarySaverHook](https://www.tensorflow.org/api_docs/python/tf/train/SummarySaverHook) -- saves summaries every *N* steps or seconds 90 | * [GlobalStepWaiterHook](https://www.tensorflow.org/api_docs/python/tf/train/GlobalStepWaiterHook) -- waits until global step reaches threshold before training 91 | * [FinalOpsHook](https://www.tensorflow.org/api_docs/python/tf/train/FinalOpsHook) -- runs specified ops before closing session 92 | * [FeedFnHook](https://www.tensorflow.org/api_docs/python/tf/train/FeedFnHook) -- assigns feed_dict 93 | 94 | ## Algorithm References 95 | 96 | * [Hogwild!](https://people.eecs.berkeley.edu/~brecht/papers/hogwildTR.pdf) 97 | * [DOWNPOUR](https://static.googleusercontent.com/media/research.google.com/en//archive/large_deep_networks_nips2012.pdf) 98 | * 
[ADAG](http://joerihermans.com/ramblings/distributed-deep-learning-part-1-an-introduction/) 99 | * [AGN](https://arxiv.org/abs/1710.02368) 100 | * [EASGD and EAMSGD](https://arxiv.org/abs/1412.6651) 101 | -------------------------------------------------------------------------------- /SAGN/README.md: -------------------------------------------------------------------------------- 1 | ## SDAG (Synchronous Accumulated Gradient Normalization) 2 | 3 | A hybrid of SSGD and AGN. This method averages gradients over the communication window but apply updates to the ps variables synchronously. 4 | -------------------------------------------------------------------------------- /SAGN/SAGN.py: -------------------------------------------------------------------------------- 1 | """Synchronous Accumulated Gradients Normalization (SGAN) 2 | 3 | Performs synchronous updates with gradients averaged 4 | over a time window. 5 | 6 | Author: Tommy Mulc 7 | """ 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Configure 19 | config=tf.ConfigProto(log_device_placement=False) 20 | 21 | # Server Setup 22 | cluster_spec = { 23 | 'ps':['localhost:2222'], 24 | 'worker':['localhost:2223','localhost:2224'] 25 | } #allows this node know about all other nodes 26 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 27 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 28 | cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes 29 | 30 | if FLAGS.job_name == 'ps': #checks if parameter server 31 | server = tf.train.Server(cluster, 32 | job_name="ps", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | server.join() 36 | else: #it must be a worker server 37 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 38 | server = tf.train.Server(cluster, 39 | 
job_name="worker", 40 | task_index=FLAGS.task_index, 41 | config=config) 42 | # Graph 43 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 44 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 45 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 46 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 47 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 48 | c=a+b 49 | 50 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False, 51 | name='local_step',collections=['local_non_trainable']) 52 | 53 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 54 | loss = tf.reduce_mean(tf.square(c-target)) 55 | 56 | base_lr = .0001 57 | loptimizer = tf.train.AdamOptimizer(base_lr) 58 | # loptimizer = tf.train.GradientDescentOptimizer(base_lr) 59 | 60 | # SDAG (simplest case since all batches are the same) 61 | update_window = 5 # T: communication window 62 | grad_list = [] # the array to store the gradients through the communication window 63 | for t in range(update_window): 64 | if t != 0: 65 | #compute gradients only if the local opt was run 66 | with tf.control_dependencies([opt_local]): 67 | grads, varss = zip(*loptimizer.compute_gradients( \ 68 | loss,var_list=tf.local_variables())) 69 | else: 70 | grads, varss = zip(*loptimizer.compute_gradients( \ 71 | loss,var_list=tf.local_variables())) 72 | #add gradients to the list 73 | grad_list.append(grads) 74 | #update local parameters 75 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 76 | global_step=local_step) 77 | 78 | # averages updates before applying globally 79 | grads = tf.reduce_mean(grad_list,axis=0) 80 | grads = tuple([grads[i] for i in range(len(varss))]) 81 | 82 | # add these variables created by local optimizer to local collection 83 | lopt_vars = add_global_variables_to_local_collection() 84 | 85 | # delete the variables from the global collection 86 | clear_global_collection() 87 | 88 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 89 | 
worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 90 | 91 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 92 | 93 | #create global variables and/or references 94 | local_to_global, global_to_local = create_global_variables(lopt_vars) 95 | 96 | optimizer = tf.train.AdamOptimizer(base_lr) 97 | # optimizer = tf.train.GradientDescentOptimizer(base_lr) 98 | optimizer1 = tf.train.SyncReplicasOptimizer(optimizer, 99 | replicas_to_aggregate=2, 100 | total_num_replicas=2) 101 | 102 | #apply the gradients to variables on ps 103 | opt = optimizer1.apply_gradients( 104 | zip(grads,[local_to_global[v] for v in varss]) 105 | ,global_step=global_step) 106 | 107 | with tf.control_dependencies([opt]): 108 | assign_locals = assign_global_to_local(global_to_local) 109 | 110 | # Grab global state before training so all workers have same initialization 111 | grab_global_init = assign_global_to_local(global_to_local) 112 | 113 | # Assigns local values to global ones for chief to execute 114 | assign_global = assign_local_to_global(local_to_global) 115 | 116 | # Initialized global step tokens 117 | init_tokens_op = optimizer1.get_init_tokens_op() 118 | 119 | # Init ops 120 | # gets step token 121 | local_init=optimizer1.local_step_init_op 122 | if is_chief: 123 | # fills token queue and gets token 124 | local_init = optimizer1.chief_init_op 125 | 126 | # indicates if variables are initialized 127 | ready_for_local_init = optimizer1.ready_for_local_init_op 128 | 129 | with tf.control_dependencies([local_init]): 130 | init_local = tf.variables_initializer(tf.local_variables() \ 131 | +tf.get_collection('local_non_trainable')) #for local variables 132 | 133 | init = tf.global_variables_initializer() # must come after other init ops 134 | 135 | # Session 136 | sync_replicas_hook = optimizer1.make_session_run_hook(is_chief) 137 | stop_hook = tf.train.StopAtStepHook(last_step=10) 138 | chief_hooks = [sync_replicas_hook,stop_hook] 139 | 
scaff = tf.train.Scaffold(init_op=init, 140 | local_init_op=init_local, 141 | ready_for_local_init_op=ready_for_local_init) 142 | 143 | #Monitored Training Session 144 | sess = tf.train.MonitoredTrainingSession(master=server.target, 145 | is_chief=is_chief, 146 | config=config, 147 | scaffold=scaff, 148 | hooks=chief_hooks, 149 | stop_grace_period_secs=10) 150 | 151 | if is_chief: 152 | sess.run(assign_global) # Assigns chief's initial values to ps 153 | time.sleep(40) # grace period to wait on other workers before starting training 154 | 155 | # Train until hook stops session 156 | print('Starting training on worker %d'%FLAGS.task_index) 157 | sess.run(grab_global_init) 158 | 159 | 160 | # Train until hook stops session 161 | print('Starting training on worker %d'%FLAGS.task_index) 162 | while not sess.should_stop(): 163 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 164 | # _,r,gs=sess.run([opt,c,global_step]) 165 | print(r,gs,FLAGS.task_index) 166 | if is_chief: time.sleep(1) 167 | time.sleep(1) 168 | print('Done',FLAGS.task_index) 169 | 170 | time.sleep(10) #grace period to wait before closing session 171 | sess.close() 172 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 173 | 174 | 175 | def assign_global_to_local(global_to_local): 176 | """Assigns global variable value to local variables. 177 | 178 | global_to_local : dictionary with corresponding local variable for global key 179 | """ 180 | r = [] 181 | for v in global_to_local.keys(): 182 | r.append(tf.assign(global_to_local[v],v)) 183 | with tf.control_dependencies(r): 184 | a = tf.no_op() 185 | return a 186 | 187 | 188 | def assign_local_to_global(local_to_global): 189 | """Assigns global variable value to local variables. 
190 | 191 | local_to_global : dictionary with corresponding global variable for local key 192 | """ 193 | r= [] 194 | for v in local_to_global.keys(): 195 | r.append(tf.assign(local_to_global[v],v)) 196 | with tf.control_dependencies(r): 197 | a = tf.no_op() 198 | return a 199 | 200 | 201 | def get_variable_by_name(name): 202 | """Returns the variable of given name 203 | 204 | name : the name of the global variable 205 | """ 206 | return [v for v in tf.get_collection('variables') if v.name == name][0] 207 | 208 | 209 | def get_global_variable_by_name(name): 210 | """Returns the global variable of given name. 211 | 212 | name : the name of the global variable 213 | """ 214 | # return [v for v in tf.variables() if v.name == name][0] 215 | return [v for v in tf.global_variables() if v.name == name][0] 216 | 217 | 218 | def create_global_variables(local_optimizer_vars = []): 219 | """Creates global variables for local variables on the graph. 220 | Skips variables local variables that are created for 221 | local optimization. 222 | 223 | Returns dictionarys for local-to-global and global-to-local 224 | variable mappings. 225 | """ 226 | local_to_global = {} 227 | global_to_local = {} 228 | with tf.device('/job:ps/task:0'): 229 | for v in tf.local_variables(): 230 | if v not in local_optimizer_vars: 231 | v_g = tf.get_variable('g/'+v.op.name, 232 | shape = v.shape, 233 | dtype = v.dtype, 234 | trainable=True, 235 | collections=[tf.GraphKeys.GLOBAL_VARIABLES, 236 | tf.GraphKeys.TRAINABLE_VARIABLES]) 237 | local_to_global[v] = v_g 238 | global_to_local[v_g] = v 239 | return local_to_global,global_to_local 240 | 241 | 242 | def add_global_variables_to_local_collection(): 243 | """Adds all variables from the global collection 244 | to the local collection. 245 | 246 | Returns the list of variables added. 
247 | """ 248 | r =[] 249 | for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]: 250 | tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var) 251 | r.append(var) 252 | return r 253 | 254 | 255 | def clear_global_collection(): 256 | """Removes all variables from global collection.""" 257 | g = tf.get_default_graph() 258 | for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])): 259 | del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0] 260 | 261 | 262 | if __name__ == '__main__': 263 | parser = argparse.ArgumentParser() 264 | # Flags for defining the tf.train.ClusterSpec 265 | parser.add_argument( 266 | "--job_name", 267 | type=str, 268 | default="", 269 | help="One of 'ps', 'worker'" 270 | ) 271 | # Flags for defining the tf.train.Server 272 | parser.add_argument( 273 | "--task_index", 274 | type=int, 275 | default=0, 276 | help="Index of task within the job" 277 | ) 278 | FLAGS, unparsed = parser.parse_known_args() 279 | print(FLAGS.task_index) 280 | main() 281 | -------------------------------------------------------------------------------- /SAGN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python SAGN.py --job_name "ps" --task_index 0 & 3 | python SAGN.py --job_name "worker" --task_index 0 & 4 | python SAGN.py --job_name "worker" --task_index 1 & 5 | -------------------------------------------------------------------------------- /Synchronous-SGD-different-learning-rates/README.md: -------------------------------------------------------------------------------- 1 | ## SSGD different learning rates 2 | 3 | Same as vanilla SSGD except each of the workers has its own learning rate. 
4 | -------------------------------------------------------------------------------- /Synchronous-SGD-different-learning-rates/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python ssgd.py --job_name "ps" --task_index 0 & 3 | python ssgd.py --job_name "worker" --task_index 0 & 4 | python ssgd.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Synchronous-SGD-different-learning-rates/ssgd.py: -------------------------------------------------------------------------------- 1 | """Synchrous SGD with different learning rates 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | 14 | def main(): 15 | # Configure 16 | config=tf.ConfigProto(log_device_placement=False) 17 | 18 | # Server Setup 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223','localhost:2224'] 22 | }) #allows this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster, 25 | job_name="ps", 26 | task_index=FLAGS.task_index, 27 | config=config) 28 | server.join() 29 | else: #it must be a worker server 30 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 31 | server = tf.train.Server(cluster, 32 | job_name="worker", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | 36 | # Graph 37 | worker_device = "/job:%s/task:%d/cpu:0" % (FLAGS.job_name,FLAGS.task_index) 38 | with tf.device(tf.train.replica_device_setter(ps_tasks=1, 39 | worker_device=worker_device)): 40 | 41 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 42 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 43 | c = a+b 44 | 45 | global_step = 
tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 46 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 47 | loss = tf.reduce_mean(tf.square(c-target)) 48 | 49 | # all workers use the same learning rate and it is decided on by the task 0 50 | # or maybe the from the graph of the chief worker 51 | base_lr = 1. 52 | optimizer = tf.train.GradientDescentOptimizer(base_lr) 53 | optimizer1 = tf.train.SyncReplicasOptimizer(optimizer, 54 | replicas_to_aggregate=2, 55 | total_num_replicas=2) 56 | 57 | # use different learning rates (hacky) 58 | # only works for GradientDescentOptimizer 59 | lr = .0001 60 | if FLAGS.task_index == 0: 61 | lr = .1 62 | new_lr = lr/base_lr 63 | grads, varss = zip(*optimizer1.compute_gradients(loss)) 64 | grads = [new_lr*2.0*grad for grad in grads] #this sums gradients 65 | opt = optimizer1.apply_gradients(zip(grads,varss),global_step=global_step) 66 | 67 | # Init ops 68 | init_tokens_op = optimizer1.get_init_tokens_op() 69 | 70 | local_init=optimizer1.local_step_init_op # initialize local step 71 | if is_chief: 72 | local_init = optimizer1.chief_init_op # initializes token queue 73 | 74 | ready_for_local_init = optimizer1.ready_for_local_init_op # checks if global vars are init 75 | init = tf.global_variables_initializer() # must come after other init ops 76 | 77 | # Session 78 | sync_replicas_hook = optimizer1.make_session_run_hook(is_chief) 79 | stop_hook = tf.train.StopAtStepHook(last_step=10) 80 | chief_hooks = [sync_replicas_hook,stop_hook] 81 | scaff = tf.train.Scaffold(init_op=init, 82 | local_init_op=local_init, 83 | ready_for_local_init_op=ready_for_local_init) 84 | 85 | #Monitored Training Session 86 | sess = tf.train.MonitoredTrainingSession(master=server.target, 87 | is_chief=is_chief, 88 | config=config, 89 | scaffold=scaff, 90 | hooks=chief_hooks, 91 | stop_grace_period_secs=10) 92 | 93 | if is_chief: 94 | sess.run(init_tokens_op) 95 | time.sleep(40) #grace period to wait on other workers before starting 
training 96 | 97 | print('Starting training on worker %d'%FLAGS.task_index) 98 | while not sess.should_stop(): 99 | _,r,gs=sess.run([opt,c,global_step]) 100 | print(r,'step: ',gs,'worker: ',FLAGS.task_index) 101 | if is_chief: time.sleep(1) 102 | time.sleep(1) 103 | print('Done',FLAGS.task_index) 104 | 105 | time.sleep(10) #grace period to wait before closing session 106 | sess.close() 107 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 108 | 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser() 112 | # Flags for defining the tf.train.ClusterSpec 113 | parser.add_argument( 114 | "--job_name", 115 | type=str, 116 | default="", 117 | help="One of 'ps', 'worker'" 118 | ) 119 | # Flags for defining the tf.train.Server 120 | parser.add_argument( 121 | "--task_index", 122 | type=int, 123 | default=0, 124 | help="Index of task within the job" 125 | ) 126 | FLAGS, unparsed = parser.parse_known_args() 127 | print(FLAGS.task_index) 128 | main() 129 | -------------------------------------------------------------------------------- /Synchronous-SGD/README.md: -------------------------------------------------------------------------------- 1 | ## SSGD (Synchronous SGD) 2 | 3 | Have workers send their updates to a ps, but only update the variables on the ps after *N* updates have been accumulated. If the number of workers is *M* and *M>N*, then this is known as dropping the last *M-N* *stale gradients*. 
4 | -------------------------------------------------------------------------------- /Synchronous-SGD/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python ssgd.py --job_name "ps" --task_index 0 & 3 | python ssgd.py --job_name "worker" --task_index 0 & 4 | python ssgd.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Synchronous-SGD/ssgd.py: -------------------------------------------------------------------------------- 1 | """Synchronous SGD 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | REPLICAS_TO_AGGREGATE = 2 14 | 15 | def main(): 16 | # Configure 17 | config=tf.ConfigProto(log_device_placement=False) 18 | 19 | # Server Setup 20 | cluster = tf.train.ClusterSpec({ 21 | 'ps':['localhost:2222'], 22 | 'worker':['localhost:2223','localhost:2224'] 23 | }) #allows this node know about all other nodes 24 | if FLAGS.job_name == 'ps': #checks if parameter server 25 | server = tf.train.Server(cluster, 26 | job_name="ps", 27 | task_index=FLAGS.task_index, 28 | config=config) 29 | server.join() 30 | else: #it must be a worker server 31 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 32 | server = tf.train.Server(cluster, 33 | job_name="worker", 34 | task_index=FLAGS.task_index, 35 | config=config) 36 | 37 | # Graph 38 | worker_device = "/job:%s/task:%d/cpu:0" % (FLAGS.job_name,FLAGS.task_index) 39 | with tf.device(tf.train.replica_device_setter(ps_tasks=1, 40 | worker_device=worker_device)): 41 | 42 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 43 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 44 | c=a+b 45 | 46 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 47 | target = 
tf.constant(100.,shape=[2],dtype=tf.float32) 48 | loss = tf.reduce_mean(tf.square(c-target)) 49 | 50 | # create an optimizer then wrap it with SynceReplicasOptimizer 51 | optimizer = tf.train.GradientDescentOptimizer(.0001) 52 | optimizer1 = tf.train.SyncReplicasOptimizer(optimizer, 53 | replicas_to_aggregate=REPLICAS_TO_AGGREGATE, total_num_replicas=2) 54 | 55 | opt = optimizer1.minimize(loss,global_step=global_step) # averages gradients 56 | #opt = optimizer1.minimize(REPLICAS_TO_AGGREGATE*loss, 57 | # global_step=global_step) # hackily sums gradients 58 | 59 | # Session 60 | sync_replicas_hook = optimizer1.make_session_run_hook(is_chief) 61 | stop_hook = tf.train.StopAtStepHook(last_step=10) 62 | hooks = [sync_replicas_hook,stop_hook] 63 | 64 | # Monitored Training Session 65 | sess = tf.train.MonitoredTrainingSession(master = server.target, 66 | is_chief=is_chief, 67 | config=config, 68 | hooks=hooks, 69 | stop_grace_period_secs=10) 70 | 71 | print('Starting training on worker %d'%FLAGS.task_index) 72 | while not sess.should_stop(): 73 | _,r,gs=sess.run([opt,c,global_step]) 74 | print(r,'step: ',gs,'worker: ',FLAGS.task_index) 75 | if is_chief: time.sleep(1) 76 | time.sleep(1) 77 | print('Done',FLAGS.task_index) 78 | 79 | time.sleep(10) #grace period to wait before closing session 80 | sess.close() 81 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | # Flags for defining the tf.train.ClusterSpec 87 | parser.add_argument( 88 | "--job_name", 89 | type=str, 90 | default="", 91 | help="One of 'ps', 'worker'" 92 | ) 93 | # Flags for defining the tf.train.Server 94 | parser.add_argument( 95 | "--task_index", 96 | type=int, 97 | default=0, 98 | help="Index of task within the job" 99 | ) 100 | FLAGS, unparsed = parser.parse_known_args() 101 | print(FLAGS.task_index) 102 | main() 103 | -------------------------------------------------------------------------------- 
/imgs/data-parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmulc18/Distributed-TensorFlow-Guide/8e7fec757112a3ab5dccff93e848e7617ef7ed3e/imgs/data-parallelism.png --------------------------------------------------------------------------------