├── .gitignore ├── AGN ├── AGN.py ├── README.md └── run.sh ├── Basics-Tutorial ├── Local-then-Global-Variables.ipynb ├── Multiple-Workers │ ├── Local-then-Global-Variables-Worker1.ipynb │ ├── Local-then-Global-Variables-Worker2.ipynb │ └── Parameter-Server.ipynb ├── Parameter-Server.ipynb ├── README.md └── Servers.ipynb ├── DOWNPOUR-Easy ├── DOWNPOUR.py ├── README.md └── run.sh ├── DOWNPOUR ├── DOWNPOUR.py ├── README.md └── run.sh ├── Distributed-Setup ├── README.md ├── dist_setup.py ├── dist_setup_sup.py ├── run.sh └── run_sup.sh ├── Hogwild ├── Hogwild.py ├── README.md └── run.sh ├── LICENSE ├── Multiple-GPUs-Single-Machine ├── README.md ├── dist_mult_gpu_sing_mach.py └── dist_mult_gpu_sing_mach.sh ├── Non-Distributed_Setup.py ├── README.md ├── SAGN ├── README.md ├── SAGN.py └── run.sh ├── Synchronous-SGD-different-learning-rates ├── README.md ├── run.sh └── ssgd.py ├── Synchronous-SGD ├── README.md ├── run.sh └── ssgd.py └── imgs └── data-parallelism.png /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints* 2 | Basics-Tutorial/Multiple-Workers/.ipynb_checkpoints/* 3 | Basics-Tutorial/Beginner\ Tutorial\ Variables.ipynb 4 | Basics-Tutorial/.ipynb_checkpoints/* 5 | -------------------------------------------------------------------------------- /AGN/AGN.py: -------------------------------------------------------------------------------- 1 | """Asynchronous Distributed Adaptive Gradients (ADAG) 2 | 3 | Formerly known as ADAG. 4 | Performs asynchronous updates with update window. 5 | 6 | Author: Tommy Mulc 7 | """ 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Configure 19 | config=tf.ConfigProto(log_device_placement=False) 20 | 21 | #Server Setup 22 | cluster_spec = { 23 | 'ps':['localhost:2222'], 24 | 'worker':['localhost:2223','localhost:2224'] 25 | } #allows this node know about all other nodes 26 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 27 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 28 | cluster = tf.train.ClusterSpec(cluster_spec) 29 | 30 | if FLAGS.job_name == 'ps': #checks if parameter server 31 | server = tf.train.Server(cluster, 32 | job_name="ps", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | server.join() 36 | else: #it must be a worker server 37 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 38 | server = tf.train.Server(cluster, 39 | job_name="worker", 40 | task_index=FLAGS.task_index, 41 | config=config) 42 | 43 | # Graph 44 | # We must not use train.replicate_device_setter for normal operations 45 | # Local operations 46 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 47 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 48 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 49 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 50 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 51 | c=a+b 52 | 53 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 54 | loss = tf.reduce_mean(tf.square(c-target)) 55 | 56 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False, 57 | name='local_step',collections=['local_non_trainable']) 58 | 59 | lr = .0001 60 | # loptimizer = tf.train.GradientDescentOptimizer(lr) #local optimizer 61 | loptimizer = tf.train.AdamOptimizer(lr) #local optimizer 62 | 63 | # ADAG (simplest case since all batches are the 
same) 64 | update_window = 3 # T: update/communication window 65 | grad_list = [] # the array to store the gradients through the communication window 66 | for t in range(update_window): 67 | if t != 0: 68 | with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run 69 | grads, varss = zip(*loptimizer.compute_gradients(loss, 70 | var_list=tf.local_variables())) 71 | else: 72 | grads, varss = zip(*loptimizer.compute_gradients(loss, 73 | var_list=tf.local_variables())) 74 | grad_list.append(grads) #add gradients to the list 75 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 76 | global_step=local_step) #update local parameters 77 | grads = tf.reduce_mean(grad_list,axis=0) 78 | grads = tuple([grads[i]for i in range(len(varss))]) 79 | 80 | # add these variables created by local optimizer to local collection 81 | lopt_vars = add_global_variables_to_local_collection() 82 | 83 | # delete the variables from the global collection 84 | clear_global_collection() 85 | 86 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 87 | worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 88 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 89 | 90 | # optimizer for central variables 91 | optimizer = tf.train.AdamOptimizer(lr) 92 | # optimizer = tf.train.GradientDescentOptimizer(lr) 93 | 94 | #create global variables and/or references 95 | local_to_global, global_to_local = create_global_variables(lopt_vars) 96 | 97 | opt = optimizer.apply_gradients( 98 | zip(grads,[ local_to_global[v] for v in varss]) 99 | ,global_step=global_step) #apply the gradients to variables on ps 100 | 101 | # Pull param from global server 102 | with tf.control_dependencies([opt]): 103 | assign_locals = assign_global_to_local(global_to_local) 104 | 105 | # Init ops 106 | init_local = tf.variables_initializer(tf.local_variables() \ 107 | +tf.get_collection('local_non_trainable'))#for local variables 108 | init = tf.global_variables_initializer() # for global variables 109 | 110 | # Grab global state before training so all workers have same initialization 111 | grab_global_init = assign_global_to_local(global_to_local) 112 | 113 | # Assigns local values to global ones for chief to execute 114 | assign_global = assign_local_to_global(local_to_global) 115 | 116 | # Session 117 | stop_hook = tf.train.StopAtStepHook(last_step=40) 118 | hooks = [stop_hook] 119 | scaff = tf.train.Scaffold(init_op=init,local_init_op=init_local) 120 | 121 | #Monitored Training Session 122 | sess = tf.train.MonitoredTrainingSession(master=server.target, 123 | is_chief=is_chief, 124 | config=config, 125 | scaffold=scaff, 126 | hooks=hooks, 127 | save_checkpoint_secs=1, 128 | checkpoint_dir='logdir') 129 | if is_chief: 130 | sess.run(assign_global) #Assigns chief's initial values to ps 131 | time.sleep(10) #grace period to wait on other workers before starting training 132 | 133 | # Train until hook stops session 134 | print('Starting training on worker %d'%FLAGS.task_index) 135 | sess.run(grab_global_init) 136 | while not sess.should_stop(): 137 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 138 | print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls)) 139 | time.sleep(1) 140 | print('Done',FLAGS.task_index) 141 | 142 | time.sleep(10) #grace period to wait before closing session 143 | sess.close() 144 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 145 | 146 | 147 | def 
assign_global_to_local(global_to_local): 148 | """ 149 | global_to_local : dictionary with corresponding local variable for global key 150 | 151 | Assigns global variable value to local variables 152 | """ 153 | r = [] 154 | for v in global_to_local.keys(): 155 | r.append(tf.assign(global_to_local[v],v)) 156 | with tf.control_dependencies(r): 157 | a = tf.no_op() 158 | return a 159 | 160 | 161 | def assign_local_to_global(local_to_global): 162 | """Assigns global variable value to local variables. 163 | 164 | local_to_global : dictionary with corresponding global variable for local key 165 | """ 166 | r= [] 167 | for v in local_to_global.keys(): 168 | r.append(tf.assign(local_to_global[v],v)) 169 | with tf.control_dependencies(r): 170 | a = tf.no_op() 171 | return a 172 | 173 | 174 | def get_global_variable_by_name(name): 175 | """Returns the global variable of given name. 176 | 177 | name : the name of the global variable 178 | """ 179 | return [v for v in tf.global_variables() if v.name == name][0] 180 | 181 | 182 | def create_global_variables(local_optimizer_vars = []): 183 | """Creates global variables for local variables on the graph. 184 | Skips variables local variables that are created for 185 | local optimization. 186 | 187 | Returns dictionarys for local-to-global and global-to-local 188 | variable mappings. 189 | """ 190 | local_to_global = {} 191 | global_to_local = {} 192 | with tf.device('/job:ps/task:0'): 193 | for v in tf.local_variables(): 194 | if v not in local_optimizer_vars: 195 | v_g = tf.get_variable('g/'+v.op.name, 196 | shape = v.shape, 197 | dtype = v.dtype, 198 | trainable=True, 199 | collections=[tf.GraphKeys.GLOBAL_VARIABLES, 200 | tf.GraphKeys.TRAINABLE_VARIABLES]) 201 | local_to_global[v] = v_g 202 | global_to_local[v_g] = v 203 | return local_to_global,global_to_local 204 | 205 | 206 | def add_global_variables_to_local_collection(): 207 | """Adds all variables from the global collection 208 | to the local collection. 209 | 210 | Returns the list of variables added. 211 | """ 212 | r =[] 213 | for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]: 214 | tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var) 215 | r.append(var) 216 | return r 217 | 218 | 219 | def clear_global_collection(): 220 | """Removes all variables from global collection.""" 221 | g = tf.get_default_graph() 222 | for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])): 223 | del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0] 224 | 225 | 226 | if __name__ == '__main__': 227 | parser = argparse.ArgumentParser() 228 | # Flags for defining the tf.train.ClusterSpec 229 | parser.add_argument( 230 | "--job_name", 231 | type=str, 232 | default="", 233 | help="One of 'ps', 'worker'" 234 | ) 235 | # Flags for defining the tf.train.Server 236 | parser.add_argument( 237 | "--task_index", 238 | type=int, 239 | default=0, 240 | help="Index of task within the job" 241 | ) 242 | FLAGS, unparsed = parser.parse_known_args() 243 | print(FLAGS.task_index) 244 | main() 245 | -------------------------------------------------------------------------------- /AGN/README.md: -------------------------------------------------------------------------------- 1 | ## AGN (Accumulated Gradient Normalization) 2 | 3 | This method was formerly known as ADAG (Asynchronous Distributed Adaptive Gradients). 4 | 5 | Similar to DOWNPOUR expect that it uses a communications window *T* and accumulates gradients for *T* steps before sending updates to the parameter server. 
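A condensed, single-process sketch of this accumulation loop (based on the loop in `AGN.py`, TF 1.x graph mode; device placement, the parameter-server copies of the variables, and the global apply step are omitted, and plain SGD stands in for the local Adam optimizer):

```python
import tensorflow as tf

update_window = 3                                    # T: communication window
target = tf.constant(100., shape=[2])

# local copy of the parameters
a = tf.Variable(tf.zeros([2]), collections=[tf.GraphKeys.LOCAL_VARIABLES])
loss = tf.reduce_mean(tf.square(a - target))
loptimizer = tf.train.GradientDescentOptimizer(1e-4)  # local optimizer

grad_list = []           # gradients gathered over the window
opt_local = tf.no_op()   # placeholder dependency for the first iteration
for t in range(update_window):
    # only recompute gradients after the previous local update has run
    with tf.control_dependencies([opt_local]):
        grads, varss = zip(*loptimizer.compute_gradients(
            loss, var_list=tf.local_variables()))
    grad_list.append(grads)                                   # remember them
    opt_local = loptimizer.apply_gradients(zip(grads, varss))  # local step

# average the T gradients per variable; this averaged gradient is what the
# worker applies to the corresponding global variables on the parameter server
accumulated = [tf.add_n(g) / float(update_window) for g in zip(*grad_list)]
```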
6 | -------------------------------------------------------------------------------- /AGN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python AGN.py --job_name "ps" --task_index 0 & 3 | python AGN.py --job_name "worker" --task_index 0 & 4 | python AGN.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Basics-Tutorial/Local-then-Global-Variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "cluster_spec = tf.train.ClusterSpec({'worker' : ['localhost:2223'], 'ps' : ['localhost:2222']})\n", 67 | "server = tf.train.Server(cluster_spec,job_name='worker')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "**Now launch run all the cells in the parameter server notebook**" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Create variables locally then makes global copy. 
One worker scenario" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "tf.reset_default_graph()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#create local graph like normal specifying the local device\n", 104 | "with tf.device('/job:worker/task:0'):\n", 105 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 106 | " b = tf.constant([100.])\n", 107 | " loss = tf.abs(a-b)\n", 108 | " \n", 109 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 110 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 111 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 112 | " \n", 113 | " \n", 114 | " init_local = tf.local_variables_initializer()\n", 115 | "\n", 116 | "#create the globabl copies on the ps\n", 117 | "with tf.device('/job:ps/task:0'):\n", 118 | " for v in tf.local_variables():\n", 119 | " v_g = tf.get_variable('g/'+v.op.name,\n", 120 | " shape = v.shape,\n", 121 | " dtype = v.dtype,\n", 122 | " trainable=True,\n", 123 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 124 | "\n", 125 | "\n", 126 | "#gloabl updates\n", 127 | "with tf.device('/job:worker/task:0'):\n", 128 | " #this needs to be updated. Clearly not robust for any graph more complext\n", 129 | " global_vars = tf.global_variables()\n", 130 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 131 | "\n", 132 | "#create init op on the chief node\n", 133 | "with tf.device('/job:worker/task:0'):\n", 134 | " init_global = tf.global_variables_initializer()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "View the device placement of ops and variables" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "a_global = tf.global_variables()[0]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 10, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "/job:worker/task:0\n", 165 | "/job:worker/task:0\n", 166 | "/job:worker/task:0\n", 167 | "/job:worker/task:0\n", 168 | "/job:ps/task:0\n", 169 | "/job:ps/task:0\n", 170 | "/job:worker/task:0\n", 171 | "/job:ps/task:0\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(a.device)\n", 177 | "print(b.device)\n", 178 | "print(loss.device)\n", 179 | "print(local_update.device)\n", 180 | "print(global_update.device)\n", 181 | "print(init_global.device)\n", 182 | "print(init_local.device)\n", 183 | "print(a_global.device)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Now, let's view the states of local and global variables as we do local then global updates" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 11, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "[None, None]" 202 | ] 203 | }, 204 | "execution_count": 11, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "sess = tf.Session(target=server.target)\n", 211 | 
"sess.run([init_local,init_global])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "[array([ 0.], dtype=float32), array([-1.26032162], dtype=float32)]" 223 | ] 224 | }, 225 | "execution_count": 12, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "sess.run([a,a_global])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "sess.run(local_update)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 14, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "[array([ 0.1], dtype=float32), array([-1.26032162], dtype=float32)]" 254 | ] 255 | }, 256 | "execution_count": 14, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "sess.run([a,a_global])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Notice that the state of the global variable hasn't changed" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 15, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "sess.run(global_update)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 16, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "[array([ 0.1], dtype=float32), array([-1.16032159], dtype=float32)]" 292 | ] 293 | }, 294 | "execution_count": 16, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "sess.run([a,a_global])" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python [conda env:tensorflow13]", 316 | "language": "python", 317 | "name": "conda-env-tensorflow13-py" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.13" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 2 334 | } 335 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Local-then-Global-Variables-Worker1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | 
"Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "task_index=0" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 78 | "server = tf.train.Server(cluster_spec,job_name='worker',task_index=task_index)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Launch and run all the cells in the parameter server notebook**" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Create variables locally then makes global copy on ps." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tf.reset_default_graph()\n", 104 | "\n", 105 | "#create local graph like normal specifying the local device\n", 106 | "with tf.device('/job:worker/task:0'):\n", 107 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 108 | " b = tf.constant([100.])\n", 109 | " loss = tf.abs(a-b)\n", 110 | " \n", 111 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 112 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 113 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 114 | " \n", 115 | " \n", 116 | " init_local = tf.local_variables_initializer()\n", 117 | "\n", 118 | "#create the globabl copies on the ps\n", 119 | "with tf.device('/job:ps/task:0'):\n", 120 | " for v in tf.local_variables():\n", 121 | " v_g = tf.get_variable('g/'+v.op.name,\n", 122 | " shape = v.shape,\n", 123 | " dtype = v.dtype,\n", 124 | " trainable=True,\n", 125 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 126 | "\n", 127 | "\n", 128 | "#gloabl updates\n", 129 | "with tf.device('/job:worker/task:%d'%task_index):\n", 130 | " #this needs to be updated. 
Clearly not robust for any graph more complext\n", 131 | " global_vars = tf.global_variables()\n", 132 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 133 | "\n", 134 | "#create init op on the chief node\n", 135 | "with tf.device('/job:worker/task:%d'%task_index):\n", 136 | " init_global = tf.global_variables_initializer()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "View device placements" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "a_global = tf.global_variables()[0]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "/job:worker/task:0\n", 167 | "/job:worker/task:0\n", 168 | "/job:worker/task:0\n", 169 | "/job:worker/task:0\n", 170 | "/job:ps/task:0\n", 171 | "/job:ps/task:0\n", 172 | "/job:worker/task:0\n", 173 | "/job:ps/task:0\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(a.device)\n", 179 | "print(b.device)\n", 180 | "print(loss.device)\n", 181 | "print(local_update.device)\n", 182 | "print(global_update.device)\n", 183 | "print(init_global.device)\n", 184 | "print(init_local.device)\n", 185 | "print(a_global.device)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "[None, None]" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "sess = tf.Session(target=server.target)\n", 206 | "sess.run([init_local,init_global])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "Make sure you have also run all cells in the worker 2 notebook up to this point before continuing. The above cell should hang until you initialize the worker 2 session." 
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "[array([ 0.], dtype=float32), array([-1.17584229], dtype=float32)]" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "sess.run([a,a_global])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "sess.run(local_update)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 12, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "[array([ 0.1], dtype=float32), array([-1.17584229], dtype=float32)]" 256 | ] 257 | }, 258 | "execution_count": 12, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "sess.run([a,a_global])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 13, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "sess.run(global_update)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 14, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[array([ 0.1], dtype=float32), array([-1.07584226], dtype=float32)]" 287 | ] 288 | }, 289 | "execution_count": 14, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "sess.run([a,a_global])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Pause here. Run the last cell in this notebook after you have done a global update in the worker 2 notebook." 
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 15, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "[array([-0.97584224], dtype=float32)]" 314 | ] 315 | }, 316 | "execution_count": 15, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "sess.run([a_global])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python [conda env:tensorflow13]", 338 | "language": "python", 339 | "name": "conda-env-tensorflow13-py" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 2 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython2", 351 | "version": "2.7.13" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 2 356 | } 357 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Local-then-Global-Variables-Worker2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "task_index=1" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 78 | "server = tf.train.Server(cluster_spec,job_name='worker',task_index=task_index)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Launch and run all the cells in the parameter server notebook (if you haven't already)**" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Create variables locally then makes a global copy on ps." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tf.reset_default_graph()\n", 104 | "\n", 105 | "#create local graph like normal specifying the local device\n", 106 | "with tf.device('/job:worker/task:%d'%task_index):\n", 107 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 108 | " b = tf.constant([100.])\n", 109 | " loss = tf.abs(a-b)\n", 110 | " \n", 111 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 112 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 113 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 114 | " \n", 115 | " \n", 116 | " init_local = tf.local_variables_initializer()\n", 117 | "\n", 118 | "#create the globabl copies on the ps\n", 119 | "with tf.device('/job:ps/task:0'):\n", 120 | " for v in tf.local_variables():\n", 121 | " v_g = tf.get_variable('g/'+v.op.name,\n", 122 | " shape = v.shape,\n", 123 | " dtype = v.dtype,\n", 124 | " trainable=True,\n", 125 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 126 | "\n", 127 | "\n", 128 | "#gloabl updates\n", 129 | "with tf.device('/job:worker/task:%d'%task_index):\n", 130 | " #this needs to be updated. Clearly not robust for any graph more complext\n", 131 | " global_vars = tf.global_variables()\n", 132 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 133 | "\n", 134 | "#create init op on the chief node\n", 135 | "with tf.device('/job:worker/task:%d'%task_index):\n", 136 | " init_global = tf.global_variables_initializer()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "View device placements" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "a_global = tf.global_variables()[0]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "/job:worker/task:1\n", 167 | "/job:worker/task:1\n", 168 | "/job:worker/task:1\n", 169 | "/job:worker/task:1\n", 170 | "/job:ps/task:0\n", 171 | "/job:ps/task:0\n", 172 | "/job:worker/task:1\n", 173 | "/job:ps/task:0\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(a.device)\n", 179 | "print(b.device)\n", 180 | "print(loss.device)\n", 181 | "print(local_update.device)\n", 182 | "print(global_update.device)\n", 183 | "print(init_global.device)\n", 184 | "print(init_local.device)\n", 185 | "print(a_global.device)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "[None]" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "sess = tf.Session(target=server.target)\n", 206 | "sess.run([init_local])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 11, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "[array([ 0.], dtype=float32), array([-1.07584226], dtype=float32)]" 218 | ] 219 | }, 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 
225 | "source": [ 226 | "sess.run([a,a_global])" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Wait for a global update from worker 1, then continue." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 12, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "sess.run(local_update)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "[array([ 0.1], dtype=float32), array([-1.07584226], dtype=float32)]" 256 | ] 257 | }, 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "sess.run([a,a_global])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "sess.run(global_update)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 15, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[array([ 0.1], dtype=float32), array([-0.97584224], dtype=float32)]" 287 | ] 288 | }, 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "sess.run([a,a_global])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": { 309 | "kernelspec": { 310 | "display_name": "Python [conda env:tensorflow13]", 311 | "language": "python", 312 | "name": "conda-env-tensorflow13-py" 313 | }, 314 | "language_info": { 315 | "codemirror_mode": { 316 | "name": "ipython", 317 | "version": 2 318 | }, 319 | "file_extension": ".py", 320 | "mimetype": "text/x-python", 321 | "name": "python", 322 | "nbconvert_exporter": "python", 323 | "pygments_lexer": "ipython2", 324 | "version": "2.7.13" 325 | } 326 | }, 327 | "nbformat": 4, 328 | "nbformat_minor": 2 329 | } 330 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Parameter-Server.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Running the below cell will cause this kernel to stall on the cell until the notebook is shutdown." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 30 | "ps = tf.train.Server(cluster_spec,job_name='ps')\n", 31 | "ps.join()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python [conda env:tensorflow13]", 47 | "language": "python", 48 | "name": "conda-env-tensorflow13-py" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 2 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython2", 60 | "version": "2.7.13" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 2 65 | } 66 | -------------------------------------------------------------------------------- /Basics-Tutorial/Parameter-Server.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Running the below cell will cause this kernel to stall on the cell until the notebook is shutdown." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "cluster_spec = tf.train.ClusterSpec({'worker' : ['localhost:2223'], 'ps' : ['localhost:2222']})\n", 30 | "ps = tf.train.Server(cluster_spec,job_name='ps')\n", 31 | "ps.join()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python [conda env:tensorflow13]", 47 | "language": "python", 48 | "name": "conda-env-tensorflow13-py" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 2 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython2", 60 | "version": "2.7.13" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 2 65 | } 66 | -------------------------------------------------------------------------------- /Basics-Tutorial/README.md: -------------------------------------------------------------------------------- 1 | ## Basics Tutorial 2 | 3 | This short tutorial will show you how to get started with distributed TensorFlow. The aim is to get you familiar with basic distributed TF concepts that are reoccurring, such as TF servers. You should work throught the content in the following order 4 | 5 | 1. [`Server.ipynb`](Servers.ipynb) 6 | 2. [`Parameter-Server.ipynb`](Parameter-Server.ipynb) 7 | 3. [`Local-then-Global-Variables.ipynb`](Local-then-Global-Variables.ipynb) 8 | 4. [`Multiple-Workers`](Multiple-Workers/). 9 | 10 | The Multiple-Workers exercise requires three notebooks, and should be started from the Worker1 notebook. 
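If you only want the skeleton that all of these notebooks build on, it is essentially the following (TF 1.x, using the same localhost ports as the notebooks; each process creates the server for its own job name and task index):

```python
import tensorflow as tf

# One ps task and one worker task, both on localhost.
cluster_spec = tf.train.ClusterSpec({'ps': ['localhost:2222'],
                                     'worker': ['localhost:2223']})

# Each process creates the server for its own job_name/task_index...
server = tf.train.Server(cluster_spec, job_name='worker', task_index=0)

# ...and opens a session whose execution engine is that server.
sess = tf.Session(target=server.target)
```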
11 | 12 | ### Coming Soon! 13 | * Sessions, Sesssion Managers, Training Sessions... 14 | 15 | -------------------------------------------------------------------------------- /Basics-Tutorial/Servers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# TensorFlow Servers\n", 56 | "\n", 57 | "Create a TensorFlow cluster with one node. Let this node be responsible for a job that that has name \"worker\" and that will operate one take at localhost:2222" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "cluster_spec = tf.train.ClusterSpec({'worker' : ['localhost:2222']})\n", 69 | "server = tf.train.Server(cluster_spec)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "'grpc://localhost:2222'" 81 | ] 82 | }, 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "server.target" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "The server is currently running. Check this by running \n", 97 | "\n", 98 | "`lsof -i -P -n | grep LISTEN | grep python`\n", 99 | "\n", 100 | "in your terminal." 
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 6, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "python2.7 66001 tmulc 3u IPv6 0x358037b03a6c7799 0t0 TCP [::1]:8888 (LISTEN)\n", 113 | "python2.7 66001 tmulc 4u IPv4 0x358037b038251061 0t0 TCP 127.0.0.1:8888 (LISTEN)\n", 114 | "python2.7 66017 tmulc 25u IPv4 0x358037b0381ff769 0t0 TCP 127.0.0.1:60322 (LISTEN)\n", 115 | "python2.7 66017 tmulc 28u IPv4 0x358037b0381fd251 0t0 TCP 127.0.0.1:60324 (LISTEN)\n", 116 | "python2.7 66017 tmulc 31u IPv4 0x358037b038285251 0t0 TCP 127.0.0.1:60325 (LISTEN)\n", 117 | "python2.7 66017 tmulc 34u IPv4 0x358037b038485b49 0t0 TCP 127.0.0.1:60323 (LISTEN)\n", 118 | "python2.7 66017 tmulc 39u IPv4 0x358037b039c18769 0t0 TCP 127.0.0.1:60339 (LISTEN)\n", 119 | "python2.7 66017 tmulc 52u IPv4 0x358037b038282579 0t0 TCP 127.0.0.1:60326 (LISTEN)\n", 120 | "python2.7 66017 tmulc 69u IPv6 0x358037b03a6c8259 0t0 TCP *:2222 (LISTEN)\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "%%bash\n", 126 | "lsof -i -P -n | grep LISTEN | grep python" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "View the meta data" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "cluster {\n", 145 | " job {\n", 146 | " name: \"worker\"\n", 147 | " tasks {\n", 148 | " value: \"localhost:2222\"\n", 149 | " }\n", 150 | " }\n", 151 | "}\n", 152 | "job_name: \"worker\"\n", 153 | "protocol: \"grpc\"" 154 | ] 155 | }, 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "server.server_def" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Launch a TensorFlow session with the excecution engine being the server." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "sess = tf.Session(target=server.target)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Use TensorFlow to create a local server and use `lsof` to find out the location of the server." 
188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "server = tf.train.Server.create_local_server()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "python2.7 66001 tmulc 3u IPv6 0x358037b03a6c7799 0t0 TCP [::1]:8888 (LISTEN)\n", 211 | "python2.7 66001 tmulc 4u IPv4 0x358037b038251061 0t0 TCP 127.0.0.1:8888 (LISTEN)\n", 212 | "python2.7 66017 tmulc 25u IPv4 0x358037b0381ff769 0t0 TCP 127.0.0.1:60322 (LISTEN)\n", 213 | "python2.7 66017 tmulc 28u IPv4 0x358037b0381fd251 0t0 TCP 127.0.0.1:60324 (LISTEN)\n", 214 | "python2.7 66017 tmulc 31u IPv4 0x358037b038285251 0t0 TCP 127.0.0.1:60325 (LISTEN)\n", 215 | "python2.7 66017 tmulc 34u IPv4 0x358037b038485b49 0t0 TCP 127.0.0.1:60323 (LISTEN)\n", 216 | "python2.7 66017 tmulc 39u IPv4 0x358037b039c18769 0t0 TCP 127.0.0.1:60339 (LISTEN)\n", 217 | "python2.7 66017 tmulc 52u IPv4 0x358037b038282579 0t0 TCP 127.0.0.1:60326 (LISTEN)\n", 218 | "python2.7 66017 tmulc 69u IPv6 0x358037b03a6c8259 0t0 TCP *:2222 (LISTEN)\n", 219 | "python2.7 66017 tmulc 75u IPv6 0x358037b031e27239 0t0 TCP *:60371 (LISTEN)\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "%%bash\n", 225 | "lsof -i -P -n | grep LISTEN | grep python" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "source": [ 234 | "View devices available in this session." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 11, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "/job:worker/replica:0/task:0/device:CPU:0\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "devices = sess.list_devices()\n", 252 | "for d in devices:\n", 253 | " print(d.name)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 12, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "sess.close()" 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "Python [conda env:tensorflow13]", 271 | "language": "python", 272 | "name": "conda-env-tensorflow13-py" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 2 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython2", 284 | "version": "2.7.13" 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | -------------------------------------------------------------------------------- /DOWNPOUR-Easy/DOWNPOUR.py: -------------------------------------------------------------------------------- 1 | """DOWNPOUR Easy 2 | 3 | Performs asynchronous updates with update window. 4 | Uses SGD on the local level for updates instead of Adagrad. 
5 | 6 | Author: Tommy Mulc 7 | """ 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Configure 19 | config=tf.ConfigProto(log_device_placement=False) 20 | 21 | # Server Setup 22 | cluster_spec = {'ps':['localhost:2222'], 23 | 'worker':['localhost:2223','localhost:2224']} 24 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 25 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 26 | cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes 27 | 28 | if FLAGS.job_name == 'ps': #checks if parameter server 29 | server = tf.train.Server(cluster, 30 | job_name="ps", 31 | task_index=FLAGS.task_index, 32 | config=config) 33 | server.join() 34 | else: #it must be a worker server 35 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 36 | server = tf.train.Server(cluster, 37 | job_name="worker", 38 | task_index=FLAGS.task_index, 39 | config=config) 40 | 41 | # Graph 42 | # Local operations 43 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 44 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 45 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 46 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 47 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 48 | c=a+b 49 | 50 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step', 51 | collections=['local_non_trainable']) 52 | lr = .0001 53 | loptimizer = tf.train.GradientDescentOptimizer(lr*FLAGS.task_index) #local optimizer 54 | 55 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 56 | loss = tf.reduce_mean(tf.square(c-target)) 57 | 58 | # DOWNPOUR 59 | update_window = 3 # T: communication window 60 | grad_list = [] # array to store the gradients through the communication window 61 | for t in range(update_window): 62 | if t != 0: 63 | with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run 64 | grads, varss = zip(*loptimizer.compute_gradients( 65 | loss,var_list=tf.local_variables())) 66 | else: 67 | grads, varss = zip(*loptimizer.compute_gradients( 68 | loss,var_list=tf.local_variables())) 69 | grad_list.append(grads) #add gradients to the list 70 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 71 | global_step=local_step) #update local parameters 72 | 73 | grads = tf.reduce_sum(grad_list,axis=0) #sum updates before applying globally 74 | grads = tuple([grads[i]for i in range(len(varss))]) 75 | 76 | 77 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 78 | worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 79 | 80 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 81 | 82 | # all workers use the same learning rate and it is decided on by the task 0 83 | # or maybe the from the graph of the chief worker 84 | optimizer = tf.train.AdagradOptimizer(lr) #global optimizer 85 | 86 | # create global variables and/or references 87 | local_to_global, global_to_local = create_global_variables() 88 | opt = optimizer.apply_gradients( 89 | zip(grads,[local_to_global[v] for v in varss]) 90 | ,global_step=global_step) #apply the gradients to variables on ps 91 | 92 | # Pull params from global server 93 | with tf.control_dependencies([opt]): 94 | assign_locals = assign_global_to_local(global_to_local) 95 | 96 | 97 | # Grab global state before training so all workers 
have same initialization 98 | grab_global_init = assign_global_to_local(global_to_local) 99 | 100 | # Assigns local values to global ones for chief to execute 101 | assign_global = assign_local_to_global(local_to_global) 102 | 103 | # Init ops 104 | init = tf.global_variables_initializer() # for global variables 105 | init_local = tf.variables_initializer(tf.local_variables() \ 106 | +tf.get_collection('local_non_trainable'))#for local variables 107 | 108 | # Session 109 | stop_hook = tf.train.StopAtStepHook(last_step=60) 110 | hooks = [stop_hook] 111 | scaff = tf.train.Scaffold(init_op=init,local_init_op=[init_local]) 112 | 113 | # Monitored Training Session 114 | sess = tf.train.MonitoredTrainingSession(master=server.target, 115 | is_chief=is_chief, 116 | config=config, 117 | scaffold=scaff, 118 | hooks=hooks, 119 | save_checkpoint_secs=1, 120 | checkpoint_dir='logdir') 121 | 122 | if is_chief: 123 | sess.run(assign_global) #Assigns chief's initial values to ps 124 | time.sleep(10) #grace period to wait on other workers before starting training 125 | 126 | # Train until hook stops session 127 | print('Starting training on worker %d'%FLAGS.task_index) 128 | sess.run(grab_global_init) 129 | while not sess.should_stop(): 130 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 131 | 132 | print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls)) 133 | 134 | time.sleep(1) # so we can observe training 135 | print('Done',FLAGS.task_index) 136 | 137 | time.sleep(10) #grace period to wait before closing session 138 | sess.close() 139 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 140 | 141 | 142 | def assign_global_to_local(global_to_local): 143 | """Assigns global variable value to local variables. 144 | 145 | global_to_local : dictionary with corresponding local variable for global key 146 | """ 147 | r = [] 148 | for v in global_to_local.keys(): 149 | r.append(tf.assign(global_to_local[v],v)) 150 | with tf.control_dependencies(r): 151 | a = tf.no_op() 152 | return a 153 | 154 | 155 | def assign_local_to_global(local_to_global): 156 | """Assigns global variable value to local variables. 157 | 158 | local_to_global : dictionary with corresponding global variable for local key 159 | """ 160 | r= [] 161 | for v in local_to_global.keys(): 162 | r.append(tf.assign(local_to_global[v],v)) 163 | with tf.control_dependencies(r): 164 | a = tf.no_op() 165 | return a 166 | 167 | 168 | def get_variable_by_name(name): 169 | """Returns the variable of given name. 170 | 171 | name : the name of the global variable 172 | """ 173 | return [v for v in tf.get_collection('variables') if v.name == name][0] 174 | 175 | 176 | def get_global_variable_by_name(name): 177 | """Returns the global variable of given name 178 | 179 | name : the name of the global variable 180 | """ 181 | # return [v for v in tf.variables() if v.name == name][0] 182 | return [v for v in tf.global_variables() if v.name == name][0] 183 | 184 | 185 | def create_global_variables(): 186 | """Creates global variables for local variables on the graph. 187 | 188 | Returns dictionarys for local-to-global and global-to-local 189 | variable mappings. 
190 | """ 191 | local_to_global = {} 192 | global_to_local = {} 193 | with tf.device('/job:ps/task:0'): 194 | for v in tf.local_variables(): 195 | v_g = tf.get_variable('g/'+v.op.name, 196 | shape = v.shape, 197 | dtype = v.dtype, 198 | trainable=True, 199 | collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES]) 200 | local_to_global[v] = v_g 201 | global_to_local[v_g] = v 202 | return local_to_global,global_to_local 203 | 204 | 205 | if __name__ == '__main__': 206 | parser = argparse.ArgumentParser() 207 | # Flags for defining the tf.train.ClusterSpec 208 | parser.add_argument( 209 | "--job_name", 210 | type=str, 211 | default="", 212 | help="One of 'ps', 'worker'" 213 | ) 214 | # Flags for defining the tf.train.Server 215 | parser.add_argument( 216 | "--task_index", 217 | type=int, 218 | default=0, 219 | help="Index of task within the job" 220 | ) 221 | FLAGS, unparsed = parser.parse_known_args() 222 | print(FLAGS.task_index) 223 | main() 224 | -------------------------------------------------------------------------------- /DOWNPOUR-Easy/README.md: -------------------------------------------------------------------------------- 1 | ## DOWNPOUR Easy 2 | 3 | The same as DOWNPOUR except that instead of updating variables using Adagrad locally, variables are updated using SGD. This makes implementing the algorithm easier because you don't need to worry about finding the variables created by the local Adagrad optimizer and forcing them to be local variables. 4 | -------------------------------------------------------------------------------- /DOWNPOUR-Easy/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python DOWNPOUR.py --job_name "ps" --task_index 0 & 3 | python DOWNPOUR.py --job_name "worker" --task_index 0 & 4 | python DOWNPOUR.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /DOWNPOUR/DOWNPOUR.py: -------------------------------------------------------------------------------- 1 | """DOWNPOUR 2 | 3 | Performs asynchronous updates with update window. 
4 | 5 | Author: Tommy Mulc 6 | """ 7 | 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | 15 | 16 | FLAGS = None 17 | log_dir = '/logdir' 18 | 19 | def main(): 20 | # Configure 21 | config=tf.ConfigProto(log_device_placement=False) 22 | 23 | #Server Setup 24 | cluster_spec = {'ps':['localhost:2222'], 25 | 'worker':['localhost:2223','localhost:2224']} 26 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 27 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 28 | cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes 29 | 30 | if FLAGS.job_name == 'ps': #checks if parameter server 31 | server = tf.train.Server(cluster, 32 | job_name="ps", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | server.join() 36 | else: #it must be a worker server 37 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 38 | server = tf.train.Server(cluster, 39 | job_name="worker", 40 | task_index=FLAGS.task_index, 41 | config=config) 42 | 43 | # Graph 44 | # Local operations 45 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 46 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 47 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 48 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 49 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 50 | c=a+b 51 | 52 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step', 53 | collections=['local_non_trainable']) 54 | lr = .0001 55 | 56 | #loptimizer = tf.train.GradientDescentOptimizer(lr*FLAGS.task_index) #local optimizer 57 | loptimizer = tf.train.AdagradOptimizer(lr) #local optimizer 58 | 59 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 60 | loss = tf.reduce_mean(tf.square(c-target)) 61 | 62 | # DOWNPOUR 63 | update_window = 3 # T: communication window 64 | grad_list = [] # the array to store the gradients through the communication window 65 | for t in range(update_window): 66 | if t != 0: 67 | with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run 68 | grads, varss = zip(*loptimizer.compute_gradients( \ 69 | loss,var_list=tf.local_variables())) 70 | else: 71 | grads, varss = zip(*loptimizer.compute_gradients( \ 72 | loss,var_list=tf.local_variables())) 73 | grad_list.append(grads) #add gradients to the list 74 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 75 | global_step=local_step) #update local parameters 76 | 77 | grads = tf.reduce_sum(grad_list,axis=0) #sum updates before applying globally 78 | grads = tuple([grads[i]for i in range(len(varss))]) 79 | 80 | # add these variables created by local optimizer to local collection 81 | lopt_vars = add_global_variables_to_local_collection() 82 | 83 | # delete the variables from the global collection 84 | clear_global_collection() 85 | 86 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 87 | worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 88 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 89 | 90 | # all workers use the same learning rate and it is decided on by the task 0 91 | # or maybe the from the graph of the chief worker 92 | optimizer = tf.train.AdagradOptimizer(lr) #global optimizer 93 | 94 | # create global variables and/or references 95 | local_to_global, global_to_local = create_global_variables(lopt_vars) 96 | opt = optimizer.apply_gradients( 97 | 
zip(grads,[local_to_global[v] for v in varss]) 98 | ,global_step=global_step) #apply the gradients to variables on ps 99 | 100 | # Pull params from global server 101 | with tf.control_dependencies([opt]): 102 | assign_locals = assign_global_to_local(global_to_local) 103 | 104 | # Grab global state before training so all workers have same initialization 105 | grab_global_init = assign_global_to_local(global_to_local) 106 | 107 | # Assigns local values to global ones for chief to execute 108 | assign_global = assign_local_to_global(local_to_global) 109 | 110 | # Init ops 111 | init = tf.global_variables_initializer() # for global variables 112 | init_local = tf.variables_initializer(tf.local_variables() \ 113 | +tf.get_collection('local_non_trainable')) #for local variables 114 | 115 | # Session 116 | stop_hook = tf.train.StopAtStepHook(last_step=60) 117 | hooks = [stop_hook] 118 | scaff = tf.train.Scaffold(init_op=init,local_init_op=[init_local]) 119 | 120 | # Monitored Training Session 121 | sess = tf.train.MonitoredTrainingSession(master=server.target, 122 | is_chief=is_chief, 123 | config=config, 124 | scaffold=scaff, 125 | hooks=hooks, 126 | save_checkpoint_secs=1, 127 | checkpoint_dir='logdir') 128 | 129 | if is_chief: 130 | sess.run(assign_global) #Assigns chief's initial values to ps 131 | time.sleep(10) #grace period to wait on other workers before starting training 132 | 133 | # Train until hook stops session 134 | print('Starting training on worker %d'%FLAGS.task_index) 135 | sess.run(grab_global_init) 136 | while not sess.should_stop(): 137 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 138 | 139 | print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls)) 140 | 141 | time.sleep(1) # so we can observe training 142 | print('Done',FLAGS.task_index) 143 | 144 | time.sleep(10) #grace period to wait before closing session 145 | sess.close() 146 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 147 | 148 | 149 | def assign_global_to_local(global_to_local): 150 | """Assigns global variable values to local variables. 151 | 152 | global_to_local : dictionary with corresponding local variable for global key 153 | """ 154 | r = [] 155 | for v in global_to_local.keys(): 156 | r.append(tf.assign(global_to_local[v],v)) 157 | with tf.control_dependencies(r): 158 | a = tf.no_op() 159 | return a 160 | 161 | 162 | def assign_local_to_global(local_to_global): 163 | """Assigns local variable values to global variables. 164 | 165 | local_to_global : dictionary with corresponding global variable for local key 166 | """ 167 | r = [] 168 | for v in local_to_global.keys(): 169 | r.append(tf.assign(local_to_global[v],v)) 170 | with tf.control_dependencies(r): 171 | a = tf.no_op() 172 | return a 173 | 174 | 175 | def get_variable_by_name(name): 176 | """Returns the variable of the given name. 177 | 178 | name : the name of the variable 179 | """ 180 | return [v for v in tf.get_collection('variables') if v.name == name][0] 181 | 182 | 183 | def get_global_variable_by_name(name): 184 | """Returns the global variable of the given name. 185 | 186 | name : the name of the global variable 187 | """ 188 | # return [v for v in tf.variables() if v.name == name][0] 189 | return [v for v in tf.global_variables() if v.name == name][0] 190 | 191 | 192 | def create_global_variables(local_optimizer_vars = []): 193 | """Creates global variables for local variables on the graph.
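For every worker-side (local) variable v, a trainable twin named 'g/' + v.op.name is created on /job:ps/task:0; the returned dictionaries let the training code translate between the two copies, e.g. when applying the summed gradients to the ps variables or pulling fresh values back down.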
194 | Skips local variables that are created for 195 | local optimization. 196 | 197 | Returns dictionaries for local-to-global and global-to-local 198 | variable mappings. 199 | """ 200 | local_to_global = {} 201 | global_to_local = {} 202 | with tf.device('/job:ps/task:0'): 203 | for v in tf.local_variables(): 204 | if v not in local_optimizer_vars: 205 | v_g = tf.get_variable('g/'+v.op.name, 206 | shape = v.shape, 207 | dtype = v.dtype, 208 | trainable=True, 209 | collections=[tf.GraphKeys.GLOBAL_VARIABLES, 210 | tf.GraphKeys.TRAINABLE_VARIABLES]) 211 | local_to_global[v] = v_g 212 | global_to_local[v_g] = v 213 | return local_to_global,global_to_local 214 | 215 | 216 | def add_global_variables_to_local_collection(): 217 | """Adds all variables from the global collection 218 | to the local collection. 219 | 220 | Returns the list of variables added. 221 | """ 222 | r = [] 223 | for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]: 224 | tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var) 225 | r.append(var) 226 | return r 227 | 228 | 229 | def clear_global_collection(): 230 | """Removes all variables from global collection.""" 231 | g = tf.get_default_graph() 232 | for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])): 233 | del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0] 234 | 235 | 236 | if __name__ == '__main__': 237 | parser = argparse.ArgumentParser() 238 | # Flags for defining the tf.train.ClusterSpec 239 | parser.add_argument( 240 | "--job_name", 241 | type=str, 242 | default="", 243 | help="One of 'ps', 'worker'" 244 | ) 245 | # Flags for defining the tf.train.Server 246 | parser.add_argument( 247 | "--task_index", 248 | type=int, 249 | default=0, 250 | help="Index of task within the job" 251 | ) 252 | FLAGS, unparsed = parser.parse_known_args() 253 | print(FLAGS.task_index) 254 | main() 255 | -------------------------------------------------------------------------------- /DOWNPOUR/README.md: -------------------------------------------------------------------------------- 1 | ## DOWNPOUR 2 | 3 | Similar to Hogwild!, except that it uses Adagrad for the local updates on each worker. Additionally, there is a communication window which serves as a time buffer for updates to the parameter server (although the original paper set the communication window to one, which removed the need for this buffer). 4 | -------------------------------------------------------------------------------- /DOWNPOUR/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python DOWNPOUR.py --job_name "ps" --task_index 0 & 3 | python DOWNPOUR.py --job_name "worker" --task_index 0 & 4 | python DOWNPOUR.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Distributed-Setup/README.md: -------------------------------------------------------------------------------- 1 | There are many ways to set up a session in a distributed setting, but we demonstrate two in this example: 2 | 3 | 1. Monitored Training Session 4 | 2. Supervisor Session 5 | 6 | The Monitored Training Session is the best option because it can handle many hooks and can be used for synchronous training. The Supervisor Session offers support for handling threads and can be used for some distributed training, but overall offers less than the Monitored Training Session.
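For a quick side-by-side, here is a minimal sketch of each session type, condensed from `dist_setup.py` and `dist_setup_sup.py` in this directory (the cluster, `server`, graph, and `opt` are built exactly as in those files):

```python
# Monitored Training Session (dist_setup.py): initialization, recovery,
# and hooks are handled for you; the chief performs the setup.
sess = tf.train.MonitoredTrainingSession(master=server.target,
                                         is_chief=is_chief)
for i in range(1000):
    if sess.should_stop(): break
    sess.run(opt)
sess.close()

# Supervisor Session (dist_setup_sup.py): you ask the Supervisor to prepare
# or wait for a session; it owns checkpointing via logdir/save_model_secs.
sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir,
                         is_chief=is_chief,
                         save_model_secs=30)
sess = sv.prepare_or_wait_for_session(server.target)
for i in range(1000):
    if sv.should_stop(): break
    sess.run(opt)
```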
The layout of this directory is as follows: 7 | 8 | * `dist_setup.py` -- Python code for Monitored Training Session 9 | * `dist_setup_sup.py` -- Python code for Supervisor Session 10 | * `run.sh` -- Bash script for Monitored Training Session 11 | * `run_sup.sh` -- Bash script for Supervisor Session -------------------------------------------------------------------------------- /Distributed-Setup/dist_setup.py: -------------------------------------------------------------------------------- 1 | """Simple example with one parameter server and one worker. 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import argparse 10 | import time 11 | import os 12 | 13 | 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Distributed Baggage 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223'] 22 | }) #lets this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster, 25 | job_name="ps", 26 | task_index=FLAGS.task_index) 27 | server.join() 28 | else: 29 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 30 | server = tf.train.Server(cluster, 31 | job_name="worker", 32 | task_index=FLAGS.task_index) 33 | 34 | # Graph 35 | with tf.device('/cpu:0'): 36 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 37 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | c=a+b 39 | 40 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 41 | loss = tf.reduce_mean(tf.square(c-target)) 42 | 43 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 44 | 45 | # Session 46 | # Monitored Training Session 47 | sess = tf.train.MonitoredTrainingSession( 48 | master=server.target, 49 | is_chief=is_chief) 50 | for i in range(1000): 51 | if sess.should_stop(): break 52 | sess.run(opt) 53 | if i % 10 == 0: 54 | r = sess.run(c) 55 | print(r) 56 | time.sleep(.1) 57 | sess.close() 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | -------------------------------------------------------------------------------- /Distributed-Setup/dist_setup_sup.py: -------------------------------------------------------------------------------- 1 | """Simple example with one parameter server and one worker.
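Both processes are started by run_sup.sh in this directory, i.e. the same two commands it issues:

    python dist_setup_sup.py --job_name "ps" --task_index 0 &
    python dist_setup_sup.py --job_name "worker" --task_index 0 &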
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import argparse 10 | import time 11 | import os 12 | 13 | 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Distributed Baggage 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223'] 22 | }) #lets this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster,job_name="ps",task_index=FLAGS.task_index) 25 | server.join() 26 | else: 27 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 28 | server = tf.train.Server(cluster,job_name="worker",task_index=FLAGS.task_index) 29 | 30 | # Graph 31 | with tf.device('/cpu:0'): 32 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 33 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 34 | c=a+b 35 | 36 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 37 | loss = tf.reduce_mean(tf.square(c-target)) 38 | 39 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 40 | 41 | # Session 42 | # Supervisor 43 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir,is_chief=is_chief,save_model_secs=30) 44 | sess = sv.prepare_or_wait_for_session(server.target) 45 | for i in range(1000): 46 | if sv.should_stop(): break 47 | sess.run(opt) 48 | if i % 10 == 0: 49 | r = sess.run(c) 50 | print(r) 51 | time.sleep(.1) 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | # Flags for defining the tf.train.ClusterSpec 56 | parser.add_argument( 57 | "--job_name", 58 | type=str, 59 | default="", 60 | help="One of 'ps', 'worker'" 61 | ) 62 | # Flags for defining the tf.train.Server 63 | parser.add_argument( 64 | "--task_index", 65 | type=int, 66 | default=0, 67 | help="Index of task within the job" 68 | ) 69 | FLAGS, unparsed = parser.parse_known_args() 70 | main() 71 | -------------------------------------------------------------------------------- /Distributed-Setup/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python dist_setup.py --job_name "ps" --task_index 0 & 3 | python dist_setup.py --job_name "worker" --task_index 0 & 4 | -------------------------------------------------------------------------------- /Distributed-Setup/run_sup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python dist_setup_sup.py --job_name "ps" --task_index 0 & 3 | python dist_setup_sup.py --job_name "worker" --task_index 0 & 4 | -------------------------------------------------------------------------------- /Hogwild/Hogwild.py: -------------------------------------------------------------------------------- 1 | """Hogwild! 2 | 3 | Asynchronous updates with 1 parameter server and 2 workers. 4 | The updates happen 'hogwild' style so the parameters are 5 | never locked. 
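Concretely, there is no gradient averaging or synchronization step between the two workers: each one computes its own gradient of the loss and applies it immediately, relying on the optimizer's default non-locking updates (tf.train.GradientDescentOptimizer has use_locking=False unless told otherwise), so concurrent updates may interleave.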
6 | 7 | Author: Tommy Mulc 8 | """ 9 | 10 | from __future__ import print_function 11 | import tensorflow as tf 12 | import argparse 13 | import time 14 | import os 15 | FLAGS = None 16 | log_dir = '/logdir' 17 | 18 | def main(): 19 | # Server Setup 20 | cluster = tf.train.ClusterSpec({ 21 | 'ps':['localhost:2222'], 22 | 'worker':['localhost:2223','localhost:2224'] 23 | }) #allows this node know about all other nodes 24 | if FLAGS.job_name == 'ps': #checks if parameter server 25 | server = tf.train.Server(cluster, 26 | job_name="ps", 27 | task_index=FLAGS.task_index) 28 | server.join() 29 | else: 30 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 31 | server = tf.train.Server(cluster, 32 | job_name="worker", 33 | task_index=FLAGS.task_index) 34 | 35 | # Graph 36 | with tf.device('/cpu:0'): 37 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 39 | c=a+b 40 | 41 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 42 | loss = tf.reduce_mean(tf.square(c-target)) 43 | 44 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 45 | 46 | # Session 47 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir, 48 | is_chief=is_chief, 49 | save_model_secs=30) 50 | sess = sv.prepare_or_wait_for_session(server.target) 51 | for i in range(1000): 52 | if sv.should_stop(): break 53 | sess.run(opt) 54 | if i % 10 == 0: 55 | r = sess.run(c) 56 | print(r) 57 | time.sleep(.1) 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | -------------------------------------------------------------------------------- /Hogwild/README.md: -------------------------------------------------------------------------------- 1 | ## HogWild! 2 | 3 | The famous, lock-free approach to SGD. Have a bunch of workers and parameter server, then let the workers update the variables whenever they want. 4 | -------------------------------------------------------------------------------- /Hogwild/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python Hogwild.py --job_name "ps" --task_index 0 & 3 | python Hogwild.py --job_name "worker" --task_index 0 & 4 | python Hogwild.py --job_name "worker" --task_index 1 & 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Tommy Mulc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/README.md: -------------------------------------------------------------------------------- 1 | ## Multiple GPUs Single Machine 2 | 3 | Use environment variables to manually override the available GPUs in a TensorFlow process. There is a way to do this without using environment variables, but it's not worth the effort (if you really need this, you can remap the available devices so the GPU you want to use is labeled as device 0, then set visible devices to 0). 4 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/dist_mult_gpu_sing_mach.py: -------------------------------------------------------------------------------- 1 | """Asynchronous training on multiple GPUs on the same machine. 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | 14 | def main(): 15 | # Server Setup 16 | cluster = tf.train.ClusterSpec({ 17 | 'ps':['localhost:2222'], 18 | 'worker':['localhost:2223','localhost:2224'] 19 | }) #lets this node know about all other nodes 20 | if FLAGS.job_name == 'ps': #checks if parameter server 21 | with tf.device('/cpu:0'): 22 | server = tf.train.Server(cluster, 23 | job_name="ps", 24 | task_index=FLAGS.task_index) 25 | server.join() 26 | else: 27 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 28 | gpu_options = tf.GPUOptions(allow_growth=True, #config is built first so the worker server below can use it 29 | allocator_type="BFC", 30 | visible_device_list="%d"%FLAGS.task_index) 31 | config = tf.ConfigProto(gpu_options=gpu_options, 32 | allow_soft_placement=True) 33 | server = tf.train.Server(cluster,job_name="worker", 34 | task_index=FLAGS.task_index,config=config) 35 | # Graph 36 | with tf.device('/gpu:0'): 37 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 39 | c=a+b 40 | 41 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 42 | loss = tf.reduce_mean(tf.square(c-target)) 43 | 44 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 45 | 46 | # Session 47 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir, 48 | is_chief=is_chief, 49 | save_model_secs=30) 50 | sess = sv.prepare_or_wait_for_session(server.target,config=config) 51 | for i in range(1000): 52 | if sv.should_stop(): break 53 | sess.run(opt) 54 | if i % 10 == 0: 55 | r = sess.run(c) 56 | print(r) 57 | time.sleep(.1) 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 |
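# Note: the companion dist_mult_gpu_sing_mach.sh (shown after this file) pins one
# physical GPU per process with CUDA_VISIBLE_DEVICES before these flags are parsed,
# e.g. for the second worker:
#   export CUDA_VISIBLE_DEVICES=1
#   python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 1 &
# Inside that process the remaining GPU is then enumerated as /gpu:0.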
FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/dist_mult_gpu_sing_mach.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=-1 3 | python dist_mult_gpu_sing_mach.py --job_name "ps" --task_index 0 & 4 | export CUDA_VISIBLE_DEVICES=0 5 | python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 0 & 6 | export CUDA_VISIBLE_DEVICES=1 7 | python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 1 & 8 | -------------------------------------------------------------------------------- /Non-Distributed_Setup.py: -------------------------------------------------------------------------------- 1 | """The non-distributed solution to the problem. 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import time 9 | 10 | def main(): 11 | # Graph 12 | with tf.device('/cpu:0'): 13 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 14 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 15 | c=a+b 16 | 17 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 18 | loss = tf.reduce_mean(tf.square(c-target)) 19 | 20 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 21 | 22 | # Session 23 | sv = tf.train.Supervisor() 24 | sess = sv.prepare_or_wait_for_session() 25 | for i in range(1000): 26 | sess.run(opt) 27 | if i % 10 == 0: 28 | r = sess.run(c) 29 | print(r) 30 | time.sleep(.1) 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed TensorFlow Guide 2 | 3 | 4 | This guide is a collection of distributed training examples (that can act as boilerplate code) and a tutorial of basic distributed TensorFlow. Many of the examples focus on implementing well-known distributed training schemes, such as those available in [*dist-keras*](https://github.com/cerndb/dist-keras) which were discussed in the author's [blog post](http://joerihermans.com/ramblings/distributed-deep-learning-part-1-an-introduction/). 5 | 6 |