├── .gitignore ├── AGN ├── AGN.py ├── README.md └── run.sh ├── Basics-Tutorial ├── Local-then-Global-Variables.ipynb ├── Multiple-Workers │ ├── Local-then-Global-Variables-Worker1.ipynb │ ├── Local-then-Global-Variables-Worker2.ipynb │ └── Parameter-Server.ipynb ├── Parameter-Server.ipynb ├── README.md └── Servers.ipynb ├── DOWNPOUR-Easy ├── DOWNPOUR.py ├── README.md └── run.sh ├── DOWNPOUR ├── DOWNPOUR.py ├── README.md └── run.sh ├── Distributed-Setup ├── README.md ├── dist_setup.py ├── dist_setup_sup.py ├── run.sh └── run_sup.sh ├── Hogwild ├── Hogwild.py ├── README.md └── run.sh ├── LICENSE ├── Multiple-GPUs-Single-Machine ├── README.md ├── dist_mult_gpu_sing_mach.py └── dist_mult_gpu_sing_mach.sh ├── Non-Distributed_Setup.py ├── README.md ├── SAGN ├── README.md ├── SAGN.py └── run.sh ├── Synchronous-SGD-different-learning-rates ├── README.md ├── run.sh └── ssgd.py ├── Synchronous-SGD ├── README.md ├── run.sh └── ssgd.py └── imgs └── data-parallelism.png /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints* 2 | Basics-Tutorial/Multiple-Workers/.ipynb_checkpoints/* 3 | Basics-Tutorial/Beginner\ Tutorial\ Variables.ipynb 4 | Basics-Tutorial/.ipynb_checkpoints/* 5 | -------------------------------------------------------------------------------- /AGN/AGN.py: -------------------------------------------------------------------------------- 1 | """Asynchronous Distributed Adaptive Gradients (ADAG) 2 | 3 | Formerly known as ADAG. 4 | Performs asynchronous updates with update window. 
5 | 6 | Author: Tommy Mulc 7 | """ 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Configure 19 | config=tf.ConfigProto(log_device_placement=False) 20 | 21 | #Server Setup 22 | cluster_spec = { 23 | 'ps':['localhost:2222'], 24 | 'worker':['localhost:2223','localhost:2224'] 25 | } #allows this node know about all other nodes 26 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 27 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 28 | cluster = tf.train.ClusterSpec(cluster_spec) 29 | 30 | if FLAGS.job_name == 'ps': #checks if parameter server 31 | server = tf.train.Server(cluster, 32 | job_name="ps", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | server.join() 36 | else: #it must be a worker server 37 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 38 | server = tf.train.Server(cluster, 39 | job_name="worker", 40 | task_index=FLAGS.task_index, 41 | config=config) 42 | 43 | # Graph 44 | # We must not use train.replicate_device_setter for normal operations 45 | # Local operations 46 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 47 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 48 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 49 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 50 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 51 | c=a+b 52 | 53 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 54 | loss = tf.reduce_mean(tf.square(c-target)) 55 | 56 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False, 57 | name='local_step',collections=['local_non_trainable']) 58 | 59 | lr = .0001 60 | # loptimizer = tf.train.GradientDescentOptimizer(lr) #local optimizer 61 | loptimizer = tf.train.AdamOptimizer(lr) #local optimizer 62 | 63 | # ADAG (simplest case since all batches are the same) 64 | 
update_window = 3 # T: update/communication window 65 | grad_list = [] # the array to store the gradients through the communication window 66 | for t in range(update_window): 67 | if t != 0: 68 | with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run 69 | grads, varss = zip(*loptimizer.compute_gradients(loss, 70 | var_list=tf.local_variables())) 71 | else: 72 | grads, varss = zip(*loptimizer.compute_gradients(loss, 73 | var_list=tf.local_variables())) 74 | grad_list.append(grads) #add gradients to the list 75 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 76 | global_step=local_step) #update local parameters 77 | grads = tf.reduce_mean(grad_list,axis=0) 78 | grads = tuple([grads[i]for i in range(len(varss))]) 79 | 80 | # add these variables created by local optimizer to local collection 81 | lopt_vars = add_global_variables_to_local_collection() 82 | 83 | # delete the variables from the global collection 84 | clear_global_collection() 85 | 86 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 87 | worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 88 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 89 | 90 | # optimizer for central variables 91 | optimizer = tf.train.AdamOptimizer(lr) 92 | # optimizer = tf.train.GradientDescentOptimizer(lr) 93 | 94 | #create global variables and/or references 95 | local_to_global, global_to_local = create_global_variables(lopt_vars) 96 | 97 | opt = optimizer.apply_gradients( 98 | zip(grads,[ local_to_global[v] for v in varss]) 99 | ,global_step=global_step) #apply the gradients to variables on ps 100 | 101 | # Pull param from global server 102 | with tf.control_dependencies([opt]): 103 | assign_locals = assign_global_to_local(global_to_local) 104 | 105 | # Init ops 106 | init_local = tf.variables_initializer(tf.local_variables() \ 107 | +tf.get_collection('local_non_trainable'))#for local variables 108 | init = 
tf.global_variables_initializer() # for global variables 109 | 110 | # Grab global state before training so all workers have same initialization 111 | grab_global_init = assign_global_to_local(global_to_local) 112 | 113 | # Assigns local values to global ones for chief to execute 114 | assign_global = assign_local_to_global(local_to_global) 115 | 116 | # Session 117 | stop_hook = tf.train.StopAtStepHook(last_step=40) 118 | hooks = [stop_hook] 119 | scaff = tf.train.Scaffold(init_op=init,local_init_op=init_local) 120 | 121 | #Monitored Training Session 122 | sess = tf.train.MonitoredTrainingSession(master=server.target, 123 | is_chief=is_chief, 124 | config=config, 125 | scaffold=scaff, 126 | hooks=hooks, 127 | save_checkpoint_secs=1, 128 | checkpoint_dir='logdir') 129 | if is_chief: 130 | sess.run(assign_global) #Assigns chief's initial values to ps 131 | time.sleep(10) #grace period to wait on other workers before starting training 132 | 133 | # Train until hook stops session 134 | print('Starting training on worker %d'%FLAGS.task_index) 135 | sess.run(grab_global_init) 136 | while not sess.should_stop(): 137 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 138 | print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls)) 139 | time.sleep(1) 140 | print('Done',FLAGS.task_index) 141 | 142 | time.sleep(10) #grace period to wait before closing session 143 | sess.close() 144 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 145 | 146 | 147 | def assign_global_to_local(global_to_local): 148 | """ 149 | global_to_local : dictionary with corresponding local variable for global key 150 | 151 | Assigns global variable value to local variables 152 | """ 153 | r = [] 154 | for v in global_to_local.keys(): 155 | r.append(tf.assign(global_to_local[v],v)) 156 | with tf.control_dependencies(r): 157 | a = tf.no_op() 158 | return a 159 | 160 | 161 | def assign_local_to_global(local_to_global): 162 | """Assigns 
global variable value to local variables. 163 | 164 | local_to_global : dictionary with corresponding global variable for local key 165 | """ 166 | r= [] 167 | for v in local_to_global.keys(): 168 | r.append(tf.assign(local_to_global[v],v)) 169 | with tf.control_dependencies(r): 170 | a = tf.no_op() 171 | return a 172 | 173 | 174 | def get_global_variable_by_name(name): 175 | """Returns the global variable of given name. 176 | 177 | name : the name of the global variable 178 | """ 179 | return [v for v in tf.global_variables() if v.name == name][0] 180 | 181 | 182 | def create_global_variables(local_optimizer_vars = []): 183 | """Creates global variables for local variables on the graph. 184 | Skips variables local variables that are created for 185 | local optimization. 186 | 187 | Returns dictionarys for local-to-global and global-to-local 188 | variable mappings. 189 | """ 190 | local_to_global = {} 191 | global_to_local = {} 192 | with tf.device('/job:ps/task:0'): 193 | for v in tf.local_variables(): 194 | if v not in local_optimizer_vars: 195 | v_g = tf.get_variable('g/'+v.op.name, 196 | shape = v.shape, 197 | dtype = v.dtype, 198 | trainable=True, 199 | collections=[tf.GraphKeys.GLOBAL_VARIABLES, 200 | tf.GraphKeys.TRAINABLE_VARIABLES]) 201 | local_to_global[v] = v_g 202 | global_to_local[v_g] = v 203 | return local_to_global,global_to_local 204 | 205 | 206 | def add_global_variables_to_local_collection(): 207 | """Adds all variables from the global collection 208 | to the local collection. 209 | 210 | Returns the list of variables added. 
211 | """ 212 | r =[] 213 | for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]: 214 | tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var) 215 | r.append(var) 216 | return r 217 | 218 | 219 | def clear_global_collection(): 220 | """Removes all variables from global collection.""" 221 | g = tf.get_default_graph() 222 | for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])): 223 | del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0] 224 | 225 | 226 | if __name__ == '__main__': 227 | parser = argparse.ArgumentParser() 228 | # Flags for defining the tf.train.ClusterSpec 229 | parser.add_argument( 230 | "--job_name", 231 | type=str, 232 | default="", 233 | help="One of 'ps', 'worker'" 234 | ) 235 | # Flags for defining the tf.train.Server 236 | parser.add_argument( 237 | "--task_index", 238 | type=int, 239 | default=0, 240 | help="Index of task within the job" 241 | ) 242 | FLAGS, unparsed = parser.parse_known_args() 243 | print(FLAGS.task_index) 244 | main() 245 | -------------------------------------------------------------------------------- /AGN/README.md: -------------------------------------------------------------------------------- 1 | ## AGN (Accumulated Gradient Normalization) 2 | 3 | This method was formerly known as ADAG (Asynchronous Distributed Adaptive Gradients). 4 | 5 | Similar to DOWNPOUR expect that it uses a communications window *T* and accumulates gradients for *T* steps before sending updates to the parameter server. 
6 | -------------------------------------------------------------------------------- /AGN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python AGN.py --job_name "ps" --task_index 0 & 3 | python AGN.py --job_name "worker" --task_index 0 & 4 | python AGN.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Basics-Tutorial/Local-then-Global-Variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "cluster_spec = tf.train.ClusterSpec({'worker' : ['localhost:2223'], 'ps' : ['localhost:2222']})\n", 67 | "server = tf.train.Server(cluster_spec,job_name='worker')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "**Now launch run all the cells in the parameter server notebook**" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Create variables locally then makes global copy. One worker scenario" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "tf.reset_default_graph()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#create local graph like normal specifying the local device\n", 104 | "with tf.device('/job:worker/task:0'):\n", 105 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 106 | " b = tf.constant([100.])\n", 107 | " loss = tf.abs(a-b)\n", 108 | " \n", 109 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 110 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 111 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 112 | " \n", 113 | " \n", 114 | " init_local = tf.local_variables_initializer()\n", 115 | "\n", 116 | "#create the globabl copies on the ps\n", 117 | "with tf.device('/job:ps/task:0'):\n", 118 | " for v in tf.local_variables():\n", 119 | " v_g = tf.get_variable('g/'+v.op.name,\n", 120 | " shape = v.shape,\n", 121 | " dtype = v.dtype,\n", 122 | " trainable=True,\n", 123 | " 
collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 124 | "\n", 125 | "\n", 126 | "#gloabl updates\n", 127 | "with tf.device('/job:worker/task:0'):\n", 128 | " #this needs to be updated. Clearly not robust for any graph more complext\n", 129 | " global_vars = tf.global_variables()\n", 130 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 131 | "\n", 132 | "#create init op on the chief node\n", 133 | "with tf.device('/job:worker/task:0'):\n", 134 | " init_global = tf.global_variables_initializer()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "View the device placement of ops and variables" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "a_global = tf.global_variables()[0]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 10, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "/job:worker/task:0\n", 165 | "/job:worker/task:0\n", 166 | "/job:worker/task:0\n", 167 | "/job:worker/task:0\n", 168 | "/job:ps/task:0\n", 169 | "/job:ps/task:0\n", 170 | "/job:worker/task:0\n", 171 | "/job:ps/task:0\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(a.device)\n", 177 | "print(b.device)\n", 178 | "print(loss.device)\n", 179 | "print(local_update.device)\n", 180 | "print(global_update.device)\n", 181 | "print(init_global.device)\n", 182 | "print(init_local.device)\n", 183 | "print(a_global.device)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Now, let's view the states of local and global variables as we do local then global updates" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 11, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": 
{ 200 | "text/plain": [ 201 | "[None, None]" 202 | ] 203 | }, 204 | "execution_count": 11, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "sess = tf.Session(target=server.target)\n", 211 | "sess.run([init_local,init_global])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "[array([ 0.], dtype=float32), array([-1.26032162], dtype=float32)]" 223 | ] 224 | }, 225 | "execution_count": 12, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "sess.run([a,a_global])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "sess.run(local_update)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 14, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "[array([ 0.1], dtype=float32), array([-1.26032162], dtype=float32)]" 254 | ] 255 | }, 256 | "execution_count": 14, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "sess.run([a,a_global])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Notice that the state of the global variable hasn't changed" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 15, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "sess.run(global_update)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 16, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "[array([ 0.1], dtype=float32), array([-1.16032159], dtype=float32)]" 292 | ] 293 | }, 294 | "execution_count": 16, 295 | "metadata": {}, 296 
| "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "sess.run([a,a_global])" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python [conda env:tensorflow13]", 316 | "language": "python", 317 | "name": "conda-env-tensorflow13-py" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.13" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 2 334 | } 335 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Local-then-Global-Variables-Worker1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a 
TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "task_index=0" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 78 | "server = tf.train.Server(cluster_spec,job_name='worker',task_index=task_index)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Launch and run all the cells in the parameter server notebook**" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Create variables locally then makes global copy on ps." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tf.reset_default_graph()\n", 104 | "\n", 105 | "#create local graph like normal specifying the local device\n", 106 | "with tf.device('/job:worker/task:0'):\n", 107 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 108 | " b = tf.constant([100.])\n", 109 | " loss = tf.abs(a-b)\n", 110 | " \n", 111 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 112 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 113 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 114 | " \n", 115 | " \n", 116 | " init_local = tf.local_variables_initializer()\n", 117 | "\n", 118 | "#create the globabl copies on the ps\n", 119 | "with tf.device('/job:ps/task:0'):\n", 120 | " for v in tf.local_variables():\n", 121 | " v_g = tf.get_variable('g/'+v.op.name,\n", 122 | " shape = v.shape,\n", 123 | " dtype = 
v.dtype,\n", 124 | " trainable=True,\n", 125 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 126 | "\n", 127 | "\n", 128 | "#gloabl updates\n", 129 | "with tf.device('/job:worker/task:%d'%task_index):\n", 130 | " #this needs to be updated. Clearly not robust for any graph more complext\n", 131 | " global_vars = tf.global_variables()\n", 132 | " global_update = optimizer.apply_gradients(zip(grads,global_vars))\n", 133 | "\n", 134 | "#create init op on the chief node\n", 135 | "with tf.device('/job:worker/task:%d'%task_index):\n", 136 | " init_global = tf.global_variables_initializer()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "View device placements" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "a_global = tf.global_variables()[0]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "/job:worker/task:0\n", 167 | "/job:worker/task:0\n", 168 | "/job:worker/task:0\n", 169 | "/job:worker/task:0\n", 170 | "/job:ps/task:0\n", 171 | "/job:ps/task:0\n", 172 | "/job:worker/task:0\n", 173 | "/job:ps/task:0\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(a.device)\n", 179 | "print(b.device)\n", 180 | "print(loss.device)\n", 181 | "print(local_update.device)\n", 182 | "print(global_update.device)\n", 183 | "print(init_global.device)\n", 184 | "print(init_local.device)\n", 185 | "print(a_global.device)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "[None, None]" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": 
"execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "sess = tf.Session(target=server.target)\n", 206 | "sess.run([init_local,init_global])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "Make sure you have also run all cells in the worker 2 notebook up to this point before continuing. The above cell should hang until you initialize the worker 2 session." 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "[array([ 0.], dtype=float32), array([-1.17584229], dtype=float32)]" 225 | ] 226 | }, 227 | "execution_count": 10, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "sess.run([a,a_global])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "sess.run(local_update)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 12, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "[array([ 0.1], dtype=float32), array([-1.17584229], dtype=float32)]" 256 | ] 257 | }, 258 | "execution_count": 12, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "sess.run([a,a_global])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 13, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "sess.run(global_update)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 14, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[array([ 0.1], dtype=float32), array([-1.07584226], dtype=float32)]" 287 | ] 288 | }, 289 | "execution_count": 14, 290 | "metadata": {}, 291 | "output_type": 
"execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "sess.run([a,a_global])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Pause here. Run the last cell in this notebook after you have done a global update in the worker 2 notebook." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 15, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "[array([-0.97584224], dtype=float32)]" 314 | ] 315 | }, 316 | "execution_count": 15, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "sess.run([a_global])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python [conda env:tensorflow13]", 338 | "language": "python", 339 | "name": "conda-env-tensorflow13-py" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 2 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython2", 351 | "version": "2.7.13" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 2 356 | } 357 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Local-then-Global-Variables-Worker2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | 
"text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Create a TensorFlow cluster with one worker node and one ps node." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "task_index=1" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "cluster_spec = tf.train.ClusterSpec({'ps' : ['localhost:2222'],'worker' : ['localhost:2223','localhost:2224']})\n", 78 | "server = tf.train.Server(cluster_spec,job_name='worker',task_index=task_index)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Launch and run all the cells in the parameter server notebook (if you haven't already)**" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Create variables locally then makes a global copy on ps." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "tf.reset_default_graph()\n", 104 | "\n", 105 | "#create local graph like normal specifying the local device\n", 106 | "with tf.device('/job:worker/task:%d'%task_index):\n", 107 | " a = tf.Variable([0.],name='a',collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", 108 | " b = tf.constant([100.])\n", 109 | " loss = tf.abs(a-b)\n", 110 | " \n", 111 | " optimizer = tf.train.GradientDescentOptimizer(.1)\n", 112 | " grads,local_vars = zip(*optimizer.compute_gradients(loss,var_list=tf.local_variables()))\n", 113 | " local_update = optimizer.apply_gradients(zip(grads,local_vars))\n", 114 | " \n", 115 | " \n", 116 | " init_local = tf.local_variables_initializer()\n", 117 | "\n", 118 | "#create the globabl copies on the ps\n", 119 | "with tf.device('/job:ps/task:0'):\n", 120 | " for v in tf.local_variables():\n", 121 | " v_g = tf.get_variable('g/'+v.op.name,\n", 122 | " shape = v.shape,\n", 123 | " dtype = v.dtype,\n", 124 | " trainable=True,\n", 125 | " collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])\n", 126 | "\n", 127 | "\n", 128 | "#gloabl updates\n", 129 | "with tf.device('/job:worker/task:%d'%task_index):\n", 130 | " #this needs to be updated. 
Clearly not robust for any graph more complex
[ 217 | "[array([ 0.], dtype=float32), array([-1.07584226], dtype=float32)]" 218 | ] 219 | }, 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "sess.run([a,a_global])" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Wait for a global update from worker 1, then continue." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 12, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "sess.run(local_update)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "[array([ 0.1], dtype=float32), array([-1.07584226], dtype=float32)]" 256 | ] 257 | }, 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "sess.run([a,a_global])" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "sess.run(global_update)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 15, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[array([ 0.1], dtype=float32), array([-0.97584224], dtype=float32)]" 287 | ] 288 | }, 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "sess.run([a,a_global])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": { 309 | "kernelspec": { 310 | "display_name": "Python [conda env:tensorflow13]", 311 | "language": "python", 312 | "name": 
"conda-env-tensorflow13-py" 313 | }, 314 | "language_info": { 315 | "codemirror_mode": { 316 | "name": "ipython", 317 | "version": 2 318 | }, 319 | "file_extension": ".py", 320 | "mimetype": "text/x-python", 321 | "name": "python", 322 | "nbconvert_exporter": "python", 323 | "pygments_lexer": "ipython2", 324 | "version": "2.7.13" 325 | } 326 | }, 327 | "nbformat": 4, 328 | "nbformat_minor": 2 329 | } 330 | -------------------------------------------------------------------------------- /Basics-Tutorial/Multiple-Workers/Parameter-Server.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Running the below cell will cause this kernel to stall on the cell until the notebook is shutdown." 
Running the below cell will cause this kernel to stall on the cell until the notebook is shut down.
You should work through the content in the following order
14 | 15 | -------------------------------------------------------------------------------- /Basics-Tutorial/Servers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "'1.3.0'" 23 | ] 24 | }, 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "tf.__version__" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Author: Tommy Mulc\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print \"Author: Tommy Mulc\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# TensorFlow Servers\n", 56 | "\n", 57 | "Create a TensorFlow cluster with one node. 
Let this node be responsible for a job that has the name \"worker\" and that will operate one task at localhost:2222
Launch a TensorFlow session with the execution engine being the server.
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "sess = tf.Session(target=server.target)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Use TensorFlow to create a local server and use `lsof` to find out the location of the server." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "server = tf.train.Server.create_local_server()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "python2.7 66001 tmulc 3u IPv6 0x358037b03a6c7799 0t0 TCP [::1]:8888 (LISTEN)\n", 211 | "python2.7 66001 tmulc 4u IPv4 0x358037b038251061 0t0 TCP 127.0.0.1:8888 (LISTEN)\n", 212 | "python2.7 66017 tmulc 25u IPv4 0x358037b0381ff769 0t0 TCP 127.0.0.1:60322 (LISTEN)\n", 213 | "python2.7 66017 tmulc 28u IPv4 0x358037b0381fd251 0t0 TCP 127.0.0.1:60324 (LISTEN)\n", 214 | "python2.7 66017 tmulc 31u IPv4 0x358037b038285251 0t0 TCP 127.0.0.1:60325 (LISTEN)\n", 215 | "python2.7 66017 tmulc 34u IPv4 0x358037b038485b49 0t0 TCP 127.0.0.1:60323 (LISTEN)\n", 216 | "python2.7 66017 tmulc 39u IPv4 0x358037b039c18769 0t0 TCP 127.0.0.1:60339 (LISTEN)\n", 217 | "python2.7 66017 tmulc 52u IPv4 0x358037b038282579 0t0 TCP 127.0.0.1:60326 (LISTEN)\n", 218 | "python2.7 66017 tmulc 69u IPv6 0x358037b03a6c8259 0t0 TCP *:2222 (LISTEN)\n", 219 | "python2.7 66017 tmulc 75u IPv6 0x358037b031e27239 0t0 TCP *:60371 (LISTEN)\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "%%bash\n", 225 | "lsof -i -P -n | grep LISTEN | grep python" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "source": [ 
234 | "View devices available in this session." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 11, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "/job:worker/replica:0/task:0/device:CPU:0\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "devices = sess.list_devices()\n", 252 | "for d in devices:\n", 253 | " print(d.name)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 12, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "sess.close()" 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "Python [conda env:tensorflow13]", 271 | "language": "python", 272 | "name": "conda-env-tensorflow13-py" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 2 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython2", 284 | "version": "2.7.13" 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | -------------------------------------------------------------------------------- /DOWNPOUR-Easy/DOWNPOUR.py: -------------------------------------------------------------------------------- 1 | """DOWNPOUR Easy 2 | 3 | Performs asynchronous updates with update window. 4 | Uses SGD on the local level for updates instead of Adagrad. 
"""DOWNPOUR Easy

Performs asynchronous updates with an update (communication) window.
Uses SGD on the local level for updates instead of Adagrad.

Author: Tommy Mulc
"""

from __future__ import print_function
import tensorflow as tf
import argparse
import time
import os

FLAGS = None  # populated by argparse in the __main__ block below
log_dir = '/logdir'

def main():
    """Builds the DOWNPOUR graph and runs one cluster role.

    Reads FLAGS.job_name ('ps' or 'worker') and FLAGS.task_index to decide
    which server this process runs.  A parameter server blocks forever in
    server.join(); a worker builds a private (local-variable) replica of the
    model, accumulates `update_window` gradients locally with SGD, and pushes
    the summed gradients to the global, ps-hosted copies of the variables.
    """
    # Configure
    config=tf.ConfigProto(log_device_placement=False)

    # Server Setup
    cluster_spec = {'ps':['localhost:2222'],
        'worker':['localhost:2223','localhost:2224']}
    n_pss = len(cluster_spec['ps']) #the number of parameter servers
    n_workers = len(cluster_spec['worker']) #the number of worker nodes
    cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes

    if FLAGS.job_name == 'ps': #checks if parameter server
        server = tf.train.Server(cluster,
            job_name="ps",
            task_index=FLAGS.task_index,
            config=config)
        server.join()  # ps never returns; it only serves variables
    else: #it must be a worker server
        is_chief = (FLAGS.task_index == 0) #checks if this is the chief node
        server = tf.train.Server(cluster,
            job_name="worker",
            task_index=FLAGS.task_index,
            config=config)

        # Graph
        # Local operations: pinned to this worker so each worker owns a
        # private copy of the model variables (LOCAL_VARIABLES collection).
        with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index):
            a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            c=a+b

            local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step',
                collections=['local_non_trainable'])
            lr = .0001
            # NOTE(review): the local learning rate is scaled by the task
            # index, so worker 0 (the chief) trains locally with lr == 0 and
            # its local SGD steps change nothing.  The non-easy DOWNPOUR
            # example uses a constant lr here -- confirm this is intentional.
            loptimizer = tf.train.GradientDescentOptimizer(lr*FLAGS.task_index) #local optimizer

            target = tf.constant(100.,shape=[2],dtype=tf.float32)
            loss = tf.reduce_mean(tf.square(c-target))

            # DOWNPOUR
            update_window = 3 # T: communication window
            grad_list = [] # array to store the gradients through the communication window
            for t in range(update_window):
                if t != 0:
                    # Chain each step after the previous local update so the
                    # window's gradient computations happen sequentially.
                    with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run
                        grads, varss = zip(*loptimizer.compute_gradients(
                            loss,var_list=tf.local_variables()))
                else:
                    grads, varss = zip(*loptimizer.compute_gradients(
                        loss,var_list=tf.local_variables()))
                grad_list.append(grads) #add gradients to the list
                opt_local = loptimizer.apply_gradients(zip(grads,varss),
                    global_step=local_step) #update local parameters

            grads = tf.reduce_sum(grad_list,axis=0) #sum updates before applying globally
            grads = tuple([grads[i]for i in range(len(varss))])

        # Global operations: variables placed on the parameter servers.
        with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss,
                worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):

            global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')

            # all workers use the same learning rate and it is decided on by the task 0
            # or maybe the from the graph of the chief worker
            optimizer = tf.train.AdagradOptimizer(lr) #global optimizer

            # create global variables and/or references
            local_to_global, global_to_local = create_global_variables()
            opt = optimizer.apply_gradients(
                zip(grads,[local_to_global[v] for v in varss])
                ,global_step=global_step) #apply the gradients to variables on ps

            # Pull params from global server (only after the push `opt` ran)
            with tf.control_dependencies([opt]):
                assign_locals = assign_global_to_local(global_to_local)

        # Grab global state before training so all workers have same initialization
        grab_global_init = assign_global_to_local(global_to_local)

        # Assigns local values to global ones for chief to execute
        assign_global = assign_local_to_global(local_to_global)

        # Init ops
        init = tf.global_variables_initializer() # for global variables
        init_local = tf.variables_initializer(tf.local_variables() \
            +tf.get_collection('local_non_trainable')) #for local variables

        # Session
        stop_hook = tf.train.StopAtStepHook(last_step=60)
        hooks = [stop_hook]
        scaff = tf.train.Scaffold(init_op=init,local_init_op=[init_local])

        # Monitored Training Session
        sess = tf.train.MonitoredTrainingSession(master=server.target,
            is_chief=is_chief,
            config=config,
            scaffold=scaff,
            hooks=hooks,
            save_checkpoint_secs=1,
            checkpoint_dir='logdir')

        if is_chief:
            sess.run(assign_global) #Assigns chief's initial values to ps
            time.sleep(10) #grace period to wait on other workers before starting training

        # Train until hook stops session
        print('Starting training on worker %d'%FLAGS.task_index)
        sess.run(grab_global_init)
        while not sess.should_stop():
            _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step])

            print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls))

            time.sleep(1) # so we can observe training
        print('Done',FLAGS.task_index)

        time.sleep(10) #grace period to wait before closing session
        sess.close()
        print('Session from worker %d closed cleanly'%FLAGS.task_index)


def assign_global_to_local(global_to_local):
    """Assigns global variable values to local variables.

    global_to_local : dictionary with corresponding local variable for global key
    """
    r = []
    for v in global_to_local.keys():
        r.append(tf.assign(global_to_local[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def assign_local_to_global(local_to_global):
    """Assigns local variable values to global variables.

    local_to_global : dictionary with corresponding global variable for local key
    """
    r= []
    for v in local_to_global.keys():
        r.append(tf.assign(local_to_global[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def get_variable_by_name(name):
    """Returns the variable of given name.

    name : the name of the variable (looked up in the 'variables' collection)
    """
    return [v for v in tf.get_collection('variables') if v.name == name][0]


def get_global_variable_by_name(name):
    """Returns the global variable of given name.

    name : the name of the global variable
    """
    # return [v for v in tf.variables() if v.name == name][0]
    return [v for v in tf.global_variables() if v.name == name][0]


def create_global_variables():
    """Creates global variables for local variables on the graph.

    Returns dictionaries for local-to-global and global-to-local
    variable mappings.
    """
    local_to_global = {}
    global_to_local = {}
    with tf.device('/job:ps/task:0'):
        for v in tf.local_variables():
            # Mirror each local variable with a ps-hosted twin named 'g/<name>'.
            v_g = tf.get_variable('g/'+v.op.name,
                shape = v.shape,
                dtype = v.dtype,
                trainable=True,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,tf.GraphKeys.TRAINABLE_VARIABLES])
            local_to_global[v] = v_g
            global_to_local[v_g] = v
    return local_to_global,global_to_local


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Flags for defining the tf.train.ClusterSpec
    parser.add_argument(
        "--job_name",
        type=str,
        default="",
        help="One of 'ps', 'worker'"
    )
    # Flags for defining the tf.train.Server
    parser.add_argument(
        "--task_index",
        type=int,
        default=0,
        help="Index of task within the job"
    )
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS.task_index)
    main()
"""DOWNPOUR

Performs asynchronous updates with update window.

Author: Tommy Mulc
"""


from __future__ import print_function
import tensorflow as tf
import argparse
import time
import os


FLAGS = None  # populated by argparse in the __main__ block below
log_dir = '/logdir'

def main():
    """Builds the DOWNPOUR graph and runs one cluster role.

    Reads FLAGS.job_name ('ps' or 'worker') and FLAGS.task_index to decide
    which server this process runs.  A parameter server blocks forever in
    server.join(); a worker builds a private (local-variable) replica of the
    model, accumulates `update_window` gradients locally with Adagrad, and
    pushes the summed gradients to the global, ps-hosted variable copies.
    """
    # Configure
    config=tf.ConfigProto(log_device_placement=False)

    #Server Setup
    cluster_spec = {'ps':['localhost:2222'],
        'worker':['localhost:2223','localhost:2224']}
    n_pss = len(cluster_spec['ps']) #the number of parameter servers
    n_workers = len(cluster_spec['worker']) #the number of worker nodes
    cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes

    if FLAGS.job_name == 'ps': #checks if parameter server
        server = tf.train.Server(cluster,
            job_name="ps",
            task_index=FLAGS.task_index,
            config=config)
        server.join()  # ps never returns; it only serves variables
    else: #it must be a worker server
        is_chief = (FLAGS.task_index == 0) #checks if this is the chief node
        server = tf.train.Server(cluster,
            job_name="worker",
            task_index=FLAGS.task_index,
            config=config)

        # Graph
        # Local operations: pinned to this worker so each worker owns a
        # private copy of the model variables (LOCAL_VARIABLES collection).
        with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index):
            a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                collections=[tf.GraphKeys.LOCAL_VARIABLES])
            c=a+b

            local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step',
                collections=['local_non_trainable'])
            lr = .0001

            #loptimizer = tf.train.GradientDescentOptimizer(lr*FLAGS.task_index) #local optimizer
            loptimizer = tf.train.AdagradOptimizer(lr) #local optimizer

            target = tf.constant(100.,shape=[2],dtype=tf.float32)
            loss = tf.reduce_mean(tf.square(c-target))

            # DOWNPOUR
            update_window = 3 # T: communication window
            grad_list = [] # the array to store the gradients through the communication window
            for t in range(update_window):
                if t != 0:
                    # Chain each step after the previous local update so the
                    # window's gradient computations happen sequentially.
                    with tf.control_dependencies([opt_local]): #compute gradients only if the local opt was run
                        grads, varss = zip(*loptimizer.compute_gradients( \
                            loss,var_list=tf.local_variables()))
                else:
                    grads, varss = zip(*loptimizer.compute_gradients( \
                        loss,var_list=tf.local_variables()))
                grad_list.append(grads) #add gradients to the list
                opt_local = loptimizer.apply_gradients(zip(grads,varss),
                    global_step=local_step) #update local parameters

            grads = tf.reduce_sum(grad_list,axis=0) #sum updates before applying globally
            grads = tuple([grads[i]for i in range(len(varss))])

        # Adagrad created accumulator slots in the GLOBAL collection;
        # add these variables created by local optimizer to local collection
        lopt_vars = add_global_variables_to_local_collection()

        # delete the variables from the global collection
        clear_global_collection()

        # Global operations: variables placed on the parameter servers.
        with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss,
                worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):
            global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step')

            # all workers use the same learning rate and it is decided on by the task 0
            # or maybe the from the graph of the chief worker
            optimizer = tf.train.AdagradOptimizer(lr) #global optimizer

            # create global variables and/or references
            local_to_global, global_to_local = create_global_variables(lopt_vars)
            opt = optimizer.apply_gradients(
                zip(grads,[local_to_global[v] for v in varss])
                ,global_step=global_step) #apply the gradients to variables on ps

            # Pull params from global server (only after the push `opt` ran)
            with tf.control_dependencies([opt]):
                assign_locals = assign_global_to_local(global_to_local)

        # Grab global state before training so all workers have same initialization
        grab_global_init = assign_global_to_local(global_to_local)

        # Assigns local values to global ones for chief to execute
        assign_global = assign_local_to_global(local_to_global)

        # Init ops
        init = tf.global_variables_initializer() # for global variables
        init_local = tf.variables_initializer(tf.local_variables() \
            +tf.get_collection('local_non_trainable')) #for local variables

        # Session
        stop_hook = tf.train.StopAtStepHook(last_step=60)
        hooks = [stop_hook]
        scaff = tf.train.Scaffold(init_op=init,local_init_op=[init_local])

        # Monitored Training Session
        sess = tf.train.MonitoredTrainingSession(master=server.target,
            is_chief=is_chief,
            config=config,
            scaffold=scaff,
            hooks=hooks,
            save_checkpoint_secs=1,
            checkpoint_dir='logdir')

        if is_chief:
            sess.run(assign_global) #Assigns chief's initial values to ps
            time.sleep(10) #grace period to wait on other workers before starting training

        # Train until hook stops session
        print('Starting training on worker %d'%FLAGS.task_index)
        sess.run(grab_global_init)
        while not sess.should_stop():
            _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step])

            print(r,"global step: "+str(gs),"worker: "+str(FLAGS.task_index),"local step: "+str(ls))

            time.sleep(1) # so we can observe training
        print('Done',FLAGS.task_index)

        time.sleep(10) #grace period to wait before closing session
        sess.close()
        print('Session from worker %d closed cleanly'%FLAGS.task_index)


def assign_global_to_local(global_to_local):
    """Assigns global variable values to local variables.

    global_to_local : dictionary with corresponding local variable for global key
    """
    r = []
    for v in global_to_local.keys():
        r.append(tf.assign(global_to_local[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def assign_local_to_global(local_to_global):
    """Assigns local variable values to global variables.

    local_to_global : dictionary with corresponding global variable for local key
    """
    r= []
    for v in local_to_global.keys():
        r.append(tf.assign(local_to_global[v],v))
    with tf.control_dependencies(r):
        a = tf.no_op()  # single op that groups all of the assigns
    return a


def get_variable_by_name(name):
    """Returns the variable of given name.

    name : the name of the variable (looked up in the 'variables' collection)
    """
    return [v for v in tf.get_collection('variables') if v.name == name][0]


def get_global_variable_by_name(name):
    """Returns the global variable of given name.

    name : the name of the global variable
    """
    # return [v for v in tf.variables() if v.name == name][0]
    return [v for v in tf.global_variables() if v.name == name][0]


def create_global_variables(local_optimizer_vars = []):
    """Creates global variables for local variables on the graph.
    Skips local variables that were created by the local optimizer
    (passed in via local_optimizer_vars).

    Returns dictionaries for local-to-global and global-to-local
    variable mappings.
    """
    local_to_global = {}
    global_to_local = {}
    with tf.device('/job:ps/task:0'):
        for v in tf.local_variables():
            if v not in local_optimizer_vars:
                # Mirror each local variable with a ps-hosted twin 'g/<name>'.
                v_g = tf.get_variable('g/'+v.op.name,
                    shape = v.shape,
                    dtype = v.dtype,
                    trainable=True,
                    collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                        tf.GraphKeys.TRAINABLE_VARIABLES])
                local_to_global[v] = v_g
                global_to_local[v_g] = v
    return local_to_global,global_to_local


def add_global_variables_to_local_collection():
    """Adds all variables from the global collection
    to the local collection.

    Returns the list of variables added.

    NOTE: reads TensorFlow's private Graph._collections attribute.
    """
    r =[]
    for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]:
        tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var)
        r.append(var)
    return r


def clear_global_collection():
    """Removes all variables from global collection.

    NOTE: mutates TensorFlow's private Graph._collections attribute.
    """
    g = tf.get_default_graph()
    for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])):
        del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Flags for defining the tf.train.ClusterSpec
    parser.add_argument(
        "--job_name",
        type=str,
        default="",
        help="One of 'ps', 'worker'"
    )
    # Flags for defining the tf.train.Server
    parser.add_argument(
        "--task_index",
        type=int,
        default=0,
        help="Index of task within the job"
    )
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS.task_index)
    main()
Additionally, there is a communication window which serves as a time buffer for updates to the parameter server (although the original paper set the communication window to one, which voided the need for this buffer). 4 | -------------------------------------------------------------------------------- /DOWNPOUR/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python DOWNPOUR.py --job_name "ps" --task_index 0 & 3 | python DOWNPOUR.py --job_name "worker" --task_index 0 & 4 | python DOWNPOUR.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Distributed-Setup/README.md: -------------------------------------------------------------------------------- 1 | There are many ways to set up a session in a distributed setting but we demonstrate two in this example: 2 | 3 | 1. Monitored Training Session 4 | 2. Supervisor Session 5 | 6 | The Monitored Training Session is the best option because it can handle many hooks and can be used for synchronous training. The Supervisor Session offers support for handling threads and can be used for some distributed training, but overall offers less than the Monitored Training Session. The schema for this directory is as follows 7 | 8 | * `dist_setup.py` -- python code for Monitored Training Session 9 | * `dist_setup_sup.py` -- python code for Supervisor Session 10 | * `run.sh` -- bash script for Monitored Training Session 11 | * `run_sup.sh` -- bash script for Supervisor Session -------------------------------------------------------------------------------- /Distributed-Setup/dist_setup.py: -------------------------------------------------------------------------------- 1 | """Simple example with one parameter server and one worker. 
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import argparse 10 | import time 11 | import os 12 | 13 | 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Distributed Baggage 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223'] 22 | }) #lets this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster, 25 | job_name="ps", 26 | task_index=FLAGS.task_index) 27 | server.join() 28 | else: 29 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 30 | server = tf.train.Server(cluster, 31 | job_name="worker", 32 | task_index=FLAGS.task_index) 33 | 34 | # Graph 35 | with tf.device('/cpu:0'): 36 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 37 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | c=a+b 39 | 40 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 41 | loss = tf.reduce_mean(tf.square(c-target)) 42 | 43 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 44 | 45 | # Session 46 | # Monitored Training Session 47 | sess = tf.train.MonitoredTrainingSession( 48 | master=server.target, 49 | is_chief=is_chief) 50 | for i in range(1000): 51 | if sess.should_stop(): break 52 | sess.run(opt) 53 | if i % 10 == 0: 54 | r = sess.run(c) 55 | print(r) 56 | time.sleep(.1) 57 | sess.close() 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | 
-------------------------------------------------------------------------------- /Distributed-Setup/dist_setup_sup.py: -------------------------------------------------------------------------------- 1 | """Simple example with one parameter server and one worker. 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | 7 | from __future__ import print_function 8 | import tensorflow as tf 9 | import argparse 10 | import time 11 | import os 12 | 13 | 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Distributed Baggage 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223'] 22 | }) #lets this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster,job_name="ps",task_index=FLAGS.task_index) 25 | server.join() 26 | else: 27 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 28 | server = tf.train.Server(cluster,job_name="worker",task_index=FLAGS.task_index) 29 | 30 | # Graph 31 | with tf.device('/cpu:0'): 32 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 33 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 34 | c=a+b 35 | 36 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 37 | loss = tf.reduce_mean(tf.square(c-target)) 38 | 39 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 40 | 41 | # Session 42 | # Supervisor 43 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir,is_chief=is_chief,save_model_secs=30) 44 | sess = sv.prepare_or_wait_for_session(server.target) 45 | for i in range(1000): 46 | if sv.should_stop(): break 47 | sess.run(opt) 48 | if i % 10 == 0: 49 | r = sess.run(c) 50 | print(r) 51 | time.sleep(.1) 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | # Flags for defining the tf.train.ClusterSpec 56 | parser.add_argument( 57 | "--job_name", 58 | type=str, 59 | default="", 60 | help="One of 'ps', 'worker'" 61 | ) 62 | # 
Flags for defining the tf.train.Server 63 | parser.add_argument( 64 | "--task_index", 65 | type=int, 66 | default=0, 67 | help="Index of task within the job" 68 | ) 69 | FLAGS, unparsed = parser.parse_known_args() 70 | main() 71 | -------------------------------------------------------------------------------- /Distributed-Setup/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python dist_setup.py --job_name "ps" --task_index 0 & 3 | python dist_setup.py --job_name "worker" --task_index 0 & 4 | -------------------------------------------------------------------------------- /Distributed-Setup/run_sup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python dist_setup_sup.py --job_name "ps" --task_index 0 & 3 | python dist_setup_sup.py --job_name "worker" --task_index 0 & 4 | -------------------------------------------------------------------------------- /Hogwild/Hogwild.py: -------------------------------------------------------------------------------- 1 | """Hogwild! 2 | 3 | Asynchronous updates with 1 parameter server and 2 workers. 4 | The updates happen 'hogwild' style so the parameters are 5 | never locked. 
6 | 7 | Author: Tommy Mulc 8 | """ 9 | 10 | from __future__ import print_function 11 | import tensorflow as tf 12 | import argparse 13 | import time 14 | import os 15 | FLAGS = None 16 | log_dir = '/logdir' 17 | 18 | def main(): 19 | # Server Setup 20 | cluster = tf.train.ClusterSpec({ 21 | 'ps':['localhost:2222'], 22 | 'worker':['localhost:2223','localhost:2224'] 23 | }) #allows this node know about all other nodes 24 | if FLAGS.job_name == 'ps': #checks if parameter server 25 | server = tf.train.Server(cluster, 26 | job_name="ps", 27 | task_index=FLAGS.task_index) 28 | server.join() 29 | else: 30 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 31 | server = tf.train.Server(cluster, 32 | job_name="worker", 33 | task_index=FLAGS.task_index) 34 | 35 | # Graph 36 | with tf.device('/cpu:0'): 37 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 38 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 39 | c=a+b 40 | 41 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 42 | loss = tf.reduce_mean(tf.square(c-target)) 43 | 44 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 45 | 46 | # Session 47 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir, 48 | is_chief=is_chief, 49 | save_model_secs=30) 50 | sess = sv.prepare_or_wait_for_session(server.target) 51 | for i in range(1000): 52 | if sv.should_stop(): break 53 | sess.run(opt) 54 | if i % 10 == 0: 55 | r = sess.run(c) 56 | print(r) 57 | time.sleep(.1) 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | 
-------------------------------------------------------------------------------- /Hogwild/README.md: -------------------------------------------------------------------------------- 1 | ## HogWild! 2 | 3 | The famous, lock-free approach to SGD. Have a bunch of workers and parameter server, then let the workers update the variables whenever they want. 4 | -------------------------------------------------------------------------------- /Hogwild/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python Hogwild.py --job_name "ps" --task_index 0 & 3 | python Hogwild.py --job_name "worker" --task_index 0 & 4 | python Hogwild.py --job_name "worker" --task_index 1 & 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Tommy Mulc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/README.md: -------------------------------------------------------------------------------- 1 | ## Multiple GPUs Single Machine 2 | 3 | Use environment variables to manually override the available GPUs in a TensorFlow process. There is a way to do this without using environment variables, but it's not worth the effort (if you really need this, you can remap the available devices so the GPU you want to use is labeled as device 0, then set visible devices to 0). 4 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/dist_mult_gpu_sing_mach.py: -------------------------------------------------------------------------------- 1 | """Asynchronous training on multiple GPUs on the same machine. 
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | 14 | def main(): 15 | # Server Setup 16 | cluster = tf.train.ClusterSpec({ 17 | 'ps':['localhost:2222'], 18 | 'worker':['localhost:2223','localhost:2224'] 19 | }) #allows this node know about all other nodes 20 | if FLAGS.job_name == 'ps': #checks if parameter server 21 | with tf.device('/cpu:0'): 22 | server = tf.train.Server(cluster, 23 | job_name="ps", 24 | task_index=FLAGS.task_index) 25 | server.join() 26 | else: 27 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 28 | server = tf.train.Server(cluster,job_name="worker", 29 | task_index=FLAGS.task_index,config=config) 30 | # Graph 31 | with tf.device('/gpu:0'): 32 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 33 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 34 | c=a+b 35 | 36 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 37 | loss = tf.reduce_mean(tf.square(c-target)) 38 | 39 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 40 | 41 | # Session 42 | sv = tf.train.Supervisor(logdir=os.getcwd()+log_dir, 43 | is_chief=is_chief, 44 | save_model_secs=30) 45 | gpu_options = tf.GPUOptions(allow_growth=True, 46 | allocator_type="BFC", 47 | visible_device_list="%d"%FLAGS.task_index) 48 | config = tf.ConfigProto(gpu_options=gpu_options, 49 | allow_soft_placement=True) 50 | sess = sv.prepare_or_wait_for_session(server.target,config=config) 51 | for i in range(1000): 52 | if sv.should_stop(): break 53 | sess.run(opt) 54 | if i % 10 == 0: 55 | r = sess.run(c) 56 | print(r) 57 | time.sleep(.1) 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | # Flags for defining the tf.train.ClusterSpec 62 | parser.add_argument( 63 | "--job_name", 64 | type=str, 65 | default="", 66 | help="One of 'ps', 'worker'" 67 | ) 
68 | # Flags for defining the tf.train.Server 69 | parser.add_argument( 70 | "--task_index", 71 | type=int, 72 | default=0, 73 | help="Index of task within the job" 74 | ) 75 | FLAGS, unparsed = parser.parse_known_args() 76 | main() 77 | -------------------------------------------------------------------------------- /Multiple-GPUs-Single-Machine/dist_mult_gpu_sing_mach.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=-1 3 | python dist_mult_gpu_sing_mach.py --job_name "ps" --task_index 0 & 4 | export CUDA_VISIBLE_DEVICES=0 5 | python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 0 & 6 | export CUDA_VISIBLE_DEVICES=1 7 | python dist_mult_gpu_sing_mach.py --job_name "worker" --task_index 1 & 8 | -------------------------------------------------------------------------------- /Non-Distributed_Setup.py: -------------------------------------------------------------------------------- 1 | """The non-distributed solution to the problem. 
2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import time 9 | 10 | def main(): 11 | # Graph 12 | with tf.device('/cpu:0'): 13 | a = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 14 | b = tf.Variable(tf.truncated_normal(shape=[2]),dtype=tf.float32) 15 | c=a+b 16 | 17 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 18 | loss = tf.reduce_mean(tf.square(c-target)) 19 | 20 | opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss) 21 | 22 | # Session 23 | sv = tf.train.Supervisor() 24 | sess = sv.prepare_or_wait_for_session() 25 | for i in range(1000): 26 | sess.run(opt) 27 | if i % 10 == 0: 28 | r = sess.run(c) 29 | print(r) 30 | time.sleep(.1) 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed TensorFlow Guide 2 | 3 | 4 | This guide is a collection of distributed training examples (that can act as boilerplate code) and a tutorial of basic distributed TensorFlow. Many of the examples focus on implementing well-known distributed training schemes, such as those available in [*dist-keras*](https://github.com/cerndb/dist-keras) which were discussed in the author's [blog post](http://joerihermans.com/ramblings/distributed-deep-learning-part-1-an-introduction/). 5 | 6 |
7 | 8 |
9 | 10 | Almost all the examples can be run on a single machine with a CPU, and all the examples only use data-parallelism (i.e., between-graph replication). 11 | 12 | The motivation for this guide stems from the current state of distributed deep learning. Deep learning papers typical demonstrate successful new architectures on some benchmark, but rarely show how these models can be trained with 1000x the data which is usually the requirement in industy. Furthermore, most successful distributed cases use state-of-the-art hardware to bruteforce massive effective minibatches in a synchronous fashion across high-bandwidth networks; there has been little research showing the potential of asynchronous training (which is why there are a lot of those examples in this guide). Finally, the lack of documenation for distributed TF was the largest motivator for this project. TF is a great tool that prides itself on its scalability, but unfortunately there are few examples that show how to make your model scale with data size. 13 | 14 | The aim of this guide is to aid all interested in distributed deep learning, from beginners to researchers. 15 | 16 | ## Basics Tutorial 17 | 18 | See the Basics-Tutorial folder for notebooks demonstrating core concepts used in distributed TensorFlow. The rest of the examples assume understanding of the basics tutorial. 19 | 20 | * [`Servers.ipynb`](Basics-Tutorial/Servers.ipynb) -- basics of TensorFlow servers 21 | * [`Parameter-Server.ipynb`](Basics-Tutorial/Parameter-Server.ipynb) -- everything about parameter servers 22 | * [`Local-then-Global-Variables.ipynb`](Basics-Tutorial/Local-then-Global-Variables.ipynb) -- creates a graph locally then make global copies of the variables Useful for graphs that do local updates before pushing global updates (e.g., DOWNPOUR, ADAG, etc.) 
23 | * [`Multiple-Workers`](Basics-Tutorial/Multiple-Workers/) -- contains three notebooks: one parameter server notebook and two worker notebooks The exercise shows how global variables are communicated via the parameter server and how local updates can be made by explicitly placing ops on local devices 24 | 25 | 26 | ## Training Algorithm Examples 27 | 28 | The complete list of examples is below. The first example, [`Non-Distributed-Setup`](Non-Distributed_Setup.py), shows the basic learning problem we want to solve distributively; this example should be familiar to all since it doesn't use any distributed code. The second example, [`Distributed-Setup`](Distributed-Setup/), shows the same problem being solved with distributed code (i.e., with one parameter server and one worker). The remaining examples are a mix of synchronous and non-synchronous training schemes. 29 | 30 | * [`Non-Distributed-Setup`](Non-Distributed_Setup.py) 31 | * [`Distributed-Setup`](Distributed-Setup) 32 | * [`HogWild`](Hogwild) (Asychronous SGD) 33 | * [`DOWNPOUR`](DOWNPOUR) 34 | * [`DOWNPOUR-Easy`](DOWNPOUR-Easy/)1 35 | * [`AGN`](AGN) (Accumulated Gradient Normalization) 36 | * [`Synchronous-SGD`](Synchronous-SGD/) 37 | * [`Synchronous-SGD-different-learning-rates`](Synchronous-SGD-different-learning-rates/) 38 | * [`SAGN`](SAGN) (Synchronous Accumulated Gradients Normalization) 39 | * [`Multiple-GPUs-Single-Machine`](Multiple-GPUs-Single-Machine/) 40 | * `Dynamic SGD` **TODO** 41 | * `Asynchronous Elastic Averaging SGD` (AEASGD) **TODO** 42 | * `Asynchronous Elastic Averaging Momentum SGD` (AEAMSGD) **TODO** 43 | 44 | 45 | 1This is the same as the DOWNPOUR example except that is uses SGD on the workers instead of Adagrad. 46 | 47 | ## Running Training Algorithm Examples 48 | All the training examples (except the non-distributed example) live in a folder. To run them, move to the example directory and run the bash script. 
49 | 50 | ```bash 51 | cd / 52 | bash run.sh 53 | ``` 54 | 55 | In order to completely stop the example, you'll need to kill the python processes associated with it. If you want to stop training early, then there will be python processes for each of the workers in addition to the parameter server processes. Unfortunately, the parameter server processes continue to run even after the workers are finished--these will always need to be killed manually. To kill all python processes, run pkill. 56 | 57 | ```bash 58 | sudo pkill python 59 | ``` 60 | 61 | ## Requirements 62 | 63 | * Python 2.7 64 | * TensorFlow >= 1.2 65 | 66 | 67 | ## Links 68 | * [Official Documentation](https://www.tensorflow.org/deploy/distributed) 69 | * [Threads and Queues](https://www.tensorflow.org/programmers_guide/threading_and_queues) 70 | * [More TensorFlow Documentation](https://www.tensorflow.org/api_guides/python/train#Distributedexecution) 71 | 72 | ## Glossary 73 | * [Server](https://www.tensorflow.org/api_docs/python/tf/train/Server) -- encapsulates a Session target and belongs to a cluster 74 | * [Coordinator](https://www.tensorflow.org/api_docs/python/tf/train/Coordinator) -- coordinates threads 75 | * [Session Manager](https://www.tensorflow.org/api_docs/python/tf/train/SessionManager) -- restores session and initialized variables and coordinates threads 76 | * [Supervisor](https://www.tensorflow.org/api_docs/python/tf/train/Supervisor) -- good for threads. Coordinator, Saver, and Session Manager. > Session Manager 77 | * [Session Creator](https://www.tensorflow.org/api_docs/python/tf/train/SessionCreator) -- Factory for creating a session? 78 | * [Monitored Session](https://www.tensorflow.org/api_docs/python/tf/train/MonitoredSession) -- Session. initialization, hooks, recovery. 
79 | * [Monitored Training Session](https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession) -- only distributed solution for sync optimization 80 | * [Sync Replicas](https://www.tensorflow.org/api_docs/python/tf/train/SyncReplicasOptimizer) -- wrapper of optimizer for synchronous optimization 81 | * [Scaffold](https://www.tensorflow.org/api_docs/python/tf/train/Scaffold) -- holds lots of meta training settings and passed to Session creator 82 | 83 | ### Hooks 84 | * [LoggingTensorHook](https://www.tensorflow.org/api_docs/python/tf/train/LoggingTensorHook) -- prints tensors every *N* steps or seconds 85 | * [StopAtStepHook](https://www.tensorflow.org/api_docs/python/tf/train/StopAtStepHook) -- requests to stop training at a certain step 86 | * [StepCounterHook](https://www.tensorflow.org/api_docs/python/tf/train/StepCounterHook) -- counts steps per second 87 | * [CheckpointSaverHook](https://www.tensorflow.org/api_docs/python/tf/train/CheckpointSaverHook) -- saves new checkpoint every *N* steps or seconds 88 | * [NanTensorHook](https://www.tensorflow.org/api_docs/python/tf/train/NanTensorHook) -- stops training if loss is NaN 89 | * [SummarySaverHook](https://www.tensorflow.org/api_docs/python/tf/train/SummarySaverHook) -- saves summaries every *N* steps or seconds 90 | * [GlobalStepWaiterHook](https://www.tensorflow.org/api_docs/python/tf/train/GlobalStepWaiterHook) -- waits until global step reaches threshold before training 91 | * [FinalOpsHook](https://www.tensorflow.org/api_docs/python/tf/train/FinalOpsHook) -- runs specified ops before closing session 92 | * [FeedFnHook](https://www.tensorflow.org/api_docs/python/tf/train/FeedFnHook) -- assigns feed_dict 93 | 94 | ## Algorithm References 95 | 96 | * [Hogwild!](https://people.eecs.berkeley.edu/~brecht/papers/hogwildTR.pdf) 97 | * [DOWNPOUR](https://static.googleusercontent.com/media/research.google.com/en//archive/large_deep_networks_nips2012.pdf) 98 | * 
[ADAG](http://joerihermans.com/ramblings/distributed-deep-learning-part-1-an-introduction/) 99 | * [AGN](https://arxiv.org/abs/1710.02368) 100 | * [EASGD and EAMSGD](https://arxiv.org/abs/1412.6651) 101 | -------------------------------------------------------------------------------- /SAGN/README.md: -------------------------------------------------------------------------------- 1 | ## SDAG (Synchronous Accumulated Gradient Normalization) 2 | 3 | A hybrid of SSGD and AGN. This method averages gradients over the communication window but apply updates to the ps variables synchronously. 4 | -------------------------------------------------------------------------------- /SAGN/SAGN.py: -------------------------------------------------------------------------------- 1 | """Synchronous Accumulated Gradients Normalization (SGAN) 2 | 3 | Performs synchronous updates with gradients averaged 4 | over a time window. 5 | 6 | Author: Tommy Mulc 7 | """ 8 | 9 | from __future__ import print_function 10 | import tensorflow as tf 11 | import argparse 12 | import time 13 | import os 14 | FLAGS = None 15 | log_dir = '/logdir' 16 | 17 | def main(): 18 | # Configure 19 | config=tf.ConfigProto(log_device_placement=False) 20 | 21 | # Server Setup 22 | cluster_spec = { 23 | 'ps':['localhost:2222'], 24 | 'worker':['localhost:2223','localhost:2224'] 25 | } #allows this node know about all other nodes 26 | n_pss = len(cluster_spec['ps']) #the number of parameter servers 27 | n_workers = len(cluster_spec['worker']) #the number of worker nodes 28 | cluster = tf.train.ClusterSpec(cluster_spec) #allows this node know about all other nodes 29 | 30 | if FLAGS.job_name == 'ps': #checks if parameter server 31 | server = tf.train.Server(cluster, 32 | job_name="ps", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | server.join() 36 | else: #it must be a worker server 37 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 38 | server = tf.train.Server(cluster, 39 | 
job_name="worker", 40 | task_index=FLAGS.task_index, 41 | config=config) 42 | # Graph 43 | with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index): 44 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 45 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 46 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32, 47 | collections=[tf.GraphKeys.LOCAL_VARIABLES]) 48 | c=a+b 49 | 50 | local_step = tf.Variable(0,dtype=tf.int32,trainable=False, 51 | name='local_step',collections=['local_non_trainable']) 52 | 53 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 54 | loss = tf.reduce_mean(tf.square(c-target)) 55 | 56 | base_lr = .0001 57 | loptimizer = tf.train.AdamOptimizer(base_lr) 58 | # loptimizer = tf.train.GradientDescentOptimizer(base_lr) 59 | 60 | # SDAG (simplest case since all batches are the same) 61 | update_window = 5 # T: communication window 62 | grad_list = [] # the array to store the gradients through the communication window 63 | for t in range(update_window): 64 | if t != 0: 65 | #compute gradients only if the local opt was run 66 | with tf.control_dependencies([opt_local]): 67 | grads, varss = zip(*loptimizer.compute_gradients( \ 68 | loss,var_list=tf.local_variables())) 69 | else: 70 | grads, varss = zip(*loptimizer.compute_gradients( \ 71 | loss,var_list=tf.local_variables())) 72 | #add gradients to the list 73 | grad_list.append(grads) 74 | #update local parameters 75 | opt_local = loptimizer.apply_gradients(zip(grads,varss), 76 | global_step=local_step) 77 | 78 | # averages updates before applying globally 79 | grads = tf.reduce_mean(grad_list,axis=0) 80 | grads = tuple([grads[i] for i in range(len(varss))]) 81 | 82 | # add these variables created by local optimizer to local collection 83 | lopt_vars = add_global_variables_to_local_collection() 84 | 85 | # delete the variables from the global collection 86 | clear_global_collection() 87 | 88 | with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss, 89 | 
worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))): 90 | 91 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 92 | 93 | #create global variables and/or references 94 | local_to_global, global_to_local = create_global_variables(lopt_vars) 95 | 96 | optimizer = tf.train.AdamOptimizer(base_lr) 97 | # optimizer = tf.train.GradientDescentOptimizer(base_lr) 98 | optimizer1 = tf.train.SyncReplicasOptimizer(optimizer, 99 | replicas_to_aggregate=2, 100 | total_num_replicas=2) 101 | 102 | #apply the gradients to variables on ps 103 | opt = optimizer1.apply_gradients( 104 | zip(grads,[local_to_global[v] for v in varss]) 105 | ,global_step=global_step) 106 | 107 | with tf.control_dependencies([opt]): 108 | assign_locals = assign_global_to_local(global_to_local) 109 | 110 | # Grab global state before training so all workers have same initialization 111 | grab_global_init = assign_global_to_local(global_to_local) 112 | 113 | # Assigns local values to global ones for chief to execute 114 | assign_global = assign_local_to_global(local_to_global) 115 | 116 | # Initialized global step tokens 117 | init_tokens_op = optimizer1.get_init_tokens_op() 118 | 119 | # Init ops 120 | # gets step token 121 | local_init=optimizer1.local_step_init_op 122 | if is_chief: 123 | # fills token queue and gets token 124 | local_init = optimizer1.chief_init_op 125 | 126 | # indicates if variables are initialized 127 | ready_for_local_init = optimizer1.ready_for_local_init_op 128 | 129 | with tf.control_dependencies([local_init]): 130 | init_local = tf.variables_initializer(tf.local_variables() \ 131 | +tf.get_collection('local_non_trainable')) #for local variables 132 | 133 | init = tf.global_variables_initializer() # must come after other init ops 134 | 135 | # Session 136 | sync_replicas_hook = optimizer1.make_session_run_hook(is_chief) 137 | stop_hook = tf.train.StopAtStepHook(last_step=10) 138 | chief_hooks = [sync_replicas_hook,stop_hook] 139 | 
scaff = tf.train.Scaffold(init_op=init, 140 | local_init_op=init_local, 141 | ready_for_local_init_op=ready_for_local_init) 142 | 143 | #Monitored Training Session 144 | sess = tf.train.MonitoredTrainingSession(master=server.target, 145 | is_chief=is_chief, 146 | config=config, 147 | scaffold=scaff, 148 | hooks=chief_hooks, 149 | stop_grace_period_secs=10) 150 | 151 | if is_chief: 152 | sess.run(assign_global) # Assigns chief's initial values to ps 153 | time.sleep(40) # grace period to wait on other workers before starting training 154 | 155 | # Train until hook stops session 156 | print('Starting training on worker %d'%FLAGS.task_index) 157 | sess.run(grab_global_init) 158 | 159 | 160 | # Train until hook stops session 161 | print('Starting training on worker %d'%FLAGS.task_index) 162 | while not sess.should_stop(): 163 | _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step]) 164 | # _,r,gs=sess.run([opt,c,global_step]) 165 | print(r,gs,FLAGS.task_index) 166 | if is_chief: time.sleep(1) 167 | time.sleep(1) 168 | print('Done',FLAGS.task_index) 169 | 170 | time.sleep(10) #grace period to wait before closing session 171 | sess.close() 172 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 173 | 174 | 175 | def assign_global_to_local(global_to_local): 176 | """Assigns global variable value to local variables. 177 | 178 | global_to_local : dictionary with corresponding local variable for global key 179 | """ 180 | r = [] 181 | for v in global_to_local.keys(): 182 | r.append(tf.assign(global_to_local[v],v)) 183 | with tf.control_dependencies(r): 184 | a = tf.no_op() 185 | return a 186 | 187 | 188 | def assign_local_to_global(local_to_global): 189 | """Assigns global variable value to local variables. 
190 | 191 | local_to_global : dictionary with corresponding global variable for local key 192 | """ 193 | r= [] 194 | for v in local_to_global.keys(): 195 | r.append(tf.assign(local_to_global[v],v)) 196 | with tf.control_dependencies(r): 197 | a = tf.no_op() 198 | return a 199 | 200 | 201 | def get_variable_by_name(name): 202 | """Returns the variable of given name 203 | 204 | name : the name of the global variable 205 | """ 206 | return [v for v in tf.get_collection('variables') if v.name == name][0] 207 | 208 | 209 | def get_global_variable_by_name(name): 210 | """Returns the global variable of given name. 211 | 212 | name : the name of the global variable 213 | """ 214 | # return [v for v in tf.variables() if v.name == name][0] 215 | return [v for v in tf.global_variables() if v.name == name][0] 216 | 217 | 218 | def create_global_variables(local_optimizer_vars = []): 219 | """Creates global variables for local variables on the graph. 220 | Skips variables local variables that are created for 221 | local optimization. 222 | 223 | Returns dictionarys for local-to-global and global-to-local 224 | variable mappings. 225 | """ 226 | local_to_global = {} 227 | global_to_local = {} 228 | with tf.device('/job:ps/task:0'): 229 | for v in tf.local_variables(): 230 | if v not in local_optimizer_vars: 231 | v_g = tf.get_variable('g/'+v.op.name, 232 | shape = v.shape, 233 | dtype = v.dtype, 234 | trainable=True, 235 | collections=[tf.GraphKeys.GLOBAL_VARIABLES, 236 | tf.GraphKeys.TRAINABLE_VARIABLES]) 237 | local_to_global[v] = v_g 238 | global_to_local[v_g] = v 239 | return local_to_global,global_to_local 240 | 241 | 242 | def add_global_variables_to_local_collection(): 243 | """Adds all variables from the global collection 244 | to the local collection. 245 | 246 | Returns the list of variables added. 
247 | """ 248 | r =[] 249 | for var in tf.get_default_graph()._collections[tf.GraphKeys.GLOBAL_VARIABLES]: 250 | tf.add_to_collection(tf.GraphKeys.LOCAL_VARIABLES,var) 251 | r.append(var) 252 | return r 253 | 254 | 255 | def clear_global_collection(): 256 | """Removes all variables from global collection.""" 257 | g = tf.get_default_graph() 258 | for _ in range(len(g._collections[tf.GraphKeys.GLOBAL_VARIABLES])): 259 | del g._collections[tf.GraphKeys.GLOBAL_VARIABLES][0] 260 | 261 | 262 | if __name__ == '__main__': 263 | parser = argparse.ArgumentParser() 264 | # Flags for defining the tf.train.ClusterSpec 265 | parser.add_argument( 266 | "--job_name", 267 | type=str, 268 | default="", 269 | help="One of 'ps', 'worker'" 270 | ) 271 | # Flags for defining the tf.train.Server 272 | parser.add_argument( 273 | "--task_index", 274 | type=int, 275 | default=0, 276 | help="Index of task within the job" 277 | ) 278 | FLAGS, unparsed = parser.parse_known_args() 279 | print(FLAGS.task_index) 280 | main() 281 | -------------------------------------------------------------------------------- /SAGN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python SAGN.py --job_name "ps" --task_index 0 & 3 | python SAGN.py --job_name "worker" --task_index 0 & 4 | python SAGN.py --job_name "worker" --task_index 1 & 5 | -------------------------------------------------------------------------------- /Synchronous-SGD-different-learning-rates/README.md: -------------------------------------------------------------------------------- 1 | ## SSGD different learning rates 2 | 3 | Same as vanilla SSGD except each of the workers has its own learning rate. 
4 | -------------------------------------------------------------------------------- /Synchronous-SGD-different-learning-rates/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python ssgd.py --job_name "ps" --task_index 0 & 3 | python ssgd.py --job_name "worker" --task_index 0 & 4 | python ssgd.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Synchronous-SGD-different-learning-rates/ssgd.py: -------------------------------------------------------------------------------- 1 | """Synchrous SGD with different learning rates 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | 14 | def main(): 15 | # Configure 16 | config=tf.ConfigProto(log_device_placement=False) 17 | 18 | # Server Setup 19 | cluster = tf.train.ClusterSpec({ 20 | 'ps':['localhost:2222'], 21 | 'worker':['localhost:2223','localhost:2224'] 22 | }) #allows this node know about all other nodes 23 | if FLAGS.job_name == 'ps': #checks if parameter server 24 | server = tf.train.Server(cluster, 25 | job_name="ps", 26 | task_index=FLAGS.task_index, 27 | config=config) 28 | server.join() 29 | else: #it must be a worker server 30 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 31 | server = tf.train.Server(cluster, 32 | job_name="worker", 33 | task_index=FLAGS.task_index, 34 | config=config) 35 | 36 | # Graph 37 | worker_device = "/job:%s/task:%d/cpu:0" % (FLAGS.job_name,FLAGS.task_index) 38 | with tf.device(tf.train.replica_device_setter(ps_tasks=1, 39 | worker_device=worker_device)): 40 | 41 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 42 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 43 | c = a+b 44 | 45 | global_step = 
tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 46 | target = tf.constant(100.,shape=[2],dtype=tf.float32) 47 | loss = tf.reduce_mean(tf.square(c-target)) 48 | 49 | # all workers use the same learning rate and it is decided on by the task 0 50 | # or maybe the from the graph of the chief worker 51 | base_lr = 1. 52 | optimizer = tf.train.GradientDescentOptimizer(base_lr) 53 | optimizer1 = tf.train.SyncReplicasOptimizer(optimizer, 54 | replicas_to_aggregate=2, 55 | total_num_replicas=2) 56 | 57 | # use different learning rates (hacky) 58 | # only works for GradientDescentOptimizer 59 | lr = .0001 60 | if FLAGS.task_index == 0: 61 | lr = .1 62 | new_lr = lr/base_lr 63 | grads, varss = zip(*optimizer1.compute_gradients(loss)) 64 | grads = [new_lr*2.0*grad for grad in grads] #this sums gradients 65 | opt = optimizer1.apply_gradients(zip(grads,varss),global_step=global_step) 66 | 67 | # Init ops 68 | init_tokens_op = optimizer1.get_init_tokens_op() 69 | 70 | local_init=optimizer1.local_step_init_op # initialize local step 71 | if is_chief: 72 | local_init = optimizer1.chief_init_op # initializes token queue 73 | 74 | ready_for_local_init = optimizer1.ready_for_local_init_op # checks if global vars are init 75 | init = tf.global_variables_initializer() # must come after other init ops 76 | 77 | # Session 78 | sync_replicas_hook = optimizer1.make_session_run_hook(is_chief) 79 | stop_hook = tf.train.StopAtStepHook(last_step=10) 80 | chief_hooks = [sync_replicas_hook,stop_hook] 81 | scaff = tf.train.Scaffold(init_op=init, 82 | local_init_op=local_init, 83 | ready_for_local_init_op=ready_for_local_init) 84 | 85 | #Monitored Training Session 86 | sess = tf.train.MonitoredTrainingSession(master=server.target, 87 | is_chief=is_chief, 88 | config=config, 89 | scaffold=scaff, 90 | hooks=chief_hooks, 91 | stop_grace_period_secs=10) 92 | 93 | if is_chief: 94 | sess.run(init_tokens_op) 95 | time.sleep(40) #grace period to wait on other workers before starting 
training 96 | 97 | print('Starting training on worker %d'%FLAGS.task_index) 98 | while not sess.should_stop(): 99 | _,r,gs=sess.run([opt,c,global_step]) 100 | print(r,'step: ',gs,'worker: ',FLAGS.task_index) 101 | if is_chief: time.sleep(1) 102 | time.sleep(1) 103 | print('Done',FLAGS.task_index) 104 | 105 | time.sleep(10) #grace period to wait before closing session 106 | sess.close() 107 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 108 | 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser() 112 | # Flags for defining the tf.train.ClusterSpec 113 | parser.add_argument( 114 | "--job_name", 115 | type=str, 116 | default="", 117 | help="One of 'ps', 'worker'" 118 | ) 119 | # Flags for defining the tf.train.Server 120 | parser.add_argument( 121 | "--task_index", 122 | type=int, 123 | default=0, 124 | help="Index of task within the job" 125 | ) 126 | FLAGS, unparsed = parser.parse_known_args() 127 | print(FLAGS.task_index) 128 | main() 129 | -------------------------------------------------------------------------------- /Synchronous-SGD/README.md: -------------------------------------------------------------------------------- 1 | ## SSGD (Synchronous SGD) 2 | 3 | Have workers send their updates to a ps, but only update the variables on the ps after *N* updates have been accumulated. If the number of workers is *M* and *M>N*, then this is known as dropping the last *M-N* *stale gradients*. 
4 | -------------------------------------------------------------------------------- /Synchronous-SGD/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python ssgd.py --job_name "ps" --task_index 0 & 3 | python ssgd.py --job_name "worker" --task_index 0 & 4 | python ssgd.py --job_name "worker" --task_index 1 & -------------------------------------------------------------------------------- /Synchronous-SGD/ssgd.py: -------------------------------------------------------------------------------- 1 | """Synchronous SGD 2 | 3 | Author: Tommy Mulc 4 | """ 5 | 6 | from __future__ import print_function 7 | import tensorflow as tf 8 | import argparse 9 | import time 10 | import os 11 | FLAGS = None 12 | log_dir = '/logdir' 13 | REPLICAS_TO_AGGREGATE = 2 14 | 15 | def main(): 16 | # Configure 17 | config=tf.ConfigProto(log_device_placement=False) 18 | 19 | # Server Setup 20 | cluster = tf.train.ClusterSpec({ 21 | 'ps':['localhost:2222'], 22 | 'worker':['localhost:2223','localhost:2224'] 23 | }) #allows this node know about all other nodes 24 | if FLAGS.job_name == 'ps': #checks if parameter server 25 | server = tf.train.Server(cluster, 26 | job_name="ps", 27 | task_index=FLAGS.task_index, 28 | config=config) 29 | server.join() 30 | else: #it must be a worker server 31 | is_chief = (FLAGS.task_index == 0) #checks if this is the chief node 32 | server = tf.train.Server(cluster, 33 | job_name="worker", 34 | task_index=FLAGS.task_index, 35 | config=config) 36 | 37 | # Graph 38 | worker_device = "/job:%s/task:%d/cpu:0" % (FLAGS.job_name,FLAGS.task_index) 39 | with tf.device(tf.train.replica_device_setter(ps_tasks=1, 40 | worker_device=worker_device)): 41 | 42 | a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 43 | b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32) 44 | c=a+b 45 | 46 | global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='global_step') 47 | target = 
tf.constant(100.,shape=[2],dtype=tf.float32) 48 | loss = tf.reduce_mean(tf.square(c-target)) 49 | 50 | # create an optimizer then wrap it with SynceReplicasOptimizer 51 | optimizer = tf.train.GradientDescentOptimizer(.0001) 52 | optimizer1 = tf.train.SyncReplicasOptimizer(optimizer, 53 | replicas_to_aggregate=REPLICAS_TO_AGGREGATE, total_num_replicas=2) 54 | 55 | opt = optimizer1.minimize(loss,global_step=global_step) # averages gradients 56 | #opt = optimizer1.minimize(REPLICAS_TO_AGGREGATE*loss, 57 | # global_step=global_step) # hackily sums gradients 58 | 59 | # Session 60 | sync_replicas_hook = optimizer1.make_session_run_hook(is_chief) 61 | stop_hook = tf.train.StopAtStepHook(last_step=10) 62 | hooks = [sync_replicas_hook,stop_hook] 63 | 64 | # Monitored Training Session 65 | sess = tf.train.MonitoredTrainingSession(master = server.target, 66 | is_chief=is_chief, 67 | config=config, 68 | hooks=hooks, 69 | stop_grace_period_secs=10) 70 | 71 | print('Starting training on worker %d'%FLAGS.task_index) 72 | while not sess.should_stop(): 73 | _,r,gs=sess.run([opt,c,global_step]) 74 | print(r,'step: ',gs,'worker: ',FLAGS.task_index) 75 | if is_chief: time.sleep(1) 76 | time.sleep(1) 77 | print('Done',FLAGS.task_index) 78 | 79 | time.sleep(10) #grace period to wait before closing session 80 | sess.close() 81 | print('Session from worker %d closed cleanly'%FLAGS.task_index) 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | # Flags for defining the tf.train.ClusterSpec 87 | parser.add_argument( 88 | "--job_name", 89 | type=str, 90 | default="", 91 | help="One of 'ps', 'worker'" 92 | ) 93 | # Flags for defining the tf.train.Server 94 | parser.add_argument( 95 | "--task_index", 96 | type=int, 97 | default=0, 98 | help="Index of task within the job" 99 | ) 100 | FLAGS, unparsed = parser.parse_known_args() 101 | print(FLAGS.task_index) 102 | main() 103 | -------------------------------------------------------------------------------- 
/imgs/data-parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmulc18/Distributed-TensorFlow-Guide/8e7fec757112a3ab5dccff93e848e7617ef7ed3e/imgs/data-parallelism.png --------------------------------------------------------------------------------