├── README.md
├── backprop_network.py
├── synthetic_gradient_network.py
└── Synthetic Gradients Explained.ipynb


/README.md:
--------------------------------------------------------------------------------
 1 | # synthetic_gradients_explained
 2 | This is the code for "Synthetic Gradients Explained" by Siraj Raval on Youtube
 3 | 
 4 | ## Overview
 5 | 
 6 | This is the code for [this](https://www.youtube.com/watch?v=qirjknNY1zo) video on Youtube by Siraj Raval on synthetic gradients. This is an optimization strategy DeepMind released that could overtake backpropagation. 
 7 | 
 8 | ## Dependencies
 9 | 
10 | * numpy
11 | 
12 | Use [pip](https://pip.pypa.io/en/stable/) to install missing dependencies. 
13 | 
14 | ## Usage
15 | 
16 | Both versions (backprop and synthetic gradient versions) have their own python file. Run them from command line using 
17 | 
18 | `python name_of_file.py` 
19 | 
20 | To run the notebook run this in the root directory
21 | 
22 | `jupyter notebook` 
23 | 
24 | Install jupyter from [here](http://jupyter.readthedocs.io/en/latest/install.html) 
25 | 
26 | 
27 | ## Credits
28 | 
29 | Credits for the code go to [trask](https://iamtrask.github.io/2017/03/21/synthetic-gradients/). I've merely created a wrapper to get people started. 
30 | 


--------------------------------------------------------------------------------
/backprop_network.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sys
 3 | 
 4 | def generate_dataset(output_dim = 8,num_examples=1000):
 5 |     def int2vec(x,dim=output_dim):
 6 |         out = np.zeros(dim)
 7 |         binrep = np.array(list(np.binary_repr(x))).astype('int')
 8 |         out[-len(binrep):] = binrep
 9 |         return out
10 | 
11 |     x_left_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')
12 |     x_right_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')
13 |     y_int = x_left_int + x_right_int
14 | 
15 |     x = list()
16 |     for i in range(len(x_left_int)):
17 |         x.append(np.concatenate((int2vec(x_left_int[i]),int2vec(x_right_int[i]))))
18 | 
19 |     y = list()
20 |     for i in range(len(y_int)):
21 |         y.append(int2vec(y_int[i]))
22 | 
23 |     x = np.array(x)
24 |     y = np.array(y)
25 |     
26 |     return (x,y)
27 |     
28 | np.random.seed(1)
29 | 
30 | def sigmoid(x):
31 |     return 1 / (1 + np.exp(-x))
32 | 
33 | num_examples = 1000
34 | output_dim = 12
35 | iterations = 1000
36 | 
37 | x,y = generate_dataset(num_examples=num_examples, output_dim = output_dim)
38 | 
39 | batch_size = 10
40 | alpha = 0.1
41 | 
42 | input_dim = len(x[0])
43 | layer_1_dim = 128
44 | layer_2_dim = 64
45 | output_dim = len(y[0])
46 | 
47 | weights_0_1 = (np.random.randn(input_dim,layer_1_dim) * 0.2) - 0.1
48 | weights_1_2 = (np.random.randn(layer_1_dim,layer_2_dim) * 0.2) - 0.1
49 | weights_2_3 = (np.random.randn(layer_2_dim,output_dim) * 0.2) - 0.1
50 | 
51 | 
52 | for iter in range(iterations):
53 |     error = 0
54 | 
55 |     for batch_i in range(int(len(x) / batch_size)):
56 |         batch_x = x[(batch_i * batch_size):(batch_i+1)*batch_size]
57 |         batch_y = y[(batch_i * batch_size):(batch_i+1)*batch_size]    
58 | 
59 |         layer_0 = batch_x
60 |         layer_1 = sigmoid(layer_0.dot(weights_0_1))
61 |         layer_2 = sigmoid(layer_1.dot(weights_1_2))
62 |         layer_3 = sigmoid(layer_2.dot(weights_2_3))
63 | 
64 |         layer_3_delta = (layer_3 - batch_y) * layer_3  * (1 - layer_3)
65 |         layer_2_delta = layer_3_delta.dot(weights_2_3.T) * layer_2 * (1 - layer_2)
66 |         layer_1_delta = layer_2_delta.dot(weights_1_2.T) * layer_1 * (1 - layer_1)
67 | 
68 |         weights_0_1 -= layer_0.T.dot(layer_1_delta) * alpha
69 |         weights_1_2 -= layer_1.T.dot(layer_2_delta) * alpha
70 |         weights_2_3 -= layer_2.T.dot(layer_3_delta) * alpha
71 | 
72 |         error += (np.sum(np.abs(layer_3_delta)))
73 | 
74 |     sys.stdout.write("\rIter:" + str(iter) + " Loss:" + str(error))
75 |     if(iter % 100 == 99):
76 |         print("")
77 | 


--------------------------------------------------------------------------------
/synthetic_gradient_network.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import numpy as np
  3 | import sys
  4 | 
  5 | def generate_dataset(output_dim = 8,num_examples=1000):
  6 |     def int2vec(x,dim=output_dim):
  7 |         out = np.zeros(dim)
  8 |         binrep = np.array(list(np.binary_repr(x))).astype('int')
  9 |         out[-len(binrep):] = binrep
 10 |         return out
 11 | 
 12 |     x_left_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')
 13 |     x_right_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')
 14 |     y_int = x_left_int + x_right_int
 15 | 
 16 |     x = list()
 17 |     for i in range(len(x_left_int)):
 18 |         x.append(np.concatenate((int2vec(x_left_int[i]),int2vec(x_right_int[i]))))
 19 | 
 20 |     y = list()
 21 |     for i in range(len(y_int)):
 22 |         y.append(int2vec(y_int[i]))
 23 | 
 24 |     x = np.array(x)
 25 |     y = np.array(y)
 26 |     
 27 |     return (x,y)
 28 | 
 29 | def sigmoid(x):
 30 |     return 1 / (1 + np.exp(-x))
 31 | 
 32 | def sigmoid_out2deriv(out):
 33 |     return out * (1 - out)
 34 | 
 35 | class DNI(object):
 36 |     
 37 |     def __init__(self,input_dim, output_dim,nonlin,nonlin_deriv,alpha = 0.1):
 38 |         
 39 |         self.weights = (np.random.randn(input_dim, output_dim) * 2) - 1
 40 |         self.bias = (np.random.randn(output_dim) * 2) - 1
 41 |         
 42 |         self.weights_0_1_synthetic_grads = (np.random.randn(output_dim,output_dim) * .0) - .0
 43 |         self.bias_0_1_synthetic_grads = (np.random.randn(output_dim) * .0) - .0
 44 |     
 45 |         self.nonlin = nonlin
 46 |         self.nonlin_deriv = nonlin_deriv
 47 |         self.alpha = alpha
 48 |     
 49 |     def forward_and_synthetic_update(self,input,update=True):
 50 |         
 51 |         self.input = input
 52 |         self.output = self.nonlin(self.input.dot(self.weights)  + self.bias)
 53 |         
 54 |         if(not update):
 55 |             return self.output
 56 |         else:
 57 |             self.synthetic_gradient = (self.output.dot(self.weights_0_1_synthetic_grads) + self.bias_0_1_synthetic_grads)
 58 |             self.weight_synthetic_gradient = self.synthetic_gradient * self.nonlin_deriv(self.output)
 59 |         
 60 |             self.weights -= self.input.T.dot(self.weight_synthetic_gradient) * self.alpha
 61 |             self.bias -= np.average(self.weight_synthetic_gradient,axis=0) * self.alpha
 62 |         
 63 |         return self.weight_synthetic_gradient.dot(self.weights.T), self.output
 64 |     
 65 |     def normal_update(self,true_gradient):
 66 |         grad = true_gradient * self.nonlin_deriv(self.output)
 67 |         
 68 |         self.weights -= self.input.T.dot(grad) * self.alpha
 69 |         self.bias -= np.average(grad,axis=0) * self.alpha
 70 |         
 71 |         return grad.dot(self.weights.T)
 72 |     
 73 |     def update_synthetic_weights(self,true_gradient):
 74 |         self.synthetic_gradient_delta = (self.synthetic_gradient - true_gradient)
 75 |         self.weights_0_1_synthetic_grads -= self.output.T.dot(self.synthetic_gradient_delta) * self.alpha
 76 |         self.bias_0_1_synthetic_grads -= np.average(self.synthetic_gradient_delta,axis=0) * self.alpha
 77 |         
 78 | np.random.seed(1)
 79 | 
 80 | num_examples = 100
 81 | output_dim = 8
 82 | iterations = 100000
 83 | 
 84 | x,y = generate_dataset(num_examples=num_examples, output_dim = output_dim)
 85 | 
 86 | batch_size = 10
 87 | alpha = 0.01
 88 | 
 89 | input_dim = len(x[0])
 90 | layer_1_dim = 64
 91 | layer_2_dim = 32
 92 | output_dim = len(y[0])
 93 | 
 94 | layer_1 = DNI(input_dim,layer_1_dim,sigmoid,sigmoid_out2deriv,alpha)
 95 | layer_2 = DNI(layer_1_dim,layer_2_dim,sigmoid,sigmoid_out2deriv,alpha)
 96 | layer_3 = DNI(layer_2_dim, output_dim,sigmoid, sigmoid_out2deriv,alpha)
 97 | 
 98 | for iter in range(iterations):
 99 |     error = 0
100 |     synthetic_error = 0
101 |     
102 |     for batch_i in range(int(len(x) / batch_size)):
103 |         batch_x = x[(batch_i * batch_size):(batch_i+1)*batch_size]
104 |         batch_y = y[(batch_i * batch_size):(batch_i+1)*batch_size]  
105 |         
106 |         _, layer_1_out = layer_1.forward_and_synthetic_update(batch_x)
107 |         layer_1_delta, layer_2_out = layer_2.forward_and_synthetic_update(layer_1_out)
108 |         layer_3_out = layer_3.forward_and_synthetic_update(layer_2_out,False)
109 | 
110 |         layer_3_delta = layer_3_out - batch_y
111 |         layer_2_delta = layer_3.normal_update(layer_3_delta)
112 |         layer_2.update_synthetic_weights(layer_2_delta)
113 |         layer_1.update_synthetic_weights(layer_1_delta)
114 |         
115 |         error += (np.sum(np.abs(layer_3_delta)))
116 |         synthetic_error += (np.sum(np.abs(layer_2_delta - layer_2.synthetic_gradient)))
117 |     if(iter % 100 == 99):
118 |         sys.stdout.write("\rIter:" + str(iter) + " Loss:" + str(error) + " Synthetic Loss:" + str(synthetic_error))
119 |     if(iter % 10000 == 9999):
120 |         print("")
121 | 


--------------------------------------------------------------------------------
/Synthetic Gradients Explained.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# What are synthetic gradients?\n",
  8 |     "\n",
  9 |     "### Demo - We're going to use a newer optimization strategy called \"Synthetic Gradients\" instead of \"Backpropagation\" to train our simple feedforward Neural Network.\n",
 10 |     "\n",
 11 |     "<img src=\"https://storage.googleapis.com/deepmind-live-cms/documents/3-6.gif\">\n",
 12 |     "\n",
 13 |     "## How do Neural Networks Learn?\n",
 14 |     "\n",
 15 |     "![alt text](http://datathings.com/blog/images/neuralnet/nnblackbox.png \"Logo Title Text 1\")\n",
 16 |     "\n",
 17 |     "![alt text](https://www.intechopen.com/source/html/38738/media/f2.jpg \"Logo Title Text 1\")\n",
 18 |     "\n",
 19 |     "Learning process\n",
 20 |     "- Use inputs + desired outputs to update internal state accordingly\n",
 21 |     "\n",
 22 |     "Prediction process \n",
 23 |     "- Use input and internal state to generate most likely output according to its past “training experience”\n",
 24 |     "\n",
 25 |     "![alt text](https://qph.ec.quoracdn.net/main-qimg-b2afcc88428418db01552987182e7b6a.webp \"Logo Title Text 1\")\n",
 26 |     "\n",
 27 |     "![alt text](https://qph.ec.quoracdn.net/main-qimg-7bdfcff266211a74a31bfcdcc99c0087.webp \"Logo Title Text 1\")\n",
 28 |     "\n",
 29 |     "\n",
 30 |     "## Gradient Descent\n",
 31 |     "![alt text](http://datathings.com/blog/images/neuralnet/derivative2.png \"Logo Title Text 1\")"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 7,
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "name": "stdout",
 41 |      "output_type": "stream",
 42 |      "text": [
 43 |       "Iter:999 Loss:380.142015668 Synthetic Loss:1694.16764408\n",
 44 |       "Iter:1999 Loss:367.203984217 Synthetic Loss:1717.46239983\n",
 45 |       "Iter:2999 Loss:361.265375816 Synthetic Loss:1735.53587437\n",
 46 |       "Iter:3999 Loss:357.190353264 Synthetic Loss:1715.18658268\n",
 47 |       "Iter:4999 Loss:351.924334112 Synthetic Loss:1749.48418294\n",
 48 |       "Iter:5999 Loss:344.629266092 Synthetic Loss:1723.40376341\n",
 49 |       "Iter:6999 Loss:342.940872462 Synthetic Loss:1734.03534908\n",
 50 |       "Iter:7999 Loss:338.074206236 Synthetic Loss:1720.04109979\n",
 51 |       "Iter:8999 Loss:326.151407022 Synthetic Loss:1636.67750546\n",
 52 |       "Iter:9999 Loss:324.912688062 Synthetic Loss:1633.27356925\n"
 53 |      ]
 54 |     }
 55 |    ],
 56 |    "source": [
 57 |     "\n",
 58 |     "import numpy as np\n",
 59 |     "import sys\n",
 60 |     "\n",
 61 |     "def generate_dataset(output_dim = 8,num_examples=1000):\n",
 62 |     "    def int2vec(x,dim=output_dim):\n",
 63 |     "        out = np.zeros(dim)\n",
 64 |     "        binrep = np.array(list(np.binary_repr(x))).astype('int')\n",
 65 |     "        out[-len(binrep):] = binrep\n",
 66 |     "        return out\n",
 67 |     "\n",
 68 |     "    x_left_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')\n",
 69 |     "    x_right_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')\n",
 70 |     "    y_int = x_left_int + x_right_int\n",
 71 |     "\n",
 72 |     "    x = list()\n",
 73 |     "    for i in range(len(x_left_int)):\n",
 74 |     "        x.append(np.concatenate((int2vec(x_left_int[i]),int2vec(x_right_int[i]))))\n",
 75 |     "\n",
 76 |     "    y = list()\n",
 77 |     "    for i in range(len(y_int)):\n",
 78 |     "        y.append(int2vec(y_int[i]))\n",
 79 |     "\n",
 80 |     "    x = np.array(x)\n",
 81 |     "    y = np.array(y)\n",
 82 |     "    \n",
 83 |     "    return (x,y)\n",
 84 |     "\n",
 85 |     "def sigmoid(x):\n",
 86 |     "    return 1 / (1 + np.exp(-x))\n",
 87 |     "\n",
 88 |     "def sigmoid_out2deriv(out):\n",
 89 |     "    return out * (1 - out)\n",
 90 |     "\n",
 91 |     "class DNI(object):\n",
 92 |     "    \n",
 93 |     "    def __init__(self,input_dim, output_dim,nonlin,nonlin_deriv,alpha = 0.1):\n",
 94 |     "        \n",
 95 |     "        self.weights = (np.random.randn(input_dim, output_dim) * 2) - 1\n",
 96 |     "        self.bias = (np.random.randn(output_dim) * 2) - 1\n",
 97 |     "        \n",
 98 |     "        self.weights_0_1_synthetic_grads = (np.random.randn(output_dim,output_dim) * .0) - .0\n",
 99 |     "        self.bias_0_1_synthetic_grads = (np.random.randn(output_dim) * .0) - .0\n",
100 |     "    \n",
101 |     "        self.nonlin = nonlin\n",
102 |     "        self.nonlin_deriv = nonlin_deriv\n",
103 |     "        self.alpha = alpha\n",
104 |     "    \n",
105 |     "    def forward_and_synthetic_update(self,input,update=True):\n",
106 |     "        \n",
107 |     "        self.input = input\n",
108 |     "        self.output = self.nonlin(self.input.dot(self.weights)  + self.bias)\n",
109 |     "        \n",
110 |     "        if(not update):\n",
111 |     "            return self.output\n",
112 |     "        else:\n",
113 |     "            self.synthetic_gradient = (self.output.dot(self.weights_0_1_synthetic_grads) + self.bias_0_1_synthetic_grads)\n",
114 |     "            self.weight_synthetic_gradient = self.synthetic_gradient * self.nonlin_deriv(self.output)\n",
115 |     "        \n",
116 |     "            self.weights -= self.input.T.dot(self.weight_synthetic_gradient) * self.alpha\n",
117 |     "            self.bias -= np.average(self.weight_synthetic_gradient,axis=0) * self.alpha\n",
118 |     "        \n",
119 |     "        return self.weight_synthetic_gradient.dot(self.weights.T), self.output\n",
120 |     "    \n",
121 |     "    def normal_update(self,true_gradient):\n",
122 |     "        grad = true_gradient * self.nonlin_deriv(self.output)\n",
123 |     "        \n",
124 |     "        self.weights -= self.input.T.dot(grad) * self.alpha\n",
125 |     "        self.bias -= np.average(grad,axis=0) * self.alpha\n",
126 |     "        \n",
127 |     "        return grad.dot(self.weights.T)\n",
128 |     "    \n",
129 |     "    def update_synthetic_weights(self,true_gradient):\n",
130 |     "        self.synthetic_gradient_delta = (self.synthetic_gradient - true_gradient)\n",
131 |     "        self.weights_0_1_synthetic_grads -= self.output.T.dot(self.synthetic_gradient_delta) * self.alpha\n",
132 |     "        self.bias_0_1_synthetic_grads -= np.average(self.synthetic_gradient_delta,axis=0) * self.alpha\n",
133 |     "        \n",
134 |     "np.random.seed(1)\n",
135 |     "\n",
136 |     "num_examples = 100\n",
137 |     "output_dim = 8\n",
138 |     "iterations = 10000\n",
139 |     "\n",
140 |     "x,y = generate_dataset(num_examples=num_examples, output_dim = output_dim)\n",
141 |     "\n",
142 |     "batch_size = 10\n",
143 |     "alpha = 0.01\n",
144 |     "\n",
145 |     "input_dim = len(x[0])\n",
146 |     "layer_1_dim = 64\n",
147 |     "layer_2_dim = 32\n",
148 |     "output_dim = len(y[0])\n",
149 |     "\n",
150 |     "layer_1 = DNI(input_dim,layer_1_dim,sigmoid,sigmoid_out2deriv,alpha)\n",
151 |     "layer_2 = DNI(layer_1_dim,layer_2_dim,sigmoid,sigmoid_out2deriv,alpha)\n",
152 |     "layer_3 = DNI(layer_2_dim, output_dim,sigmoid, sigmoid_out2deriv,alpha)\n",
153 |     "\n",
154 |     "for iter in range(iterations):\n",
155 |     "    error = 0\n",
156 |     "    synthetic_error = 0\n",
157 |     "    \n",
158 |     "    for batch_i in range(int(len(x) / batch_size)):\n",
159 |     "        batch_x = x[(batch_i * batch_size):(batch_i+1)*batch_size]\n",
160 |     "        batch_y = y[(batch_i * batch_size):(batch_i+1)*batch_size]  \n",
161 |     "        \n",
162 |     "        _, layer_1_out = layer_1.forward_and_synthetic_update(batch_x)\n",
163 |     "        layer_1_delta, layer_2_out = layer_2.forward_and_synthetic_update(layer_1_out)\n",
164 |     "        layer_3_out = layer_3.forward_and_synthetic_update(layer_2_out,False)\n",
165 |     "\n",
166 |     "        layer_3_delta = layer_3_out - batch_y\n",
167 |     "        layer_2_delta = layer_3.normal_update(layer_3_delta)\n",
168 |     "        layer_2.update_synthetic_weights(layer_2_delta)\n",
169 |     "        layer_1.update_synthetic_weights(layer_1_delta)\n",
170 |     "        \n",
171 |     "        error += (np.sum(np.abs(layer_3_delta)))\n",
172 |     "        synthetic_error += (np.sum(np.abs(layer_2_delta - layer_2.synthetic_gradient)))\n",
173 |     "    \n",
174 |     "    sys.stdout.write(\"\\rIter:\" + str(iter) + \" Loss:\" + str(error) + \" Synthetic Loss:\" + str(synthetic_error))\n",
175 |     "    if(iter % 1000 == 999):\n",
176 |     "        print(\"\")"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "## The Problem with Backpropagation\n",
184 |     "\n",
185 |     "### Locking\n",
186 |     "\n",
187 |     "![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-1.width-1500_zU6x0wC.png \"Logo Title Text 1\")\n",
188 |     "\n",
189 |     "- A layer can only be updated after a full forward+backward pass been \n",
190 |     "- After Layer 1 has processed input, it updates after output activations (black lines) have been propagated through the rest of the network, generated a loss, and the error gradients (green lines) backpropagated through every layer until Layer 1 is reached. \n",
191 |     "- So L1 must wait for forward+backward pass of L2 & L3 before updating\n",
192 |     "- Therefore L1 is locked/coupled to the rest of the network\n",
193 |     "\n",
194 |     "![alt text](https://www.semiwiki.com/forum/attachments/content/attachments/17619d1467046829-googlenet-inceptions-jpg \"Logo Title Text 1\")\n",
195 |     "\n",
196 |     "- For simple networks it's a non-issue\n",
197 |     "- But consider a complex system of multiple networks, acting in multiple environments at asynchronous and irregular timescales.\n",
198 |     "-  Or a big distributed network spread over multiple machines. Time expensive\n",
199 |     "\n",
200 |     "\n",
201 |     "### If we decouple the interfaces - the connections -  between layers, every layer can be updated independently, and is not locked to the rest of the network. But how?\n",
202 |     "\n",
203 |     "\n",
204 |     "## Synthetic Gradients\n",
205 |     "\n",
206 |     "![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-3.width-1500_Ij679hz.png \"Logo Title Text 1\")\n",
207 |     "\n",
208 |     "- Normally, a neural network compares its predictions to a dataset to decide how to update its weights. \n",
209 |     "- It then uses backpropagation to figure out how each weight should move in order to make the prediction more accurate. \n",
210 |     "- However, with Synthetic Gradients, individual layers instead make a \"best guess\" for what they think the data will say, and then update their weights according to this guess. \n",
211 |     "- This \"best guess\" is called a Synthetic Gradient. \n",
212 |     "- The data is only used to help update each layer's \"guesser\" or Synthetic Gradient generator. \n",
213 |     "- This allows for (most of the time), individual layers to learn in isolation, which increases the speed of training.\n",
214 |     "\n",
215 |     "If we use a synthetic gradient model we can do the following:\n",
216 |     "\n",
217 |     "![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-4.width-1500_jjNNlb7.png \"Logo Title Text 1\")\n",
218 |     "\n",
219 |     "... and use the synthetic gradients (blue) to update Layer 1 before the rest of the network has even been executed.\n",
220 |     "\n",
221 |     "The synthetic gradient model itself is trained to regress target gradients - these target gradients could be the true gradients backpropagated from the loss or other synthetic gradients which have been backpropagated from a further downstream synthetic gradient model.\n",
222 |     "\n",
223 |     "![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-5.width-1500_pmWHi94.png \"Logo Title Text 1\")\n",
224 |     "\n",
225 |     "Animated:\n",
226 |     "\n",
227 |     "![alt text](https://storage.googleapis.com/deepmind-live-cms/documents/3-6.gif \"Logo Title Text 1\")\n",
228 |     "\n",
229 |     "![alt text](https://iamtrask.github.io/img/synthetic_grads_paper.png \"Logo Title Text 1\")\n",
230 |     "\n",
231 |     "- Synthetic Gradient generators are just neural nets trained to take the output of a layer and predict the gradient that will likely happen at that layer.\n",
232 |     "- When we perform full forward + back pass, we get the \"correct\" gradient\n",
233 |     "- We can compare this to our \"synthetic\" gradient \n",
234 |     "- So we can train our Synthetic Gradient networks by pretending that our \"true gradients\" are coming from from mythical dataset\n",
235 |     "\n",
236 |     "See how the gradient (M i+2) backpropagates through (f i+1) and into M(i+1)? As you can see, each synthetic gradient generator is actually only trained using the Synthetic Gradients generated from the next layer. Thus, only the last layer actually trains on the data. All the other layers, including the Synthetic Gradient generator networks, train based on Synthetic Gradients. Thus, the network can train with each layer only having to wait on the synthetic gradient from the following layer (which has no other dependencies). \n",
237 |     "\n",
238 |     "- DNI doesn’t magically allow networks to train without true gradient information. The true gradient information does percolate backwards through the network, but just slower and over many training iterations, through the losses of the synthetic gradient models. \n",
239 |     "- But overall the network is faster  because the synthetic gradient models approximate and smooth over the absence of true gradients.\n",
240 |     "- DNI can be applied to any generic neural network architecture, not just feed-forward networks\n",
241 |     "- This is awesome! I want to see this integrated into all major DL libraries. allow distributed training of networks + faster + cleaner\n"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 5,
247 |    "metadata": {},
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "Iter:999 Loss:1.77001735428\n",
254 |       "Iter:1999 Loss:0.327780634499\n",
255 |       "Iter:2999 Loss:0.181843992452\n",
256 |       "Iter:3999 Loss:0.184111347924\n",
257 |       "Iter:4999 Loss:0.0999035747467\n",
258 |       "Iter:5999 Loss:0.0736026337433\n",
259 |       "Iter:6999 Loss:0.0588353479911\n",
260 |       "Iter:7999 Loss:0.0491536076171\n",
261 |       "Iter:8999 Loss:0.0422581205704\n",
262 |       "Iter:9999 Loss:0.0370760733124\n"
263 |      ]
264 |     }
265 |    ],
266 |    "source": [
267 |     "# Normal back-prop gradient descent neural network\n",
268 |     "\n",
269 |     "def generate_dataset(output_dim = 8,num_examples=1000):\n",
270 |     "    def int2vec(x,dim=output_dim):\n",
271 |     "        out = np.zeros(dim)\n",
272 |     "        binrep = np.array(list(np.binary_repr(x))).astype('int')\n",
273 |     "        out[-len(binrep):] = binrep\n",
274 |     "        return out\n",
275 |     "\n",
276 |     "    x_left_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')\n",
277 |     "    x_right_int = (np.random.rand(num_examples) * 2**(output_dim - 1)).astype('int')\n",
278 |     "    y_int = x_left_int + x_right_int\n",
279 |     "\n",
280 |     "    x = list()\n",
281 |     "    for i in range(len(x_left_int)):\n",
282 |     "        x.append(np.concatenate((int2vec(x_left_int[i]),int2vec(x_right_int[i]))))\n",
283 |     "\n",
284 |     "    y = list()\n",
285 |     "    for i in range(len(y_int)):\n",
286 |     "        y.append(int2vec(y_int[i]))\n",
287 |     "\n",
288 |     "    x = np.array(x)\n",
289 |     "    y = np.array(y)\n",
290 |     "    \n",
291 |     "    return (x,y)\n",
292 |     "    \n",
293 |     "np.random.seed(1)\n",
294 |     "\n",
295 |     "def sigmoid(x):\n",
296 |     "    return 1 / (1 + np.exp(-x))\n",
297 |     "\n",
298 |     "num_examples = 100\n",
299 |     "output_dim = 8\n",
300 |     "iterations = 10000\n",
301 |     "\n",
302 |     "x,y = generate_dataset(num_examples=num_examples, output_dim = output_dim)\n",
303 |     "\n",
304 |     "batch_size = 10\n",
305 |     "alpha = 0.1\n",
306 |     "\n",
307 |     "input_dim = len(x[0])\n",
308 |     "layer_1_dim = 128\n",
309 |     "layer_2_dim = 64\n",
310 |     "output_dim = len(y[0])\n",
311 |     "\n",
312 |     "weights_0_1 = (np.random.randn(input_dim,layer_1_dim) * 0.2) - 0.1\n",
313 |     "weights_1_2 = (np.random.randn(layer_1_dim,layer_2_dim) * 0.2) - 0.1\n",
314 |     "weights_2_3 = (np.random.randn(layer_2_dim,output_dim) * 0.2) - 0.1\n",
315 |     "\n",
316 |     "\n",
317 |     "for iter in range(iterations):\n",
318 |     "    error = 0\n",
319 |     "\n",
320 |     "    for batch_i in range(int(len(x) / batch_size)):\n",
321 |     "        batch_x = x[(batch_i * batch_size):(batch_i+1)*batch_size]\n",
322 |     "        batch_y = y[(batch_i * batch_size):(batch_i+1)*batch_size]    \n",
323 |     "\n",
324 |     "        layer_0 = batch_x\n",
325 |     "        layer_1 = sigmoid(layer_0.dot(weights_0_1))\n",
326 |     "        layer_2 = sigmoid(layer_1.dot(weights_1_2))\n",
327 |     "        layer_3 = sigmoid(layer_2.dot(weights_2_3))\n",
328 |     "\n",
329 |     "        layer_3_delta = (layer_3 - batch_y) * layer_3  * (1 - layer_3)\n",
330 |     "        layer_2_delta = layer_3_delta.dot(weights_2_3.T) * layer_2 * (1 - layer_2)\n",
331 |     "        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * layer_1 * (1 - layer_1)\n",
332 |     "\n",
333 |     "        weights_0_1 -= layer_0.T.dot(layer_1_delta) * alpha\n",
334 |     "        weights_1_2 -= layer_1.T.dot(layer_2_delta) * alpha\n",
335 |     "        weights_2_3 -= layer_2.T.dot(layer_3_delta) * alpha\n",
336 |     "\n",
337 |     "        error += (np.sum(np.abs(layer_3_delta)))\n",
338 |     "\n",
339 |     "    sys.stdout.write(\"\\rIter:\" + str(iter) + \" Loss:\" + str(error))\n",
340 |     "    if(iter % 1000 == 999):\n",
341 |     "        print(\"\")\n"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": null,
347 |    "metadata": {
348 |     "collapsed": true
349 |    },
350 |    "outputs": [],
351 |    "source": []
352 |   }
353 |  ],
354 |  "metadata": {
355 |   "kernelspec": {
356 |    "display_name": "Python 3",
357 |    "language": "python",
358 |    "name": "python3"
359 |   },
360 |   "language_info": {
361 |    "codemirror_mode": {
362 |     "name": "ipython",
363 |     "version": 3
364 |    },
365 |    "file_extension": ".py",
366 |    "mimetype": "text/x-python",
367 |    "name": "python",
368 |    "nbconvert_exporter": "python",
369 |    "pygments_lexer": "ipython3",
370 |    "version": "3.6.2"
371 |   }
372 |  },
373 |  "nbformat": 4,
374 |  "nbformat_minor": 2
375 | }
376 | 


--------------------------------------------------------------------------------