├── Convex optimizations.py
├── Optimization Methods in Deep Learning.pdf
└── README.md

--------------------------------------------------------------------------------
/Convex optimizations.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Created on Wed Mar 28 18:01:15 2018

@author: Xunzhe Wen
"""

import math
import numpy as np
import matplotlib.pyplot as plt


def f(x):
    # Objective: y = x^3 + x^2 - 2x - 10
    return x**3 + x**2 - 2*x - 10

def derivative_f(x):
    # Analytic derivative: y' = 3x^2 + 2x - 2
    return 3*(x**2) + 2*x - 2

def GD(x, learning_rate):
    # Plain gradient descent on the 1-D objective.
    gradient = 0.0
    trace = []

    for i in range(1000000):
        # Stop once the gradient magnitude falls into the (1e-5, 1e-4) band.
        if 0.00001 < abs(gradient) < 0.0001:
            print("Gradient Descent converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)
            x = x - learning_rate*gradient
            trace.append(x)
            y = f(x)

    return x, y, trace

def Adagrad(x, learning_rate):
    # Adagrad-style update; note that the accumulated squared gradient is
    # averaged over the iteration count here (a variant of standard Adagrad).
    gradient = 0
    e = 1e-8
    sum_sq = 0.0
    trace = []

    for i in range(100000):
        if 0.00001 < abs(gradient) < 0.0001:
            print("Adagrad converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)
            sum_sq += gradient**2
            x = x - learning_rate*gradient/(math.sqrt(sum_sq/(i+1)) + e)
            trace.append(x)
            y = f(x)
    return x, y, trace

def RMSProp(x, learning_rate):
    # RMSProp: exponential moving average of squared gradients.
    gradient = 0
    e = 1e-8
    d = 0.9      # decay rate
    Egt = 0      # running average of squared gradients
    trace = []

    for i in range(100000):
        if 0.00001 < abs(gradient) < 0.0001:
            print("RMSProp converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)
            Egt = d*Egt + (1 - d)*(gradient**2)
            x = x - learning_rate*gradient/math.sqrt(Egt + e)
            trace.append(x)
            y = f(x)
    return x, y, trace

def Adam(x, learning_rate):
    # Adam: bias-corrected first and second moment estimates.
    gradient = 0
    e = 1e-8
    b1 = 0.9
    b2 = 0.995
    trace = []

    m = 0
    v = 0
    t = 0

    for i in range(10000):
        if 0.00001 < abs(gradient) < 0.0001:
            print("Adam converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)

            t = t + 1
            m = b1*m + (1 - b1)*gradient
            v = b2*v + (1 - b2)*(gradient**2)
            mt = m/(1 - b1**t)    # bias-corrected first moment
            vt = v/(1 - b2**t)    # bias-corrected second moment
            x = x - learning_rate*mt/(math.sqrt(vt) + e)
            trace.append(x)
            y = f(x)
    return x, y, trace

ini = -1.2
x1, y1, trace1 = GD(ini, 0.1)
x2, y2, trace2 = Adagrad(ini, 0.1)
x3, y3, trace3 = RMSProp(ini, 0.1)
x4, y4, trace4 = Adam(ini, 0.1)

t1 = np.array(trace1)
t2 = np.array(trace2)
t3 = np.array(trace3)
t4 = np.array(trace4)

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t1, f(t1), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Gradient Descent Optimization (3057 iterations until convergence)')
plt.grid()
plt.savefig("7.jpg")
plt.show()

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t2, f(t2), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Adagrad Optimization (3639 iterations until convergence)')
plt.grid()
plt.savefig("8.jpg")
plt.show()

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t3, f(t3), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('RMSProp Optimization (1774 iterations until convergence)')
plt.grid()
plt.savefig("9.jpg")
plt.show()

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t4, f(t4), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Adam Optimization (2253 iterations until convergence)')
plt.grid()
plt.savefig("10.jpg")
plt.show()

print(x1, y1)
print(x2, y2)
print(x3, y3)
print(x4, y4)


# Second experiment: 2-D optimization on the Rosenbrock function.
import numpy as np
import matplotlib.pyplot as plt


def rosenbrock(x):
    # Rosenbrock function; global minimum at (1, 1) with value 0.
    return 100*(x[1] - x[0]**2)**2 + (1 - x[0])**2

def jacobian(x):
    # Gradient of the Rosenbrock function.
    return np.array([-400*x[0]*(x[1] - x[0]**2) - 2*(1 - x[0]),
                     200*(x[1] - x[0]**2)])

def hessian(x):
    # Hessian of the Rosenbrock function.
    return np.array([[-400*(x[1] - 3*x[0]**2) + 2, -400*x[0]],
                     [-400*x[0], 200]])

def gradient_descent(alpha, x0, ep, max_iter):
    # Steepest descent with a fixed step size alpha; stops when the squared
    # gradient norm drops below ep or max_iter is reached.
    W = np.zeros((2, max_iter))   # history of iterates
    W[:, 0] = x0
    i = 1
    x = x0
    Report = []                   # squared gradient norm per iteration
    grad = jacobian(x)
    delta = np.sum(grad**2)

    while i < max_iter and delta > ep:
        p = -jacobian(x)          # descent direction
        x = x + alpha*p
        W[:, i] = x
        grad = jacobian(x)
        delta = np.sum(grad**2)
        Report.append(delta)
        i = i + 1

    print('Converged using GD, iterations: ', i)
    print('Approximated Optimum:', x)

    W = W[:, 0:i]
    return x, Report, W

def adam(alpha, x, ep, max_iter):
    # Adam applied to the Rosenbrock gradient.
    converged = False
    iter = 1
    W = np.zeros((2, max_iter))
    W[:, 0] = x

    e = 1e-8
    b1 = 0.9
    b2 = 0.995
    Report = []                   # squared gradient norm per iteration

    m = 0
    v = 0
    t = 0

    while not converged:

        grad = jacobian(x)

        t += 1

        m = b1*m + (1 - b1)*grad
        v = b2*v + (1 - b2)*(grad**2)

        mt = m/(1 - b1**t)        # bias-corrected first moment
        vt = v/(1 - b2**t)        # bias-corrected second moment

        x = x - alpha*mt/(np.sqrt(vt) + e)

        W[:, iter] = x

        J = np.sum(jacobian(x)**2)
        Report.append(J)

        if abs(J) <= ep:
            print('Converged using Adam, iterations: ', iter)
            print('Approximated Optimum:', x)
            converged = True

        iter += 1

        if iter == max_iter:
            print('Max iterations exceeded!')
            converged = True

    W = W[:, 0:iter]
    return x, Report, W

def newton(alpha, x0, ep):
    # Damped Newton's method; the step direction solves H p = -grad.
    imax = 1000
    W = np.zeros((2, imax))
    i = 1
    W[:, 0] = x0
    x = x0
    delta = 1
    Report = []                   # squared step length per iteration

    while i < imax and delta > ep:
        p = -np.linalg.solve(hessian(x), jacobian(x))
        x0 = x
        x = x + alpha*p
        W[:, i] = x
        delta = np.sum((x - x0)**2)
        Report.append(delta)
        i = i + 1
    print('Converged using Newton, Iterations:', i)
    print('Approximated Optimum:', x)
    W = W[:, 0:i]
    return x, Report, W

#a = np.array([-0.5, 2])
a = np.array([-1.2, -1])
x11, r1, w1 = gradient_descent(0.001, a, 0.001, 20000)
x22, r2, w2 = adam(0.1, a, 0.001, 20000)
x33, r3, w3 = newton(0.1, a, 0.00001)

X1 = np.arange(-1.5, 1.5 + 0.05, 0.05)
X2 = np.arange(-3.5, 5 + 0.05, 0.05)
[x1, x2] = np.meshgrid(X1, X2)

f = 100*(x2 - x1**2)**2 + (1 - x1)**2

plt.figure(figsize=(8, 6))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Steepest Descent (6585 iterations)')
plt.contour(x1, x2, f, 30)
plt.plot(w1[0, :], w1[1, :], 'b*', w1[0, :], w1[1, :], 'r')
plt.savefig("3.jpg")
plt.show()

plt.figure(figsize=(8, 6))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Adam (965 iterations)')
plt.contour(x1, x2, f, 30)
plt.plot(w2[0, :], w2[1, :], 'b*', w2[0, :], w2[1, :], 'r')
plt.savefig("4.jpg")
plt.show()

plt.figure(figsize=(8, 6))
plt.contour(x1, x2, f, 30)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Newton (146 iterations)')
plt.plot(w3[0, :], w3[1, :], 'b*', w3[0, :], w3[1, :], 'r')
plt.savefig("5.jpg")
plt.show()

plt.figure(figsize=(8, 6))
plt.contour(x1, x2, f, 30)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization on Rosenbrock')
plt.plot(w1[0, :], w1[1, :], 'r', label='Steepest Descent')
plt.plot(w2[0, :], w2[1, :], 'g', label='Adam')
plt.plot(w3[0, :], w3[1, :], 'b', label='Newton')
plt.legend()
plt.savefig("6.jpg")
plt.show()

# Hybrid strategy: run Adam first for fast initial progress, then refine the
# result with steepest descent starting from Adam's solution.
new = np.array([-0.5, 2])
#new = np.array([-1.2, -1])

x_adam, r_adam, w22 = adam(0.1, new, 0.001, 900)
x_com, r_gd, w11 = gradient_descent(0.001, x_adam, 0.001, 20000)

plt.figure(figsize=(8, 6))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Steepest Descent + Adam')
plt.contour(x1, x2, f, 30)
plt.plot(w22[0, :], w22[1, :], 'b', w11[0, :], w11[1, :], 'r')
plt.savefig("11.jpg")   # saved under its own name so figure 3 is not overwritten
plt.show()
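
# A minimal sketch of an additional convergence plot (not one of the numbered
# figures above): the first three Rosenbrock runs already return per-iteration
# histories (r1, r2 hold the squared gradient norm; r3 holds Newton's squared
# step length), so plotting them on a log scale is an easy way to compare
# convergence speed directly.
plt.figure(figsize=(8, 6))
plt.semilogy(r1, 'r', label='Steepest Descent (squared gradient norm)')
plt.semilogy(r2, 'g', label='Adam (squared gradient norm)')
plt.semilogy(r3, 'b', label='Newton (squared step length)')
plt.xlabel('iteration')
plt.ylabel('convergence measure (log scale)')
plt.title('Convergence histories on the Rosenbrock function')
plt.legend()
plt.show()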
--------------------------------------------------------------------------------
/Optimization Methods in Deep Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vince-CV/optimization-in-deep-learning/91a33d95ad9a384b26bb7d193bc354b04148e581/Optimization Methods in Deep Learning.pdf

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Convex-Optimizer-in-Deep-Learning


Author's notice:

DL has developed thanks to a variety of techniques such as layer design, gradient updates, initialization, non-linearities, and normalization, but optimization problems in DL are usually non-convex.

GD: uses the full batch of the dataset and is more likely to get stuck in local minima (cost functions in DL are rarely that flat). It also requires a lot of computation per update.

SGD: uses a single data point. This strategy introduces larger noise, which helps training jump out of saddle points. The method tends to be slow, and performance oscillates dramatically.

Mini-batch GD: uses a subset of the dataset. It is usually not obvious how to pick an optimal batch size: too large a batch is limited by GPU memory, while too small a batch is limited by training time. (A toy sketch contrasting these three strategies is appended below.)

1st order: SGD, Adagrad, Adam, ... Batch sizes are typically around the hundreds.

2nd order: conjugate gradient, Newton, quasi-Newton, L-BFGS, ... Errors skyrocket if the first-order (gradient) estimates are noticeably wrong. Even though a smaller batch can speed up convergence, it also introduces a lot of noise, which adversely affects performance, so batch sizes are typically in the thousands or more for 2nd-order optimization methods.

On the other hand, smaller batches are not that stable, so I always increased the learning rate to close this gap; otherwise the model could fail to converge because of severe oscillation.

Experience from my DL book:
1. slightly increase the batch size step by step;
2. use SGD near the end of training;
3. use a small batch to introduce noise at the earlier stage, and a large batch to get rid of oscillation later.
--------------------------------------------------------------------------------
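
A minimal sketch contrasting the three batch strategies discussed in the README (full-batch GD, SGD, and mini-batch GD) on a toy least-squares problem. The data, model, learning rate, and batch sizes below are illustrative assumptions, not part of the repository code:

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy linear-regression data: y = 3x + 1 + noise (illustrative assumption).
X = rng.uniform(-1, 1, size=(1000, 1))
y = 3*X[:, 0] + 1 + 0.1*rng.standard_normal(1000)
A = np.hstack([X, np.ones((1000, 1))])      # design matrix with a bias column

def grad(w, idx):
    # Least-squares gradient estimated only on the rows selected by idx.
    Ab, yb = A[idx], y[idx]
    return 2*Ab.T @ (Ab @ w - yb) / len(idx)

def train(batch_size, lr=0.1, epochs=50):
    # Same update rule for all three strategies; only the batch size changes.
    w = np.zeros(2)
    n = len(y)
    for _ in range(epochs):
        order = rng.permutation(n)
        for start in range(0, n, batch_size):
            w -= lr * grad(w, order[start:start + batch_size])
    return w

print("full batch :", train(batch_size=1000))  # GD: one smooth step per epoch
print("single item:", train(batch_size=1))     # SGD: many noisy, cheap updates
print("mini-batch :", train(batch_size=32))    # compromise between the two
```

With the same learning rate, the full-batch run takes few but smooth steps per epoch, the single-sample run takes many noisy steps, and the mini-batch run sits in between, which is the trade-off the README describes.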