├── Convex optimizations.py
├── Optimization Methods in Deep Learning.pdf
└── README.md

--------------------------------------------------------------------------------
/Convex optimizations.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Created on Wed Mar 28 18:01:15 2018

@author: Xunzhe Wen
"""

import math
import numpy as np
import matplotlib.pyplot as plt


def f(x):
    # Objective: y = x^3 + x^2 - 2x - 10
    return x**3 + x**2 - 2*x - 10

def derivative_f(x):
    # Analytic derivative: y' = 3x^2 + 2x - 2
    return 3*(x**2) + 2*x - 2

def GD(x, learning_rate):
    # Plain gradient descent on the 1-D objective.
    gradient = 0.0
    trace = []

    for i in range(1000000):
        # Stop once the gradient magnitude falls into the (1e-5, 1e-4) band.
        if 0.00001 < abs(gradient) < 0.0001:
            print("Gradient Descent converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)
            x = x - learning_rate*gradient
            trace.append(x)
            y = f(x)

    return x, y, trace

def Adagrad(x, learning_rate):
    # Adagrad-style update; note that the accumulated squared gradient is
    # averaged over the iteration count here (a variant of standard Adagrad).
    gradient = 0
    e = 1e-8
    sum_sq = 0.0
    trace = []

    for i in range(100000):
        if 0.00001 < abs(gradient) < 0.0001:
            print("Adagrad converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)
            sum_sq += gradient**2
            x = x - learning_rate*gradient/(math.sqrt(sum_sq/(i+1)) + e)
            trace.append(x)
            y = f(x)
    return x, y, trace

def RMSProp(x, learning_rate):
    # RMSProp: exponential moving average of squared gradients.
    gradient = 0
    e = 1e-8
    d = 0.9      # decay rate
    Egt = 0      # running average of squared gradients
    trace = []

    for i in range(100000):
        if 0.00001 < abs(gradient) < 0.0001:
            print("RMSProp converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)
            Egt = d*Egt + (1 - d)*(gradient**2)
            x = x - learning_rate*gradient/math.sqrt(Egt + e)
            trace.append(x)
            y = f(x)
    return x, y, trace

def Adam(x, learning_rate):
    # Adam: bias-corrected first and second moment estimates.
    gradient = 0
    e = 1e-8
    b1 = 0.9
    b2 = 0.995
    trace = []

    m = 0
    v = 0
    t = 0

    for i in range(10000):
        if 0.00001 < abs(gradient) < 0.0001:
            print("Adam converges with iterations " + str(i))
            break
        else:
            gradient = derivative_f(x)

            t = t + 1
            m = b1*m + (1 - b1)*gradient
            v = b2*v + (1 - b2)*(gradient**2)
            mt = m/(1 - b1**t)    # bias-corrected first moment
            vt = v/(1 - b2**t)    # bias-corrected second moment
            x = x - learning_rate*mt/(math.sqrt(vt) + e)
            trace.append(x)
            y = f(x)
    return x, y, trace

ini = -1.2
x1, y1, trace1 = GD(ini, 0.1)
x2, y2, trace2 = Adagrad(ini, 0.1)
x3, y3, trace3 = RMSProp(ini, 0.1)
x4, y4, trace4 = Adam(ini, 0.1)

t1 = np.array(trace1)
t2 = np.array(trace2)
t3 = np.array(trace3)
t4 = np.array(trace4)

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t1, f(t1), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Gradient Descent Optimization (3057 iterations until convergence)')
plt.grid()
plt.savefig("7.jpg")
plt.show()

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t2, f(t2), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Adagrad Optimization (3639 iterations until convergence)')
plt.grid()
plt.savefig("8.jpg")
plt.show()

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t3, f(t3), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('RMSProp Optimization (1774 iterations until convergence)')
plt.grid()
plt.savefig("9.jpg")
plt.show()

plt.figure(figsize=(12, 6))
x = np.linspace(-2, 2, 100000)
plt.text(-2, -3, r'$y=x^3+x^2-2x-10$', fontsize=15)
plt.plot(x, f(x))
plt.plot(t4, f(t4), 'r*')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Adam Optimization (2253 iterations until convergence)')
plt.grid()
plt.savefig("10.jpg")
plt.show()

print(x1, y1)
print(x2, y2)
print(x3, y3)
print(x4, y4)


# Second experiment: 2-D optimization on the Rosenbrock function.
import numpy as np
import matplotlib.pyplot as plt


def rosenbrock(x):
    # Rosenbrock function; global minimum at (1, 1) with value 0.
    return 100*(x[1] - x[0]**2)**2 + (1 - x[0])**2

def jacobian(x):
    # Gradient of the Rosenbrock function.
    return np.array([-400*x[0]*(x[1] - x[0]**2) - 2*(1 - x[0]),
                     200*(x[1] - x[0]**2)])

def hessian(x):
    # Hessian of the Rosenbrock function.
    return np.array([[-400*(x[1] - 3*x[0]**2) + 2, -400*x[0]],
                     [-400*x[0], 200]])

def gradient_descent(alpha, x0, ep, max_iter):
    # Steepest descent with a fixed step size alpha; stops when the squared
    # gradient norm drops below ep or max_iter is reached.
    W = np.zeros((2, max_iter))   # history of iterates
    W[:, 0] = x0
    i = 1
    x = x0
    Report = []                   # squared gradient norm per iteration
    grad = jacobian(x)
    delta = np.sum(grad**2)

    while i < max_iter and delta > ep:
        p = -jacobian(x)          # descent direction
        x = x + alpha*p
        W[:, i] = x
        grad = jacobian(x)
        delta = np.sum(grad**2)
        Report.append(delta)
        i = i + 1

    print('Converged using GD, iterations: ', i)
    print('Approximated Optimum:', x)

    W = W[:, 0:i]
    return x, Report, W

def adam(alpha, x, ep, max_iter):
    # Adam applied to the Rosenbrock gradient.
    converged = False
    iter = 1
    W = np.zeros((2, max_iter))
    W[:, 0] = x

    e = 1e-8
    b1 = 0.9
    b2 = 0.995
    Report = []                   # squared gradient norm per iteration

    m = 0
    v = 0
    t = 0

    while not converged:

        grad = jacobian(x)

        t += 1

        m = b1*m + (1 - b1)*grad
        v = b2*v + (1 - b2)*(grad**2)

        mt = m/(1 - b1**t)        # bias-corrected first moment
        vt = v/(1 - b2**t)        # bias-corrected second moment

        x = x - alpha*mt/(np.sqrt(vt) + e)

        W[:, iter] = x

        J = np.sum(jacobian(x)**2)
        Report.append(J)

        if abs(J) <= ep:
            print('Converged using Adam, iterations: ', iter)
            print('Approximated Optimum:', x)
            converged = True

        iter += 1

        if iter == max_iter:
            print('Max iterations exceeded!')
            converged = True

    W = W[:, 0:iter]
    return x, Report, W

def newton(alpha, x0, ep):
    # Damped Newton's method; the step direction solves H p = -grad.
    imax = 1000
    W = np.zeros((2, imax))
    i = 1
    W[:, 0] = x0
    x = x0
    delta = 1
    Report = []                   # squared step length per iteration

    while i < imax and delta > ep:
        p = -np.linalg.solve(hessian(x), jacobian(x))
        x0 = x
        x = x + alpha*p
        W[:, i] = x
        delta = np.sum((x - x0)**2)
        Report.append(delta)
        i = i + 1
    print('Converged using Newton, Iterations:', i)
    print('Approximated Optimum:', x)
    W = W[:, 0:i]
    return x, Report, W

#a = np.array([-0.5, 2])
a = np.array([-1.2, -1])
x11, r1, w1 = gradient_descent(0.001, a, 0.001, 20000)
x22, r2, w2 = adam(0.1, a, 0.001, 20000)
x33, r3, w3 = newton(0.1, a, 0.00001)

X1 = np.arange(-1.5, 1.5 + 0.05, 0.05)
X2 = np.arange(-3.5, 5 + 0.05, 0.05)
[x1, x2] = np.meshgrid(X1, X2)

f = 100*(x2 - x1**2)**2 + (1 - x1)**2

plt.figure(figsize=(8, 6))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Steepest Descent (6585 iterations)')
plt.contour(x1, x2, f, 30)
plt.plot(w1[0, :], w1[1, :], 'b*', w1[0, :], w1[1, :], 'r')
plt.savefig("3.jpg")
plt.show()

plt.figure(figsize=(8, 6))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Adam (965 iterations)')
plt.contour(x1, x2, f, 30)
plt.plot(w2[0, :], w2[1, :], 'b*', w2[0, :], w2[1, :], 'r')
plt.savefig("4.jpg")
plt.show()

plt.figure(figsize=(8, 6))
plt.contour(x1, x2, f, 30)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Newton (146 iterations)')
plt.plot(w3[0, :], w3[1, :], 'b*', w3[0, :], w3[1, :], 'r')
plt.savefig("5.jpg")
plt.show()

plt.figure(figsize=(8, 6))
plt.contour(x1, x2, f, 30)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization on Rosenbrock')
plt.plot(w1[0, :], w1[1, :], 'r', label='Steepest Descent')
plt.plot(w2[0, :], w2[1, :], 'g', label='Adam')
plt.plot(w3[0, :], w3[1, :], 'b', label='Newton')
plt.legend()
plt.savefig("6.jpg")
plt.show()

# Hybrid strategy: run Adam first for fast initial progress, then refine the
# result with steepest descent starting from Adam's solution.
new = np.array([-0.5, 2])
#new = np.array([-1.2, -1])

x_adam, r_adam, w22 = adam(0.1, new, 0.001, 900)
x_com, r_gd, w11 = gradient_descent(0.001, x_adam, 0.001, 20000)

plt.figure(figsize=(8, 6))
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Optimization using Steepest Descent + Adam')
plt.contour(x1, x2, f, 30)
plt.plot(w22[0, :], w22[1, :], 'b', w11[0, :], w11[1, :], 'r')
plt.savefig("11.jpg")   # saved under its own name so figure 3 is not overwritten
plt.show()
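
# A minimal sketch of an additional convergence plot (not one of the numbered
# figures above): the first three Rosenbrock runs already return per-iteration
# histories (r1, r2 hold the squared gradient norm; r3 holds Newton's squared
# step length), so plotting them on a log scale is an easy way to compare
# convergence speed directly.
plt.figure(figsize=(8, 6))
plt.semilogy(r1, 'r', label='Steepest Descent (squared gradient norm)')
plt.semilogy(r2, 'g', label='Adam (squared gradient norm)')
plt.semilogy(r3, 'b', label='Newton (squared step length)')
plt.xlabel('iteration')
plt.ylabel('convergence measure (log scale)')
plt.title('Convergence histories on the Rosenbrock function')
plt.legend()
plt.show()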
--------------------------------------------------------------------------------
/Optimization Methods in Deep Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vince-CV/optimization-in-deep-learning/91a33d95ad9a384b26bb7d193bc354b04148e581/Optimization Methods in Deep Learning.pdf

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Convex-Optimizer-in-Deep-Learning


Author's notice:

DL has developed thanks to a variety of techniques such as layer design, gradient updates, initialization, non-linearities, and normalization, but optimization problems in DL are usually non-convex.

GD: uses the full batch of the dataset and is more likely to get stuck in local minima (cost functions in DL are rarely that flat). It also requires a lot of computation per update.

SGD: uses a single data point. This strategy introduces larger noise, which helps training jump out of saddle points. The method tends to be slow, and performance oscillates dramatically.

Mini-batch GD: uses a subset of the dataset. It is usually not obvious how to pick an optimal batch size: too large a batch is limited by GPU memory, while too small a batch is limited by training time. (A toy sketch contrasting these three strategies is appended below.)

1st order: SGD, Adagrad, Adam, ... Batch sizes are typically around the hundreds.

2nd order: conjugate gradient, Newton, quasi-Newton, L-BFGS, ... Errors skyrocket if the first-order (gradient) estimates are noticeably wrong. Even though a smaller batch can speed up convergence, it also introduces a lot of noise, which adversely affects performance, so batch sizes are typically in the thousands or more for 2nd-order optimization methods.

On the other hand, smaller batches are not that stable, so I always increased the learning rate to close this gap; otherwise the model could fail to converge because of severe oscillation.

Experience from my DL book:
1. slightly increase the batch size step by step;
2. use SGD near the end of training;
3. use a small batch to introduce noise at the earlier stage, and a large batch to get rid of oscillation later.
--------------------------------------------------------------------------------
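
A minimal sketch contrasting the three batch strategies discussed in the README (full-batch GD, SGD, and mini-batch GD) on a toy least-squares problem. The data, model, learning rate, and batch sizes below are illustrative assumptions, not part of the repository code:

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy linear-regression data: y = 3x + 1 + noise (illustrative assumption).
X = rng.uniform(-1, 1, size=(1000, 1))
y = 3*X[:, 0] + 1 + 0.1*rng.standard_normal(1000)
A = np.hstack([X, np.ones((1000, 1))])      # design matrix with a bias column

def grad(w, idx):
    # Least-squares gradient estimated only on the rows selected by idx.
    Ab, yb = A[idx], y[idx]
    return 2*Ab.T @ (Ab @ w - yb) / len(idx)

def train(batch_size, lr=0.1, epochs=50):
    # Same update rule for all three strategies; only the batch size changes.
    w = np.zeros(2)
    n = len(y)
    for _ in range(epochs):
        order = rng.permutation(n)
        for start in range(0, n, batch_size):
            w -= lr * grad(w, order[start:start + batch_size])
    return w

print("full batch :", train(batch_size=1000))  # GD: one smooth step per epoch
print("single item:", train(batch_size=1))     # SGD: many noisy, cheap updates
print("mini-batch :", train(batch_size=32))    # compromise between the two
```

With the same learning rate, the full-batch run takes few but smooth steps per epoch, the single-sample run takes many noisy steps, and the mini-batch run sits in between, which is the trade-off the README describes.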