├── .gitignore ├── Lab1_Basics ├── Fig │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab1_Basics.ipynb ├── algorithms.py ├── plotLib.py ├── problem1.py ├── problem2.py ├── problem3.py ├── problem4.py └── problem5.py ├── Lab2_GradientDescent ├── Fig │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab2_GradientDescent.ipynb ├── algorithms.py ├── plotLib.py ├── problem1.py ├── problem2.py ├── problem3.py ├── problem4.py └── problem5.py ├── Lab3_ProjectedGradient ├── Fig │ ├── .DS_Store │ ├── ._.DS_Store │ ├── ._1.png │ ├── ._2.png │ ├── ._3.png │ ├── ._4.png │ ├── ._5.png │ ├── ._UGA.png │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab3_ProjectedGradient.ipynb ├── algoProjGradient.py ├── plotLib.py ├── problem1.py ├── problem2.py ├── problem3.py ├── problem4.py └── problem5.py ├── Lab4_Prox ├── Fig │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab4_Proximal_algorithms.ipynb ├── logistic_regression_student.py ├── student.npz └── student.txt ├── Lab5_MachineLearningExample ├── Fig │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab5_OptimForML.ipynb ├── algoGradient.py ├── ionosphere.data └── logistic_regression_ionosphere.py ├── Lab6_LPQP ├── Fig │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab6_LP_and_QP.ipynb └── toy_problem.py ├── Lab7_StochasticMethods ├── Fig │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab7_StochMethods.ipynb ├── algoProx.py ├── logistic_regression_student.py ├── plotLib.py ├── student.npz └── student.txt ├── Lab8_MinMax ├── Fig │ └── UGA.png └── Lab8_Two-player zero-sum games.ipynb ├── Lab9_Uzawa ├── Fig │ ├── ._1.png │ ├── ._2.png │ ├── ._3.png │ ├── ._4.png │ ├── ._5.png │ ├── ._UGA.png │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── UGA.png ├── Lab9_constrainedOptimization.ipynb └── plotLib.py ├── README.md ├── Tuto1_Basics ├── harder.png ├── poly.png ├── rosenbrock.png ├── simple.png ├── tuto1.pdf ├── tuto1.tex └── two_pits.png ├── Tuto4_Prox ├── tuto4.pdf └── tuto4.tex ├── Tuto5_Rates ├── tuto5.pdf └── tuto5.tex └── Tuto6_LPQP ├── tuto6.pdf └── tuto6.tex /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | *.fdb_latexmk 20 | *.synctex.gz 21 | 22 | # Notebooks 23 | __pycache__ 24 | .ipynb_checkpoints 25 | 26 | -------------------------------------------------------------------------------- /Lab1_Basics/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/1.png -------------------------------------------------------------------------------- /Lab1_Basics/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/2.png -------------------------------------------------------------------------------- /Lab1_Basics/Fig/3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/3.png -------------------------------------------------------------------------------- /Lab1_Basics/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/4.png -------------------------------------------------------------------------------- /Lab1_Basics/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/5.png -------------------------------------------------------------------------------- /Lab1_Basics/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/UGA.png -------------------------------------------------------------------------------- /Lab1_Basics/algorithms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Gradient-based algorithms 5 | # 6 | # In this file, we code our gradient-based optimization algorithms. 7 | 8 | ################################# 9 | # # 1. Gradient algorithms 10 | ################################## 11 | # 12 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 13 | # * the function to minimize `f` 14 | # * a 1st order oracle `f_grad` (see `problem1.ipynb` for instance) 15 | # * an initialization point `x0` 16 | # * the sought precision `PREC` 17 | # * a maximal number of iterations `ITE_MAX` 18 | # 19 | # 20 | # these algorithms perform iterations of the form 21 | # $$ x_{k+1} = x_k - \gamma_k \nabla f(x_k) $$ 22 | # where $\gamma_k$ is a stepsize to choose. 23 | 24 | # ### 1.a. Constant stepsize gradient algorithm 25 | # 26 | # First, we consider the case where the stepsize is fixed over iterations and passed an argument `step` to the algorithm. 
27 | 28 | 29 | 30 | import numpy as np 31 | import timeit 32 | 33 | def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 34 | x = np.copy(x0) 35 | stop = PREC*np.linalg.norm(f_grad(x0) ) 36 | 37 | x_tab = np.copy(x) 38 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 39 | t_s = timeit.default_timer() 40 | for k in range(ITE_MAX): 41 | g = f_grad(x) 42 | x = x - step*g ####### ITERATION 43 | 44 | x_tab = np.vstack((x_tab,x)) 45 | 46 | if np.linalg.norm(g) < stop: 47 | break 48 | t_e = timeit.default_timer() 49 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 50 | return x,x_tab 51 | 52 | # # 1.b Newton algorithm 53 | # 54 | # For minimizing a *twice* differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 55 | # * the function to minimize `f` 56 | # * a 2nd order oracle `f_grad_hessian` (see `problem1.ipynb` for instance) 57 | # * an initialization point `x0` 58 | # * the sought precision `PREC` 59 | # * a maximal number of iterations `ITE_MAX` 60 | # 61 | # 62 | # these algorithms perform iterations of the form 63 | # $$ x_{k+1} = x_k - [\nabla^2 f(x_k) ]^{-1} \nabla f(x_k) .$$ 64 | 65 | 66 | # Q: Complete the code for the Newton iterations 67 | 68 | import numpy as np 69 | import timeit 70 | 71 | def newton_algorithm(f , f_grad_hessian , x0 , PREC , ITE_MAX ): 72 | x = np.copy(x0) 73 | g0,H0 = f_grad_hessian(x0) 74 | stop = PREC*np.linalg.norm(g0 ) 75 | 76 | x_tab = np.copy(x) 77 | print("------------------------------------\nNewton's algorithm\n------------------------------------\nSTART") 78 | t_s = timeit.default_timer() 79 | for k in range(ITE_MAX): 80 | 81 | g, H = f_grad_hessian(x) 82 | 83 | ### COMPLETE 84 | 85 | x_tab = np.vstack((x_tab,x)) 86 | 87 | if np.linalg.norm(g) < stop: 88 | break 89 | t_e = timeit.default_timer() 90 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 91 | return x,x_tab 92 | 93 | ################################# 94 | # # 2. 
More involved functions 95 | ################################## 96 | 97 | # # 2.b Adaptive stepsize gradient algorithm 98 | 99 | # Q: Complete the adaptive gradient below using your intuition 100 | 101 | 102 | import numpy as np 103 | import timeit 104 | 105 | def gradient_adaptive_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 106 | x = np.copy(x0) 107 | stop = PREC*np.linalg.norm(f_grad(x0) ) 108 | 109 | x_tab = np.copy(x) 110 | print("------------------------------------\nAdaptative Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 111 | t_s = timeit.default_timer() 112 | for k in range(ITE_MAX): 113 | 114 | g = f_grad(x) 115 | x_prev = np.copy(x) 116 | 117 | x = x - step*g ####### ITERATION 118 | 119 | ### COMPLETE 120 | 121 | x_tab = np.vstack((x_tab,x)) 122 | 123 | if np.linalg.norm(g) < stop: 124 | break 125 | t_e = timeit.default_timer() 126 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 127 | return x,x_tab 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /Lab1_Basics/plotLib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import Axes3D 6 | import time 7 | from IPython import display 8 | 9 | 10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ): 11 | 12 | def f_no_vector(x1,x2): 13 | return f( np.array( [x1,x2] ) ) 14 | 15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 16 | z = f_no_vector(x,y) 17 | 18 | fig = plt.figure() 19 | # Old syntax 20 | # ax = fig.gca(projection='3d') 21 | ax = fig.add_subplot(projection='3d') 22 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max) 23 | ax.set_zlim(v_min, v_max) 24 | plt.show() 25 | 26 | 27 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 28 | 29 | 30 | def f_no_vector(x1,x2): 31 | return f( np.array( [x1,x2] ) ) 32 | 33 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 34 | z = f_no_vector(x,y) 35 | 36 | fig = plt.figure() 37 | graphe = plt.contour(x,y,z,levels) 38 | #plt.plot(3,1,'r*',markersize=15) 39 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 40 | plt.title(title) 41 | plt.show() 42 | 43 | 44 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 45 | 46 | def f_no_vector(x1,x2): 47 | return f( np.array( [x1,x2] ) ) 48 | 49 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 50 | z = f_no_vector(x,y) 51 | 52 | fig = plt.figure() 53 | graphe = plt.contour(x,y,z,levels) 54 | #plt.plot(3,1,'r*',markersize=15) 55 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 56 | plt.title(title) 57 | 58 | delay = 4.0/x_tab.shape[0] 59 | for k in range(x_tab.shape[0]): 60 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 61 | plt.xlim([x1_min,x1_max]) 62 | plt.ylim([x2_min,x2_max]) 63 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 64 | plt.draw() 65 | display.clear_output(wait=True) 66 | display.display(fig) 67 | time.sleep(delay) 68 | display.clear_output() 69 | plt.show() 70 | 71 | 72 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 73 | 74 
| 75 | def f_no_vector(x1,x2): 76 | return f( np.array( [x1,x2] ) ) 77 | 78 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 79 | z = f_no_vector(x,y) 80 | 81 | fig = plt.figure() 82 | graphe = plt.contour(x,y,z,levels) 83 | #plt.plot(3,1,'r*',markersize=15) 84 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 85 | plt.xlim([x1_min,x1_max]) 86 | plt.ylim([x2_min,x2_max]) 87 | plt.title(title) 88 | 89 | delay = 4.0/x_tab.shape[0] 90 | for k in range(x_tab.shape[0]): 91 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 92 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 93 | plt.draw() 94 | #plt.pause(delay) 95 | 96 | delay = 4.0/x_tab2.shape[0] 97 | for k in range(x_tab2.shape[0]): 98 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8) 99 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1])) 100 | #plt.pause(delay) 101 | plt.draw() 102 | 103 | plt.show() 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /Lab1_Basics/problem1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 1 5 | # 6 | # 7 | # The objective of Problem 1 is to minimize a simple quadratic function $f$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # f: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2 12 | # \end{array}$$ 13 | 14 | 15 | 16 | ##### Function definition 17 | def f(x): 18 | x1 = x[0] 19 | x2 = x[1] 20 | return 4*(x1-3)**2+2*(x2-1)**2 21 | #### 22 | 23 | ##### Plot parameters f 24 | x1_min = -0.5 25 | x1_max = 5.5 26 | x2_min = -0.5 27 | x2_max = 5.5 28 | nb_points = 200 29 | vmin = 0 30 | vmax = 80 31 | levels = [0.5,1,2,5,10,15] 32 | title = 'f: a simple function' 33 | #### 34 | 35 | 36 | # ### Some parameters 37 | # 38 | # Before solving things numerically, some useful things can be computed: 39 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 40 | # * Good starting points (for hot starting e.g.) 41 | 42 | 43 | 44 | ###### Useful Parameters 45 | L = 8 # Lipschitz constant of the gradient 46 | 47 | 48 | # ### Oracles 49 | # 50 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 51 | 52 | 53 | 54 | # Q: Observe the first order oracle `f_grad`. 55 | 56 | 57 | 58 | import numpy as np 59 | 60 | ##### Gradient oracle 61 | def f_grad(x): 62 | x1 = x[0] 63 | x2 = x[1] 64 | gx = 8*(x1-3) 65 | gy = 4*(x2-1) 66 | return np.array( [ gx , gy ] ) 67 | #### 68 | 69 | 70 | # Q: Observe the second order oracle `f_grad_hessian`. 
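# -- Added illustration (not part of the original lab): before moving on to the
# second order oracle below, the first order oracle `f_grad` defined above can be
# sanity-checked against a finite-difference approximation. A minimal sketch,
# assuming SciPy is available; the test point and the tolerance are arbitrary choices.

if __name__ == "__main__":
    from scipy.optimize import check_grad
    x_test = np.array([1.0, 2.0])
    err = check_grad(f, f_grad, x_test)   # norm of (analytic - finite-difference) gradient
    print("Gradient check at {}: error = {:.2e}".format(x_test, err))
    assert err < 1e-4, "f_grad seems inconsistent with f"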
71 | 72 | 73 | import numpy as np 74 | 75 | ##### Hessian scaled Gradient computation 76 | def f_grad_hessian(x): 77 | x1 = x[0] 78 | x2 = x[1] 79 | gx = 8*(x1-3) 80 | gy = 4*(x2-1) 81 | g = np.array( [ gx , gy ] ) 82 | H = np.array( [ (8.0 , 0.0 ) , ( 0.0 , 4.0 ) ] ) 83 | return g,H 84 | #### 85 | 86 | -------------------------------------------------------------------------------- /Lab1_Basics/problem2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 2 5 | # 6 | # 7 | # The objective of Problem 2 is to minimize a more involved but very smooth function function $g$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # g: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3) 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | ##### Function definition 18 | def f(x): 19 | x1 = x[0] 20 | x2 = x[1] 21 | return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3) 22 | #### 23 | 24 | ##### Plot parameters f 25 | x1_min = -0.5 26 | x1_max = 5.5 27 | x2_min = -0.5 28 | x2_max = 5.5 29 | nb_points = 500 30 | vmin = 0 31 | vmax = 100 32 | levels = [0.5,1,2,5,10,15] 33 | title = 'a Harder function: g' 34 | #### 35 | 36 | 37 | 38 | ###### Useful Parameters 39 | L = 8 # Lipschitz constant of the gradient 40 | 41 | 42 | # ### Oracles 43 | 44 | 45 | # Q: Complete the first order oracle `f_grad`. 46 | 47 | 48 | 49 | import numpy as np 50 | 51 | ##### Gradient oracle 52 | def f_grad(x): 53 | x1 = x[0] 54 | x2 = x[1] 55 | gx = 0 ## To complete 56 | gy = 0 ## To complete 57 | return np.array( [ gx , gy ] ) 58 | #### 59 | 60 | 61 | # Q: Fill the following second order oracle `f_grad_hessian`. 62 | 63 | 64 | import numpy as np 65 | 66 | ##### Hessian scaled Gradient computation 67 | def f_grad_hessian(x): 68 | 69 | return g,H 70 | #### 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Lab1_Basics/problem3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 3 5 | # 6 | # 7 | # The objective of Problem 3 is to minimize non-convex smooth Rosenbrock function $r$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # r: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | ##### Function definition 18 | def f(x): 19 | """Rosenbrock.""" 20 | x1 = x[0] 21 | x2 = x[1] 22 | return (1-x1)**2+100*(x2-x1**2)**2 23 | #### 24 | 25 | ##### Plot parameters f 26 | x1_min = -1.5 27 | x1_max = 1.55 28 | x2_min = -0.2 29 | x2_max = 1.5 30 | nb_points = 200 31 | vmin = 0 32 | vmax = 120 33 | levels = [0.05,1,5,15,50,100,200] 34 | title = 'Rosenbrock function' 35 | #### 36 | 37 | 38 | 39 | 40 | 41 | ### Oracles 42 | 43 | 44 | # Q: Complete the first order oracle `f_grad`. 45 | 46 | 47 | 48 | 49 | import numpy as np 50 | 51 | ##### Gradient oracle 52 | def f_grad(x): 53 | 54 | return 0.0 ### To complete 55 | #### 56 | 57 | 58 | # Q: Fill the following second order oracle `f_grad_hessian`. 
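# -- Added illustration (a possible completion, for reference only): for the
# Rosenbrock function r(x1,x2) = (1-x1)^2 + 100 (x2 - x1^2)^2, direct
# differentiation gives the oracles sketched below. The names `f_grad_example`
# and `f_grad_hessian_example` are ours, so as not to overwrite the blanks
# that are left to be filled.

import numpy as np

def f_grad_example(x):
    x1 = x[0]
    x2 = x[1]
    gx = -2.0*(1.0 - x1) - 400.0*x1*(x2 - x1**2)   # d r / d x1
    gy = 200.0*(x2 - x1**2)                        # d r / d x2
    return np.array([gx, gy])

def f_grad_hessian_example(x):
    g = f_grad_example(x)
    x1 = x[0]
    x2 = x[1]
    H = np.array([[1200.0*x1**2 - 400.0*x2 + 2.0, -400.0*x1],
                  [-400.0*x1,                      200.0]])
    return g, H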
59 | 60 | 61 | import numpy as np 62 | 63 | ##### Hessian scaled Gradient computation 64 | def f_grad_hessian(x): 65 | 66 | return g,H ### To complete 67 | #### 68 | 69 | -------------------------------------------------------------------------------- /Lab1_Basics/problem4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 4 5 | # 6 | # 7 | # The objective of Problem 4 is to minimize a non-convex function $t$ with two minimizers on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # t: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | 18 | ##### Function definition 19 | def f(x): 20 | x1 = x[0] 21 | x2 = x[1] 22 | return (0.6*x1 + 0.2*x2)**2 * ((0.6*x1 + 0.2*x2)**2 - 4*(0.6*x1 + 0.2*x2)+4) + (-0.2*x1 + 0.6*x2)**2 23 | #### 24 | 25 | ##### Plot parameters f 26 | x1_min = -1 27 | x1_max = 4 28 | x2_min = -1 29 | x2_max = 4 30 | nb_points = 200 31 | levels = [0.05,0.5,1,2,5] 32 | vmin = 0 33 | vmax = 5 34 | title = 'two pits' 35 | #### 36 | 37 | 38 | 39 | 40 | 41 | ###### Useful Parameters 42 | L = 8 # Lipschitz constant of the gradient 43 | 44 | 45 | ### Oracles 46 | 47 | # Q: Complete the first order oracle `f_grad`. 48 | 49 | 50 | 51 | 52 | import numpy as np 53 | 54 | ##### Gradient oracle 55 | def f_grad(x): 56 | 57 | return ### To complete 58 | 59 | 60 | # Q: Does a second order oracle exist for any point? 61 | 62 | 63 | -------------------------------------------------------------------------------- /Lab1_Basics/problem5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 5 5 | # 6 | # 7 | # The objective of Problem 5 is to minimize a polyhedral function $p$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # p: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| . 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | ##### Function definition 18 | def f(x): 19 | x1 = x[0] 20 | x2 = x[1] 21 | return np.abs(x1-3)+2*np.abs(x2-1) 22 | #### 23 | 24 | ##### Plot parameters f 25 | x1_min = -0.5 26 | x1_max = 5.5 27 | x2_min = -0.5 28 | x2_max = 5.5 29 | nb_points = 200 30 | levels = [0.05,0.5,1,2,5] 31 | vmin = 0 32 | vmax = 5 33 | title = 'polyhedral' 34 | #### 35 | 36 | 37 | ### Oracles 38 | 39 | 40 | # Q: Compute a first order oracle `f_grad`. Is it unique? 41 | 42 | 43 | 44 | import numpy as np 45 | 46 | ##### Gradient oracle 47 | def f_grad(x): 48 | 49 | return g ### To complete 50 | #### 51 | 52 | 53 | # Q: What about a second order oracle? 
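# -- Added illustration (a hedged sketch, not the official answer): p is not
# differentiable on the lines x1 = 3 and x2 = 1, so no classical gradient or
# Hessian exists there; wherever p is differentiable, its Hessian is the zero
# matrix. A subgradient oracle can nevertheless be written with np.sign
# (np.sign(0) = 0 is a valid element of the subdifferential [-1,1] at a kink).
# The name `f_subgrad_example` is ours.

import numpy as np

def f_subgrad_example(x):
    x1 = x[0]
    x2 = x[1]
    return np.array([np.sign(x1 - 3.0), 2.0*np.sign(x2 - 1.0)])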
54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/1.png -------------------------------------------------------------------------------- /Lab2_GradientDescent/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/2.png -------------------------------------------------------------------------------- /Lab2_GradientDescent/Fig/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/3.png -------------------------------------------------------------------------------- /Lab2_GradientDescent/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/4.png -------------------------------------------------------------------------------- /Lab2_GradientDescent/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/5.png -------------------------------------------------------------------------------- /Lab2_GradientDescent/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/UGA.png -------------------------------------------------------------------------------- /Lab2_GradientDescent/Lab2_GradientDescent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "

Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year

\n", 9 | "
\n", 10 | "

Optimization

\n", 11 | "

Lab 2: Gradient algorithm

" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n", 19 | "---\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "%load_ext autoreload\n", 29 | "%autoreload 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "---" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# 1. Line-search\n", 44 | "\n", 45 | "In the previous Lab, we saw that it can be difficult to choose a satisfying stepsize.\n", 46 | "\n", 47 | "An option to choose a satisfying stepsize $\\gamma$ is to test different stepsizes by calling succesively the function oracles. Wolfe's line-search is implemented in `Scipy`'s `scipy.optimize.line_search`. \n", 48 | "\n", 49 | "\n", 50 | "**Wolfe's line-search.** Let $x$ be the current point, $d$ a descent direction, and $q(\\gamma)=f(x+\\gamma d)$.Wolfe's line-search consists in deciding that \n", 51 | "* $\\gamma$ is *satisfying* if $q(\\gamma)\\leq q(0)+m_1 \\gamma q'(0)$ and $q'(\\gamma)\\geq m_2 q'(0)$;\n", 52 | "* $\\gamma$ is *too big* if $q(\\gamma) > q(0)+m_1 \\gamma q'(0)$;\n", 53 | "* $\\gamma$ is *too small* if $q(\\gamma)\\leq q(0)+m_1 \\gamma q'(0)$ and $q'(\\gamma) Complete the function `gradient_Wolfe` in `algorithms.py`.
\n", 68 | "> Compare the convergence of this gradient with other gradient methods." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "source": [ 77 | "---\n", 78 | "### 1a. Comparing constant stepsize gradient algorithm and Wolfe search on Problem 1\n", 79 | "\n", 80 | "> Print the stepsizes chosen by line search and compare with theoretical ones." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from algorithms import * # import all methods of the module into the current environment\n", 90 | "\n", 91 | "import numpy as np\n", 92 | "import problem1 as pb1\n", 93 | "\n", 94 | "\n", 95 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 96 | "PREC = 0.01 # Sought precision\n", 97 | "ITE_MAX = 20 # Max number of iterations\n", 98 | "x0 = np.array( (0.0,0.0 ) ) # Initial point\n", 99 | "step = 0.1\n", 100 | "\n", 101 | "##### gradient algorithm\n", 102 | "x,x_tab = gradient_algorithm(pb1.f , pb1.f_grad , x0 , step , PREC , ITE_MAX )\n", 103 | "\n", 104 | "##### Wolfe line-search algorithm\n", 105 | "xW,xW_tab = gradient_Wolfe(pb1.f , pb1.f_grad , x0 , PREC , ITE_MAX )\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "###### Plots" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "from plotLib import *\n", 122 | "%matplotlib inline\n", 123 | "\n", 124 | "##### comparison\n", 125 | "level_2points_plot( pb1.f , x_tab , xW_tab , pb1.x1_min, pb1.x1_max, pb1.x2_min, pb1.x2_max, pb1.nb_points, pb1.levels , pb1.title )" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "source": [ 134 | "---\n", 135 | "### 1b. 
Comparing constant stepsize gradient algorithm and Wolfe search on Problem 2\n", 136 | "\n", 137 | "> Try different starting points and observe the results of line search.\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from algorithms import * # import all methods of the module into the current environment\n", 147 | "\n", 148 | "import numpy as np\n", 149 | "import problem2 as pb2\n", 150 | "\n", 151 | "\n", 152 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 153 | "PREC = 0.01 # Sought precision\n", 154 | "ITE_MAX = 20 # Max number of iterations\n", 155 | "x0 = np.array( (1.5,1.5 ) ) # Initial point\n", 156 | "step = 0.1\n", 157 | "\n", 158 | "##### gradient algorithm\n", 159 | "x,x_tab = gradient_algorithm(pb2.f , pb2.f_grad , x0 , step , PREC , ITE_MAX )\n", 160 | "\n", 161 | "##### Wolfe line-search algorithm\n", 162 | "xW,xW_tab = gradient_Wolfe(pb2.f , pb2.f_grad , x0 , PREC , ITE_MAX )\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "###### Plots" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "from plotLib import *\n", 179 | "%matplotlib inline\n", 180 | "\n", 181 | "##### comparison\n", 182 | "level_2points_plot( pb2.f , x_tab , xW_tab , pb2.x1_min, pb2.x1_max, pb2.x2_min, pb2.x2_max, pb2.nb_points, pb2.levels , pb2.title )" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "---\n", 190 | "### 1c. Comparing constant stepsize gradient algorithm and Wolfe search on Problem 3\n", 191 | "\n", 192 | "> Compare the convergence of the gradient with and without line search. Keeping in mind that Newton method takes around $30$ iterations to converge, what is the biggest problem for minimizing such function, the stepsize or the descent direction?\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "scrolled": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "from algorithms import * # import all methods of the module into the current environment\n", 204 | "\n", 205 | "import numpy as np\n", 206 | "import problem3 as pb3\n", 207 | "\n", 208 | "\n", 209 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 210 | "PREC = 0.0001 # Sought precision\n", 211 | "ITE_MAX = 10000 # Max number of iterations\n", 212 | "x0 = np.array( (-1.0,1.2 ) ) # Initial point\n", 213 | "step = 0.001\n", 214 | "\n", 215 | "##### gradient algorithm\n", 216 | "x,x_tab = gradient_algorithm(pb3.f , pb3.f_grad , x0 , step , PREC , ITE_MAX )\n", 217 | "\n", 218 | "##### Wolfe line-search algorithm\n", 219 | "xW,xW_tab = gradient_Wolfe(pb3.f , pb3.f_grad , x0 , PREC , ITE_MAX )\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "###### Plots" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from plotLib import *\n", 236 | "%matplotlib inline\n", 237 | "\n", 238 | "##### comparison\n", 239 | "level_2points_plot( pb3.f , x_tab , xW_tab , pb3.x1_min, pb3.x1_max, pb3.x2_min, pb3.x2_max, pb3.nb_points, pb3.levels , pb3.title )" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "---\n", 247 | "### 1d. 
Comparing constant stepsize gradient algorithm and Wolfe search on Problem 5\n", 248 | "\n", 249 | "> Try different starting points $(0,0)$ , $(0,1)$, $(1,0)$, $(0.2,0.4)$.\n" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "scrolled": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "from algorithms import * # import all methods of the module into the current environment\n", 261 | "\n", 262 | "import numpy as np\n", 263 | "import problem5 as pb5\n", 264 | "\n", 265 | "\n", 266 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 267 | "PREC = 0.001 # Sought precision\n", 268 | "ITE_MAX = 100 # Max number of iterations\n", 269 | "x0 = np.array( (0.,0. ) ) # Initial point\n", 270 | "step = 0.1\n", 271 | "\n", 272 | "##### gradient algorithm\n", 273 | "x,x_tab = gradient_algorithm(pb5.f , pb5.f_grad , x0 , step , PREC , ITE_MAX )\n", 274 | "\n", 275 | "##### Wolfe line-search algorithm\n", 276 | "xW,xW_tab = gradient_Wolfe(pb5.f , pb5.f_grad , x0 , PREC , ITE_MAX )\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "###### Plots" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from plotLib import *\n", 293 | "%matplotlib inline\n", 294 | "\n", 295 | "##### comparison\n", 296 | "level_2points_plot( pb5.f , x_tab , xW_tab , pb5.x1_min, pb5.x1_max, pb5.x2_min, pb5.x2_max, pb5.nb_points, pb5.levels , pb5.title )" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "# 2. Quasi Newton\n", 304 | "\n", 305 | "Now that we have a proper way of choosing a good stepsize, we see that the opposite of the gradient is not always a good descent direction. We saw in the previous Lab that Newton method was sometimes computationally expensive. 
In this section, we investigate a method to choose descent directions based on the approximation of the inverse Hessian.\n", 306 | "\n", 307 | "For a differentiable function $f$, Quasi-Newton methods iteratively construct an approximation $W_k$ of the inverse of the Hessian then use descent direction $-W_k\\nabla f(x_k)$.\n", 308 | "\n", 309 | "**BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consist in performing the following iteration\n", 310 | "$$ x_{k+1}=x_k - \\gamma_k W_k \\nabla f(x_k)$$\n", 311 | "where $\\gamma_k$ is given by Wolfe's line-search and positive definite matrix $W_k$ is computed as\n", 312 | "$$ W_{k+1}=W_k - \\frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\\left[1+\\frac{y_k^T W_k y_k}{y_k^T s_k}\\right]\\frac{s_k s_k^T}{y_k^T s_k} $$\n", 313 | "with $s_k=x_{k+1}-x_{k}$ and $y_k=\\nabla f(x_{k+1}) - \\nabla f(x_{k})$.\n", 314 | "\n", 315 | "The general scheme is then:\n", 316 | "* from initial point $x_0$, and initial positive definite matrix $W_0$;\n", 317 | "* from gradient $\\nabla f(x_k)$, compute direction $d_k=-W_k \\nabla f(x_k)$;\n", 318 | "* compute stepsize $\\gamma_k$ by Wolfe's line-search;\n", 319 | "* from new point $x_{k+1}$, call the function oracle and compute $W_{k+1}$.\n", 320 | "\n", 321 | "> Implement BFGS method in `algorithms.py`.\n", 322 | "\n", 323 | "*Hint: Use fonction `np.outer(a,b)` to compute $ab^T$.*\n" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "---\n", 331 | "### Comparing constant stepsize gradient algorithm and Wolfe search on Problem 3\n", 332 | "\n", 333 | "> Compare the convergence of the gradient with line search and BFGS; then Newton vs BFGS\n" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from algorithms import * # import all methods of the module into the current environment\n", 343 | "\n", 344 | "import numpy as np\n", 345 | "import problem3 as pb3\n", 346 | "\n", 347 | "\n", 348 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 349 | "PREC = 1e-4 # Sought precision\n", 350 | "ITE_MAX = 10000 # Max number of iterations\n", 351 | "x0 = np.array( (-1.0,1.2 ) ) # Initial point\n", 352 | "\n", 353 | "##### Wolfe line-search algorithm\n", 354 | "xW,xW_tab = gradient_Wolfe(pb3.f , pb3.f_grad , x0 , PREC , ITE_MAX )\n", 355 | "\n", 356 | "##### Newton algorithm\n", 357 | "xN,xN_tab = newton_algorithm(pb3.f , pb3.f_grad_hessian , x0 , PREC , ITE_MAX )\n", 358 | "\n", 359 | "##### BFGS algorithm\n", 360 | "xB,xB_tab = bfgs(pb3.f , pb3.f_grad , x0 , PREC , ITE_MAX )" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "###### Plots\n", 368 | "\n", 369 | "* Gradient with line search vs BFGS" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "from plotLib import *\n", 379 | "%matplotlib inline\n", 380 | "\n", 381 | "##### comparison\n", 382 | "level_2points_plot( pb3.f , xW_tab , xB_tab , pb3.x1_min, pb3.x1_max, pb3.x2_min, pb3.x2_max, pb3.nb_points, pb3.levels , pb3.title )" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "* Newton vs BFGS" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "scrolled": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "from 
plotLib import *\n", 401 | "%matplotlib inline\n", 402 | "\n", 403 | "##### comparison\n", 404 | "level_2points_plot( pb3.f , xN_tab , xB_tab , pb3.x1_min, pb3.x1_max, pb3.x2_min, pb3.x2_max, pb3.nb_points, pb3.levels , pb3.title )" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "# Appendix: Problems\n", 412 | "\n", 413 | "The problems we consider in this first lab are minimizations of unconstrained continous functions. " 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "\n", 421 | "> **1.** `problem1` features a simple quadratic function\n", 422 | "$$\\begin{array}{rrcll}\n", 423 | "f: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n", 424 | "& (x_1,x_2) & \\mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2\n", 425 | "\\end{array}$$\n", 426 | "
\n", 427 | "\n", 428 | "\n", 429 | "> **2.** `problem2` features a more involved but very smooth function\n", 430 | "$$\\begin{array}{rrcll}\n", 431 | "g: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n", 432 | "& (x_1,x_2) & \\mapsto & \\log( 1 + \\exp(4 (x_1-3)^2 ) + \\exp( 2(x_2-1)^2 ) ) - \\log(3)\n", 433 | "\\end{array}$$\n", 434 | "
\n", 435 | "\n", 436 | "\n", 437 | "> **3.** `problem3` features Rosenbrock's smooth but non-convex function\n", 438 | "$$\\begin{array}{rrcll}\n", 439 | "r: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n", 440 | "& (x_1,x_2) & \\mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2\n", 441 | "\\end{array}$$\n", 442 | "
\n", 443 | "\n", 444 | "\n", 445 | "> **4.** `problem4` features a smooth function with two distinct minimizers\n", 446 | "$$\\begin{array}{rrcll}\n", 447 | "t: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n", 448 | "& (x_1,x_2) & \\mapsto & (0.6 x_1 + 0.2 x_2)^2 \\left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\\right) + (-0.2 x_1 + 0.6 x_2)^2\n", 449 | "\\end{array}$$\n", 450 | "
\n", 451 | "\n", 452 | "\n", 453 | "> **5.** `problem5` features a polyhedral function\n", 454 | "$$\\begin{array}{rrcll}\n", 455 | "p: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n", 456 | "& (x_1,x_2) & \\mapsto & \\left| x_1-3 \\right| + 2\\left| x_2-1\\right| .\n", 457 | "\\end{array}$$\n", 458 | "
\n", 459 | "\n" 460 | ] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python 3", 466 | "language": "python", 467 | "name": "python3" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.7.5" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 1 484 | } 485 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/algorithms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Gradient-based algorithms 5 | # 6 | # In this notebook, we code our gradient-based optimization algorithms. 7 | 8 | ################################# 9 | # # 1. Gradient algorithms 10 | ################################## 11 | # 12 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 13 | # * the function to minimize `f` 14 | # * a 1st order oracle `f_grad` (see `problem1.ipynb` for instance) 15 | # * an initialization point `x0` 16 | # * the sought precision `PREC` 17 | # * a maximal number of iterations `ITE_MAX` 18 | # 19 | # 20 | # these algorithms perform iterations of the form 21 | # $$ x_{k+1} = x_k - \gamma_k \nabla f(x_k) $$ 22 | # where $\gamma_k$ is a stepsize to choose. 23 | 24 | # ### 1.a. Constant stepsize gradient algorithm 25 | # 26 | # First, we consider the case where the stepsize is fixed over iterations and passed an argument `step` to the algorithm. 27 | 28 | 29 | 30 | import numpy as np 31 | import timeit 32 | 33 | def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 34 | x = np.copy(x0) 35 | stop = PREC*np.linalg.norm(f_grad(x0) ) 36 | 37 | x_tab = np.copy(x) 38 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 39 | t_s = timeit.default_timer() 40 | for k in range(ITE_MAX): 41 | g = f_grad(x) 42 | x = x - step*g ####### ITERATION 43 | 44 | x_tab = np.vstack((x_tab,x)) 45 | 46 | if np.linalg.norm(g) < stop: 47 | break 48 | t_e = timeit.default_timer() 49 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 50 | return x,x_tab 51 | 52 | 53 | # ### 1.b. Adaptive stepsize gradient algorithm 54 | # 55 | 56 | # Q: Complete the adaptive gradient below using your intuition 57 | 58 | 59 | import numpy as np 60 | import timeit 61 | 62 | def gradient_adaptive_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 63 | x = np.copy(x0) 64 | stop = PREC*np.linalg.norm(f_grad(x0) ) 65 | 66 | x_tab = np.copy(x) 67 | print("------------------------------------\nAdaptative Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 68 | t_s = timeit.default_timer() 69 | for k in range(ITE_MAX): 70 | 71 | g = f_grad(x) 72 | x_prev = np.copy(x) 73 | 74 | x = x - step*g ####### ITERATION 75 | 76 | ### COMPLETE 77 | 78 | x_tab = np.vstack((x_tab,x)) 79 | 80 | if np.linalg.norm(g) < stop: 81 | break 82 | t_e = timeit.default_timer() 83 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 84 | return x,x_tab 85 | 86 | 87 | # ### 1.c. 
Wolfe Line search 88 | 89 | 90 | # Q: Complete the function below accordingly. 91 | 92 | 93 | 94 | import numpy as np 95 | import timeit 96 | from scipy.optimize import line_search 97 | 98 | def gradient_Wolfe(f , f_grad , x0 , PREC , ITE_MAX ): 99 | x = np.copy(x0) 100 | g = f_grad(x0) 101 | stop = PREC*np.linalg.norm( g ) 102 | 103 | x_tab = np.copy(x) 104 | print("------------------------------------\n Gradient with Wolfe line search\n------------------------------------\nSTART") 105 | t_s = timeit.default_timer() 106 | for k in range(ITE_MAX): 107 | 108 | ########### TO FILL 109 | 110 | x = x ###### ITERATION 111 | 112 | x_tab = np.vstack((x_tab,x)) 113 | 114 | if np.linalg.norm(g) < stop: 115 | break 116 | t_e = timeit.default_timer() 117 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 118 | return x,x_tab 119 | 120 | 121 | # # 2. Second Order algorithms 122 | # 123 | # For minimizing a *twice* differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 124 | # * the function to minimize `f` 125 | # * a 2nd order oracle `f_grad_hessian` (see `problem1.ipynb` for instance) 126 | # * an initialization point `x0` 127 | # * the sought precision `PREC` 128 | # * a maximal number of iterations `ITE_MAX` 129 | # 130 | # 131 | # these algorithms perform iterations of the form 132 | # $$ x_{k+1} = x_k - [\nabla^2 f(x_k) ]^{-1} \nabla f(x_k) .$$ 133 | 134 | 135 | 136 | import numpy as np 137 | import timeit 138 | 139 | def newton_algorithm(f , f_grad_hessian , x0 , PREC , ITE_MAX ): 140 | x = np.copy(x0) 141 | g0,H0 = f_grad_hessian(x0) 142 | stop = PREC*np.linalg.norm(g0 ) 143 | 144 | x_tab = np.copy(x) 145 | print("------------------------------------\nNewton's algorithm\n------------------------------------\nSTART") 146 | t_s = timeit.default_timer() 147 | for k in range(ITE_MAX): 148 | 149 | g,H = f_grad_hessian(x) 150 | x = x - np.linalg.solve(H,g) ####### ITERATION 151 | 152 | x_tab = np.vstack((x_tab,x)) 153 | 154 | if np.linalg.norm(g) < stop: 155 | break 156 | t_e = timeit.default_timer() 157 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 158 | return x,x_tab 159 | 160 | 161 | # # 3. Quasi Newton algorithms 162 | # 163 | # **BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consist in performing the following iteration 164 | # $$ x_{k+1}=x_k - \gamma_k W_k \nabla f(x_k)$$ 165 | # where $\gamma_k$ is given by Wolfe's line-search and positive definite matrix $W_k$ is computed as 166 | # $$ W_{k+1}=W_k - \frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\left[1+\frac{y_k^T W_k y_k}{y_k^T s_k}\right]\frac{s_k s_k^T}{y_k^T s_k} $$ 167 | # with $s_k=x_{k+1}-x_{k}$ and $y_k=\nabla f(x_{k+1}) - \nabla f(x_{k})$. 168 | 169 | 170 | # Q: Fill the function below accordingly. 
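# -- Added illustration (a hedged reference sketch, not the official solution):
# one way to fill the loop of the `bfgs` skeleton below is to take the direction
# d = -W g, obtain a Wolfe stepsize from scipy.optimize.line_search, and then
# apply the rank-two update of W written above. The same line_search call, with
# direction d = -g, is also what completes `gradient_Wolfe` earlier in this file.
# The name `bfgs_example` is ours; the fallback stepsize used when line_search
# fails to return a value is an arbitrary safeguard.

import numpy as np
from scipy.optimize import line_search

def bfgs_example(f, f_grad, x0, PREC, ITE_MAX):
    x = np.copy(x0)
    g = f_grad(x)
    stop = PREC*np.linalg.norm(g)
    W = np.eye(x0.size)                       # initial inverse-Hessian approximation
    x_tab = np.copy(x)
    for k in range(ITE_MAX):
        d = -W.dot(g)                         # quasi-Newton descent direction
        gamma = line_search(f, f_grad, x, d, gfk=g)[0]
        if gamma is None:                     # line search failed: take a small safe step
            gamma = 1e-4
        x_new = x + gamma*d
        g_new = f_grad(x_new)
        s = x_new - x                         # s_k
        y = g_new - g                         # y_k
        ys = float(y.dot(s))
        if abs(ys) > 1e-12:                   # skip the update if curvature is degenerate
            W = W - (np.outer(s, y.dot(W)) + np.outer(W.dot(y), s))/ys \
                  + (1.0 + y.dot(W).dot(y)/ys)*np.outer(s, s)/ys
        x, g = x_new, g_new
        x_tab = np.vstack((x_tab, x))
        if np.linalg.norm(g) < stop:
            break
    return x, x_tab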
171 | 172 | 173 | import numpy as np 174 | import timeit 175 | from scipy.optimize import line_search 176 | 177 | def bfgs(f , f_grad , x0 , PREC , ITE_MAX ): 178 | x = np.copy(x0) 179 | n = x0.size 180 | g = f_grad(x0) 181 | sim_eval = 1 182 | stop = PREC*np.linalg.norm( g ) 183 | 184 | W = np.eye(n) 185 | 186 | x_tab = np.copy(x) 187 | print("------------------------------------\n BFGS\n------------------------------------\nSTART") 188 | t_s = timeit.default_timer() 189 | for k in range(ITE_MAX): 190 | 191 | ########### TO FILL 192 | 193 | x = x ###### ITERATION 194 | 195 | x_tab = np.vstack((x_tab,x)) 196 | 197 | if np.linalg.norm(g) < stop: 198 | break 199 | 200 | t_e = timeit.default_timer() 201 | print("FINISHED -- {:d} iterations / {:.6f}s ({:d} sim. calls) -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,sim_eval,f(x),x[0],x[1])) 202 | return x,x_tab 203 | 204 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/plotLib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import Axes3D 6 | import time 7 | from IPython import display 8 | 9 | 10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ): 11 | 12 | def f_no_vector(x1,x2): 13 | return f( np.array( [x1,x2] ) ) 14 | 15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 16 | z = f_no_vector(x,y) 17 | 18 | fig = plt.figure() 19 | ax = fig.gca(projection='3d') 20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max) 21 | ax.set_zlim(v_min, v_max) 22 | plt.show() 23 | 24 | 25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 26 | 27 | 28 | def f_no_vector(x1,x2): 29 | return f( np.array( [x1,x2] ) ) 30 | 31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 32 | z = f_no_vector(x,y) 33 | 34 | fig = plt.figure() 35 | graphe = plt.contour(x,y,z,levels) 36 | #plt.plot(3,1,'r*',markersize=15) 37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 38 | plt.title(title) 39 | plt.show() 40 | 41 | 42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 43 | 44 | def f_no_vector(x1,x2): 45 | return f( np.array( [x1,x2] ) ) 46 | 47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 48 | z = f_no_vector(x,y) 49 | 50 | fig = plt.figure() 51 | graphe = plt.contour(x,y,z,levels) 52 | #plt.plot(3,1,'r*',markersize=15) 53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 54 | plt.title(title) 55 | 56 | if x_tab.shape[0] > 40: 57 | sub = int(x_tab.shape[0]/40.0) 58 | x_tab = x_tab[::sub] 59 | 60 | delay = 2.0/x_tab.shape[0] 61 | for k in range(x_tab.shape[0]): 62 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 63 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 64 | plt.draw() 65 | display.clear_output(wait=True) 66 | display.display(fig) 67 | time.sleep(delay) 68 | display.clear_output() 69 | plt.show() 70 | 71 | 72 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 73 | 74 | 75 | def f_no_vector(x1,x2): 76 | return f( np.array( [x1,x2] ) ) 77 | 78 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 79 | z = f_no_vector(x,y) 80 | 81 | fig = plt.figure() 
82 | graphe = plt.contour(x,y,z,levels) 83 | #plt.plot(3,1,'r*',markersize=15) 84 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 85 | plt.xlim([x1_min,x1_max]) 86 | plt.ylim([x2_min,x2_max]) 87 | plt.title(title) 88 | 89 | if x_tab.shape[0] > 40: 90 | sub = int(x_tab.shape[0]/40.0) 91 | x_tab = x_tab[::sub] 92 | 93 | if x_tab2.shape[0] > 40: 94 | sub = int(x_tab2.shape[0]/40.0) 95 | x_tab2 = x_tab2[::sub] 96 | 97 | delay = 4.0/x_tab.shape[0] 98 | for k in range(x_tab.shape[0]): 99 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 100 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 101 | plt.draw() 102 | #plt.pause(delay) 103 | 104 | delay = 4.0/x_tab2.shape[0] 105 | for k in range(x_tab2.shape[0]): 106 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8) 107 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1])) 108 | #plt.pause(delay) 109 | plt.draw() 110 | 111 | plt.show() 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/problem1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 1 5 | # 6 | # 7 | # The objective of Problem 1 is to minimize a simple quadratic function $f$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # f: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2 12 | # \end{array}$$ 13 | 14 | 15 | 16 | ##### Function definition 17 | def f(x): 18 | x1 = x[0] 19 | x2 = x[1] 20 | return 4*(x1-3)**2+2*(x2-1)**2 21 | #### 22 | 23 | ##### Plot parameters f 24 | x1_min = -0.5 25 | x1_max = 5.5 26 | x2_min = -0.5 27 | x2_max = 5.5 28 | nb_points = 200 29 | vmin = 0 30 | vmax = 80 31 | levels = [0.5,1,2,5,10,15] 32 | title = 'f: a simple function' 33 | #### 34 | 35 | 36 | # ### Some parameters 37 | # 38 | # Before solving things numerically, some useful things can be computed: 39 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 40 | # * Good starting points (for hot starting e.g.) 41 | 42 | 43 | 44 | ###### Useful Parameters 45 | L = 8 # Lipschitz constant of the gradient 46 | 47 | 48 | # ### Oracles 49 | # 50 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 51 | 52 | 53 | 54 | # Q: Observe the first order oracle `f_grad`. 55 | 56 | 57 | 58 | import numpy as np 59 | 60 | ##### Gradient oracle 61 | def f_grad(x): 62 | x1 = x[0] 63 | x2 = x[1] 64 | gx = 8*(x1-3) 65 | gy = 4*(x2-1) 66 | return np.array( [ gx , gy ] ) 67 | #### 68 | 69 | 70 | # Q: Observe the second order oracle `f_grad_hessian`. 
71 | 72 | 73 | import numpy as np 74 | 75 | ##### Hessian scaled Gradient computation 76 | def f_grad_hessian(x): 77 | x1 = x[0] 78 | x2 = x[1] 79 | gx = 8*(x1-3) 80 | gy = 4*(x2-1) 81 | g = np.array( [ gx , gy ] ) 82 | H = np.array( [ (8.0 , 0.0 ) , ( 0.0 , 4.0 ) ] ) 83 | return g,H 84 | #### 85 | 86 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/problem2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 2 5 | # 6 | # 7 | # The objective of Problem 2 is to minimize a more involved but very smooth function function $g$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # g: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3) 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | ##### Function definition 18 | def f(x): 19 | x1 = x[0] 20 | x2 = x[1] 21 | return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3) 22 | #### 23 | 24 | ##### Plot parameters f 25 | x1_min = -0.5 26 | x1_max = 5.5 27 | x2_min = -0.5 28 | x2_max = 5.5 29 | nb_points = 500 30 | vmin = 0 31 | vmax = 100 32 | levels = [0.5,1,2,5,10,15] 33 | title = 'a Harder function: g' 34 | #### 35 | 36 | 37 | 38 | ###### Useful Parameters 39 | L = 8 # Lipschitz constant of the gradient 40 | 41 | 42 | # ### Oracles 43 | 44 | 45 | # Q: Complete the first order oracle `f_grad`. 46 | 47 | 48 | 49 | import numpy as np 50 | 51 | ##### Gradient oracle 52 | def f_grad(x): 53 | x1 = x[0] 54 | x2 = x[1] 55 | gx = 0 ## To complete 56 | gy = 0 ## To complete 57 | return np.array( [ gx , gy ] ) 58 | #### 59 | 60 | 61 | # Q: Fill the following second order oracle `f_grad_hessian`. 62 | 63 | 64 | import numpy as np 65 | 66 | ##### Hessian scaled Gradient computation 67 | def f_grad_hessian(x): 68 | 69 | return g,H 70 | #### 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/problem3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 3 5 | # 6 | # 7 | # The objective of Problem 3 is to minimize non-convex smooth Rosenbrock function $r$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # r: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | ##### Function definition 18 | def f(x): 19 | """Rosenbrock.""" 20 | x1 = x[0] 21 | x2 = x[1] 22 | return (1-x1)**2+100*(x2-x1**2)**2 23 | #### 24 | 25 | ##### Plot parameters f 26 | x1_min = -1.5 27 | x1_max = 1.55 28 | x2_min = -0.2 29 | x2_max = 1.5 30 | nb_points = 200 31 | vmin = 0 32 | vmax = 120 33 | levels = [0.05,1,5,15,50,100,200] 34 | title = 'Rosenbrock function' 35 | #### 36 | 37 | 38 | 39 | 40 | 41 | ### Oracles 42 | 43 | 44 | # Q: Complete the first order oracle `f_grad`. 45 | 46 | 47 | 48 | 49 | import numpy as np 50 | 51 | ##### Gradient oracle 52 | def f_grad(x): 53 | 54 | return 0.0 ### To complete 55 | #### 56 | 57 | 58 | # Q: Fill the following second order oracle `f_grad_hessian`. 
59 | 60 | 61 | import numpy as np 62 | 63 | ##### Hessian scaled Gradient computation 64 | def f_grad_hessian(x): 65 | 66 | return g,H ### To complete 67 | #### 68 | 69 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/problem4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 4 5 | # 6 | # 7 | # The objective of Problem 4 is to minimize a non-convex function $t$ with two minimizers on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # t: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | 18 | ##### Function definition 19 | def f(x): 20 | x1 = x[0] 21 | x2 = x[1] 22 | return (0.6*x1 + 0.2*x2)**2 * ((0.6*x1 + 0.2*x2)**2 - 4*(0.6*x1 + 0.2*x2)+4) + (-0.2*x1 + 0.6*x2)**2 23 | #### 24 | 25 | ##### Plot parameters f 26 | x1_min = -1 27 | x1_max = 4 28 | x2_min = -1 29 | x2_max = 4 30 | nb_points = 200 31 | levels = [0.05,0.5,1,2,5] 32 | vmin = 0 33 | vmax = 5 34 | title = 'two pits' 35 | #### 36 | 37 | 38 | 39 | 40 | 41 | ###### Useful Parameters 42 | L = 8 # Lipschitz constant of the gradient 43 | 44 | 45 | ### Oracles 46 | 47 | # Q: Complete the first order oracle `f_grad`. 48 | 49 | 50 | 51 | 52 | import numpy as np 53 | 54 | ##### Gradient oracle 55 | def f_grad(x): 56 | 57 | return ### To complete 58 | 59 | 60 | # Q: Does a second order oracle exist for any point? 61 | 62 | 63 | -------------------------------------------------------------------------------- /Lab2_GradientDescent/problem5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 5 5 | # 6 | # 7 | # The objective of Problem 5 is to minimize a polyhedral function $p$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # p: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| . 12 | # \end{array}$$ 13 | 14 | 15 | 16 | 17 | ##### Function definition 18 | def f(x): 19 | x1 = x[0] 20 | x2 = x[1] 21 | return np.abs(x1-3)+2*np.abs(x2-1) 22 | #### 23 | 24 | ##### Plot parameters f 25 | x1_min = -0.5 26 | x1_max = 5.5 27 | x2_min = -0.5 28 | x2_max = 5.5 29 | nb_points = 200 30 | levels = [0.05,0.5,1,2,5] 31 | vmin = 0 32 | vmax = 5 33 | title = 'polyhedral' 34 | #### 35 | 36 | 37 | ### Oracles 38 | 39 | 40 | # Q: Compute a first order oracle `f_grad`. Is it unique? 41 | 42 | 43 | 44 | import numpy as np 45 | 46 | ##### Gradient oracle 47 | def f_grad(x): 48 | 49 | return g ### To complete 50 | #### 51 | 52 | 53 | # Q: What about a second order oracle? 
54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/.DS_Store -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._.DS_Store -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._1.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._2.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._3.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._4.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._5.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/._UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._UGA.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/1.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/2.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/3.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/4.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/5.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/UGA.png -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/algoProjGradient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Projected Gradient-based algorithms 5 | # 6 | # In this notebook, we code our Projected gradient-based optimization algorithms. 7 | # We consider here 8 | # * Positivity constraints 9 | # * Interval constraints 10 | 11 | # # 1. Projected Gradient algorithms (for positivity or interval constraints) 12 | # 13 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 14 | # * the function to minimize `f` 15 | # * a 1st order oracle `f_grad` (see `problem1.ipynb` for instance) 16 | # * an initialization point `x0` 17 | # * the sought precision `PREC` 18 | # * a maximal number of iterations `ITE_MAX` 19 | # 20 | # 21 | # these algorithms perform iterations of the form 22 | # $$ x_{k+1} = P\left(x_k - \gamma_k \nabla f(x_k)\right) $$ 23 | # where $\gamma_k$ is a stepsize to choose and $P$ is the projector onto the convex constraint set. We only consider positivity and interval constraints. 24 | 25 | # ### 1.a. Constant stepsize projected gradient algorithm for positivity constraints 26 | # 27 | # First, we consider the case where the stepsize is fixed over iterations and passed an argument `step` to the algorithm. 28 | 29 | 30 | # Q. Fill the function below accordingly. 31 | 32 | 33 | 34 | import numpy as np 35 | import timeit 36 | 37 | def positivity_gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 38 | x = np.copy(x0) 39 | g = f_grad(x) # we initialize both x and f_grad(x) 40 | stop = PREC*np.linalg.norm(g) 41 | 42 | epsilon = PREC*np.ones_like(x0) 43 | 44 | x_tab = np.copy(x) 45 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 46 | t_s = timeit.default_timer() 47 | for k in range(ITE_MAX): 48 | 49 | x = x ####### ITERATION --> To complete by the projection onto the set "x >= 0" 50 | 51 | ####### 52 | x_tab = np.vstack((x_tab,x)) 53 | ####### 54 | ########################################################## 55 | ####### Why must the following stopping criteria be changed ? 
Propose a correct stopping rule 56 | #if np.linalg.norm(g) < stop: 57 | # break 58 | ############################################### 59 | 60 | # To complete 61 | if ... : 62 | break 63 | 64 | t_e = timeit.default_timer() 65 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 66 | return x,x_tab 67 | 68 | 69 | # ### 1.b. Constant stepsize projected gradient algorithm for interval constraints 70 | # 71 | # First, we consider the case where the stepsize is fixed over iterations and passed an argument `step` to the algorithm. 72 | 73 | # Q. Fill the function below accordingly. Then, test you algorithm in `2_Optimization100.ipynb [Sec. 1a]` for Problem 1. 74 | 75 | 76 | 77 | import numpy as np 78 | import timeit 79 | 80 | def interval_gradient_algorithm(f , f_grad , x0 , infbound , supbound , step , PREC , ITE_MAX ): 81 | # compute the min of f with a gradient method with constant step under the constraint 82 | # borninf < x < bornesup 83 | x = np.copy(x0) 84 | g = f_grad(x) 85 | stop = PREC*np.linalg.norm(g) 86 | zero = np.zeros_like(x0) 87 | epsilon = PREC*np.ones_like(x0) 88 | 89 | x_tab = np.copy(x) 90 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 91 | t_s = timeit.default_timer() 92 | for k in range(ITE_MAX): 93 | 94 | x = x ####### ITERATION --> To complete by the projection onto the set "x >= 0" 95 | 96 | 97 | x_tab = np.vstack((x_tab,x)) 98 | 99 | ####### Why must the following stopping criteria be changed ? Propose a correct stopping rule 100 | #if np.linalg.norm(g) < stop: 101 | # break 102 | 103 | # To complete 104 | if ... : 105 | break 106 | 107 | t_e = timeit.default_timer() 108 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1])) 109 | return x,x_tab 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/plotLib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import Axes3D 6 | import time 7 | from IPython import display 8 | 9 | 10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ): 11 | 12 | def f_no_vector(x1,x2): 13 | return f( np.array( [x1,x2] ) ) 14 | 15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 16 | z = f_no_vector(x,y) 17 | 18 | fig = plt.figure() 19 | ax = fig.gca(projection='3d') 20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max) 21 | ax.set_zlim(v_min, v_max) 22 | plt.show() 23 | 24 | 25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 26 | 27 | 28 | def f_no_vector(x1,x2): 29 | return f( np.array( [x1,x2] ) ) 30 | 31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 32 | z = f_no_vector(x,y) 33 | 34 | fig = plt.figure() 35 | graphe = plt.contour(x,y,z,levels) 36 | #plt.plot(3,1,'r*',markersize=15) 37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 38 | plt.title(title) 39 | plt.show() 40 | 41 | 42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 43 | 44 | def f_no_vector(x1,x2): 45 | return f( np.array( 
[x1,x2] ) ) 46 | 47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 48 | z = f_no_vector(x,y) 49 | 50 | fig = plt.figure() 51 | graphe = plt.contour(x,y,z,levels) 52 | #plt.plot(3,1,'r*',markersize=15) 53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 54 | plt.title(title) 55 | 56 | delay = 4.0/x_tab.shape[0] 57 | for k in range(x_tab.shape[0]): 58 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 59 | plt.xlim([x1_min,x1_max]) 60 | plt.ylim([x2_min,x2_max]) 61 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 62 | plt.draw() 63 | display.clear_output(wait=True) 64 | display.display(fig) 65 | time.sleep(delay) 66 | display.clear_output() 67 | plt.show() 68 | 69 | 70 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 71 | 72 | 73 | def f_no_vector(x1,x2): 74 | return f( np.array( [x1,x2] ) ) 75 | 76 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 77 | z = f_no_vector(x,y) 78 | 79 | fig = plt.figure() 80 | graphe = plt.contour(x,y,z,levels) 81 | #plt.plot(3,1,'r*',markersize=15) 82 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 83 | plt.xlim([x1_min,x1_max]) 84 | plt.ylim([x2_min,x2_max]) 85 | plt.title(title) 86 | 87 | delay = 4.0/x_tab.shape[0] 88 | for k in range(x_tab.shape[0]): 89 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 90 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 91 | plt.draw() 92 | #plt.pause(delay) 93 | 94 | delay = 4.0/x_tab2.shape[0] 95 | for k in range(x_tab2.shape[0]): 96 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8) 97 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1])) 98 | #plt.pause(delay) 99 | plt.draw() 100 | 101 | plt.show() 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/problem1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 21 5 | # 6 | # 7 | # The objective of Problem 21 is to minimize a simple quadratic function $f$ on $\mathbb{R}^2$, constrained to $x\ge 0$: 8 | # 9 | # $$\begin{array}{rrcll} 10 | # f: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & 4 (x_1-1)^2 + 2(x_2+0.5)^2 12 | # \end{array}$$ 13 | #
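# The positivity constraint above is precisely what the projected-gradient iterations
# of `algoProjGradient.py` must handle. As a hedged sketch of the two pieces left "to
# complete" there (illustrative names only): the projection onto {x : x >= 0} is a
# componentwise maximum with 0, and the unconstrained test on the gradient norm is no
# longer a sound stopping rule because the gradient need not vanish at a constrained
# minimizer; a small displacement between successive iterates is a common substitute.

import numpy as np

def projected_gradient_step_sketch(x, g, step):
    """One projected-gradient step for the constraint x >= 0."""
    return np.maximum(x - step*g, 0.0)          # gradient step, then clip at 0

def small_displacement_stop_sketch(x_new, x_old, tol):
    """Stop when the iterates barely move (the gradient itself may stay bounded away from 0)."""
    return np.linalg.norm(x_new - x_old) < tol

# For interval constraints infbound <= x <= supbound, np.clip(x - step*g, infbound, supbound)
# plays the same role as the projection.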
14 | 15 | # ### Function definition 16 | 17 | # In[1]: 18 | 19 | 20 | ##### Function definition 21 | def f(x): 22 | x1 = x[0] 23 | x2 = x[1] 24 | return 4*(x1-1)**2+2*(x2+0.5)**2 25 | #### 26 | 27 | ##### Plot parameters f 28 | x1_min = -4. 29 | x1_max = 3. 30 | x2_min = -4. 31 | x2_max = 3. 32 | nb_points = 200 33 | vmin = 0 34 | vmax = 80 35 | levels = [0.5,1,2,5,10,15] 36 | title = 'f: a simple function' 37 | #### 38 | 39 | 40 | # ### Some parameters 41 | # 42 | # Before solving things numerically, some useful things can be computed: 43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 44 | # * Good starting points (for hot starting e.g.) 45 | 46 | # In[2]: 47 | 48 | 49 | ###### Useful Parameters 50 | L = 8 # Lipschitz constant of the gradient 51 | 52 | 53 | # ### Oracles 54 | # 55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 56 | # 57 | # > Observe the first order oracle `f_grad`. 58 | # 59 | 60 | # In[3]: 61 | 62 | 63 | import numpy as np 64 | 65 | ##### Gradient oracle 66 | def f_grad(x): 67 | x1 = x[0] 68 | x2 = x[1] 69 | gx = 8*(x1-1) 70 | gy = 4*(x2+0.5) 71 | return np.array( [ gx , gy ] ) 72 | #### 73 | 74 | 75 | # > Fill the following second order oracle `f_grad_hessian`. 76 | 77 | # In[4]: 78 | 79 | 80 | import numpy as np 81 | 82 | ##### Hessian scaled Gradient computation 83 | def f_grad_hessian(x): 84 | x1 = x[0] 85 | x2 = x[1] 86 | gx = 8*(x1-1) 87 | gy = 4*(x2+0.5) 88 | g = np.array( [ gx , gy ] ) 89 | H = np.array( [ ( 8.0 , 0 ) , ( 0 , 4.0 ) ] ) ### -> To complete DONE 90 | return g,H 91 | #### 92 | 93 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/problem2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 2 5 | # 6 | # 7 | # The objective of Problem 2 is to minimize a more involved but very smooth function function $g$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # g: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3) 12 | # \end{array}$$ 13 | #
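# A quick worked check of this formula: at (x1, x2) = (3, 1) both exponents vanish, so
# g(3, 1) = log(1 + 1 + 1) - log(3) = 0; since each exponential term is minimized there
# and log is increasing, (3, 1) is the global minimizer and the optimal value is 0.

import numpy as np
print(np.log(1 + np.exp(0.0) + np.exp(0.0)) - np.log(3))   # 0.0, the value of g at (3, 1)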
14 | 15 | # ### Function definition 16 | 17 | # In[1]: 18 | 19 | 20 | ##### Function definition 21 | def f(x): 22 | x1 = x[0] 23 | x2 = x[1] 24 | return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3) 25 | #### 26 | 27 | ##### Plot parameters f 28 | x1_min = -0.5 29 | x1_max = 5.5 30 | x2_min = -0.5 31 | x2_max = 5.5 32 | nb_points = 500 33 | vmin = 0 34 | vmax = 100 35 | levels = [0.5,1,2,5,10,15] 36 | title = 'a Harder function: g' 37 | #### 38 | 39 | 40 | # ### Some parameters 41 | # 42 | # Before solving things numerically, some useful things can be computed: 43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 44 | # * Good starting points (for hot starting e.g.) 45 | 46 | # In[2]: 47 | 48 | 49 | ###### Useful Parameters 50 | L = 8 # Lipschitz constant of the gradient 51 | 52 | 53 | # ### Oracles 54 | # 55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 56 | # 57 | # > Complete the first order oracle `f_grad`. 58 | # 59 | 60 | # In[2]: 61 | 62 | 63 | import numpy as np 64 | 65 | ##### Gradient oracle 66 | def f_grad(x): 67 | x1 = x[0] 68 | x2 = x[1] 69 | gx = 8*(x1-3)*np.exp(4*(x1-3)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete 70 | gy = 4*(x2-1)*np.exp(2*(x2-1)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete 71 | return np.array( [ gx , gy ] ) 72 | #### 73 | 74 | 75 | # > Fill the following second order oracle `f_grad_hessian`. 
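# Hand-derived oracles are easy to get wrong. A generic, hedged way to sanity-check them
# (not part of the lab material; `check_gradient_sketch` is an illustrative helper) is a
# central finite-difference comparison:

import numpy as np

def check_gradient_sketch(f, f_grad, x, h=1e-6):
    """Return the gap between the analytic gradient at x and central finite differences."""
    x = np.asarray(x, dtype=float)
    num = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e[i] = h
        num[i] = (f(x + e) - f(x - e)) / (2*h)
    return np.linalg.norm(num - f_grad(x))

# Usage idea: with the f and f_grad defined above in this file,
# check_gradient_sketch(f, f_grad, np.array([2.0, 0.5])) should be very small if f_grad is correct.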
76 | 77 | # In[1]: 78 | 79 | 80 | import numpy as np 81 | 82 | ##### Hessian scaled Gradient computation 83 | def f_grad_hessian(x): 84 | x1 = x[0] 85 | x2 = x[1] 86 | gx = 8*(x1-3)*np.exp(4*(x1-3)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete 87 | gy = 4*(x2-1)*np.exp(2*(x2-1)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete 88 | 89 | hxx = (1+ 8*(x1-3)**2)*np.exp(4*(x1-3)**2)*( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) 90 | hxx = hxx -8* ((x1-3)*np.exp(4*(x1-3)**2))**2 91 | hxx = 8 * hxx/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )**2 92 | hxy = -32*(x1-3)*(x2-1)*np.exp(4*(x1-3)**2)*np.exp(2*(x2-1)**2) 93 | hxy=hxy/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )**2 94 | ## H is symetric thus hyx=hxy 95 | hyy = (1+4*(x2-1)**2)*np.exp(2*(x2-1)**2)*( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) 96 | hyy= hyy -4* ((x2-1)*np.exp(2*(x2-1)**2))**2 97 | hyy= 4* hyy / ( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )**2 98 | 99 | g = np.array( [ gx , gy ] ) 100 | H = np.array( [ ( hxx , hxy ) , ( hxy , hyy ) ] ) ### -> To complete DONE 101 | 102 | return g,H 103 | #### 104 | 105 | 106 | # In[ ]: 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/problem3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 3 5 | # 6 | # 7 | # The objective of Problem 3 is to minimize non-convex smooth Rosenbrock function $r$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # r: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2 12 | # \end{array}$$ 13 | #
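# The Rosenbrock function is a classical stress test: it is non-convex, its unique global
# minimizer is (1, 1) (both squared terms vanish there), and it is badly conditioned around
# that minimizer, which is what makes plain gradient descent slow on it. A hedged
# illustration, using the Hessian formula completed later in this file:

import numpy as np
H_at_min = np.array([[802.0, -400.0], [-400.0, 200.0]])   # Hessian of r at (1, 1)
eigs = np.linalg.eigvalsh(H_at_min)                        # eigenvalues in ascending order
print(eigs, eigs[-1]/eigs[0])                              # condition number around 2.5e3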
14 | 15 | # ### Function definition 16 | 17 | # In[1]: 18 | 19 | 20 | ##### Function definition 21 | def f(x): 22 | """Rosenbrock.""" 23 | x1 = x[0] 24 | x2 = x[1] 25 | return (1-x1)**2+100*(x2-x1**2)**2 26 | #### 27 | 28 | ##### Plot parameters f 29 | x1_min = -1.5 30 | x1_max = 1.55 31 | x2_min = -0.2 32 | x2_max = 1.5 33 | nb_points = 200 34 | vmin = 0 35 | vmax = 120 36 | levels = [0.05,1,5,15,50,100,200] 37 | title = 'Rosenbrock function' 38 | #### 39 | 40 | 41 | # ### Some parameters 42 | # 43 | # Before solving things numerically, some useful things can be computed: 44 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 45 | # * Good starting points (for hot starting e.g.) 46 | 47 | # In[1]: 48 | 49 | 50 | ###### Useful Parameters 51 | 52 | 53 | # ### Oracles 54 | # 55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 56 | # 57 | # > Complete the first order oracle `f_grad`. 58 | # 59 | 60 | # In[1]: 61 | 62 | 63 | import numpy as np 64 | 65 | ##### Gradient oracle ##### return grandient ### To complete 66 | def f_grad(x): 67 | x1 = x[0] 68 | x2 = x[1] 69 | return np.array( ( 2*(x1-1) + 400*x1*(x1**2-x2) , 200*( x2 - x1**2) ) ) 70 | #### 71 | 72 | 73 | # > Fill the following second order oracle `f_grad_hessian`. 74 | 75 | # In[4]: 76 | 77 | 78 | import numpy as np 79 | 80 | ##### Hessian scaled Gradient computation, #### return g,H ### To complete 81 | def f_grad_hessian(x): 82 | x1 = x[0] 83 | x2 = x[1] 84 | g = np.array( [ 2*(x1-1) + 400*x1*(x1**2-x2) , 200*( x2 - x1**2) ] ) 85 | H = np.array( [ ( 2 - 400*x2 + 3*400*x1**2 , -400*x1 ) , ( -400*x1 , 200 ) ] ) 86 | return g,H 87 | #### 88 | 89 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/problem4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 4 5 | # 6 | # 7 | # The objective of Problem 4 is to minimize a non-convex function $t$ with two minimizers on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # t: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2 12 | # \end{array}$$ 13 | #
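# The expanded gradient of t completed later in this file is a long polynomial; an
# equivalent and more readable form (a hedged sketch with illustrative names) follows from
# the chain rule with u = 0.6*x1 + 0.2*x2 and v = -0.2*x1 + 0.6*x2, since
# t = u^2*(u-2)^2 + v^2 = u^4 - 4*u^3 + 4*u^2 + v^2. In particular the two "pits" are at
# (u, v) = (0, 0) and (2, 0), that is at x = (0, 0) and x = (3, 1).

import numpy as np

M = np.array([[0.6, 0.2], [-0.2, 0.6]])      # (u, v) = M @ (x1, x2)

def two_pits_grad_sketch(x):
    u, v = M.dot(x)
    dt_du = 4*u**3 - 12*u**2 + 8*u           # derivative of u^4 - 4u^3 + 4u^2
    dt_dv = 2*v
    return M.T.dot(np.array([dt_du, dt_dv])) # chain rule: grad_x t = M^T grad_{(u,v)} t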
14 | 15 | # ### Function definition 16 | 17 | # In[1]: 18 | 19 | 20 | ##### Function definition 21 | def f(x): 22 | x1 = x[0] 23 | x2 = x[1] 24 | return (0.6*x1 + 0.2*x2)**2 * ((0.6*x1 + 0.2*x2)**2 - 4*(0.6*x1 + 0.2*x2)+4) + (-0.2*x1 + 0.6*x2)**2 25 | #### 26 | 27 | ##### Plot parameters f 28 | x1_min = -1 29 | x1_max = 4 30 | x2_min = -1 31 | x2_max = 4 32 | nb_points = 200 33 | levels = [0.05,0.5,1,2,5] 34 | vmin = 0 35 | vmax = 5 36 | title = 'two pits' 37 | #### 38 | 39 | 40 | # ### Some parameters 41 | # 42 | # Before solving things numerically, some useful things can be computed: 43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 44 | # * Good starting points (for hot starting e.g.) 45 | 46 | # In[2]: 47 | 48 | 49 | ###### Useful Parameters 50 | L = 8 # Lipschitz constant of the gradient 51 | 52 | 53 | # ### Oracles 54 | # 55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 56 | # 57 | # > Complete the first order oracle `f_grad`. 58 | # 59 | 60 | # In[3]: 61 | 62 | 63 | import numpy as np 64 | 65 | ##### Gradient oracle ### To complete 66 | def f_grad(x): 67 | x1 = x[0] 68 | x2 = x[1] 69 | return np.array( ( 0.5184*x1**3+x1**2*(-2.592+0.5184*x2)+ x2*(0.72-0.288*x2+0.0192*x2**2)+ x1*(2.96-1.728*x2+0.1728*x2**2) , 0.1728*x1**3+x1**2*(-0.864+0.1728*x2)+x2*(1.04-0.096*x2+0.0064*x2**2)+x1*(0.72-0.576*x2+0.0576*x2**2) ) ) 70 | #### 71 | 72 | 73 | # > Does a second order oracle exist for any point? 74 | -------------------------------------------------------------------------------- /Lab3_ProjectedGradient/problem5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Problem 5 5 | # 6 | # 7 | # The objective of Problem 5 is to minimize a polyhedral function $p$ on $\mathbb{R}^2$ (unconstrained): 8 | # 9 | # $$\begin{array}{rrcll} 10 | # p: & \mathbb{R}^2 & \to &\mathbb{R}\\ 11 | # & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| . 12 | # \end{array}$$ 13 | #
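# p is non-differentiable on the lines x1 = 3 and x2 = 1, so a first-order "oracle" can
# only return a subgradient there (hence the uniqueness question below). A compact hedged
# sketch, equivalent to the case-by-case oracle completed later in this file (np.sign
# returns 0 at the kinks, which is a valid subgradient of |.| and of 2|.|):

import numpy as np

def polyhedral_subgrad_sketch(x):
    return np.array([np.sign(x[0] - 3.0), 2.0*np.sign(x[1] - 1.0)])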
14 | 15 | # ### Function definition 16 | 17 | # In[1]: 18 | 19 | 20 | ##### Function definition 21 | def f(x): 22 | x1 = x[0] 23 | x2 = x[1] 24 | return np.abs(x1-3)+2*np.abs(x2-1) 25 | #### 26 | 27 | ##### Plot parameters f 28 | x1_min = -0.5 29 | x1_max = 5.5 30 | x2_min = -0.5 31 | x2_max = 5.5 32 | nb_points = 200 33 | levels = [0.05,0.5,1,2,5] 34 | vmin = 0 35 | vmax = 5 36 | title = 'polyhedral' 37 | #### 38 | 39 | 40 | # ### Some parameters 41 | # 42 | # Before solving things numerically, some useful things can be computed: 43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc 44 | # * Good starting points (for hot starting e.g.) 45 | 46 | # In[2]: 47 | 48 | 49 | ###### Useful Parameters 50 | 51 | 52 | # ### Oracles 53 | # 54 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian). 55 | # 56 | # > Compute a first order oracle `f_grad`. Is it unique? 57 | # 58 | 59 | # In[3]: 60 | 61 | 62 | import numpy as np 63 | 64 | ##### Gradient oracle 65 | def f_grad(x): 66 | x1 = x[0] 67 | x2 = x[1] 68 | g = np.array( [ 0.0 , 0.0 ] ) 69 | if x1 < 3: 70 | g[0] = -1.0 71 | elif x1 > 3: 72 | g[0] = 1.0 73 | if x2 < 1: 74 | g[1] = -2.0 75 | elif x2 > 1: 76 | g[1] = 2.0 77 | return g 78 | ###### return g ### To complete 79 | #### 80 | 81 | 82 | # > What about a second order oracle? 83 | 84 | # In[ ]: 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /Lab4_Prox/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/1.png -------------------------------------------------------------------------------- /Lab4_Prox/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/2.png -------------------------------------------------------------------------------- /Lab4_Prox/Fig/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/3.png -------------------------------------------------------------------------------- /Lab4_Prox/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/4.png -------------------------------------------------------------------------------- /Lab4_Prox/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/5.png -------------------------------------------------------------------------------- /Lab4_Prox/Fig/UGA.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/UGA.png -------------------------------------------------------------------------------- /Lab4_Prox/Lab4_Proximal_algorithms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "

Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year

\n", 9 | "
\n", 10 | "

Numerical Optimization

\n", 11 | "

Lab 4: Proximal Algorithms

" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n", 19 | "---\n", 20 | "\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "%load_ext autoreload\n", 30 | "%autoreload 2" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "---\n", 38 | "\n", 39 | "# Composite minimization.\n", 40 | "\n", 41 | "In this lab, we will investigate optimization algorithms over composite functions composed of a smooth and a non-smooth part using the proximal gradient algorithm over a practical problem of machine learning: binary classification using logistic regression.
" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "We will consider the following function\n", 49 | " \n", 50 | "\\begin{align*}\n", 51 | "\\min_{x\\in\\mathbb{R}^d } F(x) := \\underbrace{ \\frac{1}{m} \\sum_{i=1}^m \\log( 1+\\exp(-b_i \\langle a_i,x \\rangle) ) + \\frac{\\lambda_2}{2} \\|x\\|_2^2}_{f(x)} + \\underbrace{\\lambda_1 \\|x\\|_1 }_{g(x)}.\n", 52 | "\\end{align*}\n", 53 | "\n", 54 | "for which we give:\n", 55 | "* the oracles for functions $f, g, F$;\n", 56 | "* the gradient oracle for $f$ and the Lipchitz constant of the gradient;\n", 57 | "* the size of the problem $n$;\n", 58 | "\n", 59 | "in `logistic_regression_student.py`. \n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "> Implement the proximal operation linked to $g(x) = \\lambda_1 \\|x\\|_1$ in `logistic_regression_student.py`. \n", 67 | "\n", 68 | "> Create a function coding the proximal gradient algorithm and test your algorithm below." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "import numpy as np\n", 78 | "import logistic_regression_student as pb\n", 79 | "\n", 80 | "\n", 81 | "##### proximal gradient algorithm\n", 82 | "#x,x_tab = pb.proximal_gradient_algorithm(pb.F , pb.f_grad , pb.g_prox , x0 , step , PREC, ITE_MAX)\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "> Investigate the decrease of the algorithm." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import matplotlib.pyplot as plt\n", 99 | "% matplotlib inline\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "> Plot the support of the vector $x_k$ (i.e. one point for every non-null coordinate of $x_k$) versus the iterations. \n", 107 | "\n", 108 | "> What do you notice? Was it expected?" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "---\n", 123 | "\n", 124 | "# Regularization path.\n", 125 | "\n", 126 | "\n", 127 | "We saw above that the algorithm *selected* some coordinates as the other get to zero. Considering our machine learning task, this translates into the algorithm selecting a subset of the features that will be used for the prediction step (see also the features signification at the end of the notebook). \n", 128 | "\n", 129 | "> Change the parameter $\\lambda_1$ of the problem (`pb.lam1`) in the code above and investigate how it influences the number of selected features.\n", 130 | "\n", 131 | "In order to quantify the influence of this feature selection, let us consider the *regularization path* that is the support of the final points obtained by our minimization method versus the value of $\\lambda_1$.\n", 132 | "\n", 133 | "> For $\\lambda_1 = 2^{-12},2^{-11}, .. , 2^{1}$, run the proximal gradient algorithm on the obtained problem and store the support of the final point, the prediction performance on the *training set* (`pb.prediction_train`) and on the *testing set* (`pb.prediction_test`)." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "import matplotlib.pyplot as plt\n", 143 | "% matplotlib inline\n", 144 | "\n", 145 | "import numpy as np\n", 146 | "import logistic_regression_student as pb\n", 147 | "\n", 148 | "\n", 149 | "### TODO" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "> Plot the *regularization path* and look at the feature signification (at the end of the notebook) to see which are the most important features of the dataset." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "scrolled": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "# TODO" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "> Plot the *training* and *testing* accuracies versus the value of $\\lambda_1$." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# TODO" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "# Features signification\n", 191 | "\n", 192 | "The dataset is comprised of $27$ features described below and the goal is to predict if the student may pass its year or not. It is thus of importance to investigate which features are the most significant for the student success. We will see how the $\\ell_1$ regularization can help to this goal." 193 | ] 194 | }, 195 | { 196 | "cell_type": "raw", 197 | "metadata": {}, 198 | "source": [ 199 | "1 sex - student's sex (binary: \"F\" - female or \"M\" - male)\n", 200 | "2 age - student's age (numeric: from 15 to 22)\n", 201 | "3 address - student's home address type (binary: \"U\" - urban or \"R\" - rural)\n", 202 | "4 famsize - family size (binary: \"LE3\" - less or equal to 3 or \"GT3\" - greater than 3)\n", 203 | "5 Pstatus - parent's cohabitation status (binary: \"T\" - living together or \"A\" - apart)\n", 204 | "6 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)\n", 205 | "7 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)\n", 206 | "8 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. 
to 1 hour, or 4 - >1 hour)\n", 207 | "9 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)\n", 208 | "10 failures - number of past class failures (numeric: n if 1<=n<3, else 4)\n", 209 | "11 schoolsup - extra educational support (binary: yes or no)\n", 210 | "12 famsup - family educational support (binary: yes or no)\n", 211 | "13 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)\n", 212 | "14 activities - extra-curricular activities (binary: yes or no)\n", 213 | "15 nursery - attended nursery school (binary: yes or no)\n", 214 | "16 higher - wants to take higher education (binary: yes or no)\n", 215 | "17 internet - Internet access at home (binary: yes or no)\n", 216 | "18 romantic - with a romantic relationship (binary: yes or no)\n", 217 | "19 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)\n", 218 | "20 freetime - free time after school (numeric: from 1 - very low to 5 - very high)\n", 219 | "21 goout - going out with friends (numeric: from 1 - very low to 5 - very high)\n", 220 | "22 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)\n", 221 | "23 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)\n", 222 | "24 health - current health status (numeric: from 1 - very bad to 5 - very good)\n", 223 | "25 absences - number of school absences (numeric: from 0 to 93)\n", 224 | "26 G1 - first period grade (numeric: from 0 to 20)\n", 225 | "27 G2 - second period grade (numeric: from 0 to 20)" 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.7.5" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /Lab4_Prox/logistic_regression_student.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Regularized Problem 5 | # 6 | # In this lab, we add an $\ell_1$ regularization to promote sparsity of the iterates. The function (below) is non-smooth but it has a smooth part, $f$, and a non-smooth part, $g$, that we will treat with proximal operations. 7 | # 8 | # \begin{align*} 9 | # \min_{x\in\mathbb{R}^d } F(x) := \underbrace{ \frac{1}{m} \sum_{i=1}^m \log( 1+\exp(-b_i \langle a_i,x \rangle) ) + \frac{\lambda_2}{2} \|x\|_2^2}_{f(x)} + \underbrace{\lambda_1 \|x\|_1 }_{g(x)}. 
10 | # \end{align*} 11 | 12 | # ### Function definition 13 | 14 | 15 | 16 | import numpy as np 17 | import csv 18 | from sklearn import preprocessing 19 | 20 | #### File reading 21 | dat_file = np.load('student.npz') 22 | A = dat_file['A_learn'] 23 | final_grades = dat_file['b_learn'] 24 | m = final_grades.size 25 | b = np.zeros(m) 26 | for i in range(m): 27 | if final_grades[i]>11: 28 | b[i] = 1.0 29 | else: 30 | b[i] = -1.0 31 | 32 | A_test = dat_file['A_test'] 33 | final_grades_test = dat_file['b_test'] 34 | m_test = final_grades_test.size 35 | b_test = np.zeros(m_test) 36 | for i in range(m_test): 37 | if final_grades_test[i]>11: 38 | b_test[i] = 1.0 39 | else: 40 | b_test[i] = -1.0 41 | 42 | 43 | 44 | 45 | d = 27 # features 46 | n = d+1 # with the intercept 47 | 48 | 49 | 50 | lam1 = 0.03 # for the 1-norm regularization best:0.03 51 | lam2 = 0.0 52 | 53 | 54 | L = 0.25*max(np.linalg.norm(A,2,axis=1))**2 + lam2 55 | 56 | # ## Oracles 57 | # 58 | # ### Related to function $f$ 59 | 60 | 61 | def f(x): 62 | l = 0.0 63 | for i in range(A.shape[0]): 64 | if b[i] > 0 : 65 | l += np.log( 1 + np.exp(-np.dot( A[i] , x ) ) ) 66 | else: 67 | l += np.log( 1 + np.exp(np.dot( A[i] , x ) ) ) 68 | return l/m + lam2/2.0*np.dot(x,x) 69 | 70 | def f_grad(x): 71 | g = np.zeros(n) 72 | for i in range(A.shape[0]): 73 | if b[i] > 0: 74 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) ) 75 | else: 76 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) ) 77 | return g/m + lam2*x 78 | 79 | 80 | # ### Related to function $g$ [TODO] 81 | 82 | 83 | def g(x): 84 | return lam1*np.linalg.norm(x,1) 85 | 86 | def g_prox(x,gamma): 87 | p = np.zeros(n) 88 | #TODO 89 | return p 90 | 91 | 92 | # ### Related to function $F$ 93 | 94 | 95 | 96 | def F(x): 97 | return f(x) + g(x) 98 | 99 | 100 | # ## Prediction Functions 101 | 102 | 103 | def prediction_train(w,PRINT=False): 104 | pred = np.zeros(A.shape[0]) 105 | perf = 0 106 | for i in range(A.shape[0]): 107 | p = 1.0/( 1 + np.exp(-np.dot( A[i] , w ) ) ) 108 | if p>0.5: 109 | pred[i] = 1.0 110 | if b[i]>0: 111 | correct = "True" 112 | perf += 1 113 | else: 114 | correct = "False" 115 | if PRINT: 116 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct)) 117 | else: 118 | pred[i] = -1.0 119 | if b[i]<0: 120 | correct = "True" 121 | perf += 1 122 | else: 123 | correct = "False" 124 | if PRINT: 125 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct)) 126 | return pred,float(perf)/A.shape[0] 127 | 128 | def prediction_test(w,PRINT=False): 129 | pred = np.zeros(A_test.shape[0]) 130 | perf = 0 131 | for i in range(A_test.shape[0]): 132 | p = 1.0/( 1 + np.exp(-np.dot( A_test[i] , w ) ) ) 133 | if p>0.5: 134 | pred[i] = 1.0 135 | if b_test[i]>0: 136 | correct = "True" 137 | perf += 1 138 | else: 139 | correct = "False" 140 | if PRINT: 141 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct)) 142 | else: 143 | pred[i] = -1.0 144 | if b_test[i]<0: 145 | correct = "True" 146 | perf += 1 147 | else: 148 | correct = "False" 149 | if PRINT: 150 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct)) 151 | return pred,float(perf)/A_test.shape[0] 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /Lab4_Prox/student.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/student.npz -------------------------------------------------------------------------------- /Lab4_Prox/student.txt: -------------------------------------------------------------------------------- 1 | # Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets: 2 | 1 sex - student's sex (binary: "F" - female or "M" - male) 3 | 2 age - student's age (numeric: from 15 to 22) 4 | 3 address - student's home address type (binary: "U" - urban or "R" - rural) 5 | 4 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3) 6 | 5 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart) 7 | 6 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) 8 | 7 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) 9 | 8 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) 10 | 9 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 11 | 10 failures - number of past class failures (numeric: n if 1<=n<3, else 4) 12 | 11 schoolsup - extra educational support (binary: yes or no) 13 | 12 famsup - family educational support (binary: yes or no) 14 | 13 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 15 | 14 activities - extra-curricular activities (binary: yes or no) 16 | 15 nursery - attended nursery school (binary: yes or no) 17 | 16 higher - wants to take higher education (binary: yes or no) 18 | 17 internet - Internet access at home (binary: yes or no) 19 | 18 romantic - with a romantic relationship (binary: yes or no) 20 | 19 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 21 | 20 freetime - free time after school (numeric: from 1 - very low to 5 - very high) 22 | 21 goout - going out with friends (numeric: from 1 - very low to 5 - very high) 23 | 22 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 24 | 23 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 25 | 24 health - current health status (numeric: from 1 - very bad to 5 - very good) 26 | 25 absences - number of school absences (numeric: from 0 to 93) 27 | 26 G1 - first period grade (numeric: from 0 to 20) 28 | 27 G2 - second period grade (numeric: from 0 to 20) 29 | 30 | 28 G3 - final grade (numeric: from 0 to 20, output target) 31 | 32 | Additional note: there are several (382) students that belong to both datasets . 33 | These students can be identified by searching for identical attributes 34 | that characterize each student, as shown in the annexed R file. 
35 | -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/1.png -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/2.png -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Fig/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/3.png -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/4.png -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/5.png -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/UGA.png -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/Lab5_OptimForML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "

Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year

\n", 9 | "
\n", 10 | "

Numerical Optimization

\n", 11 | "

Lab 5: Optimization for ML

" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "\n", 19 | "---\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "%load_ext autoreload\n", 29 | "%autoreload 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "---\n", 37 | "\n", 38 | "# Algorithms performance on practical problems\n", 39 | "\n", 40 | "In this lab, we will investigate how to evaluate and display performance of optimization algorithms over a practical problem of machine learning: binary classification using logistic regression.
" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Machine Learning as an Optimization problem\n", 48 | "\n", 49 | "We have some *data* $\\mathcal{D}$ consisting of $m$ *examples* $\\{d_i\\}$; each example consisting of a *feature* vector $a_i\\in\\mathbb{R}^d$ and an *observation* $b_i\\in \\mathcal{O}$: $\\mathcal{D} = \\{[a_i,b_i]\\}_{i=1..m}$. In this lab, we will consider the ionosphere dataset.\n", 50 | " \n", 51 | "The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors.\n", 52 | "\n", 53 | "A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\\langle a_i , x \\rangle$ is used to predict the value of the observation through a *predictor function* $g:\\mathbb{R}\\to \\mathcal{O}$: $g(\\langle a_i , x \\rangle)$ is the predicted value from $a_i$.\n", 54 | "\n", 55 | "In order to find such a parameter, we use the available data and a *loss* $\\ell$ that penalizes the error made between the predicted $g(\\langle a_i , x \\rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \\ell( g(\\langle a_i , x \\rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem\n", 56 | "\n", 57 | "$$ \\min_{x\\in\\mathbb{R}^d} \\frac{1}{m} \\sum_{i=1}^m f_i(x) = \\frac{1}{m} \\sum_{i=1}^m \\ell( g(\\langle a_i , x \\rangle) ; b_i ). $$" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Binary Classification with Logisitic Regression\n", 65 | "\n", 66 | "In our setup, the observations are binary: $\\mathcal{O} = \\{-1 , +1 \\}$, and the *Logistic loss* is used to form the following optimization problem\n", 67 | "\\begin{align*}\n", 68 | "\\min_{x\\in\\mathbb{R}^d } f(x) := \\frac{1}{m} \\sum_{i=1}^m \\log( 1+\\exp(-b_i \\langle a_i,x \\rangle) ) + \\frac{\\lambda_2}{2} \\|x\\|_2^2.\n", 69 | "\\end{align*}\n", 70 | "where the last term is added as a regularization (of type $\\ell_2$, aka Tikhnov) to prevent overfitting.\n", 71 | "\n", 72 | "Under some statistical hypotheses, $x^\\star = \\arg\\min f(x)$ maximizes the likelihood of the labels knowing the features vector. Then, for a new point $d$ with features vector $a$, \n", 73 | "$$ p_1(a) = \\mathbb{P}[d\\in \\text{ class } +1] = \\frac{1}{1+\\exp(-\\langle a;x^\\star \\rangle)} $$\n", 74 | "\n", 75 | "Thus, from $a$, if $p_1(a)$ is close to $1$, one can decide that $d$ belongs to class $1$; and the opposite decision if $p(a)$ is close to $0$. Between the two, the appreciation is left to the data scientist depending on the application.\n", 76 | "\n", 77 | "## Objective of the optimizer\n", 78 | " \n", 79 | "Given oracles for the function and its gradient, as well as an upper-bound of the Lipschitz constant $L$ of the gradient, find a minimizer of $f$.\n", 80 | " " 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "> You are given *all* oracles of $f$ (function, gradient, Hessian) in `logistic_regression_ionosphere.py` and several algorithms in `algoGradient.py`." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### Influence of strong convexity on the speed of the gradient method\n", 95 | "\n", 96 | "\n", 97 | "> Run the following blocks for different values of parameter `lam2` of the problem. 
What do you notice in terms of speed of convergence, what is the reason?" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from algoGradient import * # import all methods of the module into the current environment\n", 107 | "import numpy as np\n", 108 | "import logistic_regression_ionosphere as pb\n", 109 | "\n", 110 | "\n", 111 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 112 | "PREC = 1e-5 # Sought precision\n", 113 | "ITE_MAX = 5000 # Max number of iterations\n", 114 | "x0 = np.zeros(pb.n) # Initial point\n", 115 | "step = 1.0/pb.L\n", 116 | "\n", 117 | "pb.lam2 = 0.1\n", 118 | "\n", 119 | "##### gradient algorithm\n", 120 | "x,x_tab = gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n", 121 | "\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "import matplotlib.pyplot as plt\n", 131 | "% matplotlib inline\n", 132 | "\n", 133 | "F = []\n", 134 | "for i in range(x_tab.shape[0]):\n", 135 | " F.append( pb.f(x_tab[i])) \n", 136 | "\n", 137 | "\n", 138 | "plt.figure()\n", 139 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n", 140 | "plt.grid(True)\n", 141 | "plt.legend()\n", 142 | "plt.show()\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Accelerating poorly conditioned problems\n", 150 | "\n", 151 | "While the addition of strong convexity accelerates the rate in practice, it usually result shift the solutions of the original problem. For a learning problem, it affects the accuracy.\n", 152 | "\n", 153 | "In order to get faster convergences when the rate is slower, several acceleration techniques exist. We are going to present the most common in the following.\n", 154 | "\n", 155 | "### Nesterov's fast gradient\n", 156 | "\n", 157 | "In a series of papers published in the 80's, Yu. Nesterov proposed an acceleration technique in order to make the worst case rate of the gradient algorithm from $\\mathcal{O}(1/k)$ to $\\mathcal{O}(1/k^2)$. This technique is now immensely popular, notably in the machine learning and image processing communities.\n", 158 | " \n", 159 | "\n", 160 | "The iterations of Nesterov's accelerated gradient are as such:\n", 161 | "$$ \\left\\{ \\begin{array}{ll} x_{k+1} = y_k - \\gamma \\nabla f(y_k) \\\\ y_{k+1} = x_{k+1} + \\alpha_{k+1} (x_{k+1} - x_k ) \\end{array} \\right. $$\n", 162 | "with \n", 163 | "$$ \\alpha_{k+1} = \\frac{\\lambda_k -1 }{\\lambda_{k+1}} \\text{ with } \\lambda_0 = 0 \\text{ and } \\lambda_{k+1} = \\frac{1+\\sqrt{1+4\\lambda_k^2}}{2} . $$\n", 164 | " \n", 165 | "Although no clear intuition can be drawn, the extended point can be seen as an extension by inertia of the last points." 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "> Implement Nesterov's fast gradient algorithm in `algoGradient.py`.\n", 173 | "\n", 174 | "> Run the constant stepsize and fast gradient algorithms and compare the convergence rates (for lam2 = 0.001)." 
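# A hedged sketch of the iteration written above (the name and signature merely mirror the
# fast_gradient_algorithm calls used in this notebook; the actual implementation belongs in
# algoGradient.py):

import numpy as np

def fast_gradient_sketch(f, f_grad, x0, step, PREC, ITE_MAX):
    # f is kept only to mirror the call signature used in the notebook cells.
    x = np.copy(x0)
    y = np.copy(x0)
    lam = 0.0                                          # lambda_0 = 0
    x_tab = np.copy(x)
    for k in range(ITE_MAX):
        x_new = y - step*f_grad(y)                     # gradient step from the extrapolated point
        lam_new = (1.0 + np.sqrt(1.0 + 4.0*lam**2)) / 2.0
        y = x_new + ((lam - 1.0)/lam_new)*(x_new - x)  # inertial extrapolation
        x, lam = x_new, lam_new
        x_tab = np.vstack((x_tab, x))
        if np.linalg.norm(f_grad(x)) < PREC:
            break
    return x, x_tab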
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "from algoGradient import * # import all methods of the module into the current environment\n", 184 | "\n", 185 | "import numpy as np\n", 186 | "import logistic_regression_ionosphere as pb\n", 187 | "\n", 188 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 189 | "PREC = 1e-5 # Sought precision\n", 190 | "ITE_MAX = 5000 # Max number of iterations\n", 191 | "x0 = np.zeros(pb.n) # Initial point\n", 192 | "step = 1.0/pb.L\n", 193 | "\n", 194 | "pb.lam2 = 0.001\n", 195 | "\n", 196 | "##### gradient algorithm\n", 197 | "x,x_tab = gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n", 198 | "\n", 199 | "##### fast gradient algorithm\n", 200 | "xF,xF_tab = fast_gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "import matplotlib.pyplot as plt\n", 210 | "% matplotlib inline\n", 211 | "\n", 212 | "F = []\n", 213 | "G = []\n", 214 | "for i in range(x_tab.shape[0]):\n", 215 | " F.append( pb.f(x_tab[i])) \n", 216 | " G.append( np.linalg.norm(pb.f_grad(x_tab[i] )) )\n", 217 | "\n", 218 | "FF = []\n", 219 | "GF = []\n", 220 | "for i in range(xF_tab.shape[0]):\n", 221 | " FF.append( pb.f(xF_tab[i])) \n", 222 | " GF.append( np.linalg.norm(pb.f_grad(xF_tab[i] )) )\n", 223 | "\n", 224 | "plt.figure()\n", 225 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n", 226 | "plt.plot( FF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n", 227 | "plt.grid(True)\n", 228 | "plt.legend()\n", 229 | "plt.show()\n", 230 | "\n", 231 | "\n", 232 | "plt.figure()\n", 233 | "plt.plot( G, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n", 234 | "plt.plot( GF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n", 235 | "plt.yscale('log')\n", 236 | "plt.xscale('log')\n", 237 | "plt.grid(True)\n", 238 | "plt.legend()\n", 239 | "plt.show()\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "---\n", 247 | "\n", 248 | "\n", 249 | "### Other methods: line-search, BFGS\n", 250 | "\n", 251 | "\n", 252 | "Other popular methods to accelerate convergence are:\n", 253 | "* line-search (as seen quickly in the previous lab, it is implemented in 1.c of file `algoGradient.py` )\n", 254 | "* BFGS which is a Quasi-Newton method in the sense that it approximates second order information in an online setting. \n", 255 | "\n", 256 | "**BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consist in performing the following iteration\n", 257 | "$$ x_{k+1}=x_k - \\gamma_k W_k \\nabla f(x_k)$$\n", 258 | "where $\\gamma_k$ is given by Wolfe's line-search and positive definite matrix $W_k$ is computed as\n", 259 | "$$ W_{k+1}=W_k - \\frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\\left[1+\\frac{y_k^T W_k y_k}{y_k^T s_k}\\right]\\frac{s_k s_k^T}{y_k^T s_k} $$\n", 260 | "with $s_k=x_{k+1}-x_{k}$ and $y_k=\\nabla f(x_{k+1}) - \\nabla f(x_{k})$." 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "> Implement BFGS in Section 3 of file `algoGradient.py` .\n", 268 | "\n", 269 | "> Compare the performance of the previously investigated algorithms. 
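# A hedged sketch of the BFGS inverse-Hessian update written above (illustrative name; the
# full bfgs routine in algoGradient.py also needs the Wolfe line search mentioned earlier):

import numpy as np

def bfgs_update_sketch(W, s, y):
    """W_k -> W_{k+1} given s_k = x_{k+1} - x_k and y_k = grad f(x_{k+1}) - grad f(x_k)."""
    sy = float(np.dot(y, s))        # y_k^T s_k (assumed > 0, as guaranteed by the Wolfe conditions)
    Wy = W.dot(y)
    return (W
            - (np.outer(s, Wy) + np.outer(Wy, s)) / sy      # uses W symmetric: s y^T W = s (W y)^T
            + (1.0 + np.dot(y, Wy)/sy) * np.outer(s, s) / sy)

# The descent direction at x_k is then d_k = -W_k @ f_grad(x_k), with gamma_k given by the line search.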
*(Note that you can also test the performance of Newton's method although it is a bit unfair compared to the other algorithms as the variable size is small)*" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "from algoGradient import * # import all methods of the module into the current environment\n", 279 | "\n", 280 | "import numpy as np\n", 281 | "import logistic_regression_ionosphere as pb\n", 282 | "\n", 283 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 284 | "PREC = 1e-5 # Sought precision\n", 285 | "ITE_MAX = 500 # Max number of iterations\n", 286 | "x0 = np.zeros(pb.n) # Initial point\n", 287 | "step = 1.0/pb.L\n", 288 | "\n", 289 | "##### gradient algorithm\n", 290 | "x,x_tab = gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n", 291 | "\n", 292 | "##### fast gradient algorithm\n", 293 | "xF,xF_tab = fast_gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n", 294 | "\n", 295 | "##### Wolfe line-search algorithm\n", 296 | "xW,xW_tab = gradient_Wolfe(pb.f , pb.f_grad , x0 , PREC , ITE_MAX )\n", 297 | "\n", 298 | "##### BFGS algorithm\n", 299 | "xB,xB_tab = bfgs(pb.f , pb.f_grad , x0 , PREC , ITE_MAX )\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "import matplotlib.pyplot as plt\n", 309 | "% matplotlib inline\n", 310 | "\n", 311 | "F = []\n", 312 | "G = []\n", 313 | "for i in range(x_tab.shape[0]):\n", 314 | " F.append( pb.f(x_tab[i])) \n", 315 | " G.append( np.linalg.norm(pb.f_grad(x_tab[i] )) )\n", 316 | "\n", 317 | "FF = []\n", 318 | "GF = []\n", 319 | "for i in range(xF_tab.shape[0]):\n", 320 | " FF.append( pb.f(xF_tab[i])) \n", 321 | " GF.append( np.linalg.norm(pb.f_grad(xF_tab[i] )) )\n", 322 | " \n", 323 | "FW = []\n", 324 | "GW = []\n", 325 | "for i in range(xW_tab.shape[0]):\n", 326 | " FW.append( pb.f(xW_tab[i])) \n", 327 | " GW.append( np.linalg.norm(pb.f_grad(xW_tab[i] )) )\n", 328 | " \n", 329 | " \n", 330 | "FB = []\n", 331 | "GB = []\n", 332 | "for i in range(xB_tab.shape[0]):\n", 333 | " FB.append( pb.f(xB_tab[i])) \n", 334 | " GB.append( np.linalg.norm(pb.f_grad(xB_tab[i] )) )\n", 335 | "\n", 336 | "plt.figure()\n", 337 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n", 338 | "plt.plot( FF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n", 339 | "plt.plot( FW, color=\"magenta\", linewidth=1.0, linestyle=\"-\",label='Wolfe')\n", 340 | "plt.plot( FB, color=\"green\", linewidth=1.0, linestyle=\"-\",label='BFGS')\n", 341 | "plt.grid(True)\n", 342 | "plt.legend()\n", 343 | "plt.show()\n", 344 | "\n", 345 | "\n", 346 | "plt.figure()\n", 347 | "plt.plot( G, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n", 348 | "plt.plot( GF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n", 349 | "plt.plot( GW, color=\"magenta\", linewidth=1.0, linestyle=\"-\",label='Wolfe')\n", 350 | "plt.plot( GB, color=\"green\", linewidth=1.0, linestyle=\"-\",label='BFGS')\n", 351 | "plt.yscale('log')\n", 352 | "plt.xscale('log')\n", 353 | "plt.grid(True)\n", 354 | "plt.legend()\n", 355 | "plt.show()\n" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "---\n", 363 | "\n", 364 | "# Performance on learning problems\n", 365 | "\n", 366 | "### Prediction power\n", 367 | "\n", 368 | "\n", 
369 | "\n", 370 | "Our problem of interest is binary classification using logistic regression.
\n", 371 | "Although this is a machine learning task, the predictor construction amounts to minimizing a smooth convex optimization function $f$ called the *loss*, the final minimizer is called a *predictor* and its scalar product with the data vector gives a probability of belonging to class $1$.\n", 372 | "\n", 373 | "The previous test was based on the functional decrease whereas our task is binary classification. Let us look at the final accuracies obtained.\n", 374 | "\n", 375 | "> The file `logistic_regression.py` contains a `prediction` function that takes a *predictor* and resturn the accuracy of the predictor. Take a look at how the function is defined.\n", 376 | "\n", 377 | "> Observe the accuracy of all final points obtained before. What do you notice? " 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "pred,perf = pb.prediction(x,PRINT=False)\n", 387 | "print(\"Gradient algorithm: \\t{:.2f}%\".format(perf*100))\n", 388 | "\n", 389 | "predF,perfF = pb.prediction(xF,PRINT=False)\n", 390 | "print(\"Fast Gradient: \\t\\t{:.2f}%\".format(perfF*100))\n", 391 | "\n", 392 | "predW,perfW = pb.prediction(xW,PRINT=False)\n", 393 | "print(\"Wolfe: \\t\\t\\t{:.2f}%\".format(perfW*100))\n", 394 | "\n", 395 | "predB,perfB = pb.prediction(xB,PRINT=False)\n", 396 | "print(\"BFGS: \\t\\t\\t{:.2f}%\".format(perfB*100))" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "scrolled": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "predF,perfF = pb.prediction(xF,PRINT=True)" 408 | ] 409 | } 410 | ], 411 | "metadata": { 412 | "kernelspec": { 413 | "display_name": "Python 3", 414 | "language": "python", 415 | "name": "python3" 416 | }, 417 | "language_info": { 418 | "codemirror_mode": { 419 | "name": "ipython", 420 | "version": 3 421 | }, 422 | "file_extension": ".py", 423 | "mimetype": "text/x-python", 424 | "name": "python", 425 | "nbconvert_exporter": "python", 426 | "pygments_lexer": "ipython3", 427 | "version": "3.7.5" 428 | } 429 | }, 430 | "nbformat": 4, 431 | "nbformat_minor": 1 432 | } 433 | -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/algoGradient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Gradient-based algorithms 5 | # 6 | # In this notebook, we code our gradient-based optimization algorithms. 7 | 8 | # # 1. Gradient algorithms 9 | # 10 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 11 | # * the function to minimize `f` 12 | # * a 1st order oracle `f_grad` (see `problem1.ipynb` for instance) 13 | # * an initialization point `x0` 14 | # * the sought precision `PREC` 15 | # * a maximal number of iterations `ITE_MAX` 16 | # 17 | # 18 | # these algorithms perform iterations of the form 19 | # $$ x_{k+1} = x_k - \gamma_k \nabla f(x_k) $$ 20 | # where $\gamma_k$ is a stepsize to choose. 21 | 22 | # ### 1.a. 
Constant stepsize gradient algorithm 23 | # 24 | 25 | 26 | import numpy as np 27 | import timeit 28 | 29 | def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 30 | x = np.copy(x0) 31 | stop = PREC*np.linalg.norm(f_grad(x0) ) 32 | 33 | x_tab = np.copy(x) 34 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 35 | t_s = timeit.default_timer() 36 | for k in range(ITE_MAX): 37 | g = f_grad(x) 38 | x = x - step*g 39 | 40 | x_tab = np.vstack((x_tab,x)) 41 | 42 | if np.linalg.norm(g) < stop: 43 | break 44 | t_e = timeit.default_timer() 45 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x))) 46 | return x,x_tab 47 | 48 | 49 | # ### 1.b. Adaptive stepsize gradient algorithm 50 | # 51 | 52 | import numpy as np 53 | import timeit 54 | 55 | 56 | def gradient_adaptive_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 57 | x = np.copy(x0) 58 | stop = PREC*np.linalg.norm(f_grad(x0) ) 59 | 60 | x_tab = np.copy(x) 61 | print("------------------------------------\nAdaptative Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 62 | t_s = timeit.default_timer() 63 | for k in range(ITE_MAX): 64 | 65 | g = f_grad(x) 66 | x_prev = np.copy(x) 67 | 68 | x = x - step*g ####### ITERATION 69 | 70 | if f(x)>f(x_prev): 71 | x = np.copy(x_prev) 72 | step = step/2 73 | print("stepsize: = {:0}".format(step)) 74 | 75 | x_tab = np.vstack((x_tab,x)) 76 | 77 | if np.linalg.norm(g) < stop: 78 | break 79 | t_e = timeit.default_timer() 80 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x))) 81 | return x,x_tab 82 | 83 | 84 | # ### 1.c. Wolfe Line search 85 | # 86 | # 87 | 88 | 89 | 90 | import numpy as np 91 | import timeit 92 | from scipy.optimize import line_search 93 | 94 | def gradient_Wolfe(f , f_grad , x0 , PREC , ITE_MAX ): 95 | x = np.copy(x0) 96 | stop = PREC*np.linalg.norm(f_grad(x0) ) 97 | 98 | x_tab = np.copy(x) 99 | print("------------------------------------\n Gradient with Wolfe line search\n------------------------------------\nSTART") 100 | t_s = timeit.default_timer() 101 | for k in range(ITE_MAX): 102 | g = f_grad(x) 103 | 104 | res = line_search(f, f_grad, x, -g, gfk=None, old_fval=None, old_old_fval=None, args=(), c1=0.0001, c2=0.9, amax=50) 105 | 106 | x = x - res[0]*g 107 | 108 | x_tab = np.vstack((x_tab,x)) 109 | 110 | if np.linalg.norm(g) < stop: 111 | break 112 | t_e = timeit.default_timer() 113 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x))) 114 | return x,x_tab 115 | 116 | 117 | # ### 1.d. Nesterov's Fast gradient algorithm 118 | # 119 | # In a series of papers published in the 80's, Yu. Nesterov proposed an acceleration technique in order to make the worst case rate of the gradient algorithm from $\mathcal{O}(1/k)$ to $\mathcal{O}(1/k^2)$. This technique is now immensely popular, notably in the machine learning and image processing communities. 120 | # 121 | # The iterations of Nesterov's accelerated gradient are as such: 122 | # $$ \left\{ \begin{array}{ll} x_{k+1} = y_k - \gamma \nabla f(y_k) \\ y_{k+1} = x_{k+1} + \alpha_{k+1} (x_{k+1} - x_k ) \end{array} \right. $$ 123 | # with 124 | # $$ \alpha_{k+1} = \frac{\lambda_k -1 }{\lambda_{k+1}} \text{ with } \lambda_0 = 0 \text{ and } \lambda_{k+1} = \frac{1+\sqrt{1+4\lambda_k^2}}{2} . 
$$ 125 | # 126 | # Although no clear intuition can be drawn, the extended point can be seen as an extension by inertia of the last points. 127 | # 128 | # 129 | # Q. Fill the function below accordingly. 130 | 131 | 132 | 133 | import numpy as np 134 | import timeit 135 | 136 | def fast_gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ): 137 | x = np.copy(x0) 138 | y = np.copy(x0) 139 | stop = PREC*np.linalg.norm(f_grad(x0) ) 140 | 141 | 142 | 143 | x_tab = np.copy(x) 144 | print("------------------------------------\n Fast gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 145 | t_s = timeit.default_timer() 146 | for k in range(ITE_MAX): 147 | g = f_grad(x) 148 | # TO FILL 149 | 150 | x_tab = np.vstack((x_tab,x)) 151 | 152 | if np.linalg.norm(g) < stop: 153 | break 154 | t_e = timeit.default_timer() 155 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x))) 156 | return x,x_tab 157 | 158 | 159 | # # 2. Second Order algorithms 160 | # 161 | # For minimizing a *twice* differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given: 162 | # * the function to minimize `f` 163 | # * a 2nd order oracle `f_grad_hessian` (see `problem1.ipynb` for instance) 164 | # * an initialization point `x0` 165 | # * the sought precision `PREC` 166 | # * a maximal number of iterations `ITE_MAX` 167 | # 168 | # 169 | # these algorithms perform iterations of the form 170 | # $$ x_{k+1} = x_k - [\nabla^2 f(x_k) ]^{-1} \nabla f(x_k) .$$ 171 | 172 | 173 | 174 | import numpy as np 175 | import timeit 176 | 177 | def newton_algorithm(f , f_grad_hessian , x0 , PREC , ITE_MAX ): 178 | x = np.copy(x0) 179 | g0,H0 = f_grad_hessian(x0) 180 | stop = PREC*np.linalg.norm(g0 ) 181 | 182 | x_tab = np.copy(x) 183 | print("------------------------------------\nNewton's algorithm\n------------------------------------\nSTART") 184 | t_s = timeit.default_timer() 185 | for k in range(ITE_MAX): 186 | 187 | g,H = f_grad_hessian(x) 188 | x = x - np.linalg.solve(H,g) 189 | 190 | x_tab = np.vstack((x_tab,x)) 191 | 192 | if np.linalg.norm(g) < stop: 193 | break 194 | t_e = timeit.default_timer() 195 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x))) 196 | return x,x_tab 197 | 198 | 199 | # # 3. Quasi Newton algorithms 200 | # 201 | # **BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consist in performing the following iteration 202 | # $$ x_{k+1}=x_k - \gamma_k W_k \nabla f(x_k)$$ 203 | # where $\gamma_k$ is given by Wolfe's line-search and positive definite matrix $W_k$ is computed as 204 | # $$ W_{k+1}=W_k - \frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\left[1+\frac{y_k^T W_k y_k}{y_k^T s_k}\right]\frac{s_k s_k^T}{y_k^T s_k} $$ 205 | # with $s_k=x_{k+1}-x_{k}$ and $y_k=\nabla f(x_{k+1}) - \nabla f(x_{k})$. 206 | 207 | # Q. 
Implement BFGS 208 | 209 | import numpy as np 210 | import timeit 211 | from scipy.optimize import line_search 212 | 213 | def bfgs(f , f_grad , x0 , PREC , ITE_MAX ): 214 | x = np.copy(x0) 215 | n = x0.size 216 | g = f_grad(x0) 217 | sim_eval = 1 218 | stop = PREC*np.linalg.norm( g ) 219 | 220 | W = np.eye(n) 221 | 222 | x_tab = np.copy(x) 223 | print("------------------------------------\n BFGS\n------------------------------------\nSTART") 224 | t_s = timeit.default_timer() 225 | for k in range(ITE_MAX): 226 | 227 | x = x # To fill 228 | 229 | x_tab = np.vstack((x_tab,x)) 230 | 231 | t_e = timeit.default_timer() 232 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x))) 233 | return x,x_tab 234 | 235 | -------------------------------------------------------------------------------- /Lab5_MachineLearningExample/logistic_regression_ionosphere.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Logistic Regression Problem 5 | # 6 | # 7 | # 8 | # ### Machine Learning as an Optimization problem 9 | # 10 | # We have some *data* $\mathcal{D}$ consisting of $m$ *examples* $\{d_i\}$; each example consisting of a *feature* vector $a_i\in\mathbb{R}^d$ and an *observation* $b_i\in \mathcal{O}$: $\mathcal{D} = \{[a_i,b_i]\}_{i=1..m}$. In this lab, we will consider the ionosphere dataset. 11 | # 12 | # 13 | # The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors. 14 | # 15 | # 16 | # A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\langle a_i , x \rangle$ is used to predict the value of the observation through a *predictor function* $g:\mathbb{R}\to \mathcal{O}$: $g(\langle a_i , x \rangle)$ is the predicted value from $a_i$. 17 | # 18 | # 19 | # In order to find such a parameter, we use the available data and a *loss* $\ell$ that penalizes the error made between the predicted $g(\langle a_i , x \rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \ell( g(\langle a_i , x \rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem 20 | # $$ \min_{x\in\mathbb{R}^d} \frac{1}{m} \sum_{i=1}^m f_i(x) = \frac{1}{m} \sum_{i=1}^m \ell( g(\langle a_i , x \rangle) ; b_i ). $$ 21 | # 22 | # 23 | # ### Binary Classification with Logisitic Regression 24 | # 25 | # In our setup, the observations are binary: $\mathcal{O} = \{-1 , +1 \}$, and the *Logistic loss* is used to form the following optimization problem 26 | # \begin{align*} 27 | # \min_{x\in\mathbb{R}^d } f(x) := \frac{1}{m} \sum_{i=1}^m \log( 1+\exp(-b_i \langle a_i,x \rangle) ) + \frac{\lam2bda}{2} \|x\|_2^2. 28 | # \end{align*} 29 | # where the last term is added as a regularization (of type $\ell_2$, aka Tikhnov) to prevent overfitting. 30 | # 31 | # Under some statistical hypotheses, $x^\star = \arg\min f(x)$ maximizes the likelihood of the labels knowing the features vector. Then, for a new point $d$ with features vector $a$, 32 | # $$ p_1(a) = \mathbb{P}[d\in \text{ class } +1] = \frac{1}{1+\exp(-\langle a;x^\star \rangle)} $$ 33 | # Thus, from $a$, if $p_1(a)$ is close to $1$, one can decide that $d$ belongs to class $1$; and the opposite decision if $p(a)$ is close to $0$. 
Between the two, the appreciation is left to the data scientist depending on the application. 34 | # 35 | # 36 | # # Objective of the optimizer 37 | # 38 | # Given oracles for the function and its gradient, as well as an upper-bound of the Lipschitz constant $L$ of the gradient, find a minimizer of $f$. 39 | # 40 | 41 | # ### Function definition 42 | 43 | 44 | 45 | import numpy as np 46 | import csv 47 | from sklearn import preprocessing 48 | 49 | file = open('ionosphere.data') 50 | 51 | d = 34 52 | n = d+1 # Variable size + intercept 53 | 54 | m = 351 # Number of examples 55 | 56 | lam2 = 0.001 # regularization best:0.001 57 | 58 | A = np.zeros((m,d)) 59 | b = np.zeros(m) 60 | 61 | reader = csv.reader(file, delimiter=',') 62 | i = 0 63 | for row in reader: 64 | A[i] = np.array(row[:d]) 65 | if row[d] == 'b': 66 | b[i] = -1.0 67 | else: 68 | b[i] = 1.0 69 | i+=1 70 | 71 | scaler = preprocessing.StandardScaler().fit(A) 72 | A = scaler.transform(A) 73 | 74 | # Adding an intercept 75 | A_inter = np.ones((m,n)) 76 | A_inter[:,:-1] = A 77 | A = A_inter 78 | 79 | 80 | L = 0.25*max(np.linalg.norm(A,2,axis=1))**2 + lam2 81 | 82 | 83 | # ## Oracles 84 | 85 | 86 | 87 | 88 | def f(x): 89 | l = 0.0 90 | for i in range(A.shape[0]): 91 | if b[i] > 0 : 92 | l += np.log( 1 + np.exp(-np.dot( A[i] , x ) ) ) 93 | else: 94 | l += np.log( 1 + np.exp(np.dot( A[i] , x ) ) ) 95 | return l/m + lam2/2.0*np.dot(x,x) 96 | 97 | def f_grad(x): 98 | g = np.zeros(n) 99 | for i in range(A.shape[0]): 100 | if b[i] > 0: 101 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) ) 102 | else: 103 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) ) 104 | return g/m + lam2*x 105 | 106 | def f_grad_hessian(x): 107 | g = np.zeros(n) 108 | H = np.zeros((n,n)) 109 | for i in range(A.shape[0]): 110 | if b[i] > 0: 111 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) ) 112 | H += (np.exp(np.dot( A[i] , x ))/( 1 + np.exp(np.dot( A[i] , x ) ) )**2)*np.outer(A[i],A[i]) 113 | else: 114 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) ) 115 | H += (np.exp(-np.dot( A[i] , x ))/( 1 + np.exp(-np.dot( A[i] , x ) ) )**2)*np.outer(A[i],A[i]) 116 | g = g/m + lam2*x 117 | H = H/m + lam2*np.eye(n) 118 | return g,H 119 | 120 | 121 | # ## Prediction Function 122 | 123 | 124 | 125 | def prediction(w,PRINT=False): 126 | pred = np.zeros(A.shape[0]) 127 | perf = 0 128 | for i in range(A.shape[0]): 129 | p = 1.0/( 1 + np.exp(-np.dot( A[i] , w ) ) ) 130 | if p>0.5: 131 | pred[i] = 1.0 132 | if b[i]>0: 133 | correct = "True" 134 | perf += 1 135 | else: 136 | correct = "False" 137 | if PRINT: 138 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct)) 139 | else: 140 | pred[i] = -1.0 141 | if b[i]<0: 142 | correct = "True" 143 | perf += 1 144 | else: 145 | correct = "False" 146 | if PRINT: 147 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct)) 148 | return pred,float(perf)/A.shape[0] 149 | 150 | 151 | -------------------------------------------------------------------------------- /Lab6_LPQP/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/1.png -------------------------------------------------------------------------------- /Lab6_LPQP/Fig/2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/2.png -------------------------------------------------------------------------------- /Lab6_LPQP/Fig/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/3.png -------------------------------------------------------------------------------- /Lab6_LPQP/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/4.png -------------------------------------------------------------------------------- /Lab6_LPQP/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/5.png -------------------------------------------------------------------------------- /Lab6_LPQP/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/UGA.png -------------------------------------------------------------------------------- /Lab6_LPQP/toy_problem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # a Toy problem 5 | 6 | # We consider the first illustrative example of the original paper 7 | # 8 | # Candes, E., Tao, T. "The Dantzig selector: Statistical estimation when p is much larger than n". 9 | # The Annals of Statistics, 2007 10 | 11 | # In this first example, the design matrix $X$ has $m = 72$ rows and $n = 256$ columns, with independent Gaussian entries (and then normalized so that the columns have unit-norm). We then select $\theta$ with $S := |\{i : \theta_i = 0\}| = 8$, and form $y = X\theta + \xi$, where the $\xi_i$’s are i.i.d. $\mathcal{N}(0, \sigma^2 )$. The noise level is adjusted so that 12 | # $$ \sigma = \frac{1}{3} \sqrt{\frac{S}{n}} .$$ 13 | 14 | # ### Problem 15 | 16 | 17 | import numpy as np 18 | 19 | # Parameters 20 | m = 72 21 | n = 256 22 | 23 | S = 8 24 | 25 | sigma = 1/3.0 * np.sqrt(S/float(m)) 26 | 27 | # X creation 28 | X = np.random.randn(m, n) 29 | 30 | n_col = np.linalg.norm(X, axis=0) 31 | X = np.dot(X,np.diag(1/n_col)) # Normalization per column [Get rid of it for the "To go further" part!] 
32 | 33 | # theta creation 34 | theta = np.zeros(n) 35 | non_null = np.random.choice(n, S) 36 | theta[non_null] = np.random.randn(S) 37 | 38 | 39 | # y creation 40 | y = np.dot(X,theta) + sigma*np.random.randn(m) 41 | 42 | -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/1.png -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/2.png -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Fig/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/3.png -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/4.png -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/5.png -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/UGA.png -------------------------------------------------------------------------------- /Lab7_StochasticMethods/Lab7_StochMethods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year\n", 9 | "\n", 10 | "Numerical Optimization\n", 11 | "Lab 7: Variance-Reduced Stochastic Gradient
" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "---" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "%load_ext autoreload\n", 28 | "%autoreload 2" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Logistic Regression Problem\n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | "### Machine Learning as an Optimization problem\n", 40 | " \n", 41 | "We have some *data* $\\mathcal{D}$ consisting of $m$ *examples* $\\{d_i\\}$; each example consisting of a *feature* vector $a_i\\in\\mathbb{R}^d$ and an *observation* $b_i\\in \\mathcal{O}$: $\\mathcal{D} = \\{[a_i,b_i]\\}_{i=1..m}$. In this lab, we will consider the student performance dataset.\n", 42 | " \n", 43 | " \n", 44 | "The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors.\n", 45 | " \n", 46 | " \n", 47 | " A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\\langle a_i , x \\rangle$ is used to predict the value of the observation through a *predictor function* $g:\\mathbb{R}\\to \\mathcal{O}$: $g(\\langle a_i , x \\rangle)$ is the predicted value from $a_i$.\n", 48 | " \n", 49 | " \n", 50 | " In order to find such a parameter, we use the available data and a *loss* $\\ell$ that penalizes the error made between the predicted $g(\\langle a_i , x \\rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \\ell( g(\\langle a_i , x \\rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem\n", 51 | " $$ \\min_{x\\in\\mathbb{R}^d} \\frac{1}{m} \\sum_{i=1}^m f_i(x) = \\frac{1}{m} \\sum_{i=1}^m \\ell( g(\\langle a_i , x \\rangle) ; b_i ). $$\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | "### Regularized Problem \n", 56 | " \n", 57 | "In this lab, we will consider an $\\ell_1$ regularization to promote sparsity of the iterates. A sparse final solution would select the most important features. 
The new function (below) is non-smooth but it has a smooth part, $f$; and a non-smooth part, $g$, that we will treat with proximal operations.\n", 58 | " \n", 59 | " \\begin{align*}\n", 60 | " \\min_{x\\in\\mathbb{R}^d } F(x) := \\underbrace{\\frac{1}{m} \\sum_{i=1}^m \\overbrace{ \\log( 1+\\exp(-b_i \\langle a_i,x \\rangle) ) + \\frac{\\lambda_2}{2} \\|x\\|_2^2 }^{f_i(x)} }_{f(x)} + \\underbrace{\\lambda_1 \\|x\\|_1 }_{g(x)}.\n", 61 | " \\end{align*}\n", 62 | " \n", 63 | " \n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "\n", 71 | "\n", 72 | "# Recall of the proximal gradient algorithm\n", 73 | "\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from algoProx import * # import all methods of the module into the current environment\n", 83 | "import numpy as np\n", 84 | "import logistic_regression_student as pb\n", 85 | "\n", 86 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n", 87 | "PREC = 1e-5 # Sought precision\n", 88 | "ITE_MAX = 1000 # Max number of iterations\n", 89 | "x0 = np.zeros(pb.n) # Initial point\n", 90 | "step = 1.0/pb.L\n", 91 | "\n", 92 | "##### gradient algorithm\n", 93 | "x,x_tab = proximal_gradient_algorithm(pb.F , pb.f_grad , pb.g_prox , x0 , step , PREC, ITE_MAX , True)\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Decrease of the algorithm." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import matplotlib.pyplot as plt\n", 110 | "% matplotlib inline\n", 111 | "\n", 112 | "F = []\n", 113 | "for i in range(x_tab.shape[0]):\n", 114 | " F.append( pb.F(x_tab[i])) \n", 115 | "\n", 116 | "plt.figure()\n", 117 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\")\n", 118 | "plt.grid(True)\n", 119 | "plt.show()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Support of the vector $x_k$ \n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "plt.figure()\n", 136 | "\n", 137 | "for i in np.arange(0,x_tab.shape[0],int(x_tab.shape[0]/40)):\n", 138 | " for j in range(pb.n):\n", 139 | " if np.abs(x_tab[i,j])>1e-14:\n", 140 | " plt.plot( i , j , 'ko')\n", 141 | "\n", 142 | "plt.grid(True)\n", 143 | "plt.ylabel('Non-null Coordinates')\n", 144 | "plt.xlabel('Nb. 
Iterations')\n", 145 | "plt.ylim(-1,pb.d+1)\n", 146 | "plt.yticks(np.arange(0,pb.d+1))\n", 147 | "plt.show()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "---\n", 155 | "\n", 156 | "# Stochastic gradient \n", 157 | "\n", 158 | "\n", 159 | "In the following, instead of considering $f$ as a whole, we will use its structure \n", 160 | "$$ f(x) := \\frac{1}{m}\\sum_{i=1}^m f_i(x)$$\n", 161 | "\n", 162 | "> Implement the gradient related to $f_i$, related to one example, in `logistic_regression_student.py`\n", 163 | "\n", 164 | "With this structure a popular minimization algorithm is the *stochastic gradient algorithm* which writes as follows:\n", 165 | "* Select uniformly $i$ in $1,..,m$\n", 166 | "* $x_{k+1} = \\mathbf{prox}_{\\gamma_k g}\\left( x_k - \\gamma_k \\nabla f_i(x_k) \\right) $\n", 167 | "\n", 168 | "> Implement this algorithm with a stepsize vanishing as $1/k$" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Variance reduction\n", 183 | "\n", 184 | "The poor performance of this algorithm is notably due to the variance of the gradients. In order to overcome it, *variance reduced* algorithms have been proposed.\n", 185 | "\n", 186 | "We will consider here the popular **SAGA** algorithm (SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives\n", 187 | "A Defazio, F Bach, S Lacoste-Julien, NIPS 2014. ) \n", 188 | "\n", 189 | "> Implement SAGA from the paper ( http://papers.nips.cc/paper/5258-saga-a-fast-incremental-gradient-method-with-support-for-non-strongly-convex-composite-objectives ) and compare with the stochastic gradient algorithm.\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3 (ipykernel)", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.9.13" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 4 221 | } 222 | -------------------------------------------------------------------------------- /Lab7_StochasticMethods/algoProx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Proximal algorithms 5 | # 6 | # In this notebook, we code our proximal optimization algorithms. 7 | 8 | # # 1. 
Proximal Gradient algorithm 9 | # 10 | # For minimizing a function $F:\mathbb{R}^n \to \mathbb{R}$ equal to $f+g$ where $f$ is differentiable and the $\mathbf{prox}$ of $g$ is known, given: 11 | # * the function to minimize `F` 12 | # * a 1st order oracle for $f$ `f_grad` 13 | # * a proximity operator for $g$ `g_prox` 14 | # * an initialization point `x0` 15 | # * the sought precision `PREC` 16 | # * a maximal number of iterations `ITE_MAX` 17 | # * a display boolean variable `PRINT` 18 | # 19 | # these algorithms perform iterations of the form 20 | # $$ x_{k+1} = \mathbf{prox}_{\gamma g}\left( x_k - \gamma \nabla f(x_k) \right) $$ 21 | # where $\gamma$ is a stepsize to choose. 22 | 23 | # 24 | # 25 | # Q. How would you implement the precision stopping criterion? 26 | 27 | 28 | 29 | import numpy as np 30 | import timeit 31 | 32 | def proximal_gradient_algorithm(F , f_grad , g_prox , x0 , step , PREC , ITE_MAX , PRINT ): 33 | x = np.copy(x0) 34 | x_tab = np.copy(x) 35 | if PRINT: 36 | print("------------------------------------\n Proximal gradient algorithm\n------------------------------------\nSTART -- stepsize = {:0}".format(step)) 37 | t_s = timeit.default_timer() 38 | for k in range(ITE_MAX): 39 | g = f_grad(x) 40 | x = g_prox(x - step*g , step) ####### ITERATION 41 | 42 | x_tab = np.vstack((x_tab,x)) 43 | 44 | 45 | t_e = timeit.default_timer() 46 | if PRINT: 47 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,F(x))) 48 | return x,x_tab 49 | 50 | -------------------------------------------------------------------------------- /Lab7_StochasticMethods/logistic_regression_student.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Logistic Regression Problem 5 | # 6 | # 7 | # 8 | # ### Machine Learning as an Optimization problem 9 | # 10 | # We have some *data* $\mathcal{D}$ consisting of $m$ *examples* $\{d_i\}$; each example consisting of a *feature* vector $a_i\in\mathbb{R}^d$ and an *observation* $b_i\in \mathcal{O}$: $\mathcal{D} = \{[a_i,b_i]\}_{i=1..m}$. In this lab, we will consider the student performance dataset. 11 | # 12 | # 13 | # The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors. 14 | # 15 | # 16 | # A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\langle a_i , x \rangle$ is used to predict the value of the observation through a *predictor function* $g:\mathbb{R}\to \mathcal{O}$: $g(\langle a_i , x \rangle)$ is the predicted value from $a_i$. 17 | # 18 | # 19 | # In order to find such a parameter, we use the available data and a *loss* $\ell$ that penalizes the error made between the predicted $g(\langle a_i , x \rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \ell( g(\langle a_i , x \rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem 20 | # $$ \min_{x\in\mathbb{R}^d} \frac{1}{m} \sum_{i=1}^m f_i(x) = \frac{1}{m} \sum_{i=1}^m \ell( g(\langle a_i , x \rangle) ; b_i ). $$ 21 | # 22 | # 23 | # 24 | # # Regularized Problem 25 | # 26 | # In this lab, we will consider an $\ell_1$ regularization to promote sparsity of the iterates. A sparse final solution would select the most important features. 
The new function (below) is non-smooth but it has a smooth part, $f$, the same as in Lab3; and a non-smooth part, $g$, that we will treat with proximal operations. 27 | # 28 | # \begin{align*} 29 | # \min_{x\in\mathbb{R}^d } F(x) := \underbrace{ \frac{1}{m} \sum_{i=1}^m \log( 1+\exp(-b_i \langle a_i,x \rangle) ) + \frac{\lambda_2}{2} \|x\|_2^2}_{f(x)} + \underbrace{\lambda_1 \|x\|_1 }_{g(x)}. 30 | # \end{align*} 31 | # 32 | # 33 | 34 | 35 | # ### Function definition 36 | 37 | 38 | 39 | import numpy as np 40 | import csv 41 | from sklearn import preprocessing 42 | 43 | #### File reading 44 | dat_file = np.load('student.npz') 45 | A = dat_file['A_learn'] 46 | final_grades = dat_file['b_learn'] 47 | m = final_grades.size 48 | b = np.zeros(m) 49 | for i in range(m): 50 | if final_grades[i]>11: 51 | b[i] = 1.0 52 | else: 53 | b[i] = -1.0 54 | 55 | A_test = dat_file['A_test'] 56 | final_grades_test = dat_file['b_test'] 57 | m_test = final_grades_test.size 58 | b_test = np.zeros(m_test) 59 | for i in range(m_test): 60 | if final_grades_test[i]>11: 61 | b_test[i] = 1.0 62 | else: 63 | b_test[i] = -1.0 64 | 65 | 66 | d = 27 # features 67 | n = d+1 # with the intercept 68 | 69 | 70 | 71 | 72 | lam2 = 0.1 # for the 2-norm regularization best:0.1 73 | lam1 = 0.03 # for the 1-norm regularization best:0.03 74 | 75 | 76 | L = 0.25*max(np.linalg.norm(A,2,axis=1))**2 + lam2 77 | 78 | 79 | # ## Oracles 80 | # 81 | # ### Related to function $f$ 82 | 83 | 84 | 85 | def f(x): 86 | l = 0.0 87 | for i in range(A.shape[0]): 88 | if b[i] > 0 : 89 | l += np.log( 1 + np.exp(-np.dot( A[i] , x ) ) ) 90 | else: 91 | l += np.log( 1 + np.exp(np.dot( A[i] , x ) ) ) 92 | return l/m + lam2/2.0*np.dot(x,x) 93 | 94 | def f_grad(x): 95 | g = np.zeros(n) 96 | for i in range(A.shape[0]): 97 | if b[i] > 0: 98 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) ) 99 | else: 100 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) ) 101 | return g/m + lam2*x 102 | 103 | 104 | # ## Related to function $f_i$ (one example) 105 | 106 | # Q. 
To Fill 107 | 108 | 109 | 110 | def f_grad_ex(x,i): 111 | g = np.zeros(n) 112 | 113 | #### TODO 114 | 115 | return g 116 | 117 | 118 | # ### Related to function $g$ 119 | 120 | 121 | def g(x): 122 | return lam1*np.linalg.norm(x,1) 123 | 124 | def g_prox(x,gamma): 125 | p = np.zeros(n) 126 | for i in range(n): 127 | if x[i] < - lam1*gamma: 128 | p[i] = x[i] + lam1*gamma 129 | if x[i] > lam1*gamma: 130 | p[i] = x[i] - lam1*gamma 131 | return p 132 | 133 | 134 | # ### Related to function $F$ 135 | 136 | 137 | 138 | def F(x): 139 | return f(x) + g(x) 140 | 141 | 142 | # ## Prediction Function 143 | 144 | 145 | 146 | def prediction_train(w,PRINT): 147 | pred = np.zeros(A.shape[0]) 148 | perf = 0 149 | for i in range(A.shape[0]): 150 | p = 1.0/( 1 + np.exp(-np.dot( A[i] , w ) ) ) 151 | if p>0.5: 152 | pred[i] = 1.0 153 | if b[i]>0: 154 | correct = "True" 155 | perf += 1 156 | else: 157 | correct = "False" 158 | if PRINT: 159 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct)) 160 | else: 161 | pred[i] = -1.0 162 | if b[i]<0: 163 | correct = "True" 164 | perf += 1 165 | else: 166 | correct = "False" 167 | if PRINT: 168 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct)) 169 | return pred,float(perf)/A.shape[0] 170 | 171 | def prediction_test(w,PRINT): 172 | pred = np.zeros(A_test.shape[0]) 173 | perf = 0 174 | for i in range(A_test.shape[0]): 175 | p = 1.0/( 1 + np.exp(-np.dot( A_test[i] , w ) ) ) 176 | if p>0.5: 177 | pred[i] = 1.0 178 | if b_test[i]>0: 179 | correct = "True" 180 | perf += 1 181 | else: 182 | correct = "False" 183 | if PRINT: 184 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct)) 185 | else: 186 | pred[i] = -1.0 187 | if b_test[i]<0: 188 | correct = "True" 189 | perf += 1 190 | else: 191 | correct = "False" 192 | if PRINT: 193 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct)) 194 | return pred,float(perf)/A_test.shape[0] 195 | 196 | -------------------------------------------------------------------------------- /Lab7_StochasticMethods/plotLib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import Axes3D 6 | import time 7 | from IPython import display 8 | 9 | 10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ): 11 | 12 | def f_no_vector(x1,x2): 13 | return f( np.array( [x1,x2] ) ) 14 | 15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 16 | z = f_no_vector(x,y) 17 | 18 | fig = plt.figure() 19 | ax = fig.gca(projection='3d') 20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max) 21 | ax.set_zlim(v_min, v_max) 22 | plt.show() 23 | 24 | 25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 26 | 27 | 28 | def f_no_vector(x1,x2): 29 | return f( np.array( [x1,x2] ) ) 30 | 31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 32 | z = f_no_vector(x,y) 33 | 34 | fig = plt.figure() 35 | graphe = plt.contour(x,y,z,levels) 36 | #plt.plot(3,1,'r*',markersize=15) 37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 38 | plt.title(title) 39 | plt.show() 40 | 41 | 42 | def 
level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 43 | 44 | def f_no_vector(x1,x2): 45 | return f( np.array( [x1,x2] ) ) 46 | 47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 48 | z = f_no_vector(x,y) 49 | 50 | fig = plt.figure() 51 | graphe = plt.contour(x,y,z,levels) 52 | #plt.plot(3,1,'r*',markersize=15) 53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 54 | plt.title(title) 55 | 56 | if x_tab.shape[0] > 40: 57 | sub = int(x_tab.shape[0]/40.0) 58 | x_tab = x_tab[::sub] 59 | 60 | delay = 2.0/x_tab.shape[0] 61 | for k in range(x_tab.shape[0]): 62 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 63 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 64 | plt.draw() 65 | display.clear_output(wait=True) 66 | display.display(fig) 67 | time.sleep(delay) 68 | display.clear_output() 69 | plt.show() 70 | 71 | 72 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 73 | 74 | 75 | def f_no_vector(x1,x2): 76 | return f( np.array( [x1,x2] ) ) 77 | 78 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 79 | z = f_no_vector(x,y) 80 | 81 | fig = plt.figure() 82 | graphe = plt.contour(x,y,z,levels) 83 | #plt.plot(3,1,'r*',markersize=15) 84 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 85 | plt.xlim([x1_min,x1_max]) 86 | plt.ylim([x2_min,x2_max]) 87 | plt.title(title) 88 | 89 | if x_tab.shape[0] > 40: 90 | sub = int(x_tab.shape[0]/40.0) 91 | x_tab = x_tab[::sub] 92 | 93 | if x_tab2.shape[0] > 40: 94 | sub = int(x_tab2.shape[0]/40.0) 95 | x_tab2 = x_tab2[::sub] 96 | 97 | delay = 4.0/x_tab.shape[0] 98 | for k in range(x_tab.shape[0]): 99 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 100 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 101 | plt.draw() 102 | #plt.pause(delay) 103 | 104 | delay = 4.0/x_tab2.shape[0] 105 | for k in range(x_tab2.shape[0]): 106 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8) 107 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1])) 108 | #plt.pause(delay) 109 | plt.draw() 110 | 111 | plt.show() 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /Lab7_StochasticMethods/student.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/student.npz -------------------------------------------------------------------------------- /Lab7_StochasticMethods/student.txt: -------------------------------------------------------------------------------- 1 | # Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets: 2 | 1 sex - student's sex (binary: "F" - female or "M" - male) 3 | 2 age - student's age (numeric: from 15 to 22) 4 | 3 address - student's home address type (binary: "U" - urban or "R" - rural) 5 | 4 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3) 6 | 5 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart) 7 | 6 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) 8 | 7 Fedu - 
father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) 9 | 8 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) 10 | 9 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 11 | 10 failures - number of past class failures (numeric: n if 1<=n<3, else 4) 12 | 11 schoolsup - extra educational support (binary: yes or no) 13 | 12 famsup - family educational support (binary: yes or no) 14 | 13 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 15 | 14 activities - extra-curricular activities (binary: yes or no) 16 | 15 nursery - attended nursery school (binary: yes or no) 17 | 16 higher - wants to take higher education (binary: yes or no) 18 | 17 internet - Internet access at home (binary: yes or no) 19 | 18 romantic - with a romantic relationship (binary: yes or no) 20 | 19 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 21 | 20 freetime - free time after school (numeric: from 1 - very low to 5 - very high) 22 | 21 goout - going out with friends (numeric: from 1 - very low to 5 - very high) 23 | 22 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 24 | 23 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 25 | 24 health - current health status (numeric: from 1 - very bad to 5 - very good) 26 | 25 absences - number of school absences (numeric: from 0 to 93) 27 | 26 G1 - first period grade (numeric: from 0 to 20) 28 | 27 G2 - second period grade (numeric: from 0 to 20) 29 | 30 | 28 G3 - final grade (numeric: from 0 to 20, output target) 31 | 32 | Additional note: there are several (382) students that belong to both datasets . 33 | These students can be identified by searching for identical attributes 34 | that characterize each student, as shown in the annexed R file. 35 | -------------------------------------------------------------------------------- /Lab8_MinMax/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab8_MinMax/Fig/UGA.png -------------------------------------------------------------------------------- /Lab8_MinMax/Lab8_Two-player zero-sum games.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year\n", 9 | "\n", 10 | "Numerical Optimization\n", 11 | "Lab 9: Min-Max problem and Zero-sum games
" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The goal is to solve problems of the form\n", 19 | "$$\n", 20 | "\\max_{x \\in \\Delta_n} \\min_{y \\in \\Delta_n} x^T A y \\tag{MinMax}\n", 21 | "$$\n", 22 | "where $A \\in \\mathbb{R}^{n \\times n}$, and $x$ and $y$ are probability distributions over ${1,\\dots,n}$, i.e.~they belong to the simplex of size $n$:\n", 23 | "$$ \\Delta_n = \\left\\{ p \\in \\mathbb{R}^n : p\\geq 0 , \\sum_{i=1}^n p_i = 1 \\right\\}. $$\n", 24 | "Our aim is thus to find a tuple $(x^*, y^*) \\in \\Delta_n \\times \\Delta_n$ that solves $\\mathrm{(MinMax)}$. $x^*$ is given by \n", 25 | "\\begin{align}\n", 26 | "\\tag{P1}\n", 27 | " x^\\star = \\arg\\max_{x\\in\\Delta_n} \\min_{y\\in\\Delta_n} x^\\top A y\n", 28 | "\\end{align}\n", 29 | "\n", 30 | "while \n", 31 | "\\begin{align}\n", 32 | "\\tag{P2}\n", 33 | " y^\\star = \\arg\\min_{y\\in\\Delta_n} \\max_{x\\in\\Delta_n} x^\\top A y\n", 34 | "\\end{align}\n", 35 | "\n", 36 | "This last relation stems from the equality\n", 37 | "\\begin{align}\n", 38 | "\\min_{x \\in \\Delta_n} \\max_{y \\in \\Delta_n} x^T A y\n", 39 | "=\n", 40 | "\\max_{x \\in \\Delta_n} \\min_{y \\in \\Delta_n} x^T A y\n", 41 | "\\end{align}" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "One fundamental interpretation of this problem is that $(x^*, y^*)$ are the Nash Equilibrium of the associated Zero-sum game?.\n", 49 | "\n", 50 | "## Formulation of the Nash Equilibrium as the solution of a Min-Max problem (Optional)\n", 51 | "\n", 52 | "\n", 53 | "Let us consider a game with 2 players, both having $n$ possible actions.\n", 54 | "\n", 55 | "\n", 56 | "They play against each other and whenever Player 1 plays action \\#i and Player 2 plays action \\#j, P1 gets a reward of $g_{ij}\\in\\mathbb{R}$ while P2 gets $-g_{ij}\\in\\mathbb{R}$ (hence the name zero sum).\n", 57 | "\n", 58 | "\n", 59 | "The goal for both players is to find a Nash Equilibrium, that is a probability distribution over the actions for each player such that neither player has an individual interest to deviate from this strategy." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "\n", 67 | "\n", 68 | "Let us denote by $x$ the probability distribution of the actions of P1 (its \"strategy\"), and $y$ the one of P2. \n", 69 | "\n", 70 | "Both $x$ and $y$ are probability distributions over $n$ possible actions, thus they both belong to the simplex of size n:\n", 71 | "$$ \\Delta_n = \\left\\{ p \\in \\mathbb{R}^n : p\\geq 0 , \\sum_{i=1}^n p_i = 1 \\right\\} . $$\n", 72 | "\n", 73 | "\n", 74 | "Then, it can be shown that the NE is achieved by $(x^\\star,y^\\star)$ solution of the problems\n", 75 | "\\begin{align}\n", 76 | "%\\tag{P1}\n", 77 | " x^\\star = \\arg\\max_{x\\in\\Delta_n} \\min_{y\\in\\Delta_n} x^\\top A y\n", 78 | "\\end{align}\n", 79 | "where $A$ is the $n\\times n$ matrix such that $A_{ij} = g_{ij}$, the reward of P1 for actions $i$ and $j$.\n", 80 | "\n", 81 | "Similarly, we have\n", 82 | "\\begin{align}\n", 83 | "%\\tag{P2}\n", 84 | " y^\\star = \\arg\\min_{y\\in\\Delta_n} \\max_{x\\in\\Delta_n} x^\\top A y\n", 85 | "\\end{align}\n", 86 | "with the same matrix $A$." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "# Numerical computation of constrained Min-Max problems" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "In this lab, we will first consider a zero-sum game characterized by matrix $A=\\left[\\begin{array}{cc} -6 & 9 \\\\ 4 & -6 \\end{array}\\right]$ ." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import numpy as np\n", 110 | "import scipy.optimize as scopt" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "n = 2; m =2 # Dimension\n", 120 | "A = np.array([[-6,9],[4,-6]])\n", 121 | "A" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "n,m = A.shape" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "# Method 1: Linear Programming" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Optimal strategy for x\n", 145 | "\n", 146 | "We begin by finding the optimal $x^\\star$.\n", 147 | "\n", 148 | "> **1.** Reformulate the problem (P1) into a linear program and solve it using a LP solver." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### Optimal strategy for y\n", 163 | "\n", 164 | "> **2.** Do the same thing with (P2) to find $y^\\star$." 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "scrolled": true 172 | }, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "### Value of the game" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "> **3.** Compare the values of problems (P1) and (P2). What is remarkable about $A y^\\star$? About $A^\\top x^\\star$? " 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "# Method 2: Optimization \n", 202 | "\n", 203 | "Finding the solution of a min-max optimization problem is harder in general than for a simple minimization problem. Nevertheless, it can still be achieved by first-order ``gradient-like'' methods. This kind of setup has attracted a lot of interest in the 2020's for the training of Generative Adversarial Networks (GANs). \n", 204 | "\n", 205 | "To do so, we can define $X=(x,y)\\in \\Delta_n\\times\\Delta_n$ and $v(X) = (-A y, A^\\top x)$. To solve the problem\n", 206 | "\\begin{align}\n", 207 | "\\tag{P}\n", 208 | "\\max_{x\\in\\Delta_n} \\min_{y\\in\\Delta_n} x^\\top A y ,\n", 209 | "\\end{align}\n", 210 | "we can try to move oppositely to its direction (ie. 
do a gradient ascent on $ x\\mapsto x^\\top A y $ and a gradient descent on $ y\\mapsto x^\\top A y $:\n", 211 | "\\begin{align}\n", 212 | " \\tag{Gradient Descent Ascent}\n", 213 | " X_{k+1} = \\mathrm{proj}_{\\Delta_n\\times\\Delta_n} (X_k-\\gamma_k v(X_k)).\n", 214 | "\\end{align}\n", 215 | "\n", 216 | "\n", 217 | "We first define the vector field" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "def v(X):\n", 227 | " x = X[0:n]\n", 228 | " y = X[n:]\n", 229 | " return np.concatenate((-A.dot(y),A.T.dot(x)))" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "And the dimension of the variables space." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "N = 2*n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "We also need the projection to the contraints: $\\Delta_n\\times\\Delta_n$\n", 253 | "\n", 254 | "> **4.** Implement a function that projects a vector onto $\\Delta_n\\times\\Delta_n$" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "def proj_simplex(v):\n", 264 | " ## TODO\n", 265 | " return v\n", 266 | "\n", 267 | "def proj_2simplex(X):\n", 268 | " x = X[0:n]\n", 269 | " y = X[n:]\n", 270 | " return np.concatenate((proj_simplex(x),proj_simplex(y)))\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "#### Gradient Descent-Ascent\n", 278 | "\n", 279 | "> **5.** Run Gradient Descent Ascent by completing the code below." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "X = proj_2simplex(np.ones(N))\n", 289 | "K = 1000\n", 290 | "step = 0.01\n", 291 | "\n", 292 | "X_tab_GDA = np.copy(X)\n", 293 | "\n", 294 | "for k in range(1,K):\n", 295 | " X = X ## Step to fill\n", 296 | " if k%5==0:\n", 297 | " if k%25==0: print(\"ite. {:3d} : x= [{:.3f},{:.3f}] | y= [{:.3f},{:.3f}]\".format(k,X[0],X[1],X[2],X[3]))\n", 298 | " X_tab_GDA = np.vstack((X_tab_GDA,X))" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "> **6.** What do you observe in terms of convergence?" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "#### Extragradient\n", 313 | "\n", 314 | "To overcome the issues with gradient descent-ascent, the ExtraGradient method was proposed:\n", 315 | "\\begin{align}\n", 316 | " \\tag{ExtraGradient}\n", 317 | " \\left\\{ \n", 318 | " \\begin{array}{l}\n", 319 | " X_{k+1/2} = \\mathrm{proj}_{\\Delta_n\\times\\Delta_n} (X_k-\\gamma_k v(X_k) ) \\\\\n", 320 | " X_{k+1} = \\mathrm{proj}_{\\Delta_n\\times\\Delta_n} (X_k-\\gamma_k v(X_{k+1/2})))\n", 321 | " \\end{array}\n", 322 | " \\right. \n", 323 | "\\end{align}\n", 324 | "which intuitively consists in generating a leading point that will look forward the value of the field and apply it to the base point. This way, circular effects can be managed and convergence can be restored.\n", 325 | "\n", 326 | "\n", 327 | "> **7.** Run ExtraGradient by completing the code below." 
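To see why the leading point matters, here is an editor's illustration on an unconstrained toy problem (not the lab exercise itself): for the scalar game $\min_x \max_y \; xy$, the field is $v(x,y)=(y,-x)$, and plain gradient descent-ascent spirals away from the solution $(0,0)$ while the extragradient update contracts towards it.

import numpy as np

def v_toy(z):                          # field of min_x max_y x*y, with z = (x, y)
    return np.array([z[1], -z[0]])

gamma = 0.1
z_gda = np.array([1.0, 1.0])           # gradient descent-ascent iterate
z_eg = np.array([1.0, 1.0])            # extragradient iterate
for _ in range(100):
    z_gda = z_gda - gamma * v_toy(z_gda)    # plain GDA step
    z_lead = z_eg - gamma * v_toy(z_eg)     # extragradient: leading point
    z_eg = z_eg - gamma * v_toy(z_lead)     # base point moved with the field at the leading point
print(np.linalg.norm(z_gda), np.linalg.norm(z_eg))  # GDA has drifted away from (0,0), EG has moved closer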
328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "X = proj_2simplex(np.ones(N))\n", 337 | "K = 1000\n", 338 | "step = 0.01\n", 339 | "\n", 340 | "X_tab_EG = np.copy(X)\n", 341 | "\n", 342 | "for k in range(1,K):\n", 343 | " X_lead = X ## Step to fill\n", 344 | " X = X ## Step to fill\n", 345 | " if k%5==0:\n", 346 | " if k%25==0: print(\"ite. {:3d} : x= [{:.3f},{:.3f}] | y= [{:.3f},{:.3f}]\".format(k,X_lead[0],X_lead[1],X_lead[2],X_lead[3]))\n", 347 | " X_tab_EG = np.vstack((X_tab_EG,X_lead))\n", 348 | " " 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "#### Comparison\n", 356 | "\n", 357 | "> **8.** Compare Gradient Descent Ascent and ExtraGradient on the plot below.\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "import matplotlib.pyplot as plt\n", 367 | "\n", 368 | "plt.figure()\n", 369 | "plt.plot(X_tab_GDA[:,0],X_tab_GDA[:,2],color=\"red\",label=\"GDA\")\n", 370 | "plt.plot(X_tab_EG[:,0],X_tab_EG[:,2],color=\"blue\",label=\"EG\")\n", 371 | "plt.title(\"Behavior of x[1] and y[1]\")\n", 372 | "plt.legend()\n", 373 | "plt.show()" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "#### Mirror Prox\n", 381 | "\n", 382 | "A possibility to make the projections above easier to compute is to change the (implicit) Euclidean metric.\n", 383 | " For the simplex, an efficient example is the \\emph{Kullback-Leibler} divergence $D(x,y) = \\sum_{i=1}^n x_i\\log(x_i/y_i) - \\sum_{i=1}^n (x_i-y_i)$, which serves as a distance-like measure on strictly positive vectors.\n", 384 | " \n", 385 | "With this metric, for any positive vector $y$,\n", 386 | " \\begin{align}\n", 387 | " \\mathrm{proj}^{KL}_{\\Delta_n} (y) = \\arg\\min_{u\\in\\Delta_n} D(u,y) = \\frac{y}{ \\sum_{i=1}^n y_i} = \\frac{y}{ \\|y\\|_1}\n", 388 | " \\end{align}\n", 389 | " which is much easier to compute.\n", 390 | " \n", 391 | "By changing the metric in the ExtraGradient algorithm, that is, by going from $X_{k+1}=\\arg\\min_X\\{ \\gamma\\langle v(X_k),X\\rangle + \\frac{1}{2} \\|X-X_k\\|^2 \\}$ to $X_{k+1}=\\arg\\min_X\\{ \\gamma\\langle v(X_k),X\\rangle + D(X,X_k) \\}$, we obtain the Mirror-Prox method.\n", 392 | "\n", 393 | "\n", 394 | "> **9.** Show that \n", 395 | "> $$ \\arg\\min_X\\{ \\gamma\\langle v(X_k),X\\rangle + D(X,X_k) \\} = X_k \\exp(-\\gamma v(X_{k} )) $$\n", 396 | "\n", 397 | "\n", 398 | "The Mirror Prox algorithm then writes:\n", 399 | " \\begin{align}\n", 400 | " \\tag{Mirror Prox}\n", 401 | " \\left\\{ \n", 402 | " \\begin{array}{l}\n", 403 | " (a_{k+1/2},b_{k+1/2}) = X_k \\exp(-\\gamma v(X_k)) \\\\\n", 404 | " X_{k+1/2} = (\\frac{a_{k+1/2}}{\\|a_{k+1/2}\\|_1},\\frac{b_{k+1/2}}{\\|b_{k+1/2}\\|_1}) \\\\\n", 405 | " (a_{k+1},b_{k+1}) = X_k \\exp(-\\gamma v(X_{k+1/2})) \\\\\n", 406 | " X_{k+1} = (\\frac{a_{k+1}}{\\|a_{k+1}\\|_1},\\frac{b_{k+1}}{\\|b_{k+1}\\|_1}) \\\\\n", 407 | " \\end{array}\n", 408 | " \\right. .\n", 409 | " \\end{align}\n", 410 | "\n", 411 | "\n", 412 | "This is ExtraGradient, but with this adapted geometry." 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "\n", 420 | "> **10.** Run Mirror Prox by completing the code below and compare its behavior with the previous methods."
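For question 10, the Mirror Prox step can be sketched in the same spirit: a multiplicative (entropic) update followed by a block-wise $\ell_1$ normalization, exactly as in the (Mirror Prox) display above. The helper name `kl_step` below is ours, not the lab's; the sketch reuses `v`, `n`, `step` and `X` from the previous cells, and the two commented lines are meant to replace the `## Step to fill` placeholder.

def kl_step(X_base, G, gamma):
    # multiplicative update X_base * exp(-gamma * G), then normalize each block;
    # the entries stay positive, so np.sum equals the l1 norm on each block
    W = X_base * np.exp(-gamma * G)
    a, b = W[:n], W[n:]
    return np.concatenate((a / np.sum(a), b / np.sum(b)))

# inside the Mirror Prox loop:
#     X_lead = kl_step(X, v(X), step)
#     X      = kl_step(X, v(X_lead), step)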
421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "X = proj_2simplex(np.ones(N))\n", 430 | "K = 1000\n", 431 | "step = 0.05\n", 432 | "\n", 433 | "X_tab_MP = np.copy(X)\n", 434 | "\n", 435 | "for k in range(1,K):\n", 436 | " X_lead = X_lead ## Step to fill\n", 437 | " if k%1==0:\n", 438 | " if k%25==0: print(\"ite. {:3d} : x= [{:.3f},{:.3f}] | y= [{:.3f},{:.3f}]\".format(k,X_lead[0],X_lead[1],X_lead[2],X_lead[3]))\n", 439 | " X_tab_MP = np.vstack((X_tab_MP,X_lead))\n", 440 | " " 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "import matplotlib.pyplot as plt\n", 450 | "\n", 451 | "plt.figure()\n", 452 | "plt.plot(X_tab_GDA[:,0],X_tab_GDA[:,2],color=\"red\",label=\"GDA\")\n", 453 | "plt.plot(X_tab_EG[:,0],X_tab_EG[:,2],color=\"blue\",label=\"EG\")\n", 454 | "plt.plot(X_tab_MP[:,0],X_tab_MP[:,2],color=\"green\",label=\"MP\")\n", 455 | "plt.title(\"Behavior of x[1] and y[1]\")\n", 456 | "plt.legend()\n", 457 | "plt.show()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [] 480 | } 481 | ], 482 | "metadata": { 483 | "kernelspec": { 484 | "display_name": "Python 3 (ipykernel)", 485 | "language": "python", 486 | "name": "python3" 487 | }, 488 | "language_info": { 489 | "codemirror_mode": { 490 | "name": "ipython", 491 | "version": 3 492 | }, 493 | "file_extension": ".py", 494 | "mimetype": "text/x-python", 495 | "name": "python", 496 | "nbconvert_exporter": "python", 497 | "pygments_lexer": "ipython3", 498 | "version": "3.9.13" 499 | } 500 | }, 501 | "nbformat": 4, 502 | "nbformat_minor": 4 503 | } 504 | -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/._1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._1.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/._2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._2.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/._3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._3.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/._4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._4.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/._5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._5.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/._UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._UGA.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/1.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/2.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/3.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/4.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/5.png -------------------------------------------------------------------------------- /Lab9_Uzawa/Fig/UGA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/UGA.png -------------------------------------------------------------------------------- /Lab9_Uzawa/plotLib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | import matplotlib.pyplot as plt 4 | from matplotlib import cm 5 | from mpl_toolkits.mplot3d import Axes3D 6 | import time 7 | from IPython import display 8 | 9 | 10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ): 11 | 12 | def f_no_vector(x1,x2): 13 | return f( np.array( [x1,x2] ) ) 14 | 15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 16 | z = f_no_vector(x,y) 17 | 18 | fig = plt.figure(figsize = (12, 6)) 19 | ax = fig.gca(projection='3d') 20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max) 21 | ax.set_zlim(v_min, v_max) 22 | plt.show() 23 | 24 | 25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 26 | 27 | 28 | def f_no_vector(x1,x2): 29 | return f( np.array( [x1,x2] ) ) 30 | 31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 32 | z = f_no_vector(x,y) 33 | 34 | fig = plt.figure(figsize = (12, 
6)) 35 | graphe = plt.contour(x,y,z,levels) 36 | #plt.plot(3,1,'r*',markersize=15) 37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 38 | plt.title(title) 39 | plt.show() 40 | 41 | 42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 43 | 44 | def f_no_vector(x1,x2): 45 | return f( np.array( [x1,x2] ) ) 46 | 47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 48 | z = f_no_vector(x,y) 49 | 50 | fig = plt.figure(figsize = (12, 6)) 51 | graphe = plt.contour(x,y,z,levels) 52 | #plt.plot(3,1,'r*',markersize=15) 53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 54 | plt.title(title) 55 | 56 | delay = 4.0/x_tab.shape[0] 57 | for k in range(x_tab.shape[0]): 58 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 59 | plt.xlim([x1_min,x1_max]) 60 | plt.ylim([x2_min,x2_max]) 61 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 62 | plt.draw() 63 | display.clear_output(wait=True) 64 | display.display(fig) 65 | time.sleep(delay) 66 | display.clear_output() 67 | plt.show() 68 | 69 | 70 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ): 71 | 72 | 73 | def f_no_vector(x1,x2): 74 | return f( np.array( [x1,x2] ) ) 75 | 76 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points)) 77 | z = f_no_vector(x,y) 78 | 79 | fig = plt.figure(figsize = (12, 6)) 80 | graphe = plt.contour(x,y,z,levels) 81 | #plt.plot(3,1,'r*',markersize=15) 82 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f') 83 | plt.xlim([x1_min,x1_max]) 84 | plt.ylim([x2_min,x2_max]) 85 | plt.title(title) 86 | 87 | delay = 4.0/x_tab.shape[0] 88 | for k in range(x_tab.shape[0]): 89 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10) 90 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1])) 91 | plt.draw() 92 | #plt.pause(delay) 93 | 94 | delay = 4.0/x_tab2.shape[0] 95 | for k in range(x_tab2.shape[0]): 96 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8) 97 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1])) 98 | #plt.pause(delay) 99 | plt.draw() 100 | 101 | plt.show() 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NumericalOptimization 2 | Jupyter Notebooks for the M1 MSIAM Course "Numerical Optimization" at Université Grenoble Alpes 3 | -------------------------------------------------------------------------------- /Tuto1_Basics/harder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/harder.png -------------------------------------------------------------------------------- /Tuto1_Basics/poly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/poly.png -------------------------------------------------------------------------------- /Tuto1_Basics/rosenbrock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/rosenbrock.png -------------------------------------------------------------------------------- /Tuto1_Basics/simple.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/simple.png -------------------------------------------------------------------------------- /Tuto1_Basics/tuto1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/tuto1.pdf -------------------------------------------------------------------------------- /Tuto1_Basics/tuto1.tex: -------------------------------------------------------------------------------- 1 | %\documentclass[paper=a4, fontsize=9pt]{article} 2 | \documentclass[a4paper,twoside,10pt]{amsart} 3 | 4 | 5 | %\usepackage[scale=0.8]{geometry} 6 | \usepackage{fullpage} 7 | 8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs 9 | \usepackage[english]{babel} % English language/hyphenation 10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages 11 | \usepackage{xcolor} 12 | \usepackage{hyperref} 13 | \usepackage{tcolorbox} 14 | 15 | \usepackage{tikz} 16 | \usepackage{tkz-graph} 17 | 18 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 19 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 20 | \numberwithin{table}{section} % Number tables within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 21 | \usepackage{graphicx} 22 | \usepackage{caption} 23 | \usepackage{subcaption} 24 | 25 | 26 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height 27 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height 28 | 29 | \newtheorem{theo}{Theorem} 30 | \newtheorem{lemma}{Lemma} 31 | \theoremstyle{definition} 32 | \newtheorem{q_td}{Exercise } 33 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}} 34 | \newtheorem{q_tp}{$\diamond$} 35 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}} 36 | 37 | \begin{document} 38 | 39 | %---------------------------------------------------------------------------------------- 40 | % TITLE 41 | %---------------------------------------------------------------------------------------- 42 | 43 | 44 | \normalfont \normalsize 45 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\ 46 | \noindent\textsc{\small \hfill MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s) 47 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule 48 | \begin{center} 49 | {\LARGE \scshape Numerical Optimization\\ Tuto 1: Gradients and Minimization} \\ % The title 50 | \end{center} 51 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler } 52 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule 53 | 54 | 55 | 56 | %---------------------------------------------------------------------------------------- 57 | % TD 58 | %---------------------------------------------------------------------------------------- 59 | %\newpage 60 | \setcounter{section}{0} 61 | \renewcommand{\thesection}{\Alph{section}} 62 | \renewcommand*{\theHsection}{TD.\the\value{section}} 63 | 64 | 65 | \vspace*{0.5cm} 66 | 67 | \section{Differentiability, Minima, and Convexity} 68 | 69 | 70 | \begin{q_td}[Quadratic functions]\label{td:qp}\hfill 71 | 72 | \begin{itemize} 73 | \item[a.] 
In $\mathbb{R}^n$, compute the gradient of the squared Euclidean norm $\|\cdot\|_2^2$ at a generic point $x\in\mathbb{R}^n$. 74 | \item[b.] Let $A$ be an $m \times n$ real matrix and $b$ a size-$m$ real vector. We define $f(x) = \|Ax-b\|_2^2$. For a generic vector $a\in \mathbb{R}^n$, compute the gradient $\nabla f(a)$ and Hessian $H_f(a)$. 75 | \item[c.] Let $C$ be an $n \times n$ real matrix, $d$ a size-$n$ real vector, and $e\in\mathbb{R}$. We define $g(x) = x^\mathrm{T}Cx + d^\mathrm{T}x + e$. For a generic vector $a\in \mathbb{R}^n$, compute the gradient $\nabla g(a)$ and Hessian $H_g(a)$. 76 | \item[d.] Can all functions of the form of $f$ and be written in the form of $g$? And conversely? 77 | \end{itemize} 78 | \end{q_td} 79 | 80 | 81 | \vspace*{0.5cm} 82 | 83 | \begin{q_td}[Basic Differential calculus] 84 | \label{td:conv} 85 | Use the composition lemma to compute the gradients of: 86 | \begin{itemize} 87 | \item[a.] $f_1(x) = \|Ax-b\|_2^2$ . 88 | \item[b.] $f_2(x) = \|x\|_2$ . 89 | \end{itemize} 90 | \end{q_td} 91 | 92 | 93 | \vspace*{0.5cm} 94 | 95 | \begin{q_td}[Preparing the Lab] 96 | \label{td:fun} 97 | In the first lab, we will consider the following toy functions: 98 | \begin{align*} 99 | & \begin{array}{rrcll} 100 | f: & \mathbb{R}^2 & \to &\mathbb{R}\\ 101 | & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2 102 | \end{array}\\ 103 | % 104 | & \begin{array}{rrcll} 105 | g: & \mathbb{R}^2 & \to &\mathbb{R}\\ 106 | & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3) 107 | \end{array} \\ 108 | % 109 | & \begin{array}{rrcll} 110 | r: & \mathbb{R}^2 & \to &\mathbb{R}\\ 111 | & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2 112 | \end{array}\\ 113 | % 114 | & \begin{array}{rrcll} 115 | t: & \mathbb{R}^2 & \to &\mathbb{R}\\ 116 | & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2 117 | \end{array}\\ 118 | % 119 | & \begin{array}{rrcll} 120 | p: & \mathbb{R}^2 & \to &\mathbb{R}\\ 121 | & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| . 122 | \end{array} 123 | \end{align*} 124 | \begin{itemize} 125 | \item[a.] From the 3D plots of \ref{fig:3d}, which functions are visibly non-convex. 126 | \item[b.] For all five functions, show that they are convex or give an argument for their non-convexity. 127 | \item[c.] For functions $f,g,r,t$, compute their gradient. 128 | \item[d.] For functions $f,g$, compute their Hessian. 129 | \end{itemize} 130 | \end{q_td} 131 | 132 | 133 | 134 | \begin{figure}[h!] 
135 | \centering 136 | \begin{subfigure}[b]{0.48\textwidth} 137 | \centering 138 | \includegraphics[width=1.0\textwidth]{simple.png} 139 | \caption{a \emph{simple} function: $f$} 140 | \end{subfigure} 141 | ~ 142 | \begin{subfigure}[b]{0.48\textwidth} 143 | \centering 144 | \includegraphics[width=1.0\textwidth]{harder.png} 145 | \caption{some \emph{harder} function: $g$} 146 | \end{subfigure} \\ 147 | \centering 148 | \begin{subfigure}[b]{0.48\textwidth} 149 | \centering 150 | \includegraphics[width=1.0\textwidth]{rosenbrock.png} 151 | \caption{\emph{Rosenbrock}'s function: $r$} 152 | \end{subfigure} 153 | ~ 154 | \begin{subfigure}[b]{0.48\textwidth} 155 | \centering 156 | \includegraphics[width=1.0\textwidth]{two_pits.png} 157 | \caption{\emph{two pits} function: $t$} 158 | \end{subfigure} 159 | ~ 160 | \begin{subfigure}[b]{0.48\textwidth} 161 | \centering 162 | \includegraphics[width=1.0\textwidth]{poly.png} 163 | \caption{\emph{polyhedral} function: $p$} 164 | \end{subfigure} 165 | \caption{3D plots of the considered functions} 166 | \label{fig:3d} 167 | \end{figure} 168 | 169 | 170 | 171 | \vspace*{0.5cm} 172 | 173 | \begin{q_td}[Fundamentals of convexity] 174 | \label{td:conv} 175 | ~ 176 | \begin{itemize} 177 | \item[a.] Let $f$ and $g$ be two convex functions. Show that $m(x) = \max(f(x),g(x) )$ is convex. 178 | \item[b.] Show that $f_1(x) = \max(x^2-1 , 0)$ is convex. 179 | \item[c.] Let $f$ be a convex function and $g$ be a convex, non-decreasing function. Show that $c(x) = g(f(x))$ is convex. 180 | \item[d.] Show that $f_2(x) = \exp(x^2)$ is convex. What about $f_3(x) = \exp(-x^2)$ 181 | \item[e.] Justify why the $1$-norm, the $2$ norm, and the squared $2$-norm are convex. 182 | \end{itemize} 183 | \end{q_td} 184 | 185 | \vspace*{0.5cm} 186 | 187 | \begin{q_td}[Strict and strong convexity] 188 | \label{td:qp} A function $f:\mathbb{R}^n \to \mathbb{R}$ is said 189 | \begin{itemize} 190 | \item \emph{strictly convex} if for any $x \neq y \in\mathbb{R}^n$ and any $\alpha\in]0,1[$ 191 | $$ f(\alpha x + (1- \alpha )y ) < \alpha f(x) + (1- \alpha )f(y) $$ 192 | \item \emph{strongly convex} if there exists $\beta>0$ such that $f - \frac{\beta}{2}\|\cdot\|_2^2$ is convex. 193 | \end{itemize} 194 | \begin{itemize} 195 | \item[a.] For a strictly convex function $f$, show that the problem 196 | $$ \left\{ \begin{array}{l} \min f(x) \\ x \in C \end{array} \right. $$ 197 | where $C$ is a convex set admits at most one solution. 198 | \item[b.] Show that a strongly convex function is also strictly convex.\\ \emph{(hint: use the identity $\|\alpha x + (1-\alpha)y\|^2 = \alpha \|x\|^2 + (1-\alpha)\|y\|^2 - \alpha (1-\alpha)\|x-y\|^2 $.)} 199 | \end{itemize} 200 | \end{q_td} 201 | 202 | \vspace*{0.5cm} 203 | 204 | 205 | \begin{q_td}[Optimality conditions] 206 | \label{td:opt} 207 | Let $f:\mathbb{R}^n\to\mathbb{R}$ be a twice differentiable function and $\bar{x}\in\mathbb{R}^n$. We suppose that $f$ admits a local minimum at $\bar{x}$ that is $f(x)\geq f(\bar{x})$ for all $x$ in a neighborhood\footnote{Formally, one would write $\forall x \in \mathbb{R}^n$ such that $\|x-\bar{x}\|\leq \varepsilon$ for $\varepsilon>0$ and some norm $\|\cdot\|$. } of $\bar{x}$. 208 | \begin{itemize} 209 | \item[a.] For any direction $u\in\mathbb{R}^n$, we define the $\mathbb{R}\to\mathbb{R}$ function $q(t) = f(\bar{x}+tu)$. Compute $q'(t)$. 210 | \item[b.] By using the first order Taylor expansion of $q$ at $0$, show that $\nabla f(\bar{x}) = 0$. 211 | \item[c.] Compute $q''(t)$. 
By using the second order Taylor expansion of $q$ at $0$, show that $\nabla^2 f(\bar{x})$ is positive semi-definite. 212 | \end{itemize} 213 | \end{q_td} 214 | 215 | \vspace*{1cm} 216 | 217 | 218 | 219 | 220 | \section{the Gradient Algorithm} 221 | 222 | \begin{q_td}[Descent lemma] 223 | \label{td:smooth} 224 | A function $f:\mathbb{R}^n\to\mathbb{R}$ is said to be $L$-smooth if it is differentiable and its gradient $\nabla f$ is $L$-Lipchitz continuous, that is 225 | $$\forall x,y\in\mathbb{R}^n, ~~  \|\nabla f(x) - \nabla f(y) \| \leq L \|x-y\|. $$ 226 | The goal of the exercise is to prove that if $f:\mathbb{R}^n\to\mathbb{R}$ is $L$-smooth, then for all $x,y\in\mathbb{R}^n$, 227 | $$ f(x) \leq f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{L}{2} \| x-y\|^2 $$ 228 | \begin{itemize} 229 | \item[a.] Starting from fundamental theorem of calculus stating that for all $x,y\in\mathbb{R}^n$, 230 | $$ f(x) - f(y) = \int_{0}^1 (x-y)^\mathrm{T} \nabla f(y + t(x-y) ) \mathrm{d}t $$ 231 | prove the descent lemma. 232 | \item[b.] Give a function for which the inequality is tight and one for which it is not. 233 | \end{itemize} 234 | \end{q_td} 235 | 236 | \vspace*{0.5cm} 237 | 238 | \begin{q_td}[Smooth functions] 239 | Consider the constant stepsize gradient algorithm $x_{k+1} = x_k - \gamma \nabla f(x_k)$ on an $L$-smooth function $f$ with some minimizer (i.e. some $x^\star$ such that $f(x)\geq f(x^\star)$ for all $x$). 240 | \begin{itemize} 241 | \item[a.] Use the \emph{descent lemma} to prove convergence of the sequence $(f(x_k))_k$ when $\gamma\leq 2/L$. 242 | \item[b.] Did you use at some point that the function was convex? Conclude about the convergence of the gradient algorithm on smooth non-convex functions. 243 | \end{itemize} 244 | \end{q_td} 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | \end{document} 253 | -------------------------------------------------------------------------------- /Tuto1_Basics/two_pits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/two_pits.png -------------------------------------------------------------------------------- /Tuto4_Prox/tuto4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto4_Prox/tuto4.pdf -------------------------------------------------------------------------------- /Tuto4_Prox/tuto4.tex: -------------------------------------------------------------------------------- 1 | %\documentclass[paper=a4, fontsize=9pt]{article} 2 | \documentclass[a4paper,twoside,10pt]{amsart} 3 | 4 | 5 | %\usepackage[scale=0.8]{geometry} 6 | \usepackage{fullpage} 7 | 8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs 9 | \usepackage[english]{babel} % English language/hyphenation 10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages 11 | \usepackage{xcolor} 12 | \usepackage{hyperref} 13 | 14 | \usepackage{tikz} 15 | \usepackage{tkz-graph} 16 | 17 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 18 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 19 | \numberwithin{table}{section} % Number tables within sections (i.e. 
1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 20 | \usepackage{graphicx} 21 | \usepackage{caption} 22 | \usepackage{subcaption} 23 | 24 | 25 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height 26 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height 27 | 28 | \newtheorem{theo}{Theorem} 29 | \newtheorem{lemma}{Lemma} 30 | \theoremstyle{definition} 31 | \newtheorem{q_td}{Exercise } 32 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}} 33 | \newtheorem{q_tp}{$\diamond$} 34 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}} 35 | 36 | \begin{document} 37 | 38 | %---------------------------------------------------------------------------------------- 39 | % TITLE 40 | %---------------------------------------------------------------------------------------- 41 | 42 | 43 | \normalfont \normalsize 44 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\ 45 | \noindent\textsc{ MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s) 46 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule 47 | \begin{center} 48 | {\LARGE \scshape Numerical Optimization \\ 49 | Tuto 4: Proximal methods} \\ % The title 50 | \end{center} 51 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler } 52 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule 53 | 54 | 55 | 56 | %---------------------------------------------------------------------------------------- 57 | % TD 58 | %---------------------------------------------------------------------------------------- 59 | %\newpage 60 | \setcounter{section}{0} 61 | \renewcommand{\thesection}{\Alph{section}} 62 | \renewcommand*{\theHsection}{TD.\the\value{section}} 63 | 64 | 65 | \vspace*{0.5cm} 66 | 67 | \section{the Proximity operator} 68 | 69 | In non-smooth optimization, that is when the objective function is not differentiable, the gradient may not be defined at each point. Instead, for any point $x\in\mathbb{R}$ and any convex function $g:\mathbb{R}^n \to \mathbb{R}\cup\{+\infty\}$, one can define a subdifferential $\partial g(x) \subset \mathbb{R}^n$ as 70 | $$ \partial g(x) = \{ u\in\mathbb{R}^n | g(z) \geq g(x) + \langle u ; z-x \rangle \text{ for all } z\in\mathbb{R}^n \}. $$ 71 | The optimality conditions and computation rules roughly translate. 72 | 73 | 74 | However, the sub-gradient algorithm $x_{k+1} = x_k - \gamma_k g_k$ where $g_k\in \partial g(x_k)$ rely on a vanishing stepsize $\gamma_k$ and is thus very slow in practice. In order to mend this case, a more evolved operator was introduced: its \emph{proximity operator} is defined for some positive constant $\gamma>0$ as 75 | \begin{equation} 76 | x = \mathbf{prox}_{\gamma g}(y) = \arg\min_{w\in\mathbb{R}^n} \left\{ \gamma g(w) + \frac{1}{2} \left\| w - y \right\|^2 \right\} . 77 | \end{equation} 78 | 79 | 80 | 81 | 82 | \begin{q_td}[First Properties]\label{td:prox0}\hfill 83 | 84 | \begin{itemize} 85 | \item[a.] Justify that for a proper convex function $g$, this definition as an $\arg\min$ indeed leads to a unique point. Would it still be the case if $g$ was not convex? 86 | \item[b.] This operation is sometimes called \emph{implicit gradient}. Find an explanation why.\\ 87 | \emph{\small Hint: Use First order optimality conditions.} 88 | \item[c.] Let $x = \mathbf{prox}_{\gamma g}(y)$ and $x' = \mathbf{prox}_{\gamma g}(y')$, show that 89 | $$ \|x - x'\|^2 \leq \langle x' - y ' ; x- y \rangle . 
$$ 90 | \emph{\small Hint: if $g_{x} \in \partial g(x)$ and $g_{x'} \in \partial g(x')$, the convexity of $g$ gives $\langle x -x'; g_x - g_{x'} \rangle \geq 0$.} 91 | \item[d.] Deduce that 92 | $$ \|x - x'\|^2 \leq \| y - y' \|^2 - \| (x-y) - (x'-y') \|^2 $$ 93 | and investigate the similarities with the gradient of a smooth function. 94 | \end{itemize} 95 | \end{q_td} 96 | 97 | \vspace*{0.5cm} 98 | 99 | We showed that the proximity operator of a convex function has the same contraction properties of a gradient operation with step $1/L$ on an $L$-smooth convex function. Let us now investigate the related algorithm. 100 | 101 | \vspace*{0.5cm} 102 | 103 | \begin{q_td}[Proximal point algorithm]\label{td:prox} The proximal point algorithm is simply obtained by successively applying the proximity operator of a function: 104 | $$x_{k+1} = \mathbf{prox}_{\gamma g}(x_k)$$ 105 | \begin{itemize} 106 | \item[a.] Let $x^\star$ be a \emph{fixed point} of $g$ (we will suppose that such a point exists), that is $x^\star = \mathbf{prox}_{\gamma g}(x^\star)$. Show that $x^\star$ is a minimizer of $g$. \\ 107 | \emph{\small Hint: Use First order optimality conditions.} 108 | \item[b.] Show that if $x = \mathbf{prox}_{\gamma g}(y) $, then $g(x)\leq g(y) - \frac{1}{2\gamma} \|x-y\|^2$.\\ 109 | \emph{\small Hint: Use that for $f$ $\mu$-strongly convex and $x^\star$ the minimizer of $f$, then $f(x^\star) \leq f(y) - \frac{\mu}{2}\|x^\star-y\|^2$.} 110 | \item[c.] Conclude that the \emph{Proximal Point Algorithm} converge to a minimizer of $g$. 111 | \end{itemize} 112 | \end{q_td} 113 | 114 | \vspace*{0.5cm} 115 | 116 | Now that we have seen the optimization-wise interest of the proximity operator, let us compute it explicitly on some functions. 117 | 118 | \vspace*{0.5cm} 119 | 120 | \begin{q_td}[Proximity Operators of basic functions] 121 | \label{td:fun} 122 | Compute the proximity operators of the following functions: 123 | \begin{itemize} 124 | \item[a.] $g_1(x) = \| x \|_2^2$ . 125 | \item[b.] $g_2(x) = \iota_C(x)$ with $\iota_C(x) = 0$ if $x$ belongs to convex set $C$ and $+\infty$ elsewhere. 126 | \item[c.] $g_3(x) = \|x\|_1 $ . 127 | \item[d.] $g_4(x) = \|x\|_2 $ . 128 | \end{itemize} 129 | \end{q_td} 130 | 131 | \vspace*{0.5cm} 132 | 133 | Unfortunately, in general, no explicit formulation can be found but i) the sub-optimization problems are now strongly convex and thus easier to solve; and more interestingly ii) proximity operator can be merged with other algorithms in order to minimize general functions. These algorithms are called \emph{proximal algorithms} of which the most popular is the proximal gradient algorithm which mixes gradient and proximity operations. 134 | 135 | \vspace*{0.5cm} 136 | 137 | \section{the Proximal Gradient algorithm} 138 | 139 | 140 | Let us consider the \emph{composite} optimization problem 141 | $$ \min_{x\in\mathbb{R}^n} F(x) := f(x) + g(x)$$ 142 | where $f:\mathbb{R}^n \to \mathbb{R}$ is $L$-smooth and convex; and $g:\mathbb{R}^n \to \mathbb{R}\cup\{+\infty\}$ is convex. The \emph{proximal gradient algorithm} writes 143 | $$ x_{k+1} = \mathbf{prox}_{\gamma g}\left( x_k - \gamma \nabla f(x_k) \right) . $$ 144 | 145 | \begin{q_td}[Analysis] 146 | \label{td:ana}\hfill 147 | 148 | 149 | \begin{itemize} 150 | \item[a.] Show that the fixed points of the iteration above are minimizers of $F$. 151 | \item[b.] Connect the proximal gradient with the projected gradient algorithm. 152 | \item[c.] 
Show that 153 | $$ F(x_{k+1}) \leq F(x_k) - \frac{(2-\gamma L)}{2\gamma} \|x_{k+1} - x_k \|^2 . $$ 154 | \emph{\small Hint: Use the descent lemmas for the gradient on smooth functions and the proximal point algorithm.} 155 | \item[d.] Give a range of stepsizes for which the sequence $F(x_k)$ converges as soon as minimizer exists. 156 | \end{itemize} 157 | \end{q_td} 158 | 159 | 160 | \vspace*{0.5cm} 161 | 162 | \begin{q_td}[Application] 163 | \label{td:app} 164 | The \emph{lasso} problem is a regularized linear regression problem that writes as 165 | $$ \min_{x\in\mathbb{R}^n } \frac{1}{2}\|Ax-b\|^2 + \lambda \|x\|_1 $$ 166 | where $A$ is a full rank $m\times n$ matrix and $b$ is a size $m$ vector. 167 | \begin{itemize} 168 | \item[a.] Write the iterations for a proximal gradient algorithm. Which stepsize can be used? 169 | \item[b.] The regularization $\lambda \|x\|_1$ is said to be \emph{sparsity enforcing}, guess why. 170 | \end{itemize} 171 | \end{q_td} 172 | 173 | 174 | 175 | 176 | \end{document} 177 | -------------------------------------------------------------------------------- /Tuto5_Rates/tuto5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto5_Rates/tuto5.pdf -------------------------------------------------------------------------------- /Tuto5_Rates/tuto5.tex: -------------------------------------------------------------------------------- 1 | %\documentclass[paper=a4, fontsize=9pt]{article} 2 | \documentclass[a4paper,twoside,10pt]{amsart} 3 | 4 | 5 | %\usepackage[scale=0.8]{geometry} 6 | \usepackage{fullpage} 7 | 8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs 9 | \usepackage[english]{babel} % English language/hyphenation 10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages 11 | \usepackage{xcolor} 12 | \usepackage{hyperref} 13 | \usepackage{tcolorbox} 14 | 15 | \usepackage{tikz} 16 | \usepackage{tkz-graph} 17 | 18 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 19 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 20 | \numberwithin{table}{section} % Number tables within sections (i.e. 
1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 21 | \usepackage{graphicx} 22 | \usepackage{caption} 23 | \usepackage{subcaption} 24 | 25 | 26 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height 27 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height 28 | 29 | \newtheorem{theo}{Theorem} 30 | \newtheorem{lemma}{Lemma} 31 | \theoremstyle{definition} 32 | \newtheorem{q_td}{Exercise } 33 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}} 34 | \newtheorem{q_tp}{$\diamond$} 35 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}} 36 | 37 | \begin{document} 38 | 39 | %---------------------------------------------------------------------------------------- 40 | % TITLE 41 | %---------------------------------------------------------------------------------------- 42 | 43 | 44 | \normalfont \normalsize 45 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\ 46 | \noindent\textsc{\small MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s) 47 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule 48 | \begin{center} 49 | {\LARGE \scshape Numerical Optimization \\ Tuto 5: Rates of first-order methods} \\ % The title 50 | \end{center} 51 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler } 52 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule 53 | 54 | 55 | 56 | %---------------------------------------------------------------------------------------- 57 | % TD 58 | %---------------------------------------------------------------------------------------- 59 | %\newpage 60 | \setcounter{section}{0} 61 | \renewcommand{\thesection}{\Alph{section}} 62 | \renewcommand*{\theHsection}{TD.\the\value{section}} 63 | 64 | 65 | \vspace*{0.5cm} 66 | 67 | In the whole tutorial, we will assume that $f: \mathbb{R}^n\to \mathbb{R}$ is an $L$-smooth \emph{convex} function with minimizers. 68 | 69 | \section{Convergence rates in the strongly convex case} 70 | \vspace*{0.5cm} 71 | 72 | 73 | \begin{q_td}[Some other descent lemmas] \label{td:descent2} \hfill 74 | 75 | The goal of this exercise is to provide useful lemmas for proving convergence rates. Let $x^\star$ be a minimizer of $f$. 76 | \begin{itemize} 77 | \item[a.] Show that for all $x,y\in\mathbb{R}^n$, 78 | $$ f(x) - f(y) \leq \langle x- y ; \nabla f(x) \rangle - \frac{1}{2L} \| \nabla f(x) - \nabla f(y) \|^2 $$ 79 | and thus $$ \frac{1}{L} \| \nabla f(x) - \nabla f(y) \|^2 \leq \langle x- y ; \nabla f(x) - \nabla f(y) \rangle \leq L \|x-y\|^2 .$$ 80 | \emph{Hint: Define $z=y - \frac{1}{L}(\nabla f(y) - \nabla f(x) ) $.\\ Use convexity to bound $f(x)-f(z)$ and smoothness to bound $f(z) - f(y)$ and sum both inequalities.} 81 | \item[b.] Let $f$ be in addition $\mu$-strongly convex; that is, $f-\frac{\mu}{2}\|\cdot\|^2 $ is convex. Show that for all $x\in\mathbb{R}^n$, 82 | $$ (x-x^\star)^\mathrm{T} \nabla f(x) \geq \frac{\mu L}{\mu + L} \|x-x^\star\|^2 + \frac{1}{\mu + L} \|\nabla f(x)\|^2 .$$ 83 | \emph{Hint: Use the fact that $f-\frac{\mu}{2}\|\cdot\|^2 $ is $(L-\mu)$-smooth and question a. } 84 | \end{itemize} 85 | \end{q_td} 86 | 87 | 88 | 89 | \vspace*{0.5cm} 90 | 91 | 92 | \begin{q_td}[Strongly convex case]\label{td:str}\hfill 93 | 94 | 95 | The goal of this exercise is to investigate the convergence rate of the fixed stepsize gradient algorithm on a $\mu$-strongly convex, $L$-smooth function: 96 | $$ x_{k+1} = x_k - \frac{2}{\mu+L} \nabla f(x_k)$$ 97 | which will introduce us to the mechanics of Optimization theory. 
98 | \begin{itemize} 99 | \item[a.] From \ref{td:descent2}b., prove that 100 | \begin{align*} 101 | \|x_{k+1} - x^\star \|^2 &\leq \left( 1 - \frac{4\mu L}{(\mu+L)^2}\right) \|x_k - x^\star \|^2 \\ 102 | &= \left( \frac{\kappa - 1}{ \kappa+1}\right)^2 \|x_k - x^\star \|^2 103 | \end{align*} 104 | where $\kappa=L/\mu$ is the \emph{conditionning number} of the problem. 105 | \item[b.] Show that 106 | $$ f(x_k) - f(x^\star) \leq \frac{L}{2} \|x_k - x^\star \|^2 .$$ 107 | \item[c.] Conclude that for the gradient algorithm with stepsize ${2}/{(\mu+L)}$ we have 108 | $$ f(x_k) - f(x^\star) \leq \left( \frac{\kappa - 1}{ \kappa+1}\right)^{2k} \frac{L\|x_0 - x^\star \|^2}{2} . $$ 109 | \end{itemize} 110 | \end{q_td} 111 | 112 | 113 | \vspace*{0.5cm} 114 | 115 | 116 | \section{Convergence rates in the non-strongly convex case} 117 | \vspace*{0.5cm} 118 | 119 | 120 | \begin{q_td}[Smooth case]\label{td:smooth}\hfill 121 | 122 | 123 | The goal of this exercise is to investigate the convergence rate of the fixed stepsize gradient algorithm on an $L$-smooth function: 124 | $$ x_{k+1} = x_k - \frac{1}{L} \nabla f(x_k)$$ 125 | which will introduce us to the mechanics of Optimization theory. 126 | \begin{itemize} 127 | %\item[a.] Deduce from \ref{td:descent2}a. that $ (x-x^\star)^\mathrm{T} \nabla f(x) \geq \frac{1}{2L} \|\nabla f(x) \|^2 $. 128 | \item[a.] Prove that 129 | $$ \|x_{k+1} - x^\star \|^2 \leq \|x_k - x^\star \|^2 - \frac{1}{L^2} \| \nabla f(x_k) \|^2 = \|x_k - x^\star \|^2 - \| x_{k+1} - x_k \|^2 .$$ 130 | \item[b.] Show that 131 | $$ \delta_k := f(x_k) - f(x^\star) \leq \|x_k - x^\star \| \cdot \|\nabla f(x_k) \| \leq \|x_1 - x^\star \| \cdot \|\nabla f(x_k) \| .$$ 132 | \emph{Hint: Use convexity then a.} 133 | \item[c.] Use smoothness and b. to show that 134 | $$ 0 \leq \delta_{k+1} \leq \delta_k - \underbrace{\frac{1}{2L\|x_1-x^\star\|^2}}_{:=\omega} \delta_k^2 . $$ 135 | \item[d.] Deduce that 136 | $$ \frac{1}{\delta_{k+1}} - \frac{1}{\delta_{k}} \geq \omega .$$ 137 | \emph{Hint: Divide c. by $ \delta_{k}\delta_{k+1} $}. 138 | \item[e.] Conclude that for the gradient algorithm with stepsize $1/L$ we have 139 | $$ f(x_k) - f(x^\star) \leq \frac{2L\|x_1-x^\star\|^2}{k-1} . 
$$ 140 | \end{itemize} 141 | \end{q_td} 142 | 143 | 144 | 145 | \newpage 146 | 147 | 148 | \begin{tcolorbox}[width=\textwidth,colback={blue!5!white},title={\textbf{Optimization inequalities cheatsheet}},colbacktitle=black,coltitle=white] 149 | For any function $f$: 150 | \begin{itemize} 151 | \item[(convex)] convex 152 | \item[(diff)] differentiable 153 | \item[(min)] with minimizers $X^\star$, $x^\star \in X^\star$ 154 | \item[(smooth)] $L$-smooth (differentiable with $\nabla f$ $L$ Lipschitz continuous) 155 | \item[(strong)] $\mu$-strongly convex ($\mu$ can be taken equal to $0$ below) 156 | \end{itemize} 157 | 158 | 159 | \begin{align*} 160 | & f(y) \geq f(x) + (y-x)^\mathrm{T} \nabla f(x) \text{ (convex) + (diff) } \\ 161 | \Rightarrow &\langle x-y ; \nabla f(x)-\nabla f(y)\rangle\geq0 \text{ (convex) + (diff) } 162 | \end{align*} 163 | 164 | \begin{align*} 165 | & f(x^\star) \leq f(x) \forall x \text{ (minimizer) } \\ 166 | \Rightarrow & \nabla f(x^\star) = 0 \text{ (convex) + (diff) + (minimizer) } 167 | \end{align*} 168 | 169 | 170 | \begin{align*} 171 | & \|\nabla f(x) - \nabla f(y) \| \leq L \|x-y\| \text{ (smooth) } \\ 172 | \Rightarrow & f(x) \leq f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{L}{2} \| x-y\|^2 \text{ (smooth) } \\ 173 | \Rightarrow & \langle x-y ; \nabla f(x)-\nabla f(y)\rangle \leq L \|x-y\|^2 \text{ (smooth) } 174 | \end{align*} 175 | 176 | 177 | 178 | \begin{align*} 179 | & f(x) - \frac{\mu}{2}\|x\|^2 \text{ is convex } \text{ (strong) } \\ 180 | \Rightarrow & f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{\mu}{2} \| x-y\|^2 \leq f(x) \text{ (strong) + (diff) } \\ 181 | \Rightarrow & \mu \|x-y\|^2 \leq \langle x-y ; \nabla f(x)-\nabla f(y)\rangle \text{ (strong) + (diff) } 182 | \end{align*} 183 | 184 | 185 | \vspace*{1cm} 186 | 187 | Combining the above, when $f$ is $\mu$-strongly convex and $L$-smooth: 188 | 189 | \begin{align*} 190 | f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{\mu}{2} \| x-y\|^2 \leq f(x) \leq f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{L}{2} \| x-y\|^2 191 | \end{align*} 192 | 193 | 194 | 195 | \begin{align*} 196 | \frac{\mu L}{\mu + L} \|x-y\|^2 + \frac{1}{\mu + L} \|\nabla f(x) - \nabla f(y) \|^2 \leq \langle x-y ; \nabla f(x)-\nabla f(y)\rangle \leq L \|x-y\|^2 197 | \end{align*} 198 | 199 | 200 | If in addition, $f$ is twice differentiable, 201 | \begin{align*} 202 | \mu I \leq \nabla^2 f(x) \leq L I 203 | \end{align*} 204 | 205 | \end{tcolorbox} 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | \end{document} 214 | -------------------------------------------------------------------------------- /Tuto6_LPQP/tuto6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto6_LPQP/tuto6.pdf -------------------------------------------------------------------------------- /Tuto6_LPQP/tuto6.tex: -------------------------------------------------------------------------------- 1 | %\documentclass[paper=a4, fontsize=9pt]{article} 2 | \documentclass[a4paper,twoside,10pt]{amsart} 3 | 4 | 5 | %\usepackage[scale=0.8]{geometry} 6 | \usepackage{fullpage} 7 | 8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs 9 | \usepackage[english]{babel} % English language/hyphenation 10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages 11 | \usepackage{xcolor} 12 | \usepackage{hyperref} 13 | 14 | \usepackage{tikz} 15 | \usepackage{tkz-graph} 16 | 17 | \numberwithin{equation}{section} % Number equations within 
sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 18 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 19 | \numberwithin{table}{section} % Number tables within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 20 | \usepackage{graphicx} 21 | \usepackage{caption} 22 | \usepackage{subcaption} 23 | 24 | 25 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height 26 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Gray italic text for answers 27 | 28 | \newtheorem{theo}{Theorem} 29 | \newtheorem{lemma}{Lemma} 30 | \theoremstyle{definition} 31 | \newtheorem{q_td}{Exercise } 32 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}} 33 | \newtheorem{q_tp}{$\diamond$} 34 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}} 35 | 36 | \begin{document} 37 | 38 | %---------------------------------------------------------------------------------------- 39 | % TITLE 40 | %---------------------------------------------------------------------------------------- 41 | 42 | 43 | \normalfont \normalsize 44 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\ 45 | \noindent\textsc{\small MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s) 46 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule 47 | \begin{center} 48 | {\LARGE \scshape Numerical Optimization \\ Tuto 6: Linear and Quadratic Programs} \\ % The title 49 | \end{center} 50 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler } 51 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule 52 | 53 | 54 | 55 | %---------------------------------------------------------------------------------------- 56 | % TD 57 | %---------------------------------------------------------------------------------------- 58 | %\newpage 59 | \setcounter{section}{0} 60 | \renewcommand{\thesection}{\Alph{section}} 61 | \renewcommand*{\theHsection}{TD.\the\value{section}} 62 | 63 | 64 | \vspace*{0.5cm} 65 | 66 | 67 | 68 | In this tutorial, we are going to investigate Linear and Quadratic problems, that is, the minimization of linear or quadratic cost functions under linear inequality constraints. The typical formulations of these problems are as follows: 69 | 70 | \vspace*{0.5cm} 71 | 72 | 73 | \begin{minipage}{0.4\textwidth} 74 | \textbf{~~~~~~~~~~ Linear program (LP):}\\ 75 | \begin{align*} 76 | \min_{x\in\mathbb{R}^n} & ~~~ c^\mathrm{T} x \\ 77 | \text{subject to } & ~~~ Gx \leq h 78 | \end{align*} 79 | \end{minipage}\hfill 80 | \begin{minipage}{0.4\textwidth} 81 | \textbf{~~~~~ Quadratic program (QP):} 82 | \begin{align*} 83 | \min_{x\in\mathbb{R}^n} & ~~~ \frac{1}{2} x^\mathrm{T} P x + q^\mathrm{T} x \\ 84 | \text{subject to } & ~~~ Gx \leq h 85 | \end{align*} 86 | \end{minipage} 87 | 88 | 89 | \vspace*{0.5cm} 90 | 91 | 92 | where $c,q\in\mathbb{R}^n$, $G\in\mathbb{R}^{m\times n}$, $ h\in\mathbb{R}^m$, $P\in\mathbb{R}^{n\times n}$. 93 | 94 | \vspace*{0.5cm} 95 | 96 | Although these problems are quite specific, a number of (sub-)problems in signal and data processing can actually be reformulated linearly or quadratically. The interest of these reformulations is that there exists a large number of standard libraries implementing computationally efficient LP and QP solvers\footnote{generally based on interior point, active sets, simplex, ... algorithms and variants.}.
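As a minimal illustration of the LP format above (the data below is made up for the example), the triplet $(c,G,h)$ can be passed directly to `scipy.optimize.linprog`; note that `linprog` imposes $x\geq 0$ by default, so these bounds must be removed to match the formulation above.

import numpy as np
from scipy.optimize import linprog

# Illustrative instance of   min c^T x   subject to   G x <= h
c = np.array([-1.0, -2.0])
G = np.array([[ 1.0,  1.0],
              [-1.0,  0.0],
              [ 0.0, -1.0]])
h = np.array([1.0, 0.0, 0.0])

# bounds=(None, None) removes linprog's default x >= 0 bounds, so that
# the only constraints are G x <= h, as in the LP formulation above
res = linprog(c, A_ub=G, b_ub=h, bounds=(None, None))
print(res.x, res.fun)    # here: x = [0, 1], optimal value -2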
97 | 98 | 99 | \vspace*{0.5cm} 100 | 101 | 102 | 103 | \begin{q_td}[Equivalent problems]\label{td:eq} 104 | Let $f:\mathbb{R}^n\to\mathbb{R}$; we consider the problem 105 | \begin{align*} 106 | \min_{x\in\mathbb{R}^n} & ~~~ f(x) \\ 107 | \text{subject to } & ~~~ x \in C 108 | \end{align*} 109 | and we assume that a solution $\bar{x}$ exists. Show that this problem is \emph{equivalent} to solving 110 | \begin{align*} 111 | \min_{(x,r)\in\mathbb{R}^{n+1}} & ~~~ r \\ 112 | \text{subject to } & ~~~ f(x) \leq r \\ 113 | & ~~~ (x,r)\in C \times \mathbb{R} \subset \mathbb{R}^{n+1} 114 | \end{align*} 115 | in the sense that 116 | \begin{itemize} 117 | \item[(i)] if $\bar{x}$ is a solution of the first problem, then $(\bar{x}, f (\bar{x}))$ is a solution of the second one. 118 | \item[(ii)] if $(\bar{x},\bar{r})$ is a solution of the second problem, then $\bar{x}$ is a solution of the first one. 119 | \end{itemize} 120 | 121 | \end{q_td} 122 | 123 | \vspace*{0.5cm} 124 | 125 | 126 | \begin{q_td}[Linear reformulation]\label{td:ref} 127 | Let $A\in\mathbb{R}^{m\times n}$ and $ b\in\mathbb{R}^m$. Reformulate the problem 128 | \begin{align*} 129 | \min_{x\in\mathbb{R}^n} & ~~~ \|Ax-b\|_\infty 130 | \end{align*} 131 | as a linear problem. Notably, give the corresponding $(c,G,h)$ from the LP formulation. 132 | 133 | \end{q_td} 134 | 135 | \newpage 136 | 137 | \begin{q_td}[Linear reformulation II]\label{td:ref2} 138 | Let $A\in\mathbb{R}^{m\times n}$ and $ b\in\mathbb{R}^m$. Reformulate the problem 139 | \begin{align*} 140 | \min_{x\in\mathbb{R}^n} & ~~~ \|Ax-b\|_1 141 | \end{align*} 142 | as a linear problem by extending the technique of Ex.~\ref{td:eq} (without giving details). Notably, give the corresponding $(c,G,h)$ from the LP formulation. 143 | 144 | 145 | Do the same for the problem 146 | \begin{align*} 147 | \min_{x\in\mathbb{R}^n} & ~~~ \|x\|_1\\ 148 | \text{subject to } & ~~~ \|Ax-b\|_\infty \leq 1 149 | \end{align*} 150 | 151 | \end{q_td} 152 | 153 | \vspace*{0.5cm} 154 | 155 | \begin{q_td}[Quadratic reformulation]\label{td:ref2} 156 | We consider the regression model 157 | $$ y=X\theta+\xi,\;\;\xi\sim \mathcal{N}(0, \sigma I_m), $$ 158 | where $X\in \mathbb{R}^{m\times n}$ and $y\in \mathbb{R}^m$ are the observed values and $\theta\in \mathbb{R}^n$ is the unknown parameter we want to find. Show that maximizing the (log-)likelihood of $\theta$ amount to minimizing $\|X\theta-y\|_2^2$. 159 | 160 | Reformulate the maximum likelihood problem under bounded output error as a Quadratic problem. 161 | \begin{align*} 162 | \max_{\theta \in\mathbb{R}^n} & ~~~ \text{likelihood}(\theta) = p(y|\theta) \\ 163 | \text{subject to } & ~~~ | y_i - X_i \theta | \leq \varepsilon 164 | \end{align*} 165 | \emph{($X_i$) is the row vector of the $i$-th line of $X$.} 166 | 167 | 168 | What would change if $\xi$ followed a Laplace distribution? 169 | \end{q_td} 170 | 171 | \vspace*{0.5cm} 172 | 173 | \end{document} 174 | --------------------------------------------------------------------------------