├── .gitignore
├── Lab1_Basics
│   ├── Fig
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab1_Basics.ipynb
│   ├── algorithms.py
│   ├── plotLib.py
│   ├── problem1.py
│   ├── problem2.py
│   ├── problem3.py
│   ├── problem4.py
│   └── problem5.py
├── Lab2_GradientDescent
│   ├── Fig
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab2_GradientDescent.ipynb
│   ├── algorithms.py
│   ├── plotLib.py
│   ├── problem1.py
│   ├── problem2.py
│   ├── problem3.py
│   ├── problem4.py
│   └── problem5.py
├── Lab3_ProjectedGradient
│   ├── Fig
│   │   ├── .DS_Store
│   │   ├── ._.DS_Store
│   │   ├── ._1.png
│   │   ├── ._2.png
│   │   ├── ._3.png
│   │   ├── ._4.png
│   │   ├── ._5.png
│   │   ├── ._UGA.png
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab3_ProjectedGradient.ipynb
│   ├── algoProjGradient.py
│   ├── plotLib.py
│   ├── problem1.py
│   ├── problem2.py
│   ├── problem3.py
│   ├── problem4.py
│   └── problem5.py
├── Lab4_Prox
│   ├── Fig
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab4_Proximal_algorithms.ipynb
│   ├── logistic_regression_student.py
│   ├── student.npz
│   └── student.txt
├── Lab5_MachineLearningExample
│   ├── Fig
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab5_OptimForML.ipynb
│   ├── algoGradient.py
│   ├── ionosphere.data
│   └── logistic_regression_ionosphere.py
├── Lab6_LPQP
│   ├── Fig
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab6_LP_and_QP.ipynb
│   └── toy_problem.py
├── Lab7_StochasticMethods
│   ├── Fig
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab7_StochMethods.ipynb
│   ├── algoProx.py
│   ├── logistic_regression_student.py
│   ├── plotLib.py
│   ├── student.npz
│   └── student.txt
├── Lab8_MinMax
│   ├── Fig
│   │   └── UGA.png
│   └── Lab8_Two-player zero-sum games.ipynb
├── Lab9_Uzawa
│   ├── Fig
│   │   ├── ._1.png
│   │   ├── ._2.png
│   │   ├── ._3.png
│   │   ├── ._4.png
│   │   ├── ._5.png
│   │   ├── ._UGA.png
│   │   ├── 1.png
│   │   ├── 2.png
│   │   ├── 3.png
│   │   ├── 4.png
│   │   ├── 5.png
│   │   └── UGA.png
│   ├── Lab9_constrainedOptimization.ipynb
│   └── plotLib.py
├── README.md
├── Tuto1_Basics
│   ├── harder.png
│   ├── poly.png
│   ├── rosenbrock.png
│   ├── simple.png
│   ├── tuto1.pdf
│   ├── tuto1.tex
│   └── two_pits.png
├── Tuto4_Prox
│   ├── tuto4.pdf
│   └── tuto4.tex
├── Tuto5_Rates
│   ├── tuto5.pdf
│   └── tuto5.tex
└── Tuto6_LPQP
    ├── tuto6.pdf
    └── tuto6.tex
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Core latex/pdflatex auxiliary files:
2 | *.aux
3 | *.lof
4 | *.log
5 | *.lot
6 | *.fls
7 | *.out
8 | *.toc
9 | *.fmt
10 | *.fot
11 | *.cb
12 | *.cb2
13 | .*.lb
14 |
15 | ## Intermediate documents:
16 | *.dvi
17 | *.xdv
18 | *-converted-to.*
19 | *.fdb_latexmk
20 | *.synctex.gz
21 |
22 | # Notebooks
23 | __pycache__
24 | .ipynb_checkpoints
25 |
26 |
--------------------------------------------------------------------------------
/Lab1_Basics/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/1.png
--------------------------------------------------------------------------------
/Lab1_Basics/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/2.png
--------------------------------------------------------------------------------
/Lab1_Basics/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/3.png
--------------------------------------------------------------------------------
/Lab1_Basics/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/4.png
--------------------------------------------------------------------------------
/Lab1_Basics/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/5.png
--------------------------------------------------------------------------------
/Lab1_Basics/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab1_Basics/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab1_Basics/algorithms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Gradient-based algorithms
5 | #
6 | # In this file, we code our gradient-based optimization algorithms.
7 |
8 | #################################
9 | # # 1. Gradient algorithms
10 | ##################################
11 | #
12 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
13 | # * the function to minimize `f`
14 | # * a 1st order oracle `f_grad` (see `problem1.py` for instance)
15 | # * an initialization point `x0`
16 | # * the sought precision `PREC`
17 | # * a maximal number of iterations `ITE_MAX`
18 | #
19 | #
20 | # these algorithms perform iterations of the form
21 | # $$ x_{k+1} = x_k - \gamma_k \nabla f(x_k) $$
22 | # where $\gamma_k$ is a stepsize to choose.
23 |
24 | # ### 1.a. Constant stepsize gradient algorithm
25 | #
26 | # First, we consider the case where the stepsize is fixed over the iterations and passed as the argument `step` to the algorithm.
27 |
28 |
29 |
30 | import numpy as np
31 | import timeit
32 |
33 | def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
34 | x = np.copy(x0)
35 | stop = PREC*np.linalg.norm(f_grad(x0) )
36 |
37 | x_tab = np.copy(x)
38 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
39 | t_s = timeit.default_timer()
40 | for k in range(ITE_MAX):
41 | g = f_grad(x)
42 | x = x - step*g ####### ITERATION
43 |
44 | x_tab = np.vstack((x_tab,x))
45 |
46 | if np.linalg.norm(g) < stop:
47 | break
48 | t_e = timeit.default_timer()
49 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
50 | return x,x_tab
51 |
52 | # # 1.b Newton algorithm
53 | #
54 | # For minimizing a *twice* differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
55 | # * the function to minimize `f`
56 | # * a 2nd order oracle `f_grad_hessian` (see `problem1.py` for instance)
57 | # * an initialization point `x0`
58 | # * the sought precision `PREC`
59 | # * a maximal number of iterations `ITE_MAX`
60 | #
61 | #
62 | # these algorithms perform iterations of the form
63 | # $$ x_{k+1} = x_k - [\nabla^2 f(x_k) ]^{-1} \nabla f(x_k) .$$
64 |
65 |
66 | # Q: Complete the code for the Newton iterations
67 |
68 | import numpy as np
69 | import timeit
70 |
71 | def newton_algorithm(f , f_grad_hessian , x0 , PREC , ITE_MAX ):
72 | x = np.copy(x0)
73 | g0,H0 = f_grad_hessian(x0)
74 | stop = PREC*np.linalg.norm(g0 )
75 |
76 | x_tab = np.copy(x)
77 | print("------------------------------------\nNewton's algorithm\n------------------------------------\nSTART")
78 | t_s = timeit.default_timer()
79 | for k in range(ITE_MAX):
80 |
81 | g, H = f_grad_hessian(x)
82 |
83 | ### COMPLETE
84 |
85 | x_tab = np.vstack((x_tab,x))
86 |
87 | if np.linalg.norm(g) < stop:
88 | break
89 | t_e = timeit.default_timer()
90 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
91 | return x,x_tab
92 |
93 | #################################
94 | # # 2. More involved functions
95 | ##################################
96 |
97 | # # 2.b Adaptive stepsize gradient algorithm
98 |
99 | # Q: Complete the adaptive gradient below using your intuition
100 |
101 |
102 | import numpy as np
103 | import timeit
104 |
105 | def gradient_adaptive_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
106 | x = np.copy(x0)
107 | stop = PREC*np.linalg.norm(f_grad(x0) )
108 |
109 | x_tab = np.copy(x)
110 |     print("------------------------------------\nAdaptive Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
111 | t_s = timeit.default_timer()
112 | for k in range(ITE_MAX):
113 |
114 | g = f_grad(x)
115 | x_prev = np.copy(x)
116 |
117 | x = x - step*g ####### ITERATION
118 |
119 | ### COMPLETE
120 |
121 | x_tab = np.vstack((x_tab,x))
122 |
123 | if np.linalg.norm(g) < stop:
124 | break
125 | t_e = timeit.default_timer()
126 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
127 | return x,x_tab
128 |
129 |
130 |
131 |
132 |
133 |
134 |
--------------------------------------------------------------------------------
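
A minimal sketch of the Newton update left as `### COMPLETE` in `newton_algorithm` above, assuming the 2nd order oracle returns the gradient `g` and the Hessian `H` as in `problem1.py` (one possible completion, not necessarily the intended one): the iteration x_{k+1} = x_k - [Hess f(x_k)]^{-1} grad f(x_k) becomes a linear solve.

    import numpy as np

    def newton_step(x, f_grad_hessian):
        """One Newton step: solve H d = g, then move to x - d."""
        g, H = f_grad_hessian(x)       # 2nd order oracle: gradient and Hessian at x
        d = np.linalg.solve(H, g)      # solve the linear system instead of inverting the Hessian
        return x - d, g

Inside the loop of `newton_algorithm`, this amounts to replacing the `### COMPLETE` line with `x = x - np.linalg.solve(H, g)`.
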
/Lab1_Basics/plotLib.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib import cm
5 | from mpl_toolkits.mplot3d import Axes3D
6 | import time
7 | from IPython import display
8 |
9 |
10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ):
11 |
12 | def f_no_vector(x1,x2):
13 | return f( np.array( [x1,x2] ) )
14 |
15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
16 | z = f_no_vector(x,y)
17 |
18 | fig = plt.figure()
19 | # Old syntax
20 | # ax = fig.gca(projection='3d')
21 | ax = fig.add_subplot(projection='3d')
22 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max)
23 | ax.set_zlim(v_min, v_max)
24 | plt.show()
25 |
26 |
27 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
28 |
29 |
30 | def f_no_vector(x1,x2):
31 | return f( np.array( [x1,x2] ) )
32 |
33 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
34 | z = f_no_vector(x,y)
35 |
36 | fig = plt.figure()
37 | graphe = plt.contour(x,y,z,levels)
38 | #plt.plot(3,1,'r*',markersize=15)
39 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
40 | plt.title(title)
41 | plt.show()
42 |
43 |
44 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
45 |
46 | def f_no_vector(x1,x2):
47 | return f( np.array( [x1,x2] ) )
48 |
49 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
50 | z = f_no_vector(x,y)
51 |
52 | fig = plt.figure()
53 | graphe = plt.contour(x,y,z,levels)
54 | #plt.plot(3,1,'r*',markersize=15)
55 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
56 | plt.title(title)
57 |
58 | delay = 4.0/x_tab.shape[0]
59 | for k in range(x_tab.shape[0]):
60 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
61 | plt.xlim([x1_min,x1_max])
62 | plt.ylim([x2_min,x2_max])
63 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
64 | plt.draw()
65 | display.clear_output(wait=True)
66 | display.display(fig)
67 | time.sleep(delay)
68 | display.clear_output()
69 | plt.show()
70 |
71 |
72 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
73 |
74 |
75 | def f_no_vector(x1,x2):
76 | return f( np.array( [x1,x2] ) )
77 |
78 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
79 | z = f_no_vector(x,y)
80 |
81 | fig = plt.figure()
82 | graphe = plt.contour(x,y,z,levels)
83 | #plt.plot(3,1,'r*',markersize=15)
84 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
85 | plt.xlim([x1_min,x1_max])
86 | plt.ylim([x2_min,x2_max])
87 | plt.title(title)
88 |
89 | delay = 4.0/x_tab.shape[0]
90 | for k in range(x_tab.shape[0]):
91 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
92 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
93 | plt.draw()
94 | #plt.pause(delay)
95 |
96 | delay = 4.0/x_tab2.shape[0]
97 | for k in range(x_tab2.shape[0]):
98 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8)
99 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1]))
100 | #plt.pause(delay)
101 | plt.draw()
102 |
103 | plt.show()
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/Lab1_Basics/problem1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 1
5 | #
6 | #
7 | # The objective of Problem 1 is to minimize a simple quadratic function $f$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # f: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2
12 | # \end{array}$$
13 |
14 |
15 |
16 | ##### Function definition
17 | def f(x):
18 | x1 = x[0]
19 | x2 = x[1]
20 | return 4*(x1-3)**2+2*(x2-1)**2
21 | ####
22 |
23 | ##### Plot parameters f
24 | x1_min = -0.5
25 | x1_max = 5.5
26 | x2_min = -0.5
27 | x2_max = 5.5
28 | nb_points = 200
29 | vmin = 0
30 | vmax = 80
31 | levels = [0.5,1,2,5,10,15]
32 | title = 'f: a simple function'
33 | ####
34 |
35 |
36 | # ### Some parameters
37 | #
38 | # Before solving things numerically, some useful things can be computed:
39 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
40 | # * Good starting points (for hot starting e.g.)
41 |
42 |
43 |
44 | ###### Useful Parameters
45 | L = 8 # Lipschitz constant of the gradient
46 |
47 |
48 | # ### Oracles
49 | #
50 | # Numerical optimization methods need callable *oracles* for $f$, that is, functions that, given a point $x$ in the domain of $f$, return the value of $f$ and/or its gradient and Hessian at $x$. The *order* of an oracle is the number of derivatives it provides (0th order for just $f$, 1st order for the gradient, 2nd order for gradient + Hessian).
51 |
52 |
53 |
54 | # Q: Observe the first order oracle `f_grad`.
55 |
56 |
57 |
58 | import numpy as np
59 |
60 | ##### Gradient oracle
61 | def f_grad(x):
62 | x1 = x[0]
63 | x2 = x[1]
64 | gx = 8*(x1-3)
65 | gy = 4*(x2-1)
66 | return np.array( [ gx , gy ] )
67 | ####
68 |
69 |
70 | # Q: Observe the second order oracle `f_grad_hessian`.
71 |
72 |
73 | import numpy as np
74 |
75 | ##### Hessian scaled Gradient computation
76 | def f_grad_hessian(x):
77 | x1 = x[0]
78 | x2 = x[1]
79 | gx = 8*(x1-3)
80 | gy = 4*(x2-1)
81 | g = np.array( [ gx , gy ] )
82 | H = np.array( [ (8.0 , 0.0 ) , ( 0.0 , 4.0 ) ] )
83 | return g,H
84 | ####
85 |
86 |
--------------------------------------------------------------------------------
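
Since the notion of oracle order is central here, the following is a minimal sketch of a sanity check for a first order oracle, assuming the `f` and `f_grad` defined above (the helper `check_gradient` is introduced here only for illustration): it compares the analytic gradient with central finite differences.

    import numpy as np

    def check_gradient(f, f_grad, x, h=1e-6):
        """Compare the analytic gradient with central finite differences at x."""
        g_num = np.zeros_like(x, dtype=float)
        for i in range(x.size):
            e = np.zeros_like(x, dtype=float)
            e[i] = h
            g_num[i] = (f(x + e) - f(x - e)) / (2 * h)   # central difference in coordinate i
        return np.linalg.norm(f_grad(x) - g_num)

    # For instance, check_gradient(f, f_grad, np.array([1.0, 2.0])) should return a value close to 0.
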
/Lab1_Basics/problem2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 2
5 | #
6 | #
7 | # The objective of Problem 2 is to minimize a more involved but very smooth function $g$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # g: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3)
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 | ##### Function definition
18 | def f(x):
19 | x1 = x[0]
20 | x2 = x[1]
21 | return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3)
22 | ####
23 |
24 | ##### Plot parameters f
25 | x1_min = -0.5
26 | x1_max = 5.5
27 | x2_min = -0.5
28 | x2_max = 5.5
29 | nb_points = 500
30 | vmin = 0
31 | vmax = 100
32 | levels = [0.5,1,2,5,10,15]
33 | title = 'a Harder function: g'
34 | ####
35 |
36 |
37 |
38 | ###### Useful Parameters
39 | L = 8 # Lipschitz constant of the gradient
40 |
41 |
42 | # ### Oracles
43 |
44 |
45 | # Q: Complete the first order oracle `f_grad`.
46 |
47 |
48 |
49 | import numpy as np
50 |
51 | ##### Gradient oracle
52 | def f_grad(x):
53 | x1 = x[0]
54 | x2 = x[1]
55 | gx = 0 ## To complete
56 | gy = 0 ## To complete
57 | return np.array( [ gx , gy ] )
58 | ####
59 |
60 |
61 | # Q: Fill the following second order oracle `f_grad_hessian`.
62 |
63 |
64 | import numpy as np
65 |
66 | ##### Hessian scaled Gradient computation
67 | def f_grad_hessian(x):
68 |
69 | return g,H
70 | ####
71 |
72 |
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
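
One possible way to complete the first order oracle of $g$ (a sketch, not the official solution): writing $g(x) = \log(1 + e^{u} + e^{v}) - \log 3$ with $u = 4(x_1-3)^2$ and $v = 2(x_2-1)^2$, the chain rule gives $\partial g/\partial x_1 = 8(x_1-3)e^{u}/(1+e^{u}+e^{v})$ and $\partial g/\partial x_2 = 4(x_2-1)e^{v}/(1+e^{u}+e^{v})$.

    import numpy as np

    def f_grad_sketch(x):   # illustrative name, kept separate from the f_grad to complete above
        x1, x2 = x[0], x[1]
        eu = np.exp(4 * (x1 - 3) ** 2)   # e^{4(x1-3)^2}
        ev = np.exp(2 * (x2 - 1) ** 2)   # e^{2(x2-1)^2}
        denom = 1 + eu + ev
        gx = 8 * (x1 - 3) * eu / denom   # d/dx1 of log(1 + e^u + e^v)
        gy = 4 * (x2 - 1) * ev / denom   # d/dx2 of log(1 + e^u + e^v)
        return np.array([gx, gy])
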
/Lab1_Basics/problem3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 3
5 | #
6 | #
7 | # The objective of Problem 3 is to minimize the smooth but non-convex Rosenbrock function $r$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # r: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 | ##### Function definition
18 | def f(x):
19 | """Rosenbrock."""
20 | x1 = x[0]
21 | x2 = x[1]
22 | return (1-x1)**2+100*(x2-x1**2)**2
23 | ####
24 |
25 | ##### Plot parameters f
26 | x1_min = -1.5
27 | x1_max = 1.55
28 | x2_min = -0.2
29 | x2_max = 1.5
30 | nb_points = 200
31 | vmin = 0
32 | vmax = 120
33 | levels = [0.05,1,5,15,50,100,200]
34 | title = 'Rosenbrock function'
35 | ####
36 |
37 |
38 |
39 |
40 |
41 | ### Oracles
42 |
43 |
44 | # Q: Complete the first order oracle `f_grad`.
45 |
46 |
47 |
48 |
49 | import numpy as np
50 |
51 | ##### Gradient oracle
52 | def f_grad(x):
53 |
54 | return 0.0 ### To complete
55 | ####
56 |
57 |
58 | # Q: Fill the following second order oracle `f_grad_hessian`.
59 |
60 |
61 | import numpy as np
62 |
63 | ##### Hessian scaled Gradient computation
64 | def f_grad_hessian(x):
65 |
66 | return g,H ### To complete
67 | ####
68 |
69 |
--------------------------------------------------------------------------------
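
For reference, the standard derivatives of the Rosenbrock function, as a sketch of what the oracles above could return (the name `rosenbrock_grad_hessian` is introduced here only for illustration):

    import numpy as np

    def rosenbrock_grad_hessian(x):
        """Gradient and Hessian of (1-x1)^2 + 100 (x2 - x1^2)^2."""
        x1, x2 = x[0], x[1]
        g = np.array([-2 * (1 - x1) - 400 * x1 * (x2 - x1 ** 2),
                      200 * (x2 - x1 ** 2)])
        H = np.array([[2 - 400 * (x2 - 3 * x1 ** 2), -400 * x1],
                      [-400 * x1, 200.0]])
        return g, H
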
/Lab1_Basics/problem4.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 4
5 | #
6 | #
7 | # The objective of Problem 4 is to minimize a non-convex function $t$ with two minimizers on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # t: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 |
18 | ##### Function definition
19 | def f(x):
20 | x1 = x[0]
21 | x2 = x[1]
22 | return (0.6*x1 + 0.2*x2)**2 * ((0.6*x1 + 0.2*x2)**2 - 4*(0.6*x1 + 0.2*x2)+4) + (-0.2*x1 + 0.6*x2)**2
23 | ####
24 |
25 | ##### Plot parameters f
26 | x1_min = -1
27 | x1_max = 4
28 | x2_min = -1
29 | x2_max = 4
30 | nb_points = 200
31 | levels = [0.05,0.5,1,2,5]
32 | vmin = 0
33 | vmax = 5
34 | title = 'two pits'
35 | ####
36 |
37 |
38 |
39 |
40 |
41 | ###### Useful Parameters
42 | L = 8 # Lipschitz constant of the gradient
43 |
44 |
45 | ### Oracles
46 |
47 | # Q: Complete the first order oracle `f_grad`.
48 |
49 |
50 |
51 |
52 | import numpy as np
53 |
54 | ##### Gradient oracle
55 | def f_grad(x):
56 |
57 | return ### To complete
58 |
59 |
60 | # Q: Does a second order oracle exist for any point?
61 |
62 |
63 |
--------------------------------------------------------------------------------
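
A possible completion of the first order oracle (a sketch, not the official solution): with $u = 0.6 x_1 + 0.2 x_2$ and $v = -0.2 x_1 + 0.6 x_2$, the function factors as $t = \big(u(u-2)\big)^2 + v^2$, so the chain rule gives the gradient below.

    import numpy as np

    def f_grad_sketch(x):   # illustrative name, distinct from the f_grad to complete above
        x1, x2 = x[0], x[1]
        u = 0.6 * x1 + 0.2 * x2
        v = -0.2 * x1 + 0.6 * x2
        dt_du = 2 * u * (u - 2) * (2 * u - 2)   # derivative of (u(u-2))^2 with respect to u
        dt_dv = 2 * v
        return np.array([0.6 * dt_du - 0.2 * dt_dv,    # chain rule through u and v
                         0.2 * dt_du + 0.6 * dt_dv])
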
/Lab1_Basics/problem5.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 5
5 | #
6 | #
7 | # The objective of Problem 5 is to minimize a polyhedral function $p$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # p: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| .
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 | ##### Function definition
18 | def f(x):
19 | x1 = x[0]
20 | x2 = x[1]
21 | return np.abs(x1-3)+2*np.abs(x2-1)
22 | ####
23 |
24 | ##### Plot parameters f
25 | x1_min = -0.5
26 | x1_max = 5.5
27 | x2_min = -0.5
28 | x2_max = 5.5
29 | nb_points = 200
30 | levels = [0.05,0.5,1,2,5]
31 | vmin = 0
32 | vmax = 5
33 | title = 'polyhedral'
34 | ####
35 |
36 |
37 | ### Oracles
38 |
39 |
40 | # Q: Compute a first order oracle `f_grad`. Is it unique?
41 |
42 |
43 |
44 | import numpy as np
45 |
46 | ##### Gradient oracle
47 | def f_grad(x):
48 |
49 | return g ### To complete
50 | ####
51 |
52 |
53 | # Q: What about a second order oracle?
54 |
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
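
The function is not differentiable at the kinks $x_1 = 3$ and $x_2 = 1$, so a first order oracle can only return a *subgradient* there, and that choice is not unique. A minimal sketch of one valid choice:

    import numpy as np

    def f_subgrad_sketch(x):   # illustrative name
        """One element of the subdifferential of |x1-3| + 2|x2-1|.
        np.sign returns 0 at the kinks, which is a valid subgradient component there."""
        return np.array([np.sign(x[0] - 3), 2 * np.sign(x[1] - 1)])
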
/Lab2_GradientDescent/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/1.png
--------------------------------------------------------------------------------
/Lab2_GradientDescent/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/2.png
--------------------------------------------------------------------------------
/Lab2_GradientDescent/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/3.png
--------------------------------------------------------------------------------
/Lab2_GradientDescent/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/4.png
--------------------------------------------------------------------------------
/Lab2_GradientDescent/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/5.png
--------------------------------------------------------------------------------
/Lab2_GradientDescent/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab2_GradientDescent/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab2_GradientDescent/Lab2_GradientDescent.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year\n",
9 | "\n",
10 | "Optimization\n",
11 | "Lab 2: Gradient algorithm"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n",
19 | "---\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "%load_ext autoreload\n",
29 | "%autoreload 2"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "---"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "# 1. Line-search\n",
44 | "\n",
45 | "In the previous Lab, we saw that it can be difficult to choose a satisfying stepsize.\n",
46 | "\n",
47 | "An option to choose a satisfying stepsize $\\gamma$ is to test different stepsizes by successively calling the function oracles. Wolfe's line-search is implemented in `SciPy` as `scipy.optimize.line_search`. \n",
48 | "\n",
49 | "\n",
50 | "**Wolfe's line-search.** Let $x$ be the current point, $d$ a descent direction, and $q(\\gamma)=f(x+\\gamma d)$. Wolfe's line-search consists in deciding that \n",
51 | "* $\\gamma$ is *satisfying* if $q(\\gamma)\\leq q(0)+m_1 \\gamma q'(0)$ and $q'(\\gamma)\\geq m_2 q'(0)$;\n",
52 | "* $\\gamma$ is *too big* if $q(\\gamma) > q(0)+m_1 \\gamma q'(0)$;\n",
53 | "* $\\gamma$ is *too small* if $q(\\gamma)\\leq q(0)+m_1 \\gamma q'(0)$ and $q'(\\gamma) < m_2 q'(0)$.\n",
54 | "\n",
55 | "> Complete the function `gradient_Wolfe` in `algorithms.py`.\n",
68 | "> Compare the convergence of this gradient with other gradient methods."
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "source": [
77 | "---\n",
78 | "### 1a. Comparing constant stepsize gradient algorithm and Wolfe search on Problem 1\n",
79 | "\n",
80 | "> Print the stepsizes chosen by line search and compare with theoretical ones."
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from algorithms import * # import all methods of the module into the current environment\n",
90 | "\n",
91 | "import numpy as np\n",
92 | "import problem1 as pb1\n",
93 | "\n",
94 | "\n",
95 | "#### Parameters we give to our algorithm (see algorithms.py)\n",
96 | "PREC = 0.01 # Sought precision\n",
97 | "ITE_MAX = 20 # Max number of iterations\n",
98 | "x0 = np.array( (0.0,0.0 ) ) # Initial point\n",
99 | "step = 0.1\n",
100 | "\n",
101 | "##### gradient algorithm\n",
102 | "x,x_tab = gradient_algorithm(pb1.f , pb1.f_grad , x0 , step , PREC , ITE_MAX )\n",
103 | "\n",
104 | "##### Wolfe line-search algorithm\n",
105 | "xW,xW_tab = gradient_Wolfe(pb1.f , pb1.f_grad , x0 , PREC , ITE_MAX )\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "###### Plots"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "from plotLib import *\n",
122 | "%matplotlib inline\n",
123 | "\n",
124 | "##### comparison\n",
125 | "level_2points_plot( pb1.f , x_tab , xW_tab , pb1.x1_min, pb1.x1_max, pb1.x2_min, pb1.x2_max, pb1.nb_points, pb1.levels , pb1.title )"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {
131 | "collapsed": true
132 | },
133 | "source": [
134 | "---\n",
135 | "### 1b. Comparing constant stepsize gradient algorithm and Wolfe search on Problem 2\n",
136 | "\n",
137 | "> Try different starting points and observe the results of line search.\n"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "from algorithms import * # import all methods of the module into the current environment\n",
147 | "\n",
148 | "import numpy as np\n",
149 | "import problem2 as pb2\n",
150 | "\n",
151 | "\n",
152 | "#### Parameters we give to our algorithm (see algorithms.py)\n",
153 | "PREC = 0.01 # Sought precision\n",
154 | "ITE_MAX = 20 # Max number of iterations\n",
155 | "x0 = np.array( (1.5,1.5 ) ) # Initial point\n",
156 | "step = 0.1\n",
157 | "\n",
158 | "##### gradient algorithm\n",
159 | "x,x_tab = gradient_algorithm(pb2.f , pb2.f_grad , x0 , step , PREC , ITE_MAX )\n",
160 | "\n",
161 | "##### Wolfe line-search algorithm\n",
162 | "xW,xW_tab = gradient_Wolfe(pb2.f , pb2.f_grad , x0 , PREC , ITE_MAX )\n"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "###### Plots"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "from plotLib import *\n",
179 | "%matplotlib inline\n",
180 | "\n",
181 | "##### comparison\n",
182 | "level_2points_plot( pb2.f , x_tab , xW_tab , pb2.x1_min, pb2.x1_max, pb2.x2_min, pb2.x2_max, pb2.nb_points, pb2.levels , pb2.title )"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "---\n",
190 | "### 1c. Comparing constant stepsize gradient algorithm and Wolfe search on Problem 3\n",
191 | "\n",
192 | "> Compare the convergence of the gradient with and without line search. Keeping in mind that Newton's method takes around $30$ iterations to converge, what is the biggest issue when minimizing such a function: the stepsize or the descent direction?\n"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {
199 | "scrolled": true
200 | },
201 | "outputs": [],
202 | "source": [
203 | "from algorithms import * # import all methods of the module into the current environment\n",
204 | "\n",
205 | "import numpy as np\n",
206 | "import problem3 as pb3\n",
207 | "\n",
208 | "\n",
209 | "#### Parameters we give to our algorithm (see algorithms.py)\n",
210 | "PREC = 0.0001 # Sought precision\n",
211 | "ITE_MAX = 10000 # Max number of iterations\n",
212 | "x0 = np.array( (-1.0,1.2 ) ) # Initial point\n",
213 | "step = 0.001\n",
214 | "\n",
215 | "##### gradient algorithm\n",
216 | "x,x_tab = gradient_algorithm(pb3.f , pb3.f_grad , x0 , step , PREC , ITE_MAX )\n",
217 | "\n",
218 | "##### Wolfe line-search algorithm\n",
219 | "xW,xW_tab = gradient_Wolfe(pb3.f , pb3.f_grad , x0 , PREC , ITE_MAX )\n"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "###### Plots"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "from plotLib import *\n",
236 | "%matplotlib inline\n",
237 | "\n",
238 | "##### comparison\n",
239 | "level_2points_plot( pb3.f , x_tab , xW_tab , pb3.x1_min, pb3.x1_max, pb3.x2_min, pb3.x2_max, pb3.nb_points, pb3.levels , pb3.title )"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "---\n",
247 | "### 1d. Comparing constant stepsize gradient algorithm and Wolfe search on Problem 5\n",
248 | "\n",
249 | "> Try different starting points $(0,0)$ , $(0,1)$, $(1,0)$, $(0.2,0.4)$.\n"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "scrolled": true
257 | },
258 | "outputs": [],
259 | "source": [
260 | "from algorithms import * # import all methods of the module into the current environment\n",
261 | "\n",
262 | "import numpy as np\n",
263 | "import problem5 as pb5\n",
264 | "\n",
265 | "\n",
266 | "#### Parameters we give to our algorithm (see algorithms.py)\n",
267 | "PREC = 0.001 # Sought precision\n",
268 | "ITE_MAX = 100 # Max number of iterations\n",
269 | "x0 = np.array( (0.,0. ) ) # Initial point\n",
270 | "step = 0.1\n",
271 | "\n",
272 | "##### gradient algorithm\n",
273 | "x,x_tab = gradient_algorithm(pb5.f , pb5.f_grad , x0 , step , PREC , ITE_MAX )\n",
274 | "\n",
275 | "##### Wolfe line-search algorithm\n",
276 | "xW,xW_tab = gradient_Wolfe(pb5.f , pb5.f_grad , x0 , PREC , ITE_MAX )\n"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "###### Plots"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "from plotLib import *\n",
293 | "%matplotlib inline\n",
294 | "\n",
295 | "##### comparison\n",
296 | "level_2points_plot( pb5.f , x_tab , xW_tab , pb5.x1_min, pb5.x1_max, pb5.x2_min, pb5.x2_max, pb5.nb_points, pb5.levels , pb5.title )"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "# 2. Quasi Newton\n",
304 | "\n",
305 | "Now that we have a proper way of choosing a good stepsize, we see that the opposite of the gradient is not always a good descent direction. We also saw in the previous lab that Newton's method can be computationally expensive. In this section, we investigate a method that chooses descent directions based on an approximation of the inverse Hessian.\n",
306 | "\n",
307 | "For a differentiable function $f$, quasi-Newton methods iteratively construct an approximation $W_k$ of the inverse Hessian and then use the descent direction $-W_k\\nabla f(x_k)$.\n",
308 | "\n",
309 | "**BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consists in performing the following iteration\n",
310 | "$$ x_{k+1}=x_k - \\gamma_k W_k \\nabla f(x_k)$$\n",
311 | "where $\\gamma_k$ is given by Wolfe's line-search and positive definite matrix $W_k$ is computed as\n",
312 | "$$ W_{k+1}=W_k - \\frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\\left[1+\\frac{y_k^T W_k y_k}{y_k^T s_k}\\right]\\frac{s_k s_k^T}{y_k^T s_k} $$\n",
313 | "with $s_k=x_{k+1}-x_{k}$ and $y_k=\\nabla f(x_{k+1}) - \\nabla f(x_{k})$.\n",
314 | "\n",
315 | "The general scheme is then:\n",
316 | "* start from an initial point $x_0$ and an initial positive definite matrix $W_0$;\n",
317 | "* from gradient $\\nabla f(x_k)$, compute direction $d_k=-W_k \\nabla f(x_k)$;\n",
318 | "* compute stepsize $\\gamma_k$ by Wolfe's line-search;\n",
319 | "* from new point $x_{k+1}$, call the function oracle and compute $W_{k+1}$.\n",
320 | "\n",
321 | "> Implement the BFGS method in `algorithms.py`.\n",
322 | "\n",
323 | "*Hint: Use the function `np.outer(a,b)` to compute $ab^T$.*\n"
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {},
329 | "source": [
330 | "---\n",
331 | "### Comparing gradient with Wolfe line search, Newton, and BFGS on Problem 3\n",
332 | "\n",
333 | "> Compare the convergence of the gradient with line search and BFGS; then Newton vs BFGS\n"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "from algorithms import * # import all methods of the module into the current environment\n",
343 | "\n",
344 | "import numpy as np\n",
345 | "import problem3 as pb3\n",
346 | "\n",
347 | "\n",
348 | "#### Parameters we give to our algorithm (see algorithms.py)\n",
349 | "PREC = 1e-4 # Sought precision\n",
350 | "ITE_MAX = 10000 # Max number of iterations\n",
351 | "x0 = np.array( (-1.0,1.2 ) ) # Initial point\n",
352 | "\n",
353 | "##### Wolfe line-search algorithm\n",
354 | "xW,xW_tab = gradient_Wolfe(pb3.f , pb3.f_grad , x0 , PREC , ITE_MAX )\n",
355 | "\n",
356 | "##### Newton algorithm\n",
357 | "xN,xN_tab = newton_algorithm(pb3.f , pb3.f_grad_hessian , x0 , PREC , ITE_MAX )\n",
358 | "\n",
359 | "##### BFGS algorithm\n",
360 | "xB,xB_tab = bfgs(pb3.f , pb3.f_grad , x0 , PREC , ITE_MAX )"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "###### Plots\n",
368 | "\n",
369 | "* Gradient with line search vs BFGS"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "from plotLib import *\n",
379 | "%matplotlib inline\n",
380 | "\n",
381 | "##### comparison\n",
382 | "level_2points_plot( pb3.f , xW_tab , xB_tab , pb3.x1_min, pb3.x1_max, pb3.x2_min, pb3.x2_max, pb3.nb_points, pb3.levels , pb3.title )"
383 | ]
384 | },
385 | {
386 | "cell_type": "markdown",
387 | "metadata": {},
388 | "source": [
389 | "* Newton vs BFGS"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {
396 | "scrolled": true
397 | },
398 | "outputs": [],
399 | "source": [
400 | "from plotLib import *\n",
401 | "%matplotlib inline\n",
402 | "\n",
403 | "##### comparison\n",
404 | "level_2points_plot( pb3.f , xN_tab , xB_tab , pb3.x1_min, pb3.x1_max, pb3.x2_min, pb3.x2_max, pb3.nb_points, pb3.levels , pb3.title )"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {},
410 | "source": [
411 | "# Appendix: Problems\n",
412 | "\n",
413 | "The problems we consider in this lab are minimizations of unconstrained continuous functions."
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "\n",
421 | "> **1.** `problem1` features a simple quadratic function\n",
422 | "$$\\begin{array}{rrcll}\n",
423 | "f: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n",
424 | "& (x_1,x_2) & \\mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2\n",
425 | "\\end{array}$$\n",
426 | "\n",
427 | "\n",
428 | "\n",
429 | "> **2.** `problem2` features a more involved but very smooth function\n",
430 | "$$\\begin{array}{rrcll}\n",
431 | "g: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n",
432 | "& (x_1,x_2) & \\mapsto & \\log( 1 + \\exp(4 (x_1-3)^2 ) + \\exp( 2(x_2-1)^2 ) ) - \\log(3)\n",
433 | "\\end{array}$$\n",
434 | "\n",
435 | "\n",
436 | "\n",
437 | "> **3.** `problem3` features Rosenbrock's smooth but non-convex function\n",
438 | "$$\\begin{array}{rrcll}\n",
439 | "r: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n",
440 | "& (x_1,x_2) & \\mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2\n",
441 | "\\end{array}$$\n",
442 | "\n",
443 | "\n",
444 | "\n",
445 | "> **4.** `problem4` features a smooth function with two distinct minimizers\n",
446 | "$$\\begin{array}{rrcll}\n",
447 | "t: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n",
448 | "& (x_1,x_2) & \\mapsto & (0.6 x_1 + 0.2 x_2)^2 \\left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\\right) + (-0.2 x_1 + 0.6 x_2)^2\n",
449 | "\\end{array}$$\n",
450 | "\n",
451 | "\n",
452 | "\n",
453 | "> **5.** `problem5` features a polyhedral function\n",
454 | "$$\\begin{array}{rrcll}\n",
455 | "p: & \\mathbb{R}^2 & \\to &\\mathbb{R}\\\\\n",
456 | "& (x_1,x_2) & \\mapsto & \\left| x_1-3 \\right| + 2\\left| x_2-1\\right| .\n",
457 | "\\end{array}$$\n",
458 | "\n",
459 | "\n"
460 | ]
461 | }
462 | ],
463 | "metadata": {
464 | "kernelspec": {
465 | "display_name": "Python 3",
466 | "language": "python",
467 | "name": "python3"
468 | },
469 | "language_info": {
470 | "codemirror_mode": {
471 | "name": "ipython",
472 | "version": 3
473 | },
474 | "file_extension": ".py",
475 | "mimetype": "text/x-python",
476 | "name": "python",
477 | "nbconvert_exporter": "python",
478 | "pygments_lexer": "ipython3",
479 | "version": "3.7.5"
480 | }
481 | },
482 | "nbformat": 4,
483 | "nbformat_minor": 1
484 | }
485 |
--------------------------------------------------------------------------------
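
As a rough sketch of how `scipy.optimize.line_search` can be plugged into a gradient step (one possible way to approach `gradient_Wolfe`, not the official solution; the fallback stepsize `1e-4` is an arbitrary choice of this sketch):

    import numpy as np
    from scipy.optimize import line_search

    def wolfe_step(f, f_grad, x):
        """One gradient step with a stepsize satisfying Wolfe's conditions."""
        g = f_grad(x)
        d = -g                               # descent direction: minus the gradient
        res = line_search(f, f_grad, x, d)   # returns (alpha, fc, gc, new_fval, old_fval, new_slope)
        gamma = res[0] if res[0] is not None else 1e-4   # the line search may fail and return None
        return x + gamma * d, g
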
/Lab2_GradientDescent/algorithms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Gradient-based algorithms
5 | #
6 | # In this file, we code our gradient-based optimization algorithms.
7 |
8 | #################################
9 | # # 1. Gradient algorithms
10 | ##################################
11 | #
12 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
13 | # * the function to minimize `f`
14 | # * a 1st order oracle `f_grad` (see `problem1.py` for instance)
15 | # * an initialization point `x0`
16 | # * the sought precision `PREC`
17 | # * a maximal number of iterations `ITE_MAX`
18 | #
19 | #
20 | # these algorithms perform iterations of the form
21 | # $$ x_{k+1} = x_k - \gamma_k \nabla f(x_k) $$
22 | # where $\gamma_k$ is a stepsize to choose.
23 |
24 | # ### 1.a. Constant stepsize gradient algorithm
25 | #
26 | # First, we consider the case where the stepsize is fixed over the iterations and passed as the argument `step` to the algorithm.
27 |
28 |
29 |
30 | import numpy as np
31 | import timeit
32 |
33 | def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
34 | x = np.copy(x0)
35 | stop = PREC*np.linalg.norm(f_grad(x0) )
36 |
37 | x_tab = np.copy(x)
38 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
39 | t_s = timeit.default_timer()
40 | for k in range(ITE_MAX):
41 | g = f_grad(x)
42 | x = x - step*g ####### ITERATION
43 |
44 | x_tab = np.vstack((x_tab,x))
45 |
46 | if np.linalg.norm(g) < stop:
47 | break
48 | t_e = timeit.default_timer()
49 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
50 | return x,x_tab
51 |
52 |
53 | # ### 1.b. Adaptive stepsize gradient algorithm
54 | #
55 |
56 | # Q: Complete the adaptive gradient below using your intuition
57 |
58 |
59 | import numpy as np
60 | import timeit
61 |
62 | def gradient_adaptive_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
63 | x = np.copy(x0)
64 | stop = PREC*np.linalg.norm(f_grad(x0) )
65 |
66 | x_tab = np.copy(x)
67 |     print("------------------------------------\nAdaptive Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
68 | t_s = timeit.default_timer()
69 | for k in range(ITE_MAX):
70 |
71 | g = f_grad(x)
72 | x_prev = np.copy(x)
73 |
74 | x = x - step*g ####### ITERATION
75 |
76 | ### COMPLETE
77 |
78 | x_tab = np.vstack((x_tab,x))
79 |
80 | if np.linalg.norm(g) < stop:
81 | break
82 | t_e = timeit.default_timer()
83 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
84 | return x,x_tab
85 |
86 |
87 | # ### 1.c. Wolfe Line search
88 |
89 |
90 | # Q: Complete the function below accordingly.
91 |
92 |
93 |
94 | import numpy as np
95 | import timeit
96 | from scipy.optimize import line_search
97 |
98 | def gradient_Wolfe(f , f_grad , x0 , PREC , ITE_MAX ):
99 | x = np.copy(x0)
100 | g = f_grad(x0)
101 | stop = PREC*np.linalg.norm( g )
102 |
103 | x_tab = np.copy(x)
104 | print("------------------------------------\n Gradient with Wolfe line search\n------------------------------------\nSTART")
105 | t_s = timeit.default_timer()
106 | for k in range(ITE_MAX):
107 |
108 | ########### TO FILL
109 |
110 | x = x ###### ITERATION
111 |
112 | x_tab = np.vstack((x_tab,x))
113 |
114 | if np.linalg.norm(g) < stop:
115 | break
116 | t_e = timeit.default_timer()
117 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
118 | return x,x_tab
119 |
120 |
121 | # # 2. Second Order algorithms
122 | #
123 | # For minimizing a *twice* differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
124 | # * the function to minimize `f`
125 | # * a 2nd order oracle `f_grad_hessian` (see `problem1.py` for instance)
126 | # * an initialization point `x0`
127 | # * the sought precision `PREC`
128 | # * a maximal number of iterations `ITE_MAX`
129 | #
130 | #
131 | # these algorithms perform iterations of the form
132 | # $$ x_{k+1} = x_k - [\nabla^2 f(x_k) ]^{-1} \nabla f(x_k) .$$
133 |
134 |
135 |
136 | import numpy as np
137 | import timeit
138 |
139 | def newton_algorithm(f , f_grad_hessian , x0 , PREC , ITE_MAX ):
140 | x = np.copy(x0)
141 | g0,H0 = f_grad_hessian(x0)
142 | stop = PREC*np.linalg.norm(g0 )
143 |
144 | x_tab = np.copy(x)
145 | print("------------------------------------\nNewton's algorithm\n------------------------------------\nSTART")
146 | t_s = timeit.default_timer()
147 | for k in range(ITE_MAX):
148 |
149 | g,H = f_grad_hessian(x)
150 | x = x - np.linalg.solve(H,g) ####### ITERATION
151 |
152 | x_tab = np.vstack((x_tab,x))
153 |
154 | if np.linalg.norm(g) < stop:
155 | break
156 | t_e = timeit.default_timer()
157 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
158 | return x,x_tab
159 |
160 |
161 | # # 3. Quasi Newton algorithms
162 | #
163 | # **BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consists in performing the following iteration
164 | # $$ x_{k+1}=x_k - \gamma_k W_k \nabla f(x_k)$$
165 | # where $\gamma_k$ is given by Wolfe's line-search and positive definite matrix $W_k$ is computed as
166 | # $$ W_{k+1}=W_k - \frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\left[1+\frac{y_k^T W_k y_k}{y_k^T s_k}\right]\frac{s_k s_k^T}{y_k^T s_k} $$
167 | # with $s_k=x_{k+1}-x_{k}$ and $y_k=\nabla f(x_{k+1}) - \nabla f(x_{k})$.
168 |
169 |
170 | # Q: Fill the function below accordingly.
171 |
172 |
173 | import numpy as np
174 | import timeit
175 | from scipy.optimize import line_search
176 |
177 | def bfgs(f , f_grad , x0 , PREC , ITE_MAX ):
178 | x = np.copy(x0)
179 | n = x0.size
180 | g = f_grad(x0)
181 | sim_eval = 1
182 | stop = PREC*np.linalg.norm( g )
183 |
184 | W = np.eye(n)
185 |
186 | x_tab = np.copy(x)
187 | print("------------------------------------\n BFGS\n------------------------------------\nSTART")
188 | t_s = timeit.default_timer()
189 | for k in range(ITE_MAX):
190 |
191 | ########### TO FILL
192 |
193 | x = x ###### ITERATION
194 |
195 | x_tab = np.vstack((x_tab,x))
196 |
197 | if np.linalg.norm(g) < stop:
198 | break
199 |
200 | t_e = timeit.default_timer()
201 | print("FINISHED -- {:d} iterations / {:.6f}s ({:d} sim. calls) -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,sim_eval,f(x),x[0],x[1]))
202 | return x,x_tab
203 |
204 |
--------------------------------------------------------------------------------
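
A sketch of the inverse-Hessian update needed inside `bfgs` (one possible way to code the $W_{k+1}$ formula above, not the official solution; it assumes $W_k$ is kept symmetric, which holds when starting from the identity):

    import numpy as np

    def bfgs_update(W, s, y):
        """BFGS update of the inverse-Hessian approximation W,
        with s = x_{k+1} - x_k and y = grad f(x_{k+1}) - grad f(x_k)."""
        ys = float(np.dot(y, s))
        if ys <= 1e-12:          # skip the update if the curvature condition y^T s > 0 fails
            return W
        Wy = W.dot(y)
        return (W
                - (np.outer(s, Wy) + np.outer(Wy, s)) / ys          # - (s y^T W + W y s^T) / y^T s
                + (1.0 + float(np.dot(y, Wy)) / ys) * np.outer(s, s) / ys)
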
/Lab2_GradientDescent/plotLib.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib import cm
5 | from mpl_toolkits.mplot3d import Axes3D
6 | import time
7 | from IPython import display
8 |
9 |
10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ):
11 |
12 | def f_no_vector(x1,x2):
13 | return f( np.array( [x1,x2] ) )
14 |
15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
16 | z = f_no_vector(x,y)
17 |
18 | fig = plt.figure()
19 |     ax = fig.add_subplot(projection='3d')  # fig.gca(projection='3d') was removed in recent matplotlib
20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max)
21 | ax.set_zlim(v_min, v_max)
22 | plt.show()
23 |
24 |
25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
26 |
27 |
28 | def f_no_vector(x1,x2):
29 | return f( np.array( [x1,x2] ) )
30 |
31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
32 | z = f_no_vector(x,y)
33 |
34 | fig = plt.figure()
35 | graphe = plt.contour(x,y,z,levels)
36 | #plt.plot(3,1,'r*',markersize=15)
37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
38 | plt.title(title)
39 | plt.show()
40 |
41 |
42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
43 |
44 | def f_no_vector(x1,x2):
45 | return f( np.array( [x1,x2] ) )
46 |
47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
48 | z = f_no_vector(x,y)
49 |
50 | fig = plt.figure()
51 | graphe = plt.contour(x,y,z,levels)
52 | #plt.plot(3,1,'r*',markersize=15)
53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
54 | plt.title(title)
55 |
56 | if x_tab.shape[0] > 40:
57 | sub = int(x_tab.shape[0]/40.0)
58 | x_tab = x_tab[::sub]
59 |
60 | delay = 2.0/x_tab.shape[0]
61 | for k in range(x_tab.shape[0]):
62 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
63 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
64 | plt.draw()
65 | display.clear_output(wait=True)
66 | display.display(fig)
67 | time.sleep(delay)
68 | display.clear_output()
69 | plt.show()
70 |
71 |
72 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
73 |
74 |
75 | def f_no_vector(x1,x2):
76 | return f( np.array( [x1,x2] ) )
77 |
78 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
79 | z = f_no_vector(x,y)
80 |
81 | fig = plt.figure()
82 | graphe = plt.contour(x,y,z,levels)
83 | #plt.plot(3,1,'r*',markersize=15)
84 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
85 | plt.xlim([x1_min,x1_max])
86 | plt.ylim([x2_min,x2_max])
87 | plt.title(title)
88 |
89 | if x_tab.shape[0] > 40:
90 | sub = int(x_tab.shape[0]/40.0)
91 | x_tab = x_tab[::sub]
92 |
93 | if x_tab2.shape[0] > 40:
94 | sub = int(x_tab2.shape[0]/40.0)
95 | x_tab2 = x_tab2[::sub]
96 |
97 | delay = 4.0/x_tab.shape[0]
98 | for k in range(x_tab.shape[0]):
99 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
100 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
101 | plt.draw()
102 | #plt.pause(delay)
103 |
104 | delay = 4.0/x_tab2.shape[0]
105 | for k in range(x_tab2.shape[0]):
106 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8)
107 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1]))
108 | #plt.pause(delay)
109 | plt.draw()
110 |
111 | plt.show()
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
/Lab2_GradientDescent/problem1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 1
5 | #
6 | #
7 | # The objective of Problem 1 is to minimize a simple quadratic function $f$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # f: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2
12 | # \end{array}$$
13 |
14 |
15 |
16 | ##### Function definition
17 | def f(x):
18 | x1 = x[0]
19 | x2 = x[1]
20 | return 4*(x1-3)**2+2*(x2-1)**2
21 | ####
22 |
23 | ##### Plot parameters f
24 | x1_min = -0.5
25 | x1_max = 5.5
26 | x2_min = -0.5
27 | x2_max = 5.5
28 | nb_points = 200
29 | vmin = 0
30 | vmax = 80
31 | levels = [0.5,1,2,5,10,15]
32 | title = 'f: a simple function'
33 | ####
34 |
35 |
36 | # ### Some parameters
37 | #
38 | # Before solving things numerically, some useful things can be computed:
39 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
40 | # * Good starting points (for hot starting e.g.)
41 |
42 |
43 |
44 | ###### Useful Parameters
45 | L = 8 # Lipschitz constant of the gradient
46 |
47 |
48 | # ### Oracles
49 | #
50 | # Numerical optimization methods need callable *oracles* for $f$, that is, functions that, given a point $x$ in the domain of $f$, return the value of $f$ and/or its gradient and Hessian at $x$. The *order* of an oracle is the number of derivatives it provides (0th order for just $f$, 1st order for the gradient, 2nd order for gradient + Hessian).
51 |
52 |
53 |
54 | # Q: Observe the first order oracle `f_grad`.
55 |
56 |
57 |
58 | import numpy as np
59 |
60 | ##### Gradient oracle
61 | def f_grad(x):
62 | x1 = x[0]
63 | x2 = x[1]
64 | gx = 8*(x1-3)
65 | gy = 4*(x2-1)
66 | return np.array( [ gx , gy ] )
67 | ####
68 |
69 |
70 | # Q: Observe the second order oracle `f_grad_hessian`.
71 |
72 |
73 | import numpy as np
74 |
75 | ##### Hessian scaled Gradient computation
76 | def f_grad_hessian(x):
77 | x1 = x[0]
78 | x2 = x[1]
79 | gx = 8*(x1-3)
80 | gy = 4*(x2-1)
81 | g = np.array( [ gx , gy ] )
82 | H = np.array( [ (8.0 , 0.0 ) , ( 0.0 , 4.0 ) ] )
83 | return g,H
84 | ####
85 |
86 |
--------------------------------------------------------------------------------
/Lab2_GradientDescent/problem2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 2
5 | #
6 | #
7 | # The objective of Problem 2 is to minimize a more involved but very smooth function $g$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # g: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3)
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 | ##### Function definition
18 | def f(x):
19 | x1 = x[0]
20 | x2 = x[1]
21 | return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3)
22 | ####
23 |
24 | ##### Plot parameters f
25 | x1_min = -0.5
26 | x1_max = 5.5
27 | x2_min = -0.5
28 | x2_max = 5.5
29 | nb_points = 500
30 | vmin = 0
31 | vmax = 100
32 | levels = [0.5,1,2,5,10,15]
33 | title = 'a Harder function: g'
34 | ####
35 |
36 |
37 |
38 | ###### Useful Parameters
39 | L = 8 # Lipschitz constant of the gradient
40 |
41 |
42 | # ### Oracles
43 |
44 |
45 | # Q: Complete the first order oracle `f_grad`.
46 |
47 |
48 |
49 | import numpy as np
50 |
51 | ##### Gradient oracle
52 | def f_grad(x):
53 | x1 = x[0]
54 | x2 = x[1]
55 | gx = 0 ## To complete
56 | gy = 0 ## To complete
57 | return np.array( [ gx , gy ] )
58 | ####
59 |
60 |
61 | # Q: Fill the following second order oracle `f_grad_hessian`.
62 |
63 |
64 | import numpy as np
65 |
66 | ##### Hessian scaled Gradient computation
67 | def f_grad_hessian(x):
68 |
69 | return g,H
70 | ####
71 |
72 |
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/Lab2_GradientDescent/problem3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 3
5 | #
6 | #
7 | # The objective of Problem 3 is to minimize the smooth but non-convex Rosenbrock function $r$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # r: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 | ##### Function definition
18 | def f(x):
19 | """Rosenbrock."""
20 | x1 = x[0]
21 | x2 = x[1]
22 | return (1-x1)**2+100*(x2-x1**2)**2
23 | ####
24 |
25 | ##### Plot parameters f
26 | x1_min = -1.5
27 | x1_max = 1.55
28 | x2_min = -0.2
29 | x2_max = 1.5
30 | nb_points = 200
31 | vmin = 0
32 | vmax = 120
33 | levels = [0.05,1,5,15,50,100,200]
34 | title = 'Rosenbrock function'
35 | ####
36 |
37 |
38 |
39 |
40 |
41 | ### Oracles
42 |
43 |
44 | # Q: Complete the first order oracle `f_grad`.
45 |
46 |
47 |
48 |
49 | import numpy as np
50 |
51 | ##### Gradient oracle
52 | def f_grad(x):
53 |
54 | return 0.0 ### To complete
55 | ####
56 |
57 |
58 | # Q: Fill the following second order oracle `f_grad_hessian`.
59 |
60 |
61 | import numpy as np
62 |
63 | ##### Hessian scaled Gradient computation
64 | def f_grad_hessian(x):
65 |
66 | return g,H ### To complete
67 | ####
68 |
69 |
--------------------------------------------------------------------------------
/Lab2_GradientDescent/problem4.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 4
5 | #
6 | #
7 | # The objective of Problem 4 is to minimize a non-convex function $t$ with two minimizers on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # t: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 |
18 | ##### Function definition
19 | def f(x):
20 | x1 = x[0]
21 | x2 = x[1]
22 | return (0.6*x1 + 0.2*x2)**2 * ((0.6*x1 + 0.2*x2)**2 - 4*(0.6*x1 + 0.2*x2)+4) + (-0.2*x1 + 0.6*x2)**2
23 | ####
24 |
25 | ##### Plot parameters f
26 | x1_min = -1
27 | x1_max = 4
28 | x2_min = -1
29 | x2_max = 4
30 | nb_points = 200
31 | levels = [0.05,0.5,1,2,5]
32 | vmin = 0
33 | vmax = 5
34 | title = 'two pits'
35 | ####
36 |
37 |
38 |
39 |
40 |
41 | ###### Useful Parameters
42 | L = 8 # Lipschitz constant of the gradient
43 |
44 |
45 | ### Oracles
46 |
47 | # Q: Complete the first order oracle `f_grad`.
48 |
49 |
50 |
51 |
52 | import numpy as np
53 |
54 | ##### Gradient oracle
55 | def f_grad(x):
56 |
57 | return ### To complete
58 |
59 |
60 | # Q: Does a second order oracle exist for any point?
61 |
62 |
63 |
--------------------------------------------------------------------------------
/Lab2_GradientDescent/problem5.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 5
5 | #
6 | #
7 | # The objective of Problem 5 is to minimize a polyhedral function $p$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # p: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| .
12 | # \end{array}$$
13 |
14 |
15 |
16 |
17 | ##### Function definition
18 | def f(x):
19 | x1 = x[0]
20 | x2 = x[1]
21 | return np.abs(x1-3)+2*np.abs(x2-1)
22 | ####
23 |
24 | ##### Plot parameters f
25 | x1_min = -0.5
26 | x1_max = 5.5
27 | x2_min = -0.5
28 | x2_max = 5.5
29 | nb_points = 200
30 | levels = [0.05,0.5,1,2,5]
31 | vmin = 0
32 | vmax = 5
33 | title = 'polyhedral'
34 | ####
35 |
36 |
37 | ### Oracles
38 |
39 |
40 | # Q: Compute a first order oracle `f_grad`. Is it unique?
41 |
42 |
43 |
44 | import numpy as np
45 |
46 | ##### Gradient oracle
47 | def f_grad(x):
48 |
49 | return g ### To complete
50 | ####
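# A possible completion of the first order oracle above (a sketch; the suffixed name is
# illustrative). The function is non-differentiable on the lines x1 = 3 and x2 = 1, so the
# oracle below returns one particular subgradient (it is not unique there); it produces the
# same values as the solved oracle in Lab3_ProjectedGradient/problem5.py.
def f_grad_sketch(x):
    g = np.array( [ np.sign(x[0] - 3.0) , 2.0*np.sign(x[1] - 1.0) ] )   # a subgradient of |x1-3| + 2|x2-1|
    return g
####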
51 |
52 |
53 | # Q: What about a second order oracle?
54 |
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/.DS_Store
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._.DS_Store
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._1.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._2.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._3.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._4.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._5.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/._UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/._UGA.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/1.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/2.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/3.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/4.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/5.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab3_ProjectedGradient/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/algoProjGradient.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Projected Gradient-based algorithms
5 | #
6 | # In this notebook, we code our Projected gradient-based optimization algorithms.
7 | # We consider here
8 | # * Positivity constraints
9 | # * Interval constraints
10 |
11 | # # 1. Projected Gradient algorithms (for positivity or interval constraints)
12 | #
13 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
14 | # * the function to minimize `f`
15 | # * a 1st order oracle `f_grad` (see `problem1.ipynb` for instance)
16 | # * an initialization point `x0`
17 | # * the sought precision `PREC`
18 | # * a maximal number of iterations `ITE_MAX`
19 | #
20 | #
21 | # these algorithms perform iterations of the form
22 | # $$ x_{k+1} = P\left(x_k - \gamma_k \nabla f(x_k)\right) $$
23 | # where $\gamma_k$ is a stepsize to choose and $P$ is the projector onto the convex constraint set. We only consider positivity and interval constraints.
24 |
25 | # ### 1.a. Constant stepsize projected gradient algorithm for positivity constraints
26 | #
27 | # First, we consider the case where the stepsize is fixed over iterations and passed as an argument `step` to the algorithm.
28 |
29 |
30 | # Q. Fill the function below accordingly.
31 |
32 |
33 |
34 | import numpy as np
35 | import timeit
36 |
37 | def positivity_gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
38 | x = np.copy(x0)
39 | g = f_grad(x) # we initialize both x and f_grad(x)
40 | stop = PREC*np.linalg.norm(g)
41 |
42 | epsilon = PREC*np.ones_like(x0)
43 |
44 | x_tab = np.copy(x)
45 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
46 | t_s = timeit.default_timer()
47 | for k in range(ITE_MAX):
48 |
49 | x = x ####### ITERATION --> To complete by the projection onto the set "x >= 0"
50 |
51 | #######
52 | x_tab = np.vstack((x_tab,x))
53 | #######
54 | ##########################################################
55 |         ####### Why must the following stopping criterion be changed? Propose a correct stopping rule
56 | #if np.linalg.norm(g) < stop:
57 | # break
58 | ###############################################
59 |
60 | # To complete
61 | if ... :
62 | break
63 |
64 | t_e = timeit.default_timer()
65 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
66 | return x,x_tab
67 |
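# A possible completion of the algorithm above (a sketch; the suffixed name is illustrative).
# The projection onto {x : x >= 0} is a componentwise maximum with 0, and the stopping test
# measures how much the iterate moves: at a constrained minimizer the gradient need not
# vanish, which is why the unconstrained criterion commented out above is inadequate.
def positivity_gradient_algorithm_sketch(f, f_grad, x0, step, PREC, ITE_MAX):
    x = np.copy(x0)
    x_tab = np.copy(x)
    for k in range(ITE_MAX):
        x_prev = np.copy(x)
        x = np.maximum(x - step*f_grad(x), 0.0)       # gradient step followed by the projection
        x_tab = np.vstack((x_tab, x))
        if np.linalg.norm(x - x_prev) < PREC*step:    # stop when the iterates barely move
            break
    return x, x_tab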
68 |
69 | # ### 1.b. Constant stepsize projected gradient algorithm for interval constraints
70 | #
71 | # Here again, the stepsize is fixed over iterations and passed as an argument `step` to the algorithm.
72 |
73 | # Q. Fill the function below accordingly. Then, test your algorithm in `2_Optimization100.ipynb [Sec. 1a]` for Problem 1.
74 |
75 |
76 |
77 | import numpy as np
78 | import timeit
79 |
80 | def interval_gradient_algorithm(f , f_grad , x0 , infbound , supbound , step , PREC , ITE_MAX ):
81 | # compute the min of f with a gradient method with constant step under the constraint
82 | # borninf < x < bornesup
83 | x = np.copy(x0)
84 | g = f_grad(x)
85 | stop = PREC*np.linalg.norm(g)
86 | zero = np.zeros_like(x0)
87 | epsilon = PREC*np.ones_like(x0)
88 |
89 | x_tab = np.copy(x)
90 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
91 | t_s = timeit.default_timer()
92 | for k in range(ITE_MAX):
93 |
94 |         x = x ####### ITERATION --> To complete by the projection onto the set "infbound <= x <= supbound"
95 |
96 |
97 | x_tab = np.vstack((x_tab,x))
98 |
99 |         ####### Why must the following stopping criterion be changed? Propose a correct stopping rule
100 | #if np.linalg.norm(g) < stop:
101 | # break
102 |
103 | # To complete
104 | if ... :
105 | break
106 |
107 | t_e = timeit.default_timer()
108 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
109 | return x,x_tab
110 |
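# A possible completion of the algorithm above (a sketch; the suffixed name is illustrative).
# The projection onto the box {x : infbound <= x <= supbound} is a componentwise clipping,
# and the same movement-based stopping rule as in the positivity case can be used.
def interval_gradient_algorithm_sketch(f, f_grad, x0, infbound, supbound, step, PREC, ITE_MAX):
    x = np.copy(x0)
    x_tab = np.copy(x)
    for k in range(ITE_MAX):
        x_prev = np.copy(x)
        x = np.clip(x - step*f_grad(x), infbound, supbound)   # gradient step + projection onto the box
        x_tab = np.vstack((x_tab, x))
        if np.linalg.norm(x - x_prev) < PREC*step:            # stop when the iterates barely move
            break
    return x, x_tab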
111 |
112 |
113 |
114 |
115 |
116 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/plotLib.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib import cm
5 | from mpl_toolkits.mplot3d import Axes3D
6 | import time
7 | from IPython import display
8 |
9 |
10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ):
11 |
12 | def f_no_vector(x1,x2):
13 | return f( np.array( [x1,x2] ) )
14 |
15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
16 | z = f_no_vector(x,y)
17 |
18 | fig = plt.figure()
19 |     ax = fig.add_subplot(projection='3d')   # fig.gca(projection=...) is removed in recent Matplotlib
20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max)
21 | ax.set_zlim(v_min, v_max)
22 | plt.show()
23 |
24 |
25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
26 |
27 |
28 | def f_no_vector(x1,x2):
29 | return f( np.array( [x1,x2] ) )
30 |
31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
32 | z = f_no_vector(x,y)
33 |
34 | fig = plt.figure()
35 | graphe = plt.contour(x,y,z,levels)
36 | #plt.plot(3,1,'r*',markersize=15)
37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
38 | plt.title(title)
39 | plt.show()
40 |
41 |
42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
43 |
44 | def f_no_vector(x1,x2):
45 | return f( np.array( [x1,x2] ) )
46 |
47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
48 | z = f_no_vector(x,y)
49 |
50 | fig = plt.figure()
51 | graphe = plt.contour(x,y,z,levels)
52 | #plt.plot(3,1,'r*',markersize=15)
53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
54 | plt.title(title)
55 |
56 | delay = 4.0/x_tab.shape[0]
57 | for k in range(x_tab.shape[0]):
58 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
59 | plt.xlim([x1_min,x1_max])
60 | plt.ylim([x2_min,x2_max])
61 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
62 | plt.draw()
63 | display.clear_output(wait=True)
64 | display.display(fig)
65 | time.sleep(delay)
66 | display.clear_output()
67 | plt.show()
68 |
69 |
70 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
71 |
72 |
73 | def f_no_vector(x1,x2):
74 | return f( np.array( [x1,x2] ) )
75 |
76 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
77 | z = f_no_vector(x,y)
78 |
79 | fig = plt.figure()
80 | graphe = plt.contour(x,y,z,levels)
81 | #plt.plot(3,1,'r*',markersize=15)
82 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
83 | plt.xlim([x1_min,x1_max])
84 | plt.ylim([x2_min,x2_max])
85 | plt.title(title)
86 |
87 | delay = 4.0/x_tab.shape[0]
88 | for k in range(x_tab.shape[0]):
89 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
90 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
91 | plt.draw()
92 | #plt.pause(delay)
93 |
94 | delay = 4.0/x_tab2.shape[0]
95 | for k in range(x_tab2.shape[0]):
96 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8)
97 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1]))
98 | #plt.pause(delay)
99 | plt.draw()
100 |
101 | plt.show()
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/problem1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 1
5 | #
6 | #
7 | # The objective of Problem 1 is to minimize a simple quadratic function $f$ on $\mathbb{R}^2$, constrained to $x\ge 0$:
8 | #
9 | # $$\begin{array}{rrcll}
10 | # f: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & 4 (x_1-1)^2 + 2(x_2+0.5)^2
12 | # \end{array}$$
13 | #
14 |
15 | # ### Function definition
16 |
17 | # In[1]:
18 |
19 |
20 | ##### Function definition
21 | def f(x):
22 | x1 = x[0]
23 | x2 = x[1]
24 | return 4*(x1-1)**2+2*(x2+0.5)**2
25 | ####
26 |
27 | ##### Plot parameters f
28 | x1_min = -4.
29 | x1_max = 3.
30 | x2_min = -4.
31 | x2_max = 3.
32 | nb_points = 200
33 | vmin = 0
34 | vmax = 80
35 | levels = [0.5,1,2,5,10,15]
36 | title = 'f: a simple function'
37 | ####
38 |
39 |
40 | # ### Some parameters
41 | #
42 | # Before solving things numerically, some useful things can be computed:
43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
44 | # * Good starting points (for hot starting e.g.)
45 |
46 | # In[2]:
47 |
48 |
49 | ###### Useful Parameters
50 | L = 8 # Lipschitz constant of the gradient
51 |
52 |
53 | # ### Oracles
54 | #
55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian).
56 | #
57 | # > Observe the first order oracle `f_grad`.
58 | #
59 |
60 | # In[3]:
61 |
62 |
63 | import numpy as np
64 |
65 | ##### Gradient oracle
66 | def f_grad(x):
67 | x1 = x[0]
68 | x2 = x[1]
69 | gx = 8*(x1-1)
70 | gy = 4*(x2+0.5)
71 | return np.array( [ gx , gy ] )
72 | ####
73 |
74 |
75 | # > Fill the following second order oracle `f_grad_hessian`.
76 |
77 | # In[4]:
78 |
79 |
80 | import numpy as np
81 |
82 | ##### Hessian scaled Gradient computation
83 | def f_grad_hessian(x):
84 | x1 = x[0]
85 | x2 = x[1]
86 | gx = 8*(x1-1)
87 | gy = 4*(x2+0.5)
88 | g = np.array( [ gx , gy ] )
89 | H = np.array( [ ( 8.0 , 0 ) , ( 0 , 4.0 ) ] ) ### -> To complete DONE
90 | return g,H
91 | ####
92 |
93 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/problem2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 2
5 | #
6 | #
7 | # The objective of Problem 2 is to minimize a more involved but very smooth function $g$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # g: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3)
12 | # \end{array}$$
13 | #
14 |
15 | # ### Function definition
16 |
17 | # In[1]:
18 |
19 |
20 | ##### Function definition
21 | def f(x):
22 | x1 = x[0]
23 | x2 = x[1]
24 | return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3)
25 | ####
26 |
27 | ##### Plot parameters f
28 | x1_min = -0.5
29 | x1_max = 5.5
30 | x2_min = -0.5
31 | x2_max = 5.5
32 | nb_points = 500
33 | vmin = 0
34 | vmax = 100
35 | levels = [0.5,1,2,5,10,15]
36 | title = 'a Harder function: g'
37 | ####
38 |
39 |
40 | # ### Some parameters
41 | #
42 | # Before solving things numerically, some useful things can be computed:
43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
44 | # * Good starting points (for hot starting e.g.)
45 |
46 | # In[2]:
47 |
48 |
49 | ###### Useful Parameters
50 | L = 8 # Lipschitz constant of the gradient
51 |
52 |
53 | # ### Oracles
54 | #
55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian).
56 | #
57 | # > Complete the first order oracle `f_grad`.
58 | #
59 |
60 | # In[2]:
61 |
62 |
63 | import numpy as np
64 |
65 | ##### Gradient oracle
66 | def f_grad(x):
67 | x1 = x[0]
68 | x2 = x[1]
69 | gx = 8*(x1-3)*np.exp(4*(x1-3)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete
70 | gy = 4*(x2-1)*np.exp(2*(x2-1)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete
71 | return np.array( [ gx , gy ] )
72 | ####
73 |
74 |
75 | # > Fill the following second order oracle `f_grad_hessian`.
76 |
77 | # In[1]:
78 |
79 |
80 | import numpy as np
81 |
82 | ##### Hessian scaled Gradient computation
83 | def f_grad_hessian(x):
84 | x1 = x[0]
85 | x2 = x[1]
86 | gx = 8*(x1-3)*np.exp(4*(x1-3)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete
87 | gy = 4*(x2-1)*np.exp(2*(x2-1)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) ## To complete
88 |
89 | hxx = (1+ 8*(x1-3)**2)*np.exp(4*(x1-3)**2)*( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )
90 | hxx = hxx -8* ((x1-3)*np.exp(4*(x1-3)**2))**2
91 | hxx = 8 * hxx/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )**2
92 | hxy = -32*(x1-3)*(x2-1)*np.exp(4*(x1-3)**2)*np.exp(2*(x2-1)**2)
93 | hxy=hxy/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )**2
94 | ## H is symmetric, thus hyx = hxy
95 | hyy = (1+4*(x2-1)**2)*np.exp(2*(x2-1)**2)*( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )
96 | hyy= hyy -4* ((x2-1)*np.exp(2*(x2-1)**2))**2
97 | hyy= 4* hyy / ( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )**2
98 |
99 | g = np.array( [ gx , gy ] )
100 | H = np.array( [ ( hxx , hxy ) , ( hxy , hyy ) ] ) ### -> To complete DONE
101 |
102 | return g,H
103 | ####
104 |
105 |
106 | # In[ ]:
107 |
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/problem3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 3
5 | #
6 | #
7 | # The objective of Problem 3 is to minimize the non-convex smooth Rosenbrock function $r$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # r: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2
12 | # \end{array}$$
13 | #
14 |
15 | # ### Function definition
16 |
17 | # In[1]:
18 |
19 |
20 | ##### Function definition
21 | def f(x):
22 | """Rosenbrock."""
23 | x1 = x[0]
24 | x2 = x[1]
25 | return (1-x1)**2+100*(x2-x1**2)**2
26 | ####
27 |
28 | ##### Plot parameters f
29 | x1_min = -1.5
30 | x1_max = 1.55
31 | x2_min = -0.2
32 | x2_max = 1.5
33 | nb_points = 200
34 | vmin = 0
35 | vmax = 120
36 | levels = [0.05,1,5,15,50,100,200]
37 | title = 'Rosenbrock function'
38 | ####
39 |
40 |
41 | # ### Some parameters
42 | #
43 | # Before solving things numerically, some useful things can be computed:
44 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
45 | # * Good starting points (for hot starting e.g.)
46 |
47 | # In[1]:
48 |
49 |
50 | ###### Useful Parameters
51 |
52 |
53 | # ### Oracles
54 | #
55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian).
56 | #
57 | # > Complete the first order oracle `f_grad`.
58 | #
59 |
60 | # In[1]:
61 |
62 |
63 | import numpy as np
64 |
65 | ##### Gradient oracle ##### return gradient
66 | def f_grad(x):
67 | x1 = x[0]
68 | x2 = x[1]
69 | return np.array( ( 2*(x1-1) + 400*x1*(x1**2-x2) , 200*( x2 - x1**2) ) )
70 | ####
71 |
72 |
73 | # > Fill the following second order oracle `f_grad_hessian`.
74 |
75 | # In[4]:
76 |
77 |
78 | import numpy as np
79 |
80 | ##### Hessian scaled Gradient computation, #### return g,H ### To complete
81 | def f_grad_hessian(x):
82 | x1 = x[0]
83 | x2 = x[1]
84 | g = np.array( [ 2*(x1-1) + 400*x1*(x1**2-x2) , 200*( x2 - x1**2) ] )
85 | H = np.array( [ ( 2 - 400*x2 + 3*400*x1**2 , -400*x1 ) , ( -400*x1 , 200 ) ] )
86 | return g,H
87 | ####
88 |
89 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/problem4.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 4
5 | #
6 | #
7 | # The objective of Problem 4 is to minimize a non-convex function $t$ with two minimizers on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # t: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2
12 | # \end{array}$$
13 | #
14 |
15 | # ### Function definition
16 |
17 | # In[1]:
18 |
19 |
20 | ##### Function definition
21 | def f(x):
22 | x1 = x[0]
23 | x2 = x[1]
24 | return (0.6*x1 + 0.2*x2)**2 * ((0.6*x1 + 0.2*x2)**2 - 4*(0.6*x1 + 0.2*x2)+4) + (-0.2*x1 + 0.6*x2)**2
25 | ####
26 |
27 | ##### Plot parameters f
28 | x1_min = -1
29 | x1_max = 4
30 | x2_min = -1
31 | x2_max = 4
32 | nb_points = 200
33 | levels = [0.05,0.5,1,2,5]
34 | vmin = 0
35 | vmax = 5
36 | title = 'two pits'
37 | ####
38 |
39 |
40 | # ### Some parameters
41 | #
42 | # Before solving things numerically, some useful things can be computed:
43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
44 | # * Good starting points (for hot starting e.g.)
45 |
46 | # In[2]:
47 |
48 |
49 | ###### Useful Parameters
50 | L = 8 # Lipschitz constant of the gradient
51 |
52 |
53 | # ### Oracles
54 | #
55 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian).
56 | #
57 | # > Complete the first order oracle `f_grad`.
58 | #
59 |
60 | # In[3]:
61 |
62 |
63 | import numpy as np
64 |
65 | ##### Gradient oracle ### To complete
66 | def f_grad(x):
67 | x1 = x[0]
68 | x2 = x[1]
69 | return np.array( ( 0.5184*x1**3+x1**2*(-2.592+0.5184*x2)+ x2*(0.72-0.288*x2+0.0192*x2**2)+ x1*(2.96-1.728*x2+0.1728*x2**2) , 0.1728*x1**3+x1**2*(-0.864+0.1728*x2)+x2*(1.04-0.096*x2+0.0064*x2**2)+x1*(0.72-0.576*x2+0.0576*x2**2) ) )
70 | ####
71 |
72 |
73 | # > Does a second order oracle exist for any point?
74 |
--------------------------------------------------------------------------------
/Lab3_ProjectedGradient/problem5.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Problem 5
5 | #
6 | #
7 | # The objective of Problem 5 is to minimize a polyhedral function $p$ on $\mathbb{R}^2$ (unconstrained):
8 | #
9 | # $$\begin{array}{rrcll}
10 | # p: & \mathbb{R}^2 & \to &\mathbb{R}\\
11 | # & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| .
12 | # \end{array}$$
13 | #
14 |
15 | # ### Function definition
16 |
17 | # In[1]:
18 |
19 |
20 | ##### Function definition
21 | def f(x):
22 | x1 = x[0]
23 | x2 = x[1]
24 | return np.abs(x1-3)+2*np.abs(x2-1)
25 | ####
26 |
27 | ##### Plot parameters f
28 | x1_min = -0.5
29 | x1_max = 5.5
30 | x2_min = -0.5
31 | x2_max = 5.5
32 | nb_points = 200
33 | levels = [0.05,0.5,1,2,5]
34 | vmin = 0
35 | vmax = 5
36 | title = 'polyhedral'
37 | ####
38 |
39 |
40 | # ### Some parameters
41 | #
42 | # Before solving things numerically, some useful things can be computed:
43 | # * Properties of $f$: lower bounds, Lipschitz constant of $\nabla f$, strong convexity constant, etc
44 | # * Good starting points (for hot starting e.g.)
45 |
46 | # In[2]:
47 |
48 |
49 | ###### Useful Parameters
50 |
51 |
52 | # ### Oracles
53 | #
54 | # Numerical optimization methods need callable *oracles* for properties of $f$, that is a function that, given a point $x$ in the domain of $f$, returns $f$ and/or gradient, Hessian of $f$ at point $x$. We talk about the *order* of an oracle as the number of differentiations given (0th order for just $f$, 1st order for the gradient, 2nd for gradient + Hessian).
55 | #
56 | # > Compute a first order oracle `f_grad`. Is it unique?
57 | #
58 |
59 | # In[3]:
60 |
61 |
62 | import numpy as np
63 |
64 | ##### Gradient oracle
65 | def f_grad(x):
66 | x1 = x[0]
67 | x2 = x[1]
68 | g = np.array( [ 0.0 , 0.0 ] )
69 | if x1 < 3:
70 | g[0] = -1.0
71 | elif x1 > 3:
72 | g[0] = 1.0
73 | if x2 < 1:
74 | g[1] = -2.0
75 | elif x2 > 1:
76 | g[1] = 2.0
77 | return g
78 | ###### return g ### To complete
79 | ####
80 |
81 |
82 | # > What about a second order oracle?
83 |
84 | # In[ ]:
85 |
86 |
87 |
88 |
89 |
--------------------------------------------------------------------------------
/Lab4_Prox/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/1.png
--------------------------------------------------------------------------------
/Lab4_Prox/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/2.png
--------------------------------------------------------------------------------
/Lab4_Prox/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/3.png
--------------------------------------------------------------------------------
/Lab4_Prox/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/4.png
--------------------------------------------------------------------------------
/Lab4_Prox/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/5.png
--------------------------------------------------------------------------------
/Lab4_Prox/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab4_Prox/Lab4_Proximal_algorithms.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
\n",
8 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year
\n",
9 | "
\n",
10 | "Numerical Optimization
\n",
11 | "Lab 4: Proximal Algorithms
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n",
19 | "---\n",
20 | "\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "%load_ext autoreload\n",
30 | "%autoreload 2"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "---\n",
38 | "\n",
39 | "# Composite minimization.\n",
40 | "\n",
41 | "In this lab, we will investigate optimization algorithms over composite functions composed of a smooth and a non-smooth part using the proximal gradient algorithm over a practical problem of machine learning: binary classification using logistic regression."
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "We will consider the following function\n",
49 | " \n",
50 | "\\begin{align*}\n",
51 | "\\min_{x\\in\\mathbb{R}^d } F(x) := \\underbrace{ \\frac{1}{m} \\sum_{i=1}^m \\log( 1+\\exp(-b_i \\langle a_i,x \\rangle) ) + \\frac{\\lambda_2}{2} \\|x\\|_2^2}_{f(x)} + \\underbrace{\\lambda_1 \\|x\\|_1 }_{g(x)}.\n",
52 | "\\end{align*}\n",
53 | "\n",
54 | "for which we give:\n",
55 | "* the oracles for functions $f, g, F$;\n",
56 | "* the gradient oracle for $f$ and the Lipchitz constant of the gradient;\n",
57 | "* the size of the problem $n$;\n",
58 | "\n",
59 | "in `logistic_regression_student.py`. \n"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "> Implement the proximal operation linked to $g(x) = \\lambda_1 \\|x\\|_1$ in `logistic_regression_student.py`. \n",
67 | "\n",
68 | "> Create a function coding the proximal gradient algorithm and test your algorithm below."
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "import numpy as np\n",
78 | "import logistic_regression_student as pb\n",
79 | "\n",
80 | "\n",
81 | "##### proximal gradient algorithm\n",
82 | "#x,x_tab = pb.proximal_gradient_algorithm(pb.F , pb.f_grad , pb.g_prox , x0 , step , PREC, ITE_MAX)\n"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "> Investigate the decrease of the algorithm."
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "import matplotlib.pyplot as plt\n",
99 | "% matplotlib inline\n"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "> Plot the support of the vector $x_k$ (i.e. one point for every non-null coordinate of $x_k$) versus the iterations. \n",
107 | "\n",
108 | "> What do you notice? Was it expected?"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": []
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "---\n",
123 | "\n",
124 | "# Regularization path.\n",
125 | "\n",
126 | "\n",
127 | "We saw above that the algorithm *selected* some coordinates as the other get to zero. Considering our machine learning task, this translates into the algorithm selecting a subset of the features that will be used for the prediction step (see also the features signification at the end of the notebook). \n",
128 | "\n",
129 | "> Change the parameter $\\lambda_1$ of the problem (`pb.lam1`) in the code above and investigate how it influences the number of selected features.\n",
130 | "\n",
131 | "In order to quantify the influence of this feature selection, let us consider the *regularization path* that is the support of the final points obtained by our minimization method versus the value of $\\lambda_1$.\n",
132 | "\n",
133 | "> For $\\lambda_1 = 2^{-12},2^{-11}, .. , 2^{1}$, run the proximal gradient algorithm on the obtained problem and store the support of the final point, the prediction performance on the *training set* (`pb.prediction_train`) and on the *testing set* (`pb.prediction_test`)."
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "import matplotlib.pyplot as plt\n",
143 | "% matplotlib inline\n",
144 | "\n",
145 | "import numpy as np\n",
146 | "import logistic_regression_student as pb\n",
147 | "\n",
148 | "\n",
149 | "### TODO"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "> Plot the *regularization path* and look at the feature signification (at the end of the notebook) to see which are the most important features of the dataset."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "scrolled": true
164 | },
165 | "outputs": [],
166 | "source": [
167 | "# TODO"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "> Plot the *training* and *testing* accuracies versus the value of $\\lambda_1$."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "# TODO"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "# Features signification\n",
191 | "\n",
192 | "The dataset is comprised of $27$ features described below and the goal is to predict if the student may pass its year or not. It is thus of importance to investigate which features are the most significant for the student success. We will see how the $\\ell_1$ regularization can help to this goal."
193 | ]
194 | },
195 | {
196 | "cell_type": "raw",
197 | "metadata": {},
198 | "source": [
199 | "1 sex - student's sex (binary: \"F\" - female or \"M\" - male)\n",
200 | "2 age - student's age (numeric: from 15 to 22)\n",
201 | "3 address - student's home address type (binary: \"U\" - urban or \"R\" - rural)\n",
202 | "4 famsize - family size (binary: \"LE3\" - less or equal to 3 or \"GT3\" - greater than 3)\n",
203 | "5 Pstatus - parent's cohabitation status (binary: \"T\" - living together or \"A\" - apart)\n",
204 | "6 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)\n",
205 | "7 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)\n",
206 | "8 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)\n",
207 | "9 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)\n",
208 | "10 failures - number of past class failures (numeric: n if 1<=n<3, else 4)\n",
209 | "11 schoolsup - extra educational support (binary: yes or no)\n",
210 | "12 famsup - family educational support (binary: yes or no)\n",
211 | "13 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)\n",
212 | "14 activities - extra-curricular activities (binary: yes or no)\n",
213 | "15 nursery - attended nursery school (binary: yes or no)\n",
214 | "16 higher - wants to take higher education (binary: yes or no)\n",
215 | "17 internet - Internet access at home (binary: yes or no)\n",
216 | "18 romantic - with a romantic relationship (binary: yes or no)\n",
217 | "19 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)\n",
218 | "20 freetime - free time after school (numeric: from 1 - very low to 5 - very high)\n",
219 | "21 goout - going out with friends (numeric: from 1 - very low to 5 - very high)\n",
220 | "22 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)\n",
221 | "23 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)\n",
222 | "24 health - current health status (numeric: from 1 - very bad to 5 - very good)\n",
223 | "25 absences - number of school absences (numeric: from 0 to 93)\n",
224 | "26 G1 - first period grade (numeric: from 0 to 20)\n",
225 | "27 G2 - second period grade (numeric: from 0 to 20)"
226 | ]
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "Python 3",
232 | "language": "python",
233 | "name": "python3"
234 | },
235 | "language_info": {
236 | "codemirror_mode": {
237 | "name": "ipython",
238 | "version": 3
239 | },
240 | "file_extension": ".py",
241 | "mimetype": "text/x-python",
242 | "name": "python",
243 | "nbconvert_exporter": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.7.5"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 2
250 | }
251 |
--------------------------------------------------------------------------------
/Lab4_Prox/logistic_regression_student.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Regularized Problem
5 | #
6 | # In this lab, we add an $\ell_1$ regularization to promote sparsity of the iterates. The function (below) is non-smooth but it has a smooth part, $f$, and a non-smooth part, $g$, that we will treat with proximal operations.
7 | #
8 | # \begin{align*}
9 | # \min_{x\in\mathbb{R}^d } F(x) := \underbrace{ \frac{1}{m} \sum_{i=1}^m \log( 1+\exp(-b_i \langle a_i,x \rangle) ) + \frac{\lambda_2}{2} \|x\|_2^2}_{f(x)} + \underbrace{\lambda_1 \|x\|_1 }_{g(x)}.
10 | # \end{align*}
11 |
12 | # ### Function definition
13 |
14 |
15 |
16 | import numpy as np
17 | import csv
18 | from sklearn import preprocessing
19 |
20 | #### File reading
21 | dat_file = np.load('student.npz')
22 | A = dat_file['A_learn']
23 | final_grades = dat_file['b_learn']
24 | m = final_grades.size
25 | b = np.zeros(m)
26 | for i in range(m):
27 | if final_grades[i]>11:
28 | b[i] = 1.0
29 | else:
30 | b[i] = -1.0
31 |
32 | A_test = dat_file['A_test']
33 | final_grades_test = dat_file['b_test']
34 | m_test = final_grades_test.size
35 | b_test = np.zeros(m_test)
36 | for i in range(m_test):
37 | if final_grades_test[i]>11:
38 | b_test[i] = 1.0
39 | else:
40 | b_test[i] = -1.0
41 |
42 |
43 |
44 |
45 | d = 27 # features
46 | n = d+1 # with the intercept
47 |
48 |
49 |
50 | lam1 = 0.03 # for the 1-norm regularization best:0.03
51 | lam2 = 0.0
52 |
53 |
54 | L = 0.25*max(np.linalg.norm(A,2,axis=1))**2 + lam2
55 |
56 | # ## Oracles
57 | #
58 | # ### Related to function $f$
59 |
60 |
61 | def f(x):
62 | l = 0.0
63 | for i in range(A.shape[0]):
64 | if b[i] > 0 :
65 | l += np.log( 1 + np.exp(-np.dot( A[i] , x ) ) )
66 | else:
67 | l += np.log( 1 + np.exp(np.dot( A[i] , x ) ) )
68 | return l/m + lam2/2.0*np.dot(x,x)
69 |
70 | def f_grad(x):
71 | g = np.zeros(n)
72 | for i in range(A.shape[0]):
73 | if b[i] > 0:
74 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) )
75 | else:
76 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) )
77 | return g/m + lam2*x
78 |
79 |
80 | # ### Related to function $g$ [TODO]
81 |
82 |
83 | def g(x):
84 | return lam1*np.linalg.norm(x,1)
85 |
86 | def g_prox(x,gamma):
87 | p = np.zeros(n)
88 | #TODO
89 | return p
90 |
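# A possible completion of `g_prox` above (a sketch; the suffixed names are illustrative).
# The proximal operator of gamma*lam1*||.||_1 is the componentwise soft-thresholding with
# threshold gamma*lam1.
def g_prox_sketch(x, gamma):
    return np.sign(x)*np.maximum(np.abs(x) - gamma*lam1, 0.0)

# A minimal proximal gradient loop, x_{k+1} = prox_{gamma g}( x_k - gamma grad f(x_k) );
# its signature mirrors the commented-out call in the Lab 4 notebook.
def proximal_gradient_algorithm_sketch(F, f_grad, g_prox, x0, step, PREC, ITE_MAX):
    x = np.copy(x0)
    x_tab = np.copy(x)
    for k in range(ITE_MAX):
        x_prev = np.copy(x)
        x = g_prox(x - step*f_grad(x), step)      # forward (gradient) step then backward (prox) step
        x_tab = np.vstack((x_tab, x))
        if np.linalg.norm(x - x_prev) < PREC:     # stop when the iterates stabilize
            break
    return x, x_tab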
91 |
92 | # ### Related to function $F$
93 |
94 |
95 |
96 | def F(x):
97 | return f(x) + g(x)
98 |
99 |
100 | # ## Prediction Functions
101 |
102 |
103 | def prediction_train(w,PRINT=False):
104 | pred = np.zeros(A.shape[0])
105 | perf = 0
106 | for i in range(A.shape[0]):
107 | p = 1.0/( 1 + np.exp(-np.dot( A[i] , w ) ) )
108 | if p>0.5:
109 | pred[i] = 1.0
110 | if b[i]>0:
111 | correct = "True"
112 | perf += 1
113 | else:
114 | correct = "False"
115 | if PRINT:
116 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct))
117 | else:
118 | pred[i] = -1.0
119 | if b[i]<0:
120 | correct = "True"
121 | perf += 1
122 | else:
123 | correct = "False"
124 | if PRINT:
125 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct))
126 | return pred,float(perf)/A.shape[0]
127 |
128 | def prediction_test(w,PRINT=False):
129 | pred = np.zeros(A_test.shape[0])
130 | perf = 0
131 | for i in range(A_test.shape[0]):
132 | p = 1.0/( 1 + np.exp(-np.dot( A_test[i] , w ) ) )
133 | if p>0.5:
134 | pred[i] = 1.0
135 | if b_test[i]>0:
136 | correct = "True"
137 | perf += 1
138 | else:
139 | correct = "False"
140 | if PRINT:
141 |                 print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b_test[i]),1,(p-0.5)*200,correct))
142 | else:
143 | pred[i] = -1.0
144 | if b_test[i]<0:
145 | correct = "True"
146 | perf += 1
147 | else:
148 | correct = "False"
149 | if PRINT:
150 |                 print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b_test[i]),-1,100-(0.5-p)*200,correct))
151 | return pred,float(perf)/A_test.shape[0]
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/Lab4_Prox/student.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab4_Prox/student.npz
--------------------------------------------------------------------------------
/Lab4_Prox/student.txt:
--------------------------------------------------------------------------------
1 | # Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
2 | 1 sex - student's sex (binary: "F" - female or "M" - male)
3 | 2 age - student's age (numeric: from 15 to 22)
4 | 3 address - student's home address type (binary: "U" - urban or "R" - rural)
5 | 4 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 | 5 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 | 6 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 | 7 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 | 8 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
10 | 9 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
11 | 10 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
12 | 11 schoolsup - extra educational support (binary: yes or no)
13 | 12 famsup - family educational support (binary: yes or no)
14 | 13 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
15 | 14 activities - extra-curricular activities (binary: yes or no)
16 | 15 nursery - attended nursery school (binary: yes or no)
17 | 16 higher - wants to take higher education (binary: yes or no)
18 | 17 internet - Internet access at home (binary: yes or no)
19 | 18 romantic - with a romantic relationship (binary: yes or no)
20 | 19 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
21 | 20 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
22 | 21 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
23 | 22 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
24 | 23 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
25 | 24 health - current health status (numeric: from 1 - very bad to 5 - very good)
26 | 25 absences - number of school absences (numeric: from 0 to 93)
27 | 26 G1 - first period grade (numeric: from 0 to 20)
28 | 27 G2 - second period grade (numeric: from 0 to 20)
29 |
30 | 28 G3 - final grade (numeric: from 0 to 20, output target)
31 |
32 | Additional note: there are several (382) students that belong to both datasets .
33 | These students can be identified by searching for identical attributes
34 | that characterize each student, as shown in the annexed R file.
35 |
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/1.png
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/2.png
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/3.png
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/4.png
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/5.png
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab5_MachineLearningExample/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/Lab5_OptimForML.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
\n",
8 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year
\n",
9 | "
\n",
10 | "Numerical Optimization
\n",
11 | "Lab 5: Optimization for ML
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "\n",
19 | "---\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "%load_ext autoreload\n",
29 | "%autoreload 2"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "---\n",
37 | "\n",
38 | "# Algorithms performance on practical problems\n",
39 | "\n",
40 | "In this lab, we will investigate how to evaluate and display performance of optimization algorithms over a practical problem of machine learning: binary classification using logistic regression."
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## Machine Learning as an Optimization problem\n",
48 | "\n",
49 | "We have some *data* $\\mathcal{D}$ consisting of $m$ *examples* $\\{d_i\\}$; each example consisting of a *feature* vector $a_i\\in\\mathbb{R}^d$ and an *observation* $b_i\\in \\mathcal{O}$: $\\mathcal{D} = \\{[a_i,b_i]\\}_{i=1..m}$. In this lab, we will consider the ionosphere dataset.\n",
50 | " \n",
51 | "The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors.\n",
52 | "\n",
53 | "A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\\langle a_i , x \\rangle$ is used to predict the value of the observation through a *predictor function* $g:\\mathbb{R}\\to \\mathcal{O}$: $g(\\langle a_i , x \\rangle)$ is the predicted value from $a_i$.\n",
54 | "\n",
55 | "In order to find such a parameter, we use the available data and a *loss* $\\ell$ that penalizes the error made between the predicted $g(\\langle a_i , x \\rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \\ell( g(\\langle a_i , x \\rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem\n",
56 | "\n",
57 | "$$ \\min_{x\\in\\mathbb{R}^d} \\frac{1}{m} \\sum_{i=1}^m f_i(x) = \\frac{1}{m} \\sum_{i=1}^m \\ell( g(\\langle a_i , x \\rangle) ; b_i ). $$"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## Binary Classification with Logisitic Regression\n",
65 | "\n",
66 | "In our setup, the observations are binary: $\\mathcal{O} = \\{-1 , +1 \\}$, and the *Logistic loss* is used to form the following optimization problem\n",
67 | "\\begin{align*}\n",
68 | "\\min_{x\\in\\mathbb{R}^d } f(x) := \\frac{1}{m} \\sum_{i=1}^m \\log( 1+\\exp(-b_i \\langle a_i,x \\rangle) ) + \\frac{\\lambda_2}{2} \\|x\\|_2^2.\n",
69 | "\\end{align*}\n",
70 | "where the last term is added as a regularization (of type $\\ell_2$, aka Tikhnov) to prevent overfitting.\n",
71 | "\n",
72 | "Under some statistical hypotheses, $x^\\star = \\arg\\min f(x)$ maximizes the likelihood of the labels knowing the features vector. Then, for a new point $d$ with features vector $a$, \n",
73 | "$$ p_1(a) = \\mathbb{P}[d\\in \\text{ class } +1] = \\frac{1}{1+\\exp(-\\langle a;x^\\star \\rangle)} $$\n",
74 | "\n",
75 | "Thus, from $a$, if $p_1(a)$ is close to $1$, one can decide that $d$ belongs to class $1$; and the opposite decision if $p(a)$ is close to $0$. Between the two, the appreciation is left to the data scientist depending on the application.\n",
76 | "\n",
77 | "## Objective of the optimizer\n",
78 | " \n",
79 | "Given oracles for the function and its gradient, as well as an upper-bound of the Lipschitz constant $L$ of the gradient, find a minimizer of $f$.\n",
80 | " "
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "> You are given *all* oracles of $f$ (function, gradient, Hessian) in `logistic_regression_ionosphere.py` and several algorithms in `algoGradient.py`."
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "### Influence of strong convexity on the speed of the gradient method\n",
95 | "\n",
96 | "\n",
97 | "> Run the following blocks for different values of parameter `lam2` of the problem. What do you notice in terms of speed of convergence, what is the reason?"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from algoGradient import * # import all methods of the module into the current environment\n",
107 | "import numpy as np\n",
108 | "import logistic_regression_ionosphere as pb\n",
109 | "\n",
110 | "\n",
111 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n",
112 | "PREC = 1e-5 # Sought precision\n",
113 | "ITE_MAX = 5000 # Max number of iterations\n",
114 | "x0 = np.zeros(pb.n) # Initial point\n",
115 | "step = 1.0/pb.L\n",
116 | "\n",
117 | "pb.lam2 = 0.1\n",
118 | "\n",
119 | "##### gradient algorithm\n",
120 | "x,x_tab = gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n",
121 | "\n"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "import matplotlib.pyplot as plt\n",
131 | "% matplotlib inline\n",
132 | "\n",
133 | "F = []\n",
134 | "for i in range(x_tab.shape[0]):\n",
135 | " F.append( pb.f(x_tab[i])) \n",
136 | "\n",
137 | "\n",
138 | "plt.figure()\n",
139 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n",
140 | "plt.grid(True)\n",
141 | "plt.legend()\n",
142 | "plt.show()\n"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "# Accelerating poorly conditioned problems\n",
150 | "\n",
151 | "While the addition of strong convexity accelerates the rate in practice, it usually result shift the solutions of the original problem. For a learning problem, it affects the accuracy.\n",
152 | "\n",
153 | "In order to get faster convergences when the rate is slower, several acceleration techniques exist. We are going to present the most common in the following.\n",
154 | "\n",
155 | "### Nesterov's fast gradient\n",
156 | "\n",
157 | "In a series of papers published in the 80's, Yu. Nesterov proposed an acceleration technique in order to make the worst case rate of the gradient algorithm from $\\mathcal{O}(1/k)$ to $\\mathcal{O}(1/k^2)$. This technique is now immensely popular, notably in the machine learning and image processing communities.\n",
158 | " \n",
159 | "\n",
160 | "The iterations of Nesterov's accelerated gradient are as such:\n",
161 | "$$ \\left\\{ \\begin{array}{ll} x_{k+1} = y_k - \\gamma \\nabla f(y_k) \\\\ y_{k+1} = x_{k+1} + \\alpha_{k+1} (x_{k+1} - x_k ) \\end{array} \\right. $$\n",
162 | "with \n",
163 | "$$ \\alpha_{k+1} = \\frac{\\lambda_k -1 }{\\lambda_{k+1}} \\text{ with } \\lambda_0 = 0 \\text{ and } \\lambda_{k+1} = \\frac{1+\\sqrt{1+4\\lambda_k^2}}{2} . $$\n",
164 | " \n",
165 | "Although no clear intuition can be drawn, the extended point can be seen as an extension by inertia of the last points."
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "> Implement Nesterov's fast gradient algorithm in `algoGradient.py`.\n",
173 | "\n",
174 | "> Run the constant stepsize and fast gradient algorithms and compare the convergence rates (for lam2 = 0.001)."
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "from algoGradient import * # import all methods of the module into the current environment\n",
184 | "\n",
185 | "import numpy as np\n",
186 | "import logistic_regression_ionosphere as pb\n",
187 | "\n",
188 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n",
189 | "PREC = 1e-5 # Sought precision\n",
190 | "ITE_MAX = 5000 # Max number of iterations\n",
191 | "x0 = np.zeros(pb.n) # Initial point\n",
192 | "step = 1.0/pb.L\n",
193 | "\n",
194 | "pb.lam2 = 0.001\n",
195 | "\n",
196 | "##### gradient algorithm\n",
197 | "x,x_tab = gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n",
198 | "\n",
199 | "##### fast gradient algorithm\n",
200 | "xF,xF_tab = fast_gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "import matplotlib.pyplot as plt\n",
210 | "% matplotlib inline\n",
211 | "\n",
212 | "F = []\n",
213 | "G = []\n",
214 | "for i in range(x_tab.shape[0]):\n",
215 | " F.append( pb.f(x_tab[i])) \n",
216 | " G.append( np.linalg.norm(pb.f_grad(x_tab[i] )) )\n",
217 | "\n",
218 | "FF = []\n",
219 | "GF = []\n",
220 | "for i in range(xF_tab.shape[0]):\n",
221 | " FF.append( pb.f(xF_tab[i])) \n",
222 | " GF.append( np.linalg.norm(pb.f_grad(xF_tab[i] )) )\n",
223 | "\n",
224 | "plt.figure()\n",
225 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n",
226 | "plt.plot( FF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n",
227 | "plt.grid(True)\n",
228 | "plt.legend()\n",
229 | "plt.show()\n",
230 | "\n",
231 | "\n",
232 | "plt.figure()\n",
233 | "plt.plot( G, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n",
234 | "plt.plot( GF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n",
235 | "plt.yscale('log')\n",
236 | "plt.xscale('log')\n",
237 | "plt.grid(True)\n",
238 | "plt.legend()\n",
239 | "plt.show()\n"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "---\n",
247 | "\n",
248 | "\n",
249 | "### Other methods: line-search, BFGS\n",
250 | "\n",
251 | "\n",
252 | "Other popular methods to accelerate convergence are:\n",
253 | "* line-search (as briefly seen in the previous lab; it is implemented in Section 1.c of the file `algoGradient.py`)\n",
254 | "* BFGS, which is a Quasi-Newton method in the sense that it builds an approximation of second-order information from the first-order information gathered along the iterations. \n",
255 | "\n",
256 | "**BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consists in performing the following iteration\n",
257 | "$$ x_{k+1}=x_k - \\gamma_k W_k \\nabla f(x_k)$$\n",
258 | "where $\\gamma_k$ is given by Wolfe's line-search and the positive definite matrix $W_k$ (an approximation of the inverse Hessian) is updated as\n",
259 | "$$ W_{k+1}=W_k - \\frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\\left[1+\\frac{y_k^T W_k y_k}{y_k^T s_k}\\right]\\frac{s_k s_k^T}{y_k^T s_k} $$\n",
260 | "with $s_k=x_{k+1}-x_{k}$ and $y_k=\\nabla f(x_{k+1}) - \\nabla f(x_{k})$."
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "> Implement BFGS in Section 3 of file `algoGradient.py` .\n",
268 | "\n",
269 | "> Compare the performance of the previously investigated algorithms. *(Note that you can also test Newton's method, although the comparison is somewhat unfair to the first-order algorithms since the problem dimension is small.)*"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "from algoGradient import * # import all methods of the module into the current environment\n",
279 | "\n",
280 | "import numpy as np\n",
281 | "import logistic_regression_ionosphere as pb\n",
282 | "\n",
283 | "#### Parameter we give at our algorithm (see algoGradient.ipynb)\n",
284 | "PREC = 1e-5 # Sought precision\n",
285 | "ITE_MAX = 500 # Max number of iterations\n",
286 | "x0 = np.zeros(pb.n) # Initial point\n",
287 | "step = 1.0/pb.L\n",
288 | "\n",
289 | "##### gradient algorithm\n",
290 | "x,x_tab = gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n",
291 | "\n",
292 | "##### fast gradient algorithm\n",
293 | "xF,xF_tab = fast_gradient_algorithm(pb.f , pb.f_grad , x0 , step , PREC , ITE_MAX )\n",
294 | "\n",
295 | "##### Wolfe line-search algorithm\n",
296 | "xW,xW_tab = gradient_Wolfe(pb.f , pb.f_grad , x0 , PREC , ITE_MAX )\n",
297 | "\n",
298 | "##### BFGS algorithm\n",
299 | "xB,xB_tab = bfgs(pb.f , pb.f_grad , x0 , PREC , ITE_MAX )\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "import matplotlib.pyplot as plt\n",
309 | "%matplotlib inline\n",
310 | "\n",
311 | "F = []\n",
312 | "G = []\n",
313 | "for i in range(x_tab.shape[0]):\n",
314 | " F.append( pb.f(x_tab[i])) \n",
315 | " G.append( np.linalg.norm(pb.f_grad(x_tab[i] )) )\n",
316 | "\n",
317 | "FF = []\n",
318 | "GF = []\n",
319 | "for i in range(xF_tab.shape[0]):\n",
320 | " FF.append( pb.f(xF_tab[i])) \n",
321 | " GF.append( np.linalg.norm(pb.f_grad(xF_tab[i] )) )\n",
322 | " \n",
323 | "FW = []\n",
324 | "GW = []\n",
325 | "for i in range(xW_tab.shape[0]):\n",
326 | " FW.append( pb.f(xW_tab[i])) \n",
327 | " GW.append( np.linalg.norm(pb.f_grad(xW_tab[i] )) )\n",
328 | " \n",
329 | " \n",
330 | "FB = []\n",
331 | "GB = []\n",
332 | "for i in range(xB_tab.shape[0]):\n",
333 | " FB.append( pb.f(xB_tab[i])) \n",
334 | " GB.append( np.linalg.norm(pb.f_grad(xB_tab[i] )) )\n",
335 | "\n",
336 | "plt.figure()\n",
337 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n",
338 | "plt.plot( FF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n",
339 | "plt.plot( FW, color=\"magenta\", linewidth=1.0, linestyle=\"-\",label='Wolfe')\n",
340 | "plt.plot( FB, color=\"green\", linewidth=1.0, linestyle=\"-\",label='BFGS')\n",
341 | "plt.grid(True)\n",
342 | "plt.legend()\n",
343 | "plt.show()\n",
344 | "\n",
345 | "\n",
346 | "plt.figure()\n",
347 | "plt.plot( G, color=\"black\", linewidth=1.0, linestyle=\"-\",label='gradient')\n",
348 | "plt.plot( GF, color=\"red\", linewidth=1.0, linestyle=\"-\",label='fast gradient')\n",
349 | "plt.plot( GW, color=\"magenta\", linewidth=1.0, linestyle=\"-\",label='Wolfe')\n",
350 | "plt.plot( GB, color=\"green\", linewidth=1.0, linestyle=\"-\",label='BFGS')\n",
351 | "plt.yscale('log')\n",
352 | "plt.xscale('log')\n",
353 | "plt.grid(True)\n",
354 | "plt.legend()\n",
355 | "plt.show()\n"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {},
361 | "source": [
362 | "---\n",
363 | "\n",
364 | "# Performance on learning problems\n",
365 | "\n",
366 | "### Prediction power\n",
367 | "\n",
368 | "\n",
369 | "\n",
370 | "Our problem of interest is binary classification using logistic regression.\n",
371 | "Although this is a machine learning task, constructing the predictor amounts to minimizing a smooth convex function $f$ called the *loss*. The obtained minimizer is called a *predictor*: its scalar product with a feature vector is turned into a probability of belonging to class $1$.\n",
372 | "\n",
373 | "The previous comparison was based on the decrease of the objective function, whereas our actual task is binary classification. Let us look at the final accuracies obtained.\n",
374 | "\n",
375 | "> The file `logistic_regression_ionosphere.py` contains a `prediction` function that takes a *predictor* and returns its accuracy. Take a look at how this function is defined.\n",
376 | "\n",
377 | "> Observe the accuracy of all final points obtained before. What do you notice? "
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "pred,perf = pb.prediction(x,PRINT=False)\n",
387 | "print(\"Gradient algorithm: \\t{:.2f}%\".format(perf*100))\n",
388 | "\n",
389 | "predF,perfF = pb.prediction(xF,PRINT=False)\n",
390 | "print(\"Fast Gradient: \\t\\t{:.2f}%\".format(perfF*100))\n",
391 | "\n",
392 | "predW,perfW = pb.prediction(xW,PRINT=False)\n",
393 | "print(\"Wolfe: \\t\\t\\t{:.2f}%\".format(perfW*100))\n",
394 | "\n",
395 | "predB,perfB = pb.prediction(xB,PRINT=False)\n",
396 | "print(\"BFGS: \\t\\t\\t{:.2f}%\".format(perfB*100))"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {
403 | "scrolled": true
404 | },
405 | "outputs": [],
406 | "source": [
407 | "predF,perfF = pb.prediction(xF,PRINT=True)"
408 | ]
409 | }
410 | ],
411 | "metadata": {
412 | "kernelspec": {
413 | "display_name": "Python 3",
414 | "language": "python",
415 | "name": "python3"
416 | },
417 | "language_info": {
418 | "codemirror_mode": {
419 | "name": "ipython",
420 | "version": 3
421 | },
422 | "file_extension": ".py",
423 | "mimetype": "text/x-python",
424 | "name": "python",
425 | "nbconvert_exporter": "python",
426 | "pygments_lexer": "ipython3",
427 | "version": "3.7.5"
428 | }
429 | },
430 | "nbformat": 4,
431 | "nbformat_minor": 1
432 | }
433 |
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/algoGradient.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Gradient-based algorithms
5 | #
6 | # In this notebook, we code our gradient-based optimization algorithms.
7 |
8 | # # 1. Gradient algorithms
9 | #
10 | # For minimizing a differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
11 | # * the function to minimize `f`
12 | # * a 1st order oracle `f_grad` (see `problem1.ipynb` for instance)
13 | # * an initialization point `x0`
14 | # * the sought precision `PREC`
15 | # * a maximal number of iterations `ITE_MAX`
16 | #
17 | #
18 | # these algorithms perform iterations of the form
19 | # $$ x_{k+1} = x_k - \gamma_k \nabla f(x_k) $$
20 | # where $\gamma_k$ is a stepsize to choose.
21 |
22 | # ### 1.a. Constant stepsize gradient algorithm
23 | #
24 |
25 |
26 | import numpy as np
27 | import timeit
28 |
29 | def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
30 | x = np.copy(x0)
31 | stop = PREC*np.linalg.norm(f_grad(x0) )
32 |
33 | x_tab = np.copy(x)
34 | print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
35 | t_s = timeit.default_timer()
36 | for k in range(ITE_MAX):
37 | g = f_grad(x)
38 | x = x - step*g
39 |
40 | x_tab = np.vstack((x_tab,x))
41 |
42 | if np.linalg.norm(g) < stop:
43 | break
44 | t_e = timeit.default_timer()
45 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
46 | return x,x_tab
47 |
48 |
49 | # ### 1.b. Adaptive stepsize gradient algorithm
50 | #
51 |
52 | import numpy as np
53 | import timeit
54 |
55 |
56 | def gradient_adaptive_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
57 | x = np.copy(x0)
58 | stop = PREC*np.linalg.norm(f_grad(x0) )
59 |
60 | x_tab = np.copy(x)
61 |     print("------------------------------------\nAdaptive Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
62 | t_s = timeit.default_timer()
63 | for k in range(ITE_MAX):
64 |
65 | g = f_grad(x)
66 | x_prev = np.copy(x)
67 |
68 | x = x - step*g ####### ITERATION
69 |
70 | if f(x)>f(x_prev):
71 | x = np.copy(x_prev)
72 | step = step/2
73 | print("stepsize: = {:0}".format(step))
74 |
75 | x_tab = np.vstack((x_tab,x))
76 |
77 | if np.linalg.norm(g) < stop:
78 | break
79 | t_e = timeit.default_timer()
80 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
81 | return x,x_tab
82 |
83 |
84 | # ### 1.c. Wolfe Line search
85 | #
86 | #
87 |
88 |
89 |
90 | import numpy as np
91 | import timeit
92 | from scipy.optimize import line_search
93 |
94 | def gradient_Wolfe(f , f_grad , x0 , PREC , ITE_MAX ):
95 | x = np.copy(x0)
96 | stop = PREC*np.linalg.norm(f_grad(x0) )
97 |
98 | x_tab = np.copy(x)
99 | print("------------------------------------\n Gradient with Wolfe line search\n------------------------------------\nSTART")
100 | t_s = timeit.default_timer()
101 | for k in range(ITE_MAX):
102 | g = f_grad(x)
103 |
104 | res = line_search(f, f_grad, x, -g, gfk=None, old_fval=None, old_old_fval=None, args=(), c1=0.0001, c2=0.9, amax=50)
105 |
106 | x = x - res[0]*g
107 |
108 | x_tab = np.vstack((x_tab,x))
109 |
110 | if np.linalg.norm(g) < stop:
111 | break
112 | t_e = timeit.default_timer()
113 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
114 | return x,x_tab
115 |
116 |
117 | # ### 1.d. Nesterov's Fast gradient algorithm
118 | #
119 | # In a series of papers published in the 1980s, Yu. Nesterov proposed an acceleration technique that improves the worst-case rate of the gradient algorithm on smooth convex functions from $\mathcal{O}(1/k)$ to $\mathcal{O}(1/k^2)$. This technique is now immensely popular, notably in the machine learning and image processing communities.
120 | #
121 | # The iterations of Nesterov's accelerated gradient are as follows:
122 | # $$ \left\{ \begin{array}{ll} x_{k+1} = y_k - \gamma \nabla f(y_k) \\ y_{k+1} = x_{k+1} + \alpha_{k+1} (x_{k+1} - x_k ) \end{array} \right. $$
123 | # with
124 | # $$ \alpha_{k+1} = \frac{\lambda_k -1 }{\lambda_{k+1}} \text{ with } \lambda_0 = 0 \text{ and } \lambda_{k+1} = \frac{1+\sqrt{1+4\lambda_k^2}}{2} . $$
125 | #
126 | # Although this scheme is not easy to interpret, the extrapolated point $y_{k+1}$ can be seen as adding inertia (momentum): it extends the move from the previous iterate $x_k$ to the new one $x_{k+1}$.
127 | #
128 | #
129 | # Q. Fill the function below accordingly.
130 |
131 |
132 |
133 | import numpy as np
134 | import timeit
135 |
136 | def fast_gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
137 | x = np.copy(x0)
138 | y = np.copy(x0)
139 | stop = PREC*np.linalg.norm(f_grad(x0) )
140 |
141 |
142 |
143 | x_tab = np.copy(x)
144 | print("------------------------------------\n Fast gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
145 | t_s = timeit.default_timer()
146 | for k in range(ITE_MAX):
147 | g = f_grad(x)
148 | # TO FILL
149 |
150 | x_tab = np.vstack((x_tab,x))
151 |
152 | if np.linalg.norm(g) < stop:
153 | break
154 | t_e = timeit.default_timer()
155 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
156 | return x,x_tab
157 |
158 |
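
# A possible way to fill the `# TO FILL` part above -- only a sketch, not necessarily the
# intended solution. The function name `fast_gradient_algorithm_sketch` is introduced here
# for illustration; it follows the (lambda_k) recursion given in the comments above, with
# lambda_0 = 0, and takes the gradient at the extrapolated point y_k.

import numpy as np
import timeit

def fast_gradient_algorithm_sketch(f , f_grad , x0 , step , PREC , ITE_MAX ):
    x = np.copy(x0)
    y = np.copy(x0)
    lam = 0.0                                   # lambda_0
    stop = PREC*np.linalg.norm(f_grad(x0) )

    x_tab = np.copy(x)
    print("------------------------------------\n Fast gradient (sketch)\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
    t_s = timeit.default_timer()
    for k in range(ITE_MAX):
        g = f_grad(y)                           # gradient at the extrapolated point y_k
        x_prev = np.copy(x)

        x = y - step*g                          # x_{k+1} = y_k - gamma grad f(y_k)
        lam_next = (1.0 + np.sqrt(1.0 + 4.0*lam**2))/2.0
        alpha = (lam - 1.0)/lam_next
        y = x + alpha*(x - x_prev)              # y_{k+1} = x_{k+1} + alpha_{k+1} (x_{k+1} - x_k)
        lam = lam_next

        x_tab = np.vstack((x_tab,x))

        if np.linalg.norm(g) < stop:
            break
    t_e = timeit.default_timer()
    print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
    return x,x_tab
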
159 | # # 2. Second Order algorithms
160 | #
161 | # For minimizing a *twice* differentiable function $f:\mathbb{R}^n \to \mathbb{R}$, given:
162 | # * the function to minimize `f`
163 | # * a 2nd order oracle `f_grad_hessian` (see `problem1.ipynb` for instance)
164 | # * an initialization point `x0`
165 | # * the sought precision `PREC`
166 | # * a maximal number of iterations `ITE_MAX`
167 | #
168 | #
169 | # these algorithms perform iterations of the form
170 | # $$ x_{k+1} = x_k - [\nabla^2 f(x_k) ]^{-1} \nabla f(x_k) .$$
171 |
172 |
173 |
174 | import numpy as np
175 | import timeit
176 |
177 | def newton_algorithm(f , f_grad_hessian , x0 , PREC , ITE_MAX ):
178 | x = np.copy(x0)
179 | g0,H0 = f_grad_hessian(x0)
180 | stop = PREC*np.linalg.norm(g0 )
181 |
182 | x_tab = np.copy(x)
183 | print("------------------------------------\nNewton's algorithm\n------------------------------------\nSTART")
184 | t_s = timeit.default_timer()
185 | for k in range(ITE_MAX):
186 |
187 | g,H = f_grad_hessian(x)
188 | x = x - np.linalg.solve(H,g)
189 |
190 | x_tab = np.vstack((x_tab,x))
191 |
192 | if np.linalg.norm(g) < stop:
193 | break
194 | t_e = timeit.default_timer()
195 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
196 | return x,x_tab
197 |
198 |
199 | # # 3. Quasi Newton algorithms
200 | #
201 | # **BFGS.** (Broyden-Fletcher-Goldfarb-Shanno, 1970) The popular BFGS algorithm consists in performing the following iteration
202 | # $$ x_{k+1}=x_k - \gamma_k W_k \nabla f(x_k)$$
203 | # where $\gamma_k$ is given by Wolfe's line-search and the positive definite matrix $W_k$ (an approximation of the inverse Hessian) is updated as
204 | # $$ W_{k+1}=W_k - \frac{s_k y_k^T W_k+W_k y_k s_k^T}{y_k^T s_k} +\left[1+\frac{y_k^T W_k y_k}{y_k^T s_k}\right]\frac{s_k s_k^T}{y_k^T s_k} $$
205 | # with $s_k=x_{k+1}-x_{k}$ and $y_k=\nabla f(x_{k+1}) - \nabla f(x_{k})$.
206 |
207 | # Q. Implement BFGS
208 |
209 | import numpy as np
210 | import timeit
211 | from scipy.optimize import line_search
212 |
213 | def bfgs(f , f_grad , x0 , PREC , ITE_MAX ):
214 | x = np.copy(x0)
215 | n = x0.size
216 | g = f_grad(x0)
217 | sim_eval = 1
218 | stop = PREC*np.linalg.norm( g )
219 |
220 | W = np.eye(n)
221 |
222 | x_tab = np.copy(x)
223 | print("------------------------------------\n BFGS\n------------------------------------\nSTART")
224 | t_s = timeit.default_timer()
225 | for k in range(ITE_MAX):
226 |
227 | x = x # To fill
228 |
229 | x_tab = np.vstack((x_tab,x))
230 |
231 | t_e = timeit.default_timer()
232 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
233 | return x,x_tab
234 |
235 |
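
# A possible BFGS sketch, using the inverse-Hessian update formula above together with
# scipy's Wolfe line search -- only an illustration, not necessarily the intended solution.
# The function name `bfgs_sketch`, the curvature safeguard and the fallback stepsize used
# when the line search fails are choices made here for the sketch.

import numpy as np
import timeit
from scipy.optimize import line_search

def bfgs_sketch(f , f_grad , x0 , PREC , ITE_MAX ):
    x = np.copy(x0)
    n = x0.size
    g = f_grad(x0)
    stop = PREC*np.linalg.norm( g )

    W = np.eye(n)                               # initial inverse-Hessian approximation

    x_tab = np.copy(x)
    print("------------------------------------\n BFGS (sketch)\n------------------------------------\nSTART")
    t_s = timeit.default_timer()
    for k in range(ITE_MAX):

        d = -W.dot(g)                           # quasi-Newton direction
        res = line_search(f, f_grad, x, d, gfk=g, c1=0.0001, c2=0.9, amax=50)
        gamma = res[0] if res[0] is not None else 1e-4   # fallback if the Wolfe search fails

        x_new = x + gamma*d
        g_new = f_grad(x_new)

        s = x_new - x                           # s_k = x_{k+1} - x_k
        y = g_new - g                           # y_k = grad f(x_{k+1}) - grad f(x_k)
        ys = np.dot(y, s)
        if ys > 1e-12:                          # update W only if the curvature condition holds
            W = ( W - (np.outer(s, y.dot(W)) + np.outer(W.dot(y), s))/ys
                    + (1.0 + np.dot(y, W.dot(y))/ys)*np.outer(s, s)/ys )

        x , g = x_new , g_new
        x_tab = np.vstack((x_tab,x))

        if np.linalg.norm(g) < stop:
            break
    t_e = timeit.default_timer()
    print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,f(x)))
    return x,x_tab
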
--------------------------------------------------------------------------------
/Lab5_MachineLearningExample/logistic_regression_ionosphere.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Logistic Regression Problem
5 | #
6 | #
7 | #
8 | # ### Machine Learning as an Optimization problem
9 | #
10 | # We have some *data* $\mathcal{D}$ consisting of $m$ *examples* $\{d_i\}$; each example consisting of a *feature* vector $a_i\in\mathbb{R}^d$ and an *observation* $b_i\in \mathcal{O}$: $\mathcal{D} = \{[a_i,b_i]\}_{i=1..m}$. In this lab, we will consider the ionosphere dataset.
11 | #
12 | #
13 | # The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors.
14 | #
15 | #
16 | # A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\langle a_i , x \rangle$ is used to predict the value of the observation through a *predictor function* $g:\mathbb{R}\to \mathcal{O}$: $g(\langle a_i , x \rangle)$ is the predicted value from $a_i$.
17 | #
18 | #
19 | # In order to find such a parameter, we use the available data and a *loss* $\ell$ that penalizes the error made between the predicted $g(\langle a_i , x \rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \ell( g(\langle a_i , x \rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem
20 | # $$ \min_{x\in\mathbb{R}^d} \frac{1}{m} \sum_{i=1}^m f_i(x) = \frac{1}{m} \sum_{i=1}^m \ell( g(\langle a_i , x \rangle) ; b_i ). $$
21 | #
22 | #
23 | # ### Binary Classification with Logistic Regression
24 | #
25 | # In our setup, the observations are binary: $\mathcal{O} = \{-1 , +1 \}$, and the *Logistic loss* is used to form the following optimization problem
26 | # \begin{align*}
27 | # \min_{x\in\mathbb{R}^d } f(x) := \frac{1}{m} \sum_{i=1}^m \log( 1+\exp(-b_i \langle a_i,x \rangle) ) + \frac{\lambda}{2} \|x\|_2^2.
28 | # \end{align*}
29 | # where the last term is added as a regularization (of type $\ell_2$, aka Tikhonov) with parameter $\lambda>0$ (`lam2` in the code) to prevent overfitting.
30 | #
31 | # Under some statistical hypotheses, $x^\star = \arg\min f(x)$ maximizes the likelihood of the labels given the feature vectors. Then, for a new point $d$ with feature vector $a$,
32 | # $$ p_1(a) = \mathbb{P}[d\in \text{ class } +1] = \frac{1}{1+\exp(-\langle a;x^\star \rangle)} $$
33 | # Thus, from $a$, if $p_1(a)$ is close to $1$, one can decide that $d$ belongs to class $+1$; and take the opposite decision if $p_1(a)$ is close to $0$. In between, the decision is left to the data scientist, depending on the application.
34 | #
35 | #
36 | # # Objective of the optimizer
37 | #
38 | # Given oracles for the function and its gradient, as well as an upper-bound of the Lipschitz constant $L$ of the gradient, find a minimizer of $f$.
39 | #
40 |
41 | # ### Function definition
42 |
43 |
44 |
45 | import numpy as np
46 | import csv
47 | from sklearn import preprocessing
48 |
49 | file = open('ionosphere.data')
50 |
51 | d = 34
52 | n = d+1 # Variable size + intercept
53 |
54 | m = 351 # Number of examples
55 |
56 | lam2 = 0.001 # regularization best:0.001
57 |
58 | A = np.zeros((m,d))
59 | b = np.zeros(m)
60 |
61 | reader = csv.reader(file, delimiter=',')
62 | i = 0
63 | for row in reader:
64 | A[i] = np.array(row[:d])
65 | if row[d] == 'b':
66 | b[i] = -1.0
67 | else:
68 | b[i] = 1.0
69 | i+=1
70 |
71 | scaler = preprocessing.StandardScaler().fit(A)
72 | A = scaler.transform(A)
73 |
74 | # Adding an intercept
75 | A_inter = np.ones((m,n))
76 | A_inter[:,:-1] = A
77 | A = A_inter
78 |
79 |
80 | L = 0.25*max(np.linalg.norm(A,2,axis=1))**2 + lam2
81 |
82 |
83 | # ## Oracles
84 |
85 |
86 |
87 |
88 | def f(x):
89 | l = 0.0
90 | for i in range(A.shape[0]):
91 | if b[i] > 0 :
92 | l += np.log( 1 + np.exp(-np.dot( A[i] , x ) ) )
93 | else:
94 | l += np.log( 1 + np.exp(np.dot( A[i] , x ) ) )
95 | return l/m + lam2/2.0*np.dot(x,x)
96 |
97 | def f_grad(x):
98 | g = np.zeros(n)
99 | for i in range(A.shape[0]):
100 | if b[i] > 0:
101 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) )
102 | else:
103 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) )
104 | return g/m + lam2*x
105 |
106 | def f_grad_hessian(x):
107 | g = np.zeros(n)
108 | H = np.zeros((n,n))
109 | for i in range(A.shape[0]):
110 | if b[i] > 0:
111 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) )
112 | H += (np.exp(np.dot( A[i] , x ))/( 1 + np.exp(np.dot( A[i] , x ) ) )**2)*np.outer(A[i],A[i])
113 | else:
114 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) )
115 | H += (np.exp(-np.dot( A[i] , x ))/( 1 + np.exp(-np.dot( A[i] , x ) ) )**2)*np.outer(A[i],A[i])
116 | g = g/m + lam2*x
117 | H = H/m + lam2*np.eye(n)
118 | return g,H
119 |
120 |
121 | # ## Prediction Function
122 |
123 |
124 |
125 | def prediction(w,PRINT=False):
126 | pred = np.zeros(A.shape[0])
127 | perf = 0
128 | for i in range(A.shape[0]):
129 | p = 1.0/( 1 + np.exp(-np.dot( A[i] , w ) ) )
130 | if p>0.5:
131 | pred[i] = 1.0
132 | if b[i]>0:
133 | correct = "True"
134 | perf += 1
135 | else:
136 | correct = "False"
137 | if PRINT:
138 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct))
139 | else:
140 | pred[i] = -1.0
141 | if b[i]<0:
142 | correct = "True"
143 | perf += 1
144 | else:
145 | correct = "False"
146 | if PRINT:
147 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct))
148 | return pred,float(perf)/A.shape[0]
149 |
150 |
151 |
--------------------------------------------------------------------------------
/Lab6_LPQP/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/1.png
--------------------------------------------------------------------------------
/Lab6_LPQP/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/2.png
--------------------------------------------------------------------------------
/Lab6_LPQP/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/3.png
--------------------------------------------------------------------------------
/Lab6_LPQP/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/4.png
--------------------------------------------------------------------------------
/Lab6_LPQP/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/5.png
--------------------------------------------------------------------------------
/Lab6_LPQP/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab6_LPQP/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab6_LPQP/toy_problem.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # A toy problem
5 |
6 | # We consider the first illustrative example of the original paper
7 | #
8 | # Candes, E., Tao, T. "The Dantzig selector: Statistical estimation when p is much larger than n".
9 | # The Annals of Statistics, 2007
10 |
11 | # In this first example, the design matrix $X$ has $m = 72$ rows and $n = 256$ columns, with independent Gaussian entries (then normalized so that the columns have unit norm). We then select $\theta$ with $S := |\{i : \theta_i \neq 0\}| = 8$ non-zero entries, and form $y = X\theta + \xi$, where the $\xi_i$’s are i.i.d. $\mathcal{N}(0, \sigma^2 )$. The noise level is adjusted so that
12 | # $$ \sigma = \frac{1}{3} \sqrt{\frac{S}{m}} .$$
13 |
14 | # ### Problem
15 |
16 |
17 | import numpy as np
18 |
19 | # Parameters
20 | m = 72
21 | n = 256
22 |
23 | S = 8
24 |
25 | sigma = 1/3.0 * np.sqrt(S/float(m))
26 |
27 | # X creation
28 | X = np.random.randn(m, n)
29 |
30 | n_col = np.linalg.norm(X, axis=0)
31 | X = np.dot(X,np.diag(1/n_col)) # Normalization per column [Get rid of it for the "To go further" part!]
32 |
33 | # theta creation
34 | theta = np.zeros(n)
35 | non_null = np.random.choice(n, S, replace=False) # draw S distinct indices for the non-zero entries
36 | theta[non_null] = np.random.randn(S)
37 |
38 |
39 | # y creation
40 | y = np.dot(X,theta) + sigma*np.random.randn(m)
41 |
42 |
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/1.png
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/2.png
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/3.png
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/4.png
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/5.png
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/Lab7_StochMethods.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year\n",
8 | "\n",
9 | "Numerical Optimization\n",
10 | "\n",
11 | "Lab 7: Variance-Reduced Stochastic Gradient"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "---"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "%load_ext autoreload\n",
28 | "%autoreload 2"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "# Logistic Regression Problem\n",
36 | " \n",
37 | " \n",
38 | " \n",
39 | "### Machine Learning as an Optimization problem\n",
40 | " \n",
41 | "We have some *data* $\\mathcal{D}$ consisting of $m$ *examples* $\\{d_i\\}$; each example consisting of a *feature* vector $a_i\\in\\mathbb{R}^d$ and an *observation* $b_i\\in \\mathcal{O}$: $\\mathcal{D} = \\{[a_i,b_i]\\}_{i=1..m}$. In this lab, we will consider the student performance dataset.\n",
42 | " \n",
43 | " \n",
44 | "The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors.\n",
45 | " \n",
46 | " \n",
47 | " A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\\langle a_i , x \\rangle$ is used to predict the value of the observation through a *predictor function* $g:\\mathbb{R}\\to \\mathcal{O}$: $g(\\langle a_i , x \\rangle)$ is the predicted value from $a_i$.\n",
48 | " \n",
49 | " \n",
50 | " In order to find such a parameter, we use the available data and a *loss* $\\ell$ that penalizes the error made between the predicted $g(\\langle a_i , x \\rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \\ell( g(\\langle a_i , x \\rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem\n",
51 | " $$ \\min_{x\\in\\mathbb{R}^d} \\frac{1}{m} \\sum_{i=1}^m f_i(x) = \\frac{1}{m} \\sum_{i=1}^m \\ell( g(\\langle a_i , x \\rangle) ; b_i ). $$\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | "### Regularized Problem \n",
56 | " \n",
57 | "In this lab, we will consider an $\\ell_1$ regularization to promote sparsity of the iterates. A sparse final solution selects the most important features. The new objective (below) is non-smooth, but it has a smooth part $f$ and a non-smooth part $g$ that we will handle with proximal operations.\n",
58 | " \n",
59 | " \\begin{align*}\n",
60 | " \\min_{x\\in\\mathbb{R}^d } F(x) := \\underbrace{\\frac{1}{m} \\sum_{i=1}^m \\overbrace{ \\log( 1+\\exp(-b_i \\langle a_i,x \\rangle) ) + \\frac{\\lambda_2}{2} \\|x\\|_2^2 }^{f_i(x)} }_{f(x)} + \\underbrace{\\lambda_1 \\|x\\|_1 }_{g(x)}.\n",
61 | " \\end{align*}\n",
62 | " \n",
63 | " \n"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "\n",
71 | "\n",
72 | "# Recall of the proximal gradient algorithm\n",
73 | "\n"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from algoProx import * # import all methods of the module into the current environment\n",
83 | "import numpy as np\n",
84 | "import logistic_regression_student as pb\n",
85 | "\n",
86 | "#### Parameters we give to our algorithm (see algoProx.py)\n",
87 | "PREC = 1e-5 # Sought precision\n",
88 | "ITE_MAX = 1000 # Max number of iterations\n",
89 | "x0 = np.zeros(pb.n) # Initial point\n",
90 | "step = 1.0/pb.L\n",
91 | "\n",
92 | "##### gradient algorithm\n",
93 | "x,x_tab = proximal_gradient_algorithm(pb.F , pb.f_grad , pb.g_prox , x0 , step , PREC, ITE_MAX , True)\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "## Decrease of the objective along the iterations."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "import matplotlib.pyplot as plt\n",
110 | "%matplotlib inline\n",
111 | "\n",
112 | "F = []\n",
113 | "for i in range(x_tab.shape[0]):\n",
114 | " F.append( pb.F(x_tab[i])) \n",
115 | "\n",
116 | "plt.figure()\n",
117 | "plt.plot( F, color=\"black\", linewidth=1.0, linestyle=\"-\")\n",
118 | "plt.grid(True)\n",
119 | "plt.show()"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "### Support of the vector $x_k$ \n"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "plt.figure()\n",
136 | "\n",
137 | "for i in np.arange(0,x_tab.shape[0],int(x_tab.shape[0]/40)):\n",
138 | " for j in range(pb.n):\n",
139 | " if np.abs(x_tab[i,j])>1e-14:\n",
140 | " plt.plot( i , j , 'ko')\n",
141 | "\n",
142 | "plt.grid(True)\n",
143 | "plt.ylabel('Non-null Coordinates')\n",
144 | "plt.xlabel('Nb. Iterations')\n",
145 | "plt.ylim(-1,pb.d+1)\n",
146 | "plt.yticks(np.arange(0,pb.d+1))\n",
147 | "plt.show()"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "---\n",
155 | "\n",
156 | "# Stochastic gradient \n",
157 | "\n",
158 | "\n",
159 | "In the following, instead of considering $f$ as a whole, we will use its structure \n",
160 | "$$ f(x) := \\frac{1}{m}\\sum_{i=1}^m f_i(x)$$\n",
161 | "\n",
162 | "> Implement the gradient of a single $f_i$ (i.e. the gradient related to one example) in `logistic_regression_student.py`\n",
163 | "\n",
164 | "With this structure, a popular minimization algorithm is the *(proximal) stochastic gradient algorithm*, whose iterations write as follows:\n",
165 | "* Select $i$ uniformly at random in $\\{1,\\dots,m\\}$\n",
166 | "* $x_{k+1} = \\mathbf{prox}_{\\gamma_k g}\\left( x_k - \\gamma_k \\nabla f_i(x_k) \\right) $\n",
167 | "\n",
168 | "> Implement this algorithm with a stepsize vanishing as $1/k$ *(a possible sketch is given after the empty cell below)*"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
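{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A possible sketch of the proximal stochastic gradient algorithm described above.\n",
"# Assumptions: pb.f_grad_ex(x, i) has been filled in, and the 1/k scaling of the stepsize\n",
"# below is one arbitrary choice among the admissible vanishing stepsizes.\n",
"\n",
"import numpy as np\n",
"\n",
"x_sgd = np.copy(x0)\n",
"x_sgd_tab = np.copy(x_sgd)\n",
"gamma0 = 1.0/pb.L\n",
"\n",
"for k in range(1, ITE_MAX + 1):\n",
"    i = np.random.randint(pb.m)      # pick one example uniformly at random\n",
"    gamma = gamma0/k                 # vanishing stepsize in 1/k\n",
"    x_sgd = pb.g_prox(x_sgd - gamma*pb.f_grad_ex(x_sgd, i), gamma)\n",
"    x_sgd_tab = np.vstack((x_sgd_tab, x_sgd))"
]
},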
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "### Variance reduction\n",
183 | "\n",
184 | "The poor performance of this algorithm is notably due to the variance of the stochastic gradient estimates. In order to overcome it, *variance-reduced* algorithms have been proposed.\n",
185 | "\n",
186 | "We will consider here the popular **SAGA** algorithm (*SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives*,\n",
187 | "A. Defazio, F. Bach, S. Lacoste-Julien, NIPS 2014).\n",
188 | "\n",
189 | "> Implement SAGA from the paper ( http://papers.nips.cc/paper/5258-saga-a-fast-incremental-gradient-method-with-support-for-non-strongly-convex-composite-objectives ) and compare with the stochastic gradient algorithm.\n"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": []
198 | }
199 | ],
200 | "metadata": {
201 | "kernelspec": {
202 | "display_name": "Python 3 (ipykernel)",
203 | "language": "python",
204 | "name": "python3"
205 | },
206 | "language_info": {
207 | "codemirror_mode": {
208 | "name": "ipython",
209 | "version": 3
210 | },
211 | "file_extension": ".py",
212 | "mimetype": "text/x-python",
213 | "name": "python",
214 | "nbconvert_exporter": "python",
215 | "pygments_lexer": "ipython3",
216 | "version": "3.9.13"
217 | }
218 | },
219 | "nbformat": 4,
220 | "nbformat_minor": 4
221 | }
222 |
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/algoProx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Proximal algorithms
5 | #
6 | # In this notebook, we code our proximal optimization algorithms.
7 |
8 | # # 1. Proximal Gradient algorithm
9 | #
10 | # For minimizing a function $F:\mathbb{R}^n \to \mathbb{R}$ equal to $f+g$ where $f$ is differentiable and the $\mathbf{prox}$ of $g$ is known, given:
11 | # * the function to minimize `F`
12 | # * a 1st order oracle for $f$ `f_grad`
13 | # * a proximity operator for $g$ `g_prox`
14 | # * an initialization point `x0`
15 | # * the sought precision `PREC`
16 | # * a maximal number of iterations `ITE_MAX`
17 | # * a display boolean variable `PRINT`
18 | #
19 | # these algorithms perform iterations of the form
20 | # $$ x_{k+1} = \mathbf{prox}_{\gamma g}\left( x_k - \gamma \nabla f(x_k) \right) $$
21 | # where $\gamma$ is a stepsize to choose.
22 |
23 | #
24 | #
25 | # Q. How would you implement the precision stopping criterion?
26 |
27 |
28 |
29 | import numpy as np
30 | import timeit
31 |
32 | def proximal_gradient_algorithm(F , f_grad , g_prox , x0 , step , PREC , ITE_MAX , PRINT ):
33 | x = np.copy(x0)
34 | x_tab = np.copy(x)
35 | if PRINT:
36 | print("------------------------------------\n Proximal gradient algorithm\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
37 | t_s = timeit.default_timer()
38 | for k in range(ITE_MAX):
39 | g = f_grad(x)
40 | x = g_prox(x - step*g , step) ####### ITERATION
41 |
42 | x_tab = np.vstack((x_tab,x))
43 |
44 |
45 | t_e = timeit.default_timer()
46 | if PRINT:
47 | print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f}\n\n".format(k,t_e-t_s,F(x)))
48 | return x,x_tab
49 |
50 |
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/logistic_regression_student.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Logistic Regression Problem
5 | #
6 | #
7 | #
8 | # ### Machine Learning as an Optimization problem
9 | #
10 | # We have some *data* $\mathcal{D}$ consisting of $m$ *examples* $\{d_i\}$; each example consisting of a *feature* vector $a_i\in\mathbb{R}^d$ and an *observation* $b_i\in \mathcal{O}$: $\mathcal{D} = \{[a_i,b_i]\}_{i=1..m}$. In this lab, we will consider the student performance dataset.
11 | #
12 | #
13 | # The goal of *supervised learning* is to construct a predictor for the observations when given feature vectors.
14 | #
15 | #
16 | # A popular approach is based on *linear models* which are based on finding a *parameter* $x$ such that the real number $\langle a_i , x \rangle$ is used to predict the value of the observation through a *predictor function* $g:\mathbb{R}\to \mathcal{O}$: $g(\langle a_i , x \rangle)$ is the predicted value from $a_i$.
17 | #
18 | #
19 | # In order to find such a parameter, we use the available data and a *loss* $\ell$ that penalizes the error made between the predicted $g(\langle a_i , x \rangle)$ and observed $b_i$ values. For each example $i$, the corresponding error function for a parameter $x$ is $f_i(x) = \ell( g(\langle a_i , x \rangle) ; b_i )$. Using the whole data, the parameter that minimizes the total error is the solution of the minimization problem
20 | # $$ \min_{x\in\mathbb{R}^d} \frac{1}{m} \sum_{i=1}^m f_i(x) = \frac{1}{m} \sum_{i=1}^m \ell( g(\langle a_i , x \rangle) ; b_i ). $$
21 | #
22 | #
23 | #
24 | # # Regularized Problem
25 | #
26 | # In this lab, we will consider an $\ell_1$ regularization to promote sparsity of the iterates. A sparse final solution selects the most important features. The new objective (below) is non-smooth, but it has a smooth part $f$, the same as in Lab3, and a non-smooth part $g$ that we will handle with proximal operations.
27 | #
28 | # \begin{align*}
29 | # \min_{x\in\mathbb{R}^d } F(x) := \underbrace{ \frac{1}{m} \sum_{i=1}^m \log( 1+\exp(-b_i \langle a_i,x \rangle) ) + \frac{\lambda_2}{2} \|x\|_2^2}_{f(x)} + \underbrace{\lambda_1 \|x\|_1 }_{g(x)}.
30 | # \end{align*}
31 | #
32 | #
33 |
34 |
35 | # ### Function definition
36 |
37 |
38 |
39 | import numpy as np
40 | import csv
41 | from sklearn import preprocessing
42 |
43 | #### File reading
44 | dat_file = np.load('student.npz')
45 | A = dat_file['A_learn']
46 | final_grades = dat_file['b_learn']
47 | m = final_grades.size
48 | b = np.zeros(m)
49 | for i in range(m):
50 | if final_grades[i]>11:
51 | b[i] = 1.0
52 | else:
53 | b[i] = -1.0
54 |
55 | A_test = dat_file['A_test']
56 | final_grades_test = dat_file['b_test']
57 | m_test = final_grades_test.size
58 | b_test = np.zeros(m_test)
59 | for i in range(m_test):
60 | if final_grades_test[i]>11:
61 | b_test[i] = 1.0
62 | else:
63 | b_test[i] = -1.0
64 |
65 |
66 | d = 27 # features
67 | n = d+1 # with the intercept
68 |
69 |
70 |
71 |
72 | lam2 = 0.1 # for the 2-norm regularization best:0.1
73 | lam1 = 0.03 # for the 1-norm regularization best:0.03
74 |
75 |
76 | L = 0.25*max(np.linalg.norm(A,2,axis=1))**2 + lam2
77 |
78 |
79 | # ## Oracles
80 | #
81 | # ### Related to function $f$
82 |
83 |
84 |
85 | def f(x):
86 | l = 0.0
87 | for i in range(A.shape[0]):
88 | if b[i] > 0 :
89 | l += np.log( 1 + np.exp(-np.dot( A[i] , x ) ) )
90 | else:
91 | l += np.log( 1 + np.exp(np.dot( A[i] , x ) ) )
92 | return l/m + lam2/2.0*np.dot(x,x)
93 |
94 | def f_grad(x):
95 | g = np.zeros(n)
96 | for i in range(A.shape[0]):
97 | if b[i] > 0:
98 | g += -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) )
99 | else:
100 | g += A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) )
101 | return g/m + lam2*x
102 |
103 |
104 | # ## Related to function $f_i$ (one example)
105 |
106 | # Q. To Fill
107 |
108 |
109 |
110 | def f_grad_ex(x,i):
111 | g = np.zeros(n)
112 |
113 | #### TODO
114 |
115 | return g
116 |
117 |
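
# A possible way to fill `f_grad_ex` above -- a sketch mirroring `f_grad` for a single
# example i. The lam2*x term is kept inside each f_i, consistently with the decomposition
# f = (1/m) sum_i f_i used in the Lab 7 notebook; this is an assumption of the sketch.

def f_grad_ex_sketch(x,i):
    if b[i] > 0:
        g = -A[i]/( 1 + np.exp(np.dot( A[i] , x ) ) )
    else:
        g = A[i]/( 1 + np.exp(-np.dot( A[i] , x ) ) )
    return g + lam2*x
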
118 | # ### Related to function $g$
119 |
120 |
121 | def g(x):
122 | return lam1*np.linalg.norm(x,1)
123 |
124 | def g_prox(x,gamma):
125 | p = np.zeros(n)
126 | for i in range(n):
127 | if x[i] < - lam1*gamma:
128 | p[i] = x[i] + lam1*gamma
129 | if x[i] > lam1*gamma:
130 | p[i] = x[i] - lam1*gamma
131 | return p
132 |
133 |
134 | # ### Related to function $F$
135 |
136 |
137 |
138 | def F(x):
139 | return f(x) + g(x)
140 |
141 |
142 | # ## Prediction Function
143 |
144 |
145 |
146 | def prediction_train(w,PRINT):
147 | pred = np.zeros(A.shape[0])
148 | perf = 0
149 | for i in range(A.shape[0]):
150 | p = 1.0/( 1 + np.exp(-np.dot( A[i] , w ) ) )
151 | if p>0.5:
152 | pred[i] = 1.0
153 | if b[i]>0:
154 | correct = "True"
155 | perf += 1
156 | else:
157 | correct = "False"
158 | if PRINT:
159 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),1,(p-0.5)*200,correct))
160 | else:
161 | pred[i] = -1.0
162 | if b[i]<0:
163 | correct = "True"
164 | perf += 1
165 | else:
166 | correct = "False"
167 | if PRINT:
168 | print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b[i]),-1,100-(0.5-p)*200,correct))
169 | return pred,float(perf)/A.shape[0]
170 |
171 | def prediction_test(w,PRINT):
172 | pred = np.zeros(A_test.shape[0])
173 | perf = 0
174 | for i in range(A_test.shape[0]):
175 | p = 1.0/( 1 + np.exp(-np.dot( A_test[i] , w ) ) )
176 | if p>0.5:
177 | pred[i] = 1.0
178 | if b_test[i]>0:
179 | correct = "True"
180 | perf += 1
181 | else:
182 | correct = "False"
183 | if PRINT:
184 |                 print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b_test[i]),1,(p-0.5)*200,correct))
185 | else:
186 | pred[i] = -1.0
187 | if b_test[i]<0:
188 | correct = "True"
189 | perf += 1
190 | else:
191 | correct = "False"
192 | if PRINT:
193 |                 print("True class: {:d} \t-- Predicted: {} \t(confidence: {:.1f}%)\t{}".format(int(b_test[i]),-1,100-(0.5-p)*200,correct))
194 | return pred,float(perf)/A_test.shape[0]
195 |
196 |
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/plotLib.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib import cm
5 | from mpl_toolkits.mplot3d import Axes3D
6 | import time
7 | from IPython import display
8 |
9 |
10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ):
11 |
12 | def f_no_vector(x1,x2):
13 | return f( np.array( [x1,x2] ) )
14 |
15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
16 | z = f_no_vector(x,y)
17 |
18 | fig = plt.figure()
19 |     ax = fig.add_subplot(projection='3d')
20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max)
21 | ax.set_zlim(v_min, v_max)
22 | plt.show()
23 |
24 |
25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
26 |
27 |
28 | def f_no_vector(x1,x2):
29 | return f( np.array( [x1,x2] ) )
30 |
31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
32 | z = f_no_vector(x,y)
33 |
34 | fig = plt.figure()
35 | graphe = plt.contour(x,y,z,levels)
36 | #plt.plot(3,1,'r*',markersize=15)
37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
38 | plt.title(title)
39 | plt.show()
40 |
41 |
42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
43 |
44 | def f_no_vector(x1,x2):
45 | return f( np.array( [x1,x2] ) )
46 |
47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
48 | z = f_no_vector(x,y)
49 |
50 | fig = plt.figure()
51 | graphe = plt.contour(x,y,z,levels)
52 | #plt.plot(3,1,'r*',markersize=15)
53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
54 | plt.title(title)
55 |
56 | if x_tab.shape[0] > 40:
57 | sub = int(x_tab.shape[0]/40.0)
58 | x_tab = x_tab[::sub]
59 |
60 | delay = 2.0/x_tab.shape[0]
61 | for k in range(x_tab.shape[0]):
62 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
63 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
64 | plt.draw()
65 | display.clear_output(wait=True)
66 | display.display(fig)
67 | time.sleep(delay)
68 | display.clear_output()
69 | plt.show()
70 |
71 |
72 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
73 |
74 |
75 | def f_no_vector(x1,x2):
76 | return f( np.array( [x1,x2] ) )
77 |
78 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
79 | z = f_no_vector(x,y)
80 |
81 | fig = plt.figure()
82 | graphe = plt.contour(x,y,z,levels)
83 | #plt.plot(3,1,'r*',markersize=15)
84 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
85 | plt.xlim([x1_min,x1_max])
86 | plt.ylim([x2_min,x2_max])
87 | plt.title(title)
88 |
89 | if x_tab.shape[0] > 40:
90 | sub = int(x_tab.shape[0]/40.0)
91 | x_tab = x_tab[::sub]
92 |
93 | if x_tab2.shape[0] > 40:
94 | sub = int(x_tab2.shape[0]/40.0)
95 | x_tab2 = x_tab2[::sub]
96 |
97 | delay = 4.0/x_tab.shape[0]
98 | for k in range(x_tab.shape[0]):
99 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
100 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
101 | plt.draw()
102 | #plt.pause(delay)
103 |
104 | delay = 4.0/x_tab2.shape[0]
105 | for k in range(x_tab2.shape[0]):
106 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8)
107 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1]))
108 | #plt.pause(delay)
109 | plt.draw()
110 |
111 | plt.show()
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/student.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab7_StochasticMethods/student.npz
--------------------------------------------------------------------------------
/Lab7_StochasticMethods/student.txt:
--------------------------------------------------------------------------------
1 | # Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
2 | 1 sex - student's sex (binary: "F" - female or "M" - male)
3 | 2 age - student's age (numeric: from 15 to 22)
4 | 3 address - student's home address type (binary: "U" - urban or "R" - rural)
5 | 4 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 | 5 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 | 6 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 | 7 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 | 8 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
10 | 9 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
11 | 10 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
12 | 11 schoolsup - extra educational support (binary: yes or no)
13 | 12 famsup - family educational support (binary: yes or no)
14 | 13 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
15 | 14 activities - extra-curricular activities (binary: yes or no)
16 | 15 nursery - attended nursery school (binary: yes or no)
17 | 16 higher - wants to take higher education (binary: yes or no)
18 | 17 internet - Internet access at home (binary: yes or no)
19 | 18 romantic - with a romantic relationship (binary: yes or no)
20 | 19 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
21 | 20 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
22 | 21 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
23 | 22 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
24 | 23 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
25 | 24 health - current health status (numeric: from 1 - very bad to 5 - very good)
26 | 25 absences - number of school absences (numeric: from 0 to 93)
27 | 26 G1 - first period grade (numeric: from 0 to 20)
28 | 27 G2 - second period grade (numeric: from 0 to 20)
29 |
30 | 28 G3 - final grade (numeric: from 0 to 20, output target)
31 |
32 | Additional note: there are several (382) students that belong to both datasets .
33 | These students can be identified by searching for identical attributes
34 | that characterize each student, as shown in the annexed R file.
35 |
--------------------------------------------------------------------------------
/Lab8_MinMax/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab8_MinMax/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab8_MinMax/Lab8_Two-player zero-sum games.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Master of Science in Industrial and Applied Mathematics (MSIAM) - 1st year\n",
8 | "\n",
9 | "Numerical Optimization\n",
10 | "\n",
11 | "Lab 8: Min-Max problems and Zero-sum games"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "The goal is to solve problems of the form\n",
19 | "$$\n",
20 | "\\max_{x \\in \\Delta_n} \\min_{y \\in \\Delta_n} x^T A y \\tag{MinMax}\n",
21 | "$$\n",
22 | "where $A \\in \\mathbb{R}^{n \\times n}$, and $x$ and $y$ are probability distributions over $\\{1,\\dots,n\\}$, i.e. they belong to the simplex of size $n$:\n",
23 | "$$ \\Delta_n = \\left\\{ p \\in \\mathbb{R}^n : p\\geq 0 , \\sum_{i=1}^n p_i = 1 \\right\\}. $$\n",
24 | "Our aim is thus to find a tuple $(x^*, y^*) \\in \\Delta_n \\times \\Delta_n$ that solves $\\mathrm{(MinMax)}$. $x^*$ is given by \n",
25 | "\\begin{align}\n",
26 | "\\tag{P1}\n",
27 | " x^\\star = \\arg\\max_{x\\in\\Delta_n} \\min_{y\\in\\Delta_n} x^\\top A y\n",
28 | "\\end{align}\n",
29 | "\n",
30 | "while \n",
31 | "\\begin{align}\n",
32 | "\\tag{P2}\n",
33 | " y^\\star = \\arg\\min_{y\\in\\Delta_n} \\max_{x\\in\\Delta_n} x^\\top A y\n",
34 | "\\end{align}\n",
35 | "\n",
36 | "This last relation stems from von Neumann's minimax theorem, which gives the equality\n",
37 | "\\begin{align}\n",
38 | "\\min_{y \\in \\Delta_n} \\max_{x \\in \\Delta_n} x^T A y\n",
39 | "=\n",
40 | "\\max_{x \\in \\Delta_n} \\min_{y \\in \\Delta_n} x^T A y\n",
41 | "\\end{align}"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "One fundamental interpretation of this problem is that $(x^*, y^*)$ is a Nash equilibrium of the associated zero-sum game.\n",
49 | "\n",
50 | "## Formulation of the Nash Equilibrium as the solution of a Min-Max problem (Optional)\n",
51 | "\n",
52 | "\n",
53 | "Let us consider a game with 2 players, both having $n$ possible actions.\n",
54 | "\n",
55 | "\n",
56 | "They play against each other and whenever Player 1 plays action \\#i and Player 2 plays action \\#j, P1 gets a reward of $g_{ij}\\in\\mathbb{R}$ while P2 gets $-g_{ij}\\in\\mathbb{R}$ (hence the name zero sum).\n",
57 | "\n",
58 | "\n",
59 | "The goal for both players is to find a Nash equilibrium, that is, a probability distribution over the actions for each player such that no player can improve its expected reward by unilaterally deviating from its own strategy."
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "\n",
67 | "\n",
68 | "Let us denote by $x$ the probability distribution of the actions of P1 (its \"strategy\"), and $y$ the one of P2. \n",
69 | "\n",
70 | "Both $x$ and $y$ are probability distributions over $n$ possible actions, thus they both belong to the simplex of size n:\n",
71 | "$$ \\Delta_n = \\left\\{ p \\in \\mathbb{R}^n : p\\geq 0 , \\sum_{i=1}^n p_i = 1 \\right\\} . $$\n",
72 | "\n",
73 | "\n",
74 | "Then, it can be shown that the NE is achieved by $(x^\\star,y^\\star)$ solution of the problems\n",
75 | "\\begin{align}\n",
76 | "%\\tag{P1}\n",
77 | " x^\\star = \\arg\\max_{x\\in\\Delta_n} \\min_{y\\in\\Delta_n} x^\\top A y\n",
78 | "\\end{align}\n",
79 | "where $A$ is the $n\\times n$ matrix such that $A_{ij} = g_{ij}$, the reward of P1 for actions $i$ and $j$.\n",
80 | "\n",
81 | "Similarly, we have\n",
82 | "\\begin{align}\n",
83 | "%\\tag{P2}\n",
84 | " y^\\star = \\arg\\min_{y\\in\\Delta_n} \\max_{x\\in\\Delta_n} x^\\top A y\n",
85 | "\\end{align}\n",
86 | "with the same matrix $A$."
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "# Numerical computation of constrained Min-Max problems"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "In this lab, we will first consider a zero-sum game characterized by matrix $A=\\left[\\begin{array}{cc} -6 & 9 \\\\ 4 & -6 \\end{array}\\right]$ ."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "import numpy as np\n",
110 | "import scipy.optimize as scopt"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "n = 2; m =2 # Dimension\n",
120 | "A = np.array([[-6,9],[4,-6]])\n",
121 | "A"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "n,m = A.shape"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "# Method 1: Linear Programming"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "### Optimal strategy for x\n",
145 | "\n",
146 | "We begin by finding the optimal $x^\\star$.\n",
147 | "\n",
148 | "> **1.** Reformulate the problem (P1) into a linear program and solve it using an LP solver."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": []
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "### Optimal strategy for y\n",
163 | "\n",
164 | "> **2.** Do the same thing with (P2) to find $y^\\star$."
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {
171 | "scrolled": true
172 | },
173 | "outputs": [],
174 | "source": []
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "### Value of the game"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "> **3.** Compare the values of problems (P1) and (P2). What is remarkable about $A y^\\star$? About $A^\\top x^\\star$? "
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": []
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "# Method 2: Optimization \n",
202 | "\n",
203 | "Finding the solution of a min-max optimization problem is in general harder than solving a simple minimization problem. Nevertheless, it can still be achieved by first-order *gradient-like* methods. This kind of setup has attracted a lot of interest in recent years, notably for the training of Generative Adversarial Networks (GANs). \n",
204 | "\n",
205 | "To do so, we can define $X=(x,y)\\in \\Delta_n\\times\\Delta_n$ and $v(X) = (-A y, A^\\top x)$. To solve the problem\n",
206 | "\\begin{align}\n",
207 | "\\tag{P}\n",
208 | "\\max_{x\\in\\Delta_n} \\min_{y\\in\\Delta_n} x^\\top A y ,\n",
209 | "\\end{align}\n",
210 | "we can try to move against the vector field $v$ (i.e. perform a gradient ascent on $ x\\mapsto x^\\top A y $ and a gradient descent on $ y\\mapsto x^\\top A y $):\n",
211 | "\\begin{align}\n",
212 | " \\tag{Gradient Descent Ascent}\n",
213 | " X_{k+1} = \\mathrm{proj}_{\\Delta_n\\times\\Delta_n} (X_k-\\gamma_k v(X_k)).\n",
214 | "\\end{align}\n",
215 | "\n",
216 | "\n",
217 | "We first define the vector field"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "def v(X):\n",
227 | " x = X[0:n]\n",
228 | " y = X[n:]\n",
229 | " return np.concatenate((-A.dot(y),A.T.dot(x)))"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "And the dimension of the variables space."
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "N = 2*n"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "We also need the projection to the contraints: $\\Delta_n\\times\\Delta_n$\n",
253 | "\n",
254 | "> **4.** Implement a function that projects a vector onto $\\Delta_n\\times\\Delta_n$"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "def proj_simplex(v):\n",
264 | " ## TODO\n",
265 | " return v\n",
266 | "\n",
267 | "def proj_2simplex(X):\n",
268 | " x = X[0:n]\n",
269 | " y = X[n:]\n",
270 | " return np.concatenate((proj_simplex(x),proj_simplex(y)))\n"
271 | ]
272 | },
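  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
"*One classical way to fill `proj_simplex` (a hedged sketch, not necessarily the expected solution) is the sort-based Euclidean projection onto the probability simplex:*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"def proj_simplex_sketch(v):\n",
"    # Sort-based Euclidean projection onto the probability simplex\n",
"    u = np.sort(v)[::-1]                      # sort in decreasing order\n",
"    css = np.cumsum(u)\n",
"    rho = np.nonzero(u + (1.0 - css) / (np.arange(len(v)) + 1) > 0)[0][-1]\n",
"    theta = (1.0 - css[rho]) / (rho + 1.0)    # common shift\n",
"    return np.maximum(v + theta, 0.0)\n",
"\n",
"proj_simplex_sketch(np.array([0.8, 0.6]))     # a quick sanity check"
   ]
  },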
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "#### Gradient Descent-Ascent\n",
278 | "\n",
279 | "> **5.** Run Gradient Descent Ascent by completing the code below."
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "X = proj_2simplex(np.ones(N))\n",
289 | "K = 1000\n",
290 | "step = 0.01\n",
291 | "\n",
292 | "X_tab_GDA = np.copy(X)\n",
293 | "\n",
294 | "for k in range(1,K):\n",
295 | " X = X ## Step to fill\n",
296 | " if k%5==0:\n",
297 | " if k%25==0: print(\"ite. {:3d} : x= [{:.3f},{:.3f}] | y= [{:.3f},{:.3f}]\".format(k,X[0],X[1],X[2],X[3]))\n",
298 | " X_tab_GDA = np.vstack((X_tab_GDA,X))"
299 | ]
300 | },
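  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"# Hedged sketch of one possible completion of the GDA step above, run separately\n",
"# so that the template is left untouched (it directly transcribes the displayed update):\n",
"X_gda = proj_2simplex(np.ones(N))\n",
"for k in range(1, K):\n",
"    X_gda = proj_2simplex(X_gda - step * v(X_gda))   # projected descent-ascent step\n",
"print(X_gda)"
   ]
  },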
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "> **6.** What do you observe in terms of convergence?"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "#### Extragradient\n",
313 | "\n",
314 | "To overcome the issues with gradient descent-ascent, the ExtraGradient method was proposed:\n",
315 | "\\begin{align}\n",
316 | " \\tag{ExtraGradient}\n",
317 | " \\left\\{ \n",
318 | " \\begin{array}{l}\n",
319 | " X_{k+1/2} = \\mathrm{proj}_{\\Delta_n\\times\\Delta_n} (X_k-\\gamma_k v(X_k) ) \\\\\n",
320 | " X_{k+1} = \\mathrm{proj}_{\\Delta_n\\times\\Delta_n} (X_k-\\gamma_k v(X_{k+1/2})))\n",
321 | " \\end{array}\n",
322 | " \\right. \n",
323 | "\\end{align}\n",
324 | "which intuitively consists in generating a leading point that will look forward the value of the field and apply it to the base point. This way, circular effects can be managed and convergence can be restored.\n",
325 | "\n",
326 | "\n",
327 | "> **7.** Run ExtraGradient by completing the code below."
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "X = proj_2simplex(np.ones(N))\n",
337 | "K = 1000\n",
338 | "step = 0.01\n",
339 | "\n",
340 | "X_tab_EG = np.copy(X)\n",
341 | "\n",
342 | "for k in range(1,K):\n",
343 | " X_lead = X ## Step to fill\n",
344 | " X = X ## Step to fill\n",
345 | " if k%5==0:\n",
346 | " if k%25==0: print(\"ite. {:3d} : x= [{:.3f},{:.3f}] | y= [{:.3f},{:.3f}]\".format(k,X_lead[0],X_lead[1],X_lead[2],X_lead[3]))\n",
347 | " X_tab_EG = np.vstack((X_tab_EG,X_lead))\n",
348 | " "
349 | ]
350 | },
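  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"# Hedged sketch of one possible completion of the ExtraGradient loop above,\n",
"# again run separately so that the template is left untouched:\n",
"X_eg = proj_2simplex(np.ones(N))\n",
"for k in range(1, K):\n",
"    X_lead_eg = proj_2simplex(X_eg - step * v(X_eg))      # leading (extrapolated) point\n",
"    X_eg = proj_2simplex(X_eg - step * v(X_lead_eg))      # update from the base point\n",
"print(X_lead_eg)"
   ]
  },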
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {},
354 | "source": [
355 | "#### Comparison\n",
356 | "\n",
357 | "> **8.** Compare Gradient and ExtraGradient on the plot below.\n"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {},
364 | "outputs": [],
365 | "source": [
366 | "import matplotlib.pyplot as plt\n",
367 | "\n",
368 | "plt.figure()\n",
369 | "plt.plot(X_tab_GDA[:,0],X_tab_GDA[:,2],color=\"red\",label=\"GDA\")\n",
370 | "plt.plot(X_tab_EG[:,0],X_tab_EG[:,2],color=\"blue\",label=\"EG\")\n",
371 | "plt.title(\"Behavior of x[1] and y[1]\")\n",
372 | "plt.legend()\n",
373 | "plt.show()"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {},
379 | "source": [
380 | "#### Mirror Prox\n",
381 | "\n",
382 | "A possibility to make the projections above easier to compute is to change the (implicit) Euclidean metric.\n",
383 | " For the simplex, an efficient example is the \\emph{Kullback-Liebler} divergence $D(x,y) = \\sum_{i=1}^n x_i\\log(x_i/y_i) - \\sum_{i=1}^n (x_i-y_i)$, which serve as a metric on strictly positive vectors.\n",
384 | " \n",
385 | "With this metric, for any positive vector $y$,\n",
386 | " \\begin{align}\n",
387 | " \\mathrm{proj}^{KL}_{\\Delta_n} (y) = \\arg\\min_{u\\in\\Delta_n} D(u,y) = \\frac{y}{ \\sum_{i=1}^n y_i} = \\frac{y}{ \\|y\\|_1}\n",
388 | " \\end{align}\n",
389 | " which is much easier to compute.\n",
390 | " \n",
391 | "By changing the metric of the Extragradient algorithm, by going from $X_{k+1}=\\arg\\min_X\\{ \\gamma\\langle v(X_k),X\\rangle + \\frac{1}{2} \\|X-X_k\\|^2 \\}$ to $X_{k+1}=\\arg\\min_X\\{ \\gamma\\langle v(X_k),X\\rangle + D(X,X_k) \\}$} we obtain the Mirror-Prox method.\n",
392 | "\n",
393 | "\n",
394 | "> **9.** Show that \n",
395 | "> $$ \\arg\\min_X\\{ \\gamma\\langle v(X_k),X\\rangle + D(X,X_k) \\} = X_k \\exp(-\\gamma v(X_{k} )) $$\n",
396 | "\n",
397 | "\n",
398 | "The Mirror Prox algorithm then writes:\n",
399 | " \\begin{align}\n",
400 | " \\tag{Mirror Prox}\n",
401 | " \\left\\{ \n",
402 | " \\begin{array}{l}\n",
403 | " (a_{k+1/2},b_{k+1/2}) = X_k \\exp(-\\gamma v(X_k)) \\\\\n",
404 | " X_{k+1/2} = (\\frac{a_{k+1/2}}{\\|a_{k+1/2}\\|_1},\\frac{b_{k+1/2}}{\\|,b_{k+1/2}\\|_1}) \\\\\n",
405 | " (a_{k+1},b_{k+1}) = X_k \\exp(-\\gamma v(X_{k+1/2})) \\\\\n",
406 | " X_{k+1} = (\\frac{a_{k+1}}{\\|a_{k+1}\\|_1},\\frac{b_{k+1}}{\\|,b_{k+1}\\|_1}) \\\\\n",
407 | " \\end{array}\n",
408 | " \\right. .\n",
409 | " \\end{align}\n",
410 | "\n",
411 | "\n",
412 | "This is ExtraGradient but with this adapted geometry."
413 | ]
414 | },
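  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
"*A sketch for question 9: the objective is separable, and setting its gradient to zero coordinate-wise gives $\\gamma v_i(X_k) + \\log(X_i/X_{k,i}) = 0$, hence $X_i = X_{k,i}\\, e^{-\\gamma v_i(X_k)}$; the product $X_k \\exp(-\\gamma v(X_k))$ is thus understood entry-wise.*"
   ]
  },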
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "\n",
420 | "> **10.** Run Mirror Prox by completing the code below and compare its behavior with the previous methods."
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "X = proj_2simplex(np.ones(N))\n",
430 | "K = 1000\n",
431 | "step = 0.05\n",
432 | "\n",
433 | "X_tab_MP = np.copy(X)\n",
434 | "\n",
435 | "for k in range(1,K):\n",
436 | " X_lead = X_lead ## Step to fill\n",
437 | " if k%1==0:\n",
438 | " if k%25==0: print(\"ite. {:3d} : x= [{:.3f},{:.3f}] | y= [{:.3f},{:.3f}]\".format(k,X_lead[0],X_lead[1],X_lead[2],X_lead[3]))\n",
439 | " X_tab_MP = np.vstack((X_tab_MP,X_lead))\n",
440 | " "
441 | ]
442 | },
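  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"# Hedged sketch of one possible completion of the Mirror Prox loop above,\n",
"# following the multiplicative (entry-wise exponential) updates displayed earlier:\n",
"def kl_proj_2simplex(Z):\n",
"    a, b = Z[0:n], Z[n:]\n",
"    return np.concatenate((a / np.sum(a), b / np.sum(b)))\n",
"\n",
"X_mp = proj_2simplex(np.ones(N))\n",
"for k in range(1, K):\n",
"    X_lead_mp = kl_proj_2simplex(X_mp * np.exp(-step * v(X_mp)))\n",
"    X_mp = kl_proj_2simplex(X_mp * np.exp(-step * v(X_lead_mp)))\n",
"print(X_lead_mp)"
   ]
  },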
443 | {
444 | "cell_type": "code",
445 | "execution_count": null,
446 | "metadata": {},
447 | "outputs": [],
448 | "source": [
449 | "import matplotlib.pyplot as plt\n",
450 | "\n",
451 | "plt.figure()\n",
452 | "plt.plot(X_tab_GDA[:,0],X_tab_GDA[:,2],color=\"red\",label=\"GDA\")\n",
453 | "plt.plot(X_tab_EG[:,0],X_tab_EG[:,2],color=\"blue\",label=\"EG\")\n",
454 | "plt.plot(X_tab_MP[:,0],X_tab_MP[:,2],color=\"green\",label=\"MP\")\n",
455 | "plt.title(\"Behavior of x[1] and y[1]\")\n",
456 | "plt.legend()\n",
457 | "plt.show()"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {},
464 | "outputs": [],
465 | "source": []
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": []
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": []
480 | }
481 | ],
482 | "metadata": {
483 | "kernelspec": {
484 | "display_name": "Python 3 (ipykernel)",
485 | "language": "python",
486 | "name": "python3"
487 | },
488 | "language_info": {
489 | "codemirror_mode": {
490 | "name": "ipython",
491 | "version": 3
492 | },
493 | "file_extension": ".py",
494 | "mimetype": "text/x-python",
495 | "name": "python",
496 | "nbconvert_exporter": "python",
497 | "pygments_lexer": "ipython3",
498 | "version": "3.9.13"
499 | }
500 | },
501 | "nbformat": 4,
502 | "nbformat_minor": 4
503 | }
504 |
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/._1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._1.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/._2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._2.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/._3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._3.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/._4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._4.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/._5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._5.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/._UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/._UGA.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/1.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/2.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/3.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/4.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/5.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/Fig/UGA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Lab9_Uzawa/Fig/UGA.png
--------------------------------------------------------------------------------
/Lab9_Uzawa/plotLib.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib import cm
5 | from mpl_toolkits.mplot3d import Axes3D
6 | import time
7 | from IPython import display
8 |
9 |
10 | def custom_3dplot( f, x1_min,x1_max,x2_min,x2_max,nb_points, v_min, v_max ):
11 |
12 | def f_no_vector(x1,x2):
13 | return f( np.array( [x1,x2] ) )
14 |
15 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
16 | z = f_no_vector(x,y)
17 |
18 | fig = plt.figure(figsize = (12, 6))
19 |     ax = fig.add_subplot(projection='3d')
20 | ax.plot_surface(x, y, z, cmap=cm.hot , vmin = v_min, vmax = v_max)
21 | ax.set_zlim(v_min, v_max)
22 | plt.show()
23 |
24 |
25 | def level_plot( f, x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
26 |
27 |
28 | def f_no_vector(x1,x2):
29 | return f( np.array( [x1,x2] ) )
30 |
31 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
32 | z = f_no_vector(x,y)
33 |
34 | fig = plt.figure(figsize = (12, 6))
35 | graphe = plt.contour(x,y,z,levels)
36 | #plt.plot(3,1,'r*',markersize=15)
37 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
38 | plt.title(title)
39 | plt.show()
40 |
41 |
42 | def level_points_plot( f , x_tab , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
43 |
44 | def f_no_vector(x1,x2):
45 | return f( np.array( [x1,x2] ) )
46 |
47 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
48 | z = f_no_vector(x,y)
49 |
50 | fig = plt.figure(figsize = (12, 6))
51 | graphe = plt.contour(x,y,z,levels)
52 | #plt.plot(3,1,'r*',markersize=15)
53 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
54 | plt.title(title)
55 |
56 | delay = 4.0/x_tab.shape[0]
57 | for k in range(x_tab.shape[0]):
58 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
59 | plt.xlim([x1_min,x1_max])
60 | plt.ylim([x2_min,x2_max])
61 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
62 | plt.draw()
63 | display.clear_output(wait=True)
64 | display.display(fig)
65 | time.sleep(delay)
66 | display.clear_output()
67 | plt.show()
68 |
69 |
70 | def level_2points_plot( f , x_tab , x_tab2 , x1_min,x1_max,x2_min,x2_max,nb_points, levels , title ):
71 |
72 |
73 | def f_no_vector(x1,x2):
74 | return f( np.array( [x1,x2] ) )
75 |
76 | x , y = np.meshgrid(np.linspace(x1_min,x1_max,nb_points),np.linspace(x2_min,x2_max,nb_points))
77 | z = f_no_vector(x,y)
78 |
79 | fig = plt.figure(figsize = (12, 6))
80 | graphe = plt.contour(x,y,z,levels)
81 | #plt.plot(3,1,'r*',markersize=15)
82 | plt.clabel(graphe, inline=1, fontsize=10,fmt='%3.2f')
83 | plt.xlim([x1_min,x1_max])
84 | plt.ylim([x2_min,x2_max])
85 | plt.title(title)
86 |
87 | delay = 4.0/x_tab.shape[0]
88 | for k in range(x_tab.shape[0]):
89 | plt.plot(x_tab[k,0],x_tab[k,1],'*b',markersize=10)
90 | #plt.annotate(k,(x_tab[k,0],x_tab[k,1]))
91 | plt.draw()
92 | #plt.pause(delay)
93 |
94 | delay = 4.0/x_tab2.shape[0]
95 | for k in range(x_tab2.shape[0]):
96 | plt.plot(x_tab2[k,0],x_tab2[k,1],'dg',markersize=8)
97 | #plt.annotate(k,(x_tab2[k,0],x_tab2[k,1]))
98 | #plt.pause(delay)
99 | plt.draw()
100 |
101 | plt.show()
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NumericalOptimization
2 | Jupyter Notebooks for the M1 MSIAM Course "Numerical Optimization" at Université Grenoble Alpes
3 |
--------------------------------------------------------------------------------
/Tuto1_Basics/harder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/harder.png
--------------------------------------------------------------------------------
/Tuto1_Basics/poly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/poly.png
--------------------------------------------------------------------------------
/Tuto1_Basics/rosenbrock.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/rosenbrock.png
--------------------------------------------------------------------------------
/Tuto1_Basics/simple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/simple.png
--------------------------------------------------------------------------------
/Tuto1_Basics/tuto1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/tuto1.pdf
--------------------------------------------------------------------------------
/Tuto1_Basics/tuto1.tex:
--------------------------------------------------------------------------------
1 | %\documentclass[paper=a4, fontsize=9pt]{article}
2 | \documentclass[a4paper,twoside,10pt]{amsart}
3 |
4 |
5 | %\usepackage[scale=0.8]{geometry}
6 | \usepackage{fullpage}
7 |
8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs
9 | \usepackage[english]{babel} % English language/hyphenation
10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages
11 | \usepackage{xcolor}
12 | \usepackage{hyperref}
13 | \usepackage{tcolorbox}
14 |
15 | \usepackage{tikz}
16 | \usepackage{tkz-graph}
17 |
18 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
19 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
20 | \numberwithin{table}{section} % Number tables within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
21 | \usepackage{graphicx}
22 | \usepackage{caption}
23 | \usepackage{subcaption}
24 |
25 |
26 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height
27 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height
28 |
29 | \newtheorem{theo}{Theorem}
30 | \newtheorem{lemma}{Lemma}
31 | \theoremstyle{definition}
32 | \newtheorem{q_td}{Exercise }
33 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}}
34 | \newtheorem{q_tp}{$\diamond$}
35 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}}
36 |
37 | \begin{document}
38 |
39 | %----------------------------------------------------------------------------------------
40 | % TITLE
41 | %----------------------------------------------------------------------------------------
42 |
43 |
44 | \normalfont \normalsize
45 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\
46 | \noindent\textsc{\small \hfill MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s)
47 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule
48 | \begin{center}
49 | {\LARGE \scshape Numerical Optimization\\ Tuto 1: Gradients and Minimization} \\ % The title
50 | \end{center}
51 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler }
52 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule
53 |
54 |
55 |
56 | %----------------------------------------------------------------------------------------
57 | % TD
58 | %----------------------------------------------------------------------------------------
59 | %\newpage
60 | \setcounter{section}{0}
61 | \renewcommand{\thesection}{\Alph{section}}
62 | \renewcommand*{\theHsection}{TD.\the\value{section}}
63 |
64 |
65 | \vspace*{0.5cm}
66 |
67 | \section{Differentiability, Minima, and Convexity}
68 |
69 |
70 | \begin{q_td}[Quadratic functions]\label{td:qp}\hfill
71 |
72 | \begin{itemize}
73 | \item[a.] In $\mathbb{R}^n$, compute the gradient of the squared Euclidean norm $\|\cdot\|_2^2$ at a generic point $x\in\mathbb{R}^n$.
74 | \item[b.] Let $A$ be an $m \times n$ real matrix and $b$ a size-$m$ real vector. We define $f(x) = \|Ax-b\|_2^2$. For a generic vector $a\in \mathbb{R}^n$, compute the gradient $\nabla f(a)$ and Hessian $H_f(a)$.
75 | \item[c.] Let $C$ be an $n \times n$ real matrix, $d$ a size-$n$ real vector, and $e\in\mathbb{R}$. We define $g(x) = x^\mathrm{T}Cx + d^\mathrm{T}x + e$. For a generic vector $a\in \mathbb{R}^n$, compute the gradient $\nabla g(a)$ and Hessian $H_g(a)$.
76 | \item[d.] Can all functions of the form of $f$ be written in the form of $g$? And conversely?
77 | \end{itemize}
78 | \end{q_td}
79 |
80 |
81 | \vspace*{0.5cm}
82 |
83 | \begin{q_td}[Basic Differential calculus]
84 | \label{td:conv}
85 | Use the composition lemma to compute the gradients of:
86 | \begin{itemize}
87 | \item[a.] $f_1(x) = \|Ax-b\|_2^2$ .
88 | \item[b.] $f_2(x) = \|x\|_2$ .
89 | \end{itemize}
90 | \end{q_td}
91 |
92 |
93 | \vspace*{0.5cm}
94 |
95 | \begin{q_td}[Preparing the Lab]
96 | \label{td:fun}
97 | In the first lab, we will consider the following toy functions:
98 | \begin{align*}
99 | & \begin{array}{rrcll}
100 | f: & \mathbb{R}^2 & \to &\mathbb{R}\\
101 | & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2
102 | \end{array}\\
103 | %
104 | & \begin{array}{rrcll}
105 | g: & \mathbb{R}^2 & \to &\mathbb{R}\\
106 | & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3)
107 | \end{array} \\
108 | %
109 | & \begin{array}{rrcll}
110 | r: & \mathbb{R}^2 & \to &\mathbb{R}\\
111 | & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2
112 | \end{array}\\
113 | %
114 | & \begin{array}{rrcll}
115 | t: & \mathbb{R}^2 & \to &\mathbb{R}\\
116 | & (x_1,x_2) & \mapsto & (0.6 x_1 + 0.2 x_2)^2 \left((0.6 x_1 + 0.2 x_2)^2 - 4 (0.6 x_1 + 0.2 x_2)+4\right) + (-0.2 x_1 + 0.6 x_2)^2
117 | \end{array}\\
118 | %
119 | & \begin{array}{rrcll}
120 | p: & \mathbb{R}^2 & \to &\mathbb{R}\\
121 | & (x_1,x_2) & \mapsto & \left| x_1-3 \right| + 2\left| x_2-1\right| .
122 | \end{array}
123 | \end{align*}
124 | \begin{itemize}
125 | \item[a.] From the 3D plots of Figure~\ref{fig:3d}, which functions are visibly non-convex?
126 | \item[b.] For all five functions, show that they are convex or give an argument for their non-convexity.
127 | \item[c.] For functions $f,g,r,t$, compute their gradient.
128 | \item[d.] For functions $f,g$, compute their Hessian.
129 | \end{itemize}
130 | \end{q_td}
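
\noindent\ans{For the lab, a minimal Python sketch (our own naming, only for illustration) of the first function and its gradient could be:}
\begin{verbatim}
import numpy as np

def f(x):             # f(x1, x2) = 4(x1-3)^2 + 2(x2-1)^2
    return 4*(x[0]-3)**2 + 2*(x[1]-1)**2

def grad_f(x):        # its gradient
    return np.array([8*(x[0]-3), 4*(x[1]-1)])
\end{verbatim}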
131 |
132 |
133 |
134 | \begin{figure}[h!]
135 | \centering
136 | \begin{subfigure}[b]{0.48\textwidth}
137 | \centering
138 | \includegraphics[width=1.0\textwidth]{simple.png}
139 | \caption{a \emph{simple} function: $f$}
140 | \end{subfigure}
141 | ~
142 | \begin{subfigure}[b]{0.48\textwidth}
143 | \centering
144 | \includegraphics[width=1.0\textwidth]{harder.png}
145 | \caption{some \emph{harder} function: $g$}
146 | \end{subfigure} \\
147 | \centering
148 | \begin{subfigure}[b]{0.48\textwidth}
149 | \centering
150 | \includegraphics[width=1.0\textwidth]{rosenbrock.png}
151 | \caption{\emph{Rosenbrock}'s function: $r$}
152 | \end{subfigure}
153 | ~
154 | \begin{subfigure}[b]{0.48\textwidth}
155 | \centering
156 | \includegraphics[width=1.0\textwidth]{two_pits.png}
157 | \caption{\emph{two pits} function: $t$}
158 | \end{subfigure}
159 | ~
160 | \begin{subfigure}[b]{0.48\textwidth}
161 | \centering
162 | \includegraphics[width=1.0\textwidth]{poly.png}
163 | \caption{\emph{polyhedral} function: $p$}
164 | \end{subfigure}
165 | \caption{3D plots of the considered functions}
166 | \label{fig:3d}
167 | \end{figure}
168 |
169 |
170 |
171 | \vspace*{0.5cm}
172 |
173 | \begin{q_td}[Fundamentals of convexity]
174 | \label{td:conv}
175 | ~
176 | \begin{itemize}
177 | \item[a.] Let $f$ and $g$ be two convex functions. Show that $m(x) = \max(f(x),g(x) )$ is convex.
178 | \item[b.] Show that $f_1(x) = \max(x^2-1 , 0)$ is convex.
179 | \item[c.] Let $f$ be a convex function and $g$ be a convex, non-decreasing function. Show that $c(x) = g(f(x))$ is convex.
180 | \item[d.] Show that $f_2(x) = \exp(x^2)$ is convex. What about $f_3(x) = \exp(-x^2)$?
181 | \item[e.] Justify why the $1$-norm, the $2$-norm, and the squared $2$-norm are convex.
182 | \end{itemize}
183 | \end{q_td}
184 |
185 | \vspace*{0.5cm}
186 |
187 | \begin{q_td}[Strict and strong convexity]
188 | \label{td:qp} A function $f:\mathbb{R}^n \to \mathbb{R}$ is said
189 | \begin{itemize}
190 | \item \emph{strictly convex} if for any $x \neq y \in\mathbb{R}^n$ and any $\alpha\in]0,1[$
191 | $$ f(\alpha x + (1- \alpha )y ) < \alpha f(x) + (1- \alpha )f(y) $$
192 | \item \emph{strongly convex} if there exists $\beta>0$ such that $f - \frac{\beta}{2}\|\cdot\|_2^2$ is convex.
193 | \end{itemize}
194 | \begin{itemize}
195 | \item[a.] For a strictly convex function $f$, show that the problem
196 | $$ \left\{ \begin{array}{l} \min f(x) \\ x \in C \end{array} \right. $$
197 | where $C$ is a convex set admits at most one solution.
198 | \item[b.] Show that a strongly convex function is also strictly convex.\\ \emph{(hint: use the identity $\|\alpha x + (1-\alpha)y\|^2 = \alpha \|x\|^2 + (1-\alpha)\|y\|^2 - \alpha (1-\alpha)\|x-y\|^2 $.)}
199 | \end{itemize}
200 | \end{q_td}
201 |
202 | \vspace*{0.5cm}
203 |
204 |
205 | \begin{q_td}[Optimality conditions]
206 | \label{td:opt}
207 | Let $f:\mathbb{R}^n\to\mathbb{R}$ be a twice differentiable function and $\bar{x}\in\mathbb{R}^n$. We suppose that $f$ admits a local minimum at $\bar{x}$ that is $f(x)\geq f(\bar{x})$ for all $x$ in a neighborhood\footnote{Formally, one would write $\forall x \in \mathbb{R}^n$ such that $\|x-\bar{x}\|\leq \varepsilon$ for $\varepsilon>0$ and some norm $\|\cdot\|$. } of $\bar{x}$.
208 | \begin{itemize}
209 | \item[a.] For any direction $u\in\mathbb{R}^n$, we define the $\mathbb{R}\to\mathbb{R}$ function $q(t) = f(\bar{x}+tu)$. Compute $q'(t)$.
210 | \item[b.] By using the first order Taylor expansion of $q$ at $0$, show that $\nabla f(\bar{x}) = 0$.
211 | \item[c.] Compute $q''(t)$. By using the second order Taylor expansion of $q$ at $0$, show that $\nabla^2 f(\bar{x})$ is positive semi-definite.
212 | \end{itemize}
213 | \end{q_td}
214 |
215 | \vspace*{1cm}
216 |
217 |
218 |
219 |
220 | \section{the Gradient Algorithm}
221 |
222 | \begin{q_td}[Descent lemma]
223 | \label{td:smooth}
224 | A function $f:\mathbb{R}^n\to\mathbb{R}$ is said to be $L$-smooth if it is differentiable and its gradient $\nabla f$ is $L$-Lipschitz continuous, that is
225 | $$\forall x,y\in\mathbb{R}^n, ~~ \|\nabla f(x) - \nabla f(y) \| \leq L \|x-y\|. $$
226 | The goal of the exercise is to prove that if $f:\mathbb{R}^n\to\mathbb{R}$ is $L$-smooth, then for all $x,y\in\mathbb{R}^n$,
227 | $$ f(x) \leq f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{L}{2} \| x-y\|^2 $$
228 | \begin{itemize}
229 | \item[a.] Starting from the fundamental theorem of calculus, which states that for all $x,y\in\mathbb{R}^n$,
230 | $$ f(x) - f(y) = \int_{0}^1 (x-y)^\mathrm{T} \nabla f(y + t(x-y) ) \mathrm{d}t $$
231 | prove the descent lemma.
232 | \item[b.] Give a function for which the inequality is tight and one for which it is not.
233 | \end{itemize}
234 | \end{q_td}
235 |
236 | \vspace*{0.5cm}
237 |
238 | \begin{q_td}[Smooth functions]
239 | Consider the constant stepsize gradient algorithm $x_{k+1} = x_k - \gamma \nabla f(x_k)$ on an $L$-smooth function $f$ with some minimizer (i.e. some $x^\star$ such that $f(x)\geq f(x^\star)$ for all $x$).
240 | \begin{itemize}
241 | \item[a.] Use the \emph{descent lemma} to prove convergence of the sequence $(f(x_k))_k$ when $\gamma\leq 2/L$.
242 | \item[b.] Did you use at some point that the function was convex? Conclude about the convergence of the gradient algorithm on smooth non-convex functions.
243 | \end{itemize}
244 | \end{q_td}
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 | \end{document}
253 |
--------------------------------------------------------------------------------
/Tuto1_Basics/two_pits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto1_Basics/two_pits.png
--------------------------------------------------------------------------------
/Tuto4_Prox/tuto4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto4_Prox/tuto4.pdf
--------------------------------------------------------------------------------
/Tuto4_Prox/tuto4.tex:
--------------------------------------------------------------------------------
1 | %\documentclass[paper=a4, fontsize=9pt]{article}
2 | \documentclass[a4paper,twoside,10pt]{amsart}
3 |
4 |
5 | %\usepackage[scale=0.8]{geometry}
6 | \usepackage{fullpage}
7 |
8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs
9 | \usepackage[english]{babel} % English language/hyphenation
10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages
11 | \usepackage{xcolor}
12 | \usepackage{hyperref}
13 |
14 | \usepackage{tikz}
15 | \usepackage{tkz-graph}
16 |
17 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
18 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
19 | \numberwithin{table}{section} % Number tables within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
20 | \usepackage{graphicx}
21 | \usepackage{caption}
22 | \usepackage{subcaption}
23 |
24 |
25 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height
26 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height
27 |
28 | \newtheorem{theo}{Theorem}
29 | \newtheorem{lemma}{Lemma}
30 | \theoremstyle{definition}
31 | \newtheorem{q_td}{Exercise }
32 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}}
33 | \newtheorem{q_tp}{$\diamond$}
34 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}}
35 |
36 | \begin{document}
37 |
38 | %----------------------------------------------------------------------------------------
39 | % TITLE
40 | %----------------------------------------------------------------------------------------
41 |
42 |
43 | \normalfont \normalsize
44 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\
45 | \noindent\textsc{ MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s)
46 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule
47 | \begin{center}
48 | {\LARGE \scshape Numerical Optimization \\
49 | Tuto 4: Proximal methods} \\ % The title
50 | \end{center}
51 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler }
52 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule
53 |
54 |
55 |
56 | %----------------------------------------------------------------------------------------
57 | % TD
58 | %----------------------------------------------------------------------------------------
59 | %\newpage
60 | \setcounter{section}{0}
61 | \renewcommand{\thesection}{\Alph{section}}
62 | \renewcommand*{\theHsection}{TD.\the\value{section}}
63 |
64 |
65 | \vspace*{0.5cm}
66 |
67 | \section{the Proximity operator}
68 |
69 | In non-smooth optimization, that is when the objective function is not differentiable, the gradient may not be defined at each point. Instead, for any point $x\in\mathbb{R}^n$ and any convex function $g:\mathbb{R}^n \to \mathbb{R}\cup\{+\infty\}$, one can define the subdifferential $\partial g(x) \subset \mathbb{R}^n$ as
70 | $$ \partial g(x) = \{ u\in\mathbb{R}^n | g(z) \geq g(x) + \langle u ; z-x \rangle \text{ for all } z\in\mathbb{R}^n \}. $$
71 | The optimality conditions and computation rules roughly translate.
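\ans{For instance, for $g(x)=|x|$ on $\mathbb{R}$, $\partial g(0) = [-1,1]$, while $\partial g(x) = \{\mathrm{sign}(x)\}$ for $x\neq 0$.}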
72 |
73 |
74 | However, the subgradient algorithm $x_{k+1} = x_k - \gamma_k g_k$ where $g_k\in \partial g(x_k)$ relies on a vanishing stepsize $\gamma_k$ and is thus very slow in practice. To remedy this, a more refined operator was introduced: the \emph{proximity operator} of $g$, defined for some positive constant $\gamma>0$ as
75 | \begin{equation}
76 | x = \mathbf{prox}_{\gamma g}(y) = \arg\min_{w\in\mathbb{R}^n} \left\{ \gamma g(w) + \frac{1}{2} \left\| w - y \right\|^2 \right\} .
77 | \end{equation}
78 |
79 |
80 |
81 |
82 | \begin{q_td}[First Properties]\label{td:prox0}\hfill
83 |
84 | \begin{itemize}
85 | \item[a.] Justify that for a proper convex function $g$, this definition as an $\arg\min$ indeed leads to a unique point. Would it still be the case if $g$ was not convex?
86 | \item[b.] This operation is sometimes called \emph{implicit gradient}. Find an explanation why.\\
87 | \emph{\small Hint: Use First order optimality conditions.}
88 | \item[c.] Let $x = \mathbf{prox}_{\gamma g}(y)$ and $x' = \mathbf{prox}_{\gamma g}(y')$, show that
89 | $$ \|x - x'\|^2 \leq \langle x' - y ' ; x- y \rangle . $$
90 | \emph{\small Hint: if $g_{x} \in \partial g(x)$ and $g_{x'} \in \partial g(x')$, the convexity of $g$ gives $\langle x -x'; g_x - g_{x'} \rangle \geq 0$.}
91 | \item[d.] Deduce that
92 | $$ \|x - x'\|^2 \leq \| y - y' \|^2 - \| (x-y) - (x'-y') \|^2 $$
93 | and investigate the similarities with the gradient of a smooth function.
94 | \end{itemize}
95 | \end{q_td}
96 |
97 | \vspace*{0.5cm}
98 |
99 | We showed that the proximity operator of a convex function has the same contraction properties as a gradient step with stepsize $1/L$ on an $L$-smooth convex function. Let us now investigate the related algorithm.
100 |
101 | \vspace*{0.5cm}
102 |
103 | \begin{q_td}[Proximal point algorithm]\label{td:prox} The proximal point algorithm is simply obtained by successively applying the proximity operator of a function:
104 | $$x_{k+1} = \mathbf{prox}_{\gamma g}(x_k)$$
105 | \begin{itemize}
106 | \item[a.] Let $x^\star$ be a \emph{fixed point} of $\mathbf{prox}_{\gamma g}$ (we will suppose that such a point exists), that is $x^\star = \mathbf{prox}_{\gamma g}(x^\star)$. Show that $x^\star$ is a minimizer of $g$. \\
107 | \emph{\small Hint: Use First order optimality conditions.}
108 | \item[b.] Show that if $x = \mathbf{prox}_{\gamma g}(y) $, then $g(x)\leq g(y) - \frac{1}{2\gamma} \|x-y\|^2$.\\
109 | \emph{\small Hint: Use that for $f$ $\mu$-strongly convex and $x^\star$ the minimizer of $f$, then $f(x^\star) \leq f(y) - \frac{\mu}{2}\|x^\star-y\|^2$.}
110 | \item[c.] Conclude that the \emph{Proximal Point Algorithm} converges to a minimizer of $g$.
111 | \end{itemize}
112 | \end{q_td}
113 |
114 | \vspace*{0.5cm}
115 |
116 | Now that we have seen the optimization-wise interest of the proximity operator, let us compute it explicitly on some functions.
117 |
118 | \vspace*{0.5cm}
119 |
120 | \begin{q_td}[Proximity Operators of basic functions]
121 | \label{td:fun}
122 | Compute the proximity operators of the following functions:
123 | \begin{itemize}
124 | \item[a.] $g_1(x) = \| x \|_2^2$ .
125 | \item[b.] $g_2(x) = \iota_C(x)$ with $\iota_C(x) = 0$ if $x$ belongs to convex set $C$ and $+\infty$ elsewhere.
126 | \item[c.] $g_3(x) = \|x\|_1 $ .
127 | \item[d.] $g_4(x) = \|x\|_2 $ .
128 | \end{itemize}
129 | \end{q_td}
130 |
131 | \vspace*{0.5cm}
132 |
133 | Unfortunately, no explicit formula can be found in general; but i) the resulting sub-problems are strongly convex and thus easier to solve; and, more interestingly, ii) proximity operators can be combined with other algorithms in order to minimize general functions. These algorithms are called \emph{proximal algorithms}, the most popular of which is the proximal gradient algorithm, which mixes gradient and proximity operations.
134 |
135 | \vspace*{0.5cm}
136 |
137 | \section{the Proximal Gradient algorithm}
138 |
139 |
140 | Let us consider the \emph{composite} optimization problem
141 | $$ \min_{x\in\mathbb{R}^n} F(x) := f(x) + g(x)$$
142 | where $f:\mathbb{R}^n \to \mathbb{R}$ is $L$-smooth and convex; and $g:\mathbb{R}^n \to \mathbb{R}\cup\{+\infty\}$ is convex. The \emph{proximal gradient algorithm} writes
143 | $$ x_{k+1} = \mathbf{prox}_{\gamma g}\left( x_k - \gamma \nabla f(x_k) \right) . $$
144 |
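\ans{A minimal Python sketch of this iteration (our own naming, assuming a gradient oracle \texttt{grad\_f} and a proximity oracle \texttt{prox\_g} are available):}
\begin{verbatim}
def proximal_gradient(x0, grad_f, prox_g, gamma, n_iter=100):
    x = x0
    for _ in range(n_iter):
        x = prox_g(x - gamma * grad_f(x), gamma)  # x <- prox_{gamma g}(x - gamma grad f(x))
    return x
\end{verbatim}
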
145 | \begin{q_td}[Analysis]
146 | \label{td:ana}\hfill
147 |
148 |
149 | \begin{itemize}
150 | \item[a.] Show that the fixed points of the iteration above are minimizers of $F$.
151 | \item[b.] Connect the proximal gradient with the projected gradient algorithm.
152 | \item[c.] Show that
153 | $$ F(x_{k+1}) \leq F(x_k) - \frac{(2-\gamma L)}{2\gamma} \|x_{k+1} - x_k \|^2 . $$
154 | \emph{\small Hint: Use the descent lemmas for the gradient on smooth functions and the proximal point algorithm.}
155 | \item[d.] Give a range of stepsizes for which the sequence $F(x_k)$ converges as soon as a minimizer exists.
156 | \end{itemize}
157 | \end{q_td}
158 |
159 |
160 | \vspace*{0.5cm}
161 |
162 | \begin{q_td}[Application]
163 | \label{td:app}
164 | The \emph{lasso} problem is a regularized linear regression problem that writes as
165 | $$ \min_{x\in\mathbb{R}^n } \frac{1}{2}\|Ax-b\|^2 + \lambda \|x\|_1 $$
166 | where $A$ is a full rank $m\times n$ matrix and $b$ is a size $m$ vector.
167 | \begin{itemize}
168 | \item[a.] Write the iterations for a proximal gradient algorithm. Which stepsize can be used?
169 | \item[b.] The regularization $\lambda \|x\|_1$ is said to be \emph{sparsity enforcing}, guess why.
170 | \end{itemize}
171 | \end{q_td}
172 |
173 |
174 |
175 |
176 | \end{document}
177 |
--------------------------------------------------------------------------------
/Tuto5_Rates/tuto5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto5_Rates/tuto5.pdf
--------------------------------------------------------------------------------
/Tuto5_Rates/tuto5.tex:
--------------------------------------------------------------------------------
1 | %\documentclass[paper=a4, fontsize=9pt]{article}
2 | \documentclass[a4paper,twoside,10pt]{amsart}
3 |
4 |
5 | %\usepackage[scale=0.8]{geometry}
6 | \usepackage{fullpage}
7 |
8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs
9 | \usepackage[english]{babel} % English language/hyphenation
10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages
11 | \usepackage{xcolor}
12 | \usepackage{hyperref}
13 | \usepackage{tcolorbox}
14 |
15 | \usepackage{tikz}
16 | \usepackage{tkz-graph}
17 |
18 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
19 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
20 | \numberwithin{table}{section} % Number tables within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
21 | \usepackage{graphicx}
22 | \usepackage{caption}
23 | \usepackage{subcaption}
24 |
25 |
26 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height
27 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height
28 |
29 | \newtheorem{theo}{Theorem}
30 | \newtheorem{lemma}{Lemma}
31 | \theoremstyle{definition}
32 | \newtheorem{q_td}{Exercise }
33 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}}
34 | \newtheorem{q_tp}{$\diamond$}
35 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}}
36 |
37 | \begin{document}
38 |
39 | %----------------------------------------------------------------------------------------
40 | % TITLE
41 | %----------------------------------------------------------------------------------------
42 |
43 |
44 | \normalfont \normalsize
45 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\
46 | \noindent\textsc{\small MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s)
47 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule
48 | \begin{center}
49 | {\LARGE \scshape Numerical Optimization \\ Tuto 5: Rates of first-order methods} \\ % The title
50 | \end{center}
51 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler }
52 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule
53 |
54 |
55 |
56 | %----------------------------------------------------------------------------------------
57 | % TD
58 | %----------------------------------------------------------------------------------------
59 | %\newpage
60 | \setcounter{section}{0}
61 | \renewcommand{\thesection}{\Alph{section}}
62 | \renewcommand*{\theHsection}{TD.\the\value{section}}
63 |
64 |
65 | \vspace*{0.5cm}
66 |
67 | In the whole tutorial, we will assume that $f: \mathbb{R}^n\to \mathbb{R}$ is an $L$-smooth \emph{convex} function with minimizers.
68 |
69 | \section{Convergence rates in the strongly convex case}
70 | \vspace*{0.5cm}
71 |
72 |
73 | \begin{q_td}[Some other descent lemmas] \label{td:descent2} \hfill
74 |
75 | The goal of this exercise is to provide useful lemmas for proving convergence rates. Let $x^\star$ be a minimizer of $f$.
76 | \begin{itemize}
77 | \item[a.] Show that for all $x,y\in\mathbb{R}^n$,
78 | $$ f(x) - f(y) \leq \langle x- y ; \nabla f(x) \rangle - \frac{1}{2L} \| \nabla f(x) - \nabla f(y) \|^2 $$
79 | and thus $$ \frac{1}{L} \| \nabla f(x) - \nabla f(y) \|^2 \leq \langle x- y ; \nabla f(x) - \nabla f(y) \rangle \leq L \|x-y\|^2 .$$
80 | \emph{Hint: Define $z=y - \frac{1}{L}(\nabla f(y) - \nabla f(x) ) $.\\ Use convexity to bound $f(x)-f(z)$ and smoothness to bound $f(z) - f(y)$ and sum both inequalities.}
81 | \item[b.] Let $f$ be in addition $\mu$-strongly convex; that is, $f-\frac{\mu}{2}\|\cdot\|^2 $ is convex. Show that for all $x\in\mathbb{R}^n$,
82 | $$ (x-x^\star)^\mathrm{T} \nabla f(x) \geq \frac{\mu L}{\mu + L} \|x-x^\star\|^2 + \frac{1}{\mu + L} \|\nabla f(x)\|^2 .$$
83 | \emph{Hint: Use the fact that $f-\frac{\mu}{2}\|\cdot\|^2 $ is $(L-\mu)$-smooth and question a. }
84 | \end{itemize}
85 | \end{q_td}
86 |
87 |
88 |
89 | \vspace*{0.5cm}
90 |
91 |
92 | \begin{q_td}[Strongly convex case]\label{td:str}\hfill
93 |
94 |
95 | The goal of this exercise is to investigate the convergence rate of the fixed stepsize gradient algorithm on a $\mu$-strongly convex, $L$-smooth function:
96 | $$ x_{k+1} = x_k - \frac{2}{\mu+L} \nabla f(x_k)$$
97 | which will introduce us to the mechanics of Optimization theory.
98 | \begin{itemize}
99 | \item[a.] From \ref{td:descent2}b., prove that
100 | \begin{align*}
101 | \|x_{k+1} - x^\star \|^2 &\leq \left( 1 - \frac{4\mu L}{(\mu+L)^2}\right) \|x_k - x^\star \|^2 \\
102 | &= \left( \frac{\kappa - 1}{ \kappa+1}\right)^2 \|x_k - x^\star \|^2
103 | \end{align*}
104 | where $\kappa=L/\mu$ is the \emph{condition number} of the problem.
105 | \item[b.] Show that
106 | $$ f(x_k) - f(x^\star) \leq \frac{L}{2} \|x_k - x^\star \|^2 .$$
107 | \item[c.] Conclude that for the gradient algorithm with stepsize ${2}/{(\mu+L)}$ we have
108 | $$ f(x_k) - f(x^\star) \leq \left( \frac{\kappa - 1}{ \kappa+1}\right)^{2k} \frac{L\|x_0 - x^\star \|^2}{2} . $$
109 | \end{itemize}
110 | \end{q_td}
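
\ans{For instance, with condition number $\kappa = 10$, the contraction factor per iteration is $\left(\frac{\kappa - 1}{\kappa+1}\right)^2 = \left(\frac{9}{11}\right)^2 \approx 0.67$: the squared distance to the minimizer shrinks by roughly a third at each step.}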
111 |
112 |
113 | \vspace*{0.5cm}
114 |
115 |
116 | \section{Convergence rates in the non-strongly convex case}
117 | \vspace*{0.5cm}
118 |
119 |
120 | \begin{q_td}[Smooth case]\label{td:smooth}\hfill
121 |
122 |
123 | The goal of this exercise is to investigate the convergence rate of the fixed stepsize gradient algorithm on an $L$-smooth function:
124 | $$ x_{k+1} = x_k - \frac{1}{L} \nabla f(x_k)$$
125 | which will introduce us to the mechanics of Optimization theory.
126 | \begin{itemize}
127 | %\item[a.] Deduce from \ref{td:descent2}a. that $ (x-x^\star)^\mathrm{T} \nabla f(x) \geq \frac{1}{2L} \|\nabla f(x) \|^2 $.
128 | \item[a.] Prove that
129 | $$ \|x_{k+1} - x^\star \|^2 \leq \|x_k - x^\star \|^2 - \frac{1}{L^2} \| \nabla f(x_k) \|^2 = \|x_k - x^\star \|^2 - \| x_{k+1} - x_k \|^2 .$$
130 | \item[b.] Show that
131 | $$ \delta_k := f(x_k) - f(x^\star) \leq \|x_k - x^\star \| \cdot \|\nabla f(x_k) \| \leq \|x_1 - x^\star \| \cdot \|\nabla f(x_k) \| .$$
132 | \emph{Hint: Use convexity then a.}
133 | \item[c.] Use smoothness and b. to show that
134 | $$ 0 \leq \delta_{k+1} \leq \delta_k - \underbrace{\frac{1}{2L\|x_1-x^\star\|^2}}_{:=\omega} \delta_k^2 . $$
135 | \item[d.] Deduce that
136 | $$ \frac{1}{\delta_{k+1}} - \frac{1}{\delta_{k}} \geq \omega .$$
137 | \emph{Hint: Divide c. by $ \delta_{k}\delta_{k+1} $}.
138 | \item[e.] Conclude that for the gradient algorithm with stepsize $1/L$ we have
139 | $$ f(x_k) - f(x^\star) \leq \frac{2L\|x_1-x^\star\|^2}{k-1} . $$
140 | \end{itemize}
141 | \end{q_td}
142 |
143 |
144 |
145 | \newpage
146 |
147 |
148 | \begin{tcolorbox}[width=\textwidth,colback={blue!5!white},title={\textbf{Optimization inequalities cheatsheet}},colbacktitle=black,coltitle=white]
149 | For any function $f$:
150 | \begin{itemize}
151 | \item[(convex)] convex
152 | \item[(diff)] differentiable
153 | \item[(min)] with minimizers $X^\star$, $x^\star \in X^\star$
154 | \item[(smooth)] $L$-smooth (differentiable with $\nabla f$ $L$ Lipschitz continuous)
155 | \item[(strong)] $\mu$-strongly convex ($\mu$ can be taken equal to $0$ below)
156 | \end{itemize}
157 |
158 |
159 | \begin{align*}
160 | & f(y) \geq f(x) + (y-x)^\mathrm{T} \nabla f(x) \text{ (convex) + (diff) } \\
161 | \Rightarrow &\langle x-y ; \nabla f(x)-\nabla f(y)\rangle\geq0 \text{ (convex) + (diff) }
162 | \end{align*}
163 |
164 | \begin{align*}
165 | & f(x^\star) \leq f(x) \;\; \forall x \text{ (minimizer) } \\
166 | \Rightarrow & \nabla f(x^\star) = 0 \text{ (convex) + (diff) + (minimizer) }
167 | \end{align*}
168 |
169 |
170 | \begin{align*}
171 | & \|\nabla f(x) - \nabla f(y) \| \leq L \|x-y\| \text{ (smooth) } \\
172 | \Rightarrow & f(x) \leq f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{L}{2} \| x-y\|^2 \text{ (smooth) } \\
173 | \Rightarrow & \langle x-y ; \nabla f(x)-\nabla f(y)\rangle \leq L \|x-y\|^2 \text{ (smooth) }
174 | \end{align*}
175 |
176 |
177 |
178 | \begin{align*}
179 | & f(x) - \frac{\mu}{2}\|x\|^2 \text{ is convex } \text{ (strong) } \\
180 | \Rightarrow & f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{\mu}{2} \| x-y\|^2 \leq f(x) \text{ (strong) + (diff) } \\
181 | \Rightarrow & \mu \|x-y\|^2 \leq \langle x-y ; \nabla f(x)-\nabla f(y)\rangle \text{ (strong) + (diff) }
182 | \end{align*}
183 |
184 |
185 | \vspace*{1cm}
186 |
187 | Combining the above, when $f$ is $\mu$-strongly convex and $L$-smooth:
188 |
189 | \begin{align*}
190 | f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{\mu}{2} \| x-y\|^2 \leq f(x) \leq f(y) + (x-y)^\mathrm{T} \nabla f(y) + \frac{L}{2} \| x-y\|^2
191 | \end{align*}
192 |
193 |
194 |
195 | \begin{align*}
196 | \frac{\mu L}{\mu + L} \|x-y\|^2 + \frac{1}{\mu + L} \|\nabla f(x) - \nabla f(y) \|^2 \leq \langle x-y ; \nabla f(x)-\nabla f(y)\rangle \leq L \|x-y\|^2
197 | \end{align*}
198 |
199 |
200 | If in addition, $f$ is twice differentiable,
201 | \begin{align*}
202 | \mu I \leq \nabla^2 f(x) \leq L I
203 | \end{align*}
204 |
205 | \end{tcolorbox}
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 | \end{document}
214 |
--------------------------------------------------------------------------------
/Tuto6_LPQP/tuto6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iutzeler/NumericalOptimization/347b6c0b57d55c6bb665cbb3d0a83990ea711560/Tuto6_LPQP/tuto6.pdf
--------------------------------------------------------------------------------
/Tuto6_LPQP/tuto6.tex:
--------------------------------------------------------------------------------
1 | %\documentclass[paper=a4, fontsize=9pt]{article}
2 | \documentclass[a4paper,twoside,10pt]{amsart}
3 |
4 |
5 | %\usepackage[scale=0.8]{geometry}
6 | \usepackage{fullpage}
7 |
8 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs
9 | \usepackage[english]{babel} % English language/hyphenation
10 | \usepackage{amsmath,amsfonts,amsthm} % Math packages
11 | \usepackage{xcolor}
12 | \usepackage{hyperref}
13 |
14 | \usepackage{tikz}
15 | \usepackage{tkz-graph}
16 |
17 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
18 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
19 | \numberwithin{table}{section} % Number tables within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4)
20 | \usepackage{graphicx}
21 | \usepackage{caption}
22 | \usepackage{subcaption}
23 |
24 |
25 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height
26 | \newcommand{\ans}[1]{ { \color{gray} \itshape #1} } % Create horizontal rule command with 1 argument of height
27 |
28 | \newtheorem{theo}{Theorem}
29 | \newtheorem{lemma}{Lemma}
30 | \theoremstyle{definition}
31 | \newtheorem{q_td}{Exercise }
32 | \newcommand{\reftd}[1]{ $\circ$ \ref{#1}}
33 | \newtheorem{q_tp}{$\diamond$}
34 | \newcommand{\reftp}[1]{$\diamond$ \ref{#1}}
35 |
36 | \begin{document}
37 |
38 | %----------------------------------------------------------------------------------------
39 | % TITLE
40 | %----------------------------------------------------------------------------------------
41 |
42 |
43 | \normalfont \normalsize
44 | \noindent\textsc{\small Universit\'e Grenoble Alpes }\\
45 | \noindent\textsc{\small MSIAM 1st year} \\ [0.3cm] % Your university, school and/or department name(s)
46 | \horrule{0.5pt} \\[0.4cm] % Thin top horizontal rule
47 | \begin{center}
48 | {\LARGE \scshape Numerical Optimization \\ Tuto 6: Linear and Quadratic Programs} \\ % The title
49 | \end{center}
50 | \noindent\textsc{\hfill L. Desbat \& F. Iutzeler }
51 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule
52 |
53 |
54 |
55 | %----------------------------------------------------------------------------------------
56 | % TD
57 | %----------------------------------------------------------------------------------------
58 | %\newpage
59 | \setcounter{section}{0}
60 | \renewcommand{\thesection}{\Alph{section}}
61 | \renewcommand*{\theHsection}{TD.\the\value{section}}
62 |
63 |
64 | \vspace*{0.5cm}
65 |
66 |
67 |
68 | In this tutorial, we are going to investigate Linear and Quadratic programs, that is, the minimization of linear or quadratic cost functions under linear inequality constraints. Typical formulations of these problems are as follows:
69 |
70 | \vspace*{0.5cm}
71 |
72 |
73 | \begin{minipage}{0.4\textwidth}
74 | \textbf{~~~~~~~~~~ Linear program (LP):}\\
75 | \begin{align*}
76 | \min_{x\in\mathbb{R}^n} & ~~~ c^\mathrm{T} x \\
77 | \text{subject to } & ~~~ Gx \leq h
78 | \end{align*}
79 | \end{minipage}\hfill
80 | \begin{minipage}{0.4\textwidth}
81 | \textbf{~~~~~ Quadratic program (QP):}
82 | \begin{align*}
83 | \min_{x\in\mathbb{R}^n} & ~~~ \frac{1}{2} x^\mathrm{T} P x + q^\mathrm{T} x \\
84 | \text{subject to } & ~~~ Gx \leq h
85 | \end{align*}
86 | \end{minipage}
87 |
88 |
89 | \vspace*{0.5cm}
90 |
91 |
92 | where $c,q\in\mathbb{R}^n$, $G\in\mathbb{R}^{m\times n}$, $ h\in\mathbb{R}^m$, $P\in\mathbb{R}^{n\times n}$.
93 |
94 | \vspace*{0.5cm}
95 |
96 | Although these problems are quite specific, a number of (sub-)problems in signal and data processing can actually be reformulated as linear or quadratic programs. The interest of these reformulations is that there exists a large number of standard libraries implementing computationally efficient LP and QP solvers\footnote{generally based on interior point, active set, simplex, ... algorithms and variants.}.
97 |
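\ans{For instance, in Python, an LP in the canonical form above can be solved with \texttt{scipy.optimize.linprog} (a minimal sketch on arbitrary small data):}
\begin{verbatim}
import numpy as np
from scipy.optimize import linprog

c = np.array([1.0, 2.0])
G = np.array([[-1.0, 0.0], [0.0, -1.0], [1.0, 1.0]])
h = np.array([0.0, 0.0, 1.0])
res = linprog(c, A_ub=G, b_ub=h, bounds=[(None, None)] * 2)
print(res.x, res.fun)
\end{verbatim}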
98 |
99 | \vspace*{0.5cm}
100 |
101 |
102 |
103 | \begin{q_td}[Equivalent problems]\label{td:eq}
104 | Let $f:\mathbb{R}^n\to\mathbb{R}$; we consider the problem
105 | \begin{align*}
106 | \min_{x\in\mathbb{R}^n} & ~~~ f(x) \\
107 | \text{subject to } & ~~~ x \in C
108 | \end{align*}
109 | and we assume that a solution $\bar{x}$ exists. Show that this problem is \emph{equivalent} to solving
110 | \begin{align*}
111 | \min_{(x,r)\in\mathbb{R}^{n+1}} & ~~~ r \\
112 | \text{subject to } & ~~~ f(x) \leq r \\
113 | & ~~~ (x,r)\in C \times \mathbb{R} \subset \mathbb{R}^{n+1}
114 | \end{align*}
115 | in the sense that
116 | \begin{itemize}
117 | \item[(i)] if $\bar{x}$ is a solution of the first problem, then $(\bar{x}, f (\bar{x}))$ is a solution of the second one.
118 | \item[(ii)] if $(\bar{x},\bar{r})$ is a solution of the second problem, then $\bar{x}$ is a solution of the first one.
119 | \end{itemize}
120 |
121 | \end{q_td}
122 |
123 | \vspace*{0.5cm}
124 |
125 |
126 | \begin{q_td}[Linear reformulation]\label{td:ref}
127 | Let $A\in\mathbb{R}^{m\times n}$ and $ b\in\mathbb{R}^m$. Reformulate the problem
128 | \begin{align*}
129 | \min_{x\in\mathbb{R}^n} & ~~~ \|Ax-b\|_\infty
130 | \end{align*}
131 | as a linear problem. Notably, give the corresponding $(c,G,h)$ from the LP formulation.
132 |
133 | \end{q_td}
134 |
135 | \newpage
136 |
137 | \begin{q_td}[Linear reformulation II]\label{td:ref2}
138 | Let $A\in\mathbb{R}^{m\times n}$ and $ b\in\mathbb{R}^m$. Reformulate the problem
139 | \begin{align*}
140 | \min_{x\in\mathbb{R}^n} & ~~~ \|Ax-b\|_1
141 | \end{align*}
142 | as a linear problem by extending the technique of Ex.~\ref{td:eq} (without giving details). Notably, give the corresponding $(c,G,h)$ from the LP formulation.
143 |
144 |
145 | Do the same for the problem
146 | \begin{align*}
147 | \min_{x\in\mathbb{R}^n} & ~~~ \|x\|_1\\
148 | \text{subject to } & ~~~ \|Ax-b\|_\infty \leq 1
149 | \end{align*}
150 |
151 | \end{q_td}
152 |
153 | \vspace*{0.5cm}
154 |
155 | \begin{q_td}[Quadratic reformulation]\label{td:ref3}
156 | We consider the regression model
157 | $$ y=X\theta+\xi,\;\;\xi\sim \mathcal{N}(0, \sigma I_m), $$
158 | where $X\in \mathbb{R}^{m\times n}$ and $y\in \mathbb{R}^m$ are the observed values and $\theta\in \mathbb{R}^n$ is the unknown parameter we want to find. Show that maximizing the (log-)likelihood of $\theta$ amounts to minimizing $\|X\theta-y\|_2^2$.
159 |
160 | Reformulate the maximum likelihood problem under bounded output error as a Quadratic problem.
161 | \begin{align*}
162 | \max_{\theta \in\mathbb{R}^n} & ~~~ \text{likelihood}(\theta) = p(y|\theta) \\
163 | \text{subject to } & ~~~ | y_i - X_i \theta | \leq \varepsilon
164 | \end{align*}
165 | \emph{($X_i$) is the row vector of the $i$-th line of $X$.}
166 |
167 |
168 | What would change if $\xi$ followed a Laplace distribution?
169 | \end{q_td}
170 |
171 | \vspace*{0.5cm}
172 |
173 | \end{document}
174 |
--------------------------------------------------------------------------------