├── Jupyter_notebooks ├── 01-Julia-intro.ipynb ├── 02-derivatives.ipynb ├── 03-bracketing methods.ipynb ├── 04-local-descent.ipynb ├── 05-SD-CG.ipynb ├── 06-other-1st-order-methods.ipynb ├── 07-Newton.ipynb └── 08-quasi-newton.ipynb ├── Lecture_notes ├── 01_intro.pdf ├── 01_intro.tex ├── 02_derivative.pdf ├── 02_derivative.tex ├── 03_bracket.pdf ├── 03_bracket.tex ├── 04_local_decent.pdf ├── 04_local_decent.tex ├── 05_first_order_1.pdf ├── 05_first_order_1.tex ├── 06_first_order_2.pdf ├── 06_first_order_2.tex ├── 06_gradient_descent.pdf ├── 07_Newton_method.pdf ├── 07_Newton_method.tex ├── 08_Quasi_Newton.pdf ├── 08_Quasi_Newton.tex ├── 09_direct_methods.pdf ├── 09_direct_methods.tex ├── 10_Stochastic_methods.tex ├── 11_evolutinary_methods.tex ├── 12_constrained_optimization.tex ├── 13_sampling_plans.tex ├── 14_surrogate_models.tex ├── 15_surrogate_models_prob.tex ├── 16_surrogate_optimization.tex ├── 17_uncertainty.tex ├── 18_symbolic_regression.tex ├── A1_trust_region.pdf ├── A1_trust_region.tex ├── Figs │ ├── C60.png │ ├── CMA.jpeg │ ├── Cross-entropy.jpeg │ ├── EI.jpeg │ ├── GPR-raw.png │ ├── GPR-train-1.png │ ├── GPR-train-2.png │ ├── LB.jpeg │ ├── algo_opt.jpg │ ├── bracket.jpeg │ ├── cauchy.jpeg │ ├── cg-sd.jpg │ ├── constraint-ab.jpeg │ ├── coordinate-improved.jpeg │ ├── coordinate.jpeg │ ├── cross-valid.jpeg │ ├── curvature1.jpeg │ ├── curvature2.jpeg │ ├── derivative-comparison.jpeg │ ├── error-explore.jpeg │ ├── firefly.jpeg │ ├── flat.jpeg │ ├── gaussian.jpeg │ ├── gp-opt.png │ ├── graph1.jpeg │ ├── graph2.jpeg │ ├── grid_search.jpeg │ ├── holdout.jpeg │ ├── ip.gif │ ├── julia-comp.png │ ├── julia.png │ ├── kernel.jpeg │ ├── linesearch.jpeg │ ├── minimum.jpeg │ ├── momentum.jpeg │ ├── multi-minima.jpeg │ ├── n-momentum.jpeg │ ├── newton-1d.jpeg │ ├── orthogonal.jpeg │ ├── powell.jpeg │ ├── prob.jpeg │ ├── pso.jpeg │ ├── quasi-newton.jpeg │ ├── rbf.jpeg │ ├── sample_all.jpeg │ ├── search.jpeg │ ├── selection.jpeg │ ├── sgd.jpeg │ ├── simplex-performance.jpeg │ ├── simplex.jpeg │ ├── simplex_algo.jpeg │ ├── solution-space.jpeg │ ├── strafied.jpeg │ ├── sufficient_decrease.jpeg │ ├── tree.jpeg │ ├── trust-region.jpeg │ ├── two-conditions.jpeg │ ├── uncertainty.jpeg │ ├── uni-proj.jpeg │ └── unimodal.jpeg └── trust.jl ├── README.md └── compile.sh /Jupyter_notebooks/02-derivatives.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2 Derivatives\n", 8 | "\n", 9 | "This notebook was automatically generated from the Algorithms for Optimization source code. Each cell generates a figure from the original text. While this code is not optimized for use in lectures, we provide it here to be adapted for such projects. We hope you find it useful." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "scrolled": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "#import Pkg; \n", 21 | "#Pkg.add(\"SymEngine\");\n", 22 | "using SymEngine" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# 2.1 Analytic gradient " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "scrolled": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "1/2 + 2*x + sin(x)/x^2 - cos(x)/x" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "# one variables\n", 52 | "@vars x;\n", 53 | "f = x^2 + x/2 - sin(x)/x;\n", 54 | "diff(f, x)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "sin(x2)\n", 67 | "x1*cos(x2)\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# many variables\n", 73 | "@vars x1, x2;\n", 74 | "f = x1*sin(x2) + 1;\n", 75 | "println(diff(f, x1))\n", 76 | "println(diff(f, x2))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# 2.2 Numerical gradient\n", 84 | "- Finite difference\n", 85 | "- Complex step" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# define a target function\n", 95 | "f0(x) = x^2 + x/2 - sin(x)/x;" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "0.7333000227808952\n", 108 | "0.733300007879734\n", 109 | "0.7332999929785728\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# Finite difference method\n", 115 | "diff_forward(f, x; h = sqrt(eps(Float64))) = (f(x+h) - f(x))/h;\n", 116 | "diff_central(f, x; h = sqrt(eps(Float64))) = (f(x+h/2) - f(x-h/2))/h;\n", 117 | "diff_backward(f, x; h = sqrt(eps(Float64))) = (f(x) - f(x-h))/h;\n", 118 | "\n", 119 | "\n", 120 | "println(diff_forward(f0, 0.1))\n", 121 | "println(diff_central(f0, 0.1))\n", 122 | "println(diff_backward(f0, 0.1))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "0.7333000119025557\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "# Complex step method\n", 140 | "diff_complex(f, x; h=1e-20) = imag(f(x+h*im))/h\n", 141 | "\n", 142 | "println(diff_complex(f0, 0.1))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "#import Pkg; Pkg.add(\"Zygote\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "(0.7333000119025559,)" 163 | ] 164 | }, 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "# Automatic differentiation\n", 172 | "\n", 173 | "import Zygote: gradient\n", 174 | "gradient(f0, 0.1)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 9, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 
| "text/plain": [ 185 | "(0.07196888754292625, -0.17110198196123422)" 186 | ] 187 | }, 188 | "execution_count": 9, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "f1(a, b) = log(a*b, max(a,2));\n", 195 | "gradient(f1, 3.0, 2.0)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "# 2.3 Automatic Differentiation\n", 203 | "- Dual numbers\n", 204 | "- Forward pass" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### 2.3.1 Dual Number Notation\n", 212 | "\n", 213 | "Instead of D(a,b) we can write a + b ϵ, where ϵ satisfies ϵ^2=0. (Some people like to recall imaginary numbers where an i is introduced with i^2=-1.) \n", 214 | "\n", 215 | "Others like to think of how engineers just drop the O(ϵ^2) terms.\n", 216 | "\n", 217 | "The four rules are\n", 218 | "\n", 219 | "$ (a+b\\epsilon) \\pm (c+d\\epsilon) = (a \\pm c) + (b \\pm d)\\epsilon$\n", 220 | "\n", 221 | "$ (a+b\\epsilon) * (c+d\\epsilon) = (ac) + (bc+ad)\\epsilon$\n", 222 | "\n", 223 | "$ (a+b\\epsilon) / (c+d\\epsilon) = (a/c) + (bc-ad)/c^2 \\epsilon $" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 32, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "promote_rule (generic function with 159 methods)" 235 | ] 236 | }, 237 | "execution_count": 32, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "struct D <: Number # D is a function-derivative pair\n", 244 | " f::Tuple{Float64,Float64}\n", 245 | "end\n", 246 | "\n", 247 | "# Add the last two rules\n", 248 | "import Base: -,*,+, /, convert, promote_rule\n", 249 | "-(x::D, y::D) = D(x.f .- y.f)\n", 250 | "*(x::D, y::D) = D((x.f[1]*y.f[1], (x.f[2]*y.f[1] + x.f[1]*y.f[2])))\n", 251 | "\n", 252 | "+(x::D, y::D) = D(x.f .+ y.f)\n", 253 | "/(x::D, y::D) = D((x.f[1]/y.f[1], (y.f[1]*x.f[2] - x.f[1]*y.f[2])/y.f[1]^2))\n", 254 | "convert(::Type{D}, x::Real) = D((x,zero(x)))\n", 255 | "promote_rule(::Type{D}, ::Type{<:Number}) = D" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 33, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "D((0.0, 1.0))" 267 | ] 268 | }, 269 | "execution_count": 33, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "ϵ = D((0,1))" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 34, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "D((0.0, 0.0))" 287 | ] 288 | }, 289 | "execution_count": 34, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "ϵ * ϵ" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 38, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "D((1.0, -1.0))" 307 | ] 308 | }, 309 | "execution_count": 38, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "1/(1+ϵ)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 39, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "D((3.0, 2.0))" 327 | ] 328 | }, 329 | "execution_count": 39, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "(1+2*ϵ)*(3-4*ϵ)" 336 | ] 337 | 
}, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### 2.3.2 Forward Differentiation" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 17, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "using ForwardDiff" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 18, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "Dual{Nothing}(2.1972245773362196,0.6666666666666666)" 363 | ] 364 | }, 365 | "execution_count": 18, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "a = ForwardDiff.Dual(3,1)\n", 372 | "log(a^2)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 19, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "Dual{Nothing}(2.1972245773362196,0.3333333333333333)" 384 | ] 385 | }, 386 | "execution_count": 19, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "a = ForwardDiff.Dual(3,1)\n", 393 | "b = ForwardDiff.Dual(2,0)\n", 394 | "log(a*b + max(a,2))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [] 410 | } 411 | ], 412 | "metadata": { 413 | "kernelspec": { 414 | "display_name": "Julia 1.0.5", 415 | "language": "julia", 416 | "name": "julia-1.0" 417 | }, 418 | "language_info": { 419 | "file_extension": ".jl", 420 | "mimetype": "application/julia", 421 | "name": "julia", 422 | "version": "1.0.5" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 2 427 | } 428 | -------------------------------------------------------------------------------- /Jupyter_notebooks/04-local-descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 4 Local Descent" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# necessary libraries\n", 17 | "using Plots\n", 18 | "using ForwardDiff\n", 19 | "using Printf\n", 20 | "using LinearAlgebra\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# trial function and gradient\n", 30 | "#f_booth_0(x1, x2) = (x1 + 2*x2 -7)^2 + (2*x1 + x2 -5)^2\n", 31 | "#f_booth((x1, x2)) = [(x1 + 2*x2 -7)^2 + (2*x1 + x2 -5)^2]\n", 32 | "\n", 33 | "f_0(x1, x2) = x1^2 + x1*x2 + x2^2\n", 34 | "f((x1, x2)) = [x1^2 + x1*x2 + x2^2]\n", 35 | "\n", 36 | "function f_prime(a)\n", 37 | "    return ForwardDiff.jacobian(f, a)[1,:]\n", 38 | "end\n", 39 | "\n", 40 | "x0 = [1.0, 2.0]\n", 41 | "println(f(x0))\n", 42 | "println(f_prime(x0))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# plot function\n", 52 | "x = -5:1:8\n", 53 | "y = -5:1:8\n", 54 | "plot(\n", 55 | "contour(x, y, f_0; levels = collect(0:1:30)),\n", 56 | "contourf(x, y, f_0; levels = collect(0:1:30)), \n", 57 | "size=[800, 300]\n", 58 | ")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [
67 | "x0 = [1, 2]\n", 68 | "d = -f_prime(x0)\n", 69 | "n, max = 101, 1\n", 70 | "res = max/(n-1)\n", 71 | "a0 = 0:res:max\n", 72 | "y0 = zeros(n)\n", 73 | "\n", 74 | "println(\"f(x) along the direction: \", d)\n", 75 | "for i in 1:1:n\n", 76 | " y0[i] = f(x0 + a0[i]*d)[1]\n", 77 | " #@printf(\"%4d %8.3f %8.3f %6.3f %6.3f\\n\",i, a0[i], y0[i], (x0+a0[i]*d)[1], (x0+a0[i]*d)[2])\n", 78 | "end\n", 79 | "plot(a0, y0, xlabel=\"a\")\n", 80 | "\n", 81 | " " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "function strong_backtracking(f, ∇, x, d; α=5, β=1e-4, σ=0.1) \n", 91 | " y0, g0, y_prev, α_prev = f(x)[1], ∇(x)⋅d, NaN, 0\n", 92 | " αlo, αhi = NaN, NaN\n", 93 | " # bracket phase\n", 94 | " while true\n", 95 | " y = f(x + α*d)[1]\n", 96 | " if y > y0 + β*α*g0 || (!isnan(y_prev) && y ≥ y_prev) \n", 97 | " αlo, αhi = α_prev, α\n", 98 | " break \n", 99 | " end\n", 100 | " \n", 101 | " g = ∇(x + α*d)⋅d \n", 102 | " if abs(g) ≤ -σ*g0\n", 103 | " return α \n", 104 | " elseif g ≥ 0\n", 105 | " αlo, αhi = α, α_prev\n", 106 | " break \n", 107 | " end\n", 108 | " y_prev, α_prev, α = y, α, 2α \n", 109 | " end\n", 110 | " \n", 111 | " @printf(\"The initial interval: %6.3f %6.3f\\n\", αlo, αhi)\n", 112 | "\n", 113 | " # zoom phase\n", 114 | " ylo = f(x + αlo*d)[1]\n", 115 | " n = 0\n", 116 | " while n < 10\n", 117 | " α = (αlo + αhi)/2\n", 118 | " y = f(x + α*d)[1]\n", 119 | " @printf(\"The interval: %6.3f %6.3f\\n\", αlo, αhi)\n", 120 | " if y > y0 + β*α*g0 || y ≥ ylo #\n", 121 | " @printf(\"No sufficient decrease: %6.3f %6.3f %6.3f %6.3f\\n\", α, y, y0, ylo)\n", 122 | " αhi = α \n", 123 | " else\n", 124 | " g = ∇(x + α*d)⋅d \n", 125 | " if abs(g) ≤ -σ*g0\n", 126 | " return α\n", 127 | " elseif g*(αhi - αlo) ≥ 0\n", 128 | " αhi = αlo \n", 129 | " end\n", 130 | " αlo = α \n", 131 | " end\n", 132 | " n += 1\n", 133 | " end \n", 134 | "end\n", 135 | "\n", 136 | "strong_backtracking(f, f_prime, x0, d)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Julia 1.0.5", 150 | "language": "julia", 151 | "name": "julia-1.0" 152 | }, 153 | "language_info": { 154 | "file_extension": ".jl", 155 | "mimetype": "application/julia", 156 | "name": "julia", 157 | "version": "1.0.5" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } 163 | -------------------------------------------------------------------------------- /Lecture_notes/01_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/01_intro.pdf -------------------------------------------------------------------------------- /Lecture_notes/01_intro.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | 
commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Optimization]{Numerical Optimization 01: Introduction} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the 
bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Syllabus} 139 | \begin{frame}{Syllabus} 140 | 141 | 142 | \begin{columns} 143 | 144 | \begin{column}{.6\textwidth} 145 | We have two goals: 146 | \begin{itemize} 147 | \item Learn Julia programming 148 | \item Understand the optimization methods 149 | \end{itemize} 150 | Subjects to be covered 151 | \begin{itemize} 152 | \item Julia programming 153 | \item Local Optimization 154 | \begin{itemize} 155 | \item Derivatives and Gradients 156 | \item Bracketing 157 | \item First/second-order optimization 158 | \item Gradient-free methods 159 | \item Stochastic methods 160 | \end{itemize} 161 | \item Global Optimization 162 | \item Sampling Plans 163 | \item Surrogate Optimization 164 | \item Expression Optimization 165 | 166 | \end{itemize} 167 | \end{column} 168 | \pause 169 | \begin{column}{.4\textwidth} 170 | \begin{figure} 171 | \centering 172 | \includegraphics[width=30mm]{Figs/algo_opt.jpg} 173 | \end{figure} 174 | We meet virtually twice a week ($\sim$90 minutes each time).\\ 175 | \begin{itemize} 176 | \item review of homework (20-30 mins) 177 | \item lecture (30-50 mins) 178 | \item coding (20-30 mins) 179 | \end{itemize} 180 | \end{column} 181 | 182 | \end{columns} 183 | 184 | \end{frame} 185 | 186 | \section{Why Optimization?} 187 | \begin{frame}{Why optimization} 188 | \begin{columns} 189 | \begin{column}{.55\textwidth} 190 | A typical optimization problem is to 191 | \begin{equation*} 192 | \begin{split} 193 | \textrm{minimize} &~~ f(x)\\ 194 | \textrm{subject to} &~~ x \in X 195 | \end{split} 196 | \end{equation*} 197 | A design point ($x$) can be represented as a vector of values 198 | corresponding to different design variables. 199 | \end{column} 200 | 201 | \begin{column}{.45\textwidth} 202 | \begin{figure} 203 | \centering 204 | \includegraphics[width=40mm]{Figs/solution-space.jpeg} 205 | \end{figure} 206 | \end{column} 207 | \end{columns} 208 | 209 | A \textcolor{blue}{necessary condition} for $f(x)$ to reach a minimum is that \textcolor{blue}{$f'(x)=0$}. 210 | \end{frame} 211 | 212 | 213 | \begin{frame}{Optimization is hard!} 214 | \begin{itemize} 215 | \item $f'(x)=0$ is not a sufficient condition. 216 | \begin{figure} 217 | \centering 218 | \includegraphics[width=80mm]{Figs/minimum.jpeg} 219 | \end{figure} 220 | \item There exist many points where $f'(x)=0$. 221 | \begin{figure} 222 | \centering 223 | \includegraphics[width=60mm]{Figs/multi-minima.jpeg} 224 | \end{figure} 225 | \item $f(x)$ and $f'(x)$ can be hard to evaluate. 226 | 227 | \end{itemize} 228 | \end{frame}
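\begin{frame}[fragile]{Checking the necessary condition in Julia}
A minimal sketch (not part of the original slides), assuming the \texttt{ForwardDiff} package used in the course notebooks: the gradient of Booth's function from the homework vanishes at its minimum $(1, 3)$ but not at other points.
\begin{lstlisting}
using ForwardDiff

# Booth's function: minimum at (1, 3) with f = 0
booth(x) = (x[1] + 2x[2] - 7)^2 + (2x[1] + x[2] - 5)^2

ForwardDiff.gradient(booth, [1.0, 3.0])  # [0.0, 0.0]: necessary condition holds
ForwardDiff.gradient(booth, [0.0, 0.0])  # [-34.0, -38.0]: not a stationary point
\end{lstlisting}
\end{frame}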
229 | 230 | 231 | \section{Why Julia?} 232 | \begin{frame}{Why Julia?} 233 | \textcolor{red}{Run Julia at } \url{https://www.juliabox.com} 234 | 235 | \begin{columns} 236 | 237 | \begin{column}{.5\textwidth} 238 | \begin{itemize} 239 | \item Math-friendly 240 | \item Looks like Python 241 | \item Runs like C/Fortran 242 | \item Growing ecosystem 243 | \end{itemize} 244 | \end{column} 245 | 246 | \begin{column}{.5\textwidth} 247 | \begin{figure} 248 | \centering 249 | \includegraphics[width=40mm]{Figs/julia.png} 250 | \end{figure} 251 | \end{column} 252 | \end{columns} 253 | \begin{figure} 254 | \centering 255 | \includegraphics[width=90mm]{Figs/julia-comp.png} 256 | \end{figure} 257 | \end{frame} 258 | 259 | 260 | \section{Summary} 261 | \begin{frame}{Summary} 262 | \begin{itemize} 263 | \item Optimization in engineering is the process of finding the best system design subject to a set of constraints. 264 | \item Optimization can be transformed into a math problem, but that problem is sometimes hard to solve. 265 | \item We will use the Julia language extensively to learn how to solve optimization problems numerically. 266 | \end{itemize} 267 | \end{frame} 268 | 269 | 270 | \section{Homework} 271 | \begin{frame}{Homework} 272 | In Julia, implement the following trial functions, make their contour plots, and analyze the behavior of their minima. 273 | \begin{itemize} 274 | \item Booth's function 275 | \begin{equation*} 276 | f(x_1, x_2) = (x_1 + 2x_2 -7)^2 + (2x_1 + x_2 -5)^2 277 | \end{equation*} 278 | 279 | \item Branin function 280 | \begin{equation*} 281 | f(x_1, x_2) = a(x_2 - bx_1^2 + cx_1 - r)^2 + s(1-t)\cos(x_1) + s 282 | \end{equation*} 283 | where $a=1, b=5.1/(4\pi^2), c=5/\pi, r=6, s=10, t=1/(8\pi)$. 284 | 285 | \item Rosenbrock's Banana function 286 | \begin{equation*} 287 | f(x_1, x_2) = (a-x_1)^2 + b(x_2-x_1^2)^2 288 | \end{equation*} 289 | where $a=1, b=5$.
290 | 291 | \end{itemize} 292 | More functions can be found at \url{https://en.wikipedia.org/wiki/Test_functions_for_optimization} 293 | \end{frame} 294 | 295 | \end{document} 296 | 297 | -------------------------------------------------------------------------------- /Lecture_notes/02_derivative.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/02_derivative.pdf -------------------------------------------------------------------------------- /Lecture_notes/02_derivative.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Optimization]{Numerical Optimization 02: Derivatives} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Derivative} 139 | \begin{frame}{Derivative} 140 | The goal of optimization is to find the point that minimizes an objective function. Knowing how the value of a function changes (derivative) is useful. 
141 | 142 | \begin{equation*} 143 | f(x + \Delta x) \approx f(x) + f'(x)\Delta x 144 | \end{equation*} 145 | 146 | \begin{equation*} 147 | f'(x) = \lim_{\Delta x \to 0} \frac{f(x + \Delta x) - f(x)}{\Delta x} 148 | \end{equation*} 149 | 150 | Derivatives in multiple dimensions 151 | \begin{equation*} 152 | \textrm{\textcolor{blue}{Jacobian}}~~~~~ \nabla f(x) = \Bigg[\frac{\partial f(x)}{\partial x_1}, \frac{\partial f(x)}{\partial x_2}, \cdots \frac{\partial f(x)}{\partial x_n} \Bigg] 153 | \end{equation*} 154 | 155 | \begin{equation*} 156 | \textrm{\textcolor{blue}{Hessian}}~~~~~ \nabla^2 f(x) = 157 | \begin{bmatrix} 158 | \frac{\partial^2 f(x)}{\partial x_1^2 } & \frac{\partial^2 f(x)}{\partial x_1 \partial x_2 } & \cdots \frac{\partial^2 f(x)}{\partial x_1 \partial x_n }\\ 159 | & \vdots & \\ 160 | \frac{\partial^2 f(x)}{\partial x_n \partial x_1} & \frac{\partial^2 f(x)}{\partial x_n \partial x_2 } & \cdots \frac{\partial^2 f(x)}{\partial x_n^2 } \\ 161 | \end{bmatrix} 162 | \end{equation*} 163 | \end{frame} 164 | 165 | \section{Numerical Differentiation} 166 | \begin{frame}{Numerical Differentiation} 167 | For practical applications, we rely on numerical methods to evaluate the derivatives. 168 | 169 | \begin{itemize} 170 | \item Finite Difference Methods 171 | \begin{equation*} 172 | f'(x) \approx 173 | \begin{cases} 174 | & \frac{f(x+h)-f(x)}{h} ~~~~~~~~~~~~~~~~~\textrm{forward} \\ 175 | & \frac{f(x+h/2)-f(x-h/2)}{h} ~~~~~~~~~~\textrm{central}\\ 176 | & \frac{f(x)-f(x-h)}{h} ~~~~~~~~~~~~~~~~~\textrm{backward} 177 | \end{cases} 178 | \end{equation*} 179 | \item Complex Step Method 180 | \begin{equation*} 181 | f'(x) \approx \textrm{imag}(f(x+ih))/h 182 | \end{equation*} 183 | %\item Automatic Differentiation 184 | \end{itemize} 185 | \end{frame} 186 | 187 | \begin{frame}{Finite Difference - forward} 188 | \begin{equation*} 189 | f(x+h) = f(x) + \frac{f'(x)}{1!}h + \frac{f''(x)}{2!}h^2 + \frac{f'''(x)}{3!}h^3 + \cdots 190 | \end{equation*} 191 | \pause 192 | We can rearrange it to 193 | 194 | \begin{equation*} 195 | \begin{split} 196 | f'(x)h &= f(x+h) - f(x) - \frac{f''(x)}{2!}h^2 - \frac{f'''(x)}{3!}h^3 - \cdots \\ 197 | f'(x) &= \frac{f(x+h) - f(x)}{h} - \frac{f''(x)}{2!}h - \frac{f'''(x)}{3!}h^2 - \cdots \\ 198 | f'(x) &= \frac{f(x+h) - f(x)}{h} + O(h) 199 | \end{split} 200 | \end{equation*} 201 | Therefore, the forward difference has linear error. 202 | \end{frame} 203 | 204 | 205 | \begin{frame}{Finite Difference - central} 206 | \begin{equation*} 207 | \begin{split} 208 | f(x+h/2) &= f(x) + \frac{f'(x)}{1!}\frac{h}{2} + \frac{f''(x)}{2!}(\frac{h}{2})^2 + \frac{f'''(x)}{3!}(\frac{h}{2})^3 + \cdots \\ 209 | f(x-h/2) &= f(x) - \frac{f'(x)}{1!}\frac{h}{2} + \frac{f''(x)}{2!}(\frac{h}{2})^2 - \frac{f'''(x)}{3!}(\frac{h}{2})^3 + \cdots \\ 210 | f'(x) &= \frac{f(x+h/2) - f(x-h/2)}{h} + O(h^2) 211 | \end{split} 212 | \end{equation*} 213 | 214 | Therefore, the central difference has quadratic error. 215 | 216 | \end{frame}
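\begin{frame}[fragile]{Finite differences in Julia}
A minimal sketch of the three finite-difference estimates; the same definitions appear in the \texttt{02-derivatives} notebook, and the test function \texttt{f0} is only for illustration.
\begin{lstlisting}
# one-sided and central difference quotients
diff_forward(f, x; h=sqrt(eps(Float64)))  = (f(x+h) - f(x))/h
diff_central(f, x; h=sqrt(eps(Float64)))  = (f(x+h/2) - f(x-h/2))/h
diff_backward(f, x; h=sqrt(eps(Float64))) = (f(x) - f(x-h))/h

f0(x) = x^2 + x/2 - sin(x)/x   # test function from the notebook
diff_forward(f0, 0.1)          # 0.7333000227808952
diff_central(f0, 0.1)          # 0.733300007879734
diff_backward(f0, 0.1)         # 0.7332999929785728
\end{lstlisting}
\end{frame}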
217 | 218 | \begin{frame}{Complex Step} 219 | According to the Taylor expansion, 220 | \begin{equation*} 221 | f(x+ih) = f(x) + ihf'(x) - h^2\frac{f''(x)}{2!} - ih^3\frac{f'''(x)}{3!} + \cdots 222 | \end{equation*} 223 | \pause 224 | If we take only the imaginary part, 225 | \begin{equation*} 226 | \begin{split} 227 | &\textrm{Im} (f(x+ih)) = hf'(x) - h^3\frac{f'''(x)}{3!} + \cdots \\ 228 | &f'(x) = \frac{\textrm{Im}(f(x+ih))}{h} + h^2\frac{f'''(x)}{3!} - \cdots = \frac{\textrm{Im}(f(x+ih))}{h} + O(h^2) 229 | \end{split} 230 | \end{equation*} 231 | \pause 232 | While the real part is 233 | \begin{equation*} 234 | \begin{split} 235 | \textrm{Re}(f(x+ih)) &= f(x) - h^2\frac{f''(x)}{2!} + \cdots \\ 236 | f(x) &= \textrm{Re}(f(x+ih)) + O(h^2) 237 | \end{split} 238 | \end{equation*} 239 | \pause 240 | The complex step method is advantageous since 241 | 242 | \begin{itemize} 243 | \item Both $f(x)$ and $f'(x)$ can be evaluated in a single run 244 | \item $f'(x)$ has a quadratic error 245 | \end{itemize} 246 | 247 | \end{frame} 248 | 249 | \begin{frame}{Comparison} 250 | 251 | \begin{figure} 252 | \centering 253 | \includegraphics[width=120mm]{Figs/derivative-comparison.jpeg} 254 | \end{figure} 255 | 256 | \textcolor{blue}{Homework: reproduce the above figure by yourself!} 257 | \end{frame} 258 | 259 | \begin{frame}{Why is complex step better than central difference?} 260 | \begin{equation*} 261 | \begin{split} 262 | f(x+ih) &= u(x,h) + iv(x,h)\\ 263 | \textrm{Im}(f(x+ih)) &= h\frac{\partial v(x, y)}{\partial y}|_{y=0} + O(h^2) 264 | \end{split} 265 | \end{equation*} 266 | If $v(x,0)=0$, then $f(x)=u(x,0)$. Dividing by $h$, we obtain $f'(x)$: 267 | \begin{equation*} 268 | \frac{\partial v(x, y)}{\partial y}|_{y=0} = \frac{\partial u(x, 0)}{\partial x} 269 | \end{equation*} 270 | The left side is what the complex step computes. The right side, equal to it by the \textcolor{blue}{Cauchy-Riemann equations}, is what the finite difference approximates. Note that the two methods use two different functions ($u$ and $v$). 271 | 272 | 273 | \end{frame} 274 | 275 | \begin{frame}{Why is complex step better than central difference?} 276 | 277 | Consider the function $f(z) = z^2$, 278 | \begin{equation*} 279 | f(z) = z^2 = x^2 - y^2 + i2xy 280 | \end{equation*} 281 | The finite difference works with the real part $x^2 - y^2$, while the complex step uses the imaginary part: $2xy/y$ gives exactly $2x$ for any $h=y>0$. 282 | \vspace{10mm} 283 | 284 | \pause 285 | Try another function: 286 | \begin{equation*} 287 | \cos(x+iy)=\cos(x)\textrm{cosh}(y) - i\sin(x)\textrm{sinh}(y). 288 | \end{equation*} 289 | The imaginary part is $-\sin(x)\sinh(y)$; dividing by a small $y$ gives $-\sin(x)$, since $\sinh(y)/y \rightarrow 1$. 290 | 291 | \end{frame} 292 | 293 | 294 | \section{Automatic Differentiation} 295 | \begin{frame}{Dual numbers} 296 | Dual numbers can be expressed mathematically by including the abstract quantity $\epsilon$, where $\epsilon^2 = 0$, so that 297 | \begin{equation*} 298 | \begin{split} 299 | (a+b\epsilon)+(c+d\epsilon) &= (a+c) + (b+d)\epsilon\\ 300 | (a+b\epsilon)*(c+d\epsilon) &= (ac) + (ad+bc)\epsilon 301 | \end{split} 302 | \end{equation*} 303 | 304 | The function's evaluation and derivative can be expressed simultaneously in an \textcolor{blue}{exact manner}.
\begin{equation*} 306 | \begin{split} 307 | f(x) &= \sum_{k=0}^\infty \frac{f^{(k)}(a)}{k!}(x-a)^k \\ 308 | f(a+b\epsilon) &= \sum_{k=0}^\infty \frac{f^{(k)}(a)}{k!}(a+b\epsilon-a)^k 309 | = \sum_{k=0}^\infty \frac{f^{(k)}(a)b^k\epsilon^k}{k!}\\ 310 | &= f(a) + bf'(a)\epsilon + \epsilon^2 \sum_{k=2}^\infty \frac{f^{(k)}(a)b^k}{k!}\epsilon^{k-2}\\ 311 | &= f(a) + bf'(a)\epsilon 312 | \end{split} 313 | \end{equation*} 314 | 315 | \end{frame} 316 | 317 | \begin{frame}{Express a function as the computational graph} 318 | Suppose we have a target function 319 | \begin{equation*} 320 | f(a, b) = \ln(ab + \textrm{max}(a, 2)) 321 | \end{equation*} 322 | 323 | It can be expressed as 324 | \begin{figure} 325 | \centering 326 | \includegraphics[width=80mm]{Figs/graph1.jpeg} 327 | \end{figure} 328 | 329 | \end{frame} 330 | 331 | \begin{frame}{The derivative from the computational graph} 332 | Suppose we have a target function 333 | \begin{equation*} 334 | f(a, b) = \ln(ab + \textrm{max}(a, 2)) 335 | \end{equation*} 336 | The derivative is 337 | \begin{equation*} 338 | \frac{df}{dx} = \frac{df}{dc_4}\frac{dc_4}{dx} 339 | = \frac{df}{dc_4}\bigg(\frac{dc_4}{dc_3}\frac{dc_3}{dx}\bigg) 340 | = \frac{df}{dc_4}\bigg(\frac{dc_4}{dc_3}\bigg(\frac{dc_3}{dc_2}\frac{dc_2}{dx} + \frac{dc_3}{dc_1}\frac{dc_1}{dx}\bigg)\bigg) 341 | \end{equation*} 342 | 343 | \begin{figure} 344 | \centering 345 | \includegraphics[width=80mm]{Figs/graph2.jpeg} 346 | \end{figure} 347 | 348 | \end{frame} 349 | 350 | \section{Summary} 351 | \begin{frame}{Summary} 352 | \begin{itemize} 353 | \item Derivatives are important for optimization. 354 | \item We rely on numerical derivatives in practical optimization. 355 | \item Finite differences are the easiest way to compute derivatives. 356 | \item The complex step method has better accuracy. 357 | \item Dual numbers allow the exact evaluation of the function and its derivative simultaneously. 358 | \item Automatic differentiation methods include forward and reverse accumulation on computational graphs. 359 | \end{itemize} 360 | \end{frame} 361 | \end{document} 362 | 363 | -------------------------------------------------------------------------------- /Lecture_notes/03_bracket.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/03_bracket.pdf -------------------------------------------------------------------------------- /Lecture_notes/03_bracket.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33
| \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Optimization]{Numerical Optimization 03: Bracket and Zoom} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this 
block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Bracketing Methods} 139 | \begin{frame}{Bracketing} 140 | \begin{itemize} 141 | \item Bracketing identifies an interval in which a local minimum lies and then successively shrinks that interval. 142 | \item It applies to a unimodal function. 143 | \end{itemize} 144 | A \textcolor{blue}{unimodal function} $f$ is one where there is a unique $x_0$, such that $f$ is monotonically decreasing for $x \leq x_0$ and monotonically increasing for $x \geq x_0$. It follows from this definition that the unique global minimum is at $x_0$, and there are no other local minima. 145 | \begin{figure} 146 | \centering 147 | \includegraphics[width=60mm]{Figs/unimodal.jpeg} 148 | \end{figure} 149 | \end{frame} 150 | 151 | %\section{Finding an Initial Bracket} 152 | \begin{frame}{Initial Bracket} 153 | When optimizing a function, we often start by first bracketing an interval containing a local minimum. 154 | \begin{itemize} 155 | \item Starting at a given point, take a trial step in the positive direction (e.g., $10^{-2}$). 156 | \item Search in the downhill direction until a new point exceeds the lowest point found so far. 157 | \item Expand the step size by a factor of 2 at each step (a code sketch follows the Fibonacci Search slide). 158 | \end{itemize} 159 | \begin{figure} 160 | \centering 161 | \includegraphics[width=120mm]{Figs/bracket.jpeg} 162 | \end{figure} 163 | \end{frame} 164 | 165 | 166 | \section{Fibonacci Search} 167 | \begin{frame}{Fibonacci Search} 168 | \begin{columns} 169 | \begin{column}{.6\textwidth} 170 | Suppose we have a unimodal $f$ bracketed by the interval $[a, b]$. 171 | \begin{itemize} 172 | \item Query $f$ on the 1/3 and 2/3 points on the interval 173 | \item Query $f$ on the center of the new interval 174 | \item Three queries thus shrink the interval by a factor of three 175 | \item $\cdots$ 176 | \end{itemize} 177 | 178 | This actually follows a \textcolor{red}{Fibonacci sequence}! 179 | \begin{equation*} 180 | F_n = 181 | \begin{cases} 182 | 1 & \textrm{if~} n\leq 2\\ 183 | F_{n-1} + F_{n-2} & \textrm{otherwise} 184 | \end{cases} 185 | \end{equation*} 186 | 187 | \end{column} 188 | 189 | \begin{column}{.4\textwidth} 190 | \end{column} 191 | 192 | \end{columns} 193 | \end{frame} 194 |
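\begin{frame}[fragile]{Code sketch: finding an initial bracket}
A minimal sketch of the bracketing procedure from the Initial Bracket slide; the default step $s=10^{-2}$ and growth factor $k=2$ follow that slide, while the function signature and return convention are illustrative assumptions.
\begin{lstlisting}
function bracket_minimum(f, x=0.0; s=1e-2, k=2.0)
    a, ya = x, f(x)
    b, yb = a + s, f(a + s)
    if yb > ya                  # wrong direction: search downhill the other way
        a, b, ya, yb = b, a, yb, ya
        s = -s
    end
    while true                  # expand until the function increases again
        c, yc = b + s, f(b + s)
        yc > yb && return a < c ? (a, c) : (c, a)
        a, ya, b, yb = b, yb, c, yc
        s *= k
    end
end

bracket_minimum(x -> (x - 2)^2)   # returns an interval containing x = 2
\end{lstlisting}
\end{frame}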
195 | \begin{frame}{Fibonacci Search Algorithm} 196 | Let's try to formulate the problem in a more rigorous way. 197 | Ideally, we want to shrink the interval $[a, b]$ as much as possible within $n$ iterations. At iteration $k$, the interval shrinks by the ratio 198 | \begin{columns} 199 | 200 | \begin{column}{0.6 \textwidth} 201 | \begin{equation*} 202 | b_{k+1} - a_{k+1} = \frac{F_{n-k}}{F_{n-k+1}}(b_k - a_k) 203 | \end{equation*} 204 | \begin{equation*} 205 | \begin{split} 206 | F_0 &= F_1 = 1,\\ 207 | F_{k+1} &= F_k + F_{k-1}~~~~~(k=1, 2, \cdots), 208 | \end{split} 209 | \end{equation*} 210 | Therefore, 211 | \begin{equation*} 212 | \begin{split} 213 | b_n - a_n &= \frac{F_1}{F_2}(b_{n-1}- a_{n-1})\\ 214 | &= \frac{F_1}{F_2}\frac{F_2}{F_3} \cdots \frac{F_{n-1}}{F_n} (b_1-a_1)\\ 215 | &= \frac{1}{F_n}(b_1 - a_1) 216 | %= \frac{1}{r^n}(b_1 - a_1) 217 | \end{split} 218 | \end{equation*} 219 | \end{column} 220 | 221 | \begin{column}{0.4 \textwidth} 222 | \centering 223 | \textcolor{blue}{Solution of $F_k$}\\ 224 | Let $F_k = \tau^k$, so that 225 | \begin{equation*} 226 | \tau^2 = \tau + 1 227 | \end{equation*} 228 | \begin{equation*} 229 | \begin{split} 230 | \tau_{1,2} &= \frac{1\pm \sqrt{5}}{2}\\ 231 | F_k & = A\tau_1^k + B\tau_2^k 232 | \end{split} 233 | \end{equation*} 234 | 235 | Since $F_0 = F_1 = 1$, 236 | \begin{equation*} 237 | F_k = \frac{1}{\sqrt{5}} (\tau_1^{k+1} - \tau_2^{k+1}) 238 | \end{equation*} 239 | \end{column} 240 | 241 | \end{columns} 242 | 243 | \end{frame} 244 | 245 | 246 | 247 | \begin{frame}{Fibonacci Algorithm} 248 | Suppose we have a unimodal $f$ bracketed by the interval $[a, b]$. 249 | \begin{enumerate} 250 | \item Query $f$ on two points ($\lambda_k, \mu_k$) on the interval $[a_k, b_k]$ 251 | \begin{equation*} 252 | \begin{split} 253 | \lambda_k &= a_k + \bigg(1-\frac{F_{n-k}}{F_{n-k+1}}\bigg)(b_k - a_k)\\ 254 | \mu_k &= a_k + \frac{F_{n-k}}{F_{n-k+1}}(b_k - a_k) 255 | \end{split} 256 | \end{equation*} 257 | \item If $f(\lambda_k) > f(\mu_k)$, go to step 3; otherwise, go to step 4 258 | \item If $b_k - \lambda_k < \delta$, terminate. Otherwise, set 259 | \begin{equation*} 260 | a_{k+1} = \lambda_k, ~~ b_{k+1} = b_k, ~~ \lambda_{k+1} = \mu_k, 261 | \end{equation*} and compute the new $\mu_{k+1}$ as in step 1 262 | \item If $\mu_k - a_k \leq \delta$, terminate. Otherwise, set 263 | \begin{equation*} 264 | a_{k+1} = a_k, ~~ b_{k+1} = \mu_k, ~~ \mu_{k+1} = \lambda_k, 265 | \end{equation*} and compute the new $\lambda_{k+1}$ as in step 1 266 | \item k += 1, go to step 2 267 | \end{enumerate} 268 | \end{frame} 269 | 270 | \section{0.618 Search} 271 | 272 | \begin{frame}{Golden Ratio Search} 273 | If we take the limit for large $k$, the ratio between successive values of the Fibonacci sequence approaches the golden ratio: 274 | 275 | \begin{equation*} 276 | \lim_{k\rightarrow\infty} \frac{F_{k-1}}{F_k} = \frac{\sqrt{5}-1}{2} \approx 0.618 277 | \end{equation*} 278 | 279 | Therefore, we can always use 0.618 and 0.382 to place the two query points within the updated interval.\\ 280 | 281 | Both the Fibonacci and golden ratio searches converge linearly. Fibonacci search is in principle the optimal strategy for bracketing a unimodal function; however, the golden ratio search is more popular due to its simplicity. 282 | 283 | \end{frame} 284 | 285 | \begin{frame}{Comparison} 286 | 287 | \begin{figure} 288 | \centering 289 | \includegraphics[width=120mm]{Figs/search.jpeg} 290 | \end{figure} 291 | 292 | \textcolor{blue}{Homework: reproduce the above figure by yourself!} 293 | \end{frame} 294 |
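\begin{frame}[fragile]{Code sketch: golden ratio search}
A minimal sketch of the 0.618 search on a bracketed unimodal function, following the standard formulation (it may also help with the homework figure); \texttt{n} counts function evaluations.
\begin{lstlisting}
function golden_section_search(f, a, b; n=50)
    ρ = (sqrt(5) - 1)/2          # 0.618...
    d = ρ*b + (1 - ρ)*a          # interior point at the 0.618 position
    yd = f(d)
    for i in 1:n-1
        c = ρ*a + (1 - ρ)*b      # mirror point at the 0.382 position
        yc = f(c)
        if yc < yd
            b, d, yd = d, c, yc  # keep the sub-interval around c
        else
            a, b = b, c          # keep the sub-interval around d
        end
    end
    a < b ? (a, b) : (b, a)
end

golden_section_search(x -> (x - 2)^2, 0.0, 10.0)   # shrinks toward x = 2
\end{lstlisting}
\end{frame}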
295 | \section{Interpolation} 296 | 297 | \begin{frame}{Interpolation with the help of gradient} 298 | Both the Fibonacci and 0.618 searches need no gradient information. However, if the gradient is available, the minimum can be located even faster. The idea is to approximate the target function in an analytic manner: \begin{itemize} 299 | \item Linear: bisection 300 | \item Quadratic fit 301 | \item Cubic interpolation 302 | 303 | 304 | \end{itemize} 305 | 306 | \end{frame} 307 | 308 | \begin{frame}{Bisection} 309 | The bisection method maintains a bracket $[a, b]$ in which at least one root is known to exist. If $f$ is continuous on $[a, b]$, and there is some $y \in [ f (a), f (b)]$, then the intermediate value theorem stipulates that there exists at least one $x \in [a, b]$, such that $f(x) = y$. It follows that a bracket $[a, b]$ is guaranteed to contain a zero if $f(a)$ and $f(b)$ have opposite signs. To minimize $f$, we apply bisection to its derivative $f'$: 310 | 311 | \begin{itemize} 312 | \item Start with $[a_1, b_1]$ 313 | \item If $f'(a_k) \leq 0$ and $f'(b_k) \geq 0$, let $c_k = \frac{1}{2} (a_k + b_k)$ 314 | \item If $f'(c_k) \geq 0$, let $a_{k+1} = a_k$, $b_{k+1}=c_k$; otherwise, $a_{k+1}=c_k$, $b_{k+1}=b_k$ 315 | \item Terminate if $(b_{k+1} - a_{k+1}) \leq \delta $ or the given number of iterations is reached 316 | \end{itemize} 317 | 318 | The bisection method is commonly used to find the roots of a function, i.e., the points where the function is zero. 319 | \end{frame} 320 | 321 | \begin{frame}{Quadratic fit search} 322 | Given the bracketing points $a < b < c$ with $y_a = f(a)$, $y_b = f(b)$, $y_c = f(c)$, we can fit a quadratic function through the three points and query the minimum of the fit as the next point: 323 | \begin{equation*} 324 | x = \frac{1}{2}\frac{y_a(b^2-c^2) + y_b(c^2-a^2) + y_c(a^2-b^2)}{y_a(b-c) + y_b(c-a) + y_c(a-b)} 325 | \end{equation*} 326 | \end{frame} 327 | 328 | \end{document} 329 | -------------------------------------------------------------------------------- /Lecture_notes/04_local_decent.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/04_local_decent.pdf -------------------------------------------------------------------------------- /Lecture_notes/04_local_decent.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode<presentation> { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme.
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Local Descent]{Numerical Optimization 04: Local Descent} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{A general model for optimization} 139 | \begin{frame}{Optimization involving multivariate functions} 140 | Similar to the single variable function, a common approach to optimization is to incrementally improve a design point $x$ by taking a step that minimizes the objective value based on a local model. The local model may be obtained, for example, from a first- or second-order Taylor approximation. 141 | \begin{itemize} 142 | \item Check whether $x_k$ satisfies the termination conditions. If it does, terminate; otherwise proceed to the next step. 
143 | \item Determine the descent direction $d_k$ using local information such as the gradient or Hessian.
144 | \item Determine the step size or learning rate $\alpha_k$.
145 | \item Compute the next design point according to:
146 | \begin{equation*}
147 | x_{k+1} = x_k + \alpha_k d_k
148 | \end{equation*}
149 | \end{itemize}
150 |
151 | \end{frame}
152 |
153 | \section{Line Search}
154 | \begin{frame}{Line Search}
155 | Assume that we have chosen a descent direction $d$. We then need to choose the step factor $\alpha$ to obtain our next design point. One approach is to use \textcolor{blue}{line search}, which selects the step factor that minimizes the one-dimensional function:
156 | \begin{equation*}
157 | \underset{\alpha}{\textrm{minimize}}: f(x+\alpha d)
158 | \end{equation*}
159 |
160 | Line search is a univariate optimization problem, which was covered in the previous lecture. We can apply the univariate optimization method of our choice. To inform the search, we can use the \textcolor{blue}{derivative} of the line search objective, which is simply the directional derivative along $d$ at $x + \alpha d$.\\
161 |
162 | One needs to be cautious in choosing $\alpha$. Large steps give faster convergence but risk overshooting the minimum; smaller steps are more stable but converge slowly. A fixed step factor $\alpha$ is sometimes referred to as a learning rate.
163 |
164 | \end{frame}
165 |
166 |
167 | \begin{frame}{Approximate line search}
168 | It is often more computationally efficient to perform more iterations of a descent method than to do exact line search at each iteration. In this case, the goal is to \textcolor{blue}{find a suitable step size with a small number of evaluations}.
169 |
170 | Ideally, the step size should satisfy the following conditions:
171 |
172 | \begin{itemize}
173 | \item Sufficient decrease
174 | \begin{equation*}
175 | f(x^{k+1}) \leq f(x^k) + \beta\alpha \nabla _{d^k} f(x^k)
176 | \end{equation*}
177 | \item Curvature condition
178 | \begin{equation*}
179 | \nabla _{d^k} f(x^{k+1}) \geq \sigma \nabla _{d^k} f(x^k)
180 | \end{equation*}
181 | \end{itemize}
182 |
183 |
184 | \end{frame}
185 |
186 | \begin{frame}{Sufficient decrease}
187 | \begin{equation*}
188 | f(x^{k+1}) \leq f(x^k) + \beta\alpha \nabla _{d^k} f(x^k)
189 | \end{equation*}
190 |
191 | where $\beta \in [0, 1]$; a common choice is $\beta = 10^{-4}$.
192 | \begin{figure}
193 | \centering
194 | \includegraphics[width=120mm]{Figs/sufficient_decrease.jpeg}
195 | \end{figure}
196 |
197 | \textcolor{blue}{Question: what will happen if you adjust $\beta$?}
198 | \end{frame}
199 |
200 | \begin{frame}{Curvature condition}
201 | \begin{equation*}
202 | \nabla _{d^k} f(x^{k+1}) \geq \sigma \nabla _{d^k} f(x^k)
203 | \end{equation*}
204 | where $\sigma$ controls how shallow the next directional derivative must be.
205 | It is common to set $\beta < \sigma < 1$, with $\sigma = 0.1$ in the conjugate gradient method and $0.9$ in Newton's method.
206 | \begin{figure}
207 | \centering
208 | \includegraphics[width=120mm]{Figs/curvature1.jpeg}
209 | \end{figure}
210 | \end{frame}
211 |
212 |
213 | \begin{frame}{More restrictive curvature condition (strong Wolfe)}
214 | \begin{equation*}
215 | |\nabla _{d^k} f(x^{k+1})| \leq -\sigma \nabla _{d^k} f(x^k)
216 | \end{equation*}
217 | where $\sigma$ controls how shallow the next directional derivative must be.
218 | It is common to set $\beta < \sigma < 1$, with $\sigma = 0.1$ in the conjugate gradient method and $0.9$ in Newton's method.
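As a quick numeric check with a toy example (the function and all parameter values here are illustrative, not from the text): take $f(x) = x^2$ at $x^k = 1$ with $d^k = -2$, so that $\nabla_{d^k} f(x^k) = f'(1)\, d^k = -4$. Choosing $\beta = 10^{-4}$, $\sigma = 0.9$, and $\alpha = 0.5$ gives $x^{k+1} = 0$. Sufficient decrease holds, since $f(0) = 0 \leq f(1) + \beta\alpha(-4) \approx 0.9998$, and the strong curvature condition holds, since $|\nabla_{d^k} f(x^{k+1})| = |f'(0)\, d^k| = 0 \leq -\sigma(-4) = 3.6$.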
219 | \begin{figure}
220 | \centering
221 | \includegraphics[width=120mm]{Figs/curvature2.jpeg}
222 | \end{figure}
223 | \end{frame}
224 |
225 | \begin{frame}{When both conditions are applied}
226 | \begin{figure}
227 | \centering
228 | \includegraphics[width=120mm]{Figs/two-conditions.jpeg}
229 | \end{figure}
230 | \end{frame}
231 |
232 | \section{A practical line search}
233 | \begin{frame}{Graphical illustration of line search}
234 |
235 | \begin{itemize}
236 | \item Find an initial bracket
237 | \item Apply Fibonacci search, golden section (0.618), or bisection until the step satisfies the conditions
238 | \end{itemize}
239 |
240 | \begin{figure}
241 | \centering
242 | \includegraphics[width=120mm]{Figs/linesearch.jpeg}
243 | \end{figure}
244 |
245 | \end{frame}
246 |
247 | \begin{frame}{Termination conditions}
248 | \begin{itemize}
249 | \item Maximum iterations.
250 | \item Absolute improvement. If the change is smaller than a given threshold, the search terminates:
251 | \begin{equation*}
252 | f(x_k) - f(x_{k+1}) < \epsilon_a
253 | \end{equation*}
254 |
255 | \item Relative improvement. If the change relative to the current function value is smaller than a given threshold, the search terminates:
256 | \begin{equation*}
257 | f(x_k) - f(x_{k+1}) < \epsilon_r |f(x_k)|
258 | \end{equation*}
259 |
260 | \item Gradient magnitude. We can also terminate based on the magnitude of the gradient:
261 | \begin{equation*}
262 | |\nabla f(x_{k+1})| < \epsilon_g
263 | \end{equation*}
264 | \end{itemize}
265 |
266 | \end{frame}
267 |
268 |
269 |
270 | \section{Summary}
271 | \begin{frame}{Summary}
272 | \begin{itemize}
273 | \item Descent direction methods incrementally descend toward a local optimum.
274 | \item Univariate optimization can be applied during line search.
275 | \item Approximate line search can be used to identify appropriate descent step sizes.
276 | \item Termination conditions for descent methods can be based on multiple criteria 277 | \end{itemize} 278 | \end{frame} 279 | \end{document} 280 | 281 | -------------------------------------------------------------------------------- /Lecture_notes/05_first_order_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/05_first_order_1.pdf -------------------------------------------------------------------------------- /Lecture_notes/05_first_order_1.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 |
71 | %\usecolortheme{albatross}
72 | %\usecolortheme{beaver}
73 | %\usecolortheme{beetle}
74 | %\usecolortheme{crane}
75 | %\usecolortheme{dolphin}
76 | %\usecolortheme{dove}
77 | %\usecolortheme{fly}
78 | %\usecolortheme{lily}
79 | %\usecolortheme{orchid}
80 | %\usecolortheme{rose}
81 | %\usecolortheme{seagull}
82 | %\usecolortheme{seahorse}
83 | %\usecolortheme{whale}
84 | %\usecolortheme{wolverine}
85 |
86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line
88 |
89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
90 | }
91 |
92 | \usepackage{graphicx} % Allows including images
93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
94 | %\usepackage {tikz}
95 | \usepackage{tkz-graph}
96 | \GraphInit[vstyle = Shade]
97 | \tikzset{
98 | LabelStyle/.style = { rectangle, rounded corners, draw,
99 | minimum width = 2em, fill = yellow!50,
100 | text = red, font = \bfseries },
101 | VertexStyle/.append style = { inner sep=5pt,
102 | font = \normalsize\bfseries},
103 | EdgeStyle/.append style = {->, bend left} }
104 | \usetikzlibrary {positioning}
105 | %\usepackage {xcolor}
106 | \definecolor {processblue}{cmyk}{0.96,0,0,0}
107 | %----------------------------------------------------------------------------------------
108 | % TITLE PAGE
109 | %----------------------------------------------------------------------------------------
110 |
111 | \title[Gradient Descent]{Numerical Optimization 05: 1st order methods} %
112 |
113 | \author{Qiang Zhu} % Your name
114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
115 | {
116 | University of Nevada Las Vegas\\ % Your institution for the title page
117 | \medskip
118 | }
119 | \date{\today} % Date, can be changed to a custom date
120 |
121 | \begin{document}
122 |
123 | \begin{frame}
124 | \titlepage % Print the title page as the first slide
125 | \end{frame}
126 |
127 | \begin{frame}
128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it
129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
130 | \end{frame}
131 |
132 | %----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 |
138 | \section{In choosing the direction}
139 | \begin{frame}{The choice of descent direction}
140 | In the previous lecture, we discussed that the general strategy for optimization is to choose a direction and then use a line search to obtain a sufficient decrease. Repeating this many times, we expect to arrive at a local minimum.
141 | \begin{equation*}
142 | x^{k+1} = x^k + \alpha^k d^k
143 | \end{equation*}
144 | The search direction often has the form
145 | \begin{equation}
146 | d^k = -(B^k)^{-1} \nabla f(x^k)
147 | \end{equation}
148 |
149 | where $B^k$ is a symmetric and nonsingular matrix.
In some methods (e.g., steepest descent), $B^k$ is the identity matrix, while in (quasi-)Newton methods, $B^k$ is the exact or approximate Hessian.
150 |
151 | In this lecture, we will cover the \textcolor{blue}{first-order} methods, which \textcolor{blue}{rely purely on gradient information}.
152 |
153 | \end{frame}
154 |
155 | \section{Gradient Descent}
156 | \begin{frame}{Gradient descent}
157 | An intuitive choice for the descent direction is the direction of steepest descent, where $g^k = \nabla f(x^k)$:
158 | \begin{equation*}
159 | d^k = - \frac{g^k}{||g^k||}
160 | \end{equation*}
161 |
162 | If we optimize the step size at each step, we have
163 | \begin{equation*}
164 | \alpha^k = \underset{\alpha}{\arg \min} f(x^k + \alpha d^k)
165 | \end{equation*}
166 |
167 | Since the optimal step size satisfies
168 |
169 | \begin{equation*}
170 | \nabla f(x^k + \alpha^k d^k)^T d^k = 0
171 | \end{equation*}
172 | and the next direction is
173 |
174 | \begin{equation*}
175 | d^{k+1} = - \frac{\nabla f(x^k + \alpha^k d^k)}{||\nabla f(x^k + \alpha^k d^k)||}
176 | \end{equation*}
177 |
178 | two consecutive directions are \textcolor{blue}{orthogonal}:
179 |
180 | \begin{equation*}
181 | (d^{k+1})^T d^k = 0
182 | \end{equation*}
183 |
184 | \end{frame}
185 |
186 | \section{Conjugate gradient}
187 | \begin{frame}{Conjugate gradient}
188 | Gradient descent can perform poorly in narrow valleys. The conjugate gradient method overcomes this issue by choosing search directions that are mutually conjugate, effectively accounting for the curvature of the objective.
189 |
190 | Minimizing the quadratic function
191 | \begin{equation*}
192 | \underset{x}{\textrm{minimize}}: f(x) = \frac{1}{2} x^T A x - b^T x
193 | \end{equation*}
194 |
195 | is equivalent to solving the linear equation
196 | \begin{equation*}
197 | Ax = b
198 | \end{equation*}
199 | where $A$ is $N \times N$, symmetric, and positive definite, and thus $f$ has a unique local minimum.
200 |
201 | When solving $Ax = b$, a powerful method is to find a sequence of $N$ \textcolor{blue}{conjugate directions} satisfying
202 | \begin{equation*}
203 | (d^i)^T A d^j = 0 ~~~ (i\neq j)
204 | \end{equation*}
205 |
206 | \end{frame}
207 |
208 | \begin{frame}{To find the successive conjugate directions}
209 | One can start with the direction of steepest descent
210 | \begin{equation*}
211 | d^1 = - g^1
212 | \end{equation*}
213 | We then use line search to find the next design point. For quadratic functions $f= \frac{1}{2} x^T A x - b^T x $, the step factor $\alpha$ can be computed as
214 | \begin{equation*}
215 | \begin{split}
216 | \frac{\partial f(x+\alpha d)}{\partial \alpha} & = \frac{\partial}{\partial\alpha} \Bigg[\frac{1}{2} (x+\alpha d)^T A (x+\alpha d) - b^T (x+\alpha d) \Bigg]\\
217 | & = d^T A(x + \alpha d) - d^T b\\
218 | & = d^T(Ax - b) + \alpha d^T A d
219 | \end{split}
220 | \end{equation*}
221 | Setting this derivative to zero,
222 | \begin{equation*}
223 | \alpha = - \frac{d^T(Ax - b)}{d^T A d}
224 | \end{equation*}
225 |
226 | Then the update is
227 | \begin{equation*}
228 | x^2 = x^1 + \alpha d^1
229 | \end{equation*}
230 |
231 | \end{frame}
232 |
233 | \begin{frame}{To find the successive conjugate directions (continued)}
234 | For the next step
235 | \begin{equation*}
236 | d^{k+1} = -g^{k+1} + \beta^k d^k
237 | \end{equation*}
238 | where $\beta^k$ is a scalar parameter. Larger values of $\beta^k$ indicate that the previous descent direction contributes more strongly.
239 |
240 | We solve for $\beta^k$ from the conjugacy condition:
241 | \begin{gather*}
242 | (d^{k+1})^T A d^{k} = 0 \\
243 | (-g^{k+1} + \beta^k d^{k})^T A d^{k} = 0\\
244 | -(g^{k+1})^T A d^{k} + \beta^k (d^{k})^T A d^{k} = 0 \\
245 | \beta^k = \frac{(g^{k+1})^T A d^{k}}{(d^{k})^T A d^{k}}
246 | \end{gather*}
247 | The conjugate gradient method is exact for quadratic functions, but it can be applied to non-quadratic functions as well when a quadratic function is a good local approximation.
248 |
249 | \end{frame}
250 |
251 | \begin{frame}{To Approximate $A$ and $\beta$}
252 | Unfortunately, we do not know the value of $A$ that best approximates $f$ around $x^k$, so $\beta^k$ is instead computed from gradients alone.
253 |
254 | \begin{alertblock}{Fletcher-Reeves}
255 | \begin{equation*}
256 | \beta^k = \frac{g^{(k)T} g^{(k)}}{g^{(k-1)T} g^{(k-1)}}
257 | \end{equation*}
258 | \end{alertblock}
259 | \vfill
260 | \begin{alertblock}{Polak-Ribière}
261 | \begin{equation*}
262 | \beta^k = \frac{g^{(k)T} (g^{(k)}-g^{(k-1)})}{g^{(k-1)T} g^{(k-1)}}
263 | \end{equation*}
264 | \end{alertblock}
265 |
266 | \end{frame}
267 |
268 |
269 | \begin{frame}{Comparison between Conjugate Gradient and Steepest Descent}
270 | \begin{figure}
271 | \centering
272 | \includegraphics[width=120mm]{Figs/cg-sd.jpg}
273 | \end{figure}
274 | \end{frame}
275 |
276 |
277 |
278 | \section{Summary}
279 | \begin{frame}{Summary}
280 | \begin{itemize}
281 | \item Gradient descent follows the direction of steepest descent.
282 | \item Two consecutive search directions in gradient descent are orthogonal.
283 | \item In conjugate gradient, the search directions are conjugate with respect to an approximate Hessian.
284 | \item Both SD and CG work together with a line search method.
285 | \end{itemize}
286 | \end{frame}
287 | \end{document}
288 |
289 |

--------------------------------------------------------------------------------
/Lecture_notes/06_first_order_2.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/06_first_order_2.pdf

--------------------------------------------------------------------------------
/Lecture_notes/06_first_order_2.tex:
--------------------------------------------------------------------------------

1 | \documentclass{beamer}
2 | \usepackage{amsmath}
3 | \usepackage{hyperref}
4 | \usepackage{listings}
5 | \usepackage{xcolor}
6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
7 | \definecolor{codegreen}{rgb}{0,0.6,0}
8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
9 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
11 |
12 | \lstdefinestyle{mystyle}{
13 | backgroundcolor=\color{backcolour},
14 | commentstyle=\color{codegreen},
15 | keywordstyle=\color{magenta},
16 | numberstyle=\tiny\color{codegray},
17 | stringstyle=\color{codepurple},
18 | basicstyle=\ttfamily\footnotesize,
19 | breakatwhitespace=false,
20 | breaklines=true,
21 | captionpos=b,
22 | keepspaces=true,
23 | %numbers=left,
24 | numbersep=5pt,
25 | showspaces=false,
26 | showstringspaces=false,
27 | showtabs=false,
28 | tabsize=2
29 | }
30 |
31 | \lstset{style=mystyle}
32 |
33 | \mode<presentation> {
34 |
35 | % The Beamer class comes with a number of default slide themes
36 | % which change the colors and layouts of slides. Below this is a list
37 | % of all the themes, uncomment each in turn to see what they look like.
38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | \usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 06: 1st order methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | 
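\begin{frame}[fragile]{An illustrative sketch: fixed learning rate in Julia}
A minimal, self-contained Julia sketch of the kind of fixed-learning-rate update this lecture covers (the quadratic test problem and all parameter values are illustrative assumptions, not from the text); $\beta = 0$ recovers plain gradient descent:
\begin{lstlisting}
# Toy quadratic objective f(x) = 0.5 x'Ax - b'x with gradient Ax - b
A = [4.0 1.0; 1.0 3.0]
b = [1.0, 2.0]
grad(x) = A*x - b

function descend(x; alpha=0.1, beta=0.0, iters=200)
    v = zero(x)                  # momentum accumulator
    for _ in 1:iters
        v = beta*v - alpha*grad(x)
        x = x + v
    end
    return x
end

descend([0.0, 0.0])              # fixed step size, no momentum
descend([0.0, 0.0]; beta=0.9)    # momentum speeds up flat directions
\end{lstlisting}
\end{frame}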
%----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 | \section{Gradient methods with fixed learning rate}
138 | \begin{frame}{Gradient methods with fixed learning rate}
139 | We have discussed the steepest descent and conjugate gradient methods, which usually work together with a line search. Alternatively, it is popular to use a fixed learning rate with gradient descent. However, the standard version can take a long time to traverse a nearly flat surface.
140 | Several improved methods have been proposed; they are commonly used for training neural networks in machine learning.
141 | \begin{columns}
142 | \begin{column}{.4\textwidth}
143 | \begin{itemize}
144 | \item Momentum
145 | \item Nesterov Momentum
146 | \item Adagrad
147 | \item RMSProp
148 | \item Adadelta
149 | \item Adam
150 | \end{itemize}
151 | \end{column}
152 |
153 | \begin{column}{.6\textwidth}
154 | \begin{figure}
155 | \centering
156 | \includegraphics[width=60mm]{Figs/flat.jpeg}
157 | \end{figure}
158 | \end{column}
159 | \end{columns}
160 | \end{frame}
161 |
162 |
163 |
164 | \section{Momentum}
165 | \begin{frame}{Momentum}
166 | Allowing momentum to accumulate is one way to speed up progress.
167 | We can modify gradient descent to incorporate momentum:
168 | \begin{gather*}
169 | \boldsymbol{v}^{k+1} = \beta \boldsymbol{v}^k - \alpha \boldsymbol{g}^k \\
170 | \boldsymbol{x}^{k+1} = \boldsymbol{x}^k + \boldsymbol{v}^{k+1}
171 | \end{gather*}
172 | When $\beta = 0$, it reduces to plain gradient descent.
173 | \begin{figure}
174 | \centering
175 | \includegraphics[width=100mm]{Figs/momentum.jpeg}
176 | \end{figure}
177 |
178 |
179 | \end{frame}
180 |
181 | \section{Nesterov Momentum}
182 | \begin{frame}{Nesterov Momentum}
183 | One issue with momentum is that the steps do not slow down enough at the bottom of a valley, so it tends to \textcolor{blue}{overshoot the valley}. Nesterov Momentum remedies the issue by evaluating the gradient at the look-ahead point:
184 | \begin{gather*}
185 | \boldsymbol{v}^{k+1} = \beta \boldsymbol{v}^k - \alpha \nabla f(\boldsymbol{x}^k + \beta \boldsymbol{v}^k) \\
186 | \boldsymbol{x}^{k+1} = \boldsymbol{x}^k + \boldsymbol{v}^{k+1}
187 | \end{gather*}
188 |
189 | \begin{figure}
190 | \centering
191 | \includegraphics[width=100mm]{Figs/n-momentum.jpeg}
192 | \end{figure}
193 |
194 | \end{frame}
195 |
196 | \section{Adagrad}
197 | \begin{frame}{Adagrad}
198 | Momentum and Nesterov Momentum update all components of $\boldsymbol{x}$ with the same learning rate. The adaptive subgradient method (Adagrad) adapts a learning rate for each component of $\boldsymbol{x}$:
199 | \begin{gather*}
200 | x_i^{k+1} = x_i^k - \frac{\alpha}{\epsilon + \sqrt{s_i^k}} g_i^k \\
201 | s_i^k = \sum_{j=1}^k \bigg(g_i^j\bigg)^2
202 | \end{gather*}
203 | where $\epsilon$ is a small value, on the order of 1e-8, to prevent division by zero.
204 | Adagrad is far less sensitive to the learning rate $\alpha$.
205 | \end{frame}
206 |
207 | \section{RMSProp and Adadelta}
208 | \begin{frame}{RMSProp and Adadelta}
209 |
210 | In Adagrad, the accumulated sum $s_i$ grows monotonically, so the effective learning rate monotonically decreases. To prevent this, \textcolor{blue}{RMSProp} maintains a decaying average of squared gradients.
211 | \begin{equation*}
212 | \boldsymbol{s}^{k+1} = \gamma \boldsymbol{s}^k + (1-\gamma)(\boldsymbol{g}^k \odot \boldsymbol{g}^k)
213 | \end{equation*}
214 |
215 | where $\gamma$ is between 0 and 1, typically 0.9.
216 |
217 | \begin{equation*}
218 | \begin{split}
219 | x_i^{k+1} &= x_i^k - \frac{\alpha}{\epsilon + \sqrt{s_i^k}} g_i^k \\
220 | &= x_i^k - \frac{\alpha}{\epsilon + \textrm{RMS}(g_i)} g_i^k
221 | \end{split}
222 | \end{equation*}
223 |
224 | \textcolor{blue}{Adadelta} goes one step further and replaces the learning rate $\alpha$ with an exponentially decaying average of the previous updates $\delta x$, eliminating the learning-rate parameter entirely:
225 |
226 | \begin{equation*}
227 | x_i^{k+1} = x_i^k - \frac{\textrm{RMS}(\delta x_i)}{\epsilon + \textrm{RMS}(g_i)} g_i^k
228 | \end{equation*}
229 |
230 | \end{frame}
231 |
232 | \section{Adam}
233 | \begin{frame}{Adam}
234 |
235 | The adaptive moment estimation method (Adam) is so far the most widely used optimization method in neural network training.
236 | It maintains both an exponentially decaying average of squared gradients (like RMSProp and Adadelta) and an exponentially decaying average of gradients (like momentum).
237 | \begin{equation*}
238 | \begin{split}
239 | \boldsymbol{v}^{k+1} &= \gamma_v \boldsymbol{v}^k + (1-\gamma_v) \boldsymbol{g}^k \\
240 | \boldsymbol{s}^{k+1} &= \gamma_s \boldsymbol{s}^k + (1-\gamma_s) \bigg(\boldsymbol{g}^k \odot \boldsymbol{g}^k \bigg)\\
241 | \hat{\boldsymbol{v}}^{k+1} &= \boldsymbol{v}^{k+1}/(1-\gamma_v^k)\\
242 | \hat{\boldsymbol{s}}^{k+1} &= \boldsymbol{s}^{k+1}/(1-\gamma_s^k)\\
243 | \boldsymbol{x}^{k+1} &= \boldsymbol{x}^k - \alpha \hat{\boldsymbol{v}}^{k+1}/\bigg(\epsilon + \sqrt{\hat{\boldsymbol{s}}^{k+1}}\bigg)
244 | \end{split}
245 | \end{equation*}
246 |
247 |
248 | \end{frame}
249 |
250 |
251 | \section{Summary}
252 | \begin{frame}{Summary}
253 | \begin{itemize}
254 | \item Descent methods with momentum build up progress in favorable directions.
255 | \item A wide variety of accelerated descent methods use special techniques to speed up descent.
256 | \end{itemize}
257 | \end{frame}
258 | \end{document}
259 |
260 |

--------------------------------------------------------------------------------
/Lecture_notes/06_gradient_descent.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/06_gradient_descent.pdf

--------------------------------------------------------------------------------
/Lecture_notes/07_Newton_method.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/07_Newton_method.pdf

--------------------------------------------------------------------------------
/Lecture_notes/07_Newton_method.tex:
--------------------------------------------------------------------------------

1 | \documentclass{beamer}
2 | \usepackage{amsmath}
3 | \usepackage{hyperref}
4 | \usepackage{listings}
5 | \usepackage{xcolor}
6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
7 | \definecolor{codegreen}{rgb}{0,0.6,0}
8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
9 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
11 |
12 | \lstdefinestyle{mystyle}{
13 | backgroundcolor=\color{backcolour},
14 | commentstyle=\color{codegreen},
15 | keywordstyle=\color{magenta},
16 | numberstyle=\tiny\color{codegray},
17 | stringstyle=\color{codepurple},
18 |
basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 07: 2nd order methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | 
\begin{document}
122 |
123 | \begin{frame}
124 | \titlepage % Print the title page as the first slide
125 | \end{frame}
126 |
127 | \begin{frame}
128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it
129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
130 | \end{frame}
131 |
132 | %----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 |
138 | \section{Newton's method}
139 | \begin{frame}{Newton's method}
140 | In optimization, first-order information helps determine the direction of travel, but not how far to step to reach the local minimum. A better approach is to also use second-order information.
141 |
142 | In univariate optimization, the quadratic approximation about a point $x^k$ comes from the second-order Taylor expansion:
143 | \begin{equation*}
144 | q(x) = f(x^k) + (x-x^k)f'(x^k) + \frac{(x-x^k)^2}{2}f''(x^k)
145 | \end{equation*}
146 | Setting the derivative to zero,
147 | \begin{equation*}
148 | \begin{split}
149 | \frac{\partial q(x)}{\partial x} &= f'(x^k) + (x-x^k)f''(x^k) = 0 \\
150 | x^{k+1} &= x^k - \frac{f'(x^k)}{f''(x^k)}
151 | \end{split}
152 | \end{equation*}
153 |
154 | \end{frame}
155 |
156 | \begin{frame}{Various cases}
157 | \begin{figure}
158 | \centering
159 | \includegraphics[width=80mm]{Figs/newton-1d.jpeg}
160 | \end{figure}
161 |
162 | \end{frame}
163 |
164 |
165 | \section{Extension to multivariate optimization}
166 | \begin{frame}{Extension to multivariate optimization}
167 | If $f$ is a multivariate function,
168 | \begin{equation*}
169 | f(\boldsymbol{x}) \approx q(\boldsymbol{x}) = f(\boldsymbol{x}^k) + (\boldsymbol{g}^k)^T(\boldsymbol{x}-\boldsymbol{x}^k)
170 | + \frac{1}{2} (\boldsymbol{x}-\boldsymbol{x}^k)^T \boldsymbol{H}^k (\boldsymbol{x}-\boldsymbol{x}^k)
171 | \end{equation*}
172 |
173 | Setting the gradient to zero,
174 | \begin{equation*}
175 | \nabla q(\boldsymbol{x}) = \boldsymbol{g}^k + \boldsymbol{H}^k (\boldsymbol{x}-\boldsymbol{x}^k) = \boldsymbol{0} \quad\Rightarrow\quad \boldsymbol{x}^{k+1} = \boldsymbol{x}^k - (\boldsymbol{H}^k)^{-1} \boldsymbol{g}^k
176 | \end{equation*}
177 |
178 | \begin{alertblock}{Quiz}
179 | Booth's function is
180 | \begin{equation*}
181 | f(\boldsymbol{x}) = (x_1 + 2x_2 -7)^2 + (2x_1 + x_2 -5)^2
182 | \end{equation*}
183 | Use Newton's method to find the minimum, starting from $\boldsymbol{x}$ = [9, 8].
184 |
185 |
186 | \end{alertblock}
187 |
188 | \end{frame}
189 |
190 | \begin{frame}{Newton's method with line search}
191 | Newton's method can also be used to supply a descent direction to line search or can be modified to use a step factor. Smaller steps toward the minimum or line searches along the descent direction can increase the method's robustness. The descent direction is:
192 | \begin{equation*}
193 | \boldsymbol{d}^k = -(\boldsymbol{H}^k)^{-1}\boldsymbol{g}^k
194 | \end{equation*}
195 |
196 | \end{frame}
197 |
198 | \section{Secant Method}
199 | \begin{frame}{Secant Method}
200 | Newton's method for \textcolor{blue}{univariate} function minimization requires both the first and second derivatives. However, the second derivative can be difficult to compute in some cases.
The secant method instead estimates the second derivative from two successive first derivatives:
201 | \begin{gather*}
202 | f''(x^k) \approx \frac{f'(x^k) - f'(x^{k-1})} {x^k-x^{k-1}}
203 | \end{gather*}
204 |
205 | The secant method requires \textcolor{blue}{an additional initial design point}. It suffers from the same problems as Newton's method when the quadratic approximation is poor.
206 |
207 | \end{frame}
208 |
209 |
210 |
211 | \section{Summary}
212 | \begin{frame}{Summary}
213 | \begin{itemize}
214 | \item Incorporating second-order information in descent methods often speeds convergence.
215 | \item Newton's method is a root-finding method that leverages second-order information to quickly descend to a local minimum.
216 | \item The secant method approximates Newton's method when second-order information is not directly available.
217 | \end{itemize}
218 | \end{frame}
219 | \end{document}
220 |
221 |

--------------------------------------------------------------------------------
/Lecture_notes/08_Quasi_Newton.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/08_Quasi_Newton.pdf

--------------------------------------------------------------------------------
/Lecture_notes/09_direct_methods.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/09_direct_methods.pdf

--------------------------------------------------------------------------------
/Lecture_notes/09_direct_methods.tex:
--------------------------------------------------------------------------------

1 | \documentclass{beamer}
2 | \usepackage{amsmath}
3 | \usepackage{hyperref}
4 | \usepackage{listings}
5 | \usepackage{xcolor}
6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
7 | \definecolor{codegreen}{rgb}{0,0.6,0}
8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
9 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
11 |
12 | \lstdefinestyle{mystyle}{
13 | backgroundcolor=\color{backcolour},
14 | commentstyle=\color{codegreen},
15 | keywordstyle=\color{magenta},
16 | numberstyle=\tiny\color{codegray},
17 | stringstyle=\color{codepurple},
18 | basicstyle=\ttfamily\footnotesize,
19 | breakatwhitespace=false,
20 | breaklines=true,
21 | captionpos=b,
22 | keepspaces=true,
23 | %numbers=left,
24 | numbersep=5pt,
25 | showspaces=false,
26 | showstringspaces=false,
27 | showtabs=false,
28 | tabsize=2
29 | }
30 |
31 | \lstset{style=mystyle}
32 |
33 | \mode<presentation> {
34 |
35 | % The Beamer class comes with a number of default slide themes
36 | % which change the colors and layouts of slides. Below this is a list
37 | % of all the themes, uncomment each in turn to see what they look like.
38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 09: Direct Methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | 
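\begin{frame}[fragile]{An illustrative sketch: cyclic coordinate search in Julia}
A minimal Julia sketch previewing the first method covered below (the Booth test function, search bounds, and tolerances are illustrative assumptions, not from the text):
\begin{lstlisting}
using LinearAlgebra

f(x) = (x[1] + 2x[2] - 7)^2 + (2x[1] + x[2] - 5)^2  # Booth function

# Golden-section line search along coordinate i only
function coord_line_search(f, x, i; lo=-10.0, hi=10.0, tol=1e-8)
    phi = (sqrt(5) - 1)/2
    at(t) = (y = copy(x); y[i] = t; y)
    a, b = lo, hi
    while b - a > tol
        c, d = b - phi*(b - a), a + phi*(b - a)
        f(at(c)) < f(at(d)) ? (b = d) : (a = c)
    end
    return at((a + b)/2)
end

function cyclic_coordinate_search(f, x; tol=1e-8, maxcycles=100)
    for _ in 1:maxcycles
        x_old = copy(x)
        for i in 1:length(x)
            x = coord_line_search(f, x, i)
        end
        norm(x - x_old) < tol && break  # a full cycle made no progress
    end
    return x
end

cyclic_coordinate_search(f, [0.0, 0.0])  # converges near [1.0, 3.0]
\end{lstlisting}
\end{frame}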
%----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 |
138 | \section{Direct methods without gradient}
139 | \begin{frame}{Direct method}
140 | Direct methods rely solely on the objective function $f$. They are also called
141 | \begin{itemize}
142 | \item zero-order
143 | \item black box
144 | \item pattern search
145 | \item derivative free
146 | \end{itemize}
147 |
148 | The most important feature is that they do not rely on derivative information.
149 | They use other criteria to choose the next search direction and to judge whether the search has converged.
150 |
151 | \end{frame}
152 |
153 | \section{Cyclic Coordinate Search}
154 | \begin{frame}{Cyclic Coordinate Search}
155 | This method simply alternates between coordinate directions in its line search. The search starts from an initial $\boldsymbol{x}^1$ and optimizes the first coordinate:
156 | \begin{columns}
157 |
158 | \begin{column}{.6\textwidth}
159 | \begin{equation*}
160 | \boldsymbol{x}^2 = \underset{x_1}{\arg \min} f(x_1, x_2^1, x_3^1, \cdots, x_n^1)
161 | \end{equation*}
162 |
163 | Then, it moves to the next coordinate,
164 | \begin{equation*}
165 | \boldsymbol{x}^3 = \underset{x_2}{\arg \min} f(x_1^2, x_2, x_3^2, \cdots, x_n^2)
166 | \end{equation*}
167 | This process is equivalent to doing a sequence of line searches along the set of $n$ basis vectors.
168 | It is terminated after no significant improvement is made.
169 | \end{column}
170 | \pause
171 | \begin{column}{.4\textwidth}
172 | \begin{figure}
173 | \centering
174 | \includegraphics[width=30mm]{Figs/coordinate.jpeg}
175 | \end{figure}
176 | \end{column}
177 | \end{columns}
178 |
179 | \end{frame}
180 |
181 | \begin{frame}{Acceleration}
182 | Similar to the momentum method in gradient descent, the cyclic method can be augmented with an acceleration step to help traverse diagonal valleys. For each full cycle starting with $\boldsymbol{x}^1$ from 1 to $n$, an additional line search is conducted along
183 | the direction of $\boldsymbol{x}^{n+1}-\boldsymbol{x}^1$.
184 |
185 | \begin{figure}
186 | \centering
187 | \includegraphics[width=60mm]{Figs/coordinate-improved.jpeg}
188 | \end{figure}
189 |
190 | \end{frame}
191 |
192 |
193 |
194 | \section{Powell's method}
195 | \begin{frame}{Powell's method}
196 | This algorithm maintains a list of search directions $\boldsymbol{u}^1, \cdots, \boldsymbol{u}^n$, which are initially the basis vectors.
197 | Starting at $\boldsymbol{x}^1$, Powell's method conducts a line search along each direction in turn, updating the design point each time.
198 | Each direction is then shifted down by one index (dropping $\boldsymbol{u}^1$).
199 | The last direction is replaced with the direction of $\boldsymbol{x}^{n+1} - \boldsymbol{x}^1$.
200 |
201 |
202 | \begin{columns}
203 | \begin{column}{.7\textwidth}
204 | \begin{equation*}
205 | \begin{split}
206 | \boldsymbol{x}^{i+1} &\leftarrow \textrm{line search}(f, \boldsymbol{x}^i, \boldsymbol{u}^i) \quad \textrm{for } i \textrm{ in } 1, \cdots, n\\
207 | \boldsymbol{u}^{i} &\leftarrow \boldsymbol{u}^{i+1} \quad \textrm{for } i \textrm{ in } 1, \cdots, n-1\\
208 | \boldsymbol{u}^{n} &\leftarrow \boldsymbol{x}^{n+1} - \boldsymbol{x}^1
209 | \end{split}
210 | \end{equation*}
211 | \end{column}
212 |
213 | %\pause
214 | \begin{column}{.3\textwidth}
215 | \begin{figure}
216 | \centering
217 | \includegraphics[width=30mm]{Figs/powell.jpeg}
218 | \end{figure}
219 | \end{column}
220 |
221 | \end{columns}
222 |
223 | Powell showed that for quadratic functions, after $k$ full iterations the last $k$ directions are mutually conjugate.
224 | It is recommended to reset the directions to the basis vectors every $n$ or $n+1$ iterations.
225 |
226 | \end{frame}
227 |
228 | \section{Nelder-Mead Simplex Method}
229 | \begin{frame}{Nelder-Mead Simplex Method}
230 |
231 | The Nelder-Mead simplex method uses a simplex to traverse the space in search
232 | of a minimum. A simplex is a polyhedron with $n+1$ vertices in $n$-dimensional space.
233 | \begin{columns}
234 | \begin{column}{0.4 \textwidth}
235 | \begin{itemize}
236 | \item $x_h$, the point with the highest $f$,
237 | \item $x_s$, the point with the second-highest $f$,
238 | \item $x_l$, the point with the lowest $f$,
239 | \item $\Bar{x}$, the mean point excluding $x_h$.
240 | \end{itemize}
241 | \end{column}
242 | \begin{column}{0.6 \textwidth}
243 | \begin{itemize}
244 | \item Reflection. $x_r = \Bar{x} + (\Bar{x} - x_h)$,
245 | \item Expansion. $x_e = \Bar{x} + 2(x_r - \Bar{x})$,
246 | \item Contraction. $x_c = \Bar{x} + 0.5(x_h - \Bar{x})$,
247 | \item Shrinkage, halving the distance of all points to $x_l$.
248 | \end{itemize}
249 | \end{column}
250 |
251 | \end{columns}
252 |
253 | \begin{figure}
254 | \centering
255 | \includegraphics[width=120mm]{Figs/simplex.jpeg}
256 | \end{figure}
257 | \end{frame}
258 |
259 |
260 | \begin{frame}{Nelder-Mead Simplex Algorithm}
261 |
262 | \begin{figure}
263 | \centering
264 | \includegraphics[width=110mm]{Figs/simplex_algo.jpeg}
265 | \end{figure}
266 | \end{frame}
267 |
268 | \begin{frame}{Nelder-Mead Simplex method in practice}
269 |
270 | \begin{figure}
271 | \centering
272 | \includegraphics[width=110mm]{Figs/simplex-performance.jpeg}
273 | \end{figure}
274 | \end{frame}
275 |
276 | \section{Summary}
277 | \begin{frame}{Summary}
278 | \begin{itemize}
279 | \item Direct methods rely solely on the objective function and do not use derivative information.
280 | \item Cyclic coordinate search optimizes one coordinate direction at a time.
281 | \item Powell's method adapts the set of search directions based on the direction of progress.
282 | \item The Nelder-Mead simplex method uses a simplex to search the design space, adaptively expanding and contracting the size of the simplex in response to evaluations of the objective function.
283 | \end{itemize} 284 | \end{frame} 285 | \end{document} 286 | 287 | -------------------------------------------------------------------------------- /Lecture_notes/10_Stochastic_methods.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 10: Stochastic Methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Noisy Descent} 139 | \begin{frame}{Noisy Descent} 140 | Adding stochasticity to gradient descent can be beneficial in large nonlinear optimization problems. Saddle points, where the gradient is very close to zero, can cause descent methods to select step sizes that are too small to be useful. One approach is to add Gaussian noise at each descent step 141 | 142 | \begin{equation*} 143 | \boldsymbol{x}^{k+1} \leftarrow \boldsymbol{x}^k - \alpha^k \boldsymbol{g}^k + \boldsymbol{\epsilon}^k 144 | \end{equation*} 145 | 146 | where $\boldsymbol{\epsilon}(k)$ is zero-mean Gaussian noise with standard deviation $\sigma$. The amount of noise is typically reduced over time. 
For example, the standard deviation can follow a decreasing sequence $\sigma^{(k)}$ such as $1/k$.
147 |
148 | \begin{figure}
149 | \centering
150 | \includegraphics[width=40mm]{Figs/sgd.jpeg}
151 | \end{figure}
152 |
153 | \end{frame}
154 |
155 | \section{Simulated Annealing}
156 | \begin{frame}{Simulated Annealing}
157 | Simulated annealing borrows inspiration from metallurgy.
158 | A \textcolor{blue}{temperature} $t$ is used to control the degree of stochasticity during the randomized search.
159 | \begin{itemize}
160 | \item $t$ starts high, allowing the process to move freely over the search space, with the hope of finding a region containing the best local minimum.
161 |
162 | \item $t$ is then slowly brought down, reducing the stochasticity and forcing the search to converge to a minimum. Simulated annealing is often used on functions with many local minima due to its ability to escape local minima.
163 |
164 | \end{itemize}
165 |
166 |
167 | At every iteration, a candidate transition from $\boldsymbol{x}$ to $\boldsymbol{x}'$ is sampled from a transition distribution $T$ and is accepted with \textcolor{blue}{probability}
168 |
169 | \begin{equation*}
170 | \begin{cases}
171 | 1 & \textrm{if } \Delta y \leq 0\\
172 | \exp(-\Delta y/t) & \textrm{if }\Delta y >0
173 | \end{cases}
174 | \end{equation*}
175 |
176 | where $\Delta y = f(\boldsymbol{x}') - f(\boldsymbol{x})$.
177 |
178 | \end{frame}
179 |
180 |
181 | \section{Cross-Entropy Method}
182 | \begin{frame}{Cross-Entropy Method}
183 | This probability distribution, often called a \textcolor{blue}{proposal distribution}, is used to propose new samples for the next iteration. At each iteration, we sample from the proposal distribution and then update the proposal distribution to fit a collection of the best samples.
184 |
185 | It requires choosing a family of distributions parameterized by $\theta$, such as multivariate normal distributions with \textcolor{blue}{a mean vector and a covariance matrix}. The algorithm also requires us to specify the number of elite samples, $m_{\textrm{elite}}$, to use when fitting the parameters for the next iteration.
186 |
187 | \begin{equation*}
188 | \begin{split}
189 | \boldsymbol{\mu}^{k+1} &= \frac{1}{m_{\textrm{elite}}} \sum_{i=1}^{m_{\textrm{elite}}} \boldsymbol{x}^i\\
190 | \Sigma^{k+1} &= \frac{1}{m_{\textrm{elite}}} \sum_{i=1}^{m_{\textrm{elite}}} (\boldsymbol{x}^i - \boldsymbol{\mu}^{k+1})(\boldsymbol{x}^i - \boldsymbol{\mu}^{k+1})^T
191 | \end{split}
192 | \end{equation*}
193 | \end{frame}
194 |
195 | \begin{frame}{Cross-Entropy Method}
196 | This probability distribution, often called a \textcolor{blue}{proposal distribution}, is used to propose new samples for the next iteration. At each iteration, we sample from the proposal distribution and then update the proposal distribution to fit a collection of the best samples.
197 |
198 | \begin{figure}
199 | \centering
200 | \includegraphics[width=120mm]{Figs/Cross-entropy.jpeg}
201 | \end{figure}
202 | \end{frame}
203 |
204 | \section{Covariance Matrix Adaptation}
205 | \begin{frame}{Covariance Matrix Adaptation}
206 | Covariance matrix adaptation maintains a mean vector $\boldsymbol{\mu}$, a covariance matrix $\boldsymbol{\Sigma}$, and an additional step-size scalar $\sigma$. The covariance matrix only increases or decreases in a single direction with every iteration, whereas the step-size scalar is adapted to control the overall spread of the distribution.
\section{Covariance Matrix Adaptation}
\begin{frame}{Covariance Matrix Adaptation}
Covariance matrix adaptation maintains a mean vector $\boldsymbol{\mu}$, a covariance matrix $\boldsymbol{\Sigma}$, and an additional step-size scalar $\sigma$. The covariance matrix only increases or decreases in a single direction with every iteration, whereas the step-size scalar is adapted to control the overall spread of the distribution. At every iteration, $m$ designs are sampled from the multivariate Gaussian
\begin{equation*}
\boldsymbol{x} \sim \mathcal{N} (\boldsymbol{\mu}, \sigma^2 \Sigma)
\end{equation*}

The designs are then sorted according to their objective function values such that $f(\boldsymbol{x}^1) \leq f(\boldsymbol{x}^2) \leq \cdots \leq f(\boldsymbol{x}^m)$. A new mean vector $\boldsymbol{\mu}^{k+1}$ is formed using a weighted average of the sampled designs:

\begin{gather*}
\boldsymbol{\mu}^{k+1} \leftarrow \sum_{i=1}^m w_i \boldsymbol{x}^i\\
\sum_{i=1}^m w_i = 1 ~~~~ w_1>w_2>\cdots>w_m>0
\end{gather*}

\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
The recommended weighting is obtained by normalizing
\begin{equation*}
w'_i = \ln \frac{m+1}{2} - \ln i ~ \textrm{for~} i \in \{1, \cdots, m\}
\end{equation*}
to obtain $\boldsymbol{w} = \boldsymbol{w}'/\sum_i w'_i $.

The step size is updated using a cumulative vector $\boldsymbol{p}_\sigma$ that tracks steps over time
\begin{equation*}
\begin{split}
\boldsymbol{p}_\sigma^1 &= 0\\
\boldsymbol{p}_\sigma^{k+1} &\leftarrow (1-c_\sigma)\boldsymbol{p}_\sigma^k + \sqrt{c_\sigma(2-c_\sigma)\mu_{\textrm{eff}}} (\Sigma^k)^{-1/2} \boldsymbol{\delta}_w \\
\mu_{\textrm{eff}} &= \frac{1}{\sum_i w^2_i}\\
\boldsymbol{\delta}_w &= \sum_{i=1}^{m_{\textrm{elite}}}
w_i \boldsymbol{\delta}^i ~ \textrm{for~} \boldsymbol{\delta}^i = \frac{\boldsymbol{x}^i - \boldsymbol{\mu}^k}{\sigma^k}
\end{split}
\end{equation*}
\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
The new step size is
\begin{equation*}
\sigma^{k+1} \leftarrow \sigma^k \exp\bigg(\frac{c_\sigma}{d_\sigma} \bigg[\frac{||\boldsymbol{p}_\sigma
||}{\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})||}-1\bigg]\bigg)
\end{equation*}
where $\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})||$ is the expected length of a vector drawn from the standard Gaussian distribution,
\begin{equation*}
\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})|| = \sqrt{2} \frac{\Gamma(\frac{n+1}{2})}{\Gamma(\frac{n}{2})}
\approx \sqrt{n}\bigg(1-\frac{1}{4n}+\frac{1}{21n^2}\bigg)
\end{equation*}

\begin{equation*}
\begin{split}
c_\sigma &= (\mu_{\textrm{eff}}+2)/(n+\mu_{\textrm{eff}}+5)\\
d_\sigma &= 1 + 2\max \bigg(0, \sqrt{\frac{\mu_{\textrm{eff}}-1}{n+1}} -1 \bigg) + c_\sigma
\end{split}
\end{equation*}

\end{frame}
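\begin{frame}{Expected Length: a Worked Example}
As a quick sanity check of the approximation above, take $n=2$:
\begin{equation*}
\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})|| = \sqrt{2}\, \frac{\Gamma(3/2)}{\Gamma(1)} = \sqrt{2}\cdot \frac{\sqrt{\pi}}{2} \approx 1.2533
\end{equation*}
while the approximation gives $\sqrt{2}\big(1-\frac{1}{8}+\frac{1}{84}\big) \approx 1.2543$, an error of less than $0.1\%$.
\end{frame}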
\begin{frame}{Covariance Matrix Adaptation}
The covariance matrix is updated as follows
\begin{equation*}
\begin{split}
\boldsymbol{p}_\Sigma^1 &= 0\\
\boldsymbol{p}_\Sigma^{k+1} &\leftarrow (1-c_\Sigma)\boldsymbol{p}_\Sigma^k + h_\sigma \sqrt{c_\Sigma (2-c_\Sigma) \mu_{\textrm{eff}}}\boldsymbol{\delta}_w
\end{split}
\end{equation*}

where
\begin{equation*}
h_\sigma =
\begin{cases}
1 & \textrm{if~} \frac{||\boldsymbol{p}_\sigma||}{\sqrt{1-(1-c_\sigma)^{2(k+1)}}} < \big(1.4 + \frac{2}{n+1}\big) \mathbb{E}||\mathcal{N}(0, \boldsymbol{I})||\\
0 & \textrm{otherwise}
\end{cases}
\end{equation*}

The update requires the adjusted weights $\boldsymbol{w}$:

\begin{equation*}
w_i^0 =
\begin{cases}
w_i & \textrm{if~} w_i \geq 0 \\
\frac{nw_i}{||\Sigma^{-1/2}\boldsymbol{\delta}^i||^2} & \textrm{otherwise}
\end{cases}
\end{equation*}

\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
The covariance update is then
\begin{equation*}
\Sigma^{k+1} \leftarrow [1 + c_1 c_\Sigma(1-h_\sigma)(2 - c_\Sigma) - c_1 - c_\mu]\Sigma^k
+ c_1 \boldsymbol{p}_\Sigma \boldsymbol{p}_\Sigma^T + c_\mu \sum_{i=1}^\mu w_i^0 \boldsymbol{\delta}^i (\boldsymbol{\delta}^i)^T
\end{equation*}

The constants have the following recommended values
\begin{equation*}
\begin{split}
c_\Sigma &= \frac{4+\mu_{\textrm{eff}}/n}{n+4+2\mu_{\textrm{eff}}/n}\\
c_1 &= \frac{2}{(n+1.3)^2 + \mu_{\textrm{eff}}} \\
c_\mu &= \min \bigg( 1-c_1, 2\frac{\mu_{\textrm{eff}}-2+1/\mu_{\textrm{eff}}} {(n+2)^2 + \mu_{\textrm{eff}} } \bigg)
\end{split}
\end{equation*}

\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/CMA.jpeg}
\end{figure}
\end{frame}


\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Stochastic methods employ random numbers during the optimization process.
\item Simulated annealing uses a temperature that controls random exploration and which is reduced over time to converge on a local minimum.
\item The cross-entropy method and evolution strategies maintain proposal distributions from which they sample in order to inform updates.
\item Covariance matrix adaptation is a robust and sample-efficient optimizer that maintains a multivariate Gaussian proposal distribution with a full covariance matrix.
323 | \end{itemize} 324 | \end{frame} 325 | \end{document} 326 | 327 | -------------------------------------------------------------------------------- /Lecture_notes/11_evolutinary_methods.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 

%\usecolortheme{albatross}
%\usecolortheme{beaver}
%\usecolortheme{beetle}
%\usecolortheme{crane}
%\usecolortheme{dolphin}
%\usecolortheme{dove}
%\usecolortheme{fly}
%\usecolortheme{lily}
%\usecolortheme{orchid}
%\usecolortheme{rose}
%\usecolortheme{seagull}
%\usecolortheme{seahorse}
%\usecolortheme{whale}
%\usecolortheme{wolverine}

%\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line

\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
}

\usepackage{graphicx} % Allows including images
\usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
%\usepackage {tikz}
\usepackage{tkz-graph}
\GraphInit[vstyle = Shade]
\tikzset{
LabelStyle/.style = { rectangle, rounded corners, draw,
                      minimum width = 2em, fill = yellow!50,
                      text = red, font = \bfseries },
VertexStyle/.append style = { inner sep=5pt,
                              font = \normalsize\bfseries},
EdgeStyle/.append style = {->, bend left} }
\usetikzlibrary {positioning}
%\usepackage {xcolor}
\definecolor {processblue}{cmyk}{0.96,0,0,0}
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

\title[Evolutionary Methods]{Numerical Optimization 11: Evolutionary Methods} %

\author{Qiang Zhu} % Your name
\institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
{
University of Nevada Las Vegas\\ % Your institution for the title page
\medskip
}
\date{\today} % Date, can be changed to a custom date

\begin{document}

\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}

\begin{frame}
\frametitle{Overview} % Table of contents slide, comment this block out to remove it
\tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
\end{frame}

%----------------------------------------------------------------------------------------
%	PRESENTATION SLIDES
%----------------------------------------------------------------------------------------

%------------------------------------------------

\section{Population Methods}
\begin{frame}{Population Methods}
Previous lectures discussed methods that incrementally improve a single design point; population methods instead work with a collection of design points, called individuals. Having a large number of individuals distributed throughout the design space can help the algorithm avoid becoming stuck in a local minimum. Information at different points in the design space can be shared between individuals to globally optimize the objective function. Most population methods are stochastic in nature, and it is generally easy to parallelize the computation.

These methods typically have the following steps
\begin{itemize}
\item Initialization
\item Encoding
\item Mutation
\item Crossover
\item Selection
\end{itemize}

\end{frame}

\section{Initialization}
\begin{frame}{Initialization}
Population methods begin with an initial population, just as descent methods require an initial design point. The initial population should be spread over the design space to increase the chances that the samples are close to the best regions.
The following strategies can be applied
\begin{itemize}
\item Uniform distribution in a bounded region
\item Multivariate normal distribution centered over a region of interest
\item Cauchy distribution, which has an unbounded variance and can cover a much broader space
\end{itemize}

\begin{figure}
\centering
\includegraphics[width=60mm]{Figs/cauchy.jpeg}
\end{figure}
\end{frame}


\section{Genetic Algorithm}
\begin{frame}{Chromosomes}
There are several ways to represent chromosomes. The simplest is the binary string chromosome, a representation that is similar to the way DNA is encoded.
\end{frame}

\begin{frame}{Selection}
Selection is the process of choosing chromosomes to use as parents for the next generation. For a population with $m$ chromosomes, a selection method will produce a list of $m$ parental pairs for the $m$ children of the next generation. The selected pairs may contain duplicates. Common schemes (a Julia sketch follows on the next slide):
\begin{itemize}
\item Truncation: choose uniformly at random from among the best $k$ chromosomes
\item Tournament: take the fittest out of $k$ randomly chosen chromosomes
\item Roulette wheel: choose with a probability proportional to fitness
\end{itemize}

\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/selection.jpeg}
\end{figure}
\end{frame}
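\begin{frame}[fragile]{Selection: a Julia sketch}
A minimal Julia sketch of truncation and tournament selection (illustrative; \texttt{fitness} is an assumed vector in which lower is better):
\begin{lstlisting}
# Truncation selection: pick a random parent among the best k.
function truncation_select(fitness, k)
    best = sortperm(fitness)[1:k]      # indices of the k fittest
    return rand(best)
end

# Tournament selection: fittest of k randomly chosen individuals.
function tournament_select(fitness, k)
    contenders = rand(1:length(fitness), k)
    return contenders[argmin(fitness[contenders])]
end
\end{lstlisting}
Calling either function twice produces one parental pair.
\end{frame}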
\section{Covariance Matrix Adaptation}
\begin{frame}{Covariance Matrix Adaptation}
Covariance matrix adaptation maintains a mean vector $\boldsymbol{\mu}$, a covariance matrix $\boldsymbol{\Sigma}$, and an additional step-size scalar $\sigma$. The covariance matrix only increases or decreases in a single direction with every iteration, whereas the step-size scalar is adapted to control the overall spread of the distribution. At every iteration, $m$ designs are sampled from the multivariate Gaussian
\begin{equation*}
\boldsymbol{x} \sim \mathcal{N} (\boldsymbol{\mu}, \sigma^2 \Sigma)
\end{equation*}

The designs are then sorted according to their objective function values such that $f(\boldsymbol{x}^1) \leq f(\boldsymbol{x}^2) \leq \cdots \leq f(\boldsymbol{x}^m)$. A new mean vector $\boldsymbol{\mu}^{k+1}$ is formed using a weighted average of the sampled designs:

\begin{gather*}
\boldsymbol{\mu}^{k+1} \leftarrow \sum_{i=1}^m w_i \boldsymbol{x}^i\\
\sum_{i=1}^m w_i = 1 ~~~~ w_1>w_2>\cdots>w_m>0
\end{gather*}

\end{frame}


\section{Particle Swarm Optimization}
\begin{frame}{Particle Swarm Optimization}
Particle swarm optimization introduces momentum to accelerate convergence toward minima. Each individual (particle) in the population keeps track of its current position, velocity, and the best position it has seen so far. Momentum allows an individual to accumulate speed in a favorable direction, independent of local perturbations.

\begin{equation*}
\begin{split}
\boldsymbol{x}^i & \leftarrow \boldsymbol{x}^i + \boldsymbol{v}^i \\
\boldsymbol{v}^i & \leftarrow w\boldsymbol{v}^i + c_1 r_1 (\boldsymbol{x}_{\textrm{lbest}}^i - \boldsymbol{x}^i) + c_2 r_2(\boldsymbol{x}_{\textrm{gbest}} - \boldsymbol{x}^i)
\end{split}
\end{equation*}

where
\begin{itemize}
\item $\boldsymbol{x}_{\textrm{lbest}}^i$: the best location seen so far by particle $i$
\item $\boldsymbol{x}_{\textrm{gbest}}$: the best location found by the whole population
\item $w, c_1, c_2$: empirical parameters
\item $r_1, r_2$: random numbers drawn from $U(0, 1)$
\end{itemize}

\end{frame}

\begin{frame}{PSO search}
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/pso.jpeg}
\end{figure}
\end{frame}


\begin{frame}{Firefly Algorithm}
The firefly algorithm was inspired by the manner in which fireflies flash their lights to attract mates. In the firefly algorithm, each individual in the population is a firefly and can flash to attract other fireflies. At each iteration, all fireflies are moved toward all more attractive fireflies. A firefly $x_a$ is moved toward a firefly $x_b$ with greater attraction according to

\begin{equation*}
x_a \leftarrow x_a + \beta I (||x_b - x_a||)(x_b - x_a) + \alpha \epsilon
\end{equation*}

where $I$ is the intensity of the attraction, $\beta$ is the source intensity, $\epsilon$ is drawn from a zero-mean, unit-covariance multivariate Gaussian, and $\alpha$ scales the step size. When $\beta = 0$, the update reduces to a random walk. The resulting update is a random walk biased toward brighter fireflies.

The intensity $I$ decreases as the distance $r$ between the two fireflies increases and is defined to be 1 when $r$ = 0. It can be approximated as
\begin{equation*}
I(r) = e^{-\gamma r^2}
\end{equation*}

\end{frame}

\begin{frame}{Firefly search}
Firefly search with $\alpha$ = 0.5, $\beta$ = 1, and $\gamma$ = 0.1 applied to the Branin function.
\begin{figure}
\includegraphics[width=120mm]{Figs/firefly.jpeg}
\end{figure}

\end{frame}


\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Population methods use a collection of individuals in the design space to guide progression toward an optimum.
\item Genetic algorithms leverage selection, crossover, and mutations to produce better subsequent generations.
\item Particle swarm optimization and the firefly algorithm include rules and mechanisms for attracting design points to the best individuals in the population while maintaining suitable state space exploration.
\item Population methods can be extended with local search approaches to improve convergence.
269 | \end{itemize} 270 | \end{frame} 271 | \end{document} 272 | 273 | -------------------------------------------------------------------------------- /Lecture_notes/12_constrained_optimization.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 

%\usecolortheme{albatross}
%\usecolortheme{beaver}
%\usecolortheme{beetle}
%\usecolortheme{crane}
%\usecolortheme{dolphin}
%\usecolortheme{dove}
%\usecolortheme{fly}
%\usecolortheme{lily}
%\usecolortheme{orchid}
%\usecolortheme{rose}
%\usecolortheme{seagull}
%\usecolortheme{seahorse}
%\usecolortheme{whale}
%\usecolortheme{wolverine}

%\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line

\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
}

\usepackage{graphicx} % Allows including images
\usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
%\usepackage {tikz}
\usepackage{tkz-graph}
\GraphInit[vstyle = Shade]
\tikzset{
LabelStyle/.style = { rectangle, rounded corners, draw,
                      minimum width = 2em, fill = yellow!50,
                      text = red, font = \bfseries },
VertexStyle/.append style = { inner sep=5pt,
                              font = \normalsize\bfseries},
EdgeStyle/.append style = {->, bend left} }
\usetikzlibrary {positioning}
%\usepackage {xcolor}
\definecolor {processblue}{cmyk}{0.96,0,0,0}
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

\title[Constrained Optimization]{Numerical Optimization 12: Constrained Optimization} %

\author{Qiang Zhu} % Your name
\institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
{
University of Nevada Las Vegas\\ % Your institution for the title page
\medskip
}
\date{\today} % Date, can be changed to a custom date

\begin{document}

\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}

\begin{frame}
\frametitle{Overview} % Table of contents slide, comment this block out to remove it
\tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
\end{frame}

%----------------------------------------------------------------------------------------
%	PRESENTATION SLIDES
%----------------------------------------------------------------------------------------

%------------------------------------------------

\section{Constrained Optimization}
\begin{frame}{Constrained Optimization}
Some constraints are simply upper or lower bounds on the design variables, as we have seen in bracketed line search, in which $x$ must lie between $a$ and $b$. A bracketing constraint $x \in [a, b]$ can be replaced by two inequality constraints: $x \geq a$ and $x \leq b$.

\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/constraint-ab.jpeg}
\end{figure}

\end{frame}

\section{Constraints}
\begin{frame}{Constraints}
Constraints are not typically specified directly through a known feasible set $\mathcal{X}$.
Instead, the feasible set is typically formed from two types of constraints:

\begin{itemize}
\item equality constraints, $h(x)=0$
\item inequality constraints, $g(x) \leq 0$

\end{itemize}

Any optimization problem can be rewritten using these constraints
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\
{s.t.}~~~~ h_i(x) = 0 \\
~~~~~~~~~ g_j(x) \leq 0
\end{gather*}

\end{frame}

\section{Transformations to Remove Constraints}
\begin{frame}{Transformations to Remove Constraints}
In some cases, it may be possible to transform a problem so that constraints can be removed. For example, bound constraints $a \leq x \leq b$ can be removed by passing $x$ through a transform

\begin{equation*}
x = \frac{b+a}{2} + \frac{b-a}{2}\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg)
\end{equation*}

Below is an example (a code sketch follows on the next slide):
\begin{gather*}
~~~~~ \underset{x}{\min} ~ x\sin{x}\\
{s.t.}~~~~ 2\leq x \leq 6 \\
\end{gather*}
can be transformed to
\begin{gather*}
\underset{\hat{x}}{\min} ~ \bigg[4+2\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg)\bigg]
\sin \bigg[ 4 + 2\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg)\bigg]
\end{gather*}

\end{frame}
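\begin{frame}[fragile]{Removing bound constraints: a Julia sketch}
A minimal Julia sketch of the transform above (illustrative; the function names are assumptions):
\begin{lstlisting}
# Map an unconstrained xhat into the interval [a, b].
to_bounded(xhat, a, b) = (b + a)/2 + (b - a)/2 * (2xhat/(1 + xhat^2))

f(x) = x * sin(x)                            # original objective
fhat(xhat) = f(to_bounded(xhat, 2.0, 6.0))   # unconstrained surrogate

# Every xhat is feasible: e.g. fhat(0.0) == f(4.0)
\end{lstlisting}
Minimizing \texttt{fhat} over all of $\mathbb{R}$ is equivalent to minimizing $f$ over $[2, 6]$, since $2\hat{x}/(1+\hat{x}^2)$ ranges over $[-1, 1]$.
\end{frame}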
\section{Lagrange Multipliers}
\begin{frame}{Lagrange Multipliers}
The method of Lagrange multipliers is used to optimize a function subject to equality
constraints.
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\
{s.t.}~~~~ h(x) = 0
\end{gather*}

where $f$ and $h$ have continuous partial derivatives.

We can formulate the Lagrangian, which is a function of the design variables and the multiplier $\lambda$,
\begin{gather*}
\mathcal{L}(x, \lambda) = f(x) - \lambda h(x)
\end{gather*}

We then solve $\nabla \mathcal{L}(x, \lambda) = 0$. Specifically, $\nabla_x \mathcal{L} = 0$ gives us the condition $\nabla f= \lambda \nabla h$, and $\nabla_\lambda \mathcal{L}=0$ gives us $h(x)=0$. Any solution is considered a critical point.

\end{frame}

\begin{frame}{Lagrange Multipliers applied to a single equality condition}
The method of Lagrange multipliers is used to optimize a function subject to equality
constraints.
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] \\
{s.t.}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ x_1 - x_2^2 = 0
\end{gather*}

We can formulate the Lagrangian,
\begin{equation*}
\mathcal{L}(x, \lambda) = -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] + \lambda(x_1 - x_2^2)
\end{equation*}
We compute
\begin{itemize}
\item $\frac{\partial \mathcal{L}}{\partial x_1}$
\item $\frac{\partial \mathcal{L}}{\partial x_2}$
\item $\frac{\partial \mathcal{L}}{\partial \lambda}$
\end{itemize}

\end{frame}


\begin{frame}{Lagrange Multipliers applied to multiple equality conditions}
The method extends directly to multiple equality constraints, with one multiplier per constraint:
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x}) \\
{s.t.}~~~~ h_i(\boldsymbol{x}) = 0 ~~\textrm{for~} i \in \{1, \cdots, \ell\}
\end{gather*}

We can formulate the generalized Lagrangian,
\begin{equation*}
\mathcal{L}(\boldsymbol{x}, \boldsymbol{\lambda}) = f(\boldsymbol{x}) - \sum_{i=1}^{\ell} \lambda_i h_i(\boldsymbol{x})
\end{equation*}
Setting $\nabla_{\boldsymbol{x}} \mathcal{L} = 0$ gives $\nabla f = \sum_i \lambda_i \nabla h_i$, and setting $\nabla_{\lambda_i} \mathcal{L} = 0$ recovers each constraint $h_i(\boldsymbol{x}) = 0$.

\end{frame}

\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Constraints are requirements on the design points that a solution must satisfy.
\item Some constraints can be transformed or substituted into the problem to result in an unconstrained optimization problem.
\item Analytical methods using Lagrange multipliers yield the generalized Lagrangian and the necessary conditions for optimality under constraints.
\item A constrained optimization problem has a dual problem formulation that is easier to solve and whose solution is a lower bound of the solution to the original problem.
\end{itemize}
\end{frame}
\end{document}

--------------------------------------------------------------------------------
/Lecture_notes/13_sampling_plans.tex:
--------------------------------------------------------------------------------
\documentclass{beamer}
\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
%numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}

\lstset{style=mystyle}

\mode<presentation> {

% The Beamer class comes with a number of default slide themes
% which change the colors and layouts of slides. Below this is a list
% of all the themes, uncomment each in turn to see what they look like.

%\usetheme{default}
\usetheme{AnnArbor}
%\usetheme{Antibes}
%\usetheme{Bergen}
%\usetheme{Berkeley}
%\usetheme{Berlin}
%\usetheme{Boadilla}
%\usetheme{CambridgeUS}
%\usetheme{Copenhagen}
%\usetheme{Darmstadt}
%\usetheme{Dresden}
%\usetheme{Frankfurt}
%\usetheme{Goettingen}
%\usetheme{Hannover}
%\usetheme{Ilmenau}
%\usetheme{JuanLesPins}
%\usetheme{Luebeck}
%\usetheme{Madrid}
%\usetheme{Malmoe}
%\usetheme{Marburg}
%\usetheme{Montpellier}
%\usetheme{PaloAlto}
%\usetheme{Pittsburgh}
%\usetheme{Rochester}
%\usetheme{Singapore}
%\usetheme{Szeged}
%\usetheme{Warsaw}

% As well as themes, the Beamer class has a number of color themes
% for any slide theme. Uncomment each of these in turn to see how it
% changes the colors of your current slide theme.

%\usecolortheme{albatross}
%\usecolortheme{beaver}
%\usecolortheme{beetle}
%\usecolortheme{crane}
%\usecolortheme{dolphin}
%\usecolortheme{dove}
%\usecolortheme{fly}
%\usecolortheme{lily}
%\usecolortheme{orchid}
%\usecolortheme{rose}
%\usecolortheme{seagull}
%\usecolortheme{seahorse}
%\usecolortheme{whale}
%\usecolortheme{wolverine}

%\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line

\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
}

\usepackage{graphicx} % Allows including images
\usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
%\usepackage {tikz}
\usepackage{tkz-graph}
\GraphInit[vstyle = Shade]
\tikzset{
LabelStyle/.style = { rectangle, rounded corners, draw,
                      minimum width = 2em, fill = yellow!50,
                      text = red, font = \bfseries },
VertexStyle/.append style = { inner sep=5pt,
                              font = \normalsize\bfseries},
EdgeStyle/.append style = {->, bend left} }
\usetikzlibrary {positioning}
%\usepackage {xcolor}
\definecolor {processblue}{cmyk}{0.96,0,0,0}
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

\title[Sampling Plans]{Numerical Optimization 13: Sampling Plans} %

\author{Qiang Zhu} % Your name
\institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
{
University of Nevada Las Vegas\\ % Your institution for the title page
\medskip
}
\date{\today} % Date, can be changed to a custom date

\begin{document}

\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}

\begin{frame}
\frametitle{Overview} % Table of contents slide, comment this block out to remove it
\tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
\end{frame}

%----------------------------------------------------------------------------------------
%	PRESENTATION SLIDES
%----------------------------------------------------------------------------------------

%------------------------------------------------

\section{Sampling}
\begin{frame}{Optimization with expensive function evaluations}
For many optimization problems, function evaluations can be quite expensive.
\begin{itemize}
\item an aircraft design may require a wind tunnel test
\item deep learning hyperparameters may require a week of GPU training
\item $\cdots$
\end{itemize}
A common approach for optimizing in these contexts is to build a \textcolor{blue}{surrogate model}. Further evaluations of the true objective function can be used to improve the model.
Fitting such models requires an initial set of points, ideally points that are space-filling; that is, points that cover the region as well as possible.
\end{frame}

\section{Full Factorial}
\begin{frame}{Full Factorial}
The full factorial sampling plan places a grid of evenly spaced points over the search space.
\begin{itemize}
\item a lower/upper-bound vector $a, b$ such that $a_i \leq x_i \leq b_i$
\item $m_i$ samples in each $x_i$ separated by a distance $(b_i-a_i)/(m_i-1)$
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=80mm]{Figs/grid_search.jpeg}
\end{figure}

\end{frame}

\section{Random Sampling}
\begin{frame}{Random Sampling}
A straightforward alternative to full factorial sampling is to draw $m$ random samples over the design space. Each sample is typically drawn independently, for example from a uniform distribution over the bounded search region, so the number of samples is decoupled from the number of dimensions.

\end{frame}


\section{Uniform Projection Plans}
\begin{frame}{Uniform Projection Plans}
A uniform projection plan with $m$ samples on an $m \times m$ grid can be constructed using an $m$-element permutation. There are therefore $m!$ possible uniform projection plans.
\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/uni-proj.jpeg}
\end{figure}
\end{frame}

\section{Stratified Sampling}
\begin{frame}{Stratified Sampling}
Stratified sampling modifies any grid-based sampling plan, including full factorial and uniform projection plans. Cells are sampled at a point chosen uniformly at random from within the cell rather than at the cell's center.
\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/strafied.jpeg}
\end{figure}
\end{frame}
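\begin{frame}[fragile]{Uniform projection and stratification: a Julia sketch}
A minimal Julia sketch of the two plans above for the unit square (illustrative; the function names are assumptions):
\begin{lstlisting}
using Random   # for randperm

# Uniform projection plan: one sample per row and per column of an
# m x m grid, obtained from a random permutation.
function uniform_projection(m)
    p = randperm(m)
    return [((i - 0.5)/m, (p[i] - 0.5)/m) for i in 1:m]   # cell centers
end

# Stratified variant: jitter each sample uniformly within its cell.
function stratified_projection(m)
    p = randperm(m)
    return [((i - rand())/m, (p[i] - rand())/m) for i in 1:m]
end
\end{lstlisting}
\end{frame}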
\section{Space-Filling Metrics}
\begin{frame}{Space-Filling Metrics}
A good sampling plan fills the design space since the ability for a surrogate model to generalize from samples decays with the distance from those samples. Not all plans, even uniform projection plans, are equally good at covering the search space.
\begin{itemize}
\item Discrepancy: the maximum difference between the fraction of samples in a hyper-rectangular subset $H$ and that subset's volume:
\begin{equation*}
d(X) = \underset{H}{\sup} \bigg|\frac{\#(X\cap H)}{\#X} - \lambda(H)\bigg|
\end{equation*}
where $\#X$ is the number of points in $X$, $\#(X \cap H)$ is the number of those points that lie in $H$, and $\lambda(H)$ is the volume of $H$.
\item Pairwise distances between all points within each sampling plan

\end{itemize}


\end{frame}

\section{Quasi-Random Sequences}
\begin{frame}{Quasi-Random Sequences}
Quasi-random sequences are often used in the context of trying to approximate an integral over a multidimensional space:
\begin{equation*}
\int_\chi f(\boldsymbol{x})d\boldsymbol{x} \approx \frac{v}{m}\sum_{i=1}^m f(\boldsymbol{x}^i)
\end{equation*}
where each $\boldsymbol{x}^i$ is sampled uniformly at random over the domain $\chi$ and $v$ is the volume of $\chi$.

Quasi-random sequences are deterministic sequences that fill the space in a systematic manner so that the integral converges as fast as possible in the number of points $m$. They are typically constructed for the unit $n$-dimensional hypercube with the following methods.
\begin{itemize}
\item Additive Recurrence
\item Halton Sequence
\item Sobol Sequence
\end{itemize}
\end{frame}

\begin{frame}{Additive Recurrence}
An additive recurrence generates each new point from the previous one:
\begin{equation*}
x^{k+1} = x^k + c ~~~(\mod 1)
\end{equation*}
Such sequences produce space-filling sets provided that $c$ is irrational. A common choice is
\begin{equation*}
c = \varphi - 1 = \frac{\sqrt{5}-1}{2} \approx 0.618
\end{equation*}
where $\varphi$ is the golden ratio.
We can construct a space-filling set over $n$ dimensions using an additive recurrence sequence for each coordinate, each with its own value of $c$. The square roots of the primes are known to be irrational, and can thus be used to obtain different sequences for each coordinate:
\begin{equation*}
c_1 =\sqrt{2}, ~c_2 =\sqrt{3}, ~c_3 =\sqrt{5}, ~c_4 =\sqrt{7}, ~c_5 =\sqrt{11}, \cdots
\end{equation*}
\end{frame}

\begin{frame}{Halton Sequence}
\textcolor{blue}{Radical Inversion}
\begin{equation*}
\begin{split}
i & = \sum_{k=0}^{M-1} a_k(i)b^k \\
\Psi_{b, C}(i) &= (b^{-1}, \cdots, b^{-M}) \big[C (a_0(i), \cdots, a_{M-1}(i) )^T\big]
\end{split}
\end{equation*}
where $b$ is the \textcolor{blue}{base number}, and $C$ is the \textcolor{blue}{generator matrix}. When $C$ is the identity matrix, the result is the \textcolor{blue}{van der Corput sequence}:
\begin{itemize}
\item $b$ = 2
\begin{equation*}
X = \bigg\{ \frac{1}{2}, \frac{1}{4}, \frac{3}{4}, \frac{1}{8}, \frac{5}{8}, \frac{3}{8}, \frac{7}{8}, \frac{1}{16}, \cdots \bigg\}
\end{equation*}
\item $b$ = 5
\begin{equation*}
X = \bigg\{ \frac{1}{5}, \frac{2}{5}, \frac{3}{5}, \frac{4}{5}, \frac{1}{25}, \frac{6}{25}, \frac{11}{25}, \cdots \bigg\}
\end{equation*}
\end{itemize}

The Halton sequence uses coprime bases for the different coordinates so that the resulting sequences are uncorrelated.

\end{frame}

\begin{frame}{Sobol Sequence}
In the Sobol sequence, each dimension uses base 2 with a different generator matrix $C$.
\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/sample_all.jpeg}
\end{figure}



\end{frame}


\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Sampling plans are used to cover search spaces with a limited number of points.
\item Full factorial sampling, which involves sampling at the vertices of a uniformly discretized grid, requires a number of points exponential in the number of dimensions.
\item Uniform projection plans, which project uniformly over each dimension, can be efficiently generated and can be optimized to be space filling.
\item Quasi-random sequences are deterministic procedures by which space-filling sampling plans can be generated.
281 | \end{itemize} 282 | 283 | \end{frame} 284 | \end{document} 285 | 286 | -------------------------------------------------------------------------------- /Lecture_notes/14_surrogate_models.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Surrogate models]{Numerical Optimization 14: Surrogate models} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Surrogate Models} 139 | \begin{frame}{Surrogate Models} 140 | The \textcolor{blue}{surrogate models} are designed to be smooth and inexpensive to evaluate so that they can be efficiently optimized from the given sampling points. A surrogate model $\hat{f}$ parameterized by $\theta$ is designed to mimic the true objective function $f$. The parameters $\theta$ can be adjusted to fit the model based on samples collected from $f$. 

Suppose we have
\begin{itemize}
\item $m$ design points: $\{x^1, x^2, \cdots, x^m\}$
\item associated function evaluations: $\{y^1, y^2, \cdots, y^m\}$
\end{itemize}
For a particular set of parameters, the model will predict
\begin{equation*}
\hat{\boldsymbol{y}} = \{\hat{f}_\theta(x^1), \hat{f}_\theta(x^2), \cdots, \hat{f}_\theta(x^m)\}
\end{equation*}

In turn, fitting the model is a minimization problem
\begin{equation*}
\underset{\theta}{\min} ~ ||\boldsymbol{y}-\hat{\boldsymbol{y}}||
\end{equation*}
\end{frame}

\section{Linear Models}
\begin{frame}{Linear Models}
A simple surrogate model is the linear model, which has the form

\begin{gather*}
\hat{f} = w_0 + \boldsymbol{w}^T x ~~~~~~~~~~~~ \theta= \{w_0, \boldsymbol{w}\}
\end{gather*}
For an $n$-dimensional design space, the linear model has $n+1$ parameters, and thus requires at least $n+1$ samples to fit unambiguously.

Instead of having both $\boldsymbol{w}$ and $w_0$ as parameters, it is common to construct a single vector of parameters $\boldsymbol{\theta} = [w_0, \boldsymbol{w}]$ and prepend 1 to the vector $\boldsymbol{x}$ to get
\begin{equation*}
\hat{f} = \boldsymbol{\theta}^T \boldsymbol{x}
\end{equation*}

Finding an optimal $\boldsymbol{\theta}$ requires solving a linear regression problem:
\begin{equation*}
\underset{\theta}{\min}~||\boldsymbol{y}-\hat{\boldsymbol{y}}|| ~~\textrm{or}~~ ||\boldsymbol{y}-\boldsymbol{X\theta}||
\end{equation*}
where $\boldsymbol{X}$ is a design matrix, $[(\boldsymbol{x}^1)^T; \cdots; (\boldsymbol{x}^m)^T]$

\end{frame}

\section{Basis Functions}
\begin{frame}{Basis Functions}
The linear model is a linear combination of the components of $\boldsymbol{x}$:
\begin{equation*}
\hat{f}(\boldsymbol{x}) = \theta_1 x_1 + \cdots + \theta_n x_n = \sum_{i=1}^n \theta_i x_i = \boldsymbol{\theta}^T \boldsymbol{x}
\end{equation*}
which is a specific example of a more general linear combination of basis functions
\begin{equation*}
\hat{f}(\boldsymbol{x}) = \theta_1 b_1(\boldsymbol{x}) + \cdots + \theta_q b_q(\boldsymbol{x}) = \sum_{i=1}^q \theta_i b_i(\boldsymbol{x}) = \boldsymbol{\theta}^T \boldsymbol{b}(\boldsymbol{x})
\end{equation*}

Linear models cannot capture nonlinear relations. There are a variety of other families of basis functions that can represent more expressive surrogate models. The remainder of this section discusses a few common families.

\end{frame}

\begin{frame}{Polynomial Basis Functions}
Polynomial basis functions consist of a product of design vector components, each raised to a power. Linear basis functions are a special case of polynomial basis functions.

In one dimension, a polynomial model of degree $k$ has the form
\begin{equation*}
\hat{f}(x) = \theta_0 + \theta_1 x + \theta_2 x^2 + \cdots + \theta_k x^k = \sum_{i=0}^k \theta_i x^i
\end{equation*}

In two dimensions, a polynomial model of degree $k$ has basis functions of the form
\begin{equation*}
b_{ij}(\boldsymbol{x}) = x_1^i x_2^j ~~{\textrm{for~}} i, j \in \{0, \cdots, k\}, i+j \leq k
\end{equation*}
\end{frame}
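\begin{frame}[fragile]{Polynomial fit: a Julia sketch}
A minimal Julia sketch of fitting a degree-$k$ polynomial surrogate by least squares (illustrative; \texttt{xs} and \texttt{ys} are assumed sample vectors):
\begin{lstlisting}
# Design matrix whose columns are the basis functions 1, x, ..., x^k.
design_matrix(xs, k) = [x^i for x in xs, i in 0:k]

function fit_polynomial(xs, ys, k)
    B = design_matrix(xs, k)
    return B \ ys   # least-squares solution of min ||y - B*theta||
end

theta = fit_polynomial([0.0, 0.5, 1.0, 1.5], [1.0, 0.9, 1.4, 2.1], 2)
\end{lstlisting}
The backslash operator solves the least-squares problem in a numerically stable way.
\end{frame}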
\begin{frame}{Sinusoidal Basis Functions}
Any continuous function over a finite domain can be represented using an infinite set of sinusoidal basis functions. A Fourier series can be constructed for any integrable univariate function $f$ on an interval $[a, b]$

\begin{equation*}
f(x) = \frac{\theta_0}{2} + \sum_{i=1}^\infty \bigg[ \theta_i^{\sin} \sin \bigg(\frac{2\pi ix}{b-a}\bigg)
+ \theta_i^{\cos} \cos \bigg(\frac{2\pi ix}{b-a}\bigg) \bigg]
\end{equation*}

where
\begin{equation*}
\begin{split}
\theta_0 &= \frac{2}{b-a} \int_a^b f(x)dx \\
\theta_i^{\sin}&= \frac{2}{b-a} \int_a^b f(x)\sin \bigg(\frac{2\pi ix}{b-a}\bigg) dx \\
\theta_i^{\cos}&= \frac{2}{b-a} \int_a^b f(x)\cos \bigg(\frac{2\pi ix}{b-a}\bigg) dx
\end{split}
\end{equation*}

\end{frame}


\begin{frame}{Radial Basis Functions}
A radial function $\psi$ is one which depends only on the distance of a point from some center point $\boldsymbol{c}$, such that it can be written $\psi(\boldsymbol{x}, \boldsymbol{c}) = \psi(||\boldsymbol{x} - \boldsymbol{c}||) = \psi(r)$.

\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/rbf.jpeg}
\end{figure}

\end{frame}

\section{Fitting Noisy Objective Functions}
\begin{frame}{Fitting Noisy Objective Functions}
Models fit using regression will pass as close as possible to every design point. When the objective function evaluations are noisy, complex models are likely to excessively contort themselves to pass through every point. However, smoother fits are often better predictors of the true underlying objective function. A regularization term is added in addition to the prediction error in order to give preference to solutions with lower weights. The resulting basis regression problem with $L_2$ regularization is:
\begin{equation*}
\underset{\theta}{\min} ||\boldsymbol{y-B\theta}||^2 + \lambda ||\boldsymbol{\theta}||^2_2
\end{equation*}

The optimal parameter vector is given by:
\begin{equation*}
\boldsymbol{\theta} = (\boldsymbol{B}^T\boldsymbol{B} + \lambda {\bf I})^{-1} \boldsymbol{B}^T\boldsymbol{y}
\end{equation*}

where \textbf{I} is the identity matrix.
\end{frame}
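\begin{frame}[fragile]{Regularized fit: a Julia sketch}
A minimal Julia sketch of the $L_2$-regularized solution above (illustrative; \texttt{B} is the basis matrix and \texttt{y} the observations):
\begin{lstlisting}
using LinearAlgebra   # for the identity operator I

# Ridge regression: theta = (B'B + lam*I)^(-1) * B'y
ridge(B, y, lam) = (B'B + lam*I) \ (B'y)
\end{lstlisting}
Larger \texttt{lam} shrinks the weights and smooths the fit; $\lambda = 0$ recovers ordinary least squares.
\end{frame}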
\section{Model Selection}

\begin{frame}{Model Selection}
So far, we have discussed how to fit a particular model to data. We generally want to minimize generalization error, which is a measure of the error of the model on the full design space, including points that may not be included in the data used to train the model. One way to measure generalization error is to use the expected squared error of its predictions:
\begin{equation*}
\epsilon_{\textrm{gen}} = \mathbb{E}_{\boldsymbol{x} \sim \boldsymbol{\chi}} \bigg[\bigg(f(x)-\hat{f}(x)\bigg)^2\bigg]
\end{equation*}

which is impossible to compute exactly, since it requires knowing the function we are trying to approximate. It may be tempting to estimate the generalization error of a model from the training error by using the mean squared error (MSE) of the model evaluated on the $m$ samples:
\begin{equation*}
\epsilon_{\textrm{train}} = \frac{1}{m} \sum_{i=1}^m \bigg(f(x^i)-\hat{f}(x^i)\bigg)^2
\end{equation*}

\end{frame}

\begin{frame}{Holdout}
A simple approach to estimating the generalization error is the holdout method, which partitions the available data into a test set $D_h$ with $h$ samples and a training set $D_t$ consisting of all remaining $m-h$ samples. The training set is used to fit model parameters. The held-out test set is not used during model fitting, and can thus be used to estimate the generalization error. Different split ratios are used, typically ranging from 50\% train, 50\% test to 90\% train, 10\% test, depending on the size and nature of the dataset. Using too few samples for training can result in poor fits, whereas using too many will result in poor generalization estimates.
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/holdout.jpeg}
\end{figure}

\end{frame}



\begin{frame}{Cross validation}
Here, the original dataset $D$ is randomly partitioned into $k$ sets $D_1, D_2, \cdots, D_k$ of equal, or approximately equal, size. We then train $k$ models, one on each subset of $k-1$ sets, and we use the withheld set to estimate the generalization error. The cross-validation estimate of generalization error is the mean generalization error over all folds.
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/cross-valid.jpeg}
\end{figure}
\end{frame}

\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Surrogate models are function approximations that can be optimized instead of the true, potentially expensive objective function.
\item Many surrogate models can be represented using a linear combination of basis functions.
\item Model selection involves a bias-variance trade-off between models with low complexity that cannot capture important trends and models with high complexity that overfit to noise.
\item Generalization error can be estimated using techniques such as holdout, $k$-fold cross validation, and the bootstrap.
\end{itemize}
\end{frame}
\end{document}

--------------------------------------------------------------------------------
/Lecture_notes/16_surrogate_optimization.tex:
--------------------------------------------------------------------------------
\documentclass{beamer}
\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
%numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}

\lstset{style=mystyle}

\mode<presentation> {

% The Beamer class comes with a number of default slide themes
% which change the colors and layouts of slides. Below this is a list
% of all the themes, uncomment each in turn to see what they look like.
38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Surrogate Optimization]{Numerical Optimization 16: Surrogate Optimization} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | 
%---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Prediction-Based Exploration} 139 | \begin{frame}{Prediction-Based Exploration} 140 | A Gaussian process provides probability distributions over the true objective function. These distributions can be used to guide an optimization process toward better design points. In prediction-based exploration, we select the minimizer of the surrogate function. If we use a Gaussian process surrogate model, prediction-based optimization has us select the minimizer of the mean function 141 | \begin{equation*} 142 | \boldsymbol{x}^{m+1} = \underset{\boldsymbol{x}\in \chi}{\arg\min}~~ \hat{\boldsymbol{\mu}}(\boldsymbol{x}) 143 | \end{equation*} 144 | 145 | where $\hat{\boldsymbol{\mu}}(\boldsymbol{x})$ is the predicted mean of a Gaussian process at a design point $\boldsymbol{x}$ based on the previous $m$ design points. 146 | 147 | Prediction-based optimization does not take uncertainty into account: new samples can be generated very close to existing samples, which wastes expensive function evaluations. 148 | \end{frame} 149 | 150 | \section{Error-Based Exploration} 151 | \begin{frame}{Error-Based Exploration} 152 | Error-based exploration seeks to increase confidence in the true function. A Gaussian process can tell us both the mean and standard deviation at every point. The next sample point is: 153 | \begin{equation*} 154 | \boldsymbol{x}^{m+1} = \underset{\boldsymbol{x}\in \chi}{\arg\max}~~ \hat{\boldsymbol{\sigma}}(\boldsymbol{x}) 155 | \end{equation*} 156 | where $\hat{\boldsymbol{\sigma}}(\boldsymbol{x})$ is the predicted standard deviation of a Gaussian process at a design point $\boldsymbol{x}$ based on the previous $m$ design points. 157 | 158 | \begin{figure} 159 | \centering 160 | \includegraphics[width=120mm]{Figs/error-explore.jpeg} 161 | \end{figure} 162 | \end{frame} 163 | 164 | 165 | \section{Lower Confidence Bound Exploration} 166 | \begin{frame}{Lower Confidence Bound Exploration} 167 | Error-based exploration may sample regions that are unpromising. Lower confidence bound exploration trades off between the greedy minimization employed by prediction-based optimization and the uncertainty reduction employed by error-based exploration with the following strategy, 168 | 169 | \begin{gather*} 170 | LB(x) = \hat{\mu}(\boldsymbol{x})-\alpha \hat{\sigma}(\boldsymbol{x}) 171 | \end{gather*} 172 | where $\alpha \geq 0$ controls the trade-off between exploration and exploitation. 173 | \begin{figure} 174 | \centering 175 | \includegraphics[width=120mm]{Figs/LB.jpeg} 176 | \end{figure} 177 | 178 | \end{frame} 179 | 180 | \section{Probability of Improvement Exploration} 181 | \begin{frame}{Probability of Improvement Exploration} 182 | We select the design point that maximizes the chance that the new point will be better than any other. The improvement for a function sampled at $x$ producing $y=f(x)$ is 183 | \begin{equation*} 184 | I(y) = 185 | \begin{cases} 186 | y_{\min} - y & \textrm{if~} y < y_{\min}\\ 187 | 0 & {\textrm{otherwise}} 188 | \end{cases} 189 | \end{equation*} 190 | The probability of improvement when $\hat{\sigma} > 0$ is 191 | \begin{equation*} 192 | P(y < y_{\min}) = \int_{-\infty}^{y_{\min}} \mathcal{N}(y|\hat{\mu},\hat{\sigma}^2)\,dy = \Phi\bigg(\frac{y_{\min}-\hat{\mu}}{\hat{\sigma}}\bigg) 193 | \end{equation*} 194 | where $\Phi$ is the standard normal cumulative distribution function. 195 | \end{frame} 196 | 197 | \section{Expected Improvement Exploration} 198 | \begin{frame}{Expected Improvement Exploration} 199 | We can also select the design point that maximizes the expected improvement over the current best function value. Writing $y = \hat{\mu} + \hat{\sigma}z$ with $z \sim \mathcal{N}(0,1)$ and $z' = (y_{\min} - \hat{\mu})/\hat{\sigma}$, the improvement is 200 | \begin{equation*} 201 | I(y) = 202 | \begin{cases} 203 | (y_{\min} - \hat{\mu}) - \hat{\sigma}z & \textrm{if~} z < z' \textrm{~and~} \hat{\sigma} > 0 \\ 204 | 0 & {\textrm{otherwise}}\\ 205 | \end{cases} 206 | \end{equation*} 207 | where $\hat{\mu}$ and $\hat{\sigma}$ are the predicted mean and standard deviation.
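Taking the expectation of $I(y)$ under the predicted distribution $\mathcal{N}(\hat{\mu}, \hat{\sigma}^2)$ and substituting $y = \hat{\mu} + \hat{\sigma}z$ gives
\begin{equation*}
\mathbb{E}[I(y)] = \int_{-\infty}^{y_{\min}} (y_{\min} - y)\, \mathcal{N}(y|\hat{\mu},\hat{\sigma}^2)\, dy = \hat{\sigma} \int_{-\infty}^{z'} (z' - z)\, \mathcal{N}(z|0,1)\, dz
\end{equation*}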
208 | 209 | We can calculate the expected improvement in closed form using the Gaussian process: 210 | \begin{equation*} 211 | \begin{split} 212 | \mathbb{E}[I(y)] &= \hat{\sigma} z' \int_{-\infty}^{z'} \mathcal{N}(z|0,1)dz - \hat{\sigma} \int_{-\infty}^{z'} z\, \mathcal{N}(z|0,1)dz\\ 213 | & = (y_{\min} - \hat{\mu}) P (y \leq y_{\min}) + \hat{\sigma}^2 \mathcal{N}(y_{\min} | \hat{\mu}, \hat{\sigma}^2) 214 | \end{split} 215 | \end{equation*} 216 | 217 | \end{frame} 218 | 219 | \begin{frame}{Expected Improvement Exploration} 220 | 221 | \begin{figure} 222 | \centering 223 | \includegraphics[width=125mm]{Figs/EI.jpeg} 224 | \end{figure} 225 | 226 | \end{frame} 227 | 228 | \section{Summary} 229 | \begin{frame}{Summary} 230 | \begin{itemize} 231 | \item Gaussian processes can be used to guide the optimization process using a variety of strategies that use estimates of quantities such as the lower confidence bound, probability of improvement, and expected improvement. 232 | \item Some problems do not allow for the evaluation of unsafe designs, in which case we can use safe exploration strategies that rely on Gaussian processes. 233 | \end{itemize} 234 | \end{frame} 235 | \end{document} 236 | 237 | -------------------------------------------------------------------------------- /Lecture_notes/17_uncertainty.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode<presentation> { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme.
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Uncertainty]{Numerical Optimization 17: Uncertainty} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Uncertainty} 139 | \begin{frame}{Uncertainty} 140 | In many engineering tasks, however, there may be uncertainty due to a number of factors, such as model approximations, imprecision, and fluctuations of parameters over time. We want to minimize $f(x, z)$, but we do not have control over $z$. Feasibility depends on both the design vector $x$ and the uncertain vector $z$. 
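A common way to handle this is to optimize a statistic of $f$ under the uncertainty, for example the expected value or the worst case over $z$:
\begin{equation*}
\underset{\boldsymbol{x}}{\min}~ \mathbb{E}_{z}\left[f(\boldsymbol{x}, z)\right] \qquad \textrm{or} \qquad \underset{\boldsymbol{x}}{\min}~\underset{z \in Z}{\max}~ f(\boldsymbol{x}, z)
\end{equation*}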
141 | \begin{figure} 142 | \centering 143 | \includegraphics[width=110mm]{Figs/uncertainty.jpeg} 144 | \end{figure} 145 | \end{frame} 146 | 147 | \section{Polynomial Chaos} 148 | \begin{frame}{Polynomial chaos} 149 | \textcolor{blue}{Polynomial chaos} is a method for fitting a polynomial to $f(x, z)$ and using the resulting surrogate model to estimate the mean and variance. 150 | 151 | In one dimension, we approximate $f(z)$ with a surrogate model consisting of $k$ polynomial basis functions, $b_1, \cdots, b_k$: 152 | \begin{equation*} 153 | f(z) \approx \hat{f}(z) = \sum_{i=1}^k \theta_i b_i(z) 154 | \end{equation*} 155 | The mean of $\hat{f}$ can be derived as follows 156 | \begin{equation*} 157 | \begin{split} 158 | \hat{\mu} & = \int_Z p(z)\hat{f}(z) dz = \int_Z \sum_{i=1}^k p(z) \theta_i b_i(z) dz 159 | = \sum_{i=1}^k \int_Z \theta_i b_i(z) p(z)dz \\ 160 | & = \theta_1 \int_Z b_1(z) p(z)dz + \cdots + \theta_k \int_Z b_k(z) p(z)dz 161 | \end{split} 162 | \end{equation*} 163 | \end{frame} 164 | 165 | \begin{frame}{Polynomial chaos} 166 | The variance of $\hat{f}$ can be derived as follows 167 | \begin{equation*} 168 | \begin{split} 169 | \hat{\sigma}^2 & = \mathbb{E}[\hat{f}^2] - (\mathbb{E}[\hat{f}])^2 170 | = \int_Z \hat{f}^2(z)p(z) dz - \hat{\mu}^2\\ 171 | &= \int_Z \sum_{i=1}^k\sum_{j=1}^k \theta_i\theta_j b_i(z)b_j(z)p(z)dz - \hat{\mu}^2\\ 172 | & = \int_Z \bigg( \sum_{i=1}^k \theta_i^2 b_i^2(z) + 2\sum_{i=2}^k\sum_{j=1}^{i-1} \theta_i\theta_j b_i(z)b_j(z)\bigg)p(z)dz -\hat{\mu}^2 \\ 173 | & = \sum_{i=1}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz + 2\sum_{i=2}^k\sum_{j=1}^{i-1} \theta_i\theta_j \int_Z b_i(z)b_j(z)p(z)dz 174 | - \hat{\mu}^2 175 | \end{split} 176 | \end{equation*} 177 | \end{frame} 178 | 179 | \section{Orthogonal polynomial basis} 180 | \begin{frame}{Orthogonal polynomial basis} 181 | The mean and variance can be efficiently computed if the basis functions are chosen to be orthogonal under $p$. Two basis functions $b_i$ and $b_j$ are orthogonal with respect to a probability density $p(z)$ if 182 | \begin{equation*} 183 | \int_Z b_i(z)b_j(z)p(z)dz = 0. ~({\textrm{if~}} i\neq j) 184 | \end{equation*} 185 | 186 | If the chosen basis functions are all orthogonal to one another and the first basis function is $b_1(z) = 1$, the mean is: 187 | \begin{equation*} 188 | \begin{split} 189 | \hat{\mu} & = \theta_1 \int_Z b_1(z) p(z)dz + \cdots + \theta_k \int_Z b_k(z) p(z)dz\\ 190 | & = \theta_1 \int_Z b^2_1(z) p(z)dz + \cdots + \theta_k \int_Z b_1(z) b_k(z) p(z)dz\\ 191 | & = \theta_1 192 | \end{split} 193 | \end{equation*} 194 | 195 | \end{frame} 196 | 197 | \begin{frame}{Orthogonal polynomial basis} 198 | Similarly, the variance is 199 | \begin{equation*} 200 | \begin{split} 201 | \hat{\sigma}^2 & = \sum_{i=1}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz + 2\sum_{i=2}^k\sum_{j=1}^{i-1} \theta_i\theta_j \int_Z b_i(z)b_j(z)p(z)dz - \hat{\mu}^2\\ 202 | & = \sum_{i=1}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz - \hat{\mu}^2 \\ 203 | & = \theta_1^2 \int_Z b_1^2(z)p(z)dz + \sum_{i=2}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz - \hat{\mu}^2 \\ 204 | & = \sum_{i=2}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz 205 | \end{split} 206 | \end{equation*} 207 | using $\hat{\mu} = \theta_1$ and $\int_Z b_1^2(z)p(z)dz = \int_Z p(z)dz = 1$. 208 | \end{frame} 209 | 210 | \begin{frame}{Orthogonal polynomial basis} 211 | The mean thus follows immediately from fitting a surrogate model to the observed data, and the variance can be very efficiently computed given the values $\int_Z b_i^2(z)p(z)dz$ for a choice of basis functions and probability distribution.
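For example, for a uniform $p(z)$ over $[-1,1]$ the Legendre polynomials are orthogonal, with first few basis functions
\begin{equation*}
b_1(z) = 1, \qquad b_2(z) = z, \qquad b_3(z) = \tfrac{1}{2}(3z^2 - 1),
\end{equation*}
while for a Gaussian $p(z)$ the Hermite polynomials play the same role.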
All orthogonal polynomials satisfy the recurrence relation: 212 | \begin{equation*} 213 | b_{i+1}(z) = 214 | \begin{cases} 215 | (z-\alpha_i)b_i(z) & i=1\\ 216 | (z-\alpha_i)b_i(z) - \beta_i b_{i-1}(z) & {\textrm{else}} 217 | \end{cases} 218 | \end{equation*} 219 | with $b_1(z) = 1$ and weights 220 | \begin{equation*} 221 | \begin{split} 222 | \alpha_i &= \frac{\int_Z z b_i^2(z)p(z)dz} {\int_Z b_i^2(z)p(z)dz}\\ 223 | \beta_i &= \frac{\int_Z b_i^2(z)p(z)dz}{\int_Z b_{i-1}^2(z)p(z)dz} 224 | \end{split} 225 | \end{equation*} 226 | The recurrence relation can be used to generate the basis functions. Each basis function $b_i$ is a polynomial of degree $i-1$. 227 | 228 | \end{frame} 229 | 230 | \begin{frame}{Orthogonal polynomial basis functions} 231 | \begin{figure} 232 | \centering 233 | \includegraphics[width=120mm]{Figs/orthogonal.jpeg} 234 | \end{figure} 235 | \end{frame} 236 | 237 | \section{Coefficients} 238 | \begin{frame}{Coefficients} 239 | The coefficients $\theta_1, \cdots, \theta_k$ can be inferred by exploiting the orthogonality of the basis functions, producing an integration term amenable to \textcolor{blue}{Gaussian quadrature}. 240 | \begin{equation*} 241 | \begin{split} 242 | f(z) &= \sum_{i=1}^k \theta_i b_i(z)\\ 243 | \int_Z f(z)b_j(z)p(z)dz &= \int_Z \bigg(\sum_{i=1}^k \theta_i b_i(z) \bigg) b_j(z)p(z)dz\\ 244 | &= \sum_{i=1}^k \theta_i \int_Z b_i(z)b_j(z)p(z)dz \\ 245 | &= \theta_j \int_Z b_j^2(z)p(z)dz\\ 246 | \implies \theta_j &= \frac{\int_Z f(z)b_j(z)p(z)dz}{\int_Z b_j^2(z)p(z)dz} 247 | \end{split} 248 | \end{equation*} 249 | \end{frame} 250 | 251 | \section{Multivariate} 252 | \begin{frame}{Multivariate} 253 | Polynomial chaos can be applied to functions with multiple random inputs. Multivariate basis functions over $m$ independent variables are constructed as a product over univariate orthogonal polynomials: 254 | \begin{equation*} 255 | b(z_1, \cdots, z_m) = \prod_{i=1}^{m} b_{k_i}(z_i) 256 | \end{equation*} 257 | where each $b_{k_i}$ is orthogonal under $p(z_i)$, so the products are orthogonal under the joint density. 258 | \end{frame} 259 | 260 | \section{Summary} 261 | \begin{frame}{Summary} 262 | \begin{itemize} 263 | \item Polynomial chaos is a powerful uncertainty propagation technique based on orthogonal polynomials. 264 | \item Bayesian Monte Carlo uses Gaussian processes to efficiently arrive at the moments with analytic results for Gaussian kernels.
265 | \end{itemize} 266 | \end{frame} 267 | \end{document} 268 | 269 | -------------------------------------------------------------------------------- /Lecture_notes/18_symbolic_regression.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode<presentation> { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme.
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Symbolic Regression]{Numerical Optimization 18: Symbolic Regression} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Grammars} 139 | \begin{frame}{Grammars} 140 | An expression can be represented by a tree of symbols. For example, the mathematical expression $x + \ln2$ can be represented using the tree consisting of the symbols $+, x, \ln$, and 2. Grammars specify constraints on the space of possible expressions. 141 | \begin{figure} 142 | \centering 143 | \includegraphics[width=125mm]{Figs/tree.jpeg} 144 | \end{figure} 145 | 146 | \end{frame} 147 | 148 | \section{Constraints} 149 | \begin{frame}{Constraints} 150 | Constraints are not typically specified directly through a known feasible set X . 
Instead, the feasible set is typically formed from two types of constraints: 151 | 152 | \begin{itemize} 153 | \item equality constraints, $h(x)=0$ 154 | \item inequality constraints, $g(x) \leq 0$ 155 | 156 | \end{itemize} 157 | 158 | Any optimization problem can be rewritten using these constraints 159 | \begin{gather*} 160 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\ 161 | {s.t.}~~~~ h_i(x) = 0 \\ 162 | ~~~~~~~~~ g_j(x) \leq 0 163 | \end{gather*} 164 | 165 | \end{frame} 166 | 167 | \section{Genetic Programming} 168 | \begin{frame}{Genetic Programming} 169 | Genetic programming represents individuals using trees instead of fixed-length strings; trees are better at representing mathematical functions, programs, decision trees, and other hierarchical structures. For example, the following expression can be encoded as a tree: 170 | 171 | \begin{equation*} 172 | x = \frac{b+a}{2} + \frac{b-a}{2}\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg) 173 | \end{equation*} 174 | 175 | \end{frame} 176 | 177 | \section{Lagrange Multipliers} 178 | \begin{frame}{Lagrange Multipliers} 179 | The method of Lagrange multipliers is used to optimize a function subject to equality 180 | constraints. 181 | \begin{gather*} 182 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\ 183 | {s.t.}~~~~ h_i(x) = 0 184 | \end{gather*} 185 | 186 | where $f$ and $h$ have continuous partial derivatives. 187 | 188 | We can formulate the Lagrangian, which is a function of the design variables and the multiplier $\lambda$, 189 | \begin{gather*} 190 | \mathcal{L}(x, \lambda) = f(x) - \lambda h(x) 191 | \end{gather*} 192 | 193 | We then solve $\nabla \mathcal{L}(x, \lambda) = 0$. Specifically, $\nabla_x \mathcal{L} = 0$ gives us the condition $\nabla f = \lambda \nabla h$, and $\nabla_\lambda \mathcal{L} = 0$ gives us $h(x) = 0$. Any solution is considered a critical point. 194 | 195 | \end{frame} 196 | 197 | \begin{frame}{Lagrange Multipliers to a single equality condition} 198 | The method of Lagrange multipliers is used to optimize a function subject to equality 199 | constraints. 200 | \begin{gather*} 201 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] \\ 202 | {s.t.}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ x_1 - x_2^2 = 0 203 | \end{gather*} 204 | 205 | We can formulate the Lagrangian, 206 | \begin{equation*} 207 | \mathcal{L}(x, \lambda) = -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] + \lambda(x_1 - x_2^2) 208 | \end{equation*} 209 | We compute, with $E = \exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2]$, 210 | \begin{itemize} 211 | \item $\frac{\partial \mathcal{L}}{\partial x_1} = 2x_2(x_1x_2-3/2)E + \lambda$ 212 | \item $\frac{\partial \mathcal{L}}{\partial x_2} = \big[2x_1(x_1x_2-3/2) + 2(x_2-3/2)\big]E - 2\lambda x_2$ 213 | \item $\frac{\partial \mathcal{L}}{\partial \lambda} = x_1 - x_2^2$ 214 | \end{itemize} 215 | and set them to zero to locate the critical points. 216 | \end{frame} 217 | 218 | 219 | \begin{frame}{Lagrange Multipliers to multiple equality conditions} 220 | The method extends directly to problems with multiple equality 221 | constraints, introducing one multiplier per constraint.
222 | \begin{gather*} 223 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x}) \\ 224 | {s.t.}~~~~~~~~~~~~~~ h_i(x) = 0, \quad i = 1, \cdots, \ell 225 | \end{gather*} 226 | 227 | We can formulate the generalized Lagrangian, 228 | \begin{equation*} 229 | \mathcal{L}(x, \boldsymbol{\lambda}) = f(x) - \sum_{i=1}^{\ell} \lambda_i h_i(x) 230 | \end{equation*} 231 | We compute 232 | \begin{itemize} 233 | \item $\nabla_x \mathcal{L} = 0$, which gives $\nabla f = \sum_{i=1}^{\ell} \lambda_i \nabla h_i$ 234 | \item $\nabla_{\boldsymbol{\lambda}} \mathcal{L} = 0$, which recovers the constraints $h_i(x) = 0$ 235 | \end{itemize} 236 | and solve the resulting system jointly for the critical points. 237 | 238 | \end{frame} 239 | 240 | \section{Summary} 241 | \begin{frame}{Summary} 242 | \begin{itemize} 243 | \item Constraints are requirements on the design points that a solution must satisfy. 244 | \item Some constraints can be transformed or substituted into the problem to result in an unconstrained optimization problem. 245 | \item Analytical methods using Lagrange multipliers yield the generalized Lagrangian and the necessary conditions for optimality under constraints. 246 | \item A constrained optimization problem has a dual problem formulation that is easier to solve and whose solution is a lower bound of the solution to the original problem. 247 | \end{itemize} 248 | \end{frame} 249 | \end{document} 250 | 251 | -------------------------------------------------------------------------------- /Lecture_notes/A1_trust_region.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/A1_trust_region.pdf -------------------------------------------------------------------------------- /Lecture_notes/A1_trust_region.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | %% 32 | %% Julia definition (c) 2014 Jubobs 33 | %% 34 | \lstdefinelanguage{Julia}% 35 | {morekeywords={abstract,break,case,catch,const,continue,do,else,elseif,% 36 | end,export,false,for,function,immutable,import,importall,if,in,% 37 | macro,module,otherwise,quote,return,switch,true,try,type,typealias,% 38 | using,while},% 39 | sensitive=true,% 40 | alsoother={$},% 41 | morecomment=[l]\#,% 42 | morecomment=[n]{\#=}{=\#},% 43 | morestring=[s]{"}{"},% 44 | morestring=[m]{'}{'},% 45 | }[keywords,comments,strings]% 46 | 47 | \lstset{% 48 | language = Julia, 49 | basicstyle = \ttfamily, 50 | keywordstyle = \bfseries\color{blue}, 51 | stringstyle = \color{magenta}, 52 | commentstyle = \color{ForestGreen}, 53 | showstringspaces = false, 54 | } 55 | 56 | 57 |
\lstset{style=mystyle} 58 | 59 | \mode<presentation> { 60 | 61 | % The Beamer class comes with a number of default slide themes 62 | % which change the colors and layouts of slides. Below this is a list 63 | % of all the themes, uncomment each in turn to see what they look like. 64 | 65 | %\usetheme{default} 66 | \usetheme{AnnArbor} 67 | %\usetheme{Antibes} 68 | %\usetheme{Bergen} 69 | %\usetheme{Berkeley} 70 | %\usetheme{Berlin} 71 | %\usetheme{Boadilla} 72 | %\usetheme{CambridgeUS} 73 | %\usetheme{Copenhagen} 74 | %\usetheme{Darmstadt} 75 | %\usetheme{Dresden} 76 | %\usetheme{Frankfurt} 77 | %\usetheme{Goettingen} 78 | %\usetheme{Hannover} 79 | %\usetheme{Ilmenau} 80 | %\usetheme{JuanLesPins} 81 | %\usetheme{Luebeck} 82 | %\usetheme{Madrid} 83 | %\usetheme{Malmoe} 84 | %\usetheme{Marburg} 85 | %\usetheme{Montpellier} 86 | %\usetheme{PaloAlto} 87 | %\usetheme{Pittsburgh} 88 | %\usetheme{Rochester} 89 | %\usetheme{Singapore} 90 | %\usetheme{Szeged} 91 | %\usetheme{Warsaw} 92 | 93 | % As well as themes, the Beamer class has a number of color themes 94 | % for any slide theme. Uncomment each of these in turn to see how it 95 | % changes the colors of your current slide theme. 96 | 97 | %\usecolortheme{albatross} 98 | %\usecolortheme{beaver} 99 | %\usecolortheme{beetle} 100 | %\usecolortheme{crane} 101 | %\usecolortheme{dolphin} 102 | %\usecolortheme{dove} 103 | %\usecolortheme{fly} 104 | %\usecolortheme{lily} 105 | %\usecolortheme{orchid} 106 | %\usecolortheme{rose} 107 | %\usecolortheme{seagull} 108 | %\usecolortheme{seahorse} 109 | %\usecolortheme{whale} 110 | %\usecolortheme{wolverine} 111 | 112 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 113 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 114 | 115 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 116 | } 117 | 118 | \usepackage{graphicx} % Allows including images 119 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 120 | %\usepackage {tikz} 121 | \usepackage{tkz-graph} 122 | \GraphInit[vstyle = Shade] 123 | \tikzset{ 124 | LabelStyle/.style = { rectangle, rounded corners, draw, 125 | minimum width = 2em, fill = yellow!50, 126 | text = red, font = \bfseries }, 127 | VertexStyle/.append style = { inner sep=5pt, 128 | font = \normalsize\bfseries}, 129 | EdgeStyle/.append style = {->, bend left} } 130 | \usetikzlibrary {positioning} 131 | %\usepackage {xcolor} 132 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 133 | %---------------------------------------------------------------------------------------- 134 | % TITLE PAGE 135 | %---------------------------------------------------------------------------------------- 136 | 137 | \title[Trust Region Methods]{Numerical Optimization: Trust Region Methods} % The short title appears at the bottom of every slide, the full title is only on the title page 138 | 139 | \author{Qiang Zhu} % Your name 140 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 141 | { 142 | University of Nevada Las Vegas\\ % Your institution for the title page 143 | \medskip 144 | } 145 | \date{\today} % Date, can be changed to a custom date 146 | 147 | \begin{document} 148 | 149 | \begin{frame} 150 | \titlepage % Print the title page as the first slide 151 | \end{frame} 152 | 153 | \begin{frame}
154 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 155 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 156 | \end{frame} 157 | 158 | %---------------------------------------------------------------------------------------- 159 | % PRESENTATION SLIDES 160 | %---------------------------------------------------------------------------------------- 161 | 162 | %------------------------------------------------ 163 | 164 | \section{Trust Region Model} 165 | \begin{frame}{The problem of line search} 166 | In the line search method, one usually chooses a direction based on the first- or second-order derivatives and then performs an approximate 1D search along it. 167 | If the current iterate is far from the local minimum, such a search may result in excessively large steps or premature convergence. 168 | 169 | \begin{figure} 170 | \centering 171 | \includegraphics[width=90mm]{Figs/trust-region.jpeg} 172 | \end{figure} 173 | \end{frame} 174 | 175 | \begin{frame}{Line search vs. trust region} 176 | \begin{alertblock}{Line search} 177 | \begin{itemize} 178 | \item Find the direction of improvement 179 | \item Select a step length 180 | \end{itemize} 181 | \end{alertblock} 182 | \vfill 183 | \begin{alertblock}{Trust region} 184 | \begin{itemize} 185 | \item Select a trust region (within a hypersphere) 186 | \item Find a point of improvement 187 | \end{itemize} 188 | \end{alertblock} 189 | 190 | \end{frame} 191 | 192 | \section{The outline of trust region approach} 193 | \begin{frame}{Quadratic approximation} 194 | In this lecture, we will assume that the model function $m_k$ that is used at each iterate $x_k$ is quadratic. 195 | $m_k$ is based on the Taylor-series expansion of $f$, 196 | \begin{equation*} 197 | f(x_k + p) = f(x_k) + \nabla f(x_k)^T p + \frac{1}{2}p^T \nabla^2 f(x_k + tp)p, 198 | \end{equation*} 199 | where $t$ is some scalar in the interval (0,1). 200 | 201 | By using an approximation $B_k$ to the Hessian in the second-order term, $m_k$ is defined as follows: 202 | \begin{equation*} 203 | m_k(p) = f(x_k) + \nabla f(x_k)^T p + \frac{1}{2}p^T B_k p, 204 | \end{equation*} 205 | 206 | The difference between $m_k(p)$ and $f(x_k + p)$ is $O(\|p\|^2)$, which is small when $p$ is small. 207 | 208 | \end{frame} 209 | 210 | \begin{frame}{Trust region step} 211 | The trust-region method steps to the minimizer of $m_k$ within the dotted circle, yielding a more significant reduction in $f$ and better progress toward the solution. 212 | 213 | To obtain each step, we seek a solution of the subproblem 214 | \begin{gather*} 215 | \textrm{min}~ m_k(p) = f(x_k) + \nabla f(x_k)^T p + \frac{1}{2}p^T B_k p, \\ 216 | \textrm{s.t.}~~ ||p||_2 \leq \Delta_k, 217 | \end{gather*} 218 | 219 | where $\Delta_k$ is the \textcolor{blue}{trust-region radius}. 220 | 221 | Thus, the trust-region approach requires us to solve a sequence of subproblems 222 | in which the objective function and constraint (which can be written as $p^Tp \leq \Delta_k^2$) 223 | are both quadratic, which is easy to solve if the model is convex. 224 | \end{frame} 225 | 226 | \begin{frame}{How to adjust the $\Delta_k$?} 227 | For a given step, we define 228 | \begin{equation*} 229 | \rho_k = \frac{f(x_k) - f(x_k+p_k)}{m_k(0) - m_k(p_k)} 230 | \end{equation*} 231 | The numerator is called the \textcolor{blue}{actual reduction}.
\\ 232 | The denominator is the \textcolor{blue}{predicted reduction}, which is non-negative because $p=0$ is feasible for the subproblem and $p_k$ minimizes $m_k$ within the trust region. \\ 233 | 234 | \begin{itemize} 235 | \item if $\rho_k < 0$, the new objective value $f(x_k + p_k)$ is greater than $f(x_k)$, \textcolor{red}{reject}. 236 | \item if $\rho_k \approx 1$, there is good agreement between the model $m_k$ and the function $f$, \textcolor{red}{expand the trust region} 237 | \item if $0<\rho_k \ll 1$, \textcolor{red}{shrink the trust region} by reducing $\Delta_k$ 238 | \end{itemize} 239 | \end{frame} 240 | 241 | \begin{frame}{Algorithm} 242 | 243 | \lstinputlisting[language=Julia]{trust.jl} 244 | 245 | \end{frame} 246 | 247 | \section{Summary} 248 | \begin{frame}{Summary} 249 | \begin{itemize} 250 | \item The trust region method may perform better when the initial point is far from the local minimum 251 | \item The correctness of the trust region method relies on the accuracy of the model function 252 | \item The step size is controlled by the trust-region radius, which is updated at each step 253 | \item Quadratic approximation needs the information of the Hessian 254 | \item The subproblem optimization may be tricky when the Hessian is not positive definite 255 | \end{itemize} 256 | \end{frame} 257 | \end{document} 258 | 259 | -------------------------------------------------------------------------------- /Lecture_notes/Figs/C60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/C60.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/CMA.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/CMA.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/Cross-entropy.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/Cross-entropy.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/EI.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/EI.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/GPR-raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/GPR-raw.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/GPR-train-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/GPR-train-1.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/GPR-train-2.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/GPR-train-2.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/LB.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/LB.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/algo_opt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/algo_opt.jpg -------------------------------------------------------------------------------- /Lecture_notes/Figs/bracket.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/bracket.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/cauchy.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/cauchy.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/cg-sd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/cg-sd.jpg -------------------------------------------------------------------------------- /Lecture_notes/Figs/constraint-ab.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/constraint-ab.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/coordinate-improved.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/coordinate-improved.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/coordinate.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/coordinate.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/cross-valid.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/cross-valid.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/curvature1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/curvature1.jpeg 
-------------------------------------------------------------------------------- /Lecture_notes/Figs/curvature2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/curvature2.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/derivative-comparison.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/derivative-comparison.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/error-explore.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/error-explore.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/firefly.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/firefly.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/flat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/flat.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/gaussian.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/gaussian.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/gp-opt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/gp-opt.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/graph1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/graph1.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/graph2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/graph2.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/grid_search.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/grid_search.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/holdout.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/holdout.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/ip.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/ip.gif -------------------------------------------------------------------------------- /Lecture_notes/Figs/julia-comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/julia-comp.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/julia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/julia.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/kernel.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/kernel.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/linesearch.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/linesearch.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/minimum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/minimum.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/momentum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/momentum.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/multi-minima.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/multi-minima.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/n-momentum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/n-momentum.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/newton-1d.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/newton-1d.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/orthogonal.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/orthogonal.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/powell.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/powell.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/prob.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/prob.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/pso.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/pso.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/quasi-newton.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/quasi-newton.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/rbf.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/rbf.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/sample_all.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/sample_all.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/search.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/search.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/selection.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/selection.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/sgd.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/sgd.jpeg 
-------------------------------------------------------------------------------- /Lecture_notes/Figs/simplex-performance.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/simplex-performance.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/simplex.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/simplex.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/simplex_algo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/simplex_algo.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/solution-space.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/solution-space.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/strafied.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/strafied.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/sufficient_decrease.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/sufficient_decrease.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/tree.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/tree.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/trust-region.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/trust-region.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/two-conditions.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/two-conditions.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/uncertainty.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/uncertainty.jpeg -------------------------------------------------------------------------------- 
/Lecture_notes/Figs/uni-proj.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/uni-proj.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/unimodal.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/unimodal.jpeg -------------------------------------------------------------------------------- /Lecture_notes/trust.jl: -------------------------------------------------------------------------------- 1 | # Trust-region descent, following the course textbook (Kochenderfer & Wheeler). 2 | # f: objective; G: gradient of f; H: Hessian of f; x: initial design point; 3 | # k_max: number of iterations; delta: initial trust-region radius. 4 | # Requires a solve_subproblem(G, H, x, delta) that returns (x1, y1): the 5 | # minimizer of the local quadratic model within radius delta, and its model value. 6 | function trust_region_descent(f, G, H, x, k_max; 7 | eta1=0.25, eta2=0.5, gamma1=0.5, gamma2=2.0, delta=1.0) 8 | 9 | y = f(x) 10 | for k in 1 : k_max 11 | x1, y1 = solve_subproblem(G, H, x, delta) 12 | r = (y - f(x1)) / (y - y1) # actual reduction / predicted reduction 13 | if r < eta1 # poor agreement: reject the step, shrink the radius 14 | delta *= gamma1 15 | else 16 | x, y = x1, y1 # accept the step 17 | if r > eta2 # good agreement: expand the radius 18 | delta *= gamma2 19 | end 20 | end 21 | end 22 | return x 23 | end -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Numerical-Optimization 2 | This is the course material for a numerical optimization course to be taught in summer 2020 (June-August, 10 weeks) through Webex. I have not finalized the schedule yet. It is completely open to everyone. If you are interested, please feel free to contact qiang.zhu@unlv.edu between 2020/06-2020/08. 3 | 4 | 5 | ![E](https://github.com/qzhu2017/Numerical-Optimization/blob/master/Lecture_notes/Figs/ip.gif) 6 | 7 | ## Textbooks 8 | This course is intended to cover 9 | - Various optimization methods used in scientific computing 10 | - Julia programming 11 | 12 | The course will mostly follow the book of [Algorithms for Optimization by Mykel J. Kochenderfer and Tim A. Wheeler](https://mitpress.mit.edu/books/algorithms-optimization). 13 | Here is an interesting [video](https://www.youtube.com/watch?v=ofWy5kaZU3g) by one of the authors talking about how they wrote the book at a recent Julia conference. 14 | 15 | For some details on the optimization algorithms, we will refer to [Numerical Optimization by Jorge Nocedal and Stephen J. Wright](https://link.springer.com/book/10.1007/978-0-387-40065-5) 16 | 17 | ## Format 18 | Though this is a virtual class, we plan to make it as interactive as possible. Typically, each class is composed of three units. 19 | 20 | - Code review (1 or 2 volunteers to review the previous homework assignments) 21 | - Lecture (math details for each algorithm) 22 | - Coding session (implementing the algorithms from each lecture) 23 | 24 | All coding will be done through Jupyter notebooks. 25 | 26 | Each class will take about 90 minutes. 27 | 28 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | cd Lecture_notes/ 2 | for f in *.tex ; do 3 | pdflatex $f 4 | done 5 | rm *.aux *.out *.log *.nav *.snm *.toc 6 | --------------------------------------------------------------------------------