├── Jupyter_notebooks ├── 01-Julia-intro.ipynb ├── 02-derivatives.ipynb ├── 03-bracketing methods.ipynb ├── 04-local-descent.ipynb ├── 05-SD-CG.ipynb ├── 06-other-1st-order-methods.ipynb ├── 07-Newton.ipynb └── 08-quasi-newton.ipynb ├── Lecture_notes ├── 01_intro.pdf ├── 01_intro.tex ├── 02_derivative.pdf ├── 02_derivative.tex ├── 03_bracket.pdf ├── 03_bracket.tex ├── 04_local_decent.pdf ├── 04_local_decent.tex ├── 05_first_order_1.pdf ├── 05_first_order_1.tex ├── 06_first_order_2.pdf ├── 06_first_order_2.tex ├── 06_gradient_descent.pdf ├── 07_Newton_method.pdf ├── 07_Newton_method.tex ├── 08_Quasi_Newton.pdf ├── 08_Quasi_Newton.tex ├── 09_direct_methods.pdf ├── 09_direct_methods.tex ├── 10_Stochastic_methods.tex ├── 11_evolutinary_methods.tex ├── 12_constrained_optimization.tex ├── 13_sampling_plans.tex ├── 14_surrogate_models.tex ├── 15_surrogate_models_prob.tex ├── 16_surrogate_optimization.tex ├── 17_uncertainty.tex ├── 18_symbolic_regression.tex ├── A1_trust_region.pdf ├── A1_trust_region.tex ├── Figs │ ├── C60.png │ ├── CMA.jpeg │ ├── Cross-entropy.jpeg │ ├── EI.jpeg │ ├── GPR-raw.png │ ├── GPR-train-1.png │ ├── GPR-train-2.png │ ├── LB.jpeg │ ├── algo_opt.jpg │ ├── bracket.jpeg │ ├── cauchy.jpeg │ ├── cg-sd.jpg │ ├── constraint-ab.jpeg │ ├── coordinate-improved.jpeg │ ├── coordinate.jpeg │ ├── cross-valid.jpeg │ ├── curvature1.jpeg │ ├── curvature2.jpeg │ ├── derivative-comparison.jpeg │ ├── error-explore.jpeg │ ├── firefly.jpeg │ ├── flat.jpeg │ ├── gaussian.jpeg │ ├── gp-opt.png │ ├── graph1.jpeg │ ├── graph2.jpeg │ ├── grid_search.jpeg │ ├── holdout.jpeg │ ├── ip.gif │ ├── julia-comp.png │ ├── julia.png │ ├── kernel.jpeg │ ├── linesearch.jpeg │ ├── minimum.jpeg │ ├── momentum.jpeg │ ├── multi-minima.jpeg │ ├── n-momentum.jpeg │ ├── newton-1d.jpeg │ ├── orthogonal.jpeg │ ├── powell.jpeg │ ├── prob.jpeg │ ├── pso.jpeg │ ├── quasi-newton.jpeg │ ├── rbf.jpeg │ ├── sample_all.jpeg │ ├── search.jpeg │ ├── selection.jpeg │ ├── sgd.jpeg │ ├── simplex-performance.jpeg │ ├── simplex.jpeg │ ├── simplex_algo.jpeg │ ├── solution-space.jpeg │ ├── strafied.jpeg │ ├── sufficient_decrease.jpeg │ ├── tree.jpeg │ ├── trust-region.jpeg │ ├── two-conditions.jpeg │ ├── uncertainty.jpeg │ ├── uni-proj.jpeg │ └── unimodal.jpeg └── trust.jl ├── README.md └── compile.sh /Jupyter_notebooks/02-derivatives.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2 Derivatives\n", 8 | "\n", 9 | "This notebook was automatically generated from the Algorithms for Optimization source code. Each cell generates a figure from the original text. While this code is not optimized for use in lectures, we provide it here to be adapted for such projects. We hope you find it useful." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "scrolled": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "#import Pkg; \n", 21 | "#Pkg.add(\"SymEngine\");\n", 22 | "using SymEngine" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# 2.1 Analytic gradient " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "scrolled": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "1/2 + 2*x + sin(x)/x^2 - cos(x)/x" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "# one variables\n", 52 | "@vars x;\n", 53 | "f = x^2 + x/2 - sin(x)/x;\n", 54 | "diff(f, x)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "sin(x2)\n", 67 | "x1*cos(x2)\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# many variables\n", 73 | "@vars x1, x2;\n", 74 | "f = x1*sin(x2) + 1;\n", 75 | "println(diff(f, x1))\n", 76 | "println(diff(f, x2))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# 2.2 Numerical gradient\n", 84 | "- Finite difference\n", 85 | "- Complex step" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# define a target function\n", 95 | "f0(x) = x^2 + x/2 - sin(x)/x;" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "0.7333000227808952\n", 108 | "0.733300007879734\n", 109 | "0.7332999929785728\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# Finite difference method\n", 115 | "diff_forward(f, x; h = sqrt(eps(Float64))) = (f(x+h) - f(x))/h;\n", 116 | "diff_central(f, x; h = sqrt(eps(Float64))) = (f(x+h/2) - f(x-h/2))/h;\n", 117 | "diff_backward(f, x; h = sqrt(eps(Float64))) = (f(x) - f(x-h))/h;\n", 118 | "\n", 119 | "\n", 120 | "println(diff_forward(f0, 0.1))\n", 121 | "println(diff_central(f0, 0.1))\n", 122 | "println(diff_backward(f0, 0.1))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "0.7333000119025557\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "# Complex step method\n", 140 | "diff_complex(f, x; h=1e-20) = imag(f(x+h*im))/h\n", 141 | "\n", 142 | "println(diff_complex(f0, 0.1))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "#import Pkg; Pkg.add(\"Zygote\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "(0.7333000119025559,)" 163 | ] 164 | }, 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "# Automatic differentiation\n", 172 | "\n", 173 | "import Zygote: gradient\n", 174 | "gradient(f0, 0.1)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 9, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 
| "text/plain": [ 185 | "(0.07196888754292625, -0.17110198196123422)" 186 | ] 187 | }, 188 | "execution_count": 9, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "f1(a, b) = log(a*b, max(a,2));\n", 195 | "gradient(f1, 3.0, 2.0)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "# 2.3 Automatic Differentiation\n", 203 | "- Dual numbers\n", 204 | "- Forward pass" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### 2.3.1 Dual Number Notation\n", 212 | "\n", 213 | "Instead of D(a,b) we can write a + b ϵ, where ϵ satisfies ϵ^2=0. (Some people like to recall imaginary numbers where an i is introduced with i^2=-1.) \n", 214 | "\n", 215 | "Others like to think of how engineers just drop the O(ϵ^2) terms.\n", 216 | "\n", 217 | "The four rules are\n", 218 | "\n", 219 | "$ (a+b\\epsilon) \\pm (c+d\\epsilon) = (a \\pm c) + (b \\pm d)\\epsilon$\n", 220 | "\n", 221 | "$ (a+b\\epsilon) * (c+d\\epsilon) = (ac) + (bc+ad)\\epsilon$\n", 222 | "\n", 223 | "$ (a+b\\epsilon) / (c+d\\epsilon) = (a/c) + (bc-ad)/c^2 \\epsilon $" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 32, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "promote_rule (generic function with 159 methods)" 235 | ] 236 | }, 237 | "execution_count": 32, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "struct D <: Number # D is a function-derivative pair\n", 244 | " f::Tuple{Float64,Float64}\n", 245 | "end\n", 246 | "\n", 247 | "# Add the last two rules\n", 248 | "import Base: -,*,+, /, convert, promote_rule\n", 249 | "-(x::D, y::D) = D(x.f .- y.f)\n", 250 | "*(x::D, y::D) = D((x.f[1]*y.f[1], (x.f[2]*y.f[1] + x.f[1]*y.f[2])))\n", 251 | "\n", 252 | "+(x::D, y::D) = D(x.f .+ y.f)\n", 253 | "/(x::D, y::D) = D((x.f[1]/y.f[1], (y.f[1]*x.f[2] - x.f[1]*y.f[2])/y.f[1]^2))\n", 254 | "convert(::Type{D}, x::Real) = D((x,zero(x)))\n", 255 | "promote_rule(::Type{D}, ::Type{<:Number}) = D" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 33, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "D((0.0, 1.0))" 267 | ] 268 | }, 269 | "execution_count": 33, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "ϵ = D((0,1))" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 34, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "D((0.0, 0.0))" 287 | ] 288 | }, 289 | "execution_count": 34, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "ϵ * ϵ" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 38, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "D((1.0, -1.0))" 307 | ] 308 | }, 309 | "execution_count": 38, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "1/(1+ϵ)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 39, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "D((3.0, 2.0))" 327 | ] 328 | }, 329 | "execution_count": 39, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "(1+2*ϵ)*(3-4*ϵ)" 336 | ] 337 | 
}, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### 2.3.2 Forward Differentiation" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 17, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "using ForwardDiff" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 18, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "Dual{Nothing}(2.1972245773362196,0.6666666666666666)" 363 | ] 364 | }, 365 | "execution_count": 18, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "a = ForwardDiff.Dual(3,1)\n", 372 | "log(a^2)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 19, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "Dual{Nothing}(2.1972245773362196,0.3333333333333333)" 384 | ] 385 | }, 386 | "execution_count": 19, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "a = ForwardDiff.Dual(3,1)\n", 393 | "b = ForwardDiff.Dual(2,0)\n", 394 | "log(a*b + max(a,2))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [] 410 | } 411 | ], 412 | "metadata": { 413 | "kernelspec": { 414 | "display_name": "Julia 1.0.5", 415 | "language": "julia", 416 | "name": "julia-1.0" 417 | }, 418 | "language_info": { 419 | "file_extension": ".jl", 420 | "mimetype": "application/julia", 421 | "name": "julia", 422 | "version": "1.0.5" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 2 427 | } 428 | -------------------------------------------------------------------------------- /Jupyter_notebooks/04-local-descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 4 Local Descent" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# necessary libraries\n", 17 | "using Plots\n", 18 | "using ForwardDiff\n", 19 | "using Printf\n", 20 | "using LinearAlgebra\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# trial function and gradient\n", 30 | "#f_booth_0(x1, x2) = (x1 + 2*x2 -7)^2 + (2*x1 + x2 -5)^2\n", 31 | "#f_booth((x1, x2)) = [(x1 + 2*x2 -7)^2 + (2*x1 + x2 -5)^2]\n", 32 | "\n", 33 | "f_0(x1, x2) = x1^2 + x1*x2 + x2^2\n", 34 | "f((x1, x2)) = [x1^2 + x1*x2 + x2^2]\n", 35 | "\n", 36 | "function f_prime(a)\n", 37 | "    return ForwardDiff.jacobian(f, a)[1,:]\n", 38 | "end\n", 39 | "\n", 40 | "x0 = [1.0, 2.0]\n", 41 | "println(f(x0))\n", 42 | "println(f_prime(x0))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# plot function\n", 52 | "x = -5:1:8\n", 53 | "y = -5:1:8\n", 54 | "plot(\n", 55 | "contour(x, y, f_0; levels = collect(0:1:30)),\n", 56 | "contourf(x, y, f_0; levels = collect(0:1:30)), \n", 57 | "size=[800, 300]\n", 58 | ")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [
67 | "x0 = [1, 2]\n", 68 | "d = -f_prime(x0)\n", 69 | "n, max = 101, 1\n", 70 | "res = max/(n-1)\n", 71 | "a0 = 0:res:max\n", 72 | "y0 = zeros(n)\n", 73 | "\n", 74 | "println(\"f(x) along the direction: \", d)\n", 75 | "for i in 1:1:n\n", 76 | " y0[i] = f(x0 + a0[i]*d)[1]\n", 77 | " #@printf(\"%4d %8.3f %8.3f %6.3f %6.3f\\n\",i, a0[i], y0[i], (x0+a0[i]*d)[1], (x0+a0[i]*d)[2])\n", 78 | "end\n", 79 | "plot(a0, y0, xlabel=\"a\")\n", 80 | "\n", 81 | " " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "function strong_backtracking(f, ∇, x, d; α=5, β=1e-4, σ=0.1) \n", 91 | " y0, g0, y_prev, α_prev = f(x)[1], ∇(x)⋅d, NaN, 0\n", 92 | " αlo, αhi = NaN, NaN\n", 93 | " # bracket phase\n", 94 | " while true\n", 95 | " y = f(x + α*d)[1]\n", 96 | " if y > y0 + β*α*g0 || (!isnan(y_prev) && y ≥ y_prev) \n", 97 | " αlo, αhi = α_prev, α\n", 98 | " break \n", 99 | " end\n", 100 | " \n", 101 | " g = ∇(x + α*d)⋅d \n", 102 | " if abs(g) ≤ -σ*g0\n", 103 | " return α \n", 104 | " elseif g ≥ 0\n", 105 | " αlo, αhi = α, α_prev\n", 106 | " break \n", 107 | " end\n", 108 | " y_prev, α_prev, α = y, α, 2α \n", 109 | " end\n", 110 | " \n", 111 | " @printf(\"The initial interval: %6.3f %6.3f\\n\", αlo, αhi)\n", 112 | "\n", 113 | " # zoom phase\n", 114 | " ylo = f(x + αlo*d)[1]\n", 115 | " n = 0\n", 116 | " while n < 10\n", 117 | " α = (αlo + αhi)/2\n", 118 | " y = f(x + α*d)[1]\n", 119 | " @printf(\"The interval: %6.3f %6.3f\\n\", αlo, αhi)\n", 120 | " if y > y0 + β*α*g0 || y ≥ ylo #\n", 121 | " @printf(\"No sufficient decrease: %6.3f %6.3f %6.3f %6.3f\\n\", α, y, y0, ylo)\n", 122 | " αhi = α \n", 123 | " else\n", 124 | " g = ∇(x + α*d)⋅d \n", 125 | " if abs(g) ≤ -σ*g0\n", 126 | " return α\n", 127 | " elseif g*(αhi - αlo) ≥ 0\n", 128 | " αhi = αlo \n", 129 | " end\n", 130 | " αlo = α \n", 131 | " end\n", 132 | " n += 1\n", 133 | " end \n", 134 | "end\n", 135 | "\n", 136 | "strong_backtracking(f, f_prime, x0, d)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Julia 1.0.5", 150 | "language": "julia", 151 | "name": "julia-1.0" 152 | }, 153 | "language_info": { 154 | "file_extension": ".jl", 155 | "mimetype": "application/julia", 156 | "name": "julia", 157 | "version": "1.0.5" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } 163 | -------------------------------------------------------------------------------- /Lecture_notes/01_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/01_intro.pdf -------------------------------------------------------------------------------- /Lecture_notes/01_intro.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | 
commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Optimization]{Numerical Optimization 01: Introduction} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the 
bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Syllabus} 139 | \begin{frame}{Syllabus} 140 | 141 | 142 | \begin{columns} 143 | 144 | \begin{column}{.6\textwidth} 145 | We have two goals: 146 | \begin{itemize} 147 | \item Learn Julia programming 148 | \item Understand the optimization methods 149 | \end{itemize} 150 | Subjects to be covered 151 | \begin{itemize} 152 | \item Julia programming 153 | \item Local Optimization 154 | \begin{itemize} 155 | \item Derivatives and Gradients 156 | \item Bracketing 157 | \item First/second-order optimization 158 | \item Gradient-free methods 159 | \item Stochastic methods 160 | \end{itemize} 161 | \item Global Optimization 162 | \item Sampling Plans 163 | \item Surrogate Optimization 164 | \item Expression Optimization 165 | 166 | \end{itemize} 167 | \end{column} 168 | \pause 169 | \begin{column}{.4\textwidth} 170 | \begin{figure} 171 | \centering 172 | \includegraphics[width=30mm]{Figs/algo_opt.jpg} 173 | \end{figure} 174 | We meet virtually twice a week ($\sim$90 minutes each time).\\ 175 | \begin{itemize} 176 | \item review of homework (20-30 mins) 177 | \item lecture (30-50 mins) 178 | \item coding (20-30 mins) 179 | \end{itemize} 180 | \end{column} 181 | 182 | \end{columns} 183 | 184 | \end{frame} 185 | 186 | \section{Why Optimization?} 187 | \begin{frame}{Why optimization} 188 | \begin{columns} 189 | \begin{column}{.55\textwidth} 190 | A typical optimization problem is to 191 | \begin{equation*} 192 | \begin{split} 193 | \textrm{minimize} &~~ f(x)\\ 194 | \textrm{subject to} &~~ x \in X 195 | \end{split} 196 | \end{equation*} 197 | A design point ($x$) can be represented as a vector of values 198 | corresponding to different design variables. 199 | \end{column} 200 | 201 | \begin{column}{.45\textwidth} 202 | \begin{figure} 203 | \centering 204 | \includegraphics[width=40mm]{Figs/solution-space.jpeg} 205 | \end{figure} 206 | \end{column} 207 | \end{columns} 208 | 209 | A \textcolor{blue}{necessary condition} for $f(x)$ to reach a minimum is that \textcolor{blue}{$f'(x)=0$}. 210 | \end{frame} 211 | 212 | 213 | \begin{frame}{Optimization is hard!} 214 | \begin{itemize} 215 | \item $f'(x)=0$ is not a sufficient condition. 216 | \begin{figure} 217 | \centering 218 | \includegraphics[width=80mm]{Figs/minimum.jpeg} 219 | \end{figure} 220 | \item There exist many points where $f'(x)=0$. 221 | \begin{figure} 222 | \centering 223 | \includegraphics[width=60mm]{Figs/multi-minima.jpeg} 224 | \end{figure} 225 | \item $f(x)$ and $f'(x)$ can be hard to evaluate. 226 | 227 | \end{itemize} 228 | \end{frame}
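\begin{frame}[fragile]{Checking the necessary condition in Julia}
A minimal sketch (not part of the original slides), assuming the \texttt{ForwardDiff} package used in the course notebooks: the gradient of Booth's function from the homework vanishes at its minimum $(1, 3)$ but not at other points.
\begin{lstlisting}
using ForwardDiff

# Booth's function: minimum at (1, 3) with f = 0
booth(x) = (x[1] + 2x[2] - 7)^2 + (2x[1] + x[2] - 5)^2

ForwardDiff.gradient(booth, [1.0, 3.0])  # [0.0, 0.0]: necessary condition holds
ForwardDiff.gradient(booth, [0.0, 0.0])  # [-34.0, -38.0]: not a stationary point
\end{lstlisting}
\end{frame}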
229 | 230 | 231 | \section{Why Julia?} 232 | \begin{frame}{Why Julia?} 233 | \textcolor{red}{Run Julia at } \url{https://www.juliabox.com} 234 | 235 | \begin{columns} 236 | 237 | \begin{column}{.5\textwidth} 238 | \begin{itemize} 239 | \item Math-friendly 240 | \item Looks like Python 241 | \item Runs like C/Fortran 242 | \item Growing ecosystem 243 | \end{itemize} 244 | \end{column} 245 | 246 | \begin{column}{.5\textwidth} 247 | \begin{figure} 248 | \centering 249 | \includegraphics[width=40mm]{Figs/julia.png} 250 | \end{figure} 251 | \end{column} 252 | \end{columns} 253 | \begin{figure} 254 | \centering 255 | \includegraphics[width=90mm]{Figs/julia-comp.png} 256 | \end{figure} 257 | \end{frame} 258 | 259 | 260 | \section{Summary} 261 | \begin{frame}{Summary} 262 | \begin{itemize} 263 | \item Optimization in engineering is the process of finding the best system design subject to a set of constraints. 264 | \item Optimization can be transformed into a math problem, but that problem is sometimes hard to solve. 265 | \item We will use the Julia language extensively to learn how to solve optimization problems numerically. 266 | \end{itemize} 267 | \end{frame} 268 | 269 | 270 | \section{Homework} 271 | \begin{frame}{Homework} 272 | In Julia, implement the following trial functions, make their contour plots, and analyze the behavior of their minima. 273 | \begin{itemize} 274 | \item Booth's function 275 | \begin{equation*} 276 | f(x_1, x_2) = (x_1 + 2x_2 -7)^2 + (2x_1 + x_2 -5)^2 277 | \end{equation*} 278 | 279 | \item Branin function 280 | \begin{equation*} 281 | f(x_1, x_2) = a(x_2 - bx_1^2 + cx_1 - r)^2 + s(1-t)\cos(x_1) + s 282 | \end{equation*} 283 | where $a=1, b=5.1/(4\pi^2), c=5/\pi, r=6, s=10, t=1/(8\pi)$. 284 | 285 | \item Rosenbrock's Banana function 286 | \begin{equation*} 287 | f(x_1, x_2) = (a-x_1)^2 + b(x_2-x_1^2)^2 288 | \end{equation*} 289 | where $a=1, b=5$.
290 | 291 | \end{itemize} 292 | More functions can be found at \url{https://en.wikipedia.org/wiki/Test_functions_for_optimization} 293 | \end{frame} 294 | 295 | \end{document} 296 | 297 | -------------------------------------------------------------------------------- /Lecture_notes/02_derivative.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/02_derivative.pdf -------------------------------------------------------------------------------- /Lecture_notes/02_derivative.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Optimization]{Numerical Optimization 02: Derivatives} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Derivative} 139 | \begin{frame}{Derivative} 140 | The goal of optimization is to find the point that minimizes an objective function. Knowing how the value of a function changes (derivative) is useful. 
141 | 142 | \begin{equation*} 143 | f(x + \Delta x) \approx f(x) + f'(x)\Delta x 144 | \end{equation*} 145 | 146 | \begin{equation*} 147 | f'(x) = \lim_{\Delta x \to 0} \frac{f(x + \Delta x) - f(x)}{\Delta x} 148 | \end{equation*} 149 | 150 | Derivatives in multiple dimensions 151 | \begin{equation*} 152 | \textrm{\textcolor{blue}{Jacobian}}~~~~~ \nabla f(x) = \Bigg[\frac{\partial f(x)}{\partial x_1}, \frac{\partial f(x)}{\partial x_2}, \cdots \frac{\partial f(x)}{\partial x_n} \Bigg] 153 | \end{equation*} 154 | 155 | \begin{equation*} 156 | \textrm{\textcolor{blue}{Hessian}}~~~~~ \nabla^2 f(x) = 157 | \begin{bmatrix} 158 | \frac{\partial^2 f(x)}{\partial x_1^2 } & \frac{\partial^2 f(x)}{\partial x_1 \partial x_2 } & \cdots \frac{\partial^2 f(x)}{\partial x_1 \partial x_n }\\ 159 | & \vdots & \\ 160 | \frac{\partial^2 f(x)}{\partial x_n \partial x_1} & \frac{\partial^2 f(x)}{\partial x_n \partial x_2 } & \cdots \frac{\partial^2 f(x)}{\partial x_n^2 } \\ 161 | \end{bmatrix} 162 | \end{equation*} 163 | \end{frame} 164 | 165 | \section{Numerical Differentiation} 166 | \begin{frame}{Numerical Differentiation} 167 | For practical applications, we rely on numerical methods to evaluate the derivatives. 168 | 169 | \begin{itemize} 170 | \item Finite Difference Methods 171 | \begin{equation*} 172 | f'(x) \approx 173 | \begin{cases} 174 | & \frac{f(x+h)-f(x)}{h} ~~~~~~~~~~~~~~~~~\textrm{forward} \\ 175 | & \frac{f(x+h/2)-f(x-h/2)}{h} ~~~~~~~~~~\textrm{central}\\ 176 | & \frac{f(x)-f(x-h)}{h} ~~~~~~~~~~~~~~~~~\textrm{backward} 177 | \end{cases} 178 | \end{equation*} 179 | \item Complex Step Method 180 | \begin{equation*} 181 | f'(x) \approx \textrm{imag}(f(x+ih))/h 182 | \end{equation*} 183 | %\item Automatic Differentiation 184 | \end{itemize} 185 | \end{frame} 186 | 187 | \begin{frame}{Finite Difference - forward} 188 | \begin{equation*} 189 | f(x+h) = f(x) + \frac{f'(x)}{1!}h + \frac{f''(x)}{2!}h^2 + \frac{f'''(x)}{3!}h^3 + \cdots 190 | \end{equation*} 191 | \pause 192 | We can rearrange it to 193 | 194 | \begin{equation*} 195 | \begin{split} 196 | f'(x)h &= f(x+h) - f(x) - \frac{f''(x)}{2!}h^2 - \frac{f'''(x)}{3!}h^3 - \cdots \\ 197 | f'(x) &= \frac{f(x+h) - f(x)}{h} - \frac{f''(x)}{2!}h - \frac{f'''(x)}{3!}h^2 - \cdots \\ 198 | f'(x) &= \frac{f(x+h) - f(x)}{h} + O(h) 199 | \end{split} 200 | \end{equation*} 201 | Therefore, the forward difference has linear error. 202 | \end{frame} 203 | 204 | 205 | \begin{frame}{Finite Difference - central} 206 | \begin{equation*} 207 | \begin{split} 208 | f(x+h/2) &= f(x) + \frac{f'(x)}{1!}\frac{h}{2} + \frac{f''(x)}{2!}(\frac{h}{2})^2 + \frac{f'''(x)}{3!}(\frac{h}{2})^3 + \cdots \\ 209 | f(x-h/2) &= f(x) - \frac{f'(x)}{1!}\frac{h}{2} + \frac{f''(x)}{2!}(\frac{h}{2})^2 - \frac{f'''(x)}{3!}(\frac{h}{2})^3 + \cdots \\ 210 | f'(x) &= \frac{f(x+h/2) - f(x-h/2)}{h} + O(h^2) 211 | \end{split} 212 | \end{equation*} 213 | 214 | Therefore, the central difference has quadratic error. 215 | 216 | \end{frame}
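\begin{frame}[fragile]{Finite differences in Julia}
A minimal sketch of the three finite-difference estimates; the same definitions appear in the \texttt{02-derivatives} notebook, and the test function \texttt{f0} is only for illustration.
\begin{lstlisting}
# one-sided and central difference quotients
diff_forward(f, x; h=sqrt(eps(Float64)))  = (f(x+h) - f(x))/h
diff_central(f, x; h=sqrt(eps(Float64)))  = (f(x+h/2) - f(x-h/2))/h
diff_backward(f, x; h=sqrt(eps(Float64))) = (f(x) - f(x-h))/h

f0(x) = x^2 + x/2 - sin(x)/x   # test function from the notebook
diff_forward(f0, 0.1)          # 0.7333000227808952
diff_central(f0, 0.1)          # 0.733300007879734
diff_backward(f0, 0.1)         # 0.7332999929785728
\end{lstlisting}
\end{frame}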
217 | 218 | \begin{frame}{Complex Step} 219 | According to the Taylor expansion, 220 | \begin{equation*} 221 | f(x+ih) = f(x) + ihf'(x) - h^2\frac{f''(x)}{2!} - ih^3\frac{f'''(x)}{3!} + \cdots 222 | \end{equation*} 223 | \pause 224 | If we take only the imaginary part, 225 | \begin{equation*} 226 | \begin{split} 227 | &\textrm{Im} (f(x+ih)) = hf'(x) - h^3\frac{f'''(x)}{3!} + \cdots \\ 228 | &f'(x) = \frac{\textrm{Im}(f(x+ih))}{h} + h^2\frac{f'''(x)}{3!} - \cdots = \frac{\textrm{Im}(f(x+ih))}{h} + O(h^2) 229 | \end{split} 230 | \end{equation*} 231 | \pause 232 | While the real part is 233 | \begin{equation*} 234 | \begin{split} 235 | \textrm{Re}(f(x+ih)) &= f(x) - h^2\frac{f''(x)}{2!} + \cdots \\ 236 | f(x) &= \textrm{Re}(f(x+ih)) + O(h^2) 237 | \end{split} 238 | \end{equation*} 239 | \pause 240 | The complex step method is advantageous since 241 | 242 | \begin{itemize} 243 | \item Both $f(x)$ and $f'(x)$ can be evaluated in a single run 244 | \item $f'(x)$ has a quadratic error 245 | \end{itemize} 246 | 247 | \end{frame} 248 | 249 | \begin{frame}{Comparison} 250 | 251 | \begin{figure} 252 | \centering 253 | \includegraphics[width=120mm]{Figs/derivative-comparison.jpeg} 254 | \end{figure} 255 | 256 | \textcolor{blue}{Homework: reproduce the above figure by yourself!} 257 | \end{frame} 258 | 259 | \begin{frame}{Why is complex step better than central difference?} 260 | \begin{equation*} 261 | \begin{split} 262 | f(x+ih) &= u(x,h) + iv(x,h)\\ 263 | \textrm{Im}(f(x+ih)) &= h\frac{\partial v(x, y)}{\partial y}|_{y=0} + O(h^2) 264 | \end{split} 265 | \end{equation*} 266 | If $v(x,0)=0$, then $f(x)=u(x,0)$. Dividing by $h$, we obtain $f'(x)$: 267 | \begin{equation*} 268 | \frac{\partial v(x, y)}{\partial y}|_{y=0} = \frac{\partial u(x, 0)}{\partial x} 269 | \end{equation*} 270 | The left side is what the complex step computes. The right side, equal to it by the \textcolor{blue}{Cauchy-Riemann equations}, is what the finite difference approximates. Note that the two methods use two different functions ($u$ and $v$). 271 | 272 | 273 | \end{frame} 274 | 275 | \begin{frame}{Why is complex step better than central difference?} 276 | 277 | Consider the function $f(z) = z^2$, 278 | \begin{equation*} 279 | f(z) = z^2 = x^2 - y^2 + i2xy 280 | \end{equation*} 281 | The finite difference works with the real part $x^2 - y^2$, while the complex step uses the imaginary part: $2xy/y$ gives exactly $2x$ for any $h=y>0$. 282 | \vspace{10mm} 283 | 284 | \pause 285 | Try another function: 286 | \begin{equation*} 287 | \cos(x+iy)=\cos(x)\textrm{cosh}(y) - i\sin(x)\textrm{sinh}(y). 288 | \end{equation*} 289 | The imaginary part is $-\sin(x)\sinh(y)$; dividing by a small $y$ gives $-\sin(x)$, since $\sinh(y)/y \rightarrow 1$. 290 | 291 | \end{frame} 292 | 293 | 294 | \section{Automatic Differentiation} 295 | \begin{frame}{Dual numbers} 296 | Dual numbers can be expressed mathematically by including the abstract quantity $\epsilon$, where $\epsilon^2 = 0$, so that 297 | \begin{equation*} 298 | \begin{split} 299 | (a+b\epsilon)+(c+d\epsilon) &= (a+c) + (b+d)\epsilon\\ 300 | (a+b\epsilon)*(c+d\epsilon) &= (ac) + (ad+bc)\epsilon 301 | \end{split} 302 | \end{equation*} 303 | 304 | The function's evaluation and derivative can be expressed simultaneously in an \textcolor{blue}{exact manner}.
\begin{equation*} 306 | \begin{split} 307 | f(x) &= \sum_{k=0}^\infty \frac{f^{(k)}(a)}{k!}(x-a)^k \\ 308 | f(a+b\epsilon) &= \sum_{k=0}^\infty \frac{f^{(k)}(a)}{k!}(a+b\epsilon-a)^k 309 | = \sum_{k=0}^\infty \frac{f^{(k)}(a)b^k\epsilon^k}{k!}\\ 310 | &= f(a) + bf'(a)\epsilon + \epsilon^2 \sum_{k=2}^\infty \frac{f^{(k)}(a)b^k}{k!}\epsilon^{k-2}\\ 311 | &= f(a) + bf'(a)\epsilon 312 | \end{split} 313 | \end{equation*} 314 | 315 | \end{frame} 316 | 317 | \begin{frame}{Express a function as the computational graph} 318 | Suppose we have a target function 319 | \begin{equation*} 320 | f(a, b) = \ln(ab + \textrm{max}(a, 2)) 321 | \end{equation*} 322 | 323 | It can be expressed as 324 | \begin{figure} 325 | \centering 326 | \includegraphics[width=80mm]{Figs/graph1.jpeg} 327 | \end{figure} 328 | 329 | \end{frame} 330 | 331 | \begin{frame}{The derivative from the computational graph} 332 | Suppose we have a target function 333 | \begin{equation*} 334 | f(a, b) = \ln(ab + \textrm{max}(a, 2)) 335 | \end{equation*} 336 | The derivative is 337 | \begin{equation*} 338 | \frac{df}{dx} = \frac{df}{dc_4}\frac{dc_4}{dx} 339 | = \frac{df}{dc_4}\bigg(\frac{dc_4}{dc_3}\frac{dc_3}{dx}\bigg) 340 | = \frac{df}{dc_4}\bigg(\frac{dc_4}{dc_3}\bigg(\frac{dc_3}{dc_2}\frac{dc_2}{dx} + \frac{dc_3}{dc_1}\frac{dc_1}{dx}\bigg)\bigg) 341 | \end{equation*} 342 | 343 | \begin{figure} 344 | \centering 345 | \includegraphics[width=80mm]{Figs/graph2.jpeg} 346 | \end{figure} 347 | 348 | \end{frame} 349 | 350 | \section{Summary} 351 | \begin{frame}{Summary} 352 | \begin{itemize} 353 | \item Derivatives are important for optimization. 354 | \item We rely on numerical derivatives in practical optimization. 355 | \item Finite differences are the easiest way to compute derivatives. 356 | \item The complex step method has better accuracy. 357 | \item Dual numbers allow the exact evaluation of the function and its derivative simultaneously. 358 | \item Automatic differentiation methods include forward and reverse accumulation on computational graphs. 359 | \end{itemize} 360 | \end{frame} 361 | \end{document} 362 | 363 | -------------------------------------------------------------------------------- /Lecture_notes/03_bracket.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/03_bracket.pdf -------------------------------------------------------------------------------- /Lecture_notes/03_bracket.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33
| \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Optimization]{Numerical Optimization 03: Bracket and Zoom} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this 
block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Bracketing Methods} 139 | \begin{frame}{Bracketing} 140 | \begin{itemize} 141 | \item Bracketing identifies an interval in which a local minimum lies and then successively shrinks that interval. 142 | \item It applies to a unimodal function. 143 | \end{itemize} 144 | A \textcolor{blue}{unimodal function} $f$ is one where there is a unique $x_0$, such that $f$ is monotonically decreasing for $x \leq x_0$ and monotonically increasing for $x \geq x_0$. It follows from this definition that the unique global minimum is at $x_0$, and there are no other local minima. 145 | \begin{figure} 146 | \centering 147 | \includegraphics[width=60mm]{Figs/unimodal.jpeg} 148 | \end{figure} 149 | \end{frame} 150 | 151 | %\section{Finding an Initial Bracket} 152 | \begin{frame}{Initial Bracket} 153 | When optimizing a function, we often start by first bracketing an interval containing a local minimum. 154 | \begin{itemize} 155 | \item Starting at a given point, take a trial step in the positive direction (e.g., $10^{-2}$). 156 | \item Search in the downhill direction until a new point exceeds the lowest point found so far. 157 | \item Expand the step size by a factor of 2 at each step (a code sketch follows the Fibonacci Search slide). 158 | \end{itemize} 159 | \begin{figure} 160 | \centering 161 | \includegraphics[width=120mm]{Figs/bracket.jpeg} 162 | \end{figure} 163 | \end{frame} 164 | 165 | 166 | \section{Fibonacci Search} 167 | \begin{frame}{Fibonacci Search} 168 | \begin{columns} 169 | \begin{column}{.6\textwidth} 170 | Suppose we have a unimodal $f$ bracketed by the interval $[a, b]$. 171 | \begin{itemize} 172 | \item Query $f$ on the 1/3 and 2/3 points on the interval 173 | \item Query $f$ on the center of the new interval 174 | \item Three queries thus shrink the interval by a factor of three 175 | \item $\cdots$ 176 | \end{itemize} 177 | 178 | This actually follows a \textcolor{red}{Fibonacci sequence}! 179 | \begin{equation*} 180 | F_n = 181 | \begin{cases} 182 | 1 & \textrm{if~} n\leq 2\\ 183 | F_{n-1} + F_{n-2} & \textrm{otherwise} 184 | \end{cases} 185 | \end{equation*} 186 | 187 | \end{column} 188 | 189 | \begin{column}{.4\textwidth} 190 | \end{column} 191 | 192 | \end{columns} 193 | \end{frame} 194 |
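\begin{frame}[fragile]{Code sketch: finding an initial bracket}
A minimal sketch of the bracketing procedure from the Initial Bracket slide; the default step $s=10^{-2}$ and growth factor $k=2$ follow that slide, while the function signature and return convention are illustrative assumptions.
\begin{lstlisting}
function bracket_minimum(f, x=0.0; s=1e-2, k=2.0)
    a, ya = x, f(x)
    b, yb = a + s, f(a + s)
    if yb > ya                  # wrong direction: search downhill the other way
        a, b, ya, yb = b, a, yb, ya
        s = -s
    end
    while true                  # expand until the function increases again
        c, yc = b + s, f(b + s)
        yc > yb && return a < c ? (a, c) : (c, a)
        a, ya, b, yb = b, yb, c, yc
        s *= k
    end
end

bracket_minimum(x -> (x - 2)^2)   # returns an interval containing x = 2
\end{lstlisting}
\end{frame}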
195 | \begin{frame}{Fibonacci Search Algorithm} 196 | Let's try to formulate the problem in a more rigorous way. 197 | Ideally, we want to shrink the interval $[a, b]$ as much as possible within $n$ iterations. At iteration $k$, the interval shrinks by the ratio 198 | \begin{columns} 199 | 200 | \begin{column}{0.6 \textwidth} 201 | \begin{equation*} 202 | b_{k+1} - a_{k+1} = \frac{F_{n-k}}{F_{n-k+1}}(b_k - a_k) 203 | \end{equation*} 204 | \begin{equation*} 205 | \begin{split} 206 | F_0 &= F_1 = 1,\\ 207 | F_{k+1} &= F_k + F_{k-1}~~~~~(k=1, 2, \cdots), 208 | \end{split} 209 | \end{equation*} 210 | Therefore, 211 | \begin{equation*} 212 | \begin{split} 213 | b_n - a_n &= \frac{F_1}{F_2}(b_{n-1}- a_{n-1})\\ 214 | &= \frac{F_1}{F_2}\frac{F_2}{F_3} \cdots \frac{F_{n-1}}{F_n} (b_1-a_1)\\ 215 | &= \frac{1}{F_n}(b_1 - a_1) 216 | %= \frac{1}{r^n}(b_1 - a_1) 217 | \end{split} 218 | \end{equation*} 219 | \end{column} 220 | 221 | \begin{column}{0.4 \textwidth} 222 | \centering 223 | \textcolor{blue}{Solution of $F_k$}\\ 224 | Let $F_k = \tau^k$, so that 225 | \begin{equation*} 226 | \tau^2 = \tau + 1 227 | \end{equation*} 228 | \begin{equation*} 229 | \begin{split} 230 | \tau_{1,2} &= \frac{1\pm \sqrt{5}}{2}\\ 231 | F_k & = A\tau_1^k + B\tau_2^k 232 | \end{split} 233 | \end{equation*} 234 | 235 | Since $F_0 = F_1 = 1$, 236 | \begin{equation*} 237 | F_k = \frac{1}{\sqrt{5}} (\tau_1^{k+1} - \tau_2^{k+1}) 238 | \end{equation*} 239 | \end{column} 240 | 241 | \end{columns} 242 | 243 | \end{frame} 244 | 245 | 246 | 247 | \begin{frame}{Fibonacci Algorithm} 248 | Suppose we have a unimodal $f$ bracketed by the interval $[a, b]$. 249 | \begin{enumerate} 250 | \item Query $f$ on two points ($\lambda_k, \mu_k$) on the interval $[a_k, b_k]$ 251 | \begin{equation*} 252 | \begin{split} 253 | \lambda_k &= a_k + \bigg(1-\frac{F_{n-k}}{F_{n-k+1}}\bigg)(b_k - a_k)\\ 254 | \mu_k &= a_k + \frac{F_{n-k}}{F_{n-k+1}}(b_k - a_k) 255 | \end{split} 256 | \end{equation*} 257 | \item If $f(\lambda_k) > f(\mu_k)$, go to step 3; otherwise, go to step 4 258 | \item If $b_k - \lambda_k < \delta$, terminate. Otherwise, set 259 | \begin{equation*} 260 | a_{k+1} = \lambda_k, ~~ b_{k+1} = b_k, ~~ \lambda_{k+1} = \mu_k, 261 | \end{equation*} and compute the new $\mu_{k+1}$ as in step 1 262 | \item If $\mu_k - a_k \leq \delta$, terminate. Otherwise, set 263 | \begin{equation*} 264 | a_{k+1} = a_k, ~~ b_{k+1} = \mu_k, ~~ \mu_{k+1} = \lambda_k, 265 | \end{equation*} and compute the new $\lambda_{k+1}$ as in step 1 266 | \item k += 1, go to step 2 267 | \end{enumerate} 268 | \end{frame} 269 | 270 | \section{0.618 Search} 271 | 272 | \begin{frame}{Golden Ratio Search} 273 | If we take the limit for large $k$, the ratio between successive values of the Fibonacci sequence approaches the golden ratio: 274 | 275 | \begin{equation*} 276 | \lim_{k\rightarrow\infty} \frac{F_{k-1}}{F_k} = \frac{\sqrt{5}-1}{2} \approx 0.618 277 | \end{equation*} 278 | 279 | Therefore, we can always use 0.618 and 0.382 to place the two query points within the updated interval.\\ 280 | 281 | Both the Fibonacci and golden ratio searches converge linearly. Fibonacci search is in principle the optimal strategy for bracketing a unimodal function; however, the golden ratio search is more popular due to its simplicity. 282 | 283 | \end{frame} 284 | 285 | \begin{frame}{Comparison} 286 | 287 | \begin{figure} 288 | \centering 289 | \includegraphics[width=120mm]{Figs/search.jpeg} 290 | \end{figure} 291 | 292 | \textcolor{blue}{Homework: reproduce the above figure by yourself!} 293 | \end{frame} 294 |
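\begin{frame}[fragile]{Code sketch: golden ratio search}
A minimal sketch of the 0.618 search on a bracketed unimodal function, following the standard formulation (it may also help with the homework figure); \texttt{n} counts function evaluations.
\begin{lstlisting}
function golden_section_search(f, a, b; n=50)
    ρ = (sqrt(5) - 1)/2          # 0.618...
    d = ρ*b + (1 - ρ)*a          # interior point at the 0.618 position
    yd = f(d)
    for i in 1:n-1
        c = ρ*a + (1 - ρ)*b      # mirror point at the 0.382 position
        yc = f(c)
        if yc < yd
            b, d, yd = d, c, yc  # keep the sub-interval around c
        else
            a, b = b, c          # keep the sub-interval around d
        end
    end
    a < b ? (a, b) : (b, a)
end

golden_section_search(x -> (x - 2)^2, 0.0, 10.0)   # shrinks toward x = 2
\end{lstlisting}
\end{frame}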
295 | \section{Interpolation} 296 | 297 | \begin{frame}{Interpolation with the help of gradient} 298 | Both the Fibonacci and 0.618 searches need no gradient information. However, if the gradient is available, the minimum can be located even faster. The idea is to approximate the target function in an analytic manner: \begin{itemize} 299 | \item Linear: bisection 300 | \item Quadratic fit 301 | \item Cubic interpolation 302 | 303 | 304 | \end{itemize} 305 | 306 | \end{frame} 307 | 308 | \begin{frame}{Bisection} 309 | The bisection method maintains a bracket $[a, b]$ in which at least one root is known to exist. If $f$ is continuous on $[a, b]$, and there is some $y \in [ f (a), f (b)]$, then the intermediate value theorem stipulates that there exists at least one $x \in [a, b]$, such that $f(x) = y$. It follows that a bracket $[a, b]$ is guaranteed to contain a zero if $f(a)$ and $f(b)$ have opposite signs. To minimize $f$, we apply bisection to its derivative $f'$: 310 | 311 | \begin{itemize} 312 | \item Start with $[a_1, b_1]$ 313 | \item If $f'(a_k) \leq 0$ and $f'(b_k) \geq 0$, let $c_k = \frac{1}{2} (a_k + b_k)$ 314 | \item If $f'(c_k) \geq 0$, let $a_{k+1} = a_k$, $b_{k+1}=c_k$; otherwise, $a_{k+1}=c_k$, $b_{k+1}=b_k$ 315 | \item Terminate if $(b_{k+1} - a_{k+1}) \leq \delta $ or the given number of iterations is reached 316 | \end{itemize} 317 | 318 | The bisection method is commonly used to find the roots of a function, i.e., the points where the function is zero. 319 | \end{frame} 320 | 321 | \begin{frame}{Quadratic fit search} 322 | Given the bracketing points $a < b < c$ with $y_a = f(a)$, $y_b = f(b)$, $y_c = f(c)$, we can fit a quadratic function through the three points and query the minimum of the fit as the next point: 323 | \begin{equation*} 324 | x = \frac{1}{2}\frac{y_a(b^2-c^2) + y_b(c^2-a^2) + y_c(a^2-b^2)}{y_a(b-c) + y_b(c-a) + y_c(a-b)} 325 | \end{equation*} 326 | \end{frame} 327 | 328 | \end{document} 329 | -------------------------------------------------------------------------------- /Lecture_notes/04_local_decent.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/04_local_decent.pdf -------------------------------------------------------------------------------- /Lecture_notes/04_local_decent.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode<presentation> { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme.
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Local Descent]{Numerical Optimization 04: Local Descent} % The short title appears at the bottom of every slide, the full title is only on the title page 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{A general model for optimization} 139 | \begin{frame}{Optimization involving multivariate functions} 140 | Similar to the single variable function, a common approach to optimization is to incrementally improve a design point $x$ by taking a step that minimizes the objective value based on a local model. The local model may be obtained, for example, from a first- or second-order Taylor approximation. 141 | \begin{itemize} 142 | \item Check whether $x_k$ satisfies the termination conditions. If it does, terminate; otherwise proceed to the next step. 
143 | \item Determine the descent direction $d_k$ using local information such as the gradient or Hessian.
144 | \item Determine the step size or learning rate $\alpha_k$.
145 | \item Compute the next design point according to:
146 | \begin{equation*}
147 | x_{k+1} = x_k + \alpha_k d_k
148 | \end{equation*}
149 | \end{itemize}
150 |
151 | \end{frame}
152 |
153 | \section{Line Search}
154 | \begin{frame}{Line Search}
155 | Assume that we have chosen a descent direction $d$. We then need to choose the step factor $\alpha$ to obtain our next design point. One approach is to use \textcolor{blue}{line search}, which selects the step factor that minimizes the one-dimensional function:
156 | \begin{equation*}
157 | \underset{\alpha}{\textrm{minimize}}: f(x+\alpha d)
158 | \end{equation*}
159 |
160 | Line search is a univariate optimization problem, which was covered in the previous lecture. We can apply the univariate optimization method of our choice. To inform the search, we can use the \textcolor{blue}{derivative} of the line search objective, which is simply the directional derivative along $d$ at $x + \alpha d$.\\
161 |
162 | One needs to be cautious in choosing $\alpha$. Large steps give faster convergence but risk overshooting the minimum; smaller steps are more stable but converge slowly. A fixed step factor $\alpha$ is sometimes referred to as a learning rate.
163 |
164 | \end{frame}
165 |
166 |
167 | \begin{frame}{Approximate line search}
168 | It is often more computationally efficient to perform more iterations of a descent method than to do exact line search at each iteration. In this case, the goal is to \textcolor{blue}{find a suitable step size with a small number of evaluations}.
169 |
170 | Ideally, the step size should satisfy the following conditions:
171 |
172 | \begin{itemize}
173 | \item Sufficient decrease
174 | \begin{equation*}
175 | f(x^{k+1}) \leq f(x^k) + \beta\alpha \nabla _{d^k} f(x^k)
176 | \end{equation*}
177 | \item Curvature condition
178 | \begin{equation*}
179 | \nabla _{d^k} f(x^{k+1}) \geq \sigma \nabla _{d^k} f(x^k)
180 | \end{equation*}
181 | \end{itemize}
182 |
183 |
184 | \end{frame}
185 |
186 | \begin{frame}{Sufficient decrease}
187 | \begin{equation*}
188 | f(x^{k+1}) \leq f(x^k) + \beta\alpha \nabla _{d^k} f(x^k)
189 | \end{equation*}
190 |
191 | where $\beta \in [0, 1]$; a common choice is $\beta = 10^{-4}$.
192 | \begin{figure}
193 | \centering
194 | \includegraphics[width=120mm]{Figs/sufficient_decrease.jpeg}
195 | \end{figure}
196 |
197 | \textcolor{blue}{Question: what will happen if you adjust $\beta$?}
198 | \end{frame}
199 |
200 | \begin{frame}{Curvature condition}
201 | \begin{equation*}
202 | \nabla _{d^k} f(x^{k+1}) \geq \sigma \nabla _{d^k} f(x^k)
203 | \end{equation*}
204 | where $\sigma$ controls how shallow the next directional derivative must be.
205 | It is common to set $\beta < \sigma < 1$, with $\sigma = 0.1$ in the conjugate gradient method and $0.9$ in Newton's method.
206 | \begin{figure}
207 | \centering
208 | \includegraphics[width=120mm]{Figs/curvature1.jpeg}
209 | \end{figure}
210 | \end{frame}
211 |
212 |
213 | \begin{frame}{More restrictive curvature condition (strong Wolfe)}
214 | \begin{equation*}
215 | |\nabla _{d^k} f(x^{k+1})| \leq -\sigma \nabla _{d^k} f(x^k)
216 | \end{equation*}
217 | where $\sigma$ controls how shallow the next directional derivative must be.
218 | It is common to set $\beta < \sigma < 1$, with $\sigma = 0.1$ in the conjugate gradient method and $0.9$ in Newton's method.
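As a quick numeric check with a toy example (the function and all parameter values here are illustrative, not from the text): take $f(x) = x^2$ at $x^k = 1$ with $d^k = -2$, so that $\nabla_{d^k} f(x^k) = f'(1)\, d^k = -4$. Choosing $\beta = 10^{-4}$, $\sigma = 0.9$, and $\alpha = 0.5$ gives $x^{k+1} = 0$. Sufficient decrease holds, since $f(0) = 0 \leq f(1) + \beta\alpha(-4) \approx 0.9998$, and the strong curvature condition holds, since $|\nabla_{d^k} f(x^{k+1})| = |f'(0)\, d^k| = 0 \leq -\sigma(-4) = 3.6$.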
219 | \begin{figure}
220 | \centering
221 | \includegraphics[width=120mm]{Figs/curvature2.jpeg}
222 | \end{figure}
223 | \end{frame}
224 |
225 | \begin{frame}{When both conditions are applied}
226 | \begin{figure}
227 | \centering
228 | \includegraphics[width=120mm]{Figs/two-conditions.jpeg}
229 | \end{figure}
230 | \end{frame}
231 |
232 | \section{A practical line search}
233 | \begin{frame}{Graphical illustration of line search}
234 |
235 | \begin{itemize}
236 | \item Find an initial bracket
237 | \item Apply Fibonacci search, golden section (0.618), or bisection until the step satisfies the conditions
238 | \end{itemize}
239 |
240 | \begin{figure}
241 | \centering
242 | \includegraphics[width=120mm]{Figs/linesearch.jpeg}
243 | \end{figure}
244 |
245 | \end{frame}
246 |
247 | \begin{frame}{Termination conditions}
248 | \begin{itemize}
249 | \item Maximum iterations.
250 | \item Absolute improvement. If the change is smaller than a given threshold, the search terminates:
251 | \begin{equation*}
252 | f(x_k) - f(x_{k+1}) < \epsilon_a
253 | \end{equation*}
254 |
255 | \item Relative improvement. If the change relative to the current function value is smaller than a given threshold, the search terminates:
256 | \begin{equation*}
257 | f(x_k) - f(x_{k+1}) < \epsilon_r |f(x_k)|
258 | \end{equation*}
259 |
260 | \item Gradient magnitude. We can also terminate based on the magnitude of the gradient:
261 | \begin{equation*}
262 | |\nabla f(x_{k+1})| < \epsilon_g
263 | \end{equation*}
264 | \end{itemize}
265 |
266 | \end{frame}
267 |
268 |
269 |
270 | \section{Summary}
271 | \begin{frame}{Summary}
272 | \begin{itemize}
273 | \item Descent direction methods incrementally descend toward a local optimum.
274 | \item Univariate optimization can be applied during line search.
275 | \item Approximate line search can be used to identify appropriate descent step sizes.
276 | \item Termination conditions for descent methods can be based on multiple criteria 277 | \end{itemize} 278 | \end{frame} 279 | \end{document} 280 | 281 | -------------------------------------------------------------------------------- /Lecture_notes/05_first_order_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/05_first_order_1.pdf -------------------------------------------------------------------------------- /Lecture_notes/05_first_order_1.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 |
71 | %\usecolortheme{albatross}
72 | %\usecolortheme{beaver}
73 | %\usecolortheme{beetle}
74 | %\usecolortheme{crane}
75 | %\usecolortheme{dolphin}
76 | %\usecolortheme{dove}
77 | %\usecolortheme{fly}
78 | %\usecolortheme{lily}
79 | %\usecolortheme{orchid}
80 | %\usecolortheme{rose}
81 | %\usecolortheme{seagull}
82 | %\usecolortheme{seahorse}
83 | %\usecolortheme{whale}
84 | %\usecolortheme{wolverine}
85 |
86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line
88 |
89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
90 | }
91 |
92 | \usepackage{graphicx} % Allows including images
93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
94 | %\usepackage {tikz}
95 | \usepackage{tkz-graph}
96 | \GraphInit[vstyle = Shade]
97 | \tikzset{
98 | LabelStyle/.style = { rectangle, rounded corners, draw,
99 | minimum width = 2em, fill = yellow!50,
100 | text = red, font = \bfseries },
101 | VertexStyle/.append style = { inner sep=5pt,
102 | font = \normalsize\bfseries},
103 | EdgeStyle/.append style = {->, bend left} }
104 | \usetikzlibrary {positioning}
105 | %\usepackage {xcolor}
106 | \definecolor {processblue}{cmyk}{0.96,0,0,0}
107 | %----------------------------------------------------------------------------------------
108 | % TITLE PAGE
109 | %----------------------------------------------------------------------------------------
110 |
111 | \title[Gradient Descent]{Numerical Optimization 05: 1st order methods} %
112 |
113 | \author{Qiang Zhu} % Your name
114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
115 | {
116 | University of Nevada Las Vegas\\ % Your institution for the title page
117 | \medskip
118 | }
119 | \date{\today} % Date, can be changed to a custom date
120 |
121 | \begin{document}
122 |
123 | \begin{frame}
124 | \titlepage % Print the title page as the first slide
125 | \end{frame}
126 |
127 | \begin{frame}
128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it
129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
130 | \end{frame}
131 |
132 | %----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 |
138 | \section{In choosing the direction}
139 | \begin{frame}{The choice of descent direction}
140 | In the previous lecture, we discussed that the general strategy for optimization is to choose a direction and then use a line search to obtain a sufficient decrease. Repeating this many times, we expect to arrive at a local minimum.
141 | \begin{equation*}
142 | x^{k+1} = x^k + \alpha^k d^k
143 | \end{equation*}
144 | The search direction often has the form
145 | \begin{equation}
146 | d^k = -(B^k)^{-1} \nabla f(x^k)
147 | \end{equation}
148 |
149 | where $B^k$ is a symmetric and nonsingular matrix.
In some methods (e.g., steepest descent), $B^k$ is the identity matrix, while in (quasi-)Newton methods, $B^k$ is the exact or approximate Hessian.
150 |
151 | In this lecture, we will cover the \textcolor{blue}{first-order} methods, which \textcolor{blue}{rely purely on gradient information}.
152 |
153 | \end{frame}
154 |
155 | \section{Gradient Descent}
156 | \begin{frame}{Gradient descent}
157 | An intuitive choice for the descent direction is the direction of steepest descent, where $g^k = \nabla f(x^k)$:
158 | \begin{equation*}
159 | d^k = - \frac{g^k}{||g^k||}
160 | \end{equation*}
161 |
162 | If we optimize the step size at each step, we have
163 | \begin{equation*}
164 | \alpha^k = \underset{\alpha}{\arg \min} f(x^k + \alpha d^k)
165 | \end{equation*}
166 |
167 | Since the optimal step size satisfies
168 |
169 | \begin{equation*}
170 | \nabla f(x^k + \alpha^k d^k)^T d^k = 0
171 | \end{equation*}
172 | and the next direction is
173 |
174 | \begin{equation*}
175 | d^{k+1} = - \frac{\nabla f(x^k + \alpha^k d^k)}{||\nabla f(x^k + \alpha^k d^k)||}
176 | \end{equation*}
177 |
178 | two consecutive directions are \textcolor{blue}{orthogonal}:
179 |
180 | \begin{equation*}
181 | (d^{k+1})^T d^k = 0
182 | \end{equation*}
183 |
184 | \end{frame}
185 |
186 | \section{Conjugate gradient}
187 | \begin{frame}{Conjugate gradient}
188 | Gradient descent can perform poorly in narrow valleys. The conjugate gradient method overcomes this issue by choosing search directions that are mutually conjugate, effectively accounting for the curvature of the objective.
189 |
190 | Minimizing the quadratic function
191 | \begin{equation*}
192 | \underset{x}{\textrm{minimize}}: f(x) = \frac{1}{2} x^T A x - b^T x
193 | \end{equation*}
194 |
195 | is equivalent to solving the linear equation
196 | \begin{equation*}
197 | Ax = b
198 | \end{equation*}
199 | where $A$ is $N \times N$, symmetric, and positive definite, and thus $f$ has a unique local minimum.
200 |
201 | When solving $Ax = b$, a powerful method is to find a sequence of $N$ \textcolor{blue}{conjugate directions} satisfying
202 | \begin{equation*}
203 | (d^i)^T A d^j = 0 ~~~ (i\neq j)
204 | \end{equation*}
205 |
206 | \end{frame}
207 |
208 | \begin{frame}{To find the successive conjugate directions}
209 | One can start with the direction of steepest descent
210 | \begin{equation*}
211 | d^1 = - g^1
212 | \end{equation*}
213 | We then use line search to find the next design point. For quadratic functions $f= \frac{1}{2} x^T A x - b^T x $, the step factor $\alpha$ can be computed as
214 | \begin{equation*}
215 | \begin{split}
216 | \frac{\partial f(x+\alpha d)}{\partial \alpha} & = \frac{\partial}{\partial\alpha} \Bigg[\frac{1}{2} (x+\alpha d)^T A (x+\alpha d) - b^T (x+\alpha d) \Bigg]\\
217 | & = d^T A(x + \alpha d) - d^T b\\
218 | & = d^T(Ax - b) + \alpha d^T A d
219 | \end{split}
220 | \end{equation*}
221 | Setting this derivative to zero,
222 | \begin{equation*}
223 | \alpha = - \frac{d^T(Ax - b)}{d^T A d}
224 | \end{equation*}
225 |
226 | Then the update is
227 | \begin{equation*}
228 | x^2 = x^1 + \alpha d^1
229 | \end{equation*}
230 |
231 | \end{frame}
232 |
233 | \begin{frame}{To find the successive conjugate directions (continued)}
234 | For the next step
235 | \begin{equation*}
236 | d^{k+1} = -g^{k+1} + \beta^k d^k
237 | \end{equation*}
238 | where $\beta^k$ is a scalar parameter. Larger values of $\beta^k$ indicate that the previous descent direction contributes more strongly.
239 |
240 | We solve for $\beta^k$ from the conjugacy condition:
241 | \begin{gather*}
242 | (d^{k+1})^T A d^{k} = 0 \\
243 | (-g^{k+1} + \beta^k d^{k})^T A d^{k} = 0\\
244 | -(g^{k+1})^T A d^{k} + \beta^k (d^{k})^T A d^{k} = 0 \\
245 | \beta^k = \frac{(g^{k+1})^T A d^{k}}{(d^{k})^T A d^{k}}
246 | \end{gather*}
247 | The conjugate gradient method is exact for quadratic functions, but it can be applied to non-quadratic functions as well when a quadratic function is a good local approximation.
248 |
249 | \end{frame}
250 |
251 | \begin{frame}{To Approximate $A$ and $\beta$}
252 | Unfortunately, we do not know the value of $A$ that best approximates $f$ around $x^k$, so $\beta^k$ is instead computed from gradients alone.
253 |
254 | \begin{alertblock}{Fletcher-Reeves}
255 | \begin{equation*}
256 | \beta^k = \frac{g^{(k)T} g^{(k)}}{g^{(k-1)T} g^{(k-1)}}
257 | \end{equation*}
258 | \end{alertblock}
259 | \vfill
260 | \begin{alertblock}{Polak-Ribière}
261 | \begin{equation*}
262 | \beta^k = \frac{g^{(k)T} (g^{(k)}-g^{(k-1)})}{g^{(k-1)T} g^{(k-1)}}
263 | \end{equation*}
264 | \end{alertblock}
265 |
266 | \end{frame}
267 |
268 |
269 | \begin{frame}{Comparison between Conjugate Gradient and Steepest Descent}
270 | \begin{figure}
271 | \centering
272 | \includegraphics[width=120mm]{Figs/cg-sd.jpg}
273 | \end{figure}
274 | \end{frame}
275 |
276 |
277 |
278 | \section{Summary}
279 | \begin{frame}{Summary}
280 | \begin{itemize}
281 | \item Gradient descent follows the direction of steepest descent.
282 | \item Two consecutive search directions in gradient descent are orthogonal.
283 | \item In conjugate gradient, the search directions are conjugate with respect to an approximate Hessian.
284 | \item Both SD and CG work together with a line search method.
285 | \end{itemize}
286 | \end{frame}
287 | \end{document}
288 |
289 |

--------------------------------------------------------------------------------
/Lecture_notes/06_first_order_2.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/06_first_order_2.pdf

--------------------------------------------------------------------------------
/Lecture_notes/06_first_order_2.tex:
--------------------------------------------------------------------------------

1 | \documentclass{beamer}
2 | \usepackage{amsmath}
3 | \usepackage{hyperref}
4 | \usepackage{listings}
5 | \usepackage{xcolor}
6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
7 | \definecolor{codegreen}{rgb}{0,0.6,0}
8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
9 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
11 |
12 | \lstdefinestyle{mystyle}{
13 | backgroundcolor=\color{backcolour},
14 | commentstyle=\color{codegreen},
15 | keywordstyle=\color{magenta},
16 | numberstyle=\tiny\color{codegray},
17 | stringstyle=\color{codepurple},
18 | basicstyle=\ttfamily\footnotesize,
19 | breakatwhitespace=false,
20 | breaklines=true,
21 | captionpos=b,
22 | keepspaces=true,
23 | %numbers=left,
24 | numbersep=5pt,
25 | showspaces=false,
26 | showstringspaces=false,
27 | showtabs=false,
28 | tabsize=2
29 | }
30 |
31 | \lstset{style=mystyle}
32 |
33 | \mode<presentation> {
34 |
35 | % The Beamer class comes with a number of default slide themes
36 | % which change the colors and layouts of slides. Below this is a list
37 | % of all the themes, uncomment each in turn to see what they look like.
38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | \usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 06: 1st order methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | 
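\begin{frame}[fragile]{An illustrative sketch: fixed learning rate in Julia}
A minimal, self-contained Julia sketch of the kind of fixed-learning-rate update this lecture covers (the quadratic test problem and all parameter values are illustrative assumptions, not from the text); $\beta = 0$ recovers plain gradient descent:
\begin{lstlisting}
# Toy quadratic objective f(x) = 0.5 x'Ax - b'x with gradient Ax - b
A = [4.0 1.0; 1.0 3.0]
b = [1.0, 2.0]
grad(x) = A*x - b

function descend(x; alpha=0.1, beta=0.0, iters=200)
    v = zero(x)                  # momentum accumulator
    for _ in 1:iters
        v = beta*v - alpha*grad(x)
        x = x + v
    end
    return x
end

descend([0.0, 0.0])              # fixed step size, no momentum
descend([0.0, 0.0]; beta=0.9)    # momentum speeds up flat directions
\end{lstlisting}
\end{frame}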
%----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 | \section{Gradient methods with fixed learning rate}
138 | \begin{frame}{Gradient methods with fixed learning rate}
139 | We have discussed the steepest descent and conjugate gradient methods, which usually work together with a line search. Alternatively, it is popular to use a fixed learning rate with gradient descent. However, the standard version can take a long time to traverse a nearly flat surface.
140 | Several improved methods have been proposed; they are commonly used for training neural networks in machine learning.
141 | \begin{columns}
142 | \begin{column}{.4\textwidth}
143 | \begin{itemize}
144 | \item Momentum
145 | \item Nesterov Momentum
146 | \item Adagrad
147 | \item RMSProp
148 | \item Adadelta
149 | \item Adam
150 | \end{itemize}
151 | \end{column}
152 |
153 | \begin{column}{.6\textwidth}
154 | \begin{figure}
155 | \centering
156 | \includegraphics[width=60mm]{Figs/flat.jpeg}
157 | \end{figure}
158 | \end{column}
159 | \end{columns}
160 | \end{frame}
161 |
162 |
163 |
164 | \section{Momentum}
165 | \begin{frame}{Momentum}
166 | Allowing momentum to accumulate is one way to speed up progress.
167 | We can modify gradient descent to incorporate momentum:
168 | \begin{gather*}
169 | \boldsymbol{v}^{k+1} = \beta \boldsymbol{v}^k - \alpha \boldsymbol{g}^k \\
170 | \boldsymbol{x}^{k+1} = \boldsymbol{x}^k + \boldsymbol{v}^{k+1}
171 | \end{gather*}
172 | When $\beta = 0$, it reduces to plain gradient descent.
173 | \begin{figure}
174 | \centering
175 | \includegraphics[width=100mm]{Figs/momentum.jpeg}
176 | \end{figure}
177 |
178 |
179 | \end{frame}
180 |
181 | \section{Nesterov Momentum}
182 | \begin{frame}{Nesterov Momentum}
183 | One issue with momentum is that the steps do not slow down enough at the bottom of a valley, so it tends to \textcolor{blue}{overshoot the valley}. Nesterov Momentum remedies the issue by evaluating the gradient at the look-ahead point:
184 | \begin{gather*}
185 | \boldsymbol{v}^{k+1} = \beta \boldsymbol{v}^k - \alpha \nabla f(\boldsymbol{x}^k + \beta \boldsymbol{v}^k) \\
186 | \boldsymbol{x}^{k+1} = \boldsymbol{x}^k + \boldsymbol{v}^{k+1}
187 | \end{gather*}
188 |
189 | \begin{figure}
190 | \centering
191 | \includegraphics[width=100mm]{Figs/n-momentum.jpeg}
192 | \end{figure}
193 |
194 | \end{frame}
195 |
196 | \section{Adagrad}
197 | \begin{frame}{Adagrad}
198 | Momentum and Nesterov Momentum update all components of $\boldsymbol{x}$ with the same learning rate. The adaptive subgradient method (Adagrad) adapts a learning rate for each component of $\boldsymbol{x}$:
199 | \begin{gather*}
200 | x_i^{k+1} = x_i^k - \frac{\alpha}{\epsilon + \sqrt{s_i^k}} g_i^k \\
201 | s_i^k = \sum_{j=1}^k \bigg(g_i^j\bigg)^2
202 | \end{gather*}
203 | where $\epsilon$ is a small value, on the order of 1e-8, to prevent division by zero.
204 | Adagrad is far less sensitive to the learning rate $\alpha$.
205 | \end{frame}
206 |
207 | \section{RMSProp and Adadelta}
208 | \begin{frame}{RMSProp and Adadelta}
209 |
210 | In Adagrad, the accumulated sum $s_i$ grows monotonically, so the effective learning rate monotonically decreases. To prevent this, \textcolor{blue}{RMSProp} maintains a decaying average of squared gradients.
211 | \begin{equation*}
212 | \boldsymbol{s}^{k+1} = \gamma \boldsymbol{s}^k + (1-\gamma)(\boldsymbol{g}^k \odot \boldsymbol{g}^k)
213 | \end{equation*}
214 |
215 | where $\gamma$ is between 0 and 1, typically 0.9.
216 |
217 | \begin{equation*}
218 | \begin{split}
219 | x_i^{k+1} &= x_i^k - \frac{\alpha}{\epsilon + \sqrt{s_i^k}} g_i^k \\
220 | &= x_i^k - \frac{\alpha}{\epsilon + \textrm{RMS}(g_i)} g_i^k
221 | \end{split}
222 | \end{equation*}
223 |
224 | \textcolor{blue}{Adadelta} goes one step further and replaces the learning rate $\alpha$ with an exponentially decaying average of the previous updates $\delta x$, eliminating the learning-rate parameter entirely:
225 |
226 | \begin{equation*}
227 | x_i^{k+1} = x_i^k - \frac{\textrm{RMS}(\delta x_i)}{\epsilon + \textrm{RMS}(g_i)} g_i^k
228 | \end{equation*}
229 |
230 | \end{frame}
231 |
232 | \section{Adam}
233 | \begin{frame}{Adam}
234 |
235 | The adaptive moment estimation method (Adam) is so far the most widely used optimization method in neural network training.
236 | It maintains both an exponentially decaying average of squared gradients (like RMSProp and Adadelta) and an exponentially decaying average of gradients (like momentum).
237 | \begin{equation*}
238 | \begin{split}
239 | \boldsymbol{v}^{k+1} &= \gamma_v \boldsymbol{v}^k + (1-\gamma_v) \boldsymbol{g}^k \\
240 | \boldsymbol{s}^{k+1} &= \gamma_s \boldsymbol{s}^k + (1-\gamma_s) \bigg(\boldsymbol{g}^k \odot \boldsymbol{g}^k \bigg)\\
241 | \hat{\boldsymbol{v}}^{k+1} &= \boldsymbol{v}^{k+1}/(1-\gamma_v^k)\\
242 | \hat{\boldsymbol{s}}^{k+1} &= \boldsymbol{s}^{k+1}/(1-\gamma_s^k)\\
243 | \boldsymbol{x}^{k+1} &= \boldsymbol{x}^k - \alpha \hat{\boldsymbol{v}}^{k+1}/\bigg(\epsilon + \sqrt{\hat{\boldsymbol{s}}^{k+1}}\bigg)
244 | \end{split}
245 | \end{equation*}
246 |
247 |
248 | \end{frame}
249 |
250 |
251 | \section{Summary}
252 | \begin{frame}{Summary}
253 | \begin{itemize}
254 | \item Descent methods with momentum build up progress in favorable directions.
255 | \item A wide variety of accelerated descent methods use special techniques to speed up descent.
256 | \end{itemize}
257 | \end{frame}
258 | \end{document}
259 |
260 |

--------------------------------------------------------------------------------
/Lecture_notes/06_gradient_descent.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/06_gradient_descent.pdf

--------------------------------------------------------------------------------
/Lecture_notes/07_Newton_method.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/07_Newton_method.pdf

--------------------------------------------------------------------------------
/Lecture_notes/07_Newton_method.tex:
--------------------------------------------------------------------------------

1 | \documentclass{beamer}
2 | \usepackage{amsmath}
3 | \usepackage{hyperref}
4 | \usepackage{listings}
5 | \usepackage{xcolor}
6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
7 | \definecolor{codegreen}{rgb}{0,0.6,0}
8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
9 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
11 |
12 | \lstdefinestyle{mystyle}{
13 | backgroundcolor=\color{backcolour},
14 | commentstyle=\color{codegreen},
15 | keywordstyle=\color{magenta},
16 | numberstyle=\tiny\color{codegray},
17 | stringstyle=\color{codepurple},
18 |
basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 07: 2nd order methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | 
\begin{document}
122 |
123 | \begin{frame}
124 | \titlepage % Print the title page as the first slide
125 | \end{frame}
126 |
127 | \begin{frame}
128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it
129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
130 | \end{frame}
131 |
132 | %----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 |
138 | \section{Newton's method}
139 | \begin{frame}{Newton's method}
140 | In optimization, first-order information helps determine the direction of travel, but not how far to step to reach the local minimum. A better approach is to also use second-order information.
141 |
142 | In univariate optimization, the quadratic approximation about a point $x^k$ comes from the second-order Taylor expansion:
143 | \begin{equation*}
144 | q(x) = f(x^k) + (x-x^k)f'(x^k) + \frac{(x-x^k)^2}{2}f''(x^k)
145 | \end{equation*}
146 | Setting the derivative to zero,
147 | \begin{equation*}
148 | \begin{split}
149 | \frac{\partial q(x)}{\partial x} &= f'(x^k) + (x-x^k)f''(x^k) = 0 \\
150 | x^{k+1} &= x^k - \frac{f'(x^k)}{f''(x^k)}
151 | \end{split}
152 | \end{equation*}
153 |
154 | \end{frame}
155 |
156 | \begin{frame}{Various cases}
157 | \begin{figure}
158 | \centering
159 | \includegraphics[width=80mm]{Figs/newton-1d.jpeg}
160 | \end{figure}
161 |
162 | \end{frame}
163 |
164 |
165 | \section{Extension to multivariate optimization}
166 | \begin{frame}{Extension to multivariate optimization}
167 | If $f$ is a multivariate function,
168 | \begin{equation*}
169 | f(\boldsymbol{x}) \approx q(\boldsymbol{x}) = f(\boldsymbol{x}^k) + (\boldsymbol{g}^k)^T(\boldsymbol{x}-\boldsymbol{x}^k)
170 | + \frac{1}{2} (\boldsymbol{x}-\boldsymbol{x}^k)^T \boldsymbol{H}^k (\boldsymbol{x}-\boldsymbol{x}^k)
171 | \end{equation*}
172 |
173 | Setting the gradient to zero,
174 | \begin{equation*}
175 | \nabla q(\boldsymbol{x}) = \boldsymbol{g}^k + \boldsymbol{H}^k (\boldsymbol{x}-\boldsymbol{x}^k) = \boldsymbol{0} \quad\Rightarrow\quad \boldsymbol{x}^{k+1} = \boldsymbol{x}^k - (\boldsymbol{H}^k)^{-1} \boldsymbol{g}^k
176 | \end{equation*}
177 |
178 | \begin{alertblock}{Quiz}
179 | Booth's function is
180 | \begin{equation*}
181 | f(\boldsymbol{x}) = (x_1 + 2x_2 -7)^2 + (2x_1 + x_2 -5)^2
182 | \end{equation*}
183 | Use Newton's method to find the minimum, starting from $\boldsymbol{x}$ = [9, 8].
184 |
185 |
186 | \end{alertblock}
187 |
188 | \end{frame}
189 |
190 | \begin{frame}{Newton's method with line search}
191 | Newton's method can also be used to supply a descent direction to line search or can be modified to use a step factor. Smaller steps toward the minimum or line searches along the descent direction can increase the method's robustness. The descent direction is:
192 | \begin{equation*}
193 | \boldsymbol{d}^k = -(\boldsymbol{H}^k)^{-1}\boldsymbol{g}^k
194 | \end{equation*}
195 |
196 | \end{frame}
197 |
198 | \section{Secant Method}
199 | \begin{frame}{Secant Method}
200 | Newton's method for \textcolor{blue}{univariate} function minimization requires both the first and second derivatives. However, the second derivative can be difficult to compute in some cases.
The secant method instead estimates the second derivative from two successive first derivatives:
201 | \begin{gather*}
202 | f''(x^k) \approx \frac{f'(x^k) - f'(x^{k-1})} {x^k-x^{k-1}}
203 | \end{gather*}
204 |
205 | The secant method requires \textcolor{blue}{an additional initial design point}. It suffers from the same problems as Newton's method when the quadratic approximation is poor.
206 |
207 | \end{frame}
208 |
209 |
210 |
211 | \section{Summary}
212 | \begin{frame}{Summary}
213 | \begin{itemize}
214 | \item Incorporating second-order information in descent methods often speeds convergence.
215 | \item Newton's method is a root-finding method that leverages second-order information to quickly descend to a local minimum.
216 | \item The secant method approximates Newton's method when second-order information is not directly available.
217 | \end{itemize}
218 | \end{frame}
219 | \end{document}
220 |
221 |

--------------------------------------------------------------------------------
/Lecture_notes/08_Quasi_Newton.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/08_Quasi_Newton.pdf

--------------------------------------------------------------------------------
/Lecture_notes/09_direct_methods.pdf:
--------------------------------------------------------------------------------

https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/09_direct_methods.pdf

--------------------------------------------------------------------------------
/Lecture_notes/09_direct_methods.tex:
--------------------------------------------------------------------------------

1 | \documentclass{beamer}
2 | \usepackage{amsmath}
3 | \usepackage{hyperref}
4 | \usepackage{listings}
5 | \usepackage{xcolor}
6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
7 | \definecolor{codegreen}{rgb}{0,0.6,0}
8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5}
9 | \definecolor{codepurple}{rgb}{0.58,0,0.82}
10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92}
11 |
12 | \lstdefinestyle{mystyle}{
13 | backgroundcolor=\color{backcolour},
14 | commentstyle=\color{codegreen},
15 | keywordstyle=\color{magenta},
16 | numberstyle=\tiny\color{codegray},
17 | stringstyle=\color{codepurple},
18 | basicstyle=\ttfamily\footnotesize,
19 | breakatwhitespace=false,
20 | breaklines=true,
21 | captionpos=b,
22 | keepspaces=true,
23 | %numbers=left,
24 | numbersep=5pt,
25 | showspaces=false,
26 | showstringspaces=false,
27 | showtabs=false,
28 | tabsize=2
29 | }
30 |
31 | \lstset{style=mystyle}
32 |
33 | \mode<presentation> {
34 |
35 | % The Beamer class comes with a number of default slide themes
36 | % which change the colors and layouts of slides. Below this is a list
37 | % of all the themes, uncomment each in turn to see what they look like.
38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 09: Direct Methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | 
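\begin{frame}[fragile]{An illustrative sketch: cyclic coordinate search in Julia}
A minimal Julia sketch previewing the first method covered below (the Booth test function, search bounds, and tolerances are illustrative assumptions, not from the text):
\begin{lstlisting}
using LinearAlgebra

f(x) = (x[1] + 2x[2] - 7)^2 + (2x[1] + x[2] - 5)^2  # Booth function

# Golden-section line search along coordinate i only
function coord_line_search(f, x, i; lo=-10.0, hi=10.0, tol=1e-8)
    phi = (sqrt(5) - 1)/2
    at(t) = (y = copy(x); y[i] = t; y)
    a, b = lo, hi
    while b - a > tol
        c, d = b - phi*(b - a), a + phi*(b - a)
        f(at(c)) < f(at(d)) ? (b = d) : (a = c)
    end
    return at((a + b)/2)
end

function cyclic_coordinate_search(f, x; tol=1e-8, maxcycles=100)
    for _ in 1:maxcycles
        x_old = copy(x)
        for i in 1:length(x)
            x = coord_line_search(f, x, i)
        end
        norm(x - x_old) < tol && break  # a full cycle made no progress
    end
    return x
end

cyclic_coordinate_search(f, [0.0, 0.0])  # converges near [1.0, 3.0]
\end{lstlisting}
\end{frame}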
%----------------------------------------------------------------------------------------
133 | % PRESENTATION SLIDES
134 | %----------------------------------------------------------------------------------------
135 |
136 | %------------------------------------------------
137 |
138 | \section{Direct methods without gradient}
139 | \begin{frame}{Direct method}
140 | Direct methods rely solely on the objective function $f$. They are also called
141 | \begin{itemize}
142 | \item zero-order
143 | \item black box
144 | \item pattern search
145 | \item derivative free
146 | \end{itemize}
147 |
148 | The most important feature is that they do not rely on derivative information.
149 | They use other criteria to choose the next search direction and to judge whether the search has converged.
150 |
151 | \end{frame}
152 |
153 | \section{Cyclic Coordinate Search}
154 | \begin{frame}{Cyclic Coordinate Search}
155 | This method simply alternates between coordinate directions in its line search. The search starts from an initial $\boldsymbol{x}^1$ and optimizes the first coordinate:
156 | \begin{columns}
157 |
158 | \begin{column}{.6\textwidth}
159 | \begin{equation*}
160 | \boldsymbol{x}^2 = \underset{x_1}{\arg \min} f(x_1, x_2^1, x_3^1, \cdots, x_n^1)
161 | \end{equation*}
162 |
163 | Then, it moves to the next coordinate,
164 | \begin{equation*}
165 | \boldsymbol{x}^3 = \underset{x_2}{\arg \min} f(x_1^2, x_2, x_3^2, \cdots, x_n^2)
166 | \end{equation*}
167 | This process is equivalent to doing a sequence of line searches along the set of $n$ basis vectors.
168 | It is terminated after no significant improvement is made.
169 | \end{column}
170 | \pause
171 | \begin{column}{.4\textwidth}
172 | \begin{figure}
173 | \centering
174 | \includegraphics[width=30mm]{Figs/coordinate.jpeg}
175 | \end{figure}
176 | \end{column}
177 | \end{columns}
178 |
179 | \end{frame}
180 |
181 | \begin{frame}{Acceleration}
182 | Similar to the momentum method in gradient descent, the cyclic method can be augmented with an acceleration step to help traverse diagonal valleys. For each full cycle starting with $\boldsymbol{x}^1$ from 1 to $n$, an additional line search is conducted along
183 | the direction of $\boldsymbol{x}^{n+1}-\boldsymbol{x}^1$.
184 |
185 | \begin{figure}
186 | \centering
187 | \includegraphics[width=60mm]{Figs/coordinate-improved.jpeg}
188 | \end{figure}
189 |
190 | \end{frame}
191 |
192 |
193 |
194 | \section{Powell's method}
195 | \begin{frame}{Powell's method}
196 | This algorithm maintains a list of search directions $\boldsymbol{u}^1, \cdots, \boldsymbol{u}^n$, which are initially the basis vectors.
197 | Starting at $\boldsymbol{x}^1$, Powell's method conducts a line search along each direction in turn, updating the design point each time.
198 | Each direction is then shifted down by one index (dropping $\boldsymbol{u}^1$).
199 | The last direction is replaced with the direction of $\boldsymbol{x}^{n+1} - \boldsymbol{x}^1$.
200 |
201 |
202 | \begin{columns}
203 | \begin{column}{.7\textwidth}
204 | \begin{equation*}
205 | \begin{split}
206 | \boldsymbol{x}^{i+1} &\leftarrow \textrm{line search}(f, \boldsymbol{x}^i, \boldsymbol{u}^i) \quad \textrm{for } i \textrm{ in } 1, \cdots, n\\
207 | \boldsymbol{u}^{i} &\leftarrow \boldsymbol{u}^{i+1} \quad \textrm{for } i \textrm{ in } 1, \cdots, n-1\\
208 | \boldsymbol{u}^{n} &\leftarrow \boldsymbol{x}^{n+1} - \boldsymbol{x}^1
209 | \end{split}
210 | \end{equation*}
211 | \end{column}
212 |
213 | %\pause
214 | \begin{column}{.3\textwidth}
215 | \begin{figure}
216 | \centering
217 | \includegraphics[width=30mm]{Figs/powell.jpeg}
218 | \end{figure}
219 | \end{column}
220 |
221 | \end{columns}
222 |
223 | Powell showed that for quadratic functions, after $k$ full iterations the last $k$ directions are mutually conjugate.
224 | It is recommended to reset the directions to the basis vectors every $n$ or $n+1$ iterations.
225 |
226 | \end{frame}
227 |
228 | \section{Nelder-Mead Simplex Method}
229 | \begin{frame}{Nelder-Mead Simplex Method}
230 |
231 | The Nelder-Mead simplex method uses a simplex to traverse the space in search
232 | of a minimum. A simplex is a polyhedron with $n+1$ vertices in $n$-dimensional space.
233 | \begin{columns}
234 | \begin{column}{0.4 \textwidth}
235 | \begin{itemize}
236 | \item $x_h$, the point with the highest $f$,
237 | \item $x_s$, the point with the second-highest $f$,
238 | \item $x_l$, the point with the lowest $f$,
239 | \item $\Bar{x}$, the mean point excluding $x_h$.
240 | \end{itemize}
241 | \end{column}
242 | \begin{column}{0.6 \textwidth}
243 | \begin{itemize}
244 | \item Reflection. $x_r = \Bar{x} + (\Bar{x} - x_h)$,
245 | \item Expansion. $x_e = \Bar{x} + 2(x_r - \Bar{x})$,
246 | \item Contraction. $x_c = \Bar{x} + 0.5(x_h - \Bar{x})$,
247 | \item Shrinkage, halving the distance of all points to $x_l$.
248 | \end{itemize}
249 | \end{column}
250 |
251 | \end{columns}
252 |
253 | \begin{figure}
254 | \centering
255 | \includegraphics[width=120mm]{Figs/simplex.jpeg}
256 | \end{figure}
257 | \end{frame}
258 |
259 |
260 | \begin{frame}{Nelder-Mead Simplex Algorithm}
261 |
262 | \begin{figure}
263 | \centering
264 | \includegraphics[width=110mm]{Figs/simplex_algo.jpeg}
265 | \end{figure}
266 | \end{frame}
267 |
268 | \begin{frame}{Nelder-Mead Simplex method in practice}
269 |
270 | \begin{figure}
271 | \centering
272 | \includegraphics[width=110mm]{Figs/simplex-performance.jpeg}
273 | \end{figure}
274 | \end{frame}
275 |
276 | \section{Summary}
277 | \begin{frame}{Summary}
278 | \begin{itemize}
279 | \item Direct methods rely solely on the objective function and do not use derivative information.
280 | \item Cyclic coordinate search optimizes one coordinate direction at a time.
281 | \item Powell's method adapts the set of search directions based on the direction of progress.
282 | \item The Nelder-Mead simplex method uses a simplex to search the design space, adaptively expanding and contracting the size of the simplex in response to evaluations of the objective function.
283 | \end{itemize} 284 | \end{frame} 285 | \end{document} 286 | 287 | -------------------------------------------------------------------------------- /Lecture_notes/10_Stochastic_methods.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Gradient Descent]{Numerical Optimization 10: Stochastic Methods} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Noisy Descent} 139 | \begin{frame}{Noisy Descent} 140 | Adding stochasticity to gradient descent can be beneficial in large nonlinear optimization problems. Saddle points, where the gradient is very close to zero, can cause descent methods to select step sizes that are too small to be useful. One approach is to add Gaussian noise at each descent step 141 | 142 | \begin{equation*} 143 | \boldsymbol{x}^{k+1} \leftarrow \boldsymbol{x}^k - \alpha^k \boldsymbol{g}^k + \boldsymbol{\epsilon}^k 144 | \end{equation*} 145 | 146 | where $\boldsymbol{\epsilon}(k)$ is zero-mean Gaussian noise with standard deviation $\sigma$. The amount of noise is typically reduced over time. 
For example, the standard deviation can follow a decreasing sequence $\sigma^{(k)}$ such as $1/k$.
147 |
148 | \begin{figure}
149 | \centering
150 | \includegraphics[width=40mm]{Figs/sgd.jpeg}
151 | \end{figure}
152 |
153 | \end{frame}
154 |
155 | \section{Simulated Annealing}
156 | \begin{frame}{Simulated Annealing}
157 | Simulated annealing borrows inspiration from metallurgy.
158 | A \textcolor{blue}{temperature} $t$ is used to control the degree of stochasticity during the randomized search.
159 | \begin{itemize}
160 | \item $t$ starts high, allowing the process to move freely over the search space, with the hope of finding a region containing the best local minimum.
161 |
162 | \item $t$ is then slowly brought down, reducing the stochasticity and forcing the search to converge to a minimum. Simulated annealing is often used on functions with many local minima due to its ability to escape local minima.
163 |
164 | \end{itemize}
165 |
166 |
167 | At every iteration, a candidate transition from $\boldsymbol{x}$ to $\boldsymbol{x}'$ is sampled from a transition distribution $T$ and is accepted with \textcolor{blue}{probability}
168 |
169 | \begin{equation*}
170 | \begin{cases}
171 | 1 & \textrm{if } \Delta y \leq 0\\
172 | \exp(-\Delta y/t) & \textrm{if }\Delta y >0
173 | \end{cases}
174 | \end{equation*}
175 |
176 | where $\Delta y = f(\boldsymbol{x}') - f(\boldsymbol{x})$.
177 |
178 | \end{frame}
179 |
180 |
181 | \section{Cross-Entropy Method}
182 | \begin{frame}{Cross-Entropy Method}
183 | This probability distribution, often called a \textcolor{blue}{proposal distribution}, is used to propose new samples for the next iteration. At each iteration, we sample from the proposal distribution and then update the proposal distribution to fit a collection of the best samples.
184 |
185 | It requires choosing a family of distributions parameterized by $\theta$, such as multivariate normal distributions with \textcolor{blue}{a mean vector and a covariance matrix}. The algorithm also requires us to specify the number of elite samples, $m_{\textrm{elite}}$, to use when fitting the parameters for the next iteration.
186 |
187 | \begin{equation*}
188 | \begin{split}
189 | \boldsymbol{\mu}^{k+1} &= \frac{1}{m_{\textrm{elite}}} \sum_{i=1}^{m_{\textrm{elite}}} \boldsymbol{x}^i\\
190 | \Sigma^{k+1} &= \frac{1}{m_{\textrm{elite}}} \sum_{i=1}^{m_{\textrm{elite}}} (\boldsymbol{x}^i - \boldsymbol{\mu}^{k+1})(\boldsymbol{x}^i - \boldsymbol{\mu}^{k+1})^T
191 | \end{split}
192 | \end{equation*}
193 | \end{frame}
194 |
195 | \begin{frame}{Cross-Entropy Method}
196 | This probability distribution, often called a \textcolor{blue}{proposal distribution}, is used to propose new samples for the next iteration. At each iteration, we sample from the proposal distribution and then update the proposal distribution to fit a collection of the best samples.
197 |
198 | \begin{figure}
199 | \centering
200 | \includegraphics[width=120mm]{Figs/Cross-entropy.jpeg}
201 | \end{figure}
202 | \end{frame}
203 |
204 | \section{Covariance Matrix Adaptation}
205 | \begin{frame}{Covariance Matrix Adaptation}
206 | Covariance matrix adaptation maintains a mean vector $\boldsymbol{\mu}$, a covariance matrix $\boldsymbol{\Sigma}$, and an additional step-size scalar $\sigma$. The covariance matrix only increases or decreases in a single direction with every iteration, whereas the step-size scalar is adapted to control the overall spread of the distribution.
\section{Covariance Matrix Adaptation}
\begin{frame}{Covariance Matrix Adaptation}
Covariance matrix adaptation maintains a mean vector $\boldsymbol{\mu}$, a covariance matrix $\boldsymbol{\Sigma}$, and an additional step-size scalar $\sigma$. The covariance matrix only increases or decreases in a single direction with every iteration, whereas the step-size scalar is adapted to control the overall spread of the distribution. At every iteration, $m$ designs are sampled from the multivariate Gaussian
\begin{equation*}
\boldsymbol{x} \sim \mathcal{N} (\boldsymbol{\mu}, \sigma^2 \Sigma)
\end{equation*}

The designs are then sorted according to their objective function values such that $f(\boldsymbol{x}^1) \leq f(\boldsymbol{x}^2) \leq \cdots \leq f(\boldsymbol{x}^m)$. A new mean vector $\boldsymbol{\mu}^{k+1}$ is formed using a weighted average of the sampled designs:

\begin{gather*}
\boldsymbol{\mu}^{k+1} \leftarrow \sum_{i=1}^m w_i \boldsymbol{x}^i\\
\sum_{i=1}^m w_i = 1 ~~~~ w_1>w_2>\cdots>w_m>0
\end{gather*}

\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
The recommended weighting is obtained by normalizing
\begin{equation*}
w'_i = \ln \frac{m+1}{2} - \ln i ~ \textrm{for~} i \in \{1, \cdots, m\}
\end{equation*}
to obtain $\boldsymbol{w} = \boldsymbol{w}'/\sum_i w'_i $.

The step size is updated using a cumulative vector $\boldsymbol{p}_\sigma$ that tracks steps over time
\begin{equation*}
\begin{split}
\boldsymbol{p}_\sigma^1 &= 0\\
\boldsymbol{p}_\sigma^{k+1} &\leftarrow (1-c_\sigma)\boldsymbol{p}_\sigma^k + \sqrt{c_\sigma(2-c_\sigma)\mu_{\textrm{eff}}} (\Sigma^k)^{-1/2} \boldsymbol{\delta}_w \\
\mu_{\textrm{eff}} &= \frac{1}{\sum_i w^2_i}\\
\boldsymbol{\delta}_w &= \sum_{i=1}^{m_{\textrm{elite}}}
w_i \boldsymbol{\delta}^i ~ \textrm{for~} \boldsymbol{\delta}^i = \frac{\boldsymbol{x}^i - \boldsymbol{\mu}^k}{\sigma^k}
\end{split}
\end{equation*}
\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
The new step size is
\begin{equation*}
\sigma^{k+1} \leftarrow \sigma^k \exp\bigg(\frac{c_\sigma}{d_\sigma} \bigg[\frac{||\boldsymbol{p}_\sigma
||}{\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})||}-1\bigg]\bigg)
\end{equation*}
where $\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})||$ is the expected length of a vector drawn from the standard Gaussian distribution,
\begin{equation*}
\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})|| = \sqrt{2} \frac{\Gamma(\frac{n+1}{2})}{\Gamma(\frac{n}{2})}
\approx \sqrt{n}\bigg(1-\frac{1}{4n}+\frac{1}{21n^2}\bigg)
\end{equation*}

\begin{equation*}
\begin{split}
c_\sigma &= (\mu_{\textrm{eff}}+2)/(n+\mu_{\textrm{eff}}+5)\\
d_\sigma &= 1 + 2\max \bigg(0, \sqrt{\frac{\mu_{\textrm{eff}}-1}{n+1}} -1 \bigg) + c_\sigma
\end{split}
\end{equation*}

\end{frame}
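\begin{frame}{Expected Length: a Worked Example}
As a quick sanity check of the approximation above, take $n=2$:
\begin{equation*}
\mathbb{E}||\mathcal{N}(0, \boldsymbol{I})|| = \sqrt{2}\, \frac{\Gamma(3/2)}{\Gamma(1)} = \sqrt{2}\cdot \frac{\sqrt{\pi}}{2} \approx 1.2533
\end{equation*}
while the approximation gives $\sqrt{2}\big(1-\frac{1}{8}+\frac{1}{84}\big) \approx 1.2543$, an error of less than $0.1\%$.
\end{frame}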
\begin{frame}{Covariance Matrix Adaptation}
The covariance matrix is updated as follows
\begin{equation*}
\begin{split}
\boldsymbol{p}_\Sigma^1 &= 0\\
\boldsymbol{p}_\Sigma^{k+1} &\leftarrow (1-c_\Sigma)\boldsymbol{p}_\Sigma^k + h_\sigma \sqrt{c_\Sigma (2-c_\Sigma) \mu_{\textrm{eff}}}\boldsymbol{\delta}_w
\end{split}
\end{equation*}

where
\begin{equation*}
h_\sigma =
\begin{cases}
1 & \textrm{if~} \frac{||\boldsymbol{p}_\sigma||}{\sqrt{1-(1-c_\sigma)^{2(k+1)}}} < \big(1.4 + \frac{2}{n+1}\big) \mathbb{E}||\mathcal{N}(0, \boldsymbol{I})||\\
0 & \textrm{otherwise}
\end{cases}
\end{equation*}

The update requires the adjusted weights $\boldsymbol{w}$:

\begin{equation*}
w_i^0 =
\begin{cases}
w_i & \textrm{if~} w_i \geq 0 \\
\frac{nw_i}{||\Sigma^{-1/2}\boldsymbol{\delta}^i||^2} & \textrm{otherwise}
\end{cases}
\end{equation*}

\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
The covariance update is then
\begin{equation*}
\Sigma^{k+1} \leftarrow [1 + c_1 c_\Sigma(1-h_\sigma)(2 - c_\Sigma) - c_1 - c_\mu]\Sigma^k
+ c_1 \boldsymbol{p}_\Sigma \boldsymbol{p}_\Sigma^T + c_\mu \sum_{i=1}^\mu w_i^0 \boldsymbol{\delta}^i (\boldsymbol{\delta}^i)^T
\end{equation*}

The constants have the following recommended values
\begin{equation*}
\begin{split}
c_\Sigma &= \frac{4+\mu_{\textrm{eff}}/n}{n+4+2\mu_{\textrm{eff}}/n}\\
c_1 &= \frac{2}{(n+1.3)^2 + \mu_{\textrm{eff}}} \\
c_\mu &= \min \bigg( 1-c_1, 2\frac{\mu_{\textrm{eff}}-2+1/\mu_{\textrm{eff}}} {(n+2)^2 + \mu_{\textrm{eff}} } \bigg)
\end{split}
\end{equation*}

\end{frame}

\begin{frame}{Covariance Matrix Adaptation}
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/CMA.jpeg}
\end{figure}
\end{frame}


\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Stochastic methods employ random numbers during the optimization process.
\item Simulated annealing uses a temperature that controls random exploration and which is reduced over time to converge on a local minimum.
\item The cross-entropy method and evolution strategies maintain proposal distributions from which they sample in order to inform updates.
\item Covariance matrix adaptation is a robust and sample-efficient optimizer that maintains a multivariate Gaussian proposal distribution with a full covariance matrix.
323 | \end{itemize} 324 | \end{frame} 325 | \end{document} 326 | 327 | -------------------------------------------------------------------------------- /Lecture_notes/11_evolutinary_methods.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 

%\usecolortheme{albatross}
%\usecolortheme{beaver}
%\usecolortheme{beetle}
%\usecolortheme{crane}
%\usecolortheme{dolphin}
%\usecolortheme{dove}
%\usecolortheme{fly}
%\usecolortheme{lily}
%\usecolortheme{orchid}
%\usecolortheme{rose}
%\usecolortheme{seagull}
%\usecolortheme{seahorse}
%\usecolortheme{whale}
%\usecolortheme{wolverine}

%\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line

\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
}

\usepackage{graphicx} % Allows including images
\usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
%\usepackage {tikz}
\usepackage{tkz-graph}
\GraphInit[vstyle = Shade]
\tikzset{
LabelStyle/.style = { rectangle, rounded corners, draw,
                      minimum width = 2em, fill = yellow!50,
                      text = red, font = \bfseries },
VertexStyle/.append style = { inner sep=5pt,
                              font = \normalsize\bfseries},
EdgeStyle/.append style = {->, bend left} }
\usetikzlibrary {positioning}
%\usepackage {xcolor}
\definecolor {processblue}{cmyk}{0.96,0,0,0}
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

\title[Evolutionary Methods]{Numerical Optimization 11: Evolutionary Methods} %

\author{Qiang Zhu} % Your name
\institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
{
University of Nevada Las Vegas\\ % Your institution for the title page
\medskip
}
\date{\today} % Date, can be changed to a custom date

\begin{document}

\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}

\begin{frame}
\frametitle{Overview} % Table of contents slide, comment this block out to remove it
\tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
\end{frame}

%----------------------------------------------------------------------------------------
%	PRESENTATION SLIDES
%----------------------------------------------------------------------------------------

%------------------------------------------------

\section{Population Methods}
\begin{frame}{Population Methods}
Previous lectures discussed methods that incrementally improve a single design point; population methods instead work with a collection of design points, called individuals. Having a large number of individuals distributed throughout the design space can help the algorithm avoid becoming stuck in a local minimum. Information at different points in the design space can be shared between individuals to globally optimize the objective function. Most population methods are stochastic in nature, and it is generally easy to parallelize the computation.

These methods typically have the following steps
\begin{itemize}
\item Initialization
\item Encoding
\item Mutation
\item Crossover
\item Selection
\end{itemize}

\end{frame}

\section{Initialization}
\begin{frame}{Initialization}
Population methods begin with an initial population, just as descent methods require an initial design point. The initial population should be spread over the design space to increase the chances that the samples are close to the best regions.
The following strategies can be applied
\begin{itemize}
\item Uniform distribution in a bounded region
\item Multivariate normal distribution centered over a region of interest
\item Cauchy distribution, which has an unbounded variance and can cover a much broader space
\end{itemize}

\begin{figure}
\centering
\includegraphics[width=60mm]{Figs/cauchy.jpeg}
\end{figure}
\end{frame}


\section{Genetic Algorithm}
\begin{frame}{Chromosomes}
There are several ways to represent chromosomes. The simplest is the binary string chromosome, a representation that is similar to the way DNA is encoded.
\end{frame}

\begin{frame}{Selection}
Selection is the process of choosing chromosomes to use as parents for the next generation. For a population with $m$ chromosomes, a selection method will produce a list of $m$ parental pairs for the $m$ children of the next generation. The selected pairs may contain duplicates. Common schemes (a Julia sketch follows on the next slide):
\begin{itemize}
\item Truncation: choose uniformly at random from among the best $k$ chromosomes
\item Tournament: take the fittest out of $k$ randomly chosen chromosomes
\item Roulette wheel: choose with a probability proportional to fitness
\end{itemize}

\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/selection.jpeg}
\end{figure}
\end{frame}
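\begin{frame}[fragile]{Selection: a Julia sketch}
A minimal Julia sketch of truncation and tournament selection (illustrative; \texttt{fitness} is an assumed vector in which lower is better):
\begin{lstlisting}
# Truncation selection: pick a random parent among the best k.
function truncation_select(fitness, k)
    best = sortperm(fitness)[1:k]      # indices of the k fittest
    return rand(best)
end

# Tournament selection: fittest of k randomly chosen individuals.
function tournament_select(fitness, k)
    contenders = rand(1:length(fitness), k)
    return contenders[argmin(fitness[contenders])]
end
\end{lstlisting}
Calling either function twice produces one parental pair.
\end{frame}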
\section{Covariance Matrix Adaptation}
\begin{frame}{Covariance Matrix Adaptation}
Covariance matrix adaptation maintains a mean vector $\boldsymbol{\mu}$, a covariance matrix $\boldsymbol{\Sigma}$, and an additional step-size scalar $\sigma$. The covariance matrix only increases or decreases in a single direction with every iteration, whereas the step-size scalar is adapted to control the overall spread of the distribution. At every iteration, $m$ designs are sampled from the multivariate Gaussian
\begin{equation*}
\boldsymbol{x} \sim \mathcal{N} (\boldsymbol{\mu}, \sigma^2 \Sigma)
\end{equation*}

The designs are then sorted according to their objective function values such that $f(\boldsymbol{x}^1) \leq f(\boldsymbol{x}^2) \leq \cdots \leq f(\boldsymbol{x}^m)$. A new mean vector $\boldsymbol{\mu}^{k+1}$ is formed using a weighted average of the sampled designs:

\begin{gather*}
\boldsymbol{\mu}^{k+1} \leftarrow \sum_{i=1}^m w_i \boldsymbol{x}^i\\
\sum_{i=1}^m w_i = 1 ~~~~ w_1>w_2>\cdots>w_m>0
\end{gather*}

\end{frame}


\section{Particle Swarm Optimization}
\begin{frame}{Particle Swarm Optimization}
Particle swarm optimization introduces momentum to accelerate convergence toward minima. Each individual (particle) in the population keeps track of its current position, velocity, and the best position it has seen so far. Momentum allows an individual to accumulate speed in a favorable direction, independent of local perturbations.

\begin{equation*}
\begin{split}
\boldsymbol{x}^i & \leftarrow \boldsymbol{x}^i + \boldsymbol{v}^i \\
\boldsymbol{v}^i & \leftarrow w\boldsymbol{v}^i + c_1 r_1 (\boldsymbol{x}_{\textrm{lbest}}^i - \boldsymbol{x}^i) + c_2 r_2(\boldsymbol{x}_{\textrm{gbest}} - \boldsymbol{x}^i)
\end{split}
\end{equation*}

where
\begin{itemize}
\item $\boldsymbol{x}_{\textrm{lbest}}^i$: the best location seen so far by particle $i$
\item $\boldsymbol{x}_{\textrm{gbest}}$: the best location found by the whole population
\item $w, c_1, c_2$: empirical parameters
\item $r_1, r_2$: random numbers drawn from $U(0, 1)$
\end{itemize}

\end{frame}

\begin{frame}{PSO search}
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/pso.jpeg}
\end{figure}
\end{frame}


\begin{frame}{Firefly Algorithm}
The firefly algorithm was inspired by the manner in which fireflies flash their lights to attract mates. In the firefly algorithm, each individual in the population is a firefly and can flash to attract other fireflies. At each iteration, all fireflies are moved toward all more attractive fireflies. A firefly $x_a$ is moved toward a firefly $x_b$ with greater attraction according to

\begin{equation*}
x_a \leftarrow x_a + \beta I (||x_b - x_a||)(x_b - x_a) + \alpha \epsilon
\end{equation*}

where $I$ is the intensity of the attraction, $\beta$ is the source intensity, $\epsilon$ is drawn from a zero-mean, unit-covariance multivariate Gaussian, and $\alpha$ scales the step size. When $\beta = 0$, the update reduces to a random walk. The resulting update is a random walk biased toward brighter fireflies.

The intensity $I$ decreases as the distance $r$ between the two fireflies increases and is defined to be 1 when $r$ = 0. It can be approximated as
\begin{equation*}
I(r) = e^{-\gamma r^2}
\end{equation*}

\end{frame}

\begin{frame}{Firefly search}
Firefly search with $\alpha$ = 0.5, $\beta$ = 1, and $\gamma$ = 0.1 applied to the Branin function.
\begin{figure}
\includegraphics[width=120mm]{Figs/firefly.jpeg}
\end{figure}

\end{frame}


\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Population methods use a collection of individuals in the design space to guide progression toward an optimum.
\item Genetic algorithms leverage selection, crossover, and mutations to produce better subsequent generations.
\item Particle swarm optimization and the firefly algorithm include rules and mechanisms for attracting design points to the best individuals in the population while maintaining suitable state space exploration.
\item Population methods can be extended with local search approaches to improve convergence.
269 | \end{itemize} 270 | \end{frame} 271 | \end{document} 272 | 273 | -------------------------------------------------------------------------------- /Lecture_notes/12_constrained_optimization.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 

%\usecolortheme{albatross}
%\usecolortheme{beaver}
%\usecolortheme{beetle}
%\usecolortheme{crane}
%\usecolortheme{dolphin}
%\usecolortheme{dove}
%\usecolortheme{fly}
%\usecolortheme{lily}
%\usecolortheme{orchid}
%\usecolortheme{rose}
%\usecolortheme{seagull}
%\usecolortheme{seahorse}
%\usecolortheme{whale}
%\usecolortheme{wolverine}

%\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line

\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
}

\usepackage{graphicx} % Allows including images
\usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
%\usepackage {tikz}
\usepackage{tkz-graph}
\GraphInit[vstyle = Shade]
\tikzset{
LabelStyle/.style = { rectangle, rounded corners, draw,
                      minimum width = 2em, fill = yellow!50,
                      text = red, font = \bfseries },
VertexStyle/.append style = { inner sep=5pt,
                              font = \normalsize\bfseries},
EdgeStyle/.append style = {->, bend left} }
\usetikzlibrary {positioning}
%\usepackage {xcolor}
\definecolor {processblue}{cmyk}{0.96,0,0,0}
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

\title[Constrained Optimization]{Numerical Optimization 12: Constrained Optimization} %

\author{Qiang Zhu} % Your name
\institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
{
University of Nevada Las Vegas\\ % Your institution for the title page
\medskip
}
\date{\today} % Date, can be changed to a custom date

\begin{document}

\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}

\begin{frame}
\frametitle{Overview} % Table of contents slide, comment this block out to remove it
\tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
\end{frame}

%----------------------------------------------------------------------------------------
%	PRESENTATION SLIDES
%----------------------------------------------------------------------------------------

%------------------------------------------------

\section{Constrained Optimization}
\begin{frame}{Constrained Optimization}
Some constraints are simply upper or lower bounds on the design variables, as we have seen in bracketed line search, in which $x$ must lie between $a$ and $b$. A bracketing constraint $x \in [a, b]$ can be replaced by two inequality constraints: $x \geq a$ and $x \leq b$.

\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/constraint-ab.jpeg}
\end{figure}

\end{frame}

\section{Constraints}
\begin{frame}{Constraints}
Constraints are not typically specified directly through a known feasible set $\mathcal{X}$.
Instead, the feasible set is typically formed from two types of constraints:

\begin{itemize}
\item equality constraints, $h(x)=0$
\item inequality constraints, $g(x) \leq 0$

\end{itemize}

Any optimization problem can be rewritten using these constraints
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\
{s.t.}~~~~ h_i(x) = 0 \\
~~~~~~~~~ g_j(x) \leq 0
\end{gather*}

\end{frame}

\section{Transformations to Remove Constraints}
\begin{frame}{Transformations to Remove Constraints}
In some cases, it may be possible to transform a problem so that constraints can be removed. For example, bound constraints $a \leq x \leq b$ can be removed by passing $x$ through a transform

\begin{equation*}
x = \frac{b+a}{2} + \frac{b-a}{2}\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg)
\end{equation*}

Below is an example (a code sketch follows on the next slide):
\begin{gather*}
~~~~~ \underset{x}{\min} ~ x\sin{x}\\
{s.t.}~~~~ 2\leq x \leq 6 \\
\end{gather*}
can be transformed to
\begin{gather*}
\underset{\hat{x}}{\min} ~ \bigg[4+2\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg)\bigg]
\sin \bigg[ 4 + 2\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg)\bigg]
\end{gather*}

\end{frame}
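\begin{frame}[fragile]{Removing bound constraints: a Julia sketch}
A minimal Julia sketch of the transform above (illustrative; the function names are assumptions):
\begin{lstlisting}
# Map an unconstrained xhat into the interval [a, b].
to_bounded(xhat, a, b) = (b + a)/2 + (b - a)/2 * (2xhat/(1 + xhat^2))

f(x) = x * sin(x)                            # original objective
fhat(xhat) = f(to_bounded(xhat, 2.0, 6.0))   # unconstrained surrogate

# Every xhat is feasible: e.g. fhat(0.0) == f(4.0)
\end{lstlisting}
Minimizing \texttt{fhat} over all of $\mathbb{R}$ is equivalent to minimizing $f$ over $[2, 6]$, since $2\hat{x}/(1+\hat{x}^2)$ ranges over $[-1, 1]$.
\end{frame}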
\section{Lagrange Multipliers}
\begin{frame}{Lagrange Multipliers}
The method of Lagrange multipliers is used to optimize a function subject to equality
constraints.
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\
{s.t.}~~~~ h(x) = 0
\end{gather*}

where $f$ and $h$ have continuous partial derivatives.

We can formulate the Lagrangian, which is a function of the design variables and the multiplier $\lambda$,
\begin{gather*}
\mathcal{L}(x, \lambda) = f(x) - \lambda h(x)
\end{gather*}

We then solve $\nabla \mathcal{L}(x, \lambda) = 0$. Specifically, $\nabla_x \mathcal{L} = 0$ gives us the condition $\nabla f= \lambda \nabla h$, and $\nabla_\lambda \mathcal{L}=0$ gives us $h(x)=0$. Any solution is considered a critical point.

\end{frame}

\begin{frame}{Lagrange Multipliers applied to a single equality condition}
The method of Lagrange multipliers is used to optimize a function subject to equality
constraints.
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] \\
{s.t.}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ x_1 - x_2^2 = 0
\end{gather*}

We can formulate the Lagrangian,
\begin{equation*}
\mathcal{L}(x, \lambda) = -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] + \lambda(x_1 - x_2^2)
\end{equation*}
We compute
\begin{itemize}
\item $\frac{\partial \mathcal{L}}{\partial x_1}$
\item $\frac{\partial \mathcal{L}}{\partial x_2}$
\item $\frac{\partial \mathcal{L}}{\partial \lambda}$
\end{itemize}

\end{frame}


\begin{frame}{Lagrange Multipliers applied to multiple equality conditions}
The method extends directly to multiple equality constraints, with one multiplier per constraint:
\begin{gather*}
~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x}) \\
{s.t.}~~~~ h_i(\boldsymbol{x}) = 0 ~~\textrm{for~} i \in \{1, \cdots, \ell\}
\end{gather*}

We can formulate the generalized Lagrangian,
\begin{equation*}
\mathcal{L}(\boldsymbol{x}, \boldsymbol{\lambda}) = f(\boldsymbol{x}) - \sum_{i=1}^{\ell} \lambda_i h_i(\boldsymbol{x})
\end{equation*}
Setting $\nabla_{\boldsymbol{x}} \mathcal{L} = 0$ gives $\nabla f = \sum_i \lambda_i \nabla h_i$, and setting $\nabla_{\lambda_i} \mathcal{L} = 0$ recovers each constraint $h_i(\boldsymbol{x}) = 0$.

\end{frame}

\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Constraints are requirements on the design points that a solution must satisfy.
\item Some constraints can be transformed or substituted into the problem to result in an unconstrained optimization problem.
\item Analytical methods using Lagrange multipliers yield the generalized Lagrangian and the necessary conditions for optimality under constraints.
\item A constrained optimization problem has a dual problem formulation that is easier to solve and whose solution is a lower bound of the solution to the original problem.
\end{itemize}
\end{frame}
\end{document}

--------------------------------------------------------------------------------
/Lecture_notes/13_sampling_plans.tex:
--------------------------------------------------------------------------------
\documentclass{beamer}
\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
%numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}

\lstset{style=mystyle}

\mode<presentation> {

% The Beamer class comes with a number of default slide themes
% which change the colors and layouts of slides. Below this is a list
% of all the themes, uncomment each in turn to see what they look like.

%\usetheme{default}
\usetheme{AnnArbor}
%\usetheme{Antibes}
%\usetheme{Bergen}
%\usetheme{Berkeley}
%\usetheme{Berlin}
%\usetheme{Boadilla}
%\usetheme{CambridgeUS}
%\usetheme{Copenhagen}
%\usetheme{Darmstadt}
%\usetheme{Dresden}
%\usetheme{Frankfurt}
%\usetheme{Goettingen}
%\usetheme{Hannover}
%\usetheme{Ilmenau}
%\usetheme{JuanLesPins}
%\usetheme{Luebeck}
%\usetheme{Madrid}
%\usetheme{Malmoe}
%\usetheme{Marburg}
%\usetheme{Montpellier}
%\usetheme{PaloAlto}
%\usetheme{Pittsburgh}
%\usetheme{Rochester}
%\usetheme{Singapore}
%\usetheme{Szeged}
%\usetheme{Warsaw}

% As well as themes, the Beamer class has a number of color themes
% for any slide theme. Uncomment each of these in turn to see how it
% changes the colors of your current slide theme.

%\usecolortheme{albatross}
%\usecolortheme{beaver}
%\usecolortheme{beetle}
%\usecolortheme{crane}
%\usecolortheme{dolphin}
%\usecolortheme{dove}
%\usecolortheme{fly}
%\usecolortheme{lily}
%\usecolortheme{orchid}
%\usecolortheme{rose}
%\usecolortheme{seagull}
%\usecolortheme{seahorse}
%\usecolortheme{whale}
%\usecolortheme{wolverine}

%\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line
\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line

\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line
}

\usepackage{graphicx} % Allows including images
\usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables
%\usepackage {tikz}
\usepackage{tkz-graph}
\GraphInit[vstyle = Shade]
\tikzset{
LabelStyle/.style = { rectangle, rounded corners, draw,
                      minimum width = 2em, fill = yellow!50,
                      text = red, font = \bfseries },
VertexStyle/.append style = { inner sep=5pt,
                              font = \normalsize\bfseries},
EdgeStyle/.append style = {->, bend left} }
\usetikzlibrary {positioning}
%\usepackage {xcolor}
\definecolor {processblue}{cmyk}{0.96,0,0,0}
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

\title[Sampling Plans]{Numerical Optimization 13: Sampling Plans} %

\author{Qiang Zhu} % Your name
\institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space
{
University of Nevada Las Vegas\\ % Your institution for the title page
\medskip
}
\date{\today} % Date, can be changed to a custom date

\begin{document}

\begin{frame}
\titlepage % Print the title page as the first slide
\end{frame}

\begin{frame}
\frametitle{Overview} % Table of contents slide, comment this block out to remove it
\tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation
\end{frame}

%----------------------------------------------------------------------------------------
%	PRESENTATION SLIDES
%----------------------------------------------------------------------------------------

%------------------------------------------------

\section{Sampling}
\begin{frame}{Optimization with expensive function evaluations}
For many optimization problems, function evaluations can be quite expensive.
\begin{itemize}
\item an aircraft design may require a wind tunnel test
\item deep learning hyperparameters may require a week of GPU training
\item $\cdots$
\end{itemize}
A common approach for optimizing in these contexts is to build a \textcolor{blue}{surrogate model}. Further evaluations of the true objective function can be used to improve the model.
Fitting such models requires an initial set of points, ideally points that are space-filling; that is, points that cover the region as well as possible.
\end{frame}

\section{Full Factorial}
\begin{frame}{Full Factorial}
The full factorial sampling plan places a grid of evenly spaced points over the search space.
\begin{itemize}
\item a lower/upper-bound vector $a, b$ such that $a_i \leq x_i \leq b_i$
\item $m_i$ samples in each $x_i$ separated by a distance $(b_i-a_i)/(m_i-1)$
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=80mm]{Figs/grid_search.jpeg}
\end{figure}

\end{frame}

\section{Random Sampling}
\begin{frame}{Random Sampling}
A straightforward alternative to full factorial sampling is to draw $m$ random samples over the design space. Each sample is typically drawn independently, for example from a uniform distribution over the bounded search region, so the number of samples is decoupled from the number of dimensions.

\end{frame}


\section{Uniform Projection Plans}
\begin{frame}{Uniform Projection Plans}
A uniform projection plan with $m$ samples on an $m \times m$ grid can be constructed using an $m$-element permutation. There are therefore $m!$ possible uniform projection plans.
\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/uni-proj.jpeg}
\end{figure}
\end{frame}

\section{Stratified Sampling}
\begin{frame}{Stratified Sampling}
Stratified sampling modifies any grid-based sampling plan, including full factorial and uniform projection plans. Cells are sampled at a point chosen uniformly at random from within the cell rather than at the cell's center.
\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/strafied.jpeg}
\end{figure}
\end{frame}
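\begin{frame}[fragile]{Uniform projection and stratification: a Julia sketch}
A minimal Julia sketch of the two plans above for the unit square (illustrative; the function names are assumptions):
\begin{lstlisting}
using Random   # for randperm

# Uniform projection plan: one sample per row and per column of an
# m x m grid, obtained from a random permutation.
function uniform_projection(m)
    p = randperm(m)
    return [((i - 0.5)/m, (p[i] - 0.5)/m) for i in 1:m]   # cell centers
end

# Stratified variant: jitter each sample uniformly within its cell.
function stratified_projection(m)
    p = randperm(m)
    return [((i - rand())/m, (p[i] - rand())/m) for i in 1:m]
end
\end{lstlisting}
\end{frame}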
\section{Space-Filling Metrics}
\begin{frame}{Space-Filling Metrics}
A good sampling plan fills the design space since the ability for a surrogate model to generalize from samples decays with the distance from those samples. Not all plans, even uniform projection plans, are equally good at covering the search space.
\begin{itemize}
\item Discrepancy: the maximum difference between the fraction of samples in a hyper-rectangular subset $H$ and that subset's volume:
\begin{equation*}
d(X) = \underset{H}{\sup} \bigg|\frac{\#(X\cap H)}{\#X} - \lambda(H)\bigg|
\end{equation*}
where $\#X$ is the number of points in $X$, $\#(X \cap H)$ is the number of those points that lie in $H$, and $\lambda(H)$ is the volume of $H$.
\item Pairwise distances between all points within each sampling plan

\end{itemize}


\end{frame}

\section{Quasi-Random Sequences}
\begin{frame}{Quasi-Random Sequences}
Quasi-random sequences are often used in the context of trying to approximate an integral over a multidimensional space:
\begin{equation*}
\int_\chi f(\boldsymbol{x})d\boldsymbol{x} \approx \frac{v}{m}\sum_{i=1}^m f(\boldsymbol{x}^i)
\end{equation*}
where each $\boldsymbol{x}^i$ is sampled uniformly at random over the domain $\chi$ and $v$ is the volume of $\chi$.

Quasi-random sequences are deterministic sequences that fill the space in a systematic manner so that the integral converges as fast as possible in the number of points $m$. They are typically constructed for the unit $n$-dimensional hypercube with the following methods.
\begin{itemize}
\item Additive Recurrence
\item Halton Sequence
\item Sobol Sequence
\end{itemize}
\end{frame}

\begin{frame}{Additive Recurrence}
An additive recurrence generates each new point from the previous one:
\begin{equation*}
x^{k+1} = x^k + c ~~~(\mod 1)
\end{equation*}
Such sequences produce space-filling sets provided that $c$ is irrational. A common choice is
\begin{equation*}
c = \varphi - 1 = \frac{\sqrt{5}-1}{2} \approx 0.618
\end{equation*}
where $\varphi$ is the golden ratio.
We can construct a space-filling set over $n$ dimensions using an additive recurrence sequence for each coordinate, each with its own value of $c$. The square roots of the primes are known to be irrational, and can thus be used to obtain different sequences for each coordinate:
\begin{equation*}
c_1 =\sqrt{2}, ~c_2 =\sqrt{3}, ~c_3 =\sqrt{5}, ~c_4 =\sqrt{7}, ~c_5 =\sqrt{11}, \cdots
\end{equation*}
\end{frame}

\begin{frame}{Halton Sequence}
\textcolor{blue}{Radical Inversion}
\begin{equation*}
\begin{split}
i & = \sum_{k=0}^{M-1} a_k(i)b^k \\
\Psi_{b, C}(i) &= (b^{-1}, \cdots, b^{-M}) \big[C (a_0(i), \cdots, a_{M-1}(i) )^T\big]
\end{split}
\end{equation*}
where $b$ is the \textcolor{blue}{base number}, and $C$ is the \textcolor{blue}{generator matrix}. When $C$ is the identity matrix, the result is the \textcolor{blue}{van der Corput sequence}:
\begin{itemize}
\item $b$ = 2
\begin{equation*}
X = \bigg\{ \frac{1}{2}, \frac{1}{4}, \frac{3}{4}, \frac{1}{8}, \frac{5}{8}, \frac{3}{8}, \frac{7}{8}, \frac{1}{16}, \cdots \bigg\}
\end{equation*}
\item $b$ = 5
\begin{equation*}
X = \bigg\{ \frac{1}{5}, \frac{2}{5}, \frac{3}{5}, \frac{4}{5}, \frac{1}{25}, \frac{6}{25}, \frac{11}{25}, \cdots \bigg\}
\end{equation*}
\end{itemize}

The Halton sequence uses coprime bases for the different coordinates so that the resulting sequences are uncorrelated.

\end{frame}

\begin{frame}{Sobol Sequence}
In the Sobol sequence, each dimension uses base 2 with a different generator matrix $C$.
\begin{figure}
\centering
\includegraphics[width=120mm]{Figs/sample_all.jpeg}
\end{figure}



\end{frame}


\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Sampling plans are used to cover search spaces with a limited number of points.
\item Full factorial sampling, which involves sampling at the vertices of a uniformly discretized grid, requires a number of points exponential in the number of dimensions.
\item Uniform projection plans, which project uniformly over each dimension, can be efficiently generated and can be optimized to be space filling.
\item Quasi-random sequences are deterministic procedures by which space-filling sampling plans can be generated.
281 | \end{itemize} 282 | 283 | \end{frame} 284 | \end{document} 285 | 286 | -------------------------------------------------------------------------------- /Lecture_notes/14_surrogate_models.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Surrogate models]{Numerical Optimization 14: Surrogate models} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Surrogate Models} 139 | \begin{frame}{Surrogate Models} 140 | The \textcolor{blue}{surrogate models} are designed to be smooth and inexpensive to evaluate so that they can be efficiently optimized from the given sampling points. A surrogate model $\hat{f}$ parameterized by $\theta$ is designed to mimic the true objective function $f$. The parameters $\theta$ can be adjusted to fit the model based on samples collected from $f$. 

Suppose we have
\begin{itemize}
\item $m$ design points: $\{x^1, x^2, \cdots, x^m\}$
\item associated function evaluations: $\{y^1, y^2, \cdots, y^m\}$
\end{itemize}
For a particular set of parameters, the model will predict
\begin{equation*}
\hat{\boldsymbol{y}} = \{\hat{f}_\theta(x^1), \hat{f}_\theta(x^2), \cdots, \hat{f}_\theta(x^m)\}
\end{equation*}

In turn, fitting the model is a minimization problem
\begin{equation*}
\underset{\theta}{\min} ~ ||\boldsymbol{y}-\hat{\boldsymbol{y}}||
\end{equation*}
\end{frame}

\section{Linear Models}
\begin{frame}{Linear Models}
A simple surrogate model is the linear model, which has the form

\begin{gather*}
\hat{f} = w_0 + \boldsymbol{w}^T x ~~~~~~~~~~~~ \theta= \{w_0, \boldsymbol{w}\}
\end{gather*}
For an $n$-dimensional design space, the linear model has $n+1$ parameters, and thus requires at least $n+1$ samples to fit unambiguously.

Instead of having both $\boldsymbol{w}$ and $w_0$ as parameters, it is common to construct a single vector of parameters $\boldsymbol{\theta} = [w_0, \boldsymbol{w}]$ and prepend 1 to the vector $\boldsymbol{x}$ to get
\begin{equation*}
\hat{f} = \boldsymbol{\theta}^T \boldsymbol{x}
\end{equation*}

Finding an optimal $\boldsymbol{\theta}$ requires solving a linear regression problem:
\begin{equation*}
\underset{\theta}{\min}~||\boldsymbol{y}-\hat{\boldsymbol{y}}|| ~~\textrm{or}~~ ||\boldsymbol{y}-\boldsymbol{X\theta}||
\end{equation*}
where $\boldsymbol{X}$ is a design matrix, $[(\boldsymbol{x}^1)^T; \cdots; (\boldsymbol{x}^m)^T]$

\end{frame}

\section{Basis Functions}
\begin{frame}{Basis Functions}
The linear model is a linear combination of the components of $\boldsymbol{x}$:
\begin{equation*}
\hat{f}(\boldsymbol{x}) = \theta_1 x_1 + \cdots + \theta_n x_n = \sum_{i=1}^n \theta_i x_i = \boldsymbol{\theta}^T \boldsymbol{x}
\end{equation*}
which is a specific example of a more general linear combination of basis functions
\begin{equation*}
\hat{f}(\boldsymbol{x}) = \theta_1 b_1(\boldsymbol{x}) + \cdots + \theta_q b_q(\boldsymbol{x}) = \sum_{i=1}^q \theta_i b_i(\boldsymbol{x}) = \boldsymbol{\theta}^T \boldsymbol{b}(\boldsymbol{x})
\end{equation*}

Linear models cannot capture nonlinear relations. There are a variety of other families of basis functions that can represent more expressive surrogate models. The remainder of this section discusses a few common families.

\end{frame}

\begin{frame}{Polynomial Basis Functions}
Polynomial basis functions consist of a product of design vector components, each raised to a power. Linear basis functions are a special case of polynomial basis functions.

In one dimension, a polynomial model of degree $k$ has the form
\begin{equation*}
\hat{f}(x) = \theta_0 + \theta_1 x + \theta_2 x^2 + \cdots + \theta_k x^k = \sum_{i=0}^k \theta_i x^i
\end{equation*}

In two dimensions, a polynomial model of degree $k$ has basis functions of the form
\begin{equation*}
b_{ij}(\boldsymbol{x}) = x_1^i x_2^j ~~{\textrm{for~}} i, j \in \{0, \cdots, k\}, i+j \leq k
\end{equation*}
\end{frame}
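\begin{frame}[fragile]{Polynomial fit: a Julia sketch}
A minimal Julia sketch of fitting a degree-$k$ polynomial surrogate by least squares (illustrative; \texttt{xs} and \texttt{ys} are assumed sample vectors):
\begin{lstlisting}
# Design matrix whose columns are the basis functions 1, x, ..., x^k.
design_matrix(xs, k) = [x^i for x in xs, i in 0:k]

function fit_polynomial(xs, ys, k)
    B = design_matrix(xs, k)
    return B \ ys   # least-squares solution of min ||y - B*theta||
end

theta = fit_polynomial([0.0, 0.5, 1.0, 1.5], [1.0, 0.9, 1.4, 2.1], 2)
\end{lstlisting}
The backslash operator solves the least-squares problem in a numerically stable way.
\end{frame}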
\begin{frame}{Sinusoidal Basis Functions}
Any continuous function over a finite domain can be represented using an infinite set of sinusoidal basis functions. A Fourier series can be constructed for any integrable univariate function $f$ on an interval $[a, b]$

\begin{equation*}
f(x) = \frac{\theta_0}{2} + \sum_{i=1}^\infty \bigg[ \theta_i^{\sin} \sin \bigg(\frac{2\pi ix}{b-a}\bigg)
+ \theta_i^{\cos} \cos \bigg(\frac{2\pi ix}{b-a}\bigg) \bigg]
\end{equation*}

where
\begin{equation*}
\begin{split}
\theta_0 &= \frac{2}{b-a} \int_a^b f(x)dx \\
\theta_i^{\sin}&= \frac{2}{b-a} \int_a^b f(x)\sin \bigg(\frac{2\pi ix}{b-a}\bigg) dx \\
\theta_i^{\cos}&= \frac{2}{b-a} \int_a^b f(x)\cos \bigg(\frac{2\pi ix}{b-a}\bigg) dx
\end{split}
\end{equation*}

\end{frame}


\begin{frame}{Radial Basis Functions}
A radial function $\psi$ is one which depends only on the distance of a point from some center point $\boldsymbol{c}$, such that it can be written $\psi(\boldsymbol{x}, \boldsymbol{c}) = \psi(||\boldsymbol{x} - \boldsymbol{c}||) = \psi(r)$.

\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/rbf.jpeg}
\end{figure}

\end{frame}

\section{Fitting Noisy Objective Functions}
\begin{frame}{Fitting Noisy Objective Functions}
Models fit using regression will pass as close as possible to every design point. When the objective function evaluations are noisy, complex models are likely to excessively contort themselves to pass through every point. However, smoother fits are often better predictors of the true underlying objective function. A regularization term is added in addition to the prediction error in order to give preference to solutions with lower weights. The resulting basis regression problem with $L_2$ regularization is:
\begin{equation*}
\underset{\theta}{\min} ||\boldsymbol{y-B\theta}||^2 + \lambda ||\boldsymbol{\theta}||^2_2
\end{equation*}

The optimal parameter vector is given by:
\begin{equation*}
\boldsymbol{\theta} = (\boldsymbol{B}^T\boldsymbol{B} + \lambda {\bf I})^{-1} \boldsymbol{B}^T\boldsymbol{y}
\end{equation*}

where \textbf{I} is the identity matrix.
\end{frame}
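\begin{frame}[fragile]{Regularized fit: a Julia sketch}
A minimal Julia sketch of the $L_2$-regularized solution above (illustrative; \texttt{B} is the basis matrix and \texttt{y} the observations):
\begin{lstlisting}
using LinearAlgebra   # for the identity operator I

# Ridge regression: theta = (B'B + lam*I)^(-1) * B'y
ridge(B, y, lam) = (B'B + lam*I) \ (B'y)
\end{lstlisting}
Larger \texttt{lam} shrinks the weights and smooths the fit; $\lambda = 0$ recovers ordinary least squares.
\end{frame}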
\section{Model Selection}

\begin{frame}{Model Selection}
So far, we have discussed how to fit a particular model to data. We generally want to minimize generalization error, which is a measure of the error of the model on the full design space, including points that may not be included in the data used to train the model. One way to measure generalization error is to use the expected squared error of its predictions:
\begin{equation*}
\epsilon_{\textrm{gen}} = \mathbb{E}_{\boldsymbol{x} \sim \boldsymbol{\chi}} \bigg[\bigg(f(x)-\hat{f}(x)\bigg)^2\bigg]
\end{equation*}

which is impossible to compute exactly, since it requires knowing the function we are trying to approximate. It may be tempting to estimate the generalization error of a model from the training error by using the mean squared error (MSE) of the model evaluated on the $m$ samples:
\begin{equation*}
\epsilon_{\textrm{train}} = \frac{1}{m} \sum_{i=1}^m \bigg(f(x^i)-\hat{f}(x^i)\bigg)^2
\end{equation*}

\end{frame}

\begin{frame}{Holdout}
A simple approach to estimating the generalization error is the holdout method, which partitions the available data into a test set $D_h$ with $h$ samples and a training set $D_t$ consisting of all remaining $m-h$ samples. The training set is used to fit model parameters. The held-out test set is not used during model fitting, and can thus be used to estimate the generalization error. Different split ratios are used, typically ranging from 50\% train, 50\% test to 90\% train, 10\% test, depending on the size and nature of the dataset. Using too few samples for training can result in poor fits, whereas using too many will result in poor generalization estimates.
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/holdout.jpeg}
\end{figure}

\end{frame}



\begin{frame}{Cross validation}
Here, the original dataset $D$ is randomly partitioned into $k$ sets $D_1, D_2, \cdots, D_k$ of equal, or approximately equal, size. We then train $k$ models, one on each subset of $k-1$ sets, and we use the withheld set to estimate the generalization error. The cross-validation estimate of generalization error is the mean generalization error over all folds.
\begin{figure}
\centering
\includegraphics[width=100mm]{Figs/cross-valid.jpeg}
\end{figure}
\end{frame}

\section{Summary}
\begin{frame}{Summary}
\begin{itemize}
\item Surrogate models are function approximations that can be optimized instead of the true, potentially expensive objective function.
\item Many surrogate models can be represented using a linear combination of basis functions.
\item Model selection involves a bias-variance trade-off between models with low complexity that cannot capture important trends and models with high complexity that overfit to noise.
\item Generalization error can be estimated using techniques such as holdout, $k$-fold cross validation, and the bootstrap.
\end{itemize}
\end{frame}
\end{document}

--------------------------------------------------------------------------------
/Lecture_notes/16_surrogate_optimization.tex:
--------------------------------------------------------------------------------
\documentclass{beamer}
\usepackage{amsmath}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
%numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}

\lstset{style=mystyle}

\mode<presentation> {

% The Beamer class comes with a number of default slide themes
% which change the colors and layouts of slides. Below this is a list
% of all the themes, uncomment each in turn to see what they look like.
38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme. 70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Surrogate Optimization]{Numerical Optimization 16: Surrogate Optimization} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | 
%---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Prediction-Based Exploration} 139 | \begin{frame}{Prediction-Based Exploration} 140 | A Gaussian process provides probability distributions over the true objective function. These distributions can be used to guide an optimization process toward better design points. In prediction-based exploration, we select the minimizer of the surrogate function. If we use a Gaussian process surrogate model, prediction-based optimization has us select the minimizer of the mean function 141 | \begin{equation*} 142 | \boldsymbol{x}^{m+1} = \underset{\boldsymbol{x}\in \chi}{\arg\min}~~ \hat{\boldsymbol{\mu}}(\boldsymbol{x}) 143 | \end{equation*} 144 | 145 | where $\hat{\boldsymbol{\mu}}(\boldsymbol{x})$ is the predicted mean of a Gaussian process at a design point $\boldsymbol{x}$ based on the previous $m$ design points. 146 | 147 | Prediction-based optimization does not take uncertainty into account: new samples can be generated very close to existing samples, which wastes expensive function evaluations. 148 | \end{frame} 149 | 150 | \section{Error-Based Exploration} 151 | \begin{frame}{Error-Based Exploration} 152 | Error-based exploration seeks to increase confidence in the true function. A Gaussian process can tell us both the mean and standard deviation at every point. The next sample point is: 153 | \begin{equation*} 154 | \boldsymbol{x}^{m+1} = \underset{\boldsymbol{x}\in \chi}{\arg\max}~~ \hat{\boldsymbol{\sigma}}(\boldsymbol{x}) 155 | \end{equation*} 156 | where $\hat{\boldsymbol{\sigma}}(\boldsymbol{x})$ is the predicted standard deviation of a Gaussian process at a design point $\boldsymbol{x}$ based on the previous $m$ design points. 157 | 158 | \begin{figure} 159 | \centering 160 | \includegraphics[width=120mm]{Figs/error-explore.jpeg} 161 | \end{figure} 162 | \end{frame} 163 | 164 | 165 | \section{Lower Confidence Bound Exploration} 166 | \begin{frame}{Lower Confidence Bound Exploration} 167 | Error-based exploration may sample regions that are unpromising. Lower confidence bound exploration trades off between the greedy minimization employed by prediction-based optimization and the uncertainty reduction employed by error-based exploration with the following strategy, 168 | 169 | \begin{gather*} 170 | LB(x) = \hat{\mu}(\boldsymbol{x})-\alpha \hat{\sigma}(\boldsymbol{x}) 171 | \end{gather*} 172 | where $\alpha \geq 0$ controls the trade-off between exploration and exploitation. 173 | \begin{figure} 174 | \centering 175 | \includegraphics[width=120mm]{Figs/LB.jpeg} 176 | \end{figure} 177 | 178 | \end{frame} 179 | 180 | \section{Probability of Improvement Exploration} 181 | \begin{frame}{Probability of Improvement Exploration} 182 | We select the design point that maximizes the chance that the new point will be better than any other. The improvement for a function sampled at $x$ producing $y=f(x)$ is 183 | \begin{equation*} 184 | I(y) = 185 | \begin{cases} 186 | y_{\min} - y & \textrm{if~} y < y_{\min}\\ 187 | 0 & {\textrm{otherwise}} 188 | \end{cases} 189 | \end{equation*} 190 | The probability of improvement when $\hat{\sigma} > 0$ is 191 | \begin{equation*} 192 | P(y < y_{\min}) = \int_{-\infty}^{y_{\min}} \mathcal{N}(y|\hat{\mu},\hat{\sigma}^2)\,dy = \Phi\bigg(\frac{y_{\min}-\hat{\mu}}{\hat{\sigma}}\bigg) 193 | \end{equation*} 194 | where $\Phi$ is the standard normal cumulative distribution function. 195 | \end{frame} 196 | 197 | \section{Expected Improvement Exploration} 198 | \begin{frame}{Expected Improvement Exploration} 199 | We can also select the design point that maximizes the expected improvement over the current best function value. Writing $y = \hat{\mu} + \hat{\sigma}z$ with $z \sim \mathcal{N}(0,1)$ and $z' = (y_{\min} - \hat{\mu})/\hat{\sigma}$, the improvement is 200 | \begin{equation*} 201 | I(y) = 202 | \begin{cases} 203 | (y_{\min} - \hat{\mu}) - \hat{\sigma}z & \textrm{if~} z < z' \textrm{~and~} \hat{\sigma} > 0 \\ 204 | 0 & {\textrm{otherwise}}\\ 205 | \end{cases} 206 | \end{equation*} 207 | where $\hat{\mu}$ and $\hat{\sigma}$ are the predicted mean and standard deviation.
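Taking the expectation of $I(y)$ under the predicted distribution $\mathcal{N}(\hat{\mu}, \hat{\sigma}^2)$ and substituting $y = \hat{\mu} + \hat{\sigma}z$ gives
\begin{equation*}
\mathbb{E}[I(y)] = \int_{-\infty}^{y_{\min}} (y_{\min} - y)\, \mathcal{N}(y|\hat{\mu},\hat{\sigma}^2)\, dy = \hat{\sigma} \int_{-\infty}^{z'} (z' - z)\, \mathcal{N}(z|0,1)\, dz
\end{equation*}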
208 | 209 | We can calculate the expected improvement in closed form using the Gaussian process: 210 | \begin{equation*} 211 | \begin{split} 212 | \mathbb{E}[I(y)] &= \hat{\sigma} z' \int_{-\infty}^{z'} \mathcal{N}(z|0,1)dz - \hat{\sigma} \int_{-\infty}^{z'} z\, \mathcal{N}(z|0,1)dz\\ 213 | & = (y_{\min} - \hat{\mu}) P (y \leq y_{\min}) + \hat{\sigma}^2 \mathcal{N}(y_{\min} | \hat{\mu}, \hat{\sigma}^2) 214 | \end{split} 215 | \end{equation*} 216 | 217 | \end{frame} 218 | 219 | \begin{frame}{Expected Improvement Exploration} 220 | 221 | \begin{figure} 222 | \centering 223 | \includegraphics[width=125mm]{Figs/EI.jpeg} 224 | \end{figure} 225 | 226 | \end{frame} 227 | 228 | \section{Summary} 229 | \begin{frame}{Summary} 230 | \begin{itemize} 231 | \item Gaussian processes can be used to guide the optimization process using a variety of strategies that use estimates of quantities such as the lower confidence bound, probability of improvement, and expected improvement. 232 | \item Some problems do not allow for the evaluation of unsafe designs, in which case we can use safe exploration strategies that rely on Gaussian processes. 233 | \end{itemize} 234 | \end{frame} 235 | \end{document} 236 | 237 | -------------------------------------------------------------------------------- /Lecture_notes/17_uncertainty.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode<presentation> { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme.
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Uncertainty]{Numerical Optimization 17: Uncertainty} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Uncertainty} 139 | \begin{frame}{Uncertainty} 140 | In many engineering tasks, however, there may be uncertainty due to a number of factors, such as model approximations, imprecision, and fluctuations of parameters over time. We want to minimize $f(x, z)$, but we do not have control over $z$. Feasibility depends on both the design vector $x$ and the uncertain vector $z$. 
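A common way to handle this is to optimize a statistic of $f$ under the uncertainty, for example the expected value or the worst case over $z$:
\begin{equation*}
\underset{\boldsymbol{x}}{\min}~ \mathbb{E}_{z}\left[f(\boldsymbol{x}, z)\right] \qquad \textrm{or} \qquad \underset{\boldsymbol{x}}{\min}~\underset{z \in Z}{\max}~ f(\boldsymbol{x}, z)
\end{equation*}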
141 | \begin{figure} 142 | \centering 143 | \includegraphics[width=110mm]{Figs/uncertainty.jpeg} 144 | \end{figure} 145 | \end{frame} 146 | 147 | \section{Polynomial Chaos} 148 | \begin{frame}{Polynomial chaos} 149 | \textcolor{blue}{Polynomial chaos} is a method for fitting a polynomial to $f(x, z)$ and using the resulting surrogate model to estimate the mean and variance. 150 | 151 | In one dimension, we approximate $f(z)$ with a surrogate model consisting of $k$ polynomial basis functions, $b_1, \cdots, b_k$: 152 | \begin{equation*} 153 | f(z) \approx \hat{f}(z) = \sum_{i=1}^k \theta_i b_i(z) 154 | \end{equation*} 155 | The mean of $\hat{f}$ can be derived as follows 156 | \begin{equation*} 157 | \begin{split} 158 | \hat{\mu} & = \int_Z p(z)\hat{f}(z) dz = \int_Z \sum_{i=1}^k p(z) \theta_i b_i(z) dz 159 | = \sum_{i=1}^k \int_Z \theta_i b_i(z) p(z)dz \\ 160 | & = \theta_1 \int_Z b_1(z) p(z)dz + \cdots + \theta_k \int_Z b_k(z) p(z)dz 161 | \end{split} 162 | \end{equation*} 163 | \end{frame} 164 | 165 | \begin{frame}{Polynomial chaos} 166 | The variance of $\hat{f}$ can be derived as follows 167 | \begin{equation*} 168 | \begin{split} 169 | \hat{\sigma}^2 & = \mathbb{E}[\hat{f}^2] - (\mathbb{E}[\hat{f}])^2 170 | = \int_Z \hat{f}^2(z)p(z) dz - \hat{\mu}^2\\ 171 | &= \int_Z \sum_{i=1}^k\sum_{j=1}^k \theta_i\theta_j b_i(z)b_j(z)p(z)dz - \hat{\mu}^2\\ 172 | & = \int_Z \bigg( \sum_{i=1}^k \theta_i^2 b_i^2(z) + 2\sum_{i=2}^k\sum_{j=1}^{i-1} \theta_i\theta_j b_i(z)b_j(z)\bigg)p(z)dz -\hat{\mu}^2 \\ 173 | & = \sum_{i=1}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz + 2\sum_{i=2}^k\sum_{j=1}^{i-1} \theta_i\theta_j \int_Z b_i(z)b_j(z)p(z)dz 174 | - \hat{\mu}^2 175 | \end{split} 176 | \end{equation*} 177 | \end{frame} 178 | 179 | \section{Orthogonal polynomial basis} 180 | \begin{frame}{Orthogonal polynomial basis} 181 | The mean and variance can be efficiently computed if the basis functions are chosen to be orthogonal under $p$. Two basis functions $b_i$ and $b_j$ are orthogonal with respect to a probability density $p(z)$ if 182 | \begin{equation*} 183 | \int_Z b_i(z)b_j(z)p(z)dz = 0. ~({\textrm{if~}} i\neq j) 184 | \end{equation*} 185 | 186 | If the chosen basis functions are all orthogonal to one another and the first basis function is $b_1(z) = 1$, the mean is: 187 | \begin{equation*} 188 | \begin{split} 189 | \hat{\mu} & = \theta_1 \int_Z b_1(z) p(z)dz + \cdots + \theta_k \int_Z b_k(z) p(z)dz\\ 190 | & = \theta_1 \int_Z b^2_1(z) p(z)dz + \cdots + \theta_k \int_Z b_1(z) b_k(z) p(z)dz\\ 191 | & = \theta_1 192 | \end{split} 193 | \end{equation*} 194 | 195 | \end{frame} 196 | 197 | \begin{frame}{Orthogonal polynomial basis} 198 | Similarly, the variance is 199 | \begin{equation*} 200 | \begin{split} 201 | \hat{\sigma}^2 & = \sum_{i=1}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz + 2\sum_{i=2}^k\sum_{j=1}^{i-1} \theta_i\theta_j \int_Z b_i(z)b_j(z)p(z)dz - \hat{\mu}^2\\ 202 | & = \sum_{i=1}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz - \hat{\mu}^2 \\ 203 | & = \theta_1^2 \int_Z b_1^2(z)p(z)dz + \sum_{i=2}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz - \hat{\mu}^2 \\ 204 | & = \sum_{i=2}^k \theta_i^2 \int_Z b_i^2(z)p(z)dz 205 | \end{split} 206 | \end{equation*} 207 | using $\hat{\mu} = \theta_1$ and $\int_Z b_1^2(z)p(z)dz = \int_Z p(z)dz = 1$. 208 | \end{frame} 209 | 210 | \begin{frame}{Orthogonal polynomial basis} 211 | The mean thus follows immediately from fitting a surrogate model to the observed data, and the variance can be very efficiently computed given the values $\int_Z b_i^2(z)p(z)dz$ for a choice of basis functions and probability distribution.
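For example, for a uniform $p(z)$ over $[-1,1]$ the Legendre polynomials are orthogonal, with first few basis functions
\begin{equation*}
b_1(z) = 1, \qquad b_2(z) = z, \qquad b_3(z) = \tfrac{1}{2}(3z^2 - 1),
\end{equation*}
while for a Gaussian $p(z)$ the Hermite polynomials play the same role.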
All orthogonal polynomials satisfy the recurrence relation: 212 | \begin{equation*} 213 | b_{i+1}(z) = 214 | \begin{cases} 215 | (z-\alpha_i)b_i(z) & i=1\\ 216 | (z-\alpha_i)b_i(z) - \beta_i b_{i-1}(z) & {\textrm{else}} 217 | \end{cases} 218 | \end{equation*} 219 | with $b_1(z) = 1$ and weights 220 | \begin{equation*} 221 | \begin{split} 222 | \alpha_i &= \frac{\int_Z z b_i^2(z)p(z)dz} {\int_Z b_i^2(z)p(z)dz}\\ 223 | \beta_i &= \frac{\int_Z b_i^2(z)p(z)dz}{\int_Z b_{i-1}^2(z)p(z)dz} 224 | \end{split} 225 | \end{equation*} 226 | The recurrence relation can be used to generate the basis functions. Each basis function $b_i$ is a polynomial of degree $i-1$. 227 | 228 | \end{frame} 229 | 230 | \begin{frame}{Orthogonal polynomial basis functions} 231 | \begin{figure} 232 | \centering 233 | \includegraphics[width=120mm]{Figs/orthogonal.jpeg} 234 | \end{figure} 235 | \end{frame} 236 | 237 | \section{Coefficients} 238 | \begin{frame}{Coefficients} 239 | The coefficients $\theta_1, \cdots, \theta_k$ can be inferred by exploiting the orthogonality of the basis functions, producing an integration term amenable to \textcolor{blue}{Gaussian quadrature}. 240 | \begin{equation*} 241 | \begin{split} 242 | f(z) &= \sum_{i=1}^k \theta_i b_i(z)\\ 243 | \int_Z f(z)b_j(z)p(z)dz &= \int_Z \bigg(\sum_{i=1}^k \theta_i b_i(z) \bigg) b_j(z)p(z)dz\\ 244 | &= \sum_{i=1}^k \theta_i \int_Z b_i(z)b_j(z)p(z)dz \\ 245 | &= \theta_j \int_Z b_j^2(z)p(z)dz\\ 246 | \implies \theta_j &= \frac{\int_Z f(z)b_j(z)p(z)dz}{\int_Z b_j^2(z)p(z)dz} 247 | \end{split} 248 | \end{equation*} 249 | \end{frame} 250 | 251 | \section{Multivariate} 252 | \begin{frame}{Multivariate} 253 | Polynomial chaos can be applied to functions with multiple random inputs. Multivariate basis functions over $m$ independent variables are constructed as a product over univariate orthogonal polynomials: 254 | \begin{equation*} 255 | b(z_1, \cdots, z_m) = \prod_{i=1}^{m} b_{k_i}(z_i) 256 | \end{equation*} 257 | where each $b_{k_i}$ is orthogonal under $p(z_i)$, so the products are orthogonal under the joint density. 258 | \end{frame} 259 | 260 | \section{Summary} 261 | \begin{frame}{Summary} 262 | \begin{itemize} 263 | \item Polynomial chaos is a powerful uncertainty propagation technique based on orthogonal polynomials. 264 | \item Bayesian Monte Carlo uses Gaussian processes to efficiently arrive at the moments with analytic results for Gaussian kernels.
265 | \end{itemize} 266 | \end{frame} 267 | \end{document} 268 | 269 | -------------------------------------------------------------------------------- /Lecture_notes/18_symbolic_regression.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | \lstset{style=mystyle} 32 | 33 | \mode<presentation> { 34 | 35 | % The Beamer class comes with a number of default slide themes 36 | % which change the colors and layouts of slides. Below this is a list 37 | % of all the themes, uncomment each in turn to see what they look like. 38 | 39 | %\usetheme{default} 40 | \usetheme{AnnArbor} 41 | %\usetheme{Antibes} 42 | %\usetheme{Bergen} 43 | %\usetheme{Berkeley} 44 | %\usetheme{Berlin} 45 | %\usetheme{Boadilla} 46 | %\usetheme{CambridgeUS} 47 | %\usetheme{Copenhagen} 48 | %\usetheme{Darmstadt} 49 | %\usetheme{Dresden} 50 | %\usetheme{Frankfurt} 51 | %\usetheme{Goettingen} 52 | %\usetheme{Hannover} 53 | %\usetheme{Ilmenau} 54 | %\usetheme{JuanLesPins} 55 | %\usetheme{Luebeck} 56 | %\usetheme{Madrid} 57 | %\usetheme{Malmoe} 58 | %\usetheme{Marburg} 59 | %\usetheme{Montpellier} 60 | %\usetheme{PaloAlto} 61 | %\usetheme{Pittsburgh} 62 | %\usetheme{Rochester} 63 | %\usetheme{Singapore} 64 | %\usetheme{Szeged} 65 | %\usetheme{Warsaw} 66 | 67 | % As well as themes, the Beamer class has a number of color themes 68 | % for any slide theme. Uncomment each of these in turn to see how it 69 | % changes the colors of your current slide theme.
70 | 71 | %\usecolortheme{albatross} 72 | %\usecolortheme{beaver} 73 | %\usecolortheme{beetle} 74 | %\usecolortheme{crane} 75 | %\usecolortheme{dolphin} 76 | %\usecolortheme{dove} 77 | %\usecolortheme{fly} 78 | %\usecolortheme{lily} 79 | %\usecolortheme{orchid} 80 | %\usecolortheme{rose} 81 | %\usecolortheme{seagull} 82 | %\usecolortheme{seahorse} 83 | %\usecolortheme{whale} 84 | %\usecolortheme{wolverine} 85 | 86 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 87 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 88 | 89 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 90 | } 91 | 92 | \usepackage{graphicx} % Allows including images 93 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 94 | %\usepackage {tikz} 95 | \usepackage{tkz-graph} 96 | \GraphInit[vstyle = Shade] 97 | \tikzset{ 98 | LabelStyle/.style = { rectangle, rounded corners, draw, 99 | minimum width = 2em, fill = yellow!50, 100 | text = red, font = \bfseries }, 101 | VertexStyle/.append style = { inner sep=5pt, 102 | font = \normalsize\bfseries}, 103 | EdgeStyle/.append style = {->, bend left} } 104 | \usetikzlibrary {positioning} 105 | %\usepackage {xcolor} 106 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 107 | %---------------------------------------------------------------------------------------- 108 | % TITLE PAGE 109 | %---------------------------------------------------------------------------------------- 110 | 111 | \title[Symbolic Regression]{Numerical Optimization 18: Symbolic Regression} % 112 | 113 | \author{Qiang Zhu} % Your name 114 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 115 | { 116 | University of Nevada Las Vegas\\ % Your institution for the title page 117 | \medskip 118 | } 119 | \date{\today} % Date, can be changed to a custom date 120 | 121 | \begin{document} 122 | 123 | \begin{frame} 124 | \titlepage % Print the title page as the first slide 125 | \end{frame} 126 | 127 | \begin{frame} 128 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 129 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 130 | \end{frame} 131 | 132 | %---------------------------------------------------------------------------------------- 133 | % PRESENTATION SLIDES 134 | %---------------------------------------------------------------------------------------- 135 | 136 | %------------------------------------------------ 137 | 138 | \section{Grammars} 139 | \begin{frame}{Grammars} 140 | An expression can be represented by a tree of symbols. For example, the mathematical expression $x + \ln2$ can be represented using the tree consisting of the symbols $+, x, \ln$, and 2. Grammars specify constraints on the space of possible expressions. 141 | \begin{figure} 142 | \centering 143 | \includegraphics[width=125mm]{Figs/tree.jpeg} 144 | \end{figure} 145 | 146 | \end{frame} 147 | 148 | \section{Constraints} 149 | \begin{frame}{Constraints} 150 | Constraints are not typically specified directly through a known feasible set X . 
Instead, the feasible set is typically formed from two types of constraints: 151 | 152 | \begin{itemize} 153 | \item equality constraints, $h(x)=0$ 154 | \item inequality constraints, $g(x) \leq 0$ 155 | 156 | \end{itemize} 157 | 158 | Any optimization problem can be rewritten using these constraints 159 | \begin{gather*} 160 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\ 161 | {s.t.}~~~~ h_i(x) = 0 \\ 162 | ~~~~~~~~~ g_j(x) \leq 0 163 | \end{gather*} 164 | 165 | \end{frame} 166 | 167 | \section{Genetic Programming} 168 | \begin{frame}{Genetic Programming} 169 | Genetic programming represents individuals using trees instead of fixed-length strings; trees are better at representing mathematical functions, programs, decision trees, and other hierarchical structures. For example, the following expression can be encoded as a tree: 170 | 171 | \begin{equation*} 172 | x = \frac{b+a}{2} + \frac{b-a}{2}\bigg(\frac{2\hat{x}}{1+\hat{x}^2}\bigg) 173 | \end{equation*} 174 | 175 | \end{frame} 176 | 177 | \section{Lagrange Multipliers} 178 | \begin{frame}{Lagrange Multipliers} 179 | The method of Lagrange multipliers is used to optimize a function subject to equality 180 | constraints. 181 | \begin{gather*} 182 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x})\\ 183 | {s.t.}~~~~ h_i(x) = 0 184 | \end{gather*} 185 | 186 | where $f$ and $h$ have continuous partial derivatives. 187 | 188 | We can formulate the Lagrangian, which is a function of the design variables and the multiplier $\lambda$, 189 | \begin{gather*} 190 | \mathcal{L}(x, \lambda) = f(x) - \lambda h(x) 191 | \end{gather*} 192 | 193 | We then solve $\nabla \mathcal{L}(x, \lambda) = 0$. Specifically, $\nabla_x \mathcal{L} = 0$ gives us the condition $\nabla f = \lambda \nabla h$, and $\nabla_\lambda \mathcal{L} = 0$ gives us $h(x) = 0$. Any solution is considered a critical point. 194 | 195 | \end{frame} 196 | 197 | \begin{frame}{Lagrange Multipliers to a single equality condition} 198 | The method of Lagrange multipliers is used to optimize a function subject to equality 199 | constraints. 200 | \begin{gather*} 201 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] \\ 202 | {s.t.}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ x_1 - x_2^2 = 0 203 | \end{gather*} 204 | 205 | We can formulate the Lagrangian, 206 | \begin{equation*} 207 | \mathcal{L}(x, \lambda) = -\exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2] + \lambda(x_1 - x_2^2) 208 | \end{equation*} 209 | We compute, with $E = \exp[-(x_1x_2-3/2)^2 - (x_2-3/2)^2]$, 210 | \begin{itemize} 211 | \item $\frac{\partial \mathcal{L}}{\partial x_1} = 2x_2(x_1x_2-3/2)E + \lambda$ 212 | \item $\frac{\partial \mathcal{L}}{\partial x_2} = \big[2x_1(x_1x_2-3/2) + 2(x_2-3/2)\big]E - 2\lambda x_2$ 213 | \item $\frac{\partial \mathcal{L}}{\partial \lambda} = x_1 - x_2^2$ 214 | \end{itemize} 215 | and set them to zero to locate the critical points. 216 | \end{frame} 217 | 218 | 219 | \begin{frame}{Lagrange Multipliers to multiple equality conditions} 220 | The method extends directly to problems with multiple equality 221 | constraints, introducing one multiplier per constraint.
222 | \begin{gather*} 223 | ~~~~~ \underset{\boldsymbol{x}}{\min} ~ f(\boldsymbol{x}) \\ 224 | {s.t.}~~~~~~~~~~~~~~ h_i(x) = 0, \quad i = 1, \cdots, \ell 225 | \end{gather*} 226 | 227 | We can formulate the generalized Lagrangian, 228 | \begin{equation*} 229 | \mathcal{L}(x, \boldsymbol{\lambda}) = f(x) - \sum_{i=1}^{\ell} \lambda_i h_i(x) 230 | \end{equation*} 231 | We compute 232 | \begin{itemize} 233 | \item $\nabla_x \mathcal{L} = 0$, which gives $\nabla f = \sum_{i=1}^{\ell} \lambda_i \nabla h_i$ 234 | \item $\nabla_{\boldsymbol{\lambda}} \mathcal{L} = 0$, which recovers the constraints $h_i(x) = 0$ 235 | \end{itemize} 236 | and solve the resulting system jointly for the critical points. 237 | 238 | \end{frame} 239 | 240 | \section{Summary} 241 | \begin{frame}{Summary} 242 | \begin{itemize} 243 | \item Constraints are requirements on the design points that a solution must satisfy. 244 | \item Some constraints can be transformed or substituted into the problem to result in an unconstrained optimization problem. 245 | \item Analytical methods using Lagrange multipliers yield the generalized Lagrangian and the necessary conditions for optimality under constraints. 246 | \item A constrained optimization problem has a dual problem formulation that is easier to solve and whose solution is a lower bound of the solution to the original problem. 247 | \end{itemize} 248 | \end{frame} 249 | \end{document} 250 | 251 | -------------------------------------------------------------------------------- /Lecture_notes/A1_trust_region.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/A1_trust_region.pdf -------------------------------------------------------------------------------- /Lecture_notes/A1_trust_region.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{amsmath} 3 | \usepackage{hyperref} 4 | \usepackage{listings} 5 | \usepackage{xcolor} 6 | \hypersetup{colorlinks=true, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue} 7 | \definecolor{codegreen}{rgb}{0,0.6,0} 8 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 9 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 10 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 11 | 12 | \lstdefinestyle{mystyle}{ 13 | backgroundcolor=\color{backcolour}, 14 | commentstyle=\color{codegreen}, 15 | keywordstyle=\color{magenta}, 16 | numberstyle=\tiny\color{codegray}, 17 | stringstyle=\color{codepurple}, 18 | basicstyle=\ttfamily\footnotesize, 19 | breakatwhitespace=false, 20 | breaklines=true, 21 | captionpos=b, 22 | keepspaces=true, 23 | %numbers=left, 24 | numbersep=5pt, 25 | showspaces=false, 26 | showstringspaces=false, 27 | showtabs=false, 28 | tabsize=2 29 | } 30 | 31 | %% 32 | %% Julia definition (c) 2014 Jubobs 33 | %% 34 | \lstdefinelanguage{Julia}% 35 | {morekeywords={abstract,break,case,catch,const,continue,do,else,elseif,% 36 | end,export,false,for,function,immutable,import,importall,if,in,% 37 | macro,module,otherwise,quote,return,switch,true,try,type,typealias,% 38 | using,while},% 39 | sensitive=true,% 40 | alsoother={$},% 41 | morecomment=[l]\#,% 42 | morecomment=[n]{\#=}{=\#},% 43 | morestring=[s]{"}{"},% 44 | morestring=[m]{'}{'},% 45 | }[keywords,comments,strings]% 46 | 47 | \lstset{% 48 | language = Julia, 49 | basicstyle = \ttfamily, 50 | keywordstyle = \bfseries\color{blue}, 51 | stringstyle = \color{magenta}, 52 | commentstyle = \color{ForestGreen}, 53 | showstringspaces = false, 54 | } 55 | 56 | 57 |
\lstset{style=mystyle} 58 | 59 | \mode<presentation> { 60 | 61 | % The Beamer class comes with a number of default slide themes 62 | % which change the colors and layouts of slides. Below this is a list 63 | % of all the themes, uncomment each in turn to see what they look like. 64 | 65 | %\usetheme{default} 66 | \usetheme{AnnArbor} 67 | %\usetheme{Antibes} 68 | %\usetheme{Bergen} 69 | %\usetheme{Berkeley} 70 | %\usetheme{Berlin} 71 | %\usetheme{Boadilla} 72 | %\usetheme{CambridgeUS} 73 | %\usetheme{Copenhagen} 74 | %\usetheme{Darmstadt} 75 | %\usetheme{Dresden} 76 | %\usetheme{Frankfurt} 77 | %\usetheme{Goettingen} 78 | %\usetheme{Hannover} 79 | %\usetheme{Ilmenau} 80 | %\usetheme{JuanLesPins} 81 | %\usetheme{Luebeck} 82 | %\usetheme{Madrid} 83 | %\usetheme{Malmoe} 84 | %\usetheme{Marburg} 85 | %\usetheme{Montpellier} 86 | %\usetheme{PaloAlto} 87 | %\usetheme{Pittsburgh} 88 | %\usetheme{Rochester} 89 | %\usetheme{Singapore} 90 | %\usetheme{Szeged} 91 | %\usetheme{Warsaw} 92 | 93 | % As well as themes, the Beamer class has a number of color themes 94 | % for any slide theme. Uncomment each of these in turn to see how it 95 | % changes the colors of your current slide theme. 96 | 97 | %\usecolortheme{albatross} 98 | %\usecolortheme{beaver} 99 | %\usecolortheme{beetle} 100 | %\usecolortheme{crane} 101 | %\usecolortheme{dolphin} 102 | %\usecolortheme{dove} 103 | %\usecolortheme{fly} 104 | %\usecolortheme{lily} 105 | %\usecolortheme{orchid} 106 | %\usecolortheme{rose} 107 | %\usecolortheme{seagull} 108 | %\usecolortheme{seahorse} 109 | %\usecolortheme{whale} 110 | %\usecolortheme{wolverine} 111 | 112 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 113 | \setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 114 | 115 | \setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 116 | } 117 | 118 | \usepackage{graphicx} % Allows including images 119 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 120 | %\usepackage {tikz} 121 | \usepackage{tkz-graph} 122 | \GraphInit[vstyle = Shade] 123 | \tikzset{ 124 | LabelStyle/.style = { rectangle, rounded corners, draw, 125 | minimum width = 2em, fill = yellow!50, 126 | text = red, font = \bfseries }, 127 | VertexStyle/.append style = { inner sep=5pt, 128 | font = \normalsize\bfseries}, 129 | EdgeStyle/.append style = {->, bend left} } 130 | \usetikzlibrary {positioning} 131 | %\usepackage {xcolor} 132 | \definecolor {processblue}{cmyk}{0.96,0,0,0} 133 | %---------------------------------------------------------------------------------------- 134 | % TITLE PAGE 135 | %---------------------------------------------------------------------------------------- 136 | 137 | \title[Trust Region Methods]{Numerical Optimization: Trust Region Methods} % The short title appears at the bottom of every slide, the full title is only on the title page 138 | 139 | \author{Qiang Zhu} % Your name 140 | \institute[University of Nevada Las Vegas] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 141 | { 142 | University of Nevada Las Vegas\\ % Your institution for the title page 143 | \medskip 144 | } 145 | \date{\today} % Date, can be changed to a custom date 146 | 147 | \begin{document} 148 | 149 | \begin{frame} 150 | \titlepage % Print the title page as the first slide 151 | \end{frame} 152 | 153 | \begin{frame}
154 | \frametitle{Overview} % Table of contents slide, comment this block out to remove it 155 | \tableofcontents % Throughout your presentation, if you choose to use \section{} and \subsection{} commands, these will automatically be printed on this slide as an overview of your presentation 156 | \end{frame} 157 | 158 | %---------------------------------------------------------------------------------------- 159 | % PRESENTATION SLIDES 160 | %---------------------------------------------------------------------------------------- 161 | 162 | %------------------------------------------------ 163 | 164 | \section{Trust Region Model} 165 | \begin{frame}{The problem of line search} 166 | In the line search method, one usually chooses a direction based on the first- or second-order derivatives and then performs an approximate 1D search along it. 167 | If the current iterate is far from the local minimum, such a search may result in excessively large steps or premature convergence. 168 | 169 | \begin{figure} 170 | \centering 171 | \includegraphics[width=90mm]{Figs/trust-region.jpeg} 172 | \end{figure} 173 | \end{frame} 174 | 175 | \begin{frame}{Line search vs. trust region} 176 | \begin{alertblock}{Line search} 177 | \begin{itemize} 178 | \item Find the direction of improvement 179 | \item Select a step length 180 | \end{itemize} 181 | \end{alertblock} 182 | \vfill 183 | \begin{alertblock}{Trust region} 184 | \begin{itemize} 185 | \item Select a trust region (within a hypersphere) 186 | \item Find a point of improvement 187 | \end{itemize} 188 | \end{alertblock} 189 | 190 | \end{frame} 191 | 192 | \section{The outline of trust region approach} 193 | \begin{frame}{Quadratic approximation} 194 | In this lecture, we will assume that the model function $m_k$ that is used at each iterate $x_k$ is quadratic. 195 | $m_k$ is based on the Taylor-series expansion of $f$, 196 | \begin{equation*} 197 | f(x_k + p) = f(x_k) + \nabla f(x_k)^T p + \frac{1}{2}p^T \nabla^2 f(x_k + tp)p, 198 | \end{equation*} 199 | where $t$ is some scalar in the interval (0,1). 200 | 201 | By using an approximation $B_k$ to the Hessian in the second-order term, $m_k$ is defined as follows: 202 | \begin{equation*} 203 | m_k(p) = f(x_k) + \nabla f(x_k)^T p + \frac{1}{2}p^T B_k p, 204 | \end{equation*} 205 | 206 | The difference between $m_k(p)$ and $f(x_k + p)$ is $O(\|p\|^2)$, which is small when $p$ is small. 207 | 208 | \end{frame} 209 | 210 | \begin{frame}{Trust region step} 211 | The trust-region method steps to the minimizer of $m_k$ within the dotted circle, yielding a more significant reduction in $f$ and better progress toward the solution. 212 | 213 | To obtain each step, we seek a solution of the subproblem 214 | \begin{gather*} 215 | \textrm{min}~ m_k(p) = f(x_k) + \nabla f(x_k)^T p + \frac{1}{2}p^T B_k p, \\ 216 | \textrm{s.t.}~~ ||p||_2 \leq \Delta_k, 217 | \end{gather*} 218 | 219 | where $\Delta_k$ is the \textcolor{blue}{trust-region radius}. 220 | 221 | Thus, the trust-region approach requires us to solve a sequence of subproblems 222 | in which the objective function and constraint (which can be written as $p^Tp \leq \Delta_k^2$) 223 | are both quadratic, which is easy to solve if the model is convex. 224 | \end{frame} 225 | 226 | \begin{frame}{How to adjust the $\Delta_k$?} 227 | For a given step, we define 228 | \begin{equation*} 229 | \rho_k = \frac{f(x_k) - f(x_k+p_k)}{m_k(0) - m_k(p_k)} 230 | \end{equation*} 231 | The numerator is called the \textcolor{blue}{actual reduction}.
\\ 232 | The denominator is the \textcolor{blue}{predicted reduction}, which is non-negative because $p=0$ is feasible for the subproblem and $p_k$ minimizes $m_k$ within the trust region. \\ 233 | 234 | \begin{itemize} 235 | \item if $\rho_k < 0$, the new objective value $f(x_k + p_k)$ is greater than $f(x_k)$, \textcolor{red}{reject}. 236 | \item if $\rho_k \approx 1$, there is good agreement between the model $m_k$ and the function $f$, \textcolor{red}{expand the trust region} 237 | \item if $0<\rho_k \ll 1$, \textcolor{red}{shrink the trust region} by reducing $\Delta_k$ 238 | \end{itemize} 239 | \end{frame} 240 | 241 | \begin{frame}{Algorithm} 242 | 243 | \lstinputlisting[language=Julia]{trust.jl} 244 | 245 | \end{frame} 246 | 247 | \section{Summary} 248 | \begin{frame}{Summary} 249 | \begin{itemize} 250 | \item The trust region method may perform better when the initial point is far from the local minimum 251 | \item The correctness of the trust region method relies on the accuracy of the model function 252 | \item The step size is controlled by the trust-region radius, which is updated at each step 253 | \item Quadratic approximation needs the information of the Hessian 254 | \item The subproblem optimization may be tricky when the Hessian is not positive definite 255 | \end{itemize} 256 | \end{frame} 257 | \end{document} 258 | 259 | -------------------------------------------------------------------------------- /Lecture_notes/Figs/C60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/C60.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/CMA.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/CMA.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/Cross-entropy.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/Cross-entropy.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/EI.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/EI.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/GPR-raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/GPR-raw.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/GPR-train-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/GPR-train-1.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/GPR-train-2.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/GPR-train-2.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/LB.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/LB.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/algo_opt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/algo_opt.jpg -------------------------------------------------------------------------------- /Lecture_notes/Figs/bracket.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/bracket.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/cauchy.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/cauchy.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/cg-sd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/cg-sd.jpg -------------------------------------------------------------------------------- /Lecture_notes/Figs/constraint-ab.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/constraint-ab.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/coordinate-improved.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/coordinate-improved.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/coordinate.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/coordinate.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/cross-valid.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/cross-valid.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/curvature1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/curvature1.jpeg 
-------------------------------------------------------------------------------- /Lecture_notes/Figs/curvature2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/curvature2.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/derivative-comparison.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/derivative-comparison.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/error-explore.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/error-explore.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/firefly.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/firefly.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/flat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/flat.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/gaussian.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/gaussian.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/gp-opt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/gp-opt.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/graph1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/graph1.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/graph2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/graph2.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/grid_search.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/grid_search.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/holdout.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/holdout.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/ip.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/ip.gif -------------------------------------------------------------------------------- /Lecture_notes/Figs/julia-comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/julia-comp.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/julia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/julia.png -------------------------------------------------------------------------------- /Lecture_notes/Figs/kernel.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/kernel.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/linesearch.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/linesearch.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/minimum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/minimum.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/momentum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/momentum.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/multi-minima.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/multi-minima.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/n-momentum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/n-momentum.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/newton-1d.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/newton-1d.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/orthogonal.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/orthogonal.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/powell.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/powell.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/prob.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/prob.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/pso.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/pso.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/quasi-newton.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/quasi-newton.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/rbf.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/rbf.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/sample_all.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/sample_all.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/search.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/search.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/selection.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/selection.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/sgd.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/sgd.jpeg 
-------------------------------------------------------------------------------- /Lecture_notes/Figs/simplex-performance.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/simplex-performance.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/simplex.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/simplex.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/simplex_algo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/simplex_algo.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/solution-space.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/solution-space.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/strafied.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/strafied.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/sufficient_decrease.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/sufficient_decrease.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/tree.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/tree.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/trust-region.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/trust-region.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/two-conditions.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/two-conditions.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/uncertainty.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/uncertainty.jpeg -------------------------------------------------------------------------------- 
/Lecture_notes/Figs/uni-proj.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/uni-proj.jpeg -------------------------------------------------------------------------------- /Lecture_notes/Figs/unimodal.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qzhu2017/Numerical-Optimization/173a1723e0fd1d76beae2235567969472431c24d/Lecture_notes/Figs/unimodal.jpeg -------------------------------------------------------------------------------- /Lecture_notes/trust.jl: -------------------------------------------------------------------------------- 1 | # Trust-region descent, following the course textbook (Kochenderfer & Wheeler). 2 | # f: objective; G: gradient of f; H: Hessian of f; x: initial design point; 3 | # k_max: number of iterations; delta: initial trust-region radius. 4 | # Requires a solve_subproblem(G, H, x, delta) that returns (x1, y1): the 5 | # minimizer of the local quadratic model within radius delta, and its model value. 6 | function trust_region_descent(f, G, H, x, k_max; 7 | eta1=0.25, eta2=0.5, gamma1=0.5, gamma2=2.0, delta=1.0) 8 | 9 | y = f(x) 10 | for k in 1 : k_max 11 | x1, y1 = solve_subproblem(G, H, x, delta) 12 | r = (y - f(x1)) / (y - y1) # actual reduction / predicted reduction 13 | if r < eta1 # poor agreement: reject the step, shrink the radius 14 | delta *= gamma1 15 | else 16 | x, y = x1, y1 # accept the step 17 | if r > eta2 # good agreement: expand the radius 18 | delta *= gamma2 19 | end 20 | end 21 | end 22 | return x 23 | end -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Numerical-Optimization 2 | This is the course material for a numerical optimization course to be taught in summer 2020 (June-August, 10 weeks) through Webex. I have not finalized the schedule yet. It is completely open to everyone. If you are interested, please feel free to contact qiang.zhu@unlv.edu between 2020/06-2020/08. 3 | 4 | 5 | ![E](https://github.com/qzhu2017/Numerical-Optimization/blob/master/Lecture_notes/Figs/ip.gif) 6 | 7 | ## Textbooks 8 | This course is intended to cover 9 | - Various optimization methods used in scientific computing 10 | - Julia programming 11 | 12 | The course will mostly follow the book of [Algorithms for Optimization by Mykel J. Kochenderfer and Tim A. Wheeler](https://mitpress.mit.edu/books/algorithms-optimization). 13 | Here is an interesting [video](https://www.youtube.com/watch?v=ofWy5kaZU3g) by one of the authors talking about how they wrote the book at a recent Julia conference. 14 | 15 | For some details on the optimization algorithms, we will refer to [Numerical Optimization by Jorge Nocedal and Stephen J. Wright](https://link.springer.com/book/10.1007/978-0-387-40065-5) 16 | 17 | ## Format 18 | Though this is a virtual class, we plan to make it as interactive as possible. Typically, each class is composed of three units. 19 | 20 | - Code review (1 or 2 volunteers to review the previous homework assignments) 21 | - Lecture (math details for each algorithm) 22 | - Coding session (implementing the algorithms from each lecture) 23 | 24 | All coding will be done through Jupyter notebooks. 25 | 26 | Each class will take about 90 minutes. 27 | 28 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | cd Lecture_notes/ 2 | for f in *.tex ; do 3 | pdflatex $f 4 | done 5 | rm *.aux *.out *.log *.nav *.snm *.toc 6 | --------------------------------------------------------------------------------