├── .gitignore ├── APPM4720_5720_Fall2018_Syllabus.pdf ├── Demos ├── AD_demos │ ├── ADiGator_demo │ │ ├── .gitignore │ │ ├── f1.m │ │ ├── f1_demo.m │ │ └── polydatafit │ │ │ ├── .DS_Store │ │ │ ├── Contents.m │ │ │ ├── fit.m │ │ │ ├── fit4numjac.m │ │ │ ├── fit_ADiGatorJac.m │ │ │ ├── fit_ADiGatorJac.mat │ │ │ ├── fit_Jac.m │ │ │ └── main.m │ ├── ForwardDiff.jl │ │ └── ForwardDiff_demo.ipynb │ └── README.md ├── AutoDiffByHand.ipynb ├── AutoDiff_threeReLU_implementations.ipynb ├── AutomaticDifferentiation.ipynb ├── BlockMultiplies │ ├── MatrixMultiplyDemo.html │ ├── MatrixMultiplyDemo.mlx │ ├── MultiplyMatrices_C.pdf │ ├── README.md │ ├── compareSpeed_homegrown_vs_MKL.m │ ├── matrixMultiply.m │ ├── multiplyMatrices.c │ └── multiplyMatrices.mexmaci64 ├── CVX_demo │ ├── Handout2_cvx_tutorial.pdf │ ├── README.md │ ├── cvx_demo.mlx │ ├── cvx_demo.pdf │ ├── cvxpy_intro.ipynb │ ├── cvxpy_intro.pdf │ ├── tutorialSolutions.ipynb │ ├── tutorialSolutions.m │ └── tutorialSolutions.py ├── ConjugateGradientDemo.ipynb ├── ConvergenceRateDemo.ipynb ├── RPCA_case_study.ipynb ├── RPCA_case_study_solutions.ipynb └── nonconvex_example_2D.ipynb ├── Fall2018_day-by-day_schedule.pdf ├── Handouts ├── FirmlyNonexpansive.pdf ├── FixedPtTheorems.pdf ├── StrongConvexityLipschitz.pdf ├── StrongConvexityLipshitz.pdf ├── SubOptimalityBounds.pdf └── SubgradientDescent.pdf ├── Homeworks ├── APPM5630Spring25Homework01-02.pdf ├── APPM5630Spring25Homework03-04.pdf ├── APPM5630Spring25Homework05-06.pdf ├── APPM5630Spring25Homework07-08.pdf ├── APPM5630Spring25Homework09-10.pdf ├── HW01_helper_polyhedrality.pdf ├── HW04 │ ├── APPM5630_HW4_helperFunctions.ipynb │ ├── implicit2explicit.m │ ├── quadraticObjective.m │ └── test_adjoint.m ├── HW10 │ ├── .gitignore │ ├── README.md │ ├── adjointShortTimeDCT.m │ ├── forwardShortTimeDCT.m │ ├── handel.mat │ ├── handel.pkl │ ├── handel2.pkl │ ├── listen_to_Handel.ipynb │ ├── listen_to_Handel.m │ ├── listen_to_Handel.py │ ├── my_upsample.m │ ├── project_l1.m │ ├── 
python_functions.py │ └── test_python_functions.py ├── ProjectInformation.md ├── ProjectRubric.pdf └── README.md ├── LICENSE ├── Notes ├── 00_IntroToOptProblems.pdf ├── 00a_metaRules.pdf ├── 01_TypesOfMinimizers_IntroConvexity.pdf ├── 02_ConvexSets_part1.pdf ├── 03_ConvexSets_part2.pdf ├── 04_SeparatingHyperplanes_Farkas.pdf ├── 05_ConvexFunctions_part1.pdf ├── 05_ConvexFunctions_part2.pdf ├── 05_ConvexFunctions_part3_LipschitzGradient.pdf ├── 05_ConvexFunctions_part4_examples.pdf ├── 05_ConvexFunctions_part5_preservingConvexity.pdf ├── 06_ConjugateFunctions.pdf ├── 07_GradientDescent_intro.pdf ├── 08_ExistenceUniquenessMinimizers.pdf ├── 09_ProximityOperators.pdf ├── 10_OptimizationProblems.pdf ├── 11_FirstViewLagrangeMultipliers.pdf ├── 12_ConicOptimizationProblems.pdf ├── 13_moreOnSDPs.pdf ├── 14_LagrangianAndDualProblem.pdf ├── 15_MoreDuality.pdf ├── 16_SaddlePtsSharedLagrangians.pdf ├── 17_GameTheoryConnections.pdf ├── 18_FenchelRockafellarDuality.pdf ├── 19_KKT_and_complementarySlackness.pdf ├── 22_ProximalGradientDescent_convergenceAnalysis.pdf ├── 22a_ProximalGradientDescent_motivation.pdf ├── 23_NesterovAcceleration_convergenceAnalysis.pdf ├── 24_ConvergenceRates.pdf ├── 25_ConjugateGradientMethod.pdf ├── 26_QuasiNewtonMethods.pdf ├── 28_FindingGradientsByHand.pdf ├── 29_AutomaticDifferentiation.pdf ├── 29a_AdjointStateMethod.pdf ├── 30_GradientsParameterizedFunctions.pdf ├── 31_NewtonAndIPM.pdf ├── 32_ADMM_DRS_PrimalDual.pdf ├── 33_LinearPrograms.pdf ├── 34_IntegerLinearPrograms.pdf ├── README.md ├── appendix_notes_01.pdf ├── supplement_Geometry_Differentiability.pdf ├── supplement_LagrangeMultipliersIn2D.pdf ├── supplement_Slater_PrimalNotAchieved.pdf ├── supplement_VariationalInequalities_and_LCP.pdf ├── supplement_convergenceIteratesGradientDescent.pdf └── supplement_dualityPracticeProblem.pdf ├── README.md ├── SlideshowAllPresentations_4720Fall18.pdf ├── SlideshowAllPresentations_5630_Spring21.pdf ├── TypedNotes ├── APPM5720Notes.pdf ├── 
APPM5720Notes.tex ├── README.md ├── lecture_notes.pdf ├── lecture_notes_tex │ ├── lec_01.tex │ ├── lec_02.tex │ ├── lec_03.tex │ ├── lec_04.tex │ ├── lec_05.tex │ ├── lec_06.tex │ ├── lec_07.tex │ ├── lec_08.tex │ ├── lec_09.tex │ ├── lec_10.tex │ ├── lec_11.tex │ ├── lec_12.tex │ ├── lec_13.tex │ ├── lec_14.tex │ ├── lec_15.tex │ ├── lec_16.tex │ ├── lec_17.tex │ ├── lec_18.tex │ ├── lec_19.tex │ ├── lec_20.tex │ ├── lec_21.tex │ ├── lec_22.tex │ ├── lec_23.tex │ ├── lec_24.tex │ ├── lec_25.tex │ ├── lec_26.tex │ ├── lec_27.tex │ ├── lec_28.tex │ ├── lec_29.tex │ ├── lec_30.tex │ ├── lec_31.tex │ ├── lec_32.tex │ ├── lec_33.tex │ ├── lec_34.tex │ ├── lec_35.tex │ ├── lec_36.tex │ ├── lec_37.tex │ ├── lec_38.tex │ ├── lec_39.tex │ ├── master.tex │ └── preamble.tex └── notes.sty ├── policies.md ├── policies_CU.pdf ├── syllabus.md └── utilities ├── APPM5630_utilities.ipynb ├── README.md ├── firstOrderMethods.py ├── fminunc_wrapper_simple.m └── secondOrderMethods.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /APPM4720_5720_Fall2018_Syllabus.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/APPM4720_5720_Fall2018_Syllabus.pdf -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/.gitignore: -------------------------------------------------------------------------------- 1 | .trash/ 2 | Df1* 3 | -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/f1.m: -------------------------------------------------------------------------------- 1 | function [y] = f1(x,scale) 2 | % Like the simple example we used in class. 
3 | % https://en.wikipedia.org/wiki/Automatic_differentiation 4 | 5 | y = scale*(x(1)*x(2) + sin(x(1))); 6 | 7 | end 8 | -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/f1_demo.m: -------------------------------------------------------------------------------- 1 | function [] = f1_demo() 2 | %function [y] = f1(x,scale) 3 | %% Like the simple example we used in class. 4 | %% https://en.wikipedia.org/wiki/Automatic_differentiation 5 | % 6 | % y = scale*(x(1)*x(2) + sin(x(1))); 7 | % 8 | %end 9 | 10 | % print source 11 | type f1 12 | fprintf('\n'); 13 | 14 | % Set up inputs (and optionally parameters) 15 | x = adigatorCreateDerivInput([2 1], 'x'); 16 | aux = adigatorCreateAuxInput([1 1]); 17 | 18 | % Generate the derivative function file 19 | opt = adigatorOptions('overwrite', 1); % overwrite existing generated AD sources 20 | adigator('f1', {x, aux}, 'Df1', opt); 21 | 22 | % Call the derivative 23 | x = [1; 2]; 24 | scale = 1; 25 | x_ad = struct('f', x, 'dx', ones([2 1])); 26 | y = Df1(x_ad, scale) 27 | fprintf('Derivative with ADiGator:\n'); 28 | y.dx 29 | 30 | fprintf('Check derivative with analytic derivative:\n'); 31 | Df1_check(x, scale) 32 | 33 | end 34 | 35 | function [dx] = Df1_check(x, scale) 36 | 37 | dx = scale*[x(2) + cos(x(1)); x(1)]; 38 | 39 | end 40 | -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/AD_demos/ADiGator_demo/polydatafit/.DS_Store -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/Contents.m: -------------------------------------------------------------------------------- 1 | % ADiGator polynomial data fitting Jacobian example 
2 | % 3 | % Copyright 2011-2015 Matthew J. Weinstein and Anil V. Rao 4 | % Distributed under the GNU General Public License version 3.0 5 | % 6 | % ----------------------------------------------------------------------- % 7 | % FILES: 8 | % fit.m - polynomial data fitting function 9 | % fit4numjac.m - polynomial data fitting function to be called by numjac 10 | % main.m - computes Jacobian of fit function using ADiGator and 11 | % numjac -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/fit.m: -------------------------------------------------------------------------------- 1 | function p = fit(x, d, m) 2 | % FIT -- Given x and d, fit() returns p 3 | % such that norm(V*p-d) = min, where 4 | % V = [1, x, x.^2, ... x.^(m-1)]. 5 | 6 | dim_x = size(x, 1); 7 | if dim_x < m 8 | error('x must have at least m entries'); 9 | end 10 | 11 | V = ones(dim_x, 1); 12 | 13 | for count = 1 : (m-1) 14 | V = [V, x.^count]; 15 | end 16 | p = V\d; -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/fit4numjac.m: -------------------------------------------------------------------------------- 1 | function p= fit4numjac(t,x, d, m) 2 | % FIT -- Given x and d, fit() returns p 3 | % such that norm(V*p-d) = min, where 4 | % V = [1, x, x.?2, ... x.?(m-1)]. 5 | 6 | dim_x = size(x, 1); 7 | if dim_x < m 8 | error('x must have at least m entries'); 9 | end 10 | 11 | V = ones(dim_x, 1); 12 | 13 | for count = 1 : (m-1) 14 | V = [V, x.^count]; 15 | end 16 | 17 | p = V \ d; -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/fit_ADiGatorJac.m: -------------------------------------------------------------------------------- 1 | % This code was generated using ADiGator version 1.3 2 | % ©2010-2014 Matthew J. Weinstein and Anil V. 
Rao 3 | % ADiGator may be obtained at https://sourceforge.net/projects/adigator/ 4 | % Contact: mweinstein@ufl.edu 5 | % Bugs/suggestions may be reported to the sourceforge forums 6 | % DISCLAIMER 7 | % ADiGator is a general-purpose software distributed under the GNU General 8 | % Public License version 3.0. While the software is distributed with the 9 | % hope that it will be useful, both the software and generated code are 10 | % provided 'AS IS' with NO WARRANTIES OF ANY KIND and no merchantability 11 | % or fitness for any purpose or application. 12 | 13 | function p = fit_ADiGatorJac(x,d,m) 14 | global ADiGator_fit_ADiGatorJac 15 | if isempty(ADiGator_fit_ADiGatorJac); ADiGator_LoadData(); end 16 | Gator1Data = ADiGator_fit_ADiGatorJac.fit_ADiGatorJac.Gator1Data; 17 | % ADiGator Start Derivative Computations 18 | %User Line: % FIT -- Given x and d, fit() returns p 19 | %User Line: % such that norm(V*p-d) = min, where 20 | %User Line: % V = [1, x, x.^2, ... x.^(m-1)]. 21 | dim_x.f = size(x.f,1); 22 | %User Line: dim_x = size(x, 1); 23 | cadaconditional1 = lt(dim_x.f,m); 24 | %User Line: cadaconditional1 = dim_x < m; 25 | V.f = ones(dim_x.f,1); 26 | %User Line: V = ones(dim_x, 1); 27 | cada1f1 = m - 1; 28 | cadaforvar1.f = 1:cada1f1; 29 | %User Line: cadaforvar1 = 1 : (m-1); 30 | V.dx = zeros(700,1); 31 | V.f(100,8) = 0; 32 | for cadaforcount1 = 1:7 33 | count.f = cadaforvar1.f(:,cadaforcount1); 34 | %User Line: count = cadaforvar1(:,cadaforcount1); 35 | cada1f1dx = count.f.*x.f(:).^(count.f-1).*x.dx; 36 | cada1f1dx((x.f(:) == 0 & x.dx == 0) | count.f == 0) = 0; 37 | cada1f1 = x.f.^count.f; 38 | V.dx = V.dx(Gator1Data.Index4,1); 39 | V.f = V.f(:,1:7); 40 | cada1td1 = zeros(700,1); 41 | cada1td1(logical(Gator1Data.Index1(:,cadaforcount1))) = V.dx(nonzeros(Gator1Data.Index1(:,cadaforcount1))); 42 | cada1td1(logical(Gator1Data.Index2(:,cadaforcount1))) = cada1f1dx(nonzeros(Gator1Data.Index2(:,cadaforcount1))); 43 | V.dx = cada1td1; 44 | cada1tempf1 = 
[V.f(:,1:Gator1Data.Index3(cadaforcount1)),cada1f1]; 45 | V.f = zeros(100,8); 46 | V.f(:,1:size(cada1tempf1,2)) = cada1tempf1; 47 | %User Line: V = [V, x.^count]; 48 | end 49 | cada1tf3 = V.f\d; 50 | cada1td1 = zeros(8,100); 51 | cada1td1(Gator1Data.Index5) = V.dx; 52 | cada1td1 = cada1tf3.'*cada1td1; 53 | cada1td1 = cada1td1(:); 54 | cada1td3 = cada1td1(Gator1Data.Index6); 55 | cada1tf4 = V.f.'; 56 | cada1td1 = sparse(Gator1Data.Index7,Gator1Data.Index8,cada1td3,100,100); 57 | cada1td1 = cada1tf4*cada1td1; 58 | cada1td1 = cada1td1(:); 59 | cada1td4 = full(cada1td1(Gator1Data.Index9)); 60 | cada1tf4 = (V.f*cada1tf3 - d).'; 61 | cada1td1 = sparse(Gator1Data.Index10,Gator1Data.Index11,V.dx,100,700); 62 | cada1td1 = cada1tf4*cada1td1; 63 | cada1td1 = cada1td1(:); 64 | cada1td5 = full(cada1td1(Gator1Data.Index12)); 65 | cada1td3 = cada1td4; 66 | cada1td3(Gator1Data.Index13) = cada1td3(Gator1Data.Index13) + cada1td5; 67 | cada1tf4 = -(V.f.'*V.f); 68 | cada1td1 = zeros(8,100); 69 | cada1td1(Gator1Data.Index14) = cada1td3; 70 | cada1td1 = cada1tf4\cada1td1; 71 | cada1td1 = cada1td1(:); 72 | p.dx = cada1td1(Gator1Data.Index15); 73 | p.f = cada1tf3; 74 | %User Line: p = V\d 75 | p.dx_size = [8,100]; 76 | p.dx_location = Gator1Data.Index16; 77 | end 78 | 79 | 80 | function ADiGator_LoadData() 81 | global ADiGator_fit_ADiGatorJac 82 | ADiGator_fit_ADiGatorJac = load('fit_ADiGatorJac.mat'); 83 | return 84 | end -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/fit_ADiGatorJac.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/AD_demos/ADiGator_demo/polydatafit/fit_ADiGatorJac.mat -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/fit_Jac.m: 
-------------------------------------------------------------------------------- 1 | % function [Jac,Fun] = fit_Jac(x,d,m) 2 | % 3 | % Jacobian wrapper file generated by ADiGator 4 | % ©2010-2014 Matthew J. Weinstein and Anil V. Rao 5 | % ADiGator may be obtained at https://sourceforge.net/projects/adigator/ 6 | % Contact: mweinstein@ufl.edu 7 | % Bugs/suggestions may be reported to the sourceforge forums 8 | % DISCLAIMER 9 | % ADiGator is a general-purpose software distributed under the GNU General 10 | % Public License version 3.0. While the software is distributed with the 11 | % hope that it will be useful, both the software and generated code are 12 | % provided 'AS IS' with NO WARRANTIES OF ANY KIND and no merchantability 13 | % or fitness for any purpose or application. 14 | 15 | function [Jac,Fun] = fit_Jac(x,d,m) 16 | gator_x.f = x; 17 | gator_x.dx = ones(100,1); 18 | p = fit_ADiGatorJac(gator_x,d,m); 19 | Jac = reshape(p.dx,[8 100]); 20 | Fun = p.f; 21 | end -------------------------------------------------------------------------------- /Demos/AD_demos/ADiGator_demo/polydatafit/main.m: -------------------------------------------------------------------------------- 1 | % This file uses both MATLAB finite differences as well as adigator in order 2 | % to compute derivatives of the fit.m function. The user can change m and n 3 | % to change to problem size. 4 | % Copyright 2011-2014 Matthew J. Weinstein and Anil V. 
Rao 5 | % Distributed under the GNU General Public License version 3.0 6 | m = 8; 7 | n = 100; 8 | TOL = 1e-5; 9 | 10 | x = floor(rand(n,1)*1000)/1000; 11 | d = floor(rand(n,1)*1000)/1000; 12 | numeval = 25; 13 | 14 | % Create the Jacobian file 15 | tic 16 | gx = adigatorCreateDerivInput([n,1],'x'); 17 | adigatorGenJacFile('fit',{gx,d,m}); 18 | gentime = toc; 19 | 20 | 21 | % Use the ADiGator generated Jacobian file 22 | tic 23 | for i = 1:numeval 24 | [J,p] = fit_Jac(x,d,m); 25 | end 26 | adigatortime = toc/numeval; 27 | 28 | 29 | % numerically compute the Jacobian using FD 30 | tic 31 | for i = 1:numeval 32 | dpdx2 = numjac(@(t,x)fit4numjac(t,x,d,m),0,x,p,TOL*ones(n,1),[],0); 33 | end 34 | fdtime = toc/numeval; 35 | 36 | 37 | fprintf('Derivatives of fit function:\n'); 38 | fprintf(['m = %1.0f, n = %1.0f, TOL = ',num2str(TOL),'\n'],m,n); 39 | fprintf(['ADiGator File Generation Time: ',num2str(gentime),'\n']); 40 | fprintf(['ADiGator Average Eval Time: ',num2str(adigatortime),'\n']); 41 | fprintf(['F Diff Average Eval Time: ',num2str(fdtime),'\n']); 42 | -------------------------------------------------------------------------------- /Demos/AD_demos/README.md: -------------------------------------------------------------------------------- 1 | # Automatic Differentiation demos 2 | 3 | - **Python**: see [../AutomaticDifferentiation.ipynb](../AutomaticDifferentiation.ipynb) 4 | - **Julia**: see [ForwardDiff.jl/](ForwardDiff.jl/) package. Julia has a rich ecosystem of autodiff tools, which are constantly evolving, so we have not attempted to be up-to-date 5 | - A [discourse post](https://discourse.julialang.org/t/state-of-automatic-differentiation-in-julia/43083) from about 2020 (and see followup comments) lists about 20 packages: ForwardDiff, 6 | ForwardDiff2, Nabla, Tracker, Yota, Zygote, ReverseDiff, AutoGrad, NiLang, ModelingToolkit, XGrad, Calculus, FiniteDifferences, FiniteDiff, TaylorSeries, DualNumbers, HyperDualNumbers, Knet, Capstan, Flux, ... 
7 | - **Matlab**: historically, Matlab hasn't had a rich autodiff community. When this class first ran in 2016, we used ADiGator (see [ADiGator_demo/](ADiGator_demo/)) 8 | - As of version R2021a, with the Deep Learning Toolbox, there is now much better native support. See, e.g., [mathworks.com/help/deeplearning/ug/include-automatic-differentiation.html](https://www.mathworks.com/help/deeplearning/ug/include-automatic-differentiation.html). 9 | -------------------------------------------------------------------------------- /Demos/BlockMultiplies/MatrixMultiplyDemo.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/BlockMultiplies/MatrixMultiplyDemo.mlx -------------------------------------------------------------------------------- /Demos/BlockMultiplies/MultiplyMatrices_C.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/BlockMultiplies/MultiplyMatrices_C.pdf -------------------------------------------------------------------------------- /Demos/BlockMultiplies/README.md: -------------------------------------------------------------------------------- 1 | # Block computation demo 2 | 3 | For discussion of block/mini-batch methods 4 | 5 | See the demo script: 6 | 7 | [Demo in HTML format](http://htmlpreview.github.io/?https://github.com/stephenbeckr/CambridgeOptimisationCourse/blob/master/BlockComputation_demo/MatrixMultiplyDemo.html) 8 | (this link uses the nice htmlpreview.github.io website; if you just click the raw html file, it will not render) 9 | -------------------------------------------------------------------------------- /Demos/BlockMultiplies/compareSpeed_homegrown_vs_MKL.m: 
-------------------------------------------------------------------------------- 1 | %{ 2 | Compare the speed of matrix multiplication, of my simple C for loop 3 | vs using Matlab's version (which calls Intel's MKL BLAS) 4 | 5 | %} 6 | 7 | % Compile, if not already done: 8 | % mex multiplyMatrices.c -O 9 | 10 | % First, convince you that my function gives the right answer 11 | A = randn(50,51); 12 | B = randn(51,52); % pick rectangular, helps find bugs 13 | C = multiplyMatrices(A,B); 14 | err = norm( C - A*B, 'fro' )/norm( A*B, 'fro' ); 15 | fprintf('Error is: %g\n', err ); 16 | 17 | %% Now, try some speed tests 18 | 19 | nList = [50,100,500,800,1e3,1.2e3,1.5e3]; 20 | nReps = 3; 21 | [results1,results2] = deal( zeros( nReps,length(nList) ) ); 22 | for ni = 1:length(nList) 23 | n = nList(ni); 24 | A = randn(n); 25 | B = randn(n); 26 | 27 | for rep = 1:nReps 28 | tic; 29 | C = multiplyMatrices( A, B ); 30 | t = toc; 31 | 32 | results1(rep,ni) = t; 33 | 34 | tic; 35 | C = A*B; 36 | t = toc; 37 | 38 | results2(rep,ni) = t; 39 | end 40 | end 41 | 42 | %% Plot 43 | figure(1); clf; 44 | loglog( nList, mean(results1), 'o--', 'linewidth', 2,'markersize',10 ); 45 | hold all 46 | loglog( nList, mean(results2), '*--', 'linewidth', 2,'markersize',10 ); 47 | set(gca,'fontsize',20); 48 | legend('My implementation','Matlab''s implementation','location','northwest'); 49 | title('Time to multiply n x n matrices'); 50 | ylabel('Time (s)'); 51 | xlabel('Dimension n'); 52 | xlim([50,1.6e3]); 53 | text( 100,.5, 'At n=1500, C code took 27 sec, Matlab took 0.17 sec' ); 54 | export_fig 'MultiplyMatrices_C' '-pdf' '-transparent' -------------------------------------------------------------------------------- /Demos/BlockMultiplies/matrixMultiply.m: -------------------------------------------------------------------------------- 1 | %{ 2 | Demonstration of effect of memory/communication cost 3 | on performance 4 | 5 | Stephen Becker, June 11 2018 6 | %} 7 | 8 | n = 5e3; 9 | A = randn(n); 10 
| X = randn(n); 11 | 12 | blockSizes = [1, 5, 10, 20, 100, 1e3, 5e3 ]; 13 | times = zeros( length(blockSizes), 1 ); 14 | 15 | for i = 1:length(blockSizes) 16 | fprintf('i is %d\n', i ); 17 | block = blockSizes( i ); 18 | if block < 1e2 19 | reps = round( 1e2/block ); 20 | elseif block < 1e3 21 | reps = round( 1e3/block ); 22 | else 23 | reps = round( 1e4/block ); 24 | end 25 | % to get accurate timing, we'll repeat this 26 | t1 = tic; 27 | for trial = 1:reps 28 | y = A*X(:,1:block); 29 | end 30 | tm = toc(t1); 31 | tm = tm/reps; 32 | times(i) = tm; 33 | end 34 | %% Display results 35 | figure(1); clf; 36 | loglog( blockSizes, times, 'o--','markersize',12,'linewidth',2 ) 37 | set(gca,'fontsize',18); 38 | xlabel('n'); 39 | ylabel('Time to multiply A*X(:,1:n)'); 40 | hold all 41 | loglog( blockSizes, blockSizes*times(2)/blockSizes(2), ':','linewidth',2 ) 42 | loglog( blockSizes, blockSizes*times(5)/blockSizes(5), ':','linewidth',2 ) 43 | ylim([1e-2,10]); 44 | 45 | %% Display them another way 46 | figure(2); clf; 47 | semilogx( blockSizes, times.*(n./blockSizes'), 'o--','markersize',12,'linewidth',2 ) 48 | set(gca,'fontsize',18); 49 | xlabel('n'); 50 | ylabel('Time to multiply A*X'); 51 | hold all -------------------------------------------------------------------------------- /Demos/BlockMultiplies/multiplyMatrices.c: -------------------------------------------------------------------------------- 1 | #if defined(__GNUC__) && !(defined(__clang__)) && defined(NEEDS_UCHAR) 2 | #include 3 | #endif 4 | #include 5 | #include "mex.h" 6 | 7 | 8 | void mexFunction( int nlhs, mxArray *plhs[], 9 | int nrhs, const mxArray*prhs[] ) 10 | { 11 | double *A, *B, *C; 12 | mwSize m, n, k; 13 | mwIndex i, j, u; 14 | 15 | if ( nrhs != 2 ) { 16 | mexPrintf("Usage: C = multiplyMatrices(A,B)\n"); 17 | mexErrMsgIdAndTxt("MATLAB:mexFile:invlaidNumInputs","Need 2 inputs"); 18 | } 19 | 20 | A = mxGetPr( prhs[0] ); /* A is m x k */ 21 | m = mxGetM( prhs[0] ); 22 | k = mxGetN( prhs[0] ); 23 | 24 
| B = mxGetPr( prhs[1] ); /* A is m x k */ 25 | if (k != mxGetM( prhs[1] ) ){ 26 | mexPrintf("Usage: C = multiplyMatrices(A,B), where size(A,2)==size(B,1)\n"); 27 | mexErrMsgIdAndTxt("MATLAB:mexFile:invlaidNumInputs","Wrong size"); 28 | } 29 | n = mxGetN( prhs[1] ); 30 | 31 | plhs[0] = mxCreateDoubleMatrix( m, n, mxREAL ); 32 | C = mxGetPr( plhs[0] ); 33 | 34 | 35 | /* Now, the actual computation */ 36 | for (i=0; i> 0 ] # X is PSD 135 | 136 | prob = cvx.Problem(obj, constraints) 137 | prob.solve() 138 | 139 | print_status(prob, X) 140 | 141 | def problem9(): 142 | A, y = get_vars() 143 | print('Rerun Problem 1 without parameterizing ...') 144 | x = cvx.Variable(10) 145 | obj = cvx.Minimize(cvx.norm(x)) 146 | constraints = [cvx.norm(A@x - y) <= 0.1] 147 | prob = cvx.Problem(obj, constraints) 148 | t = time.time() 149 | prob.solve() 150 | elapsed = time.time() - t 151 | print(f" Elapsed time: {elapsed} seconds.") 152 | print('Now parameterize y ...') 153 | b = cvx.Parameter(5) 154 | obj = cvx.Minimize(cvx.norm(x)) 155 | constraints = [cvx.norm(A@x - b) <= 0.1] 156 | prob = cvx.Problem(obj, constraints) 157 | 158 | for i in range(10): 159 | b.value = np.random.rand(5) 160 | t = time.time() 161 | prob.solve() 162 | elapsed = time.time() - t 163 | print(f" {i=}, Elapsed time: {elapsed} seconds.") 164 | 165 | if __name__ == '__main__': 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('num', type=int, help='number of problem to run', default=1) 168 | args = parser.parse_args() 169 | 170 | num = args.num 171 | if num < 1 or num > 9: 172 | raise argparse.ArgumentError('Problem number should be in [1..9]') 173 | 174 | problem_funs = [eval('problem'+str(i)) for i in range(1,10)] 175 | problem_funs[num-1]() 176 | 177 | 178 | # vim: set ts=3 sw=3 sts=3 et : 179 | -------------------------------------------------------------------------------- /Fall2018_day-by-day_schedule.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Fall2018_day-by-day_schedule.pdf -------------------------------------------------------------------------------- /Handouts/FirmlyNonexpansive.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/FirmlyNonexpansive.pdf -------------------------------------------------------------------------------- /Handouts/FixedPtTheorems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/FixedPtTheorems.pdf -------------------------------------------------------------------------------- /Handouts/StrongConvexityLipschitz.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/StrongConvexityLipschitz.pdf -------------------------------------------------------------------------------- /Handouts/StrongConvexityLipshitz.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/StrongConvexityLipshitz.pdf -------------------------------------------------------------------------------- /Handouts/SubOptimalityBounds.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/SubOptimalityBounds.pdf -------------------------------------------------------------------------------- /Handouts/SubgradientDescent.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/SubgradientDescent.pdf -------------------------------------------------------------------------------- /Homeworks/APPM5630Spring25Homework01-02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework01-02.pdf -------------------------------------------------------------------------------- /Homeworks/APPM5630Spring25Homework03-04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework03-04.pdf -------------------------------------------------------------------------------- /Homeworks/APPM5630Spring25Homework05-06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework05-06.pdf -------------------------------------------------------------------------------- /Homeworks/APPM5630Spring25Homework07-08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework07-08.pdf -------------------------------------------------------------------------------- /Homeworks/APPM5630Spring25Homework09-10.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework09-10.pdf -------------------------------------------------------------------------------- /Homeworks/HW01_helper_polyhedrality.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW01_helper_polyhedrality.pdf -------------------------------------------------------------------------------- /Homeworks/HW04/implicit2explicit.m: -------------------------------------------------------------------------------- 1 | function A = implicit2explicit(Afun,m,n) 2 | %IMPLICIT2EXPLICIT takes a linear function Afun(x) and builds the corresponding matrix 3 | % Makes an explicit matrix using the linear function 4 | % in the function handle "Afun", where the domain is R^n 5 | % and the range is in R^m 6 | % 7 | % Usage: implicit2explicit(Afun,m,n) 8 | % 9 | % If n = [n1,n2], the domain is the space of n1 x n2 matrices 10 | % Output of Afun should always be a m x 1 column vector 11 | % 12 | % Stephen Becker, stephen.becker@colorado.edu, 2/13/2017 13 | 14 | if nargin < 3, n = m; end 15 | 16 | A = zeros(m,prod(n)); 17 | if numel(n) == 1 18 | e = zeros(n,1); 19 | else 20 | if numel(n) ~= 2, error('bad value for size of domain'); end 21 | e = zeros(n(1),n(2)); 22 | end 23 | for j = 1:prod(n) 24 | e(j) = 1; 25 | A(:,j) = Afun(e); 26 | e(j) = 0; 27 | end 28 | -------------------------------------------------------------------------------- /Homeworks/HW04/quadraticObjective.m: -------------------------------------------------------------------------------- 1 | function [f,g] = quadraticObjective(x,A,b, At, c) 2 | % f = quadraticObjective( x, A, b ) 3 | % computes f(x) = 1/2 || Ax-b ||^2 4 | % where "A" is a linear operator (either a matrix or a function handle) 5 | % 6 | % [f,g] = ... 
7 | % also returns the gradient g(x) = \nabla f(x) = A'*(A*x-b) 8 | % 9 | % ... = quadraticObjective( x, A, b, At ) 10 | % uses "At" for the adjoint of the linear operator "A". 11 | % This is only necessary if "A" is a function handle 12 | % 13 | % ... = quadraticObjective( x, A, b, At, c ) 14 | % uses f(x) = 1/2 ||Ax-b||^2 + dot(c,x), and the gradient gains the term c 15 | % 16 | % This form is suitable for most of Matlab's solvers 17 | % and for 3rd party packages like Mark Schmidt's minFunc 18 | % 19 | % Stephen.Becker@Colorado.edu, 2/13/2017 20 | 21 | if ~isa( A, 'function_handle') 22 | At = @(x) A'*x; 23 | A = @(x) A*x; % overload notation 24 | elseif nargin < 4 25 | error('If "A" is a function hande, need 4 inputs'); 26 | end 27 | if nargin < 5 28 | c = []; % no linear term unless the optional 5th input is supplied 29 | end 30 | 31 | r = A(x) - b; % residual 32 | f = norm( r )^2/2; 33 | if ~isempty(c) 34 | f = f + dot(c,x); 35 | end 36 | if nargout >= 2 37 | g = At(r); 38 | if ~isempty(c) 39 | g = g + c; 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /Homeworks/HW04/test_adjoint.m: -------------------------------------------------------------------------------- 1 | function test_adjoint( A, At, sz, nRep ) 2 | % TEST_ADJOINT( A, At, sz ) 3 | % tests whether the function handles A and At are 4 | % adjoints (= conjugate transpose) of each other. 
function y = adjointShortTimeDCT( coeff, win, Ntrue )
% y = adjointShortTimeDCT( coeff, win )
%   applies the adjoint/transpose MDCT to the coefficients "coeff".
%   This is also the pseudo-inverse of the forward MDCT.
%
% y = adjointShortTimeDCT( coeff, win, N_original )
%   should be used when the forward MDCT was applied to a signal of
%   length N_original. The output is then truncated to N_original
%   samples, which undoes the zero-padding (done when N_original is
%   not a multiple of the blockSize).
%
% "coeff" is split in half: the first half holds the coefficients of the
% aligned blocks, the second half those of the 50%-shifted blocks.
%
% see forwardShortTimeDCT.m for an example of the window "win"
%
% Stephen Becker, 3/18/2017
% See also forwardShortTimeDCT.m

halfLen = length(coeff)/2;          % samples per coefficient half
blkLen  = length(win);              % block size is encoded by the window
numBlk  = ceil( halfLen/blkLen );

if nargin < 3
    Ntrue = [];                     % no truncation requested
else
    assert( Ntrue <= halfLen );
end

% Diagonal window operator, applied to every block (column) at once
W = spdiags(win(:),0,blkLen,blkLen);

% First half: blocks aligned with the start of the signal
aligned = W*idct( reshape( coeff(1:halfLen), blkLen, numBlk ) );

% Second half: blocks that were shifted by half a block in the forward pass
shifted = W*idct( reshape( coeff(halfLen+1:end), blkLen, numBlk ) );

% Undo the half-block shift, then overlap-add the two contributions
y = aligned(:) + circshift( shifted(:), blkLen/2 );

if ~isempty(Ntrue)
    y = y(1:Ntrue);
end
9 | % 10 | % Note: this function zero-pads y to be an even multiple of blockSize 11 | % 12 | % An example window that we recommend, so that 13 | % the transpose of this function is its pseudo-inverse, 14 | % is: 15 | % win = sin( pi*( (1:blockSize) + 1/2)/(blockSize) ); 16 | % (a typical value of blockSize = 1024) 17 | % This satisifes the Princen-Bradley conditions, meaning that 18 | % we can guarantee win.^2 + circshift( win, blockSize/2).^2 = 1 19 | % 20 | % Stephen Becker, 3/18/2017 21 | % See also adjointShortTimeDCT.m 22 | 23 | N = length(y); 24 | blockSize = length(win); 25 | % Make y be a multiple of the blockSize by zero-padding 26 | nBlocks = ceil( N/blockSize ); 27 | y = [y; zeros(blockSize*nBlocks-N,1) ]; 28 | 29 | Win = spdiags(win(:),0,blockSize,blockSize); 30 | 31 | Y = reshape( y, blockSize, nBlocks ); 32 | C = dct( Win*Y ); 33 | 34 | % and 50% shift 35 | Y = reshape( circshift(y,-blockSize/2), blockSize, nBlocks ); 36 | C2 = dct( Win*Y ); 37 | 38 | coeff = [ C(:); C2(:) ]; 39 | % or instead, intersperse 40 | % coeff = [C;C2]; coeff = coeff(:); 41 | -------------------------------------------------------------------------------- /Homeworks/HW10/handel.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW10/handel.mat -------------------------------------------------------------------------------- /Homeworks/HW10/handel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW10/handel.pkl -------------------------------------------------------------------------------- /Homeworks/HW10/handel2.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW10/handel2.pkl -------------------------------------------------------------------------------- /Homeworks/HW10/listen_to_Handel.m: -------------------------------------------------------------------------------- 1 | %{ 2 | For HW 10 to demonstrate listening to the "Handel" audio file 3 | 4 | Stephen Becker, 3/18/2017 5 | %} 6 | 7 | load handel.mat % loads y, Fs 8 | 9 | %% Play the sound 10 | playerObj = audioplayer(y,Fs); 11 | play( playerObj ) 12 | %% Play the down-sampled sound (by a factor of 2) 13 | playerObj = audioplayer(y(1:2:end),Fs/2); 14 | play( playerObj ) 15 | %% Play the down-sampled sound (by a factor of 4) 16 | playerObj = audioplayer(y(1:4:end),Fs/4); 17 | play( playerObj ) 18 | %% Filter-then-downsample to avoid aliasing 19 | % first low-pass filter it 20 | Rp = 1e-3; % for peak-to-peak ripple 21 | Rst = 1e-3; % for stopband attenuation 22 | ordr= 100; % filter order 23 | eqnum = firceqrip( ordr, 1/4, [Rp Rst], 'passedge'); 24 | % fvtool(eqnum, 'Fs',Fs ); % Visualize the filter's frequency response 25 | lowpassFIR = dsp.FIRFilter('Numerator',eqnum); 26 | yFiltered = lowpassFIR( y ); 27 | 28 | % playerObj = audioplayer(yFiltered,Fs); 29 | playerObj = audioplayer(yFiltered(1:4:end),Fs/4); 30 | play( playerObj ) 31 | 32 | %% Plot spectrograms 33 | % Use pwelch estimate instead of "spectrogram" function 34 | blockSize = 1e3; 35 | win = window( @barthannwin, blockSize ); 36 | figure(1); 37 | subplot(2,2,1); 38 | pwelch( y, win, [], [], Fs ); title('Original signal'); 39 | subplot(2,2,2); 40 | pwelch( y(1:4:end), win, [], [], Fs/4 ); title('Downsampled (aliased)'); 41 | subplot(2,2,3); 42 | pwelch( yFiltered, win, [], [], Fs ); title('Filtered'); 43 | subplot(2,2,4); 44 | pwelch( yFiltered(1:4:end), win, [], [], Fs/4 ); title('Filtered-then-downsampled'); 45 | 46 | 47 | %% and a time-frequency plot 48 | figure(3); 49 | 
def save_wav(filename, data, Fs=44100):
    """Scale a float signal and write it to ``filename`` as a 32-bit float WAV.

    The samples are multiplied by ``2 / (data.max() - data.min())`` so that
    the peak-to-peak span of the written signal equals 2. (The result lies
    in [-1, 1] when the signal is symmetric about zero; no offset is
    removed.) ``Fs`` is truncated to an integer sample rate.
    """
    # assume we're working with floats
    peak_to_peak = data.max() - data.min()
    scaled = 2*data/peak_to_peak
    samples = scaled.astype(np.float32)
    scipy.io.wavfile.write(filename, int(Fs), samples)
plt.xlabel('Frequency [Hz]') 50 | 51 | # perform the filtering with the filter we designed above 52 | yFiltered = sig.lfilter(filt, np.array([1]), y) 53 | fn = 'handel_lp.wav' 54 | save_wav(fn, yFiltered, Fs) 55 | sounds.append(('Handel Lowpass Filtered', fn)) 56 | 57 | # It should now be safe to downsample by a factor of 4 58 | yFiltered4 = yFiltered[::4]; fn = 'handel_lp_dec4.wav' 59 | save_wav(fn, yFiltered4, Fs/4) 60 | sounds.append(('Handel Lowpass Filtered and Downsampled by a factor of 4', fn)) 61 | 62 | # Print some info 63 | print('You should play the following files with your preferred media player') 64 | for desc, fn in sounds: 65 | print('{}:\n --> {}'.format(desc, fn)) 66 | 67 | if do_plots: # estimate PSD 68 | fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2) 69 | 70 | f, Pxx = sig.welch(y, fs=Fs, window='barthann', nperseg=1024, 71 | scaling='density') 72 | ax1.plot(f, 10*np.log10(Pxx)) 73 | ax1.set_title('Original signal') 74 | 75 | f, Pxx = sig.welch(y4, fs=Fs/4, window='barthann', nperseg=128, 76 | scaling='density') 77 | ax2.plot(f, 10*np.log10(Pxx)) 78 | ax2.set_title('Downsampled (x4) (aliased)') 79 | 80 | f, Pxx = sig.welch(yFiltered, fs=Fs, window='barthann', nperseg=1024, 81 | scaling='density') 82 | ax3.plot(f, 10*np.log10(Pxx)) 83 | ax3.set_title('Filtered') 84 | 85 | f, Pxx = sig.welch(yFiltered4, fs=Fs/4, window='barthann', nperseg=128, 86 | scaling='density') 87 | ax4.plot(f, 10*np.log10(Pxx)) 88 | ax4.set_title('Filtered then Downsampled') 89 | 90 | for ax in (ax1, ax2, ax3, ax4): 91 | ax.set_xlabel('Frequency [Hz]') 92 | ax.set_ylabel('Power Spectral Density [dB/Hz]') 93 | 94 | if do_plots: # time-frequency raster 95 | fig = plt.figure() 96 | f, t, Sxx = sig.spectrogram(y, fs=Fs, window='barthann', nperseg=512, 97 | scaling='density') 98 | plt.pcolormesh(f, t, 10*np.log10(Sxx.T)) 99 | plt.colorbar() 100 | plt.xlabel('Frequency [Hz]') 101 | plt.ylabel('Time [sec]') 102 | 103 | if do_plots: plt.show() 104 | 105 | if __name__ == 
function y = project_l1(x, tau)
% y = project_l1( x, tau )
%   projects x onto the scaled l1 ball, ||x||_1 <= tau
%   If tau is not provided, default is tau = 1.
%
%   If x is a matrix, the operation is performed along
%   each column.
%   A row vector is treated as a single vector and returned as a row.
%
% Stephen Becker and Emmanuel Candes, 2009/2010
% Crucial bug fix: 3/17/2017

if nargin < 2, tau = 1; end

% Nothing to project (and cs(end,:) below would fail on empty input)
if isempty(x)
    y = x;
    return;
end

row_vec = 0;
if size(x,1) == 1 && size(x,2) > 1
    row_vec = 1;
    x = x(:);
end

absx = abs(x);
s    = sort(absx, 1, 'descend');
cs   = cumsum(s, 1);

% Columns whose l1 norm exceeds tau.
% If in I, then the column is not feasible, and we must project;
% if not in I, then the column is already feasible and is copied as-is.
I = find(cs(end,:) > tau);
y = x;

% Do projections where needed
for i = 1:numel(I)
    % "ind" is the actual column to project; indexing with the loop
    % counter "i" instead would project the first numel(I) columns
    % rather than the infeasible ones.
    ind = I(i);

    thresh   = get_vector_thresh(x(:,ind), tau, s(:,ind), cs(:,ind));
    y(:,ind) = sign(x(:,ind)).*max(absx(:,ind) - thresh, 0);
end

if row_vec % restore size
    y = y.';
end

end
def project_l1(x, tau=1.):
    """
    project_l1(x, tau) -> y
      projects x onto the scaled l1 ball, ||x||_1 <= tau
      If tau is not provided, the default is tau = 1.

    x may be an ndarray of any shape or an array-like (e.g. a list); all
    entries are treated as one vector (the sort is over the flattened
    array) and the result has the same shape as x. If x is already
    feasible (or empty) it is returned unchanged, without copying.

    Stephen Becker and Emmanuel Candes, 2009/2010.
    Crucial bug fix: 3/17/2017, SRB
    """
    x = np.asarray(x)  # accept lists/tuples; no copy for ndarray input
    n = x.size
    if n == 0:
        # Empty input is trivially feasible (and cs[-1] below would fail)
        return x

    absx = np.abs(x)
    s = np.sort(absx, axis=None)[::-1]  # sort in descending order
    cs = np.cumsum(s)

    if cs[-1] <= tau:
        # x is already feasible, so no thresholding needed
        return x

    # Check some "discrete" levels of shrinkage, e.g. [s(2:end),0]
    # This lets us discover which indices will be nonzero.
    # Shrinking by the (k+1)-th largest magnitude leaves an l1 norm of
    # cs[k] - (k+1)*s[k+1]; the first k where that is still >= tau fixes
    # the number (k+1) of entries that stay nonzero.
    next_level = np.concatenate((s[1:], 0), axis=None)
    i_tau = np.where(cs - np.arange(1, n+1)*next_level >= tau)[0][0]

    # Now that we know which indices are involved, it's a very simple problem
    thresh = (cs[i_tau]-tau) / (i_tau+1)

    # Shrink x by the amount "thresh"
    return np.sign(x)*np.maximum(absx - thresh, 0)
def adjointShortTimeDCT(coeff, win, Ntrue=None):
    """
    adjointShortTimeDCT(coeff, win, Ntrue=None) -> y

    Applies the adjoint/transpose Modified DCT to the coefficients coeff.
    This is also the pseudo-inverse of the forward MDCT.

    coeff is split in half: the first half holds the DCT coefficients of
    the aligned blocks, the second half those of the blocks shifted by
    half a block. The block size is win.size.

    If Ntrue=N_original, where N_original is the original length of
    the signal y (i.e., before zero-padding in forwardShortTimeDCT),
    we truncate the padded zeros and return the original y.
    Ntrue=None (the default) returns the full padded-length signal.

    Raises ValueError if coeff has an odd number of elements, if half
    its length is not a multiple of the block size, or if Ntrue exceeds
    half the number of coefficients.

    See forwardShortTimeDCT for an example of the window win.

    Stephen Becker, 3/18/2017
    See also forwardShortTimeDCT
    """
    if coeff.size % 2:
        raise ValueError("""coeff should have an even number of elements.
        Did you compute coeff with forwardShortTimeDCT?""")
    N = coeff.size // 2

    blockSize = win.size
    nBlocks = int(np.ceil(float(N)/blockSize))
    if nBlocks*blockSize != N:
        # The reshape below would fail anyway; fail with a clear message.
        raise ValueError(
            "Half the number of coefficients ({}) is not a multiple of "
            "the block size ({})".format(N, blockSize))

    if Ntrue is not None and Ntrue > N:
        raise ValueError("""The specified value of Ntrue ({}) is too big
        for the number of coefficients in coeff ({})""".format(
            Ntrue, N))

    # Column vector so the window scales every block (column) row-wise;
    # equivalent to multiplying by diag(win) from the left.
    window = np.ravel(win)[:, np.newaxis]

    # Aligned blocks
    C = np.reshape(coeff[0:N], (blockSize, nBlocks), order='f')
    Y = window*scipy.fftpack.idct(C, axis=0, norm='ortho')
    y = Y.ravel(order='f')

    # Half-block-shifted blocks: invert, then undo the shift
    C2 = np.reshape(coeff[N:], (blockSize, nBlocks), order='f')
    Y2 = window*scipy.fftpack.idct(C2, axis=0, norm='ortho')
    y2 = np.roll(Y2.ravel(order='f'), int(blockSize/2))
    y += y2

    # Compare against None explicitly: Ntrue == 0 is a valid request for
    # an empty signal and must not be treated as "not given".
    if Ntrue is not None:
        y = y[0:Ntrue]

    return y
import pickle

import numpy as np
import scipy
import scipy.io

from python_functions import (adjointShortTimeDCT, forwardShortTimeDCT,
                              my_upsample, project_l1)


def convert_handel():
    """Convert handel.mat into the two pickle files used by the homework."""
    mat = scipy.io.loadmat('handel.mat')
    y = mat['y']
    Fs = mat['Fs']

    # Default protocol: NOT backwards compatible with python2
    with open('handel.pkl', 'wb') as fh:
        pickle.dump((y, Fs), fh)
    # protocol=2 can still be read by python2
    with open('handel2.pkl', 'wb') as fh:
        pickle.dump((y, Fs), fh, protocol=2)

    # load with
    with open('handel.pkl', 'rb') as fh:
        y, Fs = pickle.load(fh)


def test_project_l1():
    """Run project_l1 on the vector stored in x.mat (smoke test only)."""
    # JMF 25/03/2017: tested with row vec, col vec, and mats; matches project_l1.m
    mat = scipy.io.loadmat('x.mat')
    x = mat['x']

    y = project_l1(x, 1)


def test_STDCT():
    """Print the reconstruction error of a forward/adjoint MDCT round trip."""
    mat = scipy.io.loadmat('x.mat')
    x = mat['x'].ravel()
    # Reference values stored alongside x; loaded for manual comparison,
    # they are not checked here.
    coeff_ref = mat['coeff'].ravel()
    win_ref = mat['win'].ravel()

    coeff, win = forwardShortTimeDCT(x)

    xrec = adjointShortTimeDCT(coeff, win, x.size)
    print(np.linalg.norm(x-xrec))


def test_my_upsample():
    """Print a random matrix and its zero-filled upsampling for comparison."""
    x = np.random.randn(10, 2)
    sampleSet = np.array([0, 2, 3, 4])
    y = x[sampleSet]
    print(x)
    # The target length is the number of rows of x; x.size would count
    # every entry (rows*columns) and produce a result twice as long.
    print(my_upsample(y, sampleSet, x.shape[0]))


def Manuel():
    """Load the Handel signal and apply the forward transform to it."""
    mat = scipy.io.loadmat('handel.mat')
    y = mat['y']
    Fs = mat['Fs']
    # y,Fs = pickle.load(open('handel2.pkl', 'rb'))
    y = y.ravel()
    N = y.size
    print(N)  # print() call: the statement form is a syntax error in python3

    coeff, win = forwardShortTimeDCT(y)


if __name__ == '__main__':
    #Manuel()
    #convert_handel()
    test_project_l1()
    #test_STDCT()
    #test_my_upsample()
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Stephen Becker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Notes/00_IntroToOptProblems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/00_IntroToOptProblems.pdf -------------------------------------------------------------------------------- /Notes/00a_metaRules.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/00a_metaRules.pdf -------------------------------------------------------------------------------- /Notes/01_TypesOfMinimizers_IntroConvexity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/01_TypesOfMinimizers_IntroConvexity.pdf -------------------------------------------------------------------------------- /Notes/02_ConvexSets_part1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/02_ConvexSets_part1.pdf -------------------------------------------------------------------------------- /Notes/03_ConvexSets_part2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/03_ConvexSets_part2.pdf -------------------------------------------------------------------------------- /Notes/04_SeparatingHyperplanes_Farkas.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/04_SeparatingHyperplanes_Farkas.pdf -------------------------------------------------------------------------------- /Notes/05_ConvexFunctions_part1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part1.pdf -------------------------------------------------------------------------------- /Notes/05_ConvexFunctions_part2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part2.pdf -------------------------------------------------------------------------------- /Notes/05_ConvexFunctions_part3_LipschitzGradient.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part3_LipschitzGradient.pdf -------------------------------------------------------------------------------- /Notes/05_ConvexFunctions_part4_examples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part4_examples.pdf -------------------------------------------------------------------------------- /Notes/05_ConvexFunctions_part5_preservingConvexity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part5_preservingConvexity.pdf 
-------------------------------------------------------------------------------- /Notes/06_ConjugateFunctions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/06_ConjugateFunctions.pdf -------------------------------------------------------------------------------- /Notes/07_GradientDescent_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/07_GradientDescent_intro.pdf -------------------------------------------------------------------------------- /Notes/08_ExistenceUniquenessMinimizers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/08_ExistenceUniquenessMinimizers.pdf -------------------------------------------------------------------------------- /Notes/09_ProximityOperators.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/09_ProximityOperators.pdf -------------------------------------------------------------------------------- /Notes/10_OptimizationProblems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/10_OptimizationProblems.pdf -------------------------------------------------------------------------------- /Notes/11_FirstViewLagrangeMultipliers.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/11_FirstViewLagrangeMultipliers.pdf -------------------------------------------------------------------------------- /Notes/12_ConicOptimizationProblems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/12_ConicOptimizationProblems.pdf -------------------------------------------------------------------------------- /Notes/13_moreOnSDPs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/13_moreOnSDPs.pdf -------------------------------------------------------------------------------- /Notes/14_LagrangianAndDualProblem.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/14_LagrangianAndDualProblem.pdf -------------------------------------------------------------------------------- /Notes/15_MoreDuality.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/15_MoreDuality.pdf -------------------------------------------------------------------------------- /Notes/16_SaddlePtsSharedLagrangians.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/16_SaddlePtsSharedLagrangians.pdf -------------------------------------------------------------------------------- /Notes/17_GameTheoryConnections.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/17_GameTheoryConnections.pdf -------------------------------------------------------------------------------- /Notes/18_FenchelRockafellarDuality.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/18_FenchelRockafellarDuality.pdf -------------------------------------------------------------------------------- /Notes/19_KKT_and_complementarySlackness.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/19_KKT_and_complementarySlackness.pdf -------------------------------------------------------------------------------- /Notes/22_ProximalGradientDescent_convergenceAnalysis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/22_ProximalGradientDescent_convergenceAnalysis.pdf -------------------------------------------------------------------------------- /Notes/22a_ProximalGradientDescent_motivation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/22a_ProximalGradientDescent_motivation.pdf -------------------------------------------------------------------------------- /Notes/23_NesterovAcceleration_convergenceAnalysis.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/23_NesterovAcceleration_convergenceAnalysis.pdf -------------------------------------------------------------------------------- /Notes/24_ConvergenceRates.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/24_ConvergenceRates.pdf -------------------------------------------------------------------------------- /Notes/25_ConjugateGradientMethod.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/25_ConjugateGradientMethod.pdf -------------------------------------------------------------------------------- /Notes/26_QuasiNewtonMethods.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/26_QuasiNewtonMethods.pdf -------------------------------------------------------------------------------- /Notes/28_FindingGradientsByHand.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/28_FindingGradientsByHand.pdf -------------------------------------------------------------------------------- /Notes/29_AutomaticDifferentiation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/29_AutomaticDifferentiation.pdf -------------------------------------------------------------------------------- 
/Notes/29a_AdjointStateMethod.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/29a_AdjointStateMethod.pdf -------------------------------------------------------------------------------- /Notes/30_GradientsParameterizedFunctions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/30_GradientsParameterizedFunctions.pdf -------------------------------------------------------------------------------- /Notes/31_NewtonAndIPM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/31_NewtonAndIPM.pdf -------------------------------------------------------------------------------- /Notes/32_ADMM_DRS_PrimalDual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/32_ADMM_DRS_PrimalDual.pdf -------------------------------------------------------------------------------- /Notes/33_LinearPrograms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/33_LinearPrograms.pdf -------------------------------------------------------------------------------- /Notes/34_IntegerLinearPrograms.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/34_IntegerLinearPrograms.pdf 
-------------------------------------------------------------------------------- /Notes/README.md: -------------------------------------------------------------------------------- 1 | # Handwritten notes 2 | 3 | Created via Microsoft OneNote with a Wacom One tablet, then exporting to PDF and using ghostscript/gs to reduce file-size using /screen preset 4 | -------------------------------------------------------------------------------- /Notes/appendix_notes_01.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/appendix_notes_01.pdf -------------------------------------------------------------------------------- /Notes/supplement_Geometry_Differentiability.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_Geometry_Differentiability.pdf -------------------------------------------------------------------------------- /Notes/supplement_LagrangeMultipliersIn2D.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_LagrangeMultipliersIn2D.pdf -------------------------------------------------------------------------------- /Notes/supplement_Slater_PrimalNotAchieved.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_Slater_PrimalNotAchieved.pdf -------------------------------------------------------------------------------- /Notes/supplement_VariationalInequalities_and_LCP.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_VariationalInequalities_and_LCP.pdf -------------------------------------------------------------------------------- /Notes/supplement_convergenceIteratesGradientDescent.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_convergenceIteratesGradientDescent.pdf -------------------------------------------------------------------------------- /Notes/supplement_dualityPracticeProblem.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_dualityPracticeProblem.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced Convex Optimization class 2 | APPM 5630 at CU Boulder 3 | Prof. Becker, Spring 2025. Meeting time: MWF 11:15 -- 12:05 PM, DUAN G2B21 4 | 5 | Office hours: 6 | - Mon 3-4:30 and Thursday 1-2:30, hybrid (in my office or via zoom; see Canvas for zoom link) 7 | - Our TA Nic Rummel's office hours are Thurs 4:30-5:30 8 | 9 | This repo contains in-class demos and some homework solutions (much of it is from Spring 2023 and 2021 or even Fall 2018 when this class was taught as a special topics course 4720/5720) 10 | 11 | - Here is the [class policy/procedures](policies.md) document (and if you need it for some reason, the [2018 policies/syllabus (PDF)](APPM4720_5720_Fall2018_Syllabus.pdf)). 
12 | - Here is the [syllabus (details on the content)](syllabus.md) document (and if you need it, the [day-by-day schedule](Fall2018_day-by-day_schedule.pdf) from Fall 2018 has even more details on what we covered) 13 | 14 | List of previous semesters' class projects 15 | - [2023 class projects](SlideshowAllPresentations_5630_Spring23.pdf) 16 | - [2021 class projects](SlideshowAllPresentations_5630_Spring21.pdf) 17 | - [2018 class projects](SlideshowAllPresentations_4720Fall18.pdf) 18 | - [2017 class projects](https://amath.colorado.edu/faculty/becker/SlideshowAllPresentations_4720Spr17.pdf). 19 | 20 | We will follow instructors' notes. The supplementary texts we used most often: 21 | - [Convex Optimization by Boyd and Vandenberghe](http://www.stanford.edu/~boyd/cvxbook/), Cambridge University Press 2004 22 | - [First-Order Methods in Optimization by Amir Beck](https://epubs.siam.org/doi/book/10.1137/1.9781611974997), SIAM 2017, see also [SIAM website](http://bookstore.siam.org/mo25/) 23 | - [Convex Analysis and Monotone Operator Theory in Hilbert Spaces by Bauschke and Combettes](https://link.springer.com/book/10.1007%2F978-3-319-48311-5), 2nd edition, Springer 2017 24 | 25 | More details on textbooks in the [syllabus](syllabus.md). 
26 | -------------------------------------------------------------------------------- /SlideshowAllPresentations_4720Fall18.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/SlideshowAllPresentations_4720Fall18.pdf -------------------------------------------------------------------------------- /SlideshowAllPresentations_5630_Spring21.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/SlideshowAllPresentations_5630_Spring21.pdf -------------------------------------------------------------------------------- /TypedNotes/APPM5720Notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/TypedNotes/APPM5720Notes.pdf -------------------------------------------------------------------------------- /TypedNotes/README.md: -------------------------------------------------------------------------------- 1 | # APPM 5630 at CU Boulder: typed up notes 2 | 3 | 4 | These are latex notes from the Fall 2018 class typed up by Mitchell Krock. 5 | 6 | For future semesters, these notes might be modified/added/corrected (you can make a pull request if you have a change, or if you want to be very active, we can add you on the repo) 7 | 8 | Note: the compiled pdf might not always be up-to-date 9 | 10 | In addition, a verbose version of lecture notes for Spring 2021 is available at lecture_notes.pdf by Jaden Wang. It is perhaps most useful for lectures not posted on github and ease of locating information in a single typed PDF, although occasional extra insights might be helpful. Source files are in the lecture_notes_tex folder. 
11 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/TypedNotes/lecture_notes.pdf -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_01.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \chapter{Theoretical Foundation} 6 | \newpage 7 | \section{Introduction} 8 | An optimization problem looks like 9 | \[ 10 | \min_{x \in C} f(x) 11 | \] 12 | where $ f(x)$ is the \allbold{objective function} and $ C \subseteq \rr^n$ is the \allbold{constraint set}. $ C$ might look like 13 | \[ 14 | C=\{x: g_i(x) \leq 0 \ \forall \ i=1,\ldots,m\} 15 | .\] 16 | 17 | \begin{remark} 18 | We can always turn a maximization problem into a minimization problem as the following: 19 | \[ 20 | \min_x f(x) = -\max_x -f(x) 21 | .\] 22 | Therefore, WLOG, we will stick with minimization. 23 | 24 | \end{remark} 25 | 26 | \begin{eg} 27 | An assistant professor earns \$100 per day, and they enjoy both ice cream and cake. The optimization problem aims to maximize the utility ( \emph{e.g.} happiness) of ice cream $ f_1(x_1)$ and of cake $ f_2(x_2)$. The constraints we have is that $ x_1\geq 0, x_2 \geq 0$, and $ x_1+x_2 \leq 100$. 28 | 29 | To maximize both utility, it might be natural to define 30 | \[ 31 | F(\vec{x}) = \begin{pmatrix} f_1(x_1)\\f_2(x_2) \end{pmatrix}, \vec{ x} = \begin{pmatrix} x_1\\x_2 \end{pmatrix} 32 | \] 33 | and maximize $ F$. However, this isn't a well-defined problem, because \emph{there is no total order on $ \rr^n$}! 
That is, we don't have a good way to compare whether a vector is bigger than another vector, except in the cases when the same direction of inequality can be achieved for all components of two vectors and a partial order can be established. For this kind of \allbold{multi-objective} optimization problem, we can look for Pareto-optimal points in these special cases. We can also try to convert the output into a scalar as follows: 34 | \[ 35 | \max_x f_1(x_1) + \lambda \cdot f_2(x_2) 36 | \] 37 | for some $ \lambda>0$ that reflects our preference for cake vs ice cream. But this can be subjective. 38 | 39 | \end{eg} 40 | 41 | 42 | Thus, for the remainder of this class, we are only going to assume $ f: \rr^n \to \rr$. 43 | \\ 44 | 45 | Moreover, for $ f: \rr \to \rr$, it's very easy to solve by using root finding algorithms or grid search. So since interesting problems occur with vector inputs, we will simply use $ x$ to represent vectors. 46 | 47 | \begin{notation} 48 | $ \min$ asks for the minimum value, whereas $ \arg\min$ asks for the minimizer that yields the minimum value. 49 | \end{notation} 50 | \newpage 51 | \subsection{Lipschitz continuity} 52 | \begin{eg} 53 | Let's consider a variant of the Dirichlet function, $ f: \rr \to \rr$ 54 | \begin{equation*} 55 | f(x)= 56 | \begin{cases} 57 | x & \text{ if } x \in \qq\\ 58 | 1 & \text{ if } x \in \rr \setminus \qq 59 | \end{cases} 60 | \end{equation*} 61 | Then the solution to the problem 62 | \[ 63 | \min_{x \in [0,1]} f(x) = 0 64 | \] 65 | is $ x=0$ by observation. However, the function is not smooth and a small perturbation can yield wildly different values. Thus, it is not tractable to solve this numerically. 
66 | \end{eg} 67 | 68 | This requires us to add a smoothness assumption: 69 | \begin{defn} 70 | $ f: \rr^{n} \to \rr$ is \allbold{$L$-Lipschitz continuous} with respect to a norm $ \norm{ \cdot } $ if for all $ x, y \in \rr^{n}$, 71 | \[ 72 | |f(x) - f(y)|\leq L \cdot \norm{x-y} 73 | .\] 74 | \end{defn} 75 | 76 | \begin{note} 77 | Lipschitz continuity implies continuity and uniform continuity. It is a stronger statement because it tells us \emph{how} the function is (uniformly) continuous. However, it doesn't require differentiability. 78 | \end{note} 79 | 80 | \begin{defn} 81 | For $ 1\leq p < \infty$, 82 | \[ 83 | \norm{x}_p = \left( \sum_{ i= 1}^{ n} |x_i|^p \right)^{\frac{1}{p}} 84 | .\] 85 | For $ p = \infty$, 86 | \[ 87 | \norm{x}_{\infty} = \max_{1\leq i\leq n} |x_i| 88 | .\] 89 | \end{defn} 90 | 91 | \begin{remark} 92 | $ \norm{x}_1 $ and $ \norm{x}_2^2 $ have separable terms as they are sums of their components. $ \norm{x}_2^2 $ is also differentiable which makes it the nicest norm to optimize. 93 | \end{remark} 94 | 95 | \begin{eg} 96 | Let $ f: \rr^{n} \to \rr$ be $ L$-Lipschitz continuous w.r.t. $ \norm{ \cdot }_{\infty} $. Let $ C = [0,1]^{n}$, \emph{i.e.} in $ \rr^{2}$, $ C$ is a square. To solve the problem 97 | \[ 98 | \min_{x \in C} f(x) 99 | ,\] 100 | since we have few assumption, there is no better method (in the worst case sense) than the \allbold{uniform grid method}. The idea is that we pick $ p+1$ points in each dimension, \emph{i.e.} $ \{0,\frac{1}{p},\frac{2}{p},\ldots,1\} $, so we would have $ (p+1)^{n}$ points in total. 101 | 102 | Let $ x^* $ be a global optimal point, then there exists a grid point $ \widetilde{ x}$ s.t. 
\[ 103 | \norm{ x^* -\widetilde{ x}}_{\infty} \leq \frac{1}{2} \cdot \frac{1}{p} 104 | .\] 105 | Thus by Lipschitz continuity, 106 | \begin{align*} 107 | |f(x^* ) - f(\widetilde{ x})| &\leq L \cdot \norm{ x^* -\widetilde{ x}}_{\infty} \\ 108 | &\leq \frac{1}{2} \frac{L}{p} 109 | \end{align*} 110 | So the best grid point, found by taking the discrete minimum over all $ (p+1)^{n}$ grid points, has objective value at most $ f(\widetilde{ x}) \leq f(x^* ) + \frac{L}{2p}$.\\ 111 | 112 | In (non-discrete) optimization, we usually can't exactly find the minimizer, but rather find something very close. 113 | 114 | \begin{defn} 115 | $ x$ is an \allbold{$ \epsilon$-optimal solution} to $ \min_{x \in C} f(x)$ if $ x \in C$ and 116 | \[ 117 | f(x)-f^* \leq \epsilon 118 | \] 119 | where $ f^* = \min_{x \in C} f(x)$. 120 | \end{defn} 121 | 122 | Our uniform grid method gives us an $ \epsilon$-optimal solution with $ \epsilon = \frac{L}{2p}$, and requires $ (p+1)^{n}$ function evaluations. Writing $ p$ in terms of $ \epsilon$, we have $ p=\frac{L}{2 \epsilon}$ so equivalently it requires $ \left( \frac{L}{2 \epsilon} + 1 \right)^{n} $ function evaluations, which scales like $ \epsilon^{-n}$. 123 | 124 | For $ \epsilon = 10^{-6}$, $ n=100$, it requires roughly $ 10^{600}$ function evaluations. This is really bad! 125 | 126 | Take-aways from this example: 127 | \begin{itemize} 128 | \item curse-of-dimensionality: there can be trillions of variables in a Google Neural Network. It would be intractable using the grid method. 129 | \item we need more assumptions to allow us to use more powerful methods. 
130 | \end{itemize} 131 | \end{eg} 132 | 133 | \subsection{Categorization} 134 | \begin{figure}[H] 135 | \hspace*{-4cm} 136 | \includegraphics[width=1.6\textwidth]{./figures/categorization.jpg} 137 | \end{figure} 138 | \newpage 139 | \end{document} 140 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_02.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | 6 | \subsection{Minimizers} 7 | 8 | We are given a generic problem $ \min_{x \in C} f(x), C \subseteq \rr^{n}$. Then a \allbold{feasible point} $ x$ means $ x \in C$. A \allbold{solution} or \allbold{minimizer} or \allbold{global minimizer} $ x^* $ means 9 | \begin{enumerate}[label=\arabic*)] 10 | \item $ x^* \in C$ 11 | \item $ \ \forall \ y \in C, f(x^* )\leq f(y)$ 12 | \end{enumerate} 13 | It might not be unique, which is why we write $ x^* \in \arg \min_{x \in C} f(x)$ rather than using equality. 14 | \begin{eg} 15 | \[ 16 | \min_{x \in \rr} f(x) \text{ where } f(x)=0 \ \forall \ x 17 | .\] 18 | 19 | \end{eg} 20 | Sometimes the solution may not exist (even for convex problems). 21 | \begin{eg} 22 | \[ 23 | \min_{ x \in (0,1)} x^2 24 | .\] 25 | \end{eg} 26 | 27 | $ x^* $ is a \allbold{local minimizer} if $ x^* $ is feasible and there exists an $ \epsilon>0$ s.t. $ f(x^* )\leq f(y)$ $ \ \forall \ y \in C \cap B_{ \epsilon} ( x^* )$, where $ B_{ \epsilon} ( x^* ) \coloneqq \{y: \norm{ y-x^* } < \epsilon\} $. A \allbold{strict local minimizer} is a local minimizer for which the inequality is strict for all such $ y \neq x^* $. $ x^* $ is an \allbold{isolated local minimum} if it is a local minimum and no other local minima are nearby. Notice that isolated implies strict but the converse is false. 
28 | 29 | \begin{eg}[strict but not isolated] 30 | \begin{equation*} 31 | f(x)= 32 | \begin{cases} 33 | x^{4} \cos \left( \frac{1}{x} \right) +2x^{4} & x\neq 0\\ 34 | 0 & x=0\\ 35 | \end{cases} 36 | \end{equation*} 37 | $ x^* =0 $ is strict but not isolated due to the rapid oscillation near $ x=0$. 38 | \end{eg} 39 | 40 | \begin{notation} 41 | $ f \in \mathcal{ C}^3$ means $f,f',f'',f''' $ all exist and are continuous. $ f \in \mathcal{ C}^3( \rr^{n})$ means $ f, \nabla f, \nabla^2 f, \nabla^3 f$ all exist and are continuous. 42 | \end{notation} 43 | 44 | \subsubsection{Connections with Calculus 1} 45 | Recall that in Cal 1, we first find the stationary/critical points in the domain. Then we add the boundary points and minimize over the small (finite) set of candidates. 46 | 47 | In high-dimension optimization, we cannot check critical points and the boundary separately because the set of points in the boundary becomes infinite. Moreover, there can be infinitely many critical points too. 48 | 49 | Necessary condition: if $ f$ is differentiable, $ x^* $ is a local or global minimizer, and $ C = \rr^{n}$, then $ x^* $ is a \allbold{critical point}. But the converse is false. 50 | 51 | \begin{notation} 52 | The boundary of $ C$ is denoted as $ \partial C \coloneqq \overline{C} \setminus \inte C$. 53 | \end{notation} 54 | 55 | If $ x^* $ is a critical point but is neither a local minimizer nor a local maximizer, then it's a \allbold{saddle point}. 56 | 57 | \begin{thm}[Weierstrass] 58 | If $ f$ is continuous and $ C$ is compact, then $ f$ achieves its infimum over $ C$. That is, 59 | \[ 60 | \inf_{x \in C} f(x) = \min_{x \in C} f(x) 61 | .\] 62 | \end{thm} 63 | \begin{note} 64 | This is pretty much the same as the Extreme Value Theorem. 65 | \end{note} 66 | 67 | \begin{proof} 68 | First let's prove a claim. 69 | \begin{claim} 70 | Every compact set $ K$ is closed and bounded. 71 | \end{claim} 72 | 73 | Closed: suppose not; then the compact set $ K$ doesn't contain all its limit points. 
That is, there exists a limit point $ x \not\in K$ s.t. a sequence $ (x_n) \subseteq K$ converges to $ x$. But that also means that all subsequences of $ (x_n)$ converge to $ x \not\in K$ as well, contradicting the definition of compactness that for every sequence in $ K$ there exists a subsequence that converges inside $ K$. 74 | 75 | Bounded: suppose not; then for all $ M > 0$, there exists an $ x \in K$ s.t. $ \norm{ x} > M $. This allows us to find a sequence $ (x_n) \subseteq K$ s.t. $ \norm{ x_n} > n$. This way every subsequence is also unbounded and cannot converge, contradicting the definition of sequential compactness. 76 | 77 | Now let's begin the proof proper. Since $ C$ is compact and $ f$ is continuous, the image of $ C$ under $ f$, $ f(C)$, is also compact (this follows from the sequential definition of continuity). By the claim $f(C) $ is bounded and closed, meaning that it has an infimum (completeness axiom) and contains the infimum (closed). Thus, $ f$ achieves its infimum over $ C$. 78 | \end{proof} 79 | \begin{remark} 80 | It would be nice if our constraints $ C$ are compact. But at the very least, we want our constraint sets to be closed. For example, $ \norm{ Ax -b} \leq \epsilon $ instead of $ \norm{ Ax -b} < \epsilon $. 81 | \end{remark} 82 | 83 | Several things to note about the feasible set $ C$: 84 | 85 | If $ C = \O$, the problem is infeasible. This is not always easy to spot. 86 | 87 | In this class, $ C$ will usually be convex and not integral (\emph{i.e.} not a subset of $ \zz^{n}$). 88 | 89 | An integrality constraint is problematic because the optimal integer solution might not be at all close to the optimal real solution, so we cannot obtain it by solving for the real solution first and then rounding it. 
90 | 91 | \end{document} 92 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_05.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | 6 | \subsection{separating and supporting hyperplanes} 7 | ~\begin{thm}[separating hyperplane] 8 | Let $ C,D$ be convex, non-intersecting sets in $ \rr^{n}$, then there exists $ a \in \rr^{n} \setminus \{0\} $ and $ \mu \in \rr$ s.t. 9 | \begin{align*} 10 | a^{T}x \leq \mu \ \forall \ x \in C\\ 11 | a^{T} x \geq \mu \ \forall \ x \in D 12 | \end{align*} 13 | \end{thm} 14 | \begin{note} 15 | This reads as there exists a hyperplane that separates the two convex sets. It is clearly not true if the sets aren't convex. $ a$ is the normal to the hyperplane. 16 | \end{note} 17 | 18 | \begin{defn}[Chebyshev set] 19 | A set $ S$ is a \allbold{Chebyshev set} if for all $ x_0$, there exists a unique $ x \in S$ s.t. 20 | \[ 21 | x = \argmin_{y \in S} \norm{ y-x_0} 22 | .\] 23 | \end{defn} 24 | \begin{note} 25 | This reads as there exists a unique best approximation point in the set $ S$ for any $ x_0$. 26 | \end{note} 27 | \begin{eg} 28 | Open unit ball isn't Chebyshev because it doesn't reach infimum. 29 | \end{eg} 30 | \begin{eg} 31 | A nonconvex set isn't Chebyshev because there exists an $ x_0$ where we have at least two best approximation points. 32 | \end{eg} 33 | \begin{thm} 34 | Any nonempty, closed, convex set in a Hilbert space is Chebyshev. 35 | \end{thm} 36 | 37 | \begin{thm}[supporting hyperplanes] 38 | ~\begin{enumerate}[label=(\roman*)] 39 | \item If $ C$ is convex, closed and $ D = \{x_0\}, x_0 \not\in C $, then there exists $ a \in \rr^{n}$ s.t. $ a^{T}x< a^{T}x_0 \ \forall \ x \in C$. 40 | \item Same but $ C$ needs not be closed, $ x_0 \not\in \overline{C}$. 
41 | \item as in (ii) but allow $ x_0 \in \overline{C}\setminus C$. 42 | \end{enumerate} 43 | \end{thm} 44 | \begin{proof} 45 | (i): WLOG let $ x_0 = 0$ (since we can always translate $ C$). $ C$ is Chebyshev so let $ y$ be the unique closest point to $ 0$, and define $ a=-y$ (normal of the hyperplane). We wish to show that $ a^{T}x< a^{T} x_0 =0 \ \forall \ x \in C$. That is, $ y^{T} x>0 \ \forall \ x \in C$. 46 | 47 | Given $ x \in C$, $ y + \epsilon (x-y) \in C$ for any $ \epsilon \in (0,1]$ by convexity. Since $ y$ is the best approximation point, 48 | \begin{align*} 49 | \norm{ y}^2 &\leq \norm{ y + \epsilon(x-y)}^2 \\ 50 | &= \norm{ y}^2 + 2 \epsilon\langle y,x-y \rangle+ \epsilon^2 \norm{ x-y}^2 \\ 51 | 0&\leq 2 \langle y,x \rangle - 2 \langle y,y \rangle + \epsilon \norm{ x-y}^2 \\ 52 | \langle y,x \rangle &\geq \norm{ y}^2 -\frac{ \epsilon}{2} \norm{ x-y}^2 53 | \end{align*} 54 | Taking $ \epsilon \to 0^{+}$ gives $ \langle y,x \rangle \geq \norm{ y}^2 $; since $ y \neq 0$ (because $ 0 = x_0 \not\in C$) we have $ \norm{ y}>0 $, so we obtain $ y^{T}x>0$ as required. 55 | \end{proof} 56 | 57 | \begin{remark} 58 | This is related to \allbold{Theorems of Alternatives}. Generally, they are stated as follows: 59 | 60 | Either $ A$ is true or $ B$ is true, but not both. 61 | \end{remark} 62 | \begin{eg}[Fredholm alternative, finite-dim] 63 | 64 | Either $ \{x: Ax=b\} $ is non-empty, or $ \{\lambda: A^{T} \lambda =0, \lambda^{T}b \neq 0\} $ is non-empty, but not both. 65 | 66 | Why do we care? To prove that there is a solution to $ Ax=b$, we can simply find a solution $ x$. This is a "certificate". But if professor asks you to prove there isn't a solution to $ Ax=b$, we can try to show that $ A$ is singular, but if $ b=0$ even singular $ A$ works. Another way is to find a "certificate" $ \lambda$. This is the first task of duality. 67 | \end{eg} 68 | 69 | 70 | \begin{eg}[Farkas Lemma] 71 | Either $ \{x: Ax=b,x\geq 0\} $ is non-empty, or $ \{\lambda: A^{T} \lambda\geq 0, \lambda^{T}b<0\} $ is non-empty, but not both. 
72 | \end{eg} 73 | \begin{thm}[Theorem of Alternatives for strict linear inequalities] 74 | The following statements are equivalent: 75 | \begin{enumerate}[label=(\roman*)] 76 | \item The set $ \{x: Ax0$, we can take all components of $ y $ to $ 0^{+} $, so $ \lambda^{T} y \to 0^{+} $. Then $ \lambda^{T}(b-Ax)\leq \mu \leq 0$ implies that $ \lambda^{T} b \leq 0$. 101 | 102 | Taken together, we have $ \lambda \geq 0, \lambda \neq 0$, $ A^{T} \lambda =0,$ and $ \lambda^{T}b \leq 0$. 103 | \end{proof} 104 | \newpage 105 | \end{document} 106 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_07.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsection{First-order conditions} 6 | 7 | ~\begin{thm} 8 | If $ f: \rr^{n} \to \rr$ is differentiable on $ \dom(f)$ and if $ \dom(f)$ is open and convex, then $ f$ is convex iff for all $ x,y \in \dom(f)$, 9 | \[ 10 | f(y) \geq f(x) + \langle \nabla f(x),y-x \rangle 11 | .\] 12 | \end{thm} 13 | \begin{note} 14 | This is the 1st order Taylor approximation (tangent line). The line is supporting the epigraph of $ f$. 15 | \end{note} 16 | ~\begin{figure}[H] 17 | \centering 18 | \includegraphics[width=\textwidth]{./figures/cvx_tan.png} 19 | \end{figure} 20 | \begin{thm} 21 | Under the same assumption, $ f$ is convex iff $\nabla f$ is monotone. That is, for all $ x,y \in \dom(f)$, 22 | \[ 23 | \langle x-y, \nabla f(x) - \nabla f(y) \rangle \geq 0 24 | .\] 25 | \end{thm} 26 | \begin{intuition} 27 | Recall in 1D, $ f$ is convex if slope is non-decreasing. That is, if $ x-y\geq 0$, then $ f'(x) - f'(y) \geq 0$ and if $ x-y \leq 0$ then $ f'(x)-f'(y)\leq 0$. A concise way to express that is $ (x-y)(f'(x) - f'(y)) \geq 0$. Here we generalize this to higher dimensions. 
28 | \end{intuition} 29 | 30 | \begin{thm}[2nd-order condition] 31 | $ f: \rr^{n} \to \rr$. If the Hessian $ \nabla^2 f(x)$ exists for all $ x \in \dom(f)$, then 32 | \begin{enumerate}[label=\alph*)] 33 | \item $ f$ is convex iff $ \nabla ^2 f(x) \succeq 0 \ \forall \ x \in \dom(f)$. 34 | \item $ f$ is $ \mu$-strongly convex ( w.r.t. $ \norm{ \cdot }_2 $ ) iff $ \nabla ^2 f(x) \succeq \mu I$. 35 | \end{enumerate} 36 | If $ \nabla ^2 f(x) \succ 0$, then $ f$ is \allbold{strictly convex}. 37 | \end{thm} 38 | \begin{remark} 39 | $ f$ can be convex but $ \nabla f, \nabla ^2 f$ need not exist! 40 | \end{remark} 41 | What if $ f$ isn't differentiable? 42 | 43 | \begin{defn}[subdifferential] 44 | Let $ f: \rr^{n} \to (-\infty, \infty]$ be proper, then we define the \allbold{subdifferential} of $ f$ at $ x$ to be 45 | \[ 46 | \partial f(x) = \{d \in \rr^{n}: \ \forall \ y \in \rr^{n}, f(y) \geq f(x) + \langle d,y-x \rangle\} 47 | .\] 48 | \end{defn} 49 | \begin{note} 50 | $ d$ here is called a \allbold{subgradient}. 51 | \end{note} 52 | \begin{thm} 53 | If $ f$ is proper and convex then 54 | \[ 55 | x \in \ri(\dom(f)) \implies \partial f(x) \neq \O 56 | .\] 57 | \end{thm} 58 | \begin{note} 59 | The proof is related to separating/supporting hyperplanes. 60 | \end{note} 61 | \begin{prop} 62 | $ \partial f(x) $ is a singleton iff $ f$ is differentiable at $ x$. 63 | \end{prop} 64 | 65 | \begin{eg} 66 | $ f(x) = |x|$. Then if $ x\neq 0$, $ f'(x) = \sgn(x)$ and $ \partial f(x) = \{f'(x)\} $. If $ x=0$, $ f'(0)$ DNE. But $ \partial f(0) = [-1,1] $. 67 | \end{eg} 68 | 69 | \begin{thm}[Fermat's Rule] 70 | If $ f$ is a proper function, then 71 | \[ 72 | \argmin_{x} f(x) = \{x: 0 \in \partial f(x)\} 73 | .\] 74 | \end{thm} 75 | \begin{proof} 76 | This just means that we can plug $ 0$ into the definition of subdifferential and get 77 | \[ 78 | f(y) \geq f(x) + \langle 0, y-x \rangle = f(x) \ \forall \ y 79 | .\] 80 | This clearly shows that $ x$ is a global minimizer. 
81 | \end{proof} 82 | 83 | \begin{note} 84 | This generalizes the calculus idea of critical points for smooth functions. 85 | \end{note} 86 | 87 | \begin{remark} 88 | Subdifferentials are a global notion (for all $ y$) whereas gradients are a local notion. How do we reconcile that the subdifferential can be the gradient? The answer is that the global property of convexity links the two. 89 | \end{remark} 90 | \begin{remark} 91 | So all we need to do is to invert $ \partial f$. That is, 92 | \[ 93 | \argmin_{x} f(x) = (\partial f)^{-1}(0) 94 | .\] 95 | In fact, this is usually not practical or even possible especially for interesting problems. It may be possible for subproblems. 96 | \end{remark} 97 | \begin{defn}[normal cone] 98 | The \allbold{normal cone} to a set $ C$ at point $ x$ is 99 | \begin{equation*} 100 | N_C(x)= 101 | \begin{cases} 102 | \{d: \langle d,y-x \rangle \leq 0 \ \forall \ y \in C\} & \text{ if } x \in C\\ 103 | \O & \text{ if } x \not\in C 104 | \end{cases} 105 | \end{equation*} 106 | \end{defn} 107 | \begin{eg} 108 | Let $ C \neq \O$ be convex, so $ I_C$ is a proper convex function. Then $ \partial I_C = N_C$. 109 | \end{eg} 110 | \begin{eg} 111 | $ x \in \inte C \implies N_C(x) = \{0\} $. Why? WLOG, shift $ C$ so $ x=0$. Suppose $ \langle d,y \rangle\leq 0 \ \forall \ y \in C$. Then $ x \in \inte C \implies$ we can choose $y = \epsilon d \in C$ for sufficiently small $ \epsilon >0$. Then $ \epsilon \norm{ d}^2 \leq 0 \implies d=0 $. 112 | \end{eg} 113 | \begin{eg} 114 | $ x \in \partial C$ (the boundary). Again shifting $ C$ so that $ x=0$, we want $ d$ s.t. $ \langle d,y \rangle\leq 0 \ \forall \ y \in C$. Geometrically this means we want the angle between $ d,y$ to be a right angle or obtuse. If the boundary is smooth, since $ d$ needs to be at least perpendicular to any $ y$ immediately to the left and right of $ x$, it must be the normal ray of the tangent plane. 
115 | \end{eg} 116 | 117 | \begin{eg} 118 | ~\begin{figure}[H] 119 | \centering 120 | \includegraphics[width=0.8\textwidth]{./figures/normal_cone.png} 121 | \caption{The normal cone at non-smooth boundary looks indeed like a cone.} 122 | \end{figure} 123 | \end{eg} 124 | \begin{remark} 125 | An equivalent definition of normal cone is the set of all vectors that define a supporting hyperplane to $ C$, passing through $ x$. 126 | \end{remark} 127 | 128 | \begin{eg} 129 | If $ C$ is a vector space, since $ C$ is closed under inverses, if we use $ -y$ in addition to $ y$ in the definition we will get an equality which implies orthogonality. Hence 130 | \begin{equation*} 131 | N_C(x)= 132 | \begin{cases} 133 | C^{\perp} & x \in C\\ 134 | \O & x \not\in C 135 | \end{cases} 136 | \end{equation*} 137 | \end{eg} 138 | 139 | \begin{prop}[6.47 BC17] 140 | If $ C \neq \O$ is closed and convex, then $ x=P_C(y)$ iff $ y-x \in N_C(x)$, where $ P_C(y)$ denotes the orthogonal projection of $ y$ onto $ C$. 141 | \end{prop} 142 | \end{document} 143 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_08.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \newpage 6 | \subsection{Calculus} 7 | 8 | \begin{remark} 9 | Calculus is a set of rules we can use to calculate. 10 | \end{remark} 11 | 12 | One such rule is that derivatives/gradients are linear. 13 | 14 | Is it true that $ \partial (f+g) = \partial f+\partial g$, where $ +$ is the Minkowski sum? No! Although it's often true. 15 | 16 | \begin{eg} 17 | $ f=I_C, g= I_D$ for $ C, D \subseteq \rr^{2}$. 18 | ~\begin{figure}[H] 19 | \centering 20 | \includegraphics[width=\textwidth]{./figures/counter_linear.png} 21 | \end{figure} 22 | 23 | Then $ \partial (f+g)(x) = \partial f(x) + \partial g(x)$ for all $ x$ except at $ x=0$. 
At $ x=0$, recall that 24 | \begin{align*} 25 | \partial f(0) = N_C(0) &= \rr_{+} \times \{0\} \\ 26 | \partial g(0) = N_D(0) &= \rr_{-} \times \{0\} 27 | \end{align*} 28 | So $ \partial f(0) + \partial g(0) = \rr \times \{0\} $ 29 | But 30 | \begin{align*} 31 | \partial (f+g) (0) &= N_{C \cap D}(0) \qquad \qquad \qquad \text{ by def of indicator} \\ 32 | &= N_{\{0\}}(0) \\ 33 | &= \{d: \langle d,y-0 \rangle \leq 0 \ \forall \ y \in \{0\} \} \quad \text{ vacuous constraint} \\ 34 | &= \rr^2 35 | \end{align*} 36 | We can see this counterexample is somewhat contrived, so linearity is often true. 37 | \end{eg} 38 | 39 | \begin{remark} 40 | Sufficient conditions to guarantee when this linearity is true are called \allbold{constraint qualifications (CQ)}. 41 | \end{remark} 42 | \begin{coro}[16.48 (iv) BC17] 43 | If $ f,g \in \Gamma_0(\mathcal{H})$, and $\mathcal{H} = \rr^{n} $, then $ \partial (f+g) = \partial f + \partial g$ provided one of the following holds: 44 | \begin{enumerate}[label=(\roman*)] 45 | \item $ \ri(\dom(f)) \cap \ri(\dom(g)) \neq \O $. 46 | \item $ \dom(f) \cap \inte (\dom(g)) \neq \O$. 47 | \item either $ f$ or $ g$ has full domain (all of $ \rr^{n}$). 48 | \end{enumerate} 49 | \end{coro} 50 | \begin{note} 51 | (iii) is most commonly used. 52 | \end{note} 53 | Since the previous example didn't satisfy a CQ, the linearity didn't hold. That is, $ \dom f = C, \dom g =D, \inte C \cap \inte D = \O$. 54 | \begin{remark} 55 | There are other cones including \allbold{tangent, polar, recession/asymptotic, and barrier cones}. 56 | \end{remark} 57 | \newpage 58 | \subsection{Lipschitz gradient} 59 | An easier way to show $ F$ is Lipschitz-continuous: if $ F'$ exists, then $ |\norm{ F'}| \leq L \implies F$ is Lipschitz continuous (by the definition of derivative/Jacobian and some manipulation). 60 | \begin{notation} 61 | $ |\norm{ \cdot } |$ denotes the appropriate operator norm, usually spectral norm if the original norm is Euclidean. 
62 | \end{notation} 63 | 64 | \begin{remark} 65 | In optimization, "Jacobian" is often confusing, since it's unclear what $ F$ is. Of the objective function or of the gradient? Instead we prefer to say the Jacobian of the objective is the gradient (transposed). The Jacobian of the gradient is the Hessian. 66 | \end{remark} 67 | \begin{remark} 68 | The Hessian can be thought of as a bilinear operator $ \langle d, \nabla ^2 f(x) d \rangle$ 69 | \end{remark} 70 | 71 | \begin{thm} 72 | Suppose convex $ f \in \mathcal{ C}^2(U)$ for some open set $ U \subseteq \rr^{n}$, then 73 | \[ 74 | \nabla f \text{ is $L$-Lipschitz continuous on } U \iff \ \forall \ x \in U, \nabla ^2 f(x) \preceq L I 75 | .\] 76 | That is, all eigenvalues of $ \nabla ^2f(x) \leq L \implies |\norm{ \nabla ^2 f(x)} | \leq L$. 77 | \end{thm} 78 | \begin{thm} 79 | Same setup, then 80 | 81 | $ f$ is $ \mu$-strongly convex on $ U \iff \ \forall \ x \in U, \mu I \preceq \nabla ^2 f(x)$. 82 | \end{thm} 83 | \begin{note} 84 | We assume $ \mu>0$ since $ \mu=0$ would give us plain old convexity. 85 | \end{note} 86 | \begin{remark} 87 | One of our common assumptions will be that $ \nabla f$ is $L$-Lipschitz continuous ($ \nabla ^2 f \preceq LI$); a bit less commonly, we also assume strong convexity ($ \mu I \preceq \nabla ^2 f$). 88 | \end{remark} 89 | 90 | \end{document} 91 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_09.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \begin{eg}[best function ever] 6 | Consider $ f(x) = \frac{1}{2} \norm{ x}_2^2, \nabla f(x) = x, \nabla ^2f(x) = I $. So $ L=1, \mu=1$. This is the only function with this property. 7 | 8 | This is the nicest function ever for optimization! 
9 | \end{eg} 10 | \begin{defn}[condition number] 11 | The \allbold{condition number} of $ f$ is $ k_f = \frac{L}{\mu}$. $ k_f \approx 1$ is good. Larger is bad. 12 | \end{defn} 13 | Why do we care about these assumptions? 14 | 15 | Recall from calculus, Taylor's theorem states that 16 | \[ 17 | f(y) = f(x) + f'(x) (y-x) + \frac{1}{2} f''(\xi) (y-x)^2 18 | ,\] 19 | were $ \xi \in [x,y]$. If $ f''(\xi) \leq L \ \forall \ \xi$, then 20 | \[ 21 | f(y) \leq f(x) + f'(x)(y-x) + \frac{L}{2} (y-x)^2 22 | .\] 23 | \begin{thm} 24 | If $ \nabla f$ is $L$-Lipschitz continuous and $ f$ is $ \mu$-strongly convex, then for all $ x,y \in \dom(f)$, 25 | \[ 26 | \frac{\mu}{2} \norm{ y-x}^2 \leq f(y) - (f(x) + \langle \nabla f(x), y-x \rangle) \leq \frac{L}{2} \norm{ y-x}^2 27 | .\] 28 | \end{thm} 29 | ~\begin{figure}[H] 30 | \centering 31 | \includegraphics[width=\textwidth]{./figures/quad_bounds.png} 32 | \caption{If $ f$ is complicated but we can "sandwich" it between a quadratic upper bound and a quadratic lower bound ( $ \mu>0$ ) or a linear lower bound ($ \mu=0$), then we can work with the quadratics to understand the behavior of $ f$ since quadratics are much easier to deal with.} 33 | \end{figure} 34 | 35 | See more properties from this section in the Github handout StrongConvexityLipschitz.pdf. 36 | \newpage 37 | 38 | \subsection{Examples [BV04 Ch.3.1.5]} 39 | Examples of convex functions $ f: \rr \to \rr$ : 40 | \begin{itemize} 41 | \item $ e^{ax}, a \in \rr$. 42 | \item $ x^{a}$ on $ x \in \rr_{++}$ if $ a\leq 0$ or $ a\geq 1$. (It's concave on $ 0\leq a \leq 1$). 43 | \item $ |x|^{a}$ on all of $ \rr$, if $ a\geq 1$. 44 | \item $ - \log_b(x)$ on $ \rr_{++}$ if $ b>1$. 45 | \item On $ \rr^{+}$, 46 | \begin{equation*} 47 | \begin{cases} 48 | x \cdot \log(x) & x>0\\ 49 | 0 & x=0 50 | \end{cases} 51 | \end{equation*} 52 | since $ f''(x) = \frac{1}{x} >0$. 
53 | \end{itemize} 54 | 55 | Examples of convex functions $ f: \rr^{n} \to \rr$: 56 | \begin{itemize} 57 | \item any norm/seminorm (follows directly from triangle inequality). 58 | \item $ f(x) = \max\{x_1,\ldots,x_n\} $. 59 | \item $ f(x,y) = x^2 /y$, $ \dom(f) = \rr \times \rr_{++}$. "Quadratic over linear". 60 | 61 | $ f(x,y) = \norm{ x}_2^2 /y $, $ \dom(f) = \rr^{n-1} \times \rr_{++}$. 62 | 63 | $ f(x,Y) = x^{T} Y^{-1} x$, $ \dom(f) = \rr^{n} \times S_{++}^{n}$. "Matrix fractional function". 64 | \begin{note} 65 | "Linear fractional function" 66 | \[ 67 | g(x) = \frac{Ax+b}{c^{T}x+ d }, \quad \dom(g) = \{x: c^{T}x+d >0\} 68 | \] 69 | is not convex but it is \allbold{quasi-convex}. A function $ f$ is quasi-convex if all of its sub-level sets $ \{x: f(x) \leq \alpha\} $ are convex. 70 | \end{note} 71 | ~\begin{figure}[H] 72 | \centering 73 | \hspace*{-3cm} 74 | \includegraphics[width=1.4\textwidth]{./figures/quasi_cvx.png} 75 | \end{figure} 76 | \item "log-sum exp" aka "soft-max" 77 | \[ 78 | f(x) = \frac{1}{\alpha} \log \left( e^{\alpha x_1} + \ldots + e^{\alpha x_n} \right) , \alpha > 0 79 | .\] 80 | This is differentiable, but one needs to be careful about numerical under/overflow when evaluating it. 81 | \item negative geometric mean $ f(x) = -\left( \prod_{ i= 1}^{ n} x_i \right)^{\frac{1}{n}} $ on $ \rr_{++}^{n}$ (the geometric mean itself is concave). 82 | \item $ - \log \det(X) =-\log\left( \prod \lambda_i \right) = - \sum \log(\lambda_i) $ on $ S_{++}^{n}$. 83 | \end{itemize} 84 | 85 | 86 | \begin{thm}[Jensen's Inequality] 87 | If $ f$ is convex, then \[ 88 | f(\ev [x]) \leq \ev [f(x)] 89 | .\] 90 | \end{thm} 91 | \begin{remark} 92 | Let $ X$ be a random variable that outputs points in $ \dom(f)$ with probability in $ [0,1]$, then the inequality follows from definition of convex function. 93 | \end{remark} 94 | \begin{eg} 95 | In machine learning, we often prove something like 96 | \[ 97 | \ev[\norm{ \text{ error} }^2 ] \leq \epsilon 98 | .\] 99 | Let $ f(x) = x^2$. 
So by Jensen's inequality: 100 | \begin{align*} 101 | \left( \ev[ \norm{ \text{ error} } ] \right) ^2 &\leq \ev\left[ \norm{ \text{ error} }^2 \right]\leq \epsilon \\ 102 | \ev[\norm{ \text{ error} } ] &\leq \sqrt{\ev \left[ \norm{ \text{ error} }^2 \right] } \leq \sqrt{ \epsilon} 103 | \end{align*} 104 | Recall that $ \norm{ \text{ error} }^2 $ is the nicest function ever. 105 | \end{eg} 106 | 107 | \begin{remark} 108 | H\"{o}lder's inequality/Cauchy-Schwarz can also be proved via Jensen. 109 | \end{remark} 110 | \begin{thm}[H\"{o}lder's inequality] 111 | If $ \frac{1}{p} + \frac{1}{q} = 1$, 112 | \[ 113 | |\langle x,y \rangle| \leq \norm{ x}_p \cdot \norm{ y}_q 114 | .\] 115 | \end{thm} 116 | 117 | \begin{remark} 118 | We can use Jensen's to prove Holder inequality. 119 | \end{remark} 120 | \end{document} 121 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_10.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsection{Preserving convexity} 6 | 7 | \subsubsection{Rule 0: non-negative (weighted) sums} 8 | If $ f_1, \ldots, f_m$ are convex, $ \alpha_i \geq 0$, then 9 | $ x \mapsto \sum \alpha_i f_i(x)$ is convex too. 10 | 11 | Subtraction (negative weights) doesn't work. 12 | 13 | It works for integrals too: 14 | 15 | If for all $ y$, $ f( \cdot ,y)$ is convex, and $ w(y) \geq 0$. Then 16 | \[ 17 | x \mapsto \int_{\Omega} f(x,y) w(y)\ dy 18 | \] 19 | is convex. 
20 | 21 | \subsubsection{Rule 1: perspective function} 22 | 23 | ~\begin{defn}[perspective] 24 | Let $ f: \rr^{n} \to \rr$, then its \allbold{perspective} is $ g: \rr^{n+1} \to \rr$, 25 | \[ 26 | g(x,t) = t \cdot f\left(\frac{x}{t}\right), \quad \dom(g) = \{(x,t): x / t \in \dom(f), t>0\} 27 | .\] 28 | \end{defn} 29 | \begin{prop} 30 | $ f: \rr^{n} \to \rr$ is convex $ \implies$ its perspective is convex. 31 | \end{prop} 32 | 33 | \begin{eg} 34 | $ f(x) = \norm{ x}^2 $ is convex. Its perspective is 35 | \[ 36 | t \cdot \norm{ \frac{x}{t}}^2 = t \cdot \frac{\norm{ x}^2 }{t^2 } = \frac{\norm{ x}^2 }{t } 37 | .\] 38 | This is the quadratic-over-linear example we saw earlier. This is the proof that it is convex. 39 | \end{eg} 40 | \begin{eg} 41 | $ f(x) = -\log(x)$ is convex. Its perspective is 42 | \[ 43 | -t \cdot \log\left( \frac{x}{t} \right) = t \cdot \log(t) - t \cdot \log(x), x,t>0 44 | .\] 45 | This is the \allbold{relative entropy} of $ t,x$. More generally, the \allbold{Kullback-Leibler divergence} is 46 | \[ 47 | D_{KL} (u,v) = \sum_{ i= 1}^{ n} u_i \log\left( \frac{u_i}{v_i } \right) -u_i+v_i 48 | .\] 49 | 50 | This is an example of \allbold{Bregman Divergence}, which we often use to measure "distance" as an alternative to metric. It's especially good for probability distributions. 51 | \end{eg} 52 | 53 | 54 | \subsubsection{Rule 2: special types of compositions} 55 | 56 | Composition of convex functions typically doesn't preserve convexity! 57 | 58 | \begin{thm} 59 | $ f$ is convex if 60 | \begin{enumerate}[label=(\roman*)] 61 | \item $ h$ is convex and 62 | \item if $ k=1$, $ h$ is nondecreasing and $ g$ is convex or $ h$ is nonincreasing and $ g$ is concave. 63 | \item if $ k>1$, we enforce (ii) to each argument of $ h$ and each $ g_i$. 64 | \end{enumerate} 65 | \end{thm} 66 | \begin{note} 67 | For nonincreasing/decreasing, we must take into account $ \pm \infty$, since in convex analysis we assign infinity to any point not in the domain. 
So although $ h(x) = x$ is nondecreasing on $ \rr$, if we restrict $ \dom(h) = [0,1]$ then it is not nondecreasing anymore. 68 | \end{note} 69 | \begin{thm}[tattoo-worthy] 70 | $ f= h\circ g$ is convex if $ h$ is convex and $ g$ is affine. 71 | \end{thm} 72 | 73 | \begin{eg} 74 | $ f(x) = \norm{ Ax-b}^2 $ is convex by this theorem. 75 | \end{eg} 76 | \subsubsection{Rule 3: min/max} 77 | 78 | \begin{prop} 79 | If $ f,g$ both convex, then $ x\mapsto \max \{f(x),g(x)\} $ is convex. 80 | \end{prop} 81 | \begin{proof} 82 | The epigraph of the maximum is the intersection of two convex epigraphs. Convex sets are closed under arbitrary intersections. 83 | \end{proof} 84 | ~\begin{figure}[H] 85 | \centering 86 | \includegraphics[width=0.8\textwidth]{./figures/max_cvx.png} 87 | \end{figure} 88 | \begin{note} 89 | This works for supremum too due to closure under arbitrary intersections. 90 | \end{note} 91 | \begin{eg} 92 | \[ 93 | f(x) = \sup_{y \in \mathcal{ A}} f(x;y) 94 | \] 95 | is convex as long as $ f( \cdot ; y)$ is convex $ \ \forall \ y \in \mathcal{ A}$, where $ \mathcal{ A}$ is an arbitrary set that can be uncountable. 96 | \end{eg} 97 | 98 | \begin{eg}[spectral norm] 99 | \[ f(A) = \norm{ A}_{\infty} = \sup_{\norm{ x}_2=1 } \norm{ Ax}_2 \] is convex since $ \ \forall \ x, A \mapsto \norm{ Ax}_2 $ is convex (composition of convex and affine). 
100 | \end{eg} 101 | 102 | \begin{figure}[H] 103 | \centering 104 | \includegraphics[width=0.8\textwidth]{./figures/min_cvx.png} 105 | \end{figure} 106 | It's easy to see that min doesn't necessarily preserve convexity because it unions epigraphs instead.We need to impose more restrictions to make it work: 107 | \begin{thm} 108 | If $ f: \rr^{n} \times \rr^{m}$ is (jointly) convex and if $ C \neq \O$ is a convex set, then 109 | \[ 110 | g(x) = \inf_{y \in C} f(x,y) \text{ is convex} 111 | .\] 112 | \end{thm} 113 | 114 | \begin{eg} 115 | $ \min \{f_1(x), f_2(x)\} $ is not usually convex since this is like taking 116 | \begin{equation*} 117 | f(x,y)= 118 | \begin{cases} 119 | f_1(x), &y=1\\ 120 | f_2(x), &y=2 121 | \end{cases} 122 | \end{equation*} 123 | and constraint $ C = \{1,2\} $ is not convex. 124 | \end{eg} 125 | \begin{eg} 126 | The distance to a convex set is a convex function. Let $ C \neq \O$ be convex, 127 | \[ 128 | f(x) = \inf_{y \in C} \norm{ x-y} 129 | .\] 130 | 131 | Prove $ (x,y) \mapsto \norm{ x-y} $ is convex. 132 | \begin{proof} 133 | We know $ z \mapsto \norm{ z} $ is convex. Consider the linear operator $ A(x,y) = x-y$. That is, 134 | \[ 135 | A \begin{pmatrix} x\\y \end{pmatrix} = \begin{pmatrix} I & -I \end{pmatrix} \begin{pmatrix} x\\y \end{pmatrix} = x-y 136 | .\] 137 | Then the composition of convex and affine is still convex. 138 | \end{proof} 139 | \end{eg} 140 | 141 | \end{document} 142 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_11.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsection{Gradient descent} 6 | 7 | Problem: we want to solve $ \min_{x} f(x)$, $ f:\rr^{n} \to \rr$, $ f \in \Gamma_0 (\rr^{n})$ (proper, lsc, convex) and $ \nabla f$ is $L$-Lipschitz continuous (strongly smooth). 
8 | 9 | \subsubsection{Attempt 1} 10 | \[ 11 | x_{k+1} = \argmin_{x} \left[ f(x_k) + \underbrace{ \langle \nabla f(x_k), x- x_k \rangle}_{q_k (x) \text{ 1st order surrogate} } \right] 12 | .\] 13 | Linearization is a common trick to simplify problems. However, this fails because $ \min_{x} q_k(x) = -\infty$ for a linear function (unless it's already optimal). We can fix this by add a compact constraint. Then it's called \allbold{Frank-Wolfe} or \allbold{conditional gradient}. We omit this discussion as it's a bit niche. 14 | 15 | \subsubsection{Attempt 2} 16 | Consider the 2nd order Taylor series: 17 | \[ 18 | x_{k+1} = \argmin_{x} \underbrace{ f(x_k) + \langle \nabla f(x_k) , x-x_k \rangle + \frac{1}{2} \langle x-x_k, \nabla ^2 f(x_k) (x-x_k) \rangle}_{q_k(x) \text{ quadratic surrogate} } 19 | .\] 20 | Since $ f$ is convex, $ \nabla ^2 f(x) \succeq 0 \implies q_k(x)$ is a convex quadratic (sum of convex functions). 21 | 22 | To minimize $ q_k(x)$, we use Fermat's rule: 23 | \begin{align*} 24 | 0 &= \nabla q_k(x) \\ 25 | &= \nabla f(x_k) + \nabla ^2 f(x_k) (x-x_k) &&\text{ gradients of linear and quadratic terms} \\ 26 | x_{k+1} &= x_k - \nabla ^2 f(x_k)^{-1} \nabla f(x_k) 27 | \end{align*} 28 | 29 | This is \allbold{Newton's method}, a generalization of the "Newton-Raphson" for 1D root-finding, applied to the gradient. It is a \allbold{2nd-order method} because it involves the derivative of the gradient which is the second derivative (Hessian). 30 | 31 | \begin{remark} 32 | Unlike 1D root-finding, 2nd order methods in higher dimensions converge quickly but each iteration may be costly because we need to invert the Hessian and solving system of equations. This is about $ \mathcal{ O}(n^3)$. 1st-order methods only use $ \nabla f(x)$ and usually converge more slowly but each step is cheap at about $ \mathcal{ O}(n)$. 
33 | \end{remark} 34 | 35 | \subsubsection{What to use?} 36 | It depends: 37 | \begin{itemize} 38 | \item Structure matters (is $ \nabla ^2 f$ easy to invert? Is it ill-conditioned (which hurts 1st order more)?) 39 | \item For small/medium problem size, high accuracy, we use 2nd order. This is default for cvx/cvxpy. 40 | \item In between problems: try both? 41 | \end{itemize} 42 | 43 | \subsubsection{Other types} 44 | \begin{itemize} 45 | \item 3rd order: usually not worth the complexity. See recent Nesterov work for a plausible implementation. 46 | \item 0th order: Extremely slow and finding gradient is cheap anyway, usually not worth it. 47 | \item coordinate descent: heavily depends on the structure. 48 | \end{itemize} 49 | 50 | \subsubsection{Attempt 3} 51 | By assumption, $ 0 \preceq \nabla ^2 f(x) \preceq L I$. Thus, for all $ y$, 52 | \[ 53 | \frac{1}{2} \langle y, \nabla ^2 f(x)\ y \rangle \leq \frac{1}{2} L \norm{ y}^2 54 | .\] 55 | This allows us to upper bound the quadratic surrogate and simplify it further by removing the Hessian. Notice $ (LI) ^{-1} = \frac{1}{L} I$ which replaces $ (\nabla ^2 f)^{-1}$. So we can modify Newton's method as 56 | \begin{align*} 57 | x_{k+1} &= \argmin_{x} \underbrace{ f(x_k) + \langle \nabla f(x_k) , x- x_k \rangle + \frac{1}{2} L \norm{ x -x_k}^2}_{q_k(x)}\\ 58 | &= x_k - \frac{1}{L} \nabla f(x_k) 59 | \end{align*} 60 | This is $ \mathcal{ O}(n)$. Here $ q_k(x) \geq f(x) \ \forall \ x$ is more than a linearization but is less than the full 2nd order Taylor expansion. It is a \allbold{majorizer} of $ f$. 61 | 62 | fig 63 | 64 | \subsubsection{Majorization-minimization (MM)} 65 | MM can always guarantee making progress on the minimization. The framework is 66 | \begin{enumerate}[label=\arabic*)] 67 | \item Assume we can always construct a majorizer $ q_k$ s.t. 
68 | \begin{enumerate}[label=(\roman*)] 69 | \item $ \ \forall \ x, f(x) \leq q_k(x)$ 70 | \item $ f(x_k) = q_k(x_k)$ 71 | \end{enumerate} 72 | \item Iterate: $ x_{k+1} \in \argmin_{x} q_k(x)$. 73 | \end{enumerate} 74 | This algorithm is a \allbold{descent algorithm}. That is, it never makes things worse. 75 | \begin{proof} 76 | \begin{align*} 77 | f(x_{k+1}) &\leq q_k (x_{k+1}) \qquad \text{ by (i)} \\ 78 | &\leq q_k(x_k) \qquad \text{ by 2)} \\ 79 | &= f(x_k) \qquad \text{ by (ii)} 80 | \end{align*} 81 | \end{proof} 82 | 83 | Usually we might eventually show that (no convexity needed): 84 | \begin{itemize} 85 | \item If $ f(x)$ is bounded below, then $ f(x_k)$ converges by MCT. 86 | \item If $ (x_k)$ converges and $ f$ is lsc, then the limit $ x_k \to x$ is a stationary point \emph{i.e.} $ \nabla f(x) =0$. 87 | \end{itemize} 88 | \begin{eg}[usually non-convex] 89 | ~\begin{enumerate}[label=\arabic*)] 90 | \item Expectation maximization (EM) for maximum-likelihood estimation. 91 | \item Difference of convex functions (DC) or convex + concave: 92 | \[ 93 | f(x) = g(x) - h(x) 94 | ,\] 95 | where $ g,h$ are both convex. Although $ -h$ is concave, $ -h$ is majorized by its tangent line which is convex. Then 96 | \[ 97 | q_k(x) = g(x) - \underbrace{ (h(x_k) + \langle \nabla h(x_k),x-x_k \rangle)}_{ \text{ affine in }x } 98 | \] 99 | is a majorizer, and $ q_k(x)$ is convex. 100 | \end{enumerate} 101 | The takeaway is that not all non-convex problems are equally hard. 
102 | \end{eg} 103 | \end{document} 104 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_14.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \newpage 6 | \section{Convex optimization problems} 7 | \subsection{Tricks} 8 | The standard form of an optimization problem looks like: 9 | \begin{align*} 10 | \min\ &f_0(x)\\ 11 | \text{subject to } &f_i(x) \leq 0, i = 1,\ldots,m \\ 12 | &h_i(x) = 0 , i = 1,\ldots,p 13 | \end{align*} 14 | \begin{enumerate}[label=\arabic*)] 15 | \item Max to min: 16 | \[ 17 | \max_{x \in C} f(x) = - \min_{x \in C} -f(x) 18 | .\] 19 | \item Equivalent problems: 20 | \begin{eg} 21 | $ f(x) = \sqrt{|x|} $. Then $ \argmin f(x) = \argmin f(x)^2$ and we turn it into a convex problem. 22 | \end{eg} 23 | 24 | \begin{eg} 25 | $ f(x) = \norm{ Ax-b} + \lambda \norm{ x}^2 $. Equivalently, we can solve 26 | \[ 27 | \min \norm{ Ax- b}^2 + \sigma \norm{ x}^2 28 | .\] 29 | So we need to adjust the constant (Lagrange multipliers). 30 | \end{eg} 31 | 32 | \item Change of variables: 33 | This works especially well for affine transformation because it doesn't change convexity. 34 | \item Eliminate equality constraints: 35 | \begin{eg} 36 | \[ 37 | \underset{ Ax=b}{ \min} f(x) 38 | .\] 39 | We can decompose $ x = x_p + \ker A$ (particular solution + homogeneous solution). Let $ F$ be the basis of $ \ker(A)$, then $ x= x_p + F z$. This way we change the problem to 40 | \[ 41 | \min_z f(x_p +F z) 42 | .\] 43 | Notice this eliminates the constraint by (affine) change of variable. 44 | \end{eg} 45 | \item Slack variables: 46 | 47 | $ f_i(x) \leq 0$ iff there exists a $ s_i \geq 0$ s.t. $ f_i(x) + s_i =0$. 
48 | Then we turn $ \min_x f_0(x), \text{ subject to } f_1(x) \leq 0$ into 49 | \begin{align*} 50 | \min_{x,s}\ &f_0(x)\\ 51 | \text{ subject to } &f_1(x) + s= 0\\ 52 | &s\geq 0 53 | \end{align*} 54 | This is less important nowadays since software is less constrained by the form we give it. 55 | \item Epigraph: 56 | \[ 57 | \min_{x \in \rr^{n}} f(x) \iff \min_{x \in \rr^{n}, t \in \rr} t, \qquad f(x) \leq t 58 | .\] 59 | \begin{eg} 60 | \begin{align*} 61 | \min \norm{ Ax -b}_1 &= \min \sum_{ i= 1}^{ m} \left| a_i^{T} x - b_i \right| \\ 62 | &= \min_{t \in \rr^{m}, x \in \rr^{n}} \mathbbm{1}^{T} t\\ 63 | & \qquad \qquad \qquad a_i^{T}x - b_i \leq t_i\\ 64 | & \qquad \qquad \qquad a_i^{T} x - b_i \geq -t_i 65 | \end{align*} 66 | \end{eg} 67 | \item Solve coupled functions $ \min f(x) +g(x)$. This is equivalent to 68 | \begin{align*} 69 | \min_{x,z} f(x) + g(z) \text{ subject to } x=z 70 | \end{align*} 71 | This way we decouple the functions and make it easier to solve. 72 | \item Marginalization: 73 | \begin{align*} 74 | \min_{x,y} f(x,y) &= \min_x \left( \min_y f(x,y) \right) \\ 75 | &= \min_x g(x) 76 | \end{align*} 77 | \begin{note} 78 | We can always commute extremization of the same type. 79 | \end{note} 80 | \end{enumerate} 81 | 82 | \subsection{Convex optimization problems [BV04 Ch.4.2]} 83 | 84 | We want both the objective function and the constraint set to be convex. 85 | A typical problem: 86 | \begin{align*} 87 | \min &f_0(x)\\ 88 | \text{subject to } &f_i(x) \leq 0, i = 1,\ldots,m\\ 89 | &h_i(x) = 0 , i = 1,\ldots,p 90 | \end{align*} 91 | A convex problem would be 92 | \begin{align*} 93 | \min\ &f_0 (x) \\ 94 | \text{ subject to } &f_i(x) \leq 0 \\ 95 | &a_i^{T} x =b_i 96 | \end{align*} 97 | where $ f_0, \ldots, f_m$ are convex functions and the equality constraints are affine. 98 | \begin{thm} 99 | Consider the convex problem, $\min f(x), x \in C$. Assume $ f \in \mathcal{ C}^{1}$. 
Then $ x$ is optimal iff 100 | \begin{enumerate}[label=\arabic*)] 101 | \item $ x \in C$ 102 | \item $ \ \forall \ y \in C$, $ \langle \nabla f(x), y-x \rangle \geq 0$ (Euler inequality). 103 | \end{enumerate} 104 | 105 | \end{thm} 106 | 107 | \begin{proof} 108 | $ (\impliedby)$: 109 | \begin{align*} 110 | f(y) \geq f(x) + \langle \nabla f(x) , y-x \rangle \geq f(x) 111 | \end{align*} 112 | $ (\implies)$: 113 | Suppose $ x$ is optimal but there exists a $ y \in C$ s.t. $\langle \nabla f(x),y-x \rangle < 0$. Then for $ t \in (0,1]$, the 1D parameterization yields: 114 | \begin{align*} 115 | \phi(t) &=f(x+t(y-x))\\ 116 | &= \phi(0) + \phi'( 0) t + \frac{\phi''( \xi)}{ 2} t^2 \qquad \qquad \text{ Taylor} \\ 117 | &\leq f(x) + t \langle \nabla f(x), y-x \rangle \qquad \phi \text{ convex by composition} \\ 118 | &< f(x) 119 | \end{align*} 120 | Clearly $ \phi(t)$ is feasible and this contradicts that $ x$ is optimal. 121 | \end{proof} 122 | \begin{coro} 123 | If $ \langle d,z \rangle \geq 0 \ \forall \ z \implies d=0$. 124 | \end{coro} 125 | \begin{proof} 126 | Take $ z=-d$ and result follows from positive definitiveness of norm. 127 | \end{proof} 128 | \end{document} 129 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_16.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{Linear matrix inequalities (LMI): dual problem of SDPs} 6 | \begin{align*} 7 | \min_{x \in \rr^{n}}\ & \langle c,x \rangle \\ 8 | \text{subject to } &\sum_{ i= 1}^{ m} x_i F_i + G \preceq 0, i = 1,\ldots,m, F_i, G \in S^{k} \\ 9 | & Ax=b 10 | \end{align*} 11 | \begin{note} 12 | $ \sum_{ i= 1}^{ m} x_i F_i$ is like $ A^* y = \sum_{ i= 1}^{ m} y_i \ve{a}_i$ in the case when $ Ax=b \implies \ve{a}_i^{T}x=b$. 
13 | \end{note} 14 | 15 | \begin{remark} 16 | We can recover linear programs by letting $ F_i,G$ be diagonal matrices. 17 | \end{remark} 18 | 19 | \begin{remark} 20 | We can also recover SOCP's, details ommitted. Let $ A \in S_{++}^{r}, C \in S^{s}, B \in \rr^{r \times s}$. Then 21 | \[ 22 | \begin{pmatrix} A&B\\B^{T}&C \end{pmatrix} \succeq 0 \iff \underbrace{ C-B^{T}A^{-1}B}_{ \text{ Schur complement} } \succeq 0 23 | .\] 24 | Schur complement might be computationally cheaper especially for example when $ C =0$. 25 | \end{remark} 26 | 27 | Let $ K_1, K_2$ be proper cones, then $ K_1 \times K_2$ is also a proper cone. 28 | \begin{eg} 29 | $ X \succeq 0, Y \succeq 0$, we can write 30 | \[ 31 | \begin{pmatrix} X&Z\\Z^{T}&Y\\ \end{pmatrix} \succeq 0, Z=0 \text{ (linear constraint)} 32 | .\] 33 | However, this is horrible for computation. For example, in the case of negative log barrier, we can separate each constraint and projecting to $ \rr_{+}^{n}$ is easy. We can also project to $ S_{+}^{n}$ by making the eigenvalues to nonnegative. But doing this on a bigger matrix is expensive since finding eigenvalues is super-linear. 34 | \end{eg} 35 | 36 | \newpage 37 | \section{Duality [BV04 Ch.5]} 38 | \subsection{Lagrange dual function/problem} 39 | 40 | Consider $ p^* = \min_{x \in C} f(x)$. Here $ x \in C$ is \allbold{primal feasible} and we can find it by finding the smallest upper bound. We wish to find a dual feasible point s.t. it is the largest lower bound on $ p^* $. 
41 | 42 | \end{document} 43 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_17.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | 6 | First start with the primal problem without assuming convexity: 7 | \begin{align*} 8 | \min\ &f_0(x) \\ 9 | \text{subject to } &f_i(x) \leq 0, i = 1,\ldots,m \\ 10 | &h_i(x) = 0 , i = 1,\ldots,p 11 | \end{align*} 12 | Given a non-empty domain $ D = \bigcap_{i=0}^m \dom(f_i) \cap \bigcap_{ i=1}^{p} \dom(h_i)$. Note that $ \dom (f_0) $ is an implicit constraint, \emph{e.g.} $ f_0(x) = x^2 + I_{ \{x\geq 0\} }$. 13 | 14 | \begin{defn}[lagrangian] 15 | \[ 16 | \mathscr{L}(x, \lambda, \nu) = f_0(x) + \sum_{ i= 1}^{ m} \lambda_i f_i(x) + \sum_{ i= 1}^{ p} \nu_i h_i(x) 17 | ,\] 18 | where $ \lambda,\nu$ are \allbold{dual variables}, $ \lambda$ is associated with inequality constraints whereas $ \nu$ is associated with equality constraints. 19 | \end{defn} 20 | \begin{remark} 21 | We can be creative about whether we put the constraints explicitly in $ f_i, h_i$ or implicitly in $ f_0$. 22 | \end{remark} 23 | 24 | \begin{defn}[dual function] 25 | \[ 26 | g( \lambda, \nu) = \inf_{x \in D} \mathscr{L}(x, \lambda, \nu) 27 | .\] 28 | \end{defn} 29 | \begin{note} 30 | This problem is usually easier than the primal problem because it doesn't have constraints. 31 | \end{note} 32 | 33 | \begin{ques} 34 | Is $ g$ convex? 35 | \end{ques} 36 | Recall the infimum preserves convexity if $ C$ is convex and $ f(x,y)$ is jointly convex, whereas the supremum over $ x$ preserves convexity if $ f(x,y)$ is convex in $ y$ for each fixed $ x$. The Lagrangian is most likely not jointly convex, but it is in fact affine in the dual variables for each fixed $ x$, so $ -\mathscr{L}(x, \cdot , \cdot )$ is convex. Therefore, 37 | \[ 38 | g(\lambda,\nu) = - \sup_{x \in D} \left( -\mathscr{L}(x, \lambda,\nu) \right) 39 | \] 40 | is concave. 
41 | 42 | 43 | Dual problem (D): 44 | \[ 45 | d^* = \max_{\lambda \geq 0,\ \nu} g(\lambda,\nu) 46 | .\] 47 | \begin{prop}[weak duality] 48 | Define $ p^*$ to be the optimal value for the primal problem and $ d^* $ be the maximum value for the dual problem. If $ \lambda \geq 0$ and $ \nu$ is anything, then 49 | \[ 50 | g(\lambda,\nu) \leq p^* 51 | .\] 52 | Hence $ d^* \leq p^* $. 53 | \end{prop} 54 | 55 | \begin{proof} 56 | \begin{align*} 57 | g(\lambda,\nu) &= \inf_{x} \mathscr{L}(x, \lambda,\nu) \\ 58 | &\leq \mathscr{L}(x, \lambda, \nu) \ \forall \ x \in D \text{ and feasible} \\ 59 | &= f_0(x) + \sum_{ i= 1}^{ m} \underbrace{\lambda_i}_{\geq 0} \underbrace{f_i(x)}_{\leq 0}+ \sum_{ i= 1}^{ p} \nu_i \underbrace{h_i(x)}_{=0} \qquad \text{ x is feasible} \\ 60 | &\leq f_0(x) 61 | \end{align*} Taking the infimum over all feasible $ x$ gives $ g(\lambda,\nu) \leq p^* $. 62 | \end{proof} 63 | \begin{remark} 64 | "Strong duality" $ d^* =p^* $ tends to happen if $ (P)$ is convex. 65 | \end{remark} 66 | \begin{eg}[dual of a LP] 67 | \begin{align*} 68 | \min\ & \langle c,x \rangle \\ 69 | \text{subject to } &x\geq 0 \iff -x_i \leq 0\\ 70 | &Ax=b 71 | \end{align*} 72 | Then 73 | \begin{align*} 74 | \mathscr{L}(x,\lambda,\nu) &= \langle c,x \rangle - \langle \lambda,x \rangle + \nu^{T} (Ax-b)\\ 75 | &= \langle c,x \rangle - \langle \lambda,x \rangle + \langle A^{T} \nu,x \rangle - \langle \nu,b \rangle \\ 76 | g(\lambda,\nu) &= \inf_x \mathscr{L}(x, \lambda,\nu) \\ 77 | &= -\langle \nu,b \rangle + \inf_{x} \langle c-\lambda+ A^{T}\nu,x \rangle = 78 | \begin{cases} 79 | -\langle \nu,b \rangle &c-\lambda+A^{T} \nu = 0 \\ 80 | -\infty & \text{ else} 81 | \end{cases} 82 | \end{align*} 83 | Thus the dual problem $ \max_{\lambda\geq 0,\ \nu} g(\lambda,\nu)$ becomes 84 | \[ 85 | \max_{\lambda\geq 0,\ \nu} - \langle \nu,b \rangle, \lambda= c+A^{T} \nu 86 | \] 87 | or 88 | \[ 89 | -\min_{\nu}\langle \nu,b \rangle, c+A^{T} \nu \geq 0 90 | .\] 91 | This is a LP! 
92 | \end{eg} 93 | \end{document} 94 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_18.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{Dual of linear programs} 6 | The primal (P) is: 7 | \begin{align*} 8 | \max_{x \in \rr^{n}}\quad & \langle c,x \rangle \\ 9 | \text{subject to } \quad &x \geq 0\\ 10 | y:\quad &Ax \leq b 11 | \end{align*} 12 | The dual (D) is: 13 | \begin{align*} 14 | \min\quad &\langle b,y \rangle \\ 15 | \text{subject to } \quad & y\geq 0\\ 16 | x:\quad & A^{T} y \geq c 17 | \end{align*} 18 | 19 | Rules to transform (P) to (D): 20 | \begin{enumerate}[label=(\arabic*)] 21 | \item $ \max \to \min$ and vice versa. 22 | \item variables $ \to$ constraints and vice versa. 23 | \item objective and RHS of inequality flip places. 24 | \item matrices transpose. 25 | \end{enumerate} 26 | "SOB" mnemonic: sensible, odd, and bizarre from a business perspective 27 | \begin{align*} 28 | \text{ primal variable } x_i \quad &x_i\geq 0 \text{: sensible} \\ 29 | & \text{ no constraint: odd} \\ 30 | &x_i \leq 0 \text{: bizarre} 31 | \end{align*} 32 | \begin{align*} 33 | \text{ constraints in primal when maximizing} \quad & a_i^{T}x\leq b_i \text{: sensible (think budget)} \\ 34 | & a_i^{T}x=b_i\text{: odd} \\ 35 | &a_i^{T}x \geq b_i \text{: bizarre} 36 | \end{align*} 37 | The rule: a dual constraint is S/O/B if primal variable is S/O/B. And vice versa. 
38 | 39 | \begin{eg} 40 | \begin{align*} 41 | (P) \qquad \qquad \min_{\substack{x\geq 0\\ x \in \rr^{2}}} \quad &3x_1+2x_2\\ 42 | y_1: \quad &x_1 + 2x_2 \geq 5 \qquad S\\ 43 | y_2:\quad & \underbrace{x_2 \leq 2 \qquad}\qquad B\\ 44 | & \begin{pmatrix} 1&2\\0&1 \end{pmatrix} \begin{pmatrix} x_1\\x_2 \end{pmatrix} 45 | \end{align*} 46 | \begin{align*} 47 | (D) \qquad \qquad \max_{\substack{y \in \rr^{2}\\y_1\geq 0\ S\\ y_2 \leq 0 \ B}} \quad &5y_1+2y_2\\ 48 | x_1: \quad &y_1 + 0y_2 \leq 3 \qquad S\\ 49 | x_2:\quad & 2y_1+y_2 \leq 2 \qquad S 50 | \end{align*} 51 | 52 | 53 | Observe: $ f(x) = 3x_1 + 2 x_2 = \underbrace{2x_1}_{\geq 0} + \underbrace{(x_1 + 2x_2)}_{\geq 5} \geq 5$. We proved $ 5\leq p^* $. However, this is not the tightest bound. The dual variables give us the tightest: 3 times the first constraint and -4 times the second constraint yields $ 7\leq p^* $. 54 | \end{eg} 55 | \allbold{Duality gap}: for primal-feasible $ x$ and dual-feasible $ (\lambda,\nu)$, the gap is $ f_0(x) - g(\lambda,\nu) \geq 0$. 56 | 57 | \subsubsection{Strong duality results} 58 | \begin{itemize} 59 | \item If $ (P)$ isn't convex, strong duality is unlikely except certain nonconvex QP: s-lemma/s-procedure (see Appendix of BV). 60 | \item If (P) is convex, strong duality holds under certain constraint qualifications (CQ) such as Slater's condition. 61 | \begin{align*} 62 | \min\quad &f_0(x) \\ 63 | \text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m \\ 64 | &Ax = b 65 | \end{align*} 66 | \begin{defn}[Slater's conditions] 67 | They hold if there exists a strictly feasible point, $ x \in \ri(\dom(f_0))$ and 68 | 69 | if $ f_i$ is affine, $ f_i(x)\leq 0$ (feasible) 70 | 71 | if $ f_i$ isn't affine, $ f_i(x)<0$ (strictly feasible) 72 | 73 | and $ Ax=b$. 74 | \end{defn} 75 | \begin{thm} 76 | If (P) is convex and Slater's conditions hold, then 77 | \begin{enumerate}[label=(\roman*)] 78 | \item we have strong duality, $ d^* =p^* <\infty$ 79 | \item there exists an optimal solution to the dual problem.
80 | \end{enumerate} 81 | \end{thm} 82 | \begin{note} 83 | Slater's does NOT imply there exists an optimal \emph{primal} solution. 84 | \begin{eg} 85 | $ \inf_{x \in \rr} e^{x}$. It is convex, lsc, proper. But it is not coercive so it doesn't have an optimal primal solution. 86 | \end{eg} 87 | \end{note} 88 | \begin{remark} 89 | Often we want Slater's condition on the dual. Since the dual of the dual is the primal, then we guarantee an optimal solution. 90 | \end{remark} 91 | \begin{coro}[Slater for LP] 92 | Slater's conditions hold iff the LP is feasible \emph{i.e.} $ p^* < \infty$. 93 | 94 | $ p^* < \infty \implies d^* =p^* $ and dual optimal solution exists. 95 | 96 | $ d^* >- \infty \implies d^* =p^* $ and primal optimal solution exists. 97 | 98 | Hence if either $ p^* $ or $ d^* \in \rr$ (not $ \pm \infty$), then optimal primal and dual solutions exist. 99 | \end{coro} 100 | \begin{note} 101 | $ d^* = - \infty, p^* = +\infty$ (both problems infeasible) is possible but rare. This is not strong duality. 102 | \end{note} 103 | \end{itemize} 104 | \end{document} 105 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_19.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsection{Saddle point interpretation [BV 4.2]} 6 | Here we want to find the saddle points as we want to minimize the primal but maximize the dual.
7 | \begin{align*} 8 | p^* = \min\quad &f_0(x) \\ 9 | \text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m \\ 10 | &Ax = b 11 | \end{align*} 12 | This is equivalent to 13 | \begin{align*} 14 | &\min_{x}f_0(x) + \sup_{\lambda\geq 0, \nu}\left\{ \sum \lambda_i f_i(x) + \nu^{T} (Ax-b)\right\} \\ 15 | =\ & \min_{x \in D} \sup_{\lambda\geq 0, \nu} \mathscr{L}(x,\lambda,\nu) 16 | \end{align*} 17 | This is because if $ f_i(x)>0$ or $ a_i^{T} x -b_i \neq 0$ for some $ i$, then we get $ \infty$ in the supremum, encoding it as infeasible. 18 | 19 | Then the dual is 20 | \[ 21 | d^* = \max_{\lambda\geq 0, \nu } g(\lambda,\nu) = \max_{\lambda\geq 0, \nu} \min_{x \in D} \mathscr{L}(x,\lambda,\nu) 22 | .\] 23 | The weak-duality is equivalent to "min-max" inequality: 24 | \[ 25 | d^* =\max_{\lambda\geq 0,\nu} \min_{x \in D} \mathscr{L}(x,\lambda,\nu) \leq \min_{x \in D} \max_{\lambda\geq 0, \nu} \mathscr{L}(x,\lambda,\nu) = p^* 26 | .\] 27 | And equality is achieved if strong-duality holds. 28 | \begin{note} 29 | All "min/max" should be "inf/sup" until proven. 30 | \end{note} 31 | 32 | \begin{thm} 33 | Saddle point occurs when 34 | \begin{enumerate}[label=(\arabic*)] 35 | \item strong-duality/strong max/min 36 | \item inf/sup are achieved. 37 | 38 | That is, $ (x^* ,(\lambda^* ,\nu^* ))$ is a saddle point of $ \mathscr{L}(x,(\lambda,\nu))$ if 39 | \begin{align*} 40 | \mathscr{L}(x^* ,(\lambda^* ,\nu^* )) &= \inf_x \mathscr{L}(x,(\lambda^* ,\nu^* )) \\ 41 | \mathscr{L}(x^* ,(\lambda^* ,\nu^* )) &= \sup_{\lambda,\nu} \mathscr{L}(x^* ,(\lambda,\nu)) 42 | \end{align*} 43 | \end{enumerate} 44 | \end{thm} 45 | 46 | \begin{coro} 47 | If we know $ \lambda^* ,\nu^* $, then we can find $ x^* $ by solving the unconstrained problem 48 | \[ 49 | \min_x \mathscr{L}(x,(\lambda^* ,\nu^* )) 50 | .\] 51 | \end{coro} 52 | This allows us to solve problems with shared Lagrangians. 
53 | \subsubsection{Shared Lagrangian} 54 | \begin{eg} 55 | \begin{align*} 56 | \min\quad &\norm{ x}_1 \\ 57 | \text{subject to } \quad & \norm{ Ax-b}_2 \leq \epsilon \quad \iff \quad \norm{ Ax-b}_2^2 - \epsilon^2 \leq 0 58 | \end{align*} 59 | Let 60 | \[ 61 | \mathscr{L}(x,\lambda) = \norm{ x}_1 + \lambda \left( \norm{ Ax-b}_2^2 - \epsilon^2 \right) 62 | .\] 63 | With the correct $ \lambda^* $, this is equivalent to 64 | \[ 65 | \min_x \norm{ x}_1 + \lambda^* \norm{ Ax-b}_2^2 66 | ,\] 67 | because dropping the constant doesn't affect minimizer. This unconstrained problem is much nicer because the least squares is differentiable, whereas the original constraint is hard to project. 68 | 69 | \end{eg} 70 | 71 | Even if we don't know $ \lambda^* $, 72 | \begin{enumerate}[label=(\arabic*)] 73 | \item guess $ \lambda$, solve $ x = x(\lambda)$, check if the constraint is active, update $ \lambda$ (solve the dual problem). 74 | \item often $ \epsilon$ is not known (hyper-parameter) and set via cross-validation so we can do cross-validation on $ \lambda$ directly (evaluate trade-off in modeling). 75 | \end{enumerate} 76 | We assume existence of saddle points here, which is given by the following: 77 | \begin{prop} 78 | Slater's on both primal and dual $ \implies$ existence of saddle points. 79 | \end{prop} 80 | 81 | \subsection{Game Theory connection} 82 | Consider a finite, 2-person, 0-sum game: "matrix game" (not Prisoner's dilemma). 83 | 84 | This involves the Minimax Theorem of Von Neumann. 85 | \begin{eg}[rock-paper-scissors] 86 | Player 1 wants to minimize and Player 2 wants to maximize utility. 
The payoff matrix looks like 87 | \begin{table}[H] 88 | \centering 89 | \begin{tabular}{c||c|c|c} 90 | &P&S&R\\ 91 | \hline 92 | \hline 93 | P&0&1&-1\\ 94 | \hline 95 | S&-1&0&1\\ 96 | \hline 97 | R&1&-1&0 98 | \end{tabular} 99 | \caption*{Row: Player 1; Column: Player 2} 100 | \end{table} 101 | 102 | $ u^{T}Pv$ is the payoff, intuitively it means player 1 chooses a row and player 2 chooses a column. For a fair game, the payoff value is 0. Since $ P=-P^{T}$ is antisymmetric, the game is fair. But in reality, $ u$ and $ v$ actually encode the probabilities of choosing each row/column, which sum up to 1. 103 | 104 | Define probability simplex $ \Delta = \{u:u\geq 0, \sum u_i = 1\} $. 105 | 106 | \begin{case}[Player 2 knows player 1's strategy] 107 | If $ u$ is known, 108 | Then the decision is easy: choose $ v \in \argmax_{v \in \Delta} u^{T}Pv$. 109 | 110 | If Player 1 knows Player 2 knows Player 1's strategy, then Player 1 should select $ u$ to minimize Player 2's payoff: 111 | \[ 112 | p_1^* = \min_{u \in \Delta} \max_{v \in \Delta} u^{T}Pv 113 | .\] 114 | This is in fact a LP. 115 | \end{case} 116 | \begin{case}[Player 1 knows Player 2's strategy] 117 | \[ 118 | p_2^* = \max_{v \in \Delta} \min_{u \in \Delta} u^{T}Pv 119 | .\] 120 | \end{case} 121 | Intuitively, whoever has knowledge of opponent's move gets an edge, so the payoff when Player 2 has an edge in maximizing will be at least the payoff when Player 1 has an edge in minimizing. That is, $ p_1^* \geq p_2^* $. This is weak duality. Slater's condition for LP requires only a feasible point. Since $ \Delta$ is nonempty, we have strong duality $ p_1^* =p_2^* $.
122 | \end{eg} 123 | 124 | \end{document} 125 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_20.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsection{Fenchel-Rockafellar Duality [BC17]} 6 | 7 | \[ 8 | (P) \qquad \qquad \min_x f(x) + g(Ax) 9 | ,\] 10 | where $ f,g \in \Gamma_0$ and allows $ +\infty$ values, and $ A$ is a $ m\times n$ matrix. 11 | \[ 12 | (D) \qquad \qquad \min_{v} f^* (A^* v) + g^* (-v) 13 | .\] 14 | \subsubsection{Connections to Lagrangian duality} 15 | Recall 16 | \[ 17 | f^* (y) = \sup_{x} \langle x,y \rangle -f(x) 18 | .\] 19 | 20 | Take the same $ (P)$, recast as 21 | \[ 22 | \min_{x,z} f(x) + g(z)\ s.t.\ z=Ax \quad \implies \quad \mathscr{L}(x,z,v)=f(x)+g(z)+ \langle z-Ax,v \rangle 23 | .\] 24 | Then the Lagrangian dual function $ h(v)$ is 25 | \begin{align*} 26 | h(v) = \inf_{x,z} \mathscr{L}(x,z,v) &= \inf_{x} (f(x)- \langle Ax,v \rangle) + \inf_{z} (g(z) + \langle z,v \rangle)\\ 27 | &= - \sup_{x} (\langle x,A^* v \rangle -f(x)) - \sup_{z}( - \langle z,v \rangle -g(z)) \\ 28 | &= -(f^* (A^* v) +g^* (-v)) 29 | \end{align*} 30 | Thus, the Lagrangian and F-R dual problems only differ by a minus sign: \[ \max_v h(v) = -\min_v -h(v) = - \min_v f^* (A^* v) +g^* (-v).\] 31 | \subsubsection{Saddle-point interpretation} 32 | 33 | If $ g \in \Gamma_0$, then $ g=g^{**}$, so $ g(x) = \sup_y \langle x,y \rangle -g^* (y)$. 
Using this fact we can rewrite the primal problem as 34 | \begin{align*} 35 | (P)\qquad \min_x f(x) + g(Ax) &= \min_x \sup_v f(x) + \langle Ax,v \rangle - g^* (v) \\ 36 | &= \sup_v \min_x f(x) + \langle x,A^* v \rangle -g^* (v) \qquad \text{ if saddle point exists} \\ 37 | &= \sup_v -f^* (-A^* v) -g^* (v)\\ 38 | &= \sup_v -f^* (A^* v) - g^* (-v) \qquad \qquad (D) 39 | \end{align*} 40 | 41 | \begin{prop}[18.9] 42 | Let $ f \in \Gamma_0( \mathcal{ H})$, if $ f^* $ is strictly convex, then $ f$ is (Gateaux) differentiable on $ \inte \dom(f)$. 43 | \end{prop} 44 | \begin{prop}[18.15] 45 | If $ f$ is continuous and convex, then 46 | 47 | $ f$ is (Frechet) differentiable and $ \nabla f$ is $L$-Lipschitz continuous if and only if $ f^* $ is $ L^{-1}$ strongly convex, 48 | 49 | and $ f \in \Gamma_0, f=f^{**} $. 50 | \end{prop} 51 | 52 | \subsubsection{Algorithms} 53 | \begin{enumerate}[label=(\arabic*)] 54 | \item gradients: If I know $ \nabla g$, can I find $ \nabla (g \circ A)$? Yes. It's $ A^* (\nabla g \circ A)$. 55 | \item projections/proximity operators: Let $ C = \{x: \norm{ x}_2 \leq 1\} $ and $ C \circ A = \{x: \norm{ Ax}_2 \leq 1\} $. In general, if I know $ \prox_{g}$, I don't know $ \prox_{g \circ A}$. Here there is no chain rule nor linearity. 56 | \end{enumerate} 57 | \begin{remark} 58 | We can use the dual to shift the linear operator from the proximity term to the differentiable term. 59 | \end{remark} 60 | 61 | \begin{thm}[15.23 generalized Slater] 62 | If $ 0 \in \ri (\dom g - A(\dom f))$ (CQ), then strong duality holds. That is, 63 | \[ 64 | \inf_x f(x) + g(Ax) = -\min_{v} f^* (A^* v) + g^* (-v) 65 | ,\] 66 | and the dual solution is attained. 67 | \end{thm} 68 | \begin{note} 69 | In finite dimensions, for CQ we just need to show $ \ri(\dom g) \cap A (\ri (\dom f))\neq \O$. Or, if $ f,g$ are polyhedral, $ \dom g \cap A(\dom f) \neq \O$. This is essentially saying we want a strictly feasible point.
70 | \end{note} 71 | \end{document} 72 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_22.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \begin{remark} 6 | Complementary slackness means if $ f_j(x^* )<0$, then this is an inactive constraint, since $ \lambda_j^* =0$, and 7 | \begin{align*} 8 | \mathscr{L}(x,\lambda^* ,\nu^* ) &= f_0(x)+ \sum_{i\neq j} \lambda_i^* f_i(x) + \sum \nu_i^* h_i(x) 9 | \end{align*} 10 | This is only true if we have strong duality. In particular, it usually isn't true for non-convex problems. 11 | \end{remark} 12 | \begin{eg}[1] 13 | Consider a convex case with inactive constraints: 14 | \begin{align*} 15 | \min_{x \in \rr}\quad &x \\ 16 | \text{subject to } \quad &x\geq 0 \\ 17 | &x\leq 1 \text{ this is not tight/active constraint} 18 | \end{align*} 19 | We can just remove the inactive constraint and still get the same solution. 20 | \end{eg} 21 | \begin{eg}[2] 22 | Consider a non-convex case with non-tight constraints: 23 | \begin{align*} 24 | \min_{x \in \rr}\quad &x \\ 25 | \text{subject to } \quad &x\geq 0 \qquad \text{ not tight} \\ 26 | &x^2 \geq 1 27 | \end{align*} 28 | as solution is $ x^* =1$. But if we remove the non-tight constraint $ x\geq 0$ here, we would get $ -\infty$ as the solution instead, so we can't just drop non-tight constraint for non-convex problems. 29 | \end{eg} 30 | 31 | \subsection{Meta-rules} 32 | Suppose $ C \subseteq \rr^{n}$, possibly nonconvex. 33 | \begin{enumerate}[label=(\arabic*)] 34 | \item switch between min and max with double minus signs or between argmin and argmax with single minus sign (since we don't care about function value). 
35 | \item If $ \phi$ is monotone on $ \im(f)$, then 36 | \[ 37 | \argmin_{x \in C} \phi(f(x)) = \argmin_{x \in C} f(x) 38 | .\] 39 | \begin{eg} 40 | $ \frac{1}{2} \norm{ Ax-b}^2 $ and $ \norm{ Ax-b} $. 41 | \end{eg} 42 | \item If we have all mins or all maxs, we can swap order 43 | \[ 44 | \min_x \min_y f(x,y) = \min_y \min_x f(x,y) = \min_{x,y} f(x,y) 45 | .\] 46 | \item If $ D \subseteq C$ where $ C$ can be seen as a relaxation, then 47 | \[ 48 | \min_{x \in C} f(x) \leq \min_{x \in D} f(x) 49 | .\] 50 | And we can obtain a lower bound this way. 51 | \item "superadditivity": 52 | \[ 53 | \min_{x \in C} f(x)+g(x) \geq \min_{x \in C} f(x) + \min_{x \in C} g(x) 54 | .\] 55 | \end{enumerate} 56 | 57 | \begin{eg}[solving convex problems using KKT] 58 | Recall that the solution to the least squares problem $ \min_x \frac{1}{2} \norm{ Ax-b}^2 $ when $ A$ has more rows than columns ($ m\geq n$) is 59 | \[ 60 | x^* = \left( A^{T}A \right) ^{-1} A^{T}b 61 | .\] 62 | In the case when $ m0$, then 44 | \[ 45 | p^* \geq p(u,0)\geq p^* - \underbrace{\lambda_i^* u_i}_{>0} 46 | .\] 47 | So loosening constraint doesn't help much to reduce the minimum. 48 | \end{case} 49 | \begin{case}[3] 50 | $ \lambda_i^* =0$, for example when $ f_i(x)\leq 10^{6}$, the constraint is inactive. 51 | \end{case} 52 | \begin{case}[4] 53 | When $ \lambda_i^* $ is large, loosen $ (u_i)>0$, or if $ \lambda_i^* =0$, tighten $ (u_i<0)$, then the analysis can't help us. 54 | \end{case} 55 | \begin{case}[5] 56 | If $ \nu_i^* \gg 0, v_i<0$ or $ \nu_i^* \ll 0, v_i>0$, then $ p(0,v) \gg p^* $ and we have a big change. 57 | \end{case} 58 | \begin{case}[6] 59 | If $ |v_i^* | \ll 1$ or $ \nu_i^* >0,v_i<0$ or $ \nu_i^* <0, v_i>0$, then $ p(0,v)$ doesn't change much. 60 | \end{case} 61 | \subsubsection{local sensitivity analysis} 62 | We can show that $ (P_{u,v})$ is convex using the minimizing conditions. 
Recall that for a convex function, 63 | \[ 64 | f(y) \geq f(x) + \langle \partial f(x) , y-x \rangle 65 | .\] 66 | Comparing with Equation above we see that the (negatives of the) dual variables are just the subgradients of $ p(u,v)$! 67 | If $ p(u,v)$ is differentiable, 68 | \[ 69 | \frac{\partial p(0,0)}{\partial u_i} = -\lambda_i^* , \quad \frac{\partial p(0,0)}{\partial v_i} =-\nu_i^* 70 | .\] 71 | This is symmetric! Now we can write the Taylor expansion: 72 | \[ 73 | p(u,v) = p(0,0) +\langle \frac{\partial p}{\partial u} ,u \rangle + \langle \frac{\partial p}{\partial v} ,v \rangle + \text{ higher-order terms} 74 | .\] 75 | This is only accurate with small perturbation. 76 | 77 | In economics, dual variables are referred to as "shadow prices". In statistics, it's called "score test". 78 | 79 | \subsection{Generalized Inequalities [BV 5.9]} 80 | \begin{align*} 81 | \min\quad &f_0(x) \\ 82 | \text{subject to } \quad &f_i(x) \preceq 0, f_i:\rr^{n} \to S^{m} \\ 83 | &h_i(x) = 0 , i = 1,\ldots,p 84 | \end{align*} 85 | \begin{eg}[SDP] 86 | Punchline: we get analogous KKT conditions. Instead of $ \lambda_i\geq 0$ now we require $ \Lambda_i \succeq 0$. 87 | 88 | Caveat: Before, if $ \lambda\geq 0, y \geq 0$ and $ \langle \lambda,y \rangle =0$, then $ \ \forall \ i, \lambda_i =0$ or $ y_i = 0$. 89 | However, in the matrix case, if $ \Lambda \succeq 0, Y=-f_i(x) \succeq 0$ and $ \langle \Lambda,Y \rangle = 0$, it does NOT mean $ Y=0$ or $ \Lambda =0$.
90 | 91 | But if $ \Lambda \succ 0, Y \succeq 0, \langle \Lambda,Y \rangle = 0 $, then $ Y=0$ 92 | \end{eg} 93 | \end{document} 94 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_24.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \newpage 6 | \chapter{Algorithms} 7 | \newpage 8 | \section{Unconstrained Optimization} 9 | We assume reasonable smoothness of the objective. Here is an overview of the algorithms: 10 | \begin{enumerate}[label=(\arabic*)] 11 | \item gradient descent 12 | \[ 13 | x_{k+1} = x_k - t \cdot \nabla f(x_k), \quad t= \frac{1}{L} 14 | .\] 15 | where $ L$ is the Lipschitz constant of the gradient. 16 | \item Newton's method 17 | \[ 18 | x_{k+1} = x_k - \left( \nabla ^2 f(x_k) \right)^{-1} \nabla f(x_k) 19 | .\] 20 | This can reduce to gradient descent when we have $ \nabla ^2 f(x) \preceq L \cdot I$ and we just bound the Hessian with $ L \cdot I$. 21 | \item Quasi-Newton 22 | \end{enumerate} 23 | \newpage 24 | \subsection{Proximal gradient descent} 25 | \begin{align*} 26 | \min \underbrace{ f(x)}_{ \text{ smooth, strongly convex} } + \underbrace{ g(x)}_{ \text{simple, convex}} 27 | \end{align*} 28 | Note we can add indicator function to $ g$: 29 | \[ 30 | g(x) = I_{C}(x) + h(x) 31 | ,\] 32 | \emph{i.e.} when we have constraint $ x \in C$. 33 | \subsubsection{motivation} 34 | We can try the first-order Taylor approximation of $ f$. However, recall minimizing a linear function would go to negative infinity, so we need to go to 2nd order. 
35 | \begin{align*} 36 | x_{k+1} &= \argmin_{x} f(x_k) + \langle \nabla f(x_k),x-x_k \rangle + \frac{1}{2} L\norm{ x-x_k}_2^2 + g(x)\\ 37 | &= \argmin_x \frac{1}{L} \left( \langle \nabla f(x_k),x-x_k \rangle + \frac{1}{2} L\norm{ x-x_k}_2^2 + g(x)\right) \\ 38 | &= \argmin_x \frac{1}{2} \norm{ x-\left(x_k - \frac{1}{L} \nabla f(x_k)\right)}_2^2 + \frac{1}{L} \cdot g(x) \text{ complete the square and ignore constants} \\ 39 | &= \argmin_x \frac{1}{2} \norm{ x-\widetilde{ x}}_2^2 + \frac{1}{L} \cdot g(x) \\ 40 | &= \prox_{\frac{1}{L} \cdot g} (\widetilde{ x}) 41 | \end{align*} 42 | Note that this solution is unique because we have strong convexity. 43 | \subsubsection{algorithm} 44 | \begin{align*} 45 | x_{k+1} = \prox_{t g} (x_k - t \cdot \nabla f(x_k))\qquad t\text{ via line search or } t=\frac{1}{L} 46 | \end{align*} 47 | \begin{remark} 48 | If $ g(x)=0$, proximal operator is the identity function so it reduces to gradient descent. 49 | \end{remark} 50 | 51 | \begin{eg} 52 | $ g(x) = I_C$. 
Then 53 | \begin{align*} 54 | \prox_{t g} (\widetilde{ x}) = \Proj_C(\widetilde{ x}) 55 | \end{align*} 56 | Recall from linear algebra: if $ \Proj_V(\widetilde{ x})$ is the projection of $ \widetilde{ x}$ onto $ V$, then 57 | \[ 58 | \widetilde{ x} =\Proj_V(\widetilde{ x}) + \Proj_{V^{\perp}}(\widetilde{ x}) 59 | .\] 60 | We can generalize this result to \allbold{Moreau's decomposition}: 61 | \begin{align*} 62 | \widetilde{ x} = \prox_g (\widetilde{ x}) + \prox_{g^* } (\widetilde{ x}) 63 | \end{align*} 64 | \end{eg} 65 | \begin{eg} 66 | \[ 67 | \Proj_{\norm{ x}_{\infty} \leq 1} (\widetilde{ x}) = \widetilde{ x} - \prox_{\norm{ \cdot }_1 } (\widetilde{ x}) 68 | .\] 69 | 70 | \begin{align*} 71 | \prox_{t \norm{ \cdot }_1 } (y) = \argmin_x \frac{1}{2} \norm{ x-y}_2 ^2 + t \norm{ x}_1 \text{ this is separable!} 72 | \end{align*} 73 | By Fermat's rule, 74 | \begin{align*} 75 | \prox_g (y) = \argmin \frac{1}{2}\norm{ x-y}^2 + g(x)\\ 76 | \implies 0 &\in x-y + \partial g(x)\\ 77 | y&\in x + \partial g(x) \\ 78 | y & \in (I+\partial g)(x) \\ 79 | x&\in \left( I+ \partial g \right)^{-1} (y) \\ 80 | x&= \left( I+ \partial \norm{ \cdot }_1 \right)^{-1} y \text{ unique solution by strong convexity} 81 | \end{align*} 82 | We derived earlier that the solution to $ \prox_{t \cdot \norm{ \cdot }_1 }$ is 83 | \[ 84 | x= \sgn(y) \cdot \lfloor |y| - t\rfloor_{+} 85 | .\] 86 | \end{eg} 87 | 88 | \subsubsection{alternative derivation} 89 | By Fermat 90 | \begin{align*} 91 | 0 &\in \partial (f+g)(x) \\ 92 | 0 &\in \nabla f(x) + \partial g(x) \text{ under CQ} \\ 93 | x &\in x + \nabla f(x) + \partial g(x) \\ 94 | x-\nabla f(x) &\in x + \partial g(x) = (I + \partial g)(x) \\ 95 | x &= \left( I+ \partial g \right) ^{-1} (I - \nabla f)(x) \text{ fixed point eqn} \\ 96 | x_{k+1} &= \left( I+ \partial g \right) ^{-1} (I - \nabla f)(x_k)\\ 97 | &= \prox_g (x_k - \nabla f(x_k)) 98 | \end{align*} 99 | If $ f = 0$, we get 100 | \begin{align*} 101 | x_{k+1} &= \prox_{t g}(x_k) \text{ here t is anything we want since }f=0 \\ 102 | &= \argmin t \cdot
g(x) + \frac{1}{2} \norm{ x-x_k}^2 \\ 103 | \end{align*} 104 | \begin{remark} 105 | Forward Euler exactly corresponds to gradient descent, whereas backward Euler exactly corresponds to proximal gradient descent. Thus, proximal gradient descent is also called "forward-backward method". 106 | \end{remark} 107 | \end{document} 108 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_25.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{Convergence of gradient descent} 6 | 7 | ~\begin{thm} 8 | Consider the problem 9 | \[ 10 | f^* = \min_x f(x) 11 | .\] 12 | $ f \in \Gamma_0(\rr^{n})$. We assume $ \nabla f$ is $L$-Lipschitz continuous. Choose $ t = \frac{1}{L}$. Then gradient descent with step size $ t$ converges with rate $ \mathcal{ O}\left( \frac{1}{k} \right) $. 13 | \end{thm} 14 | \begin{proof} 15 | We wish to bound $ f(x_{k+1}) - f^* $ by the local linear and quadratic lower and upper bounds. $L$-Lipschitz continuous implies that $ \nabla ^2 f(x) \preceq L \cdot I$. Recall that $ x_{k+1} = x_k - \frac{1}{L} \nabla f(x_k)$. 
16 | \begin{align*} 17 | f(x_{k+1}) &\leq f(x_k) + \langle \nabla f(x_k), x_{k+1}-x_k \rangle + \frac{L}{2} \norm{ x_{k+1}-x_k}^2 \\ 18 | &= f(x_k) - \frac{1}{2L} \norm{ \nabla f(x_k)}^2 \text{ descent method guarantees progress} \\ 19 | &\leq f^* +\langle \nabla f(x_k),x_k-x^* \rangle -\frac{1}{2L} \norm{ \nabla f(x_k)}^2 \text{ by convexity} \\ 20 | &= f^* + \frac{L}{2} \left( \norm{ x_k-x^* } ^2 - \norm{ x_k -x^* -\frac{1}{L} \nabla f(x_k)}^2 \right) \\ 21 | &= f^* + \frac{L}{2} \left( \norm{ x_k-x^* }^2 - \norm{ x_{k+1} - x^* }^2 \right) \\ 22 | \sum_{ i= 1}^{ k} \left( f(x_i) - f^* \right) &\leq \frac{L}{2} \sum_{ i= 1}^{ k} \left( \norm{ x_{i-1} - x^* } ^2 - \norm{ x_i - x^* }^2 \right) \\ 23 | &= \frac{L}{2} (\norm{ x_0 - x^* }^2 - \norm{ x_k - x^* }^2 ) \text{ telescope} \\ 24 | &\leq \frac{L}{2} \norm{ x_0 - x^* }^2 \\ 25 | f(x_{k}) - f^* &\leq \frac{1}{k} \sum_{ i= 1}^{ k} \left( f(x_i) - f^* \right) \leq \frac{L}{2k} \norm{ x_0-x^* }^2 \\ 26 | &= \mathcal{ O}\left( \frac{1}{k} \right) 27 | \end{align*} 28 | \end{proof} 29 | 30 | Question: is this the best we can do? 31 | \begin{enumerate}[label=(\arabic*)] 32 | \item Is our analysis tight? Yes. 33 | \item This is worst-case complexity. 34 | \item Are there similar methods (\emph{i.e.} first-order) with faster rates? More precisely, first-order method satisfies (Lanczos/CG): 35 | \[ 36 | x_k \in \mathscr{L}_k = \text{ span} \{x_0, \nabla f(x_0), \nabla f(x_1), \ldots , \nabla f(x_{k-1})\} 37 | .\] 38 | The answer is yes, by Nesterov 1983.
39 | \end{enumerate} 40 | 41 | \begin{thm}[Nesterov 1983] 42 | For any 1st order method, there exists a $ f \in \Gamma_0(\rr^{n})$ with $ \nabla f$ $L$-Lipschitz continuous and \[ f(x_k)-f^* \geq \frac{3}{32} \cdot \frac{L}{k^2} \cdot \norm{ x_0-x^* }^2 \text{ for } k\leq \frac{1}{2}(n-1) \] and \[ \norm{ x_k - x^* }^2 \geq \frac{1}{8} \norm{ x_0 - x^* }^2 \] 43 | \end{thm} 44 | \begin{proof} 45 | \emph{Sketch}: The adversarial function is 46 | \begin{align*} 47 | f(x) &= \frac{L}{4} \left( \frac{1}{2} \langle x,Ax \rangle - \langle e_1,x \rangle \right), A= \begin{pmatrix} 2&-1&0&\ldots\\-1&2&-1&0\\ \ldots\\0&0&-1&2 \end{pmatrix}\\ 48 | \nabla f(x) &= \frac{L}{4} (Ax - e_1)\\ 49 | x^* &= A^{-1} e_1 50 | \end{align*} 51 | Assume $ x_0 = 0$ (we can shift). At $ x_k$, at most the first $ k$ coordinates are nonzero. Since $ A^{-1}$ is a dense matrix, $ x^* $ has nonzero entries in every coordinate, so we can get a high norm difference. 52 | \end{proof} 53 | 54 | \begin{thm}[Nesterov] 55 | \begin{align*} 56 | y_0 &= x_0\\ 57 | x_{k+1} &= y_k-t_k \nabla f(y_k)\\ 58 | y_{k+1} & = x_{k+1} + \frac{k}{k+3} (x_{k+1}-x_k) 59 | \end{align*} 60 | This has convergence rate of $ \mathcal{ O}\left(\frac{1}{k^2}\right)$. 61 | \end{thm} 62 | \begin{remark} 63 | Since we cannot get better than $ O\left( \frac{1}{k^2} \right) $ and this algorithm achieves it, it is optimal. 64 | \end{remark} 65 | 66 | \end{document} 67 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_26.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{Gradient descent analysis with strong convexity} 6 | 7 | What do we want to analyze? Error metrics.
8 | 9 | \begin{enumerate}[label=(\arabic*)] 10 | \item $ f(x_{k+1}) - f(x_k), \norm{ x_{k+1} - x_k}, \norm{ \nabla f(x_k)} $: can always be practical termination criteria, although they might not be good. 11 | \item $ f(x_k) -f^* $: we can use this sometimes if we know $ f^* =0 $ or in the primal/dual problem when we can squeeze the gap between bounds. 12 | \item $ \norm{ x_k - x^* } $ : often can't use. 13 | \end{enumerate} 14 | 15 | \begin{remark} 16 | If $ f(x_k) = \sum_{ j= 1}^{ k} \frac{1}{j}$, then $ f(x_{k+1}) - f(x_k) \to 0$ but we don't have a minimum since the series diverges! 17 | \end{remark} 18 | 19 | \subsubsection{suboptimality bounds (see PDF handout)} 20 | If $ \nabla f$ is $L$-Lipschitz continuous, then 21 | \begin{enumerate}[label=(\arabic*)] 22 | \item $ \norm{ \nabla f(x)} \leq L \norm{ x-x^* }_2 $. 23 | \item $ f(x) - f^* \leq \frac{L}{2} \norm{ x - x^* }^2 $ by the quadratic upper bound at $ x^* $, where $ \nabla f(x^* )=0$. 24 | \item $ \norm{ \nabla f(x)}^2 \leq 2L (f(x) - f^* ) $. 25 | \end{enumerate} 26 | \begin{remark} 27 | By these bounds, bounding $ \norm{ x-x^* } $ is the nicest if possible but usually out of reach. The next nicest to bound is $ f(x) - f^* $. "$ x$ is an $ \epsilon$-sub-optimal point" means $ f(x) - f^* \leq \epsilon$. 28 | \end{remark} 29 | 30 | If $ f$ is $ \mu$-strongly convex, then 31 | \begin{enumerate}[label=(\arabic*)] 32 | \item $ \norm{ x-x^* }^2 \leq \frac{2}{\mu} (f(x) - f^* ) $. 33 | \item $ \norm{ x-x^* } \leq \frac{1}{\mu} \norm{ \nabla f(x)} $. 34 | \item Polyak-Lojasiewicz (PL): $ f(x) - f^* \leq \frac{1}{2\mu} \norm{ \nabla f(x)}^2 $.
35 | \end{enumerate} 36 | 37 | Recall from last time, we derive 38 | \[ 39 | f(x_{k+1}) \leq f(x_k) - \frac{1}{2L} \norm{ \nabla f(x_k)}^2 40 | .\] 41 | So if we add $ \mu$-strongly convex to the assumption of gradient descent analysis, then 42 | \begin{align*} 43 | f(x_{k+1}) - f(x_k) &\leq - \frac{1}{2L} \norm{ \nabla f(x_k)}^2 \leq -\frac{\mu}{L} (f(x_k) -f^*) \text{ by PL} \\ 44 | f(x_{k+1}) &\leq f(x_k) - \frac{\mu}{L} (f(x_k) - f^* )\\ 45 | f(x_{k+1}) - f^* &\leq \left( 1 - \frac{\mu}{L} \right) (f(x_k) - f^* ) 46 | \end{align*} 47 | Since $ \mu I \preceq \nabla^2 f \preceq LI$, we have $ 0 < \frac{\mu}{L} \leq 1$, so $ 0 \leq \rho:= 1 - \frac{\mu}{L} < 1$. Applying the last inequality recursively gives $ f(x_k) - f^* \leq \rho^{k} (f(x_0) - f^* )$, so by the contraction mapping theorem this converges (linearly). Combining with the strong convexity bound $ \norm{ x-x^* }^2 \leq \frac{2}{\mu} (f(x) - f^* )$, 48 | \begin{align*} 49 | \norm{ x_k-x^* }^2 \leq \frac{2}{\mu} \rho^{k} (f(x_0) - f^* ) 50 | \end{align*} 51 | \begin{remark} 52 | $ \kappa = \frac{L}{\mu}$ is the condition number of the Hessian, \emph{i.e.} the largest singular value over the smallest. 53 | \end{remark} 54 | \subsubsection{Convergence rate} 55 | \begin{table}[H] 56 | \centering 57 | \begin{tabular}{c|c|c} 58 | rate& iteration number &example\\ 59 | \hline 60 | \hline 61 | $ \mathcal{ O}\left( \frac{1}{k^{1 /4}} \right) $ & $ \mathcal{ O}\left( \frac{1}{ \epsilon^{4}} \right) $ & non-convex subgradient method\\ 62 | \hline 63 | $ \mathcal{ O}\left( \frac{1}{\sqrt{k} } \right) $ & $ \mathcal{ O}\left( \frac{1}{ \epsilon^2} \right) $ & subgradient descent or SGD\\ 64 | \hline 65 | $ \mathcal{ O}\left( \frac{1}{k} \right) $& $ \mathcal{ O}\left( \frac{1}{ \epsilon} \right) $ & gradient-descent with Lipschitz\\ 66 | \hline 67 | $ \mathcal{ O} \left( \frac{1}{k^2} \right) $ & $ \mathcal{ O}\left( \frac{1}{ \sqrt{ \epsilon} } \right) $ & Nesterov acceleration\\ 68 | \hline 69 | $ \mathcal{ O}(\rho^{k})$ & $ \mathcal{ O}\left( \log\left( \frac{1}{ \epsilon} \right) \right) $& gradient descent with Lipschitz and strong convexity\\ 70 | \hline 71 | $ \mathcal{ O}\left( \rho^{2^{k}} \right) $ & $ \log_2 \left(\mathcal{ O}\left( \log \left(
\frac{1}{ \epsilon} \right) \right) \right) $ & Newton's method locally\\ 72 | \end{tabular} 73 | \end{table} 74 | 75 | \end{document} 76 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_27.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \newpage 6 | \subsection{Linear conjugate gradient method} 7 | CG solves (usually approximately) $ Ax =b$ if $ A \succ 0$. More details and intuition can be found at \url{cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf}. 8 | 9 | \begin{eg} 10 | Consider the least squares problem. Let $ \phi(x) = \frac{1}{2} x^{T} \widetilde{ A}^{T} \widetilde{ A}x - \widetilde{ b}^{T} \widetilde{ A}x + \frac{1}{2} \widetilde{ b}^{T}\widetilde{ b} =: \frac{1}{2} x^{T} A x - b^{T} x + \text{ const.} $. Here $ A \succeq 0$ always and $ A \succ 0$ if $ m>n$ and $ \widetilde{ A}$ has full column rank. Then minimizing $ \phi$ amounts to solving $ \nabla \phi(x) = Ax-b = 0$. 11 | \begin{note} 12 | Don't form the Gram matrix. Instead use LSQR method. 13 | \end{note} 14 | 15 | One idea to solve $ \min \phi(x)$ is coordinate descent/alternating minimization. This slowly converges to the solution via a zigzag path. If we change to the eigenvector basis, it is guaranteed to converge in $ n$ steps. However, finding the eigenvector basis is just as expensive as solving the normal equation directly at $ \mathcal{ O}(n^3)$. 16 | \begin{defn}[conjugate directions] 17 | $ \{p_i\} $ are conjugate directions if they are $ A$-orthogonal. That is, 18 | \[ 19 | \langle p_i |A|p_j \rangle := \langle p_i, A p_j \rangle = 0 \text{ if } i\neq j 20 | .\] 21 | \end{defn} 22 | 23 | \begin{note} 24 | If we have $ \{p_i\}_{i=0}^{n-1} $, it's a basis. If $ p_i$ are eigenvectors of a symmetric matrix $ A$, then they are $ A$-orthogonal.
25 | \end{note} 26 | 27 | Our goal is to find $ \{p_i\} $ more cheaply than eigenvectors. 28 | \end{eg} 29 | 30 | \begin{thm}[conjugate direction method (abstract)] 31 | Assume $ \{p_i\}_{i=0}^{n-1} $ are conjugate directions. Then 32 | \begin{align*} 33 | x_{k+1} = x_k + \alpha_k p_k 34 | \end{align*} 35 | where $ a_k$ solves $ \min_{\alpha} \phi(x_k + \alpha p_k)$ which is exact line search. The solution to this 1D problem has a closed form: 36 | \begin{align*} 37 | a_k = - \frac{\langle r_k | p_k \rangle}{ \langle p_k|A|p_k \rangle} 38 | \end{align*} 39 | where $ r_k = A x_k-b$. Then $ x_n = x^* $. 40 | \end{thm} 41 | 42 | \begin{proof} 43 | Since $ \{ p_i\} $ is a basis, we can write 44 | \begin{align*} 45 | x^* -x_0 &= \sum_{ i= 0}^{ n-1} \sigma_i p_i \\ 46 | p_k^{T} A(x^* -x_0)&= \sum_{ i= 0}^{ n-1} \sigma_i \langle p_k|A|p_i \rangle \\ 47 | &= \sigma_k \langle p_k|A|p_k \rangle \\ 48 | \sigma_k &= \frac{\langle p_k|A|x^* -x_0 \rangle}{ \langle p_k|A|p_k \rangle} \ \forall \ k 49 | \end{align*} 50 | Moreover, 51 | \begin{align*} 52 | x_k - x_0 &= \sum_{ i= 0}^{ k-1} \alpha_i p_i \\ 53 | p_k^{T} A(x_k - x_0) &= \sum_{ i= 0}^{ k-1} \alpha_i \langle p_k|A|p_i \rangle \\ 54 | \langle p_k|A|x_k-x_0 \rangle &= 0 55 | \end{align*} 56 | Substituting $ x_k $ as $ x_0$, 57 | \begin{align*} 58 | \sigma_k = \frac{\langle p_k|A|x^* -x_k \rangle}{ \langle p_k|A|p_k \rangle} = \alpha_k 59 | \end{align*} 60 | Therefore, $ x_n = x^* $ since they have the same expression in the basis. 61 | \end{proof} 62 | \begin{remark} 63 | We can think of this process as either building up $ x^* $ component-by-component or cutting the error $ x^* -x_k$ component-by-component. 64 | \end{remark} 65 | Facts: 66 | \begin{itemize} 67 | \item $ r_{k+1} = r_k + \alpha_k A p_k$ 68 | \item $ \langle r_k, p_i \rangle=0, i0$. This is a necessary condition called \allbold{curvature condition}. 
That is, 72 | \[ 73 | \langle x_{k+1} - x_k, \nabla f_{k+1} - \nabla f_k \rangle > 0 74 | .\] 75 | This is strictly monotone, which is satisfied when $ f$ is strictly convex. If $ f$ isn't strictly convex, it would complicate quasi-Newton method ( \emph{e.g.} might need to add a line search). 76 | 77 | $ x \in \rr^{n}, B \in \rr^{n \times n}$, so $ B$ has $ n(n+1) /2$ degrees of freedom, and $ n$ constraints from the secant equation. 78 | 79 | \begin{eg} 80 | When $ n=1$, degree of freedom and constraint are both 1, $ B_{k+1}$ is completely determined. This is called the \emph{secant method}. 81 | \end{eg} 82 | When $ n>1$, $ B$ is underdetermined. The standard ways to choose $ B$ is 83 | \begin{align*} 84 | B_{k+1} &= \argmin_{B \succ 0, B s_k = y_k} \norm{ B-B_k}_w^2 \\ 85 | \text{ or } B_{k+1}^{-1} &= \argmin_{B^{-1} \succ 0, B^{-1}y_k=s_k} \norm{ B^{-1}-B_k^{-1}}_w^2 86 | \end{align*} 87 | where $ \norm{ \cdot }_w $ is some norm chosen so the problem has a closed-form solution. The class of methods on choosing $ B$ is called the \allbold{Broyden class}. 88 | 89 | \begin{notation} 90 | $ B$ approximates $ \nabla ^2 f$, and $ H$ approximates $ (\nabla ^2 f)^{-1}$. That is, $ B_k ^{-1} = H_k$. 91 | \end{notation} 92 | 93 | Observe that it's cheaper to just approximate the inverse Hessian, although it is actually not an issue because $ B_{k+1}$ is a low-rank update of $ B_k$, so we can use Sherman-Morrison-Woodbury formula to obtain the inverse very cheaply. 
94 | \end{document} 95 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_29.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{BFGS} 6 | Using a specific weighted norm that satisfies the secant equation: 7 | \begin{thm}[BFGS] 8 | \begin{align*} 9 | H_{k+1} = \left( I- \rho_k s_k y_k^{T} \right) H_k(I- \rho_k y_k s_k^{T}) + \rho_k s_k s_k^{T}, \quad \rho_k = \frac{1}{\langle y_k,s_k \rangle} 10 | \end{align*} 11 | \end{thm} 12 | 13 | \begin{remark} 14 | ~\begin{itemize} 15 | \item Iteration count: win (compared to gradient descent) 16 | \item Flops: $ \mathcal{ O}(n^2) < \mathcal{ O} (n^3)$ : win (compared to Newton) 17 | \item Memory: $ \mathcal{ O}(n^2)$ : loss (same as Newton) 18 | \end{itemize} 19 | \end{remark} 20 | 21 | \begin{thm}[L-BFGS] 22 | \begin{align*} 23 | H_{k+1} &= V_k^{T} H_k V_k + \rho_k s_k s_k^{T}\\ 24 | H_{k+1} (w) &= V_k^{T} H_{k} (V_k w)+ \rho_k s_k s_k^{T} w 25 | \end{align*} 26 | where $ z :=V_k w$ uses $ y_k, s_k$ and $ s_k^{T} w$ uses $ y_k, s_k$. Both are cheap. Then $ H_k (z)$ depends on $ y_{k-1}, s_{k-1}, H_{k-1}$. We can do this recursively down to the first. Instead we stop at $ (k-m)$th term. That is, 27 | \begin{align*} 28 | H_{k-m} = \frac{\langle y_{k-m},s_{k-m} \rangle}{ \norm{ y_{k-m}}^2 } \cdot I 29 | \end{align*} 30 | We can start with gradient descent to initialize. That is, $ H_0 = \frac{1}{L} \cdot I$. 31 | \end{thm} 32 | Then the storage becomes $ 2(m+1) n $. Commonly choose $ m \in \{3,20\} $. 33 | 34 | \begin{remark} 35 | Usually $ B_k \not\to \nabla ^2f(x^* )$. 36 | \end{remark} 37 | 38 | \begin{thm}[convergence] 39 | If $ 0< \mu I \leq \nabla ^2 f(x) \leq L \cdot I$, then BFGS converges and usually superlinearly. 
40 | \end{thm} 41 | 42 | Open question: if $ f$ is non-convex, does BFGS converge to a stationary point? 43 | 44 | \begin{remark} 45 | If $ m = 0$ "memoryless" BFGs plus exact linesearch yields nonlinear CG. 46 | \end{remark} 47 | 48 | \begin{remark} 49 | What if we have constraints? Recall that for gradient descent we can do proximal/projected gradient descent. That is, 50 | \[ 51 | x_{k+1}= \Proj(x_k - t \cdot \nabla f(x_k)) 52 | .\] 53 | Can we do the same thing for any quasi-Newton method? 54 | \[ 55 | x_{k+1} = \Proj_{B_k} (x_k - B_k^{-1} \nabla f_k) 56 | .\] 57 | This is usually not feasible since the scaled projection is hard to compute. 58 | \end{remark} 59 | \newpage 60 | \subsection{Newton's methods} 61 | Let $ \Delta x = \nabla ^2 f(x_k) ^{-1} \nabla f(x_k)$. 62 | \begin{enumerate}[label=(\arabic*)] 63 | \item computational: "inexact-Newton", "matrix-free", "truncated-Newton", or "Newton-CG" mean approximate $ \Delta x$. That is, we wish to solve 64 | \begin{align*} 65 | \nabla ^2 f(x_k) \cdot \Delta x = \nabla f(x_k) 66 | \end{align*} 67 | We can solve this with linear CG with only a few steps (adaptive). We can use preconditioners such as incomplete Cholesky or BFGS. 68 | 69 | Often Hessian is structured and we can exploit that in computing the Hessian-gradient product. 70 | \item convergence: 71 | 72 | In practice, we use a linesearch or even better a trust-region to "globalize". We wish to avoid bad saddle points. 73 | 74 | For trust-region, we minimize a quadratic model. 
75 | \end{enumerate} 76 | \end{document} 77 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_30.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{Trust-region for nonconvex} 6 | \begin{align*} 7 | x_{k+1} &= x_k + p_k \\ 8 | &= x_k + \argmin_{p} \langle \nabla f_k, p \rangle + \frac{1}{2} \langle p|B_k|p \rangle 9 | \end{align*} 10 | If $ B_k \succ 0$, then this is a convex quadratic, so setting the gradient to zero is a sufficient condition for optimality. Then the Newton step is the minimizer $ p_k = -B_k^{-1} \nabla f_k$. If not, the Newton step isn't the minimizer. Now we have to use trust-region. 11 | \begin{align*} 12 | x_{k+1} &= x_k + \argmin_{p} \langle \nabla f_k, p \rangle + \frac{1}{2} \langle p|B_k|p \rangle\\ 13 | s.t. \quad & \norm{ p} \leq \Delta \iff \frac{1}{2}\norm{ p}^2 \leq \frac{1}{2} \Delta^2 14 | \end{align*} 15 | \begin{remark} 16 | If $ B_k$ is indefinite, we see that if we pretend the gradient term isn't there, then the quadratic form is minimized by the leftmost eigenvector (associated with the negative-most eigenvalue) of $ B_k$, scaled to the trust region radius. See N+W for tricks to solve. 17 | \end{remark} 18 | The KKT conditions are necessary. 19 | \begin{align*} 20 | \mathscr{L}(p, \lambda) = \langle \nabla f_k,p \rangle + \frac{1}{2} \langle p|B_k|p \rangle + \frac{1}{2} \lambda (\norm{ p}^2 - \Delta^2 ) 21 | \end{align*} 22 | Stationarity: 23 | \begin{align*} 24 | \nabla f_k + B_k p + \lambda p &= 0 \\ 25 | p &= -\underbrace{ (B_k + \lambda I)^{-1} }_{ \text{regularity} } \nabla f_k 26 | \end{align*} 27 | \begin{remark} 28 | Typical: trust-region methods can sometimes guarantee a local minimizer even if the problem isn't convex. 
29 | \end{remark} 30 | \begin{remark} 31 | Alternatively, we can use 32 | \begin{itemize} 33 | \item cubic regularization 34 | \item perturbed gradient descent 35 | \end{itemize} 36 | \end{remark} 37 | \newpage 38 | \subsection{Nonlinear least squares} 39 | The objective is 40 | \begin{align*} 41 | f(x) &= \frac{1}{2} \sum_{ j= 1}^{ m} r_j^2 (x), \quad r_j: \rr^{n} \to \rr \\ 42 | &:= \frac{1}{2} \norm{ R(x)} ^2, \quad R: \rr^{n} \to \rr^{m} 43 | \end{align*} 44 | This is perhaps the most common problem class in engineering and the sciences. We use squares here not only because it's easier to differentiate but also because if our data have Gaussian noise, then this becomes maximum likelihood estimation. 45 | 46 | Let the Jacobian of $ R$ be $ J(x)$. 47 | \begin{align*} 48 | J(x)_{i,j} &= \frac{\partial r_i}{\partial x_j} \\ 49 | J(x) &= \begin{pmatrix} \nabla r_1(x)^{T}\\ \vdots\\ \nabla r_m(x)^{T} \end{pmatrix} \\ 50 | \nabla f(x) &= \nabla \left( \frac{1}{2} \sum_{ j= 1}^{ m} r_j^2(x) \right) \\ 51 | &= \frac{1}{2} \sum_{ j= 1}^{ m} \nabla (r_j^2(x)) && \text{ linearity} \\ 52 | &= \frac{1}{2} \sum_{ j= 1}^{ m} 2 r_j(x) \nabla r_j(x) \\ 53 | &= J(x)^{T} R(x) \\ 54 | \nabla ^2f(x) &= J(x)^{T}J(x) + \sum_{ j= 1}^{ m} r_j(x) \cdot \nabla ^2 r_j(x) 55 | \end{align*} 56 | In the linear least squares case $ r_j(x) = a_j^{T} x -b_j$, we see that $ \nabla ^2f(x) = A^{T}A$ just as we expect. 57 | 58 | \subsubsection{Gauss-Newton method} 59 | \begin{align*} 60 | x_{k+1} = x_k - B_k^{-1} \nabla f_k 61 | \end{align*} 62 | where $ B_k = J_k^{T} J_k$. So we ignore the sum term to approximate the Hessian. This is worse than Newton but better than gradient descent's constant $ L \cdot I$ approximation, and we get the approximation "for free" as we need to compute $ J(x)$ for the gradient anyway, although inverting it can get expensive. 
63 | 64 | Another derivation: 65 | \begin{align*} 66 | R(x) &\approx R(x_k) + J(x_k) (x-x_k) && \text{ 1st order Taylor}\\ 67 | f(x) &= \frac{1}{2} \norm{ R(x)} ^2 \\ 68 | &\approx \frac{1}{2} \norm{ R_k + J_k(x-x_k)}^2 && \text{ linear ls model!} \\ 69 | x_{k+1}&= (J_k^{T} J_k)^{-1} J_k^{T} (J_k x_k - R_k) &&\text{ normal eq} \\ 70 | &= x_k - (J_k^{T} J_k)^{-1} J_k^{T} R_k\\ 71 | &= x_k- (J_k^{T} J_k)^{-1} \nabla f_k \\ 72 | \end{align*} 73 | \subsubsection{Levenberg-Marquardt} 74 | 75 | This is Gauss-Newton with a trust-region. 76 | 77 | Softwares: 78 | 79 | \begin{itemize} 80 | \item Matlab: lsqnonlin 81 | \item python: scipy.optimize.least\_squares, lmfit (modeling) 82 | \end{itemize} 83 | \end{document} 84 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_31.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \section{Methods for constrained problems} 6 | \subsubsection{Special nice constraints } 7 | \emph{e.g.} $ x\geq 0, \ell_i \leq x_i \leq u_i, \norm{ x-x_0}\leq 3 $ 8 | \begin{enumerate}[label=(\arabic*)] 9 | \item projected gradient descent (with Nesterov acceleration). 10 | \item active-set methods ( \emph{e.g.} L-BFGS-B). 11 | \end{enumerate} 12 | \begin{eg} 13 | $ x \in \rr^{2}, 0\leq x_i \leq 1$. Suppose $ x_1^{k} = 1, x_2^{k} = .3 $. Then for $ k+1$, let $ x_1^{k+1} = x_1^{k}$ unchanged, and ignore constraint for $ x_2$ and use $ L-BFGS$ and check it satisfies. 14 | \end{eg} 15 | 16 | \subsubsection{not-so-nice constraints} 17 | \emph{e.g.} $ g(x) \leq 0$ where $ g$ is non-linear. 
18 | \begin{enumerate}[label=(\arabic*)] 19 | \item penalty methods: both convex and non-convex 20 | \item augmented Lagrangian: both 21 | \item sequential quadratic programming (SQP): both 22 | \item ADMM (alternating direction method of multipliers, also applied to non-convex) and DR (Douglas-Rachford): convex 23 | \item Primal Dual methods: mostly convex 24 | \item Interior-point methods (IPM): convex. IPOPT: non-convex. 25 | \end{enumerate} 26 | 27 | \subsection{penalty methods} 28 | 29 | \begin{align*} 30 | \min\quad &f_0(x) \\ 31 | \text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m \\ 32 | &h_i(x) = 0 , i = 1,\ldots,p 33 | \end{align*} 34 | \begin{eg} 35 | \begin{align*} 36 | \min\quad &\norm{ x}_1 \\ 37 | \text{subject to } \quad & \norm{ Ax-b}_2 ^2 \leq \epsilon^2 38 | \end{align*} 39 | Then 40 | \begin{align*} 41 | \mathscr{L}(x,\lambda) = \norm{ x}_1 + \lambda ( \norm{ Ax-b}^2 - \epsilon^2 ) 42 | \end{align*} 43 | If strong duality holds and a saddle point exists, 44 | \begin{align*} 45 | x^* \in \argmin_{x} \mathscr{L}(x, \lambda^* ) 46 | \end{align*} 47 | \end{eg} 48 | We typically use the \allbold{quadratic penalty}: 49 | 50 | For equality constraints, define 51 | \begin{align*} 52 | Q_{\mu} (x) = f_0(x) + \frac{\mu}{2} \sum_{ i= 1}^{ p} h_i(x)^2 53 | \end{align*} 54 | Solve 55 | \begin{align*} 56 | x^{(k)} &= \argmin_{x} Q_{\mu_k}(x)\\ 57 | \text{ update } &\mu_{k+1} \text{ increasing}\\ 58 | x^{(k+1)} &= \argmin_{x} Q_{\mu_{k+1}} (x) 59 | \end{align*} 60 | The solve for $ x^{(k+1)}$ is warm-started at $ x^{(k)}$. 61 | \begin{thm} 62 | Suppose $ \mu_k \to \infty$. If $ (x^{(k)})$ has a limit point $ x^* $, then $ x^* $ is optimal. 63 | \end{thm} 64 | \begin{note} 65 | No convexity is needed for this result, but we usually need convexity to compute the update $ x^{(k+1)}$. 
66 | \end{note} 67 | 68 | For inequality constraints, define 69 | \begin{align*} 70 | Q_{\mu}(x) &= f_0(x) + \frac{\mu}{2} \left( \sum_{ i= 1}^{ m} h_i^2(x) + \sum_{ i= 1}^{ m} \lfloor f_i(x) \rfloor_{+}^2 \right) 71 | \end{align*} 72 | \begin{note} 73 | The floor function makes it usually non-smooth. 74 | \end{note} 75 | \begin{remark} 76 | The general idea is to put constraints into the objective: 77 | \begin{align*} 78 | \min f_0(x), \quad x \in C \implies \min f_0(x) + g(x) 79 | \end{align*} 80 | \end{remark} 81 | Methods 82 | \begin{enumerate}[label=(\arabic*)] 83 | \item $ g(x) = I_{C}(x)$ : this is mathematically equivalent but no computational benefit 84 | \item penalty: 85 | \begin{align*} 86 | g_{\mu}(x) = 87 | \begin{cases} 88 | \mu \cdot x^2 & x<0\\ 89 | 0 & x\geq 0 90 | \end{cases} 91 | \end{align*} 92 | When $ \mu \to \infty$, the smooth quadratic barrier converges the non-smooth infinite barrier. 93 | \item barrier: 94 | \begin{align*} 95 | g_{\mu}(x) = -\frac{1}{\mu} \cdot \log x 96 | \end{align*} 97 | The barrier is not define for $ x\leq 0$, so it forces the solution to stay strictly feasible. 98 | \end{enumerate} 99 | 100 | \begin{remark} 101 | Drawback: QP is often ill-conditioned as $ \mu \to \infty$. 102 | \end{remark} 103 | \begin{eg} 104 | \begin{align*} 105 | \min\quad &\frac{1}{2} x^{T}Px \\ 106 | &Ax=b, A \in \rr^{m\times n}, mi} g_j x_j^{(k)} \right) \\ 27 | x_i^{(k+1)} &= \alpha \\ 28 | \end{align*} 29 | \end{eg} 30 | \begin{remark} 31 | Jacobi only uses $ x^{(k)}$ for each $ k$, allowing parallelization and randomized order. 32 | \end{remark} 33 | 34 | If we do this row-wise, it's called ART (algebraic reconstruction technique) or Kaczmarz algorithm, or POCS (projection onto convex sets). 
35 | 36 | Consider 37 | \begin{align*} 38 | \min \quad & f(x), x = \begin{pmatrix} x_1 \\ \vdots\\x_n \end{pmatrix}, x_i \in C_i \text{ can be blocks} \\ 39 | x_i^{(k+1)} &\in \argmin_{\alpha \in C_i} f\left( x_1^{(k+1)}, \ldots, x_{i-1}^{(k+1)}, \alpha, x_{i+1}^{(k)},\ldots, x_n^{(k)} \right) \text{ or } \\ 40 | x_i^{(k+1)} &= x_i^{(k)} -\eta \frac{\partial f}{\partial x_i} \left( x_1^{(k+1)},\ldots,x_{i-1}^{(k+1)}, x_i^{(k)},\ldots, x_n^{(k)} \right) 41 | \end{align*} 42 | The last line says that if it's too hard to find the $ \argmin$, we instead just take a gradient step in that coordinate. 43 | 44 | If we have two variables $ \min f(x,y)$, then 45 | \begin{align*} 46 | x^{(k+1)} &\in \argmin_{x} f(x,y^{(k)})\\ 47 | y^{(k+1)} & \in \argmin_{y} f(x^{(k+1)},y) 48 | \end{align*} 49 | We can modify it to PALM (proximal alternating linearized minimization) for non-convex problems: 50 | \begin{align*} 51 | x^{(k+1)} &\in \argmin_{x} f(x,y^{(k)}) + \frac{\mu}{2} \norm{ x - x^{(k)}}^2 \\ 52 | y^{(k+1)} & \in \argmin_{y} f(x^{(k+1)},y) + \frac{\mu}{2} \norm{ y-y^{(k)}}^2 53 | \end{align*} 54 | 55 | \subsubsection{ADMM (Alternating Direction Method of Multipliers)} 56 | See 2011 Boyd et al monograph. 57 | 58 | \begin{align*} 59 | \min\quad &f(x) \\ 60 | \text{subject to } \quad & Ax=b 61 | \end{align*} 62 | Attempt 1: Let's try with dual ascent, using $ y$ as the dual variable: 63 | \begin{align*} 64 | \mathscr{L}(x,y) &= f(x) + \langle y,Ax-b \rangle\\ 65 | g(y) &= \inf_{x} \mathscr{L}(x,y) \\ 66 | x_{k+1} &\in \argmin_{x} \mathscr{L}(x,y_k)\\ 67 | y_{k+1} &= y_k + t (\underbrace{ Ax_{k+1}-b}_{\nabla g(y_k) } ) 68 | \end{align*} 69 | This allows us to exploit the separable structure of the original problem if available \emph{e.g.} $ f(x) = \sum f_i(x_i)$, since we need to relax the linear constraint in the original problem, and the dual allows us to make the Lagrangian separable \emph{i.e.} $ \langle y,Ax-b \rangle \implies \langle A^* y,x \rangle - \langle y,b \rangle$. 
However, the downside is that it may not converge. 70 | 71 | Attempt 2: Let's try the augmented Lagrangian which is equivalent to the original problem: 72 | \begin{align*} 73 | \min \quad &f(x) + \frac{\rho}{2} \norm{ Ax-b}^2 \\ 74 | &Ax=b 75 | \end{align*} 76 | Unfortunately the Lagrangian is no longer separable due to the quadratic term: 77 | \begin{align*} 78 | \mathscr{L}(x,y) = f(x) + \langle y,Ax-b \rangle + \frac{\rho}{2} \norm{ Ax-b}^2 79 | \end{align*} 80 | Would it be possible to combine the two methods? 81 | 82 | Attempt 3 (ADMM): let $ F(x) = \sum_{ i= 1}^{ n} f_i(x_i)$ or $ F(v) = f(x) + g(z)$ if $ n=2$. 83 | \begin{align*} 84 | \min \quad & f(x) + g(z)\\ 85 | & Ax+ Bz = c 86 | \end{align*} 87 | The algorithm is: 88 | \begin{align*} 89 | x^{(k+1)} &\in \argmin_{x} \mathscr{L}_{\rho} \left( \begin{pmatrix} x\\z^{(k)} \end{pmatrix}, y^{(k)} \right) \\ 90 | z^{(k+1)} &\in \argmin_{z} \mathscr{L}_{\rho} \left( \begin{pmatrix} x^{(k+1)}\\z \end{pmatrix}, y^{(k)} \right) \\ 91 | \text{ update } y^{(k+1)} &= y^{(k)} + \rho (A x^{(k+1)}+ B z^{(k+1)}-c) 92 | \end{align*} 93 | \begin{note} 94 | If we jointly minimize the first two lines, it becomes the augmented Lagrangian method. 95 | \end{note} 96 | 97 | What if $ n>2$, \emph{i.e.} $ \min_x \sum_{ i= 1}^{ n} f_i(x)$, where $ x$ is a block vector of $ x_i$? 98 | 99 | One idea is $ \min_{x_i} \sum f_i(x_i)$ s.t. linear constraints that enforce $ x_i = x_j$. 100 | Naive generalization from $ n=2$ doesn't converge very well. Instead we use a consensus trick: 101 | \begin{align*} 102 | F(v) &= G(x) + H(z) 103 | \end{align*} 104 | where $ x= \begin{pmatrix} x_1\\ \vdots \\ x_n \end{pmatrix} $, $ z$ has the same size as $ x_i$, and $ v = \begin{pmatrix} x\\z \end{pmatrix} $. 105 | \begin{align*} 106 | \min_{x,z} \quad & G(x) + H(z)\\ 107 | &\begin{pmatrix} I&&&&-I\\&I&&&-I\\&& \ddots && \vdots \\&&&I&-I \end{pmatrix} \begin{pmatrix} x\\z \end{pmatrix} =0 108 | \end{align*} 109 | This enforces $ x_i = z \implies x_i = x_j$. 
We see that $ A = I$ and $ B = \begin{pmatrix} -I\\ \vdots \\ -I \end{pmatrix} $ from the $ n=2$ linear constraint. Now we see $ x_i$ is decoupled at each update step: 110 | \begin{align*} 111 | x_{k+1} &\in \argmin_{x} \mathscr{L}_{\rho}(x,z,y_k) && \text{ decoupled}\\ 112 | z_{k+1} &\in \argmin_z \mathscr{L}_{\rho} (x_{k+1},z,y_k) = \frac{1}{n} \sum_{ i= 1}^{ n} x_i && \text{ consensus}\\ 113 | \text{ update } &y_{k+1} \text{ as usual} 114 | \end{align*} 115 | \begin{remark} 116 | This is a common trick in optimization. In a coupled system, we relax it to be decoupled first and let them recouple later. 117 | \end{remark} 118 | \end{document} 119 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_38.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \subsubsection{Douglas-Rachford} 6 | It is equivalent to ADMM in certain senses. See [BC17 28.3]. 7 | 8 | Algorithm: $ 0< \lambda<2, \rho>0, y_0$ 9 | \begin{align*} 10 | x_k&= \prox_{\rho g}(y_k) \\ 11 | z_k&= \prox_{\rho f}(2x_k -y_k) \\ 12 | y_{k+1} &= y_k + \lambda(z_k -y_k) 13 | \end{align*} 14 | The proximity operator is equivalent to the Lagrangian in ADMM. 15 | 16 | Notice 17 | \begin{align*} 18 | \min \sum_{ i= 1}^{ n} f_i(x) \iff \min \sum_{ i= 1}^{ n} f_i(x_i)\ s.t.\ x_i=x_j \ \forall \ i,j 19 | \end{align*} 20 | \begin{remark} 21 | In signal processing, we can parallelize this algorithm and only require communications among all workers in the consensus step. We can also enforce consensus in different ways and achieve consensus faster in a graph. 22 | \end{remark} 23 | \subsection{Primal Dual Methods} 24 | ADMM has some issues: if we want to $ \min_{x}\ g(x)+ \widetilde{ h}(Ax)$, $ h(x) = \widetilde{ h}(Ax)$ (DR form). Rewrite as $ \min_{x,z}\ g(x) + \widetilde{ h}(z)$, $ Ax-z=0$ (ADMM form). 
Finding $ \prox_{h}$ is often hard due to $ A$. $ \prox_{\widetilde{ h}}$ easy doesn't mean $ \prox_{h}$ is easy due to $ A$. 25 | 26 | The trick is to use ADMM with a scaled norm in the quadratic term in augmented Lagrangian. A clever choice is 27 | \begin{align*} 28 | \norm{ z} _{M}^2 = \langle z|M|z \rangle,\ M = \frac{1}{\sigma} I - A^{T}A,\ \sigma< \frac{1}{\norm{ A}^2 }\implies M \succ 0 29 | \end{align*} 30 | 31 | Chambolle and Pock, primal-dual hybrid gradient, preconditioned ADMM 32 | 33 | \subsubsection{general primal-dual method (Condat)} 34 | Suppose $ f,g,h$ convex, proper, lsc, 35 | \begin{align*} 36 | \min_x \ \underbrace{ f(x)}_{ \text{ smooth}, \nabla f } + \underbrace{ g(x)}_{ \text{ easy } \prox_g} + \underbrace{ h(Ax)}_{ \text{ easy } \prox_{h} } 37 | \end{align*} 38 | \begin{lem} 39 | $ x = \prox_h(x) + \prox_{h^* }(x)$. 40 | \end{lem} 41 | Since $ h$ is convex, 42 | \begin{align*} 43 | h(w) =h^{* *} (w) = \sup_y \langle w,y \rangle - h^* (y) 44 | \end{align*} 45 | We solve 46 | \begin{align*} 47 | \min_x \max_y f(x) + g(x) + \underbrace{ \langle Ax,y \rangle }_{ \text{ links primal-dual} } -h^* (y) && \text{ saddle pt problem} 48 | \end{align*} 49 | Optimality: use Fenchel-Rockafellar. 
50 | 51 | Assume the constraint qualifications (CQ) hold, 52 | \begin{align*} 53 | 0 &\in \partial (f+g+h \circ A)(x)\\ 54 | 0 &\in \nabla f(x)+ \partial g(x) + A^{T} \underbrace{ \partial h(Ax) }_{ y} && \text{ CQ} \\ 55 | &\begin{cases} 56 | 0 \in \nabla f(x) + \partial g(x) + A^{T}y\\ 57 | Ax \in \partial h^* (y)\qquad \text{ since } y \in \partial h(Ax) \\ 58 | \end{cases} 59 | \end{align*} 60 | Rewrite the two equations in matrix form (although they are operators) 61 | \begin{align*} 62 | - \underbrace{ \begin{pmatrix} \nabla f &0\\0&0 \end{pmatrix}}_{T_2} \begin{pmatrix} x\\y \end{pmatrix} \in \underbrace{ \begin{pmatrix} \partial g& A^{T}\\-A& \partial h^* \end{pmatrix}}_{T_1} \begin{pmatrix} x\\y \end{pmatrix} 63 | \end{align*} 64 | This yields 65 | \begin{align*} 66 | - T_2 z &\in T_1 z,\qquad z = \begin{pmatrix} x\\y \end{pmatrix} \\ 67 | z-T_2 z & \in z+T_1 z &&\text{ add }z \text{ on both sides} \\ 68 | (I-T_2)z & \in (I+T_1)z 69 | \end{align*} 70 | We will solve via forward-backward (proximal descent). WLOG assume $ T_2$ is $ 1$-Lipschitz, $ (I + \partial g)^{-1}=\prox_g$ easy and $ (I + \partial h^* )^{-1}$ easy, then 71 | \begin{align*} 72 | z_{k+1} = \underbrace{ (I + T_1)^{-1} }_{ \text{ backward/implicit} } \underbrace{ (I-T_2) }_{ \text{ forward/explicit} } z_k 73 | \end{align*} 74 | Instead of adding $ z$, we do a trick and add $ Vz$: 75 | \begin{align*} 76 | Vz-T_2 z &\in Vz+ T_1 z 77 | \end{align*} 78 | where we choose $ V$ to be 79 | \begin{align*} 80 | V = \begin{pmatrix} \tau^{-1}I& -A^{T}\\-A& \sigma^{-1}I \end{pmatrix} 81 | \end{align*} 82 | This guarantees that $ V \succ 0$ if $ \sigma \tau < \norm{ A}^{-2} $. 
83 | \begin{align*} 84 | z_{k+1} &= (V+T_1)^{-1}(V-T_2) z_k\\ 85 | V+T_1 &= \begin{pmatrix} \tau^{-1}I + \partial g&0\\-2A& \sigma^{-1}I + \partial h^* \end{pmatrix} && \text{ lower triangular} 86 | \end{align*} 87 | We can invert this using forward substitution: 88 | \begin{align*} 89 | \begin{pmatrix} \tau^{-1}I + \partial g&0\\-2A & \sigma^{-1}I + \partial h^* \end{pmatrix} \begin{pmatrix} x_{k+1}\\y_{k+1} \end{pmatrix} &= \begin{pmatrix} v\\w \end{pmatrix}\\ 90 | x_{k+1} &= (\tau^{-1} I + \partial g)^{-1}v && \text{ via } \prox_g \\ 91 | \text{ then solve for } &y_{k+1} 92 | \end{align*} 93 | \end{document} 94 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/lec_39.tex: -------------------------------------------------------------------------------- 1 | \documentclass[class=article,crop=false]{standalone} 2 | \input{../preamble.tex} 3 | 4 | \begin{document} 5 | \section{Linear Programs} 6 | \subsection{Simplex Method} 7 | \begin{defn}[face] 8 | \begin{align*} 9 | \{x: a_i^{T}x=b_i, i \in I; a_i^{T}x\leq b_i, i \in I^{c}\} 10 | \end{align*} 11 | \end{defn} 12 | 13 | \begin{defn}[vertex] 14 | A \allbold{vertex} of a polyhedral set $ \{x: Ax\leq b\} $ is a face that has just 1 point. 15 | \end{defn} 16 | 17 | There exists an optimal solution that is a vertex (the optimal solution might not be unique \emph{e.g.} when a level set is parallel to an edge). 18 | 19 | The simplex method hops from one vertex to another. We also call a vertex a basic feasible point. 20 | 21 | The constraints are $ Ax=b, x\geq 0$, $ A$ is an $ m\times n$ matrix, and for some points the equality is achieved. Let $ B$ be the list of all indices, called basis, where $ |B| = m$. And for $ n-m$ 22 | 23 | We start at a basis $ B$, use duality to choose 1 index to leave. Adjust until a new index enters. 24 | 25 | Problems: 26 | \begin{enumerate}[label=(\arabic*)] 27 | \item lots of linear algebra, $ A_B^{-1}$ needs to be updated/downdated. Usually we use LU. 
If $ A$ is sparse, it's more complicated. 28 | \item there are many pivot rules 29 | \item finding a starting point: 2-phase approach 30 | \item degenerate bases and cycling 31 | \item presolving 32 | \item variants: dual simplex, self-dual 33 | \end{enumerate} 34 | 35 | \subsubsection{complexity} 36 | In CS, complexity means running time as a function of the input size. What does the input size mean? 37 | \begin{enumerate}[label=(\arabic*)] 38 | \item real arithmetic: numbers are continuous, and we encode real numbers. A unit storage cost for a $ \rr$ number. Operations/flops have unit cost. Thus the cost depends only on the dimension. 39 | \item rational model, or combinatorial model: $ b_i \in \qq$. Now the cost of operations depends on the value of the integers. Larger value is more costly. 40 | \end{enumerate} 41 | 42 | Practically simplex is great. However, Klee and Minty exhibited a polytope in $ \rr^{n}$ with $ 2^{n}$ vertices on which simplex visits all of them. Thus simplex is not polynomial-time (not in $ P$) using the usual pivot rule. The open question is whether this is true for all pivot rules. If we prove the polynomial Hirsch conjecture we will know the answer. 43 | 44 | \begin{eg}[solving linear equations] 45 | In real arithmetic, this is polynomial $ \mathcal{ O}(n^3)$. In the rational model, it is also polynomial. 46 | 47 | For LPs, in the rational model, the answer is again yes. Khachiyan proposed the ellipsoid method, a generalization of the bisection method. Then Karmarkar followed with an IPM, which is practical. 48 | \end{eg} 49 | What is the answer for the real arithmetic model: is it polynomial time (strongly polynomial)? 50 | Are there other algorithms to solve LPs in polynomial time? 51 | \end{document} 52 | -------------------------------------------------------------------------------- /TypedNotes/lecture_notes_tex/master.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{report} 2 | 3 | \input{../preamble.tex} 4 | 5 | \title{Convex Optimization by Prof. 
Stephen Becker} 6 | 7 | \begin{document} 8 | \maketitle 9 | \tableofcontents 10 | \newpage 11 | %start lectures 12 | \input{lec_01} 13 | \input{lec_02} 14 | \input{lec_03} 15 | \input{lec_04} 16 | \input{lec_05} 17 | \input{lec_06} 18 | \input{lec_07} 19 | \input{lec_08} 20 | \input{lec_09} 21 | \input{lec_10} 22 | \input{lec_12} 23 | \input{lec_13} 24 | \input{lec_14} 25 | \input{lec_15} 26 | \input{lec_16} 27 | \input{lec_17} 28 | \input{lec_18} 29 | \input{lec_19} 30 | \input{lec_20} 31 | \input{lec_21} 32 | \input{lec_22} 33 | \input{lec_23} 34 | \input{lec_24} 35 | \input{lec_11} 36 | \input{lec_25} 37 | \input{lec_26} 38 | \input{lec_27} 39 | \input{lec_28} 40 | \input{lec_29} 41 | \input{lec_30} 42 | \input{lec_31} 43 | \input{lec_32} 44 | \input{lec_36} 45 | \input{lec_37} 46 | \input{lec_38} 47 | \input{lec_39} 48 | \input{lec_33} 49 | \input{lec_35} 50 | %end lectures 51 | \end{document} 52 | -------------------------------------------------------------------------------- /policies_CU.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/policies_CU.pdf -------------------------------------------------------------------------------- /utilities/README.md: -------------------------------------------------------------------------------- 1 | The most useful module here is `firstOrderMethods.py` 2 | 3 | The main routine is this: 4 | - `gradientDescent` is proximal gradient descent 5 | - You can turn on "acceleration" (Nesterov acceleration, aka FISTA) 6 | - You can also enable line searches instead of a constant stepsize 7 | - It has a few features, like it can record error information 8 | 9 | There are three high-level functions: 10 | - `lassoSolver` is a wrapper to `gradientDescent` specialized for the lasso problem $\min_x .5\|\|Ax-b\|\|^2 + \tau\|\|x\|\|_1$ 11 | - `createTestProblem` creates some test problems 
function [f,g,h] = fminunc_wrapper_simple(x,F,G, H)
% [f,g,h] = fminunc_wrapper_simple( x, F, G, H )
%   Objective wrapper for use with Matlab's "fminunc"
%   and also compatible with Mark Schmidt's minFunc package.
%   Every objective value that is evaluated is appended to an internal
%   (persistent) history, which can be retrieved afterwards.
%
% Inputs:
%   x   point at which to evaluate
%   F   function handle, F(x) returns the (scalar) objective value
%   G   function handle, G(x) returns the gradient (only called if the
%       caller asks for a 2nd output)
%   H   (optional) function handle, H(x) returns the Hessian (only called
%       if the caller asks for a 3rd output)
%
% Outputs:
%   f   objective value F(x)
%   g   gradient G(x)
%   h   Hessian H(x)
%
% Example usage:
%   F = @(x) norm(A*x-b)^2/2    % this is our objective
%   G = @(x) A'*(A*x-b)         % this is the gradient of F(x)
%   options = optimoptions('fminunc','SpecifyObjectiveGradient',true);
%   func = @(x)fminunc_wrapper_simple(x,F,G);
%   x0 = randn(size(A,2),1);
%
%   fminunc_wrapper_simple();           % zero-out history
%   x = fminunc( func, x0, options )    % Matlab's solver
%   hist1 = fminunc_wrapper_simple();   % record history
%   x = minFunc(func, x0 )              % Mark Schmidt's minFunc solver
%   hist2 = fminunc_wrapper_simple();   % record history
%   xTrue = A\b;                        % true solution known in closed-form
%   fTrue = F(xTrue);
%
%   semilogy( hist1 - fTrue ); hold all
%   semilogy( hist2 - fTrue ); legend('fminunc','minFunc');
%
% [fHist] = fminunc_wrapper_simple()
%   will return the function history
%   and reset the history to zero.
%
% ... = fminunc_wrapper_simple( x, F, G, H )
%   will also compute the Hessian H if provided and requested
%
% Stephen Becker, Stephen.Becker@Colorado.edu 2/17/2017

persistent fcnHist
if nargin == 0
    % History query: hand back everything recorded so far, then reset.
    f = fcnHist;
    fcnHist = [];
    % Keep the extra outputs defined so that a call such as
    % [hist,~] = fminunc_wrapper_simple() does not error.
    g = [];
    h = [];
    return;
end


f = F(x);
fcnHist(end+1) = f; % not efficient in terms of memory allocation

if nargout > 1
    g = G(x);
end
if nargout > 2
    % The solver asked for a Hessian. Without a handle for it, "h" would be
    % left unassigned and Matlab would raise a generic "output argument not
    % assigned" error, so fail with an explicit message instead.
    if nargin > 3 && ~isempty(H)
        h = H(x);
    else
        error('fminunc_wrapper_simple:noHessian', ...
            'A Hessian was requested but no Hessian function H was provided');
    end
end
def NewtonsMethod(f,grad,Hess,x0,tol=1e-6,maxIters=1e2,printEvery=1,
                  errorFunction=None, saveHistory=False,stronglyConvex=True):
    """
    Newton's method with a backtracking linesearch

    Inputs:
      f               objective function, called as f(x)
      grad            returns gradient of objective function, called as grad(x)
      Hess            returns Hessian matrix of objective function, called as Hess(x)
      x0              initial starting point (it is copied, not modified)
      tol             stopping tolerance (shared by all the stopping tests)
      maxIters        maximum number of iterations (converted to an int)
      printEvery      prints out information every printEvery steps; set to 0 for quiet
      errorFunction   if provided, will evaluate errorFunction(x) at every iteration
      saveHistory     whether to save function and error history
      stronglyConvex  if true, then assumes Hessian matrix is positive definite;
                      otherwise the Hessian is only assumed to be symmetric

    Outputs:
      x     final iterate
      data  dictionary with detailed info. Keys include:
              'steps'       value of the iteration counter when the loop stopped
              'fcnHistory'  objective value after each step (empty unless saveHistory)
              'errHistory'  errorFunction value after each step (empty unless
                            saveHistory and errorFunction is provided)
              'flag'        string describing why the method stopped
              'fx'          objective value at the returned x
    """
    x = np.asarray(x0).copy()
    fx = f(x)
    maxIters = int(maxIters)
    fcnHistory = []
    errHistory = []

    # Truthiness (rather than "is True") so that e.g. numpy booleans also work
    if stronglyConvex:
        HessType = 'pos'
        # For some reason, scipy.linalg.solve doesn't have a semidefinite option
    else:
        HessType = 'sym' # if complex, would need to change to 'her'

    ## Fancy stuff, not essential
    if printEvery == 0 or np.isinf(printEvery):
        # User has requested no output
        # The "pprint" function does nothing
        def pprint(*args, **kwargs):
            pass
        display = False
    else:
        display = True
        pprint = print # pprint is now a synonym for "print" function
    if errorFunction is not None:
        pprint("Iter. Objective Stepsize Error")
        pprint("----- --------- -------- -------")
    else:
        pprint("Iter. Objective Stepsize")
        pprint("----- --------- --------")

    ## Main loop
    flag = "Quitting due to reaching max iterations"
    k = 0 # so that 'steps' is defined even if the loop body never runs
    for k in range(maxIters+1):
        ## Actual math:
        g = grad(x)
        # Test the gradient *before* doing any linear algebra: if the current
        #   point is already stationary there is nothing to solve for (and
        #   the Hessian might be singular, or the linesearch might fail to
        #   find any decrease)
        if np.linalg.norm(g) < tol:
            flag = "Quitting due to norm of gradient being small"
            pprint(flag)
            break
        H = Hess(x)
        p = - linalg.solve(H,g,assume_a=HessType) # Newton step

        xNew,t,fNew, linesearchIter = firstOrderMethods.backtrackingLinesearch(f,x,p,g,1,fx)
        if t == 0:
            # Keep the last accepted iterate (x,fx); the trial point is discarded
            flag = f"Quitting at iter {k} since linesearch failed"
            pprint(flag)
            break

        ### Now book-keeping, etc.

        # Save data, record error
        if errorFunction is not None:
            err = errorFunction(xNew)
            if saveHistory:
                errHistory.append(err)
        if saveHistory:
            fcnHistory.append(fNew)

        if display and (not k % printEvery) : # modulo
            if errorFunction is not None:
                print(f"{k:5d} {fNew:7.2e} {t:6.2e} {err:.2e}")
            else:
                print(f"{k:5d} {fNew:7.2e} {t:6.2e}")

        # Check for convergence
        # If we wanted to get fancier, we could have separate tolerance variables
        #   for each kind of check.
        # Both tests compare the old and the new iterate, so evaluate them
        #   before the new iterate is accepted.
        stagnated = np.abs(fx-fNew) < tol
        # since xNew - x = stepsize*p, the following check measures the size
        #   of the step just taken. It uses both relative and absolute
        #   tolerances, and it checks each entry (like l_inf norm) rather
        #   than Euclidean norm. Suggested by Cooper
        iteratesClose = np.allclose(xNew,x,rtol=tol, atol=1e-3*tol)

        # Accept the step, so that the returned x and data['fx'] always
        #   describe the same point no matter how the loop is exited
        fx = fNew
        x  = xNew

        if stagnated:
            flag = "Quitting due to stagnating objective value"
            pprint(flag)
            break
        if iteratesClose:
            flag = "Quitting due to successive iterates being close together"
            pprint(flag)
            break

    data = {'steps':k, 'fcnHistory':np.asarray(fcnHistory),
            'errHistory':np.asarray(errHistory),
            'flag':flag, 'fx':fx }
    return x, data