├── .gitignore
├── APPM4720_5720_Fall2018_Syllabus.pdf
├── Demos
    ├── AD_demos
    │   ├── ADiGator_demo
    │   │   ├── .gitignore
    │   │   ├── f1.m
    │   │   ├── f1_demo.m
    │   │   └── polydatafit
    │   │   │   ├── .DS_Store
    │   │   │   ├── Contents.m
    │   │   │   ├── fit.m
    │   │   │   ├── fit4numjac.m
    │   │   │   ├── fit_ADiGatorJac.m
    │   │   │   ├── fit_ADiGatorJac.mat
    │   │   │   ├── fit_Jac.m
    │   │   │   └── main.m
    │   ├── ForwardDiff.jl
    │   │   └── ForwardDiff_demo.ipynb
    │   └── README.md
    ├── AutoDiffByHand.ipynb
    ├── AutoDiff_threeReLU_implementations.ipynb
    ├── AutomaticDifferentiation.ipynb
    ├── BlockMultiplies
    │   ├── MatrixMultiplyDemo.html
    │   ├── MatrixMultiplyDemo.mlx
    │   ├── MultiplyMatrices_C.pdf
    │   ├── README.md
    │   ├── compareSpeed_homegrown_vs_MKL.m
    │   ├── matrixMultiply.m
    │   ├── multiplyMatrices.c
    │   └── multiplyMatrices.mexmaci64
    ├── CVX_demo
    │   ├── Handout2_cvx_tutorial.pdf
    │   ├── README.md
    │   ├── cvx_demo.mlx
    │   ├── cvx_demo.pdf
    │   ├── cvxpy_intro.ipynb
    │   ├── cvxpy_intro.pdf
    │   ├── tutorialSolutions.ipynb
    │   ├── tutorialSolutions.m
    │   └── tutorialSolutions.py
    ├── ConjugateGradientDemo.ipynb
    ├── ConvergenceRateDemo.ipynb
    ├── RPCA_case_study.ipynb
    ├── RPCA_case_study_solutions.ipynb
    └── nonconvex_example_2D.ipynb
├── Fall2018_day-by-day_schedule.pdf
├── Handouts
    ├── FirmlyNonexpansive.pdf
    ├── FixedPtTheorems.pdf
    ├── StrongConvexityLipschitz.pdf
    ├── StrongConvexityLipshitz.pdf
    ├── SubOptimalityBounds.pdf
    └── SubgradientDescent.pdf
├── Homeworks
    ├── APPM5630Spring25Homework01-02.pdf
    ├── APPM5630Spring25Homework03-04.pdf
    ├── APPM5630Spring25Homework05-06.pdf
    ├── APPM5630Spring25Homework07-08.pdf
    ├── APPM5630Spring25Homework09-10.pdf
    ├── HW01_helper_polyhedrality.pdf
    ├── HW04
    │   ├── APPM5630_HW4_helperFunctions.ipynb
    │   ├── implicit2explicit.m
    │   ├── quadraticObjective.m
    │   └── test_adjoint.m
    ├── HW10
    │   ├── .gitignore
    │   ├── README.md
    │   ├── adjointShortTimeDCT.m
    │   ├── forwardShortTimeDCT.m
    │   ├── handel.mat
    │   ├── handel.pkl
    │   ├── handel2.pkl
    │   ├── listen_to_Handel.ipynb
    │   ├── listen_to_Handel.m
    │   ├── listen_to_Handel.py
    │   ├── my_upsample.m
    │   ├── project_l1.m
    │   ├── python_functions.py
    │   └── test_python_functions.py
    ├── ProjectInformation.md
    ├── ProjectRubric.pdf
    └── README.md
├── LICENSE
├── Notes
    ├── 00_IntroToOptProblems.pdf
    ├── 00a_metaRules.pdf
    ├── 01_TypesOfMinimizers_IntroConvexity.pdf
    ├── 02_ConvexSets_part1.pdf
    ├── 03_ConvexSets_part2.pdf
    ├── 04_SeparatingHyperplanes_Farkas.pdf
    ├── 05_ConvexFunctions_part1.pdf
    ├── 05_ConvexFunctions_part2.pdf
    ├── 05_ConvexFunctions_part3_LipschitzGradient.pdf
    ├── 05_ConvexFunctions_part4_examples.pdf
    ├── 05_ConvexFunctions_part5_preservingConvexity.pdf
    ├── 06_ConjugateFunctions.pdf
    ├── 07_GradientDescent_intro.pdf
    ├── 08_ExistenceUniquenessMinimizers.pdf
    ├── 09_ProximityOperators.pdf
    ├── 10_OptimizationProblems.pdf
    ├── 11_FirstViewLagrangeMultipliers.pdf
    ├── 12_ConicOptimizationProblems.pdf
    ├── 13_moreOnSDPs.pdf
    ├── 14_LagrangianAndDualProblem.pdf
    ├── 15_MoreDuality.pdf
    ├── 16_SaddlePtsSharedLagrangians.pdf
    ├── 17_GameTheoryConnections.pdf
    ├── 18_FenchelRockafellarDuality.pdf
    ├── 19_KKT_and_complementarySlackness.pdf
    ├── 22_ProximalGradientDescent_convergenceAnalysis.pdf
    ├── 22a_ProximalGradientDescent_motivation.pdf
    ├── 23_NesterovAcceleration_convergenceAnalysis.pdf
    ├── 24_ConvergenceRates.pdf
    ├── 25_ConjugateGradientMethod.pdf
    ├── 26_QuasiNewtonMethods.pdf
    ├── 28_FindingGradientsByHand.pdf
    ├── 29_AutomaticDifferentiation.pdf
    ├── 29a_AdjointStateMethod.pdf
    ├── 30_GradientsParameterizedFunctions.pdf
    ├── 31_NewtonAndIPM.pdf
    ├── 32_ADMM_DRS_PrimalDual.pdf
    ├── 33_LinearPrograms.pdf
    ├── 34_IntegerLinearPrograms.pdf
    ├── README.md
    ├── appendix_notes_01.pdf
    ├── supplement_Geometry_Differentiability.pdf
    ├── supplement_LagrangeMultipliersIn2D.pdf
    ├── supplement_Slater_PrimalNotAchieved.pdf
    ├── supplement_VariationalInequalities_and_LCP.pdf
    ├── supplement_convergenceIteratesGradientDescent.pdf
    └── supplement_dualityPracticeProblem.pdf
├── README.md
├── SlideshowAllPresentations_4720Fall18.pdf
├── SlideshowAllPresentations_5630_Spring21.pdf
├── TypedNotes
    ├── APPM5720Notes.pdf
    ├── APPM5720Notes.tex
    ├── README.md
    ├── lecture_notes.pdf
    ├── lecture_notes_tex
    │   ├── lec_01.tex
    │   ├── lec_02.tex
    │   ├── lec_03.tex
    │   ├── lec_04.tex
    │   ├── lec_05.tex
    │   ├── lec_06.tex
    │   ├── lec_07.tex
    │   ├── lec_08.tex
    │   ├── lec_09.tex
    │   ├── lec_10.tex
    │   ├── lec_11.tex
    │   ├── lec_12.tex
    │   ├── lec_13.tex
    │   ├── lec_14.tex
    │   ├── lec_15.tex
    │   ├── lec_16.tex
    │   ├── lec_17.tex
    │   ├── lec_18.tex
    │   ├── lec_19.tex
    │   ├── lec_20.tex
    │   ├── lec_21.tex
    │   ├── lec_22.tex
    │   ├── lec_23.tex
    │   ├── lec_24.tex
    │   ├── lec_25.tex
    │   ├── lec_26.tex
    │   ├── lec_27.tex
    │   ├── lec_28.tex
    │   ├── lec_29.tex
    │   ├── lec_30.tex
    │   ├── lec_31.tex
    │   ├── lec_32.tex
    │   ├── lec_33.tex
    │   ├── lec_34.tex
    │   ├── lec_35.tex
    │   ├── lec_36.tex
    │   ├── lec_37.tex
    │   ├── lec_38.tex
    │   ├── lec_39.tex
    │   ├── master.tex
    │   └── preamble.tex
    └── notes.sty
├── policies.md
├── policies_CU.pdf
├── syllabus.md
└── utilities
    ├── APPM5630_utilities.ipynb
    ├── README.md
    ├── firstOrderMethods.py
    ├── fminunc_wrapper_simple.m
    └── secondOrderMethods.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/APPM4720_5720_Fall2018_Syllabus.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/APPM4720_5720_Fall2018_Syllabus.pdf


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/.gitignore:
--------------------------------------------------------------------------------
1 | .trash/
2 | Df1*
3 | 


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/f1.m:
--------------------------------------------------------------------------------
1 | function [y] = f1(x,scale)
2 | % F1  Evaluate y = scale*( x(1)*x(2) + sin(x(1)) ), like the simple example
3 | % we used in class; see https://en.wikipedia.org/wiki/Automatic_differentiation
4 | % Inputs: x, indexed at x(1) and x(2); scale, a multiplier applied to the result.
5 |    y = scale*(x(1)*x(2) + sin(x(1)));
6 | 
7 | end
8 | 


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/f1_demo.m:
--------------------------------------------------------------------------------
 1 | function [] = f1_demo()
 2 |    %function [y] = f1(x,scale)
 3 |    %% Like the simple example we used in class.
 4 |    %% https://en.wikipedia.org/wiki/Automatic_differentiation
 5 |    %
 6 |    %   y = scale*(x(1)*x(2) + sin(x(1)));
 7 |    % 
 8 |    %end
 9 |    % F1_DEMO: differentiate f1 (commented copy above) with ADiGator and print an analytic check.
10 |    % print source
11 |    type f1
12 |    fprintf('\n');
13 |    
14 |    % Set up inputs (and optionally parameters)
15 |    x = adigatorCreateDerivInput([2 1], 'x');
16 |    aux = adigatorCreateAuxInput([1 1]);
17 |    
18 |    % Generate the derivative function file
19 |    opt = adigatorOptions('overwrite', 1); % overwrite existing generated AD sources
20 |    adigator('f1', {x, aux}, 'Df1', opt);
21 |    % (y and y.dx below have no trailing semicolon, so MATLAB echoes their values)
22 |    % Call the derivative: the input is a struct with the point in .f and ones in .dx
23 |    x = [1; 2];
24 |    scale = 1;
25 |    x_ad = struct('f', x, 'dx', ones([2 1]));
26 |    y = Df1(x_ad, scale)
27 |    fprintf('Derivative with ADiGator:\n');
28 |    y.dx
29 |    
30 |    fprintf('Check derivative with analytic derivative:\n');
31 |    Df1_check(x, scale)
32 | 
33 | end
34 | 
35 | function [dx] = Df1_check(x, scale)
36 |    % Hand-derived gradient of f1 with respect to x, used to check the ADiGator result
37 |    d_x1 = x(2) + cos(x(1));
38 |    d_x2 = x(1);
39 |    dx = scale*[d_x1; d_x2];
40 | end


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/AD_demos/ADiGator_demo/polydatafit/.DS_Store


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/Contents.m:
--------------------------------------------------------------------------------
 1 | % ADiGator polynomial data fitting Jacobian example
 2 | %
 3 | % Copyright 2011-2015 Matthew J. Weinstein and Anil V. Rao
 4 | % Distributed under the GNU General Public License version 3.0
 5 | %
 6 | % ----------------------------------------------------------------------- %
 7 | % FILES:
 8 | % fit.m        - polynomial data fitting function
 9 | % fit4numjac.m - polynomial data fitting function to be called by numjac
10 | % main.m       - computes Jacobian of fit function using ADiGator and
11 | %                numjac


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/fit.m:
--------------------------------------------------------------------------------
 1 | function p = fit(x, d, m)
 2 | % FIT -- Given x and d, fit() returns p
 3 | % such that norm(V*p-d) = min, where
 4 | % V = [1, x, x.^2, ... x.^(m-1)].
 5 | % Raises an error when size(x,1) < m.
 6 | dim_x = size(x, 1);
 7 | if dim_x < m
 8 |   error('x must have at least m entries');
 9 | end
10 | % Start V as a column of ones, then append x.^1 ... x.^(m-1) one column at a time.
11 | V = ones(dim_x, 1);
12 | 
13 | for count = 1 : (m-1)
14 |   V = [V, x.^count];
15 | end
16 | p = V\d;


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/fit4numjac.m:
--------------------------------------------------------------------------------
 1 | function p= fit4numjac(t,x, d, m)
 2 | % FIT4NUMJAC -- Same computation as fit(), with an extra leading argument t
 3 | % that is not used in the body. Given x and d, returns p such that
 4 | % norm(V*p-d) = min, where V = [1, x, x.^2, ... x.^(m-1)].
 5 | % Raises an error when size(x,1) < m.
 6 | dim_x = size(x, 1);
 7 | if dim_x < m
 8 |   error('x must have at least m entries');
 9 | end
10 | % Start V as a column of ones, then append x.^1 ... x.^(m-1) one column at a time.
11 | V = ones(dim_x, 1);
12 | 
13 | for count = 1 : (m-1)
14 |   V = [V, x.^count];
15 | end
16 | 
17 | p = V \ d;


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/fit_ADiGatorJac.m:
--------------------------------------------------------------------------------
 1 | % This code was generated using ADiGator version 1.3
 2 | % ©2010-2014 Matthew J. Weinstein and Anil V. Rao
 3 | % ADiGator may be obtained at https://sourceforge.net/projects/adigator/ 
 4 | % Contact: mweinstein@ufl.edu
 5 | % Bugs/suggestions may be reported to the sourceforge forums
 6 | %                    DISCLAIMER
 7 | % ADiGator is a general-purpose software distributed under the GNU General
 8 | % Public License version 3.0. While the software is distributed with the
 9 | % hope that it will be useful, both the software and generated code are
10 | % provided 'AS IS' with NO WARRANTIES OF ANY KIND and no merchantability
11 | % or fitness for any purpose or application.
12 | 
13 | function p = fit_ADiGatorJac(x,d,m)
14 | global ADiGator_fit_ADiGatorJac
15 | if isempty(ADiGator_fit_ADiGatorJac); ADiGator_LoadData(); end
16 | Gator1Data = ADiGator_fit_ADiGatorJac.fit_ADiGatorJac.Gator1Data;
17 | % ADiGator Start Derivative Computations
18 | %User Line: % FIT -- Given x and d, fit() returns p
19 | %User Line: % such that norm(V*p-d) = min, where
20 | %User Line: % V = [1, x, x.^2, ... x.^(m-1)].
21 | dim_x.f = size(x.f,1);
22 | %User Line: dim_x = size(x, 1);
23 | cadaconditional1 = lt(dim_x.f,m);
24 | %User Line: cadaconditional1 = dim_x < m;
25 | V.f = ones(dim_x.f,1);
26 | %User Line: V = ones(dim_x, 1);
27 | cada1f1 = m - 1;
28 | cadaforvar1.f = 1:cada1f1;
29 | %User Line: cadaforvar1 = 1 : (m-1);
30 | V.dx = zeros(700,1);
31 | V.f(100,8) = 0;
32 | for cadaforcount1 = 1:7
33 |     count.f = cadaforvar1.f(:,cadaforcount1);
34 |     %User Line: count = cadaforvar1(:,cadaforcount1);
35 |     cada1f1dx = count.f.*x.f(:).^(count.f-1).*x.dx;
36 |     cada1f1dx((x.f(:) == 0 & x.dx == 0) | count.f == 0) = 0;
37 |     cada1f1 = x.f.^count.f;
38 |     V.dx = V.dx(Gator1Data.Index4,1);
39 |     V.f = V.f(:,1:7);
40 |     cada1td1 = zeros(700,1);
41 |     cada1td1(logical(Gator1Data.Index1(:,cadaforcount1))) = V.dx(nonzeros(Gator1Data.Index1(:,cadaforcount1)));
42 |     cada1td1(logical(Gator1Data.Index2(:,cadaforcount1))) = cada1f1dx(nonzeros(Gator1Data.Index2(:,cadaforcount1)));
43 |     V.dx = cada1td1;
44 |     cada1tempf1 = [V.f(:,1:Gator1Data.Index3(cadaforcount1)),cada1f1];
45 |     V.f = zeros(100,8);
46 |     V.f(:,1:size(cada1tempf1,2)) = cada1tempf1;
47 |     %User Line: V = [V, x.^count];
48 | end
49 | cada1tf3 = V.f\d;
50 | cada1td1 = zeros(8,100);
51 | cada1td1(Gator1Data.Index5) = V.dx;
52 | cada1td1 = cada1tf3.'*cada1td1;
53 | cada1td1 = cada1td1(:);
54 | cada1td3 = cada1td1(Gator1Data.Index6);
55 | cada1tf4 = V.f.';
56 | cada1td1 = sparse(Gator1Data.Index7,Gator1Data.Index8,cada1td3,100,100);
57 | cada1td1 = cada1tf4*cada1td1;
58 | cada1td1 = cada1td1(:);
59 | cada1td4 = full(cada1td1(Gator1Data.Index9));
60 | cada1tf4 = (V.f*cada1tf3 - d).';
61 | cada1td1 = sparse(Gator1Data.Index10,Gator1Data.Index11,V.dx,100,700);
62 | cada1td1 = cada1tf4*cada1td1;
63 | cada1td1 = cada1td1(:);
64 | cada1td5 = full(cada1td1(Gator1Data.Index12));
65 | cada1td3 = cada1td4;
66 | cada1td3(Gator1Data.Index13) = cada1td3(Gator1Data.Index13) + cada1td5;
67 | cada1tf4 = -(V.f.'*V.f);
68 | cada1td1 = zeros(8,100);
69 | cada1td1(Gator1Data.Index14) = cada1td3;
70 | cada1td1 = cada1tf4\cada1td1;
71 | cada1td1 = cada1td1(:);
72 | p.dx = cada1td1(Gator1Data.Index15);
73 | p.f = cada1tf3;
74 | %User Line: p = V\d
75 | p.dx_size = [8,100];
76 | p.dx_location = Gator1Data.Index16;
77 | end
78 | 
79 | 
80 | function ADiGator_LoadData()
81 | global ADiGator_fit_ADiGatorJac
82 | ADiGator_fit_ADiGatorJac = load('fit_ADiGatorJac.mat');
83 | return
84 | end


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/fit_ADiGatorJac.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/AD_demos/ADiGator_demo/polydatafit/fit_ADiGatorJac.mat


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/fit_Jac.m:
--------------------------------------------------------------------------------
 1 | % function [Jac,Fun] = fit_Jac(x,d,m)
 2 | % 
 3 | % Jacobian wrapper file generated by ADiGator
 4 | % ©2010-2014 Matthew J. Weinstein and Anil V. Rao
 5 | % ADiGator may be obtained at https://sourceforge.net/projects/adigator/ 
 6 | % Contact: mweinstein@ufl.edu
 7 | % Bugs/suggestions may be reported to the sourceforge forums
 8 | %                    DISCLAIMER
 9 | % ADiGator is a general-purpose software distributed under the GNU General
10 | % Public License version 3.0. While the software is distributed with the
11 | % hope that it will be useful, both the software and generated code are
12 | % provided 'AS IS' with NO WARRANTIES OF ANY KIND and no merchantability
13 | % or fitness for any purpose or application.
14 | 
15 | function [Jac,Fun] = fit_Jac(x,d,m)
16 | gator_x.f = x;
17 | gator_x.dx = ones(100,1);
18 | p = fit_ADiGatorJac(gator_x,d,m);
19 | Jac = reshape(p.dx,[8 100]);
20 | Fun = p.f;
21 | end


--------------------------------------------------------------------------------
/Demos/AD_demos/ADiGator_demo/polydatafit/main.m:
--------------------------------------------------------------------------------
 1 | % This file uses both MATLAB finite differences as well as adigator in order
 2 | % to compute derivatives of the fit.m function. The user can change m and n
 3 | % to change the problem size.
 4 | % Copyright 2011-2014 Matthew J. Weinstein and Anil V. Rao
 5 | % Distributed under the GNU General Public License version 3.0
 6 | m = 8;
 7 | n = 100;
 8 | TOL = 1e-5;
 9 | % Random data rounded down to 3 decimals; numeval is the number of timed repetitions
10 | x = floor(rand(n,1)*1000)/1000;
11 | d = floor(rand(n,1)*1000)/1000;
12 | numeval = 25;
13 | 
14 | % Create the Jacobian file
15 | tic
16 | gx = adigatorCreateDerivInput([n,1],'x');
17 | adigatorGenJacFile('fit',{gx,d,m});
18 | gentime = toc;
19 | 
20 | 
21 | % Use the ADiGator generated Jacobian file
22 | tic
23 | for i = 1:numeval
24 |   [J,p] = fit_Jac(x,d,m);
25 | end
26 | adigatortime = toc/numeval;
27 | 
28 | 
29 | % numerically compute the Jacobian using FD
30 | tic
31 | for i = 1:numeval
32 |   dpdx2 = numjac(@(t,x)fit4numjac(t,x,d,m),0,x,p,TOL*ones(n,1),[],0);
33 | end
34 | fdtime = toc/numeval;
35 | 
36 | % NOTE(review): J and dpdx2 are never compared here; only the timings are printed.
37 | fprintf('Derivatives of fit function:\n');
38 | fprintf(['m = %1.0f, n = %1.0f, TOL = ',num2str(TOL),'\n'],m,n);
39 | fprintf(['ADiGator File Generation Time: ',num2str(gentime),'\n']);
40 | fprintf(['ADiGator Average Eval Time:    ',num2str(adigatortime),'\n']);
41 | fprintf(['F Diff Average Eval Time:      ',num2str(fdtime),'\n']);
42 | 


--------------------------------------------------------------------------------
/Demos/AD_demos/README.md:
--------------------------------------------------------------------------------
1 | # Automatic Differentiation demos
2 | 
3 | - **Python**: see [../AutomaticDifferentiation.ipynb](../AutomaticDifferentiation.ipynb)
4 | - **Julia**: see [ForwardDiff.jl/](ForwardDiff.jl/) package. Julia has a rich ecosystem of autodiff tools, which are constantly evolving, so we have not attempted to be up-to-date
5 |   - A [discourse post](https://discourse.julialang.org/t/state-of-automatic-differentiation-in-julia/43083) from about 2020 (and see follow-up comments) lists about 20 packages: ForwardDiff,
6 | ForwardDiff2, Nabla, Tracker, Yota, Zygote, ReverseDiff, AutoGrad, NiLang, ModelingToolkit, XGrad, Calculus, FiniteDifferences, FiniteDiff, TaylorSeries, DualNumbers, HyperDualNumbers, Knet, Capstan, Flux, ...
7 | - **Matlab**: historically, Matlab hasn't had a rich autodiff community.  When this class first ran in 2016, we used ADiGator (see [ADiGator_demo/](ADiGator_demo/)).
8 |   - As of version R2021a, with the Deep Learning Toolbox, there is now much better native support. See, e.g., [mathworks.com/help/deeplearning/ug/include-automatic-differentiation.html](https://www.mathworks.com/help/deeplearning/ug/include-automatic-differentiation.html).
9 | 


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/MatrixMultiplyDemo.mlx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/BlockMultiplies/MatrixMultiplyDemo.mlx


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/MultiplyMatrices_C.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/BlockMultiplies/MultiplyMatrices_C.pdf


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/README.md:
--------------------------------------------------------------------------------
1 | # Block computation demo
2 | 
3 | For discussion of block/mini-batch methods
4 | 
5 | See the demo script:
6 | 
7 | [Demo in HTML format](http://htmlpreview.github.io/?https://github.com/stephenbeckr/CambridgeOptimisationCourse/blob/master/BlockComputation_demo/MatrixMultiplyDemo.html)
8 | (this link uses the nice htmlpreview.github.io website; if you just click the raw html file, it will not render)
9 | 


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/compareSpeed_homegrown_vs_MKL.m:
--------------------------------------------------------------------------------
 1 | %{
 2 | Compare the speed of matrix multiplication, of my simple C for loop
 3 |  vs using Matlab's version (which calls Intel's MKL BLAS)
 4 | 
 5 | %}
 6 | 
 7 | % Compile, if not already done:
 8 | % mex multiplyMatrices.c -O
 9 | 
10 | % First, convince you that my function gives the right answer
11 | A   = randn(50,51);
12 | B   = randn(51,52);  % pick rectangular, helps find bugs
13 | C   = multiplyMatrices(A,B);
14 | err = norm( C - A*B, 'fro' )/norm( A*B, 'fro' );  % relative Frobenius-norm difference from Matlab's A*B
15 | fprintf('Error is: %g\n', err );
16 | 
17 | %% Now, try some speed tests
18 | % results1(rep,ni): seconds for multiplyMatrices;  results2(rep,ni): seconds for A*B
19 | nList   = [50,100,500,800,1e3,1.2e3,1.5e3];
20 | nReps   = 3;
21 | [results1,results2] = deal( zeros( nReps,length(nList) ) );
22 | for ni = 1:length(nList)
23 |     n   = nList(ni);
24 |     A   = randn(n);
25 |     B   = randn(n);
26 |     
27 |     for rep = 1:nReps
28 |         tic;
29 |         C   = multiplyMatrices( A, B );
30 |         t   = toc;
31 |         
32 |         results1(rep,ni)    = t;
33 |         
34 |         tic;
35 |         C   = A*B;
36 |         t   = toc;
37 |         
38 |         results2(rep,ni)    = t;
39 |     end
40 | end
41 | 
42 | %% Plot
43 | % Average over repetitions along dimension 1 explicitly: mean(results) without
44 | % a dimension collapses a single-row matrix (nReps == 1) to one scalar, which
45 | % would no longer match the length of nList.
46 | meanC   = mean(results1, 1);   % multiplyMatrices timings, one value per n
47 | meanMKL = mean(results2, 1);   % A*B timings, one value per n
48 | figure(1); clf;
49 | loglog( nList, meanC, 'o--', 'linewidth', 2,'markersize',10 );
50 | hold all
51 | loglog( nList, meanMKL, '*--', 'linewidth', 2,'markersize',10 );
52 | set(gca,'fontsize',20);
53 | legend('My implementation','Matlab''s implementation','location','northwest');
54 | title('Time to multiply n x n matrices');
55 | ylabel('Time (s)');
56 | xlabel('Dimension n');
57 | xlim([50,1.6e3]);
58 | % Annotate with the timings measured in this run (largest n) rather than
59 | % numbers hard-coded from one particular machine.
60 | text( 100,.5,  sprintf('At n=%g, C code took %.3g sec, Matlab took %.3g sec', ...
61 |     nList(end), meanC(end), meanMKL(end) ) );
62 | export_fig 'MultiplyMatrices_C' '-pdf' '-transparent'


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/matrixMultiply.m:
--------------------------------------------------------------------------------
 1 | %{
 2 | Demonstration of effect of memory/communication cost
 3 |  on performance
 4 |  (times A*X(:,1:block) for several block sizes and plots time vs. block size)
 5 | Stephen Becker, June 11 2018
 6 | %}
 7 | 
 8 | n   = 5e3;
 9 | A   = randn(n);
10 | X   = randn(n);
11 | % blockSizes: number of leading columns of X multiplied at once; times(i): average seconds per multiply
12 | blockSizes  = [1, 5, 10, 20, 100, 1e3, 5e3 ];
13 | times       = zeros( length(blockSizes), 1 );
14 | 
15 | for i   = 1:length(blockSizes)
16 |     fprintf('i is %d\n', i );
17 |     block   = blockSizes( i );
18 |     if block < 1e2
19 |         reps    = round( 1e2/block );
20 |     elseif block < 1e3
21 |         reps    = round( 1e3/block );
22 |     else
23 |         reps    = round( 1e4/block );
24 |     end
25 |     % to get accurate timing, we'll repeat this (reps is chosen from the block size above)
26 |     t1 = tic;
27 |     for trial = 1:reps
28 |         y   = A*X(:,1:block);
29 |     end
30 |     tm  = toc(t1);
31 |     tm  = tm/reps;
32 |     times(i)    = tm;
33 | end
34 | %% Display results (dotted lines: linear-in-block-size references through the 2nd and 5th measurements)
35 | figure(1); clf;
36 | loglog( blockSizes, times, 'o--','markersize',12,'linewidth',2 )
37 | set(gca,'fontsize',18);
38 | xlabel('n');  % on this axis 'n' is the block size (blockSizes), not the matrix dimension n above
39 | ylabel('Time to multiply A*X(:,1:n)');
40 | hold all
41 | loglog( blockSizes, blockSizes*times(2)/blockSizes(2), ':','linewidth',2  )
42 | loglog( blockSizes, blockSizes*times(5)/blockSizes(5), ':','linewidth',2  )
43 | ylim([1e-2,10]);
44 | 
45 | %% Display them another way: per-block time scaled by n/block (number of blocks needed for all n columns)
46 | figure(2); clf;
47 | semilogx( blockSizes, times.*(n./blockSizes'), 'o--','markersize',12,'linewidth',2 )
48 | set(gca,'fontsize',18);
49 | xlabel('n');
50 | ylabel('Time to multiply A*X');
51 | hold all


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/multiplyMatrices.c:
--------------------------------------------------------------------------------
 1 | #if defined(__GNUC__) && !(defined(__clang__)) && defined(NEEDS_UCHAR)
 2 | #include <uchar.h>
 3 | #endif
 4 | #include <math.h>
 5 | #include "mex.h"
 6 | 
 7 | 
 8 | /*
 9 |  * MEX gateway for  C = multiplyMatrices(A,B).
10 |  *
11 |  * Computes C = A*B with a plain triple loop; this is the "simple C for loop"
12 |  * that compareSpeed_homegrown_vs_MKL.m times against Matlab's A*B.
13 |  *
14 |  *   prhs[0]  A, m x k   (real, full, double)
15 |  *   prhs[1]  B, k x n   (real, full, double)
16 |  *   plhs[0]  C, m x n   (created here)
17 |  *
18 |  * Arrays are column-major: entry (i,j) of a matrix with m rows is at i + m*j.
19 |  * Raises a MATLAB error for a wrong number of arguments, for inputs that are
20 |  * not real full double arrays, or when size(A,2) ~= size(B,1).
21 |  */
22 | void mexFunction( int nlhs, mxArray *plhs[], 
23 | 		  int nrhs, const mxArray*prhs[] ) 
24 | { 
25 |     double *A, *B, *C;
26 |     mwSize m, n, k;
27 |     mwIndex i, j, u;
28 |     
29 |     if ( nrhs != 2 ) {
30 |         mexPrintf("Usage:  C = multiplyMatrices(A,B)\n");
31 |         /* Misspelled identifier kept unchanged so existing catch-by-id code still matches. */
32 |         mexErrMsgIdAndTxt("MATLAB:mexFile:invlaidNumInputs","Need 2 inputs");
33 |     }
34 |     if ( nlhs > 1 ) {
35 |         mexErrMsgIdAndTxt("MATLAB:mexFile:invalidNumOutputs","At most 1 output");
36 |     }
37 |     
38 |     /* mxGetPr only gives meaningful data for real, full, double arrays; reject
39 |      * anything else (single, integer, complex, sparse, cell, ...) before the
40 |      * loop below reads the buffers as dense doubles. */
41 |     for (u=0; u<2; u++) {
42 |         if ( !mxIsDouble(prhs[u]) || mxIsComplex(prhs[u]) || mxIsSparse(prhs[u]) ) {
43 |             mexPrintf("Usage:  C = multiplyMatrices(A,B), with A and B real full double matrices\n");
44 |             mexErrMsgIdAndTxt("MATLAB:mexFile:invalidInputType","Inputs must be real full double");
45 |         }
46 |     }
47 |     
48 |     A   = mxGetPr( prhs[0] ); /* A is m x k */
49 |     m   = mxGetM(  prhs[0] );
50 |     k   = mxGetN(  prhs[0] );
51 |     
52 |     B   = mxGetPr( prhs[1] ); /* B is k x n */
53 |     if (k != mxGetM(  prhs[1] ) ){
54 |         mexPrintf("Usage:  C = multiplyMatrices(A,B), where size(A,2)==size(B,1)\n");
55 |         mexErrMsgIdAndTxt("MATLAB:mexFile:invlaidNumInputs","Wrong size");
56 |     }
57 |     n   = mxGetN(  prhs[1] );
58 |     
59 |     /* mxCreateDoubleMatrix zero-initializes the output, so the loop can accumulate into C directly */
60 |     plhs[0] = mxCreateDoubleMatrix( m, n, mxREAL );
61 |     C       = mxGetPr( plhs[0] );
62 |     
63 |     
64 |     /* Now, the actual computation: C(i,j) = sum over u of A(i,u)*B(u,j) */
65 |     for (i=0; i<m; i++)
66 |         for (j=0;j<n;j++)
67 |             for (u=0;u<k;u++)
68 |                 C[i+m*j] += A[i+m*u]*B[u+j*k];
69 | }


--------------------------------------------------------------------------------
/Demos/BlockMultiplies/multiplyMatrices.mexmaci64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/BlockMultiplies/multiplyMatrices.mexmaci64


--------------------------------------------------------------------------------
/Demos/CVX_demo/Handout2_cvx_tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/CVX_demo/Handout2_cvx_tutorial.pdf


--------------------------------------------------------------------------------
/Demos/CVX_demo/README.md:
--------------------------------------------------------------------------------
 1 | # Demos for using CVX (Matlab) and CVXPY (Python)
 2 | 
 3 | ## Step 1: get introduced to CVX/CVXPY
 4 | 
 5 | - For Matlab, see [cvx_demo.mlx](cvx_demo.mlx) or its PDF version [cvx_demo.pdf](cvx_demo.pdf)
 6 |   - On Macs, if you get errors opening mex files like "...mexmaci64" cannot be opened because the developer cannot be verified." then follow instructions analogous to [here](https://www.fieldtriptoolbox.org/faq/mexmaci64_cannot_be_opened_because_the_developer_cannot_be_verified/), e.g., (1) `sudo xattr -r -d com.apple.quarantine LOCATION_OF_CVX` followed by (2) `sudo find LOCATION_OF_CVX -name \*.mexmaci64 -exec spctl --add {} \;`
 7 | - For Python, see [cvxpy_intro.ipynb](cvxpy_intro.ipynb) (which has link to colab or use [direct link](https://colab.research.google.com/github/stephenbeckr/convex-optimization-class/blob/master/Demos/CVX_demo/cvxpy_intro.ipynb)) or its PDF version [cvxpy_intro.pdf](cvxpy_intro.pdf)
 8 |   - Or just look at the documentation from the software, which is pretty good
 9 | 
10 | ## Step 2: try filling out the worksheet
11 | 
12 | [Handout2_cvx_tutorial.pdf](Handout2_cvx_tutorial.pdf)
13 | 
14 | ## Step 3: check your answers
15 | 
16 | - For Matlab, [tutorialSolutions.m](tutorialSolutions.m)
17 | - For Python, [tutorialSolutions.py](tutorialSolutions.py) (this uses an older version of cvxpy and python) or the (newer) Jupyter notebook  [tutorialSolutions.ipynb](tutorialSolutions.ipynb)
18 | 
19 | ## Step 4: if you are super fast and finish everything...
20 | - Try my "least squares challenge" in either [python](https://github.com/stephenbeckr/ML-theory-class/blob/main/Code/LeastSquaresChallenge.ipynb) or [matlab](https://github.com/stephenbeckr/ML-theory-class/blob/main/Code/LeastSquaresChallenge.m).
21 |   - [Solutions are here](https://github.com/stephenbeckr/ML-theory-class/tree/solutions/Code)
22 | 


--------------------------------------------------------------------------------
/Demos/CVX_demo/cvx_demo.mlx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/CVX_demo/cvx_demo.mlx


--------------------------------------------------------------------------------
/Demos/CVX_demo/cvx_demo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/CVX_demo/cvx_demo.pdf


--------------------------------------------------------------------------------
/Demos/CVX_demo/cvxpy_intro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Demos/CVX_demo/cvxpy_intro.pdf


--------------------------------------------------------------------------------
/Demos/CVX_demo/tutorialSolutions.m:
--------------------------------------------------------------------------------
 1 | %{
 2 | For CVX in-class tutorial, CCIMI short-course at Cambridge,
 3 |  June 2018, Becker
 4 | %}
 5 | %% Setup the matrix: A is m x n with entries cycling 1..11 down the columns, y = (1:m)'
 6 | % rng(0);
 7 | n   = 10;
 8 | m   = 5;
 9 | A   = reshape( mod( (1:m*n)-1, 11 )+1, m, n )
10 | y   = (1:m)'
11 | % (no trailing semicolons above, so A and y are displayed)
12 | % Make global changes to CVX parameters
13 | cvx_precision best
14 | cvx_quiet true
15 | %% Exercise 1: minimize norm(x) subject to norm(A*x-y) <= 0.1
16 | cvx_begin
17 |   variable x(n)
18 |   minimize norm(x)
19 |   subject to
20 |     norm( A*x-y ) <= .1
21 | cvx_end
22 | fprintf('Problem 1, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
23 | %% Exercise 2: same constraint, objective is sum_square(x) instead of norm(x)
24 | cvx_begin
25 |   variable x(n)
26 |   minimize sum_square(x)
27 |   subject to
28 |     norm( A*x-y ) <= .1
29 | cvx_end
30 | fprintf('Problem 2, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
31 | %% Exercise 3: same constraint, objective is the 1-norm of x
32 | cvx_begin
33 |   variable x(n)  
34 |   minimize norm(x,1)
35 |   subject to
36 |     norm( A*x-y ) <= .1
37 | cvx_end
38 | fprintf('Problem 3, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
39 | %% Exercise 4
40 | cvx_begin
41 |   variable x(n)
42 |   dual variable lambda
43 |   minimize norm(x)
44 |   subject to
45 |     norm( A*x-y ) <= .1 : lambda
46 | cvx_end
47 | % Now, resolve
48 | cvx_begin
49 |   variable xx(n)
50 |   minimize norm(xx) + lambda*norm( A*xx - y )
51 | cvx_end
52 | fprintf('Problem 4, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
53 | fprintf(' Discrepanct via two methods is %g\n', norm(xx-x)/norm(x) );
54 | 
55 | %% Exercise 5: matrix problem
56 | % Note: the simple solution, sum(norms(A-x*o',2))
57 | %   doesn't work in CVX "Version 3.0beta, Build 1183 (dda2109)"
58 | o = ones(n,1);
59 | cvx_begin
60 |   variable x(m)
61 |   minimize sum(norms( A - x*o', 2 ))
62 | cvx_end
63 | fprintf('Problem 5, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
64 | %% Exercise 6: matrix problem
65 | cvx_begin
66 |   variable x(m)
67 |   minimize norm( A - x*o')
68 | cvx_end
69 | fprintf('Problem 6, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
70 | %% Exercise 7: matrix problem
71 | cvx_begin
72 |   variable X(m,n)
73 |   minimize norm( X - A, 'fro')
74 |   subject to
75 |     ones(1,m)*X*ones(n,1) == 1
76 | cvx_end
77 | fprintf('Problem 7, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );
78 | %% Exercise 8: matrix problem
79 | B   = A(:,1:5);
80 | cvx_begin
81 |   variable X(m,m) symmetric
82 |   minimize norm( X - B, 'fro')
83 |   subject to
84 |     X == semidefinite(m)
85 | cvx_end
86 | fprintf('Problem 8, CVX status is "%s", optimal value is %g\n', cvx_status, cvx_optval );


--------------------------------------------------------------------------------
/Demos/CVX_demo/tutorialSolutions.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import numpy as np # np.array (and used internally in cvxpy)
  3 | import cvxpy as cvx
  4 | import argparse
  5 | import time
  6 | 
  7 | def get_vars():
  8 |    A = np.array([[1, 6,11, 5,10, 4, 9, 3, 8, 2],
  9 |                  [2, 7, 1, 6,11, 5,10, 4, 9, 3],
 10 |                  [3, 8, 2, 7, 1, 6,11, 5,10, 4],
 11 |                  [4, 9, 3, 8, 2, 7, 1, 6,11, 5],
 12 |                  [5,10, 4, 9, 3, 8, 2, 7, 1, 6]])
 13 | 
 14 |    y = np.array([1,2,3,4,5]).T
 15 | 
 16 |    return A, y
 17 | 
 18 | def print_status(prob, x):
 19 |    print("Problem status: ", prob.status)
 20 |    print("Optimal value:  ", prob.value)
 21 |    print("Optimal var:\n", x.value)
 22 | 
 23 | 
 24 | def problem1():
 25 |    A, y = get_vars()
 26 |    x = cvx.Variable(10) # column vector with 10 elements
 27 |    
 28 |    obj = cvx.Minimize(cvx.norm(x)) # cvx.norm defaults to the 2-norm
 29 |    constraints = [cvx.norm(A*x-y) <= 0.1] # specify a list of constraints
 30 | 
 31 |    prob = cvx.Problem(obj, constraints)
 32 |    prob.solve()
 33 |    
 34 |    print_status(prob, x)
 35 | 
 36 | def problem2():
 37 |    A, y = get_vars()
 38 |    x = cvx.Variable(10)
 39 |    
 40 |    obj = cvx.Minimize(cvx.norm(x)**2) # cvxpy objects implement the standard python ops
 41 |    constraints = [cvx.norm(A*x-y) <= 0.1]
 42 | 
 43 |    prob = cvx.Problem(obj, constraints)
 44 |    prob.solve()
 45 | 
 46 |    print_status(prob, x)
 47 | 
 48 | def problem3():
 49 |    A, y = get_vars()
 50 |    x = cvx.Variable(10)
 51 |    
 52 |    obj = cvx.Minimize(cvx.norm(x, p=1))
 53 |    constraints = [cvx.norm(A*x-y) <= 0.1]
 54 | 
 55 |    prob = cvx.Problem(obj, constraints)
 56 |    prob.solve()
 57 | 
 58 |    print_status(prob, x)
 59 | 
 60 | def problem4():
 61 |    def get_problem1_dual_value():
 62 |       obj = cvx.Minimize(cvx.norm(x))
 63 |       constraints = [cvx.norm(A*x-y) <= 0.1]
 64 | 
 65 |       prob = cvx.Problem(obj, constraints)
 66 |       prob.solve()
 67 |       
 68 |       return constraints[0].dual_value
 69 |    
 70 |    A, y = get_vars()
 71 |    x = cvx.Variable(10)
 72 |    
 73 |    # resolve problem 1 and return dual value for the constraint
 74 |    l = get_problem1_dual_value()
 75 |    print("dual variable: ", l)
 76 |    
 77 |    obj = cvx.Minimize(cvx.norm(x) + l*cvx.norm(A*x-y))
 78 |    
 79 |    prob = cvx.Problem(obj)
 80 |    prob.solve()
 81 |    
 82 |    # note that the solution is the same, but the optimal value is different,
 83 |    # since for problem 1 we form the Lagrangian \|x\|_2 + \lambda(\|Ax-y\|_2-0.1)
 84 |    print_status(prob, x)
 85 | 
 86 |    # the optimal value for problem 1 should be
 87 |    print("problem 1 optimal value: ", prob.value - 0.1*l)
 88 | 
 89 | 
 90 | def problem5():
 91 |    A, y = get_vars()
 92 |    x = cvx.Variable(5)
 93 |    ones = np.ones((10,1))
 94 | 
 95 |    obj = cvx.Minimize(sum(cvx.norm(A-x*ones.T, axis=0))) # cvx.norm behaves like np.linalg.norm
 96 | 
 97 |    prob = cvx.Problem(obj)
 98 |    prob.solve()
 99 | 
100 |    print_status(prob, x)
101 | 
102 | def problem6():
103 |    A, y = get_vars()
104 |    x = cvx.Variable(5)
105 |    ones = np.ones((10,1))
106 |    
107 |    obj = cvx.Minimize(cvx.norm(A-x*ones.T))
108 | 
109 |    prob = cvx.Problem(obj)
110 |    #prob.solve(verbose=True) # ~1e-7 duality gap, but CVXOPT gets a singular KKT system
111 |    prob.solve(verbose=True, kktsolver='robust')
112 |    #prob.solve(verbose=True, solver='SCS')
113 | 
114 |    print_status(prob, x)
115 |   
116 | def problem7():
117 |    A, y = get_vars()
118 |    X = cvx.Variable(5,10)
119 | 
120 |    obj = cvx.Minimize(cvx.norm(X-A, 'fro'))
121 |    constraints = [ np.ones((5,)).T*X*np.ones((10,)) == 1. ]
122 | 
123 |    prob = cvx.Problem(obj, constraints)
124 |    prob.solve()
125 | 
126 |    print_status(prob, X)
127 | 
128 | def problem8():
129 |    A, y = get_vars()
130 |    B = A[:,0:5]
131 |    X = cvx.Variable(5,5) # could use Semidef or Symmetric here instead
132 |    
133 |    obj = cvx.Minimize(cvx.norm(X-B, 'fro'))
134 |    constraints = [ X == X.T, X >> 0 ] # X is PSD
135 | 
136 |    prob = cvx.Problem(obj, constraints)
137 |    prob.solve()
138 | 
139 |    print_status(prob, X)
140 | 
141 | def problem9():
142 |    A, y = get_vars()
143 |    print('Rerun Problem 1 without parameterizing ...')
144 |    x = cvx.Variable(10)
145 |    obj = cvx.Minimize(cvx.norm(x))
146 |    constraints = [cvx.norm(A@x - y) <= 0.1]
147 |    prob = cvx.Problem(obj, constraints)
148 |    t = time.time()
149 |    prob.solve()
150 |    elapsed = time.time() - t
151 |    print(f"  Elapsed time: {elapsed} seconds.")
152 |    print('Now parameterize y ...')
153 |    b = cvx.Parameter(5)
154 |    obj = cvx.Minimize(cvx.norm(x))
155 |    constraints = [cvx.norm(A@x - b) <= 0.1]
156 |    prob = cvx.Problem(obj, constraints)
157 | 
158 |    for i in range(10):
159 |       b.value = np.random.rand(5)
160 |       t = time.time()
161 |       prob.solve()
162 |       elapsed = time.time() - t
163 |       print(f"  {i=}, Elapsed time: {elapsed} seconds.")
164 | 
165 | if __name__ == '__main__':
166 |    parser = argparse.ArgumentParser()
167 |    parser.add_argument('num', type=int, help='number of problem to run', default=1)
168 |    args = parser.parse_args()
169 |    
170 |    num = args.num
171 |    if num < 1 or num > 9:
172 |       raise argparse.ArgumentError('Problem number should be in [1..9]')
173 |    
174 |    problem_funs = [eval('problem'+str(i)) for i in range(1,10)]
175 |    problem_funs[num-1]()
176 |    
177 | 
178 | # vim: set ts=3 sw=3 sts=3 et :
179 | 


--------------------------------------------------------------------------------
/Fall2018_day-by-day_schedule.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Fall2018_day-by-day_schedule.pdf


--------------------------------------------------------------------------------
/Handouts/FirmlyNonexpansive.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/FirmlyNonexpansive.pdf


--------------------------------------------------------------------------------
/Handouts/FixedPtTheorems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/FixedPtTheorems.pdf


--------------------------------------------------------------------------------
/Handouts/StrongConvexityLipschitz.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/StrongConvexityLipschitz.pdf


--------------------------------------------------------------------------------
/Handouts/StrongConvexityLipshitz.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/StrongConvexityLipshitz.pdf


--------------------------------------------------------------------------------
/Handouts/SubOptimalityBounds.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/SubOptimalityBounds.pdf


--------------------------------------------------------------------------------
/Handouts/SubgradientDescent.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Handouts/SubgradientDescent.pdf


--------------------------------------------------------------------------------
/Homeworks/APPM5630Spring25Homework01-02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework01-02.pdf


--------------------------------------------------------------------------------
/Homeworks/APPM5630Spring25Homework03-04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework03-04.pdf


--------------------------------------------------------------------------------
/Homeworks/APPM5630Spring25Homework05-06.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework05-06.pdf


--------------------------------------------------------------------------------
/Homeworks/APPM5630Spring25Homework07-08.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework07-08.pdf


--------------------------------------------------------------------------------
/Homeworks/APPM5630Spring25Homework09-10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/APPM5630Spring25Homework09-10.pdf


--------------------------------------------------------------------------------
/Homeworks/HW01_helper_polyhedrality.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW01_helper_polyhedrality.pdf


--------------------------------------------------------------------------------
/Homeworks/HW04/implicit2explicit.m:
--------------------------------------------------------------------------------
 1 | function A = implicit2explicit(Afun,m,n)
 2 | %IMPLICIT2EXPLICIT builds the explicit matrix of a linear map given as a function handle
 3 | %   A = implicit2explicit(Afun,m,n) returns the m x prod(n) matrix whose
 4 | %   j-th column is Afun applied to the domain element that is 1 in
 5 | %   entry j (linear indexing) and 0 elsewhere.
 6 | %
 7 | %   n scalar    : the domain is R^n (inputs are n x 1 column vectors)
 8 | %   n = [n1,n2] : the domain is the space of n1 x n2 matrices
 9 | %   n omitted   : n = m is used
10 | %   Output of Afun should always be a m x 1 column vector
11 | %
12 | % Stephen Becker, stephen.becker@colorado.edu, 2/13/2017
13 | 
14 | if nargin < 3, n = m; end
15 | 
16 | nDomain = prod(n);
17 | A = zeros(m,nDomain);
18 | switch numel(n)
19 |     case 1,    basisElt = zeros(n,1);
20 |     case 2,    basisElt = zeros(n(1),n(2));
21 |     otherwise, error('bad value for size of domain');
22 | end
23 | for col = 1:nDomain
24 |     basisElt(col) = 1;          % switch on entry "col"
25 |     A(:,col) = Afun(basisElt);
26 |     basisElt(col) = 0;          % and reset it for the next column
27 | end
28 | 


--------------------------------------------------------------------------------
/Homeworks/HW04/quadraticObjective.m:
--------------------------------------------------------------------------------
 1 | function [f,g] = quadraticObjective(x,A,b, At, c)
 2 | % f = quadraticObjective( x, A, b )
 3 | %   computes f(x) = 1/2 || Ax-b ||^2
 4 | %   where "A" is a linear operator (either a matrix or a function handle)
 5 | %
 6 | % [f,g] = ...
 7 | %   also returns the gradient g(x) = \nabla f(x) = A'*(A*x-b)
 8 | %
 9 | % ... = quadraticObjective( x, A, b, At )
10 | %   uses "At" for the adjoint of the linear operator "A".
11 | %   This is only necessary if "A" is a function handle (if "A" is a matrix, "At" is ignored)
12 | %
13 | % ... = quadraticObjective( x, A, b, At, c )
14 | %   uses f(x) = 1/2 ||Ax-b||^2 + dot(c,x), and adds c to the gradient
15 | %
16 | % This form is suitable for most of Matlab's solvers
17 | %   and for 3rd party packages like Mark Schmidt's minFunc
18 | %
19 | % Stephen.Becker@Colorado.edu, 2/13/2017
20 | 
21 | if ~isa( A, 'function_handle')
22 |     At  = @(x) A'*x; % must be created before "A" is overwritten on the next line
23 |     A   = @(x) A*x; % overload notation
24 | elseif nargin < 4
25 |     error('If "A" is a function hande, need 4 inputs');
26 | end
27 | if nargin < 5
28 |     c   = []; % no linear term
29 | end
30 | 
31 | r   = A(x) - b; % residual
32 | f   = norm( r )^2/2;
33 | if ~isempty(c)
34 |     f   = f + dot(c,x);
35 | end
36 | if nargout >= 2
37 |     g   = At(r);
38 |     if ~isempty(c)
39 |         g   = g + c;
40 |     end
41 | end
42 | 


--------------------------------------------------------------------------------
/Homeworks/HW04/test_adjoint.m:
--------------------------------------------------------------------------------
 1 | function test_adjoint( A, At, sz, nRep )
 2 | % TEST_ADJOINT( A, At, sz )
 3 | %   tests whether the function handles A and At are
 4 | %   adjoints (= conjugate transpose) of each other,
 5 | %   by comparing <A(x),y> with <x,At(y)> for random x and y.
 6 | %   sz should be the size of the domain, e.g.,
 7 | %       sz = n  for domain to be n x 1 column vectors
 8 | % 
 9 | %       sz = [n1,n2] for the domain to be n1 x n2 matrices
10 | %
11 | % TEST_ADJOINT( A, At, sz, nRep )
12 | %   controls how many tests to perform (default: 10)
13 | %
14 | % Stephen.Becker@Colorado.edu, 2/13/2017
15 | 
16 | if nargin < 4, nRep = 10; end
17 | 
18 | for rep = 1:nRep
19 |     
20 |     if numel(sz) == 1
21 |         n   = sz;
22 |         x   = randn(n,1);
23 |     else
24 |         x   = randn(sz); % could be a matrix
25 |     end
26 |     
27 |     Ax  = A(x);
28 |     y   = randn( size(Ax) );
29 |     Aty = At(y);
30 |     % vectorize: dot() on matrices works column-by-column (row-vector result), not one inner product
31 |     er  = dot( Ax(:), y(:) ) - dot( x(:), Aty(:) );
32 |     fprintf('Error in adjoint: %.2g\n', abs(er)/sqrt(norm(x(:))*norm(y(:))) );
33 | end
34 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/.gitignore:
--------------------------------------------------------------------------------
1 | *.mat
2 | *.wav
3 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/README.md:
--------------------------------------------------------------------------------
1 | ### Note on `handel.pkl`/`handel2.pkl`
2 | `handel.pkl` was pickled in Python 3 with the default protocol=3, which is not
3 | backwards compatible with the Python 2 pickler.
4 | `handel2.pkl` was pickled with protocol=2, which is backwards compatible with
5 | Python 2.
6 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/adjointShortTimeDCT.m:
--------------------------------------------------------------------------------
 1 | function y = adjointShortTimeDCT( coeff, win, Ntrue )
 2 | % y = adjointShortTimeDCT( coeff, win )
 3 | %   applies the adjoint/transpose MDCT to the coefficients "coeff"
 4 | %   (first half: aligned blocks, second half: half-block-shifted blocks).
 5 | %   This is also the pseudo-inverse of the forward MDCT
 6 | %
 7 | % y = adjointShortTimeDCT( coeff, win, N_original )
 8 | %   additionally truncates the output to its first N_original entries.
 9 | %   Use this when the forward MDCT was applied to a signal of length
10 | %   N_original, so that the zero-padding can be undone (padding is
11 | %   done when N_original is not a multiple of the blockSize)
12 | %
13 | % see forwardShortTimeDCT.m for an example of the window "win"
14 | %
15 | % Stephen Becker, 3/18/2017
16 | % See also forwardShortTimeDCT.m
17 | 
18 | halfLen     = length(coeff)/2;
19 | blockSize   = length(win);
20 | nBlocks     = ceil( halfLen/blockSize );
21 | if nargin < 3
22 |     Ntrue = [];
23 | else
24 |     assert( Ntrue <= halfLen );
25 | end
26 | 
27 | Win         = spdiags(win(:),0,blockSize,blockSize);
28 | % inverse DCT of every block (column), followed by the window
29 | synthesize  = @(c) Win*idct( reshape( c, blockSize, nBlocks ) );
30 | 
31 | % first half of coeff: the aligned blocks
32 | aligned     = synthesize( coeff(1:halfLen) );
33 | 
34 | % second half: the shifted blocks, rotated back by half a block
35 | shifted     = synthesize( coeff(halfLen+1:end) );
36 | shifted     = circshift( shifted(:), blockSize/2 );
37 | 
38 | y           = aligned(:) + shifted;
39 | if ~isempty(Ntrue)
40 |     y   = y(1:Ntrue);
41 | end
42 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/forwardShortTimeDCT.m:
--------------------------------------------------------------------------------
 1 | function coeff = forwardShortTimeDCT( y, win )
 2 | % coeff = forwardShortTimeDCT( y, win )
 3 | %   applies the Modified DCT -- a lapped (50% overlapping),
 4 | %   windowed block DCT -- to the signal y.  This is a linear function.
 5 | %
 6 | %   y    : column vector of length N
 7 | %   win  : window; its length defines the blockSize
 8 | %   coeff: DCT coefficients of the aligned blocks, followed by
 9 | %          those of the blocks shifted by blockSize/2
10 | %
11 | %   Note: this function zero-pads y to be an even multiple of blockSize
12 | %
13 | % An example window that we recommend, so that
14 | %   the transpose of this function is its pseudo-inverse, is:
15 | %   win         = sin( pi*( (1:blockSize) + 1/2)/(blockSize) );
16 | %   (a typical value of blockSize = 1024)
17 | %   This satisfies the Princen-Bradley conditions, meaning that
18 | %   we can guarantee   win.^2 + circshift( win, blockSize/2).^2  =  1
19 | %
20 | % Stephen Becker, 3/18/2017
21 | % See also adjointShortTimeDCT.m
22 | 
23 | blockSize   = length(win);
24 | nBlocks     = ceil( length(y)/blockSize );
25 | % Zero-pad so that the length is a whole number of blocks
26 | nPad        = blockSize*nBlocks - length(y);
27 | y           = [y; zeros(nPad,1) ];
28 | 
29 | Win         = spdiags(win(:),0,blockSize,blockSize);
30 | % window each block (column) and take its DCT
31 | analyze     = @(sig) dct( Win*reshape( sig, blockSize, nBlocks ) );
32 | 
33 | C           = analyze( y );
34 | 
35 | % and the same again with a 50% shift
36 | C2          = analyze( circshift(y,-blockSize/2) );
37 | 
38 | coeff       = [ C(:); C2(:) ];
39 | % or instead, intersperse
40 | % coeff       = [C;C2]; coeff = coeff(:);
41 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/handel.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW10/handel.mat


--------------------------------------------------------------------------------
/Homeworks/HW10/handel.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW10/handel.pkl


--------------------------------------------------------------------------------
/Homeworks/HW10/handel2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/HW10/handel2.pkl


--------------------------------------------------------------------------------
/Homeworks/HW10/listen_to_Handel.m:
--------------------------------------------------------------------------------
 1 | %{
 2 | For HW 10 to demonstrate listening to the "Handel" audio file
 3 | 
 4 | Stephen Becker, 3/18/2017
 5 | %}
 6 | 
 7 | load handel.mat  % loads y, Fs
 8 | 
 9 | %% Play the sound at its original sampling rate Fs
10 | playerObj = audioplayer(y,Fs);
11 | play( playerObj )
12 | %% Play the down-sampled sound (by a factor of 2): keep every 2nd sample, no filtering
13 | playerObj = audioplayer(y(1:2:end),Fs/2);
14 | play( playerObj )
15 | %% Play the down-sampled sound (by a factor of 4): keep every 4th sample, no filtering
16 | playerObj = audioplayer(y(1:4:end),Fs/4);
17 | play( playerObj )
18 | %% Filter-then-downsample to avoid aliasing
19 | % first low-pass filter it
20 | Rp  = 1e-3;      % for peak-to-peak ripple
21 | Rst = 1e-3;      % for stopband attenuation
22 | ordr= 100;       % filter order
23 | eqnum   = firceqrip( ordr, 1/4, [Rp Rst], 'passedge'); % FIR coefficients used as the filter numerator below
24 | % fvtool(eqnum, 'Fs',Fs ); % Visualize the filter's frequency response
25 | lowpassFIR = dsp.FIRFilter('Numerator',eqnum);
26 | yFiltered  = lowpassFIR( y );
27 | 
28 | % playerObj = audioplayer(yFiltered,Fs);  % (filtered, but not downsampled)
29 | playerObj = audioplayer(yFiltered(1:4:end),Fs/4);
30 | play( playerObj )
31 | 
32 | %% Plot spectrograms
33 | % Use pwelch estimate instead of "spectrogram" function; one panel per version of the signal
34 | blockSize   = 1e3;
35 | win         = window( @barthannwin, blockSize );
36 | figure(1);
37 | subplot(2,2,1);
38 | pwelch( y, win, [], [], Fs ); title('Original signal');
39 | subplot(2,2,2);
40 | pwelch( y(1:4:end), win, [], [], Fs/4 ); title('Downsampled (aliased)');
41 | subplot(2,2,3);
42 | pwelch( yFiltered, win, [], [], Fs ); title('Filtered');
43 | subplot(2,2,4);
44 | pwelch( yFiltered(1:4:end), win, [], [], Fs/4 ); title('Filtered-then-downsampled');
45 | 
46 | 
47 | %% and a time-frequency plot of the original signal
48 | figure(3);
49 | spectrogram( y, 5e2, [], [], Fs )
50 | title('Spectrogram');


--------------------------------------------------------------------------------
/Homeworks/HW10/listen_to_Handel.py:
--------------------------------------------------------------------------------
  1 | """
  2 | For HW 10 to demonstrate listening to the "Handel" audio filename
  3 | Stephen Becker, 3/18/2017
  4 | """
  5 | import numpy as np
  6 | import pickle
  7 | import scipy.io.wavfile
  8 | import scipy.signal as sig
  9 | 
 10 | def save_wav(filename, data, Fs=44100):
 11 |     # assume we're working with floats
 12 |     # rescale to [-1,1]
 13 |     _data = 2*data/(data.max() - data.min())
 14 |     scipy.io.wavfile.write(filename, int(Fs), _data.astype(np.float32))
 15 | 
 16 | def listen():
 17 |     y,Fs = pickle.load(open('handel.pkl', 'rb'))
 18 |     y = y.ravel() # want a vector, not (n,1) array
 19 |     Fs = Fs[0][0] # want a scalar, not a (1,1) array
 20 |     sounds = []
 21 |     do_plots = True
 22 |     if do_plots: import matplotlib.pyplot as plt
 23 | 
 24 |     # The original signal
 25 |     save_wav('handel.wav', y, Fs)
 26 |     sounds.append(('Handel', 'handel.wav'))
 27 | 
 28 |     # Naive downsample by a factor of 2
 29 |     y2 = y[::2]; fn = 'handel_dec2.wav'
 30 |     save_wav(fn, y2, Fs/2)
 31 |     sounds.append(('Handel Downsampled by a factor of 2', fn))
 32 | 
 33 |     # Naive downsample by a factor of 4
 34 |     y4 = y[::4]; fn = 'handel_dec4.wav'
 35 |     save_wav(fn, y4, Fs/4)
 36 |     sounds.append(('Handel Downsampled by a factor of 4', fn))
 37 | 
 38 |     # Anti-alias (lowpass) filter then downsample
 39 |     # Use Parks-McClellan to design a lowpass filter with:
 40 |     #  * a passband from 0*Nyquist to 0.25*Nyquist
 41 |     #  * a stopband from 0.3*Nyquist to 0.5*Nyquist
 42 |     filt = sig.remez(100, [0, 0.125, 0.15, 0.5], [1, 0])
 43 |     if do_plots: # Plot filt's frequency response
 44 |         w, h = sig.freqz(filt)
 45 |         fig = plt.figure()
 46 |         plt.title('Anti-alias Frequency Response')
 47 |         plt.plot(w/np.pi*Fs/2, 20 * np.log10(abs(h)), 'b')
 48 |         plt.ylabel('Amplitude [dB]', color='b')
 49 |         plt.xlabel('Frequency [Hz]')
 50 | 
 51 |     # perform the filtering with the filter we designed above
 52 |     yFiltered = sig.lfilter(filt, np.array([1]), y)
 53 |     fn = 'handel_lp.wav'
 54 |     save_wav(fn, yFiltered, Fs)
 55 |     sounds.append(('Handel Lowpass Filtered', fn))
 56 | 
 57 |     # It should now be safe to downsample by a factor of 4
 58 |     yFiltered4 = yFiltered[::4]; fn = 'handel_lp_dec4.wav'
 59 |     save_wav(fn, yFiltered4, Fs/4)
 60 |     sounds.append(('Handel Lowpass Filtered and Downsampled by a factor of 4', fn))
 61 | 
 62 |     # Print some info
 63 |     print('You should play the following files with your preferred media player')
 64 |     for desc, fn in sounds:
 65 |         print('{}:\n --> {}'.format(desc, fn))
 66 | 
 67 |     if do_plots: # estimate PSD
 68 |         fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
 69 | 
 70 |         f, Pxx = sig.welch(y, fs=Fs, window='barthann', nperseg=1024,
 71 |             scaling='density')
 72 |         ax1.plot(f, 10*np.log10(Pxx))
 73 |         ax1.set_title('Original signal')
 74 | 
 75 |         f, Pxx = sig.welch(y4, fs=Fs/4, window='barthann', nperseg=128,
 76 |             scaling='density')
 77 |         ax2.plot(f, 10*np.log10(Pxx))
 78 |         ax2.set_title('Downsampled (x4) (aliased)')
 79 | 
 80 |         f, Pxx = sig.welch(yFiltered, fs=Fs, window='barthann', nperseg=1024,
 81 |             scaling='density')
 82 |         ax3.plot(f, 10*np.log10(Pxx))
 83 |         ax3.set_title('Filtered')
 84 | 
 85 |         f, Pxx = sig.welch(yFiltered4, fs=Fs/4, window='barthann', nperseg=128,
 86 |             scaling='density')
 87 |         ax4.plot(f, 10*np.log10(Pxx))
 88 |         ax4.set_title('Filtered then Downsampled')
 89 | 
 90 |         for ax in (ax1, ax2, ax3, ax4):
 91 |             ax.set_xlabel('Frequency [Hz]')
 92 |             ax.set_ylabel('Power Spectral Density [dB/Hz]')
 93 | 
 94 |     if do_plots: # time-frequency raster
 95 |         fig = plt.figure()
 96 |         f, t, Sxx = sig.spectrogram(y, fs=Fs, window='barthann', nperseg=512,
 97 |             scaling='density')
 98 |         plt.pcolormesh(f, t, 10*np.log10(Sxx.T))
 99 |         plt.colorbar()
100 |         plt.xlabel('Frequency [Hz]')
101 |         plt.ylabel('Time [sec]')
102 | 
103 |     if do_plots: plt.show()
104 | 
105 | if __name__ == '__main__':
106 |     listen()
107 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/my_upsample.m:
--------------------------------------------------------------------------------
1 | function x = my_upsample( y, sampleSet, n )
2 | % x = my_upsample( y, sampleSet, n )
3 | %   zero-fills: returns x with n rows (one column per column of y) such that x(sampleSet,:) = y
4 | nCols = size(y,2);  x = zeros(n,nCols);
5 | x(sampleSet,1:nCols) = y;   % rows outside sampleSet stay zero
6 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/project_l1.m:
--------------------------------------------------------------------------------
 1 | function y = project_l1(x, tau)
 2 | % y = project_l1( x, tau )
 3 | %   projects x onto the scaled l1 ball, ||x||_1 <= tau
 4 | %   If tau is not provided, default is tau = 1.
 5 | %
 6 | %   If x is a matrix, the operation is performed along
 7 | %   each column. A row vector is treated as a single vector.
 8 | %
 9 | % Stephen Becker and Emmanuel Candes, 2009/2010
10 | % Crucial bug fix: 3/17/2017
11 | 
12 | if nargin < 2, tau = 1; end
13 | 
14 | row_vec = 0;
15 | if size(x,1) == 1 && size(x,2) > 1
16 |    row_vec = 1;
17 |    x = x(:);
18 | end
19 | 
20 | absx = abs(x);
21 | s   = sort(absx, 1, 'descend');
22 | cs  = cumsum(s, 1);
23 | 
24 | I = find(cs(end,:) > tau);
25 | % If in I, then x is not feasible, and we must project;
26 | % if not in I, then x is already feasible.
27 | % Bug found by SRB on 3/17/2017
28 | y = x;
29 | 
30 | % Do projections where needed
31 | for i=1:numel(I)
32 |    ind = I(i);
33 |    % Index with the column number ind = I(i), not the loop counter i
34 |    % JMF 27/03/2017: There's probably a slicker way to do this.
35 |    thresh = get_vector_thresh(x(:,ind), tau, s(:,ind), cs(:,ind));
36 |    y(:,ind) = sign(x(:,ind)).*max(absx(:,ind) - thresh, 0);
37 | end
38 | 
39 | if row_vec % restore size
40 |    y = y.';
41 | end
42 | 
43 | end
44 | 
45 | function [thresh] = get_vector_thresh(x, tau, s, cs)
46 |    % Returns the shrinkage amount for one infeasible column x, given its
47 |    % sorted magnitudes s (descending) and their cumulative sums cs.
48 |    % Check some "discrete" levels of shrinkage, e.g. [s(2:end),0]; this finds which indices will be nonzero
49 |    i_tau   = find(cs - (1:numel(x))'.* [s(2:end) ; 0] >= tau,   1);
50 |    % Now that we know which indices are involved, it's a very simple problem:
51 |    thresh  = (cs(i_tau) - tau)/i_tau;
52 | end
53 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/python_functions.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse, scipy.fftpack
  3 | 
  4 | def project_l1(x, tau=1.):
  5 |     """
  6 |     project_l1(x, tau) -> y
  7 |       projects x onto the scaled l1 ball, ||x||_1 <= tau
  8 |       If tau is not provided, the default is tau = 1.
  9 | 
 10 |     Stephen Becker and Emmanuel Candes, 2009/2010.
 11 |     Crucial bug fix: 3/17/2017, SRB
 12 |     """
 13 |     absx = np.abs(x)
 14 |     s = np.sort(absx, axis=None)[::-1] # sort in descending order
 15 |     cs = np.cumsum(s)
 16 | 
 17 |     if cs[-1] <= tau:
 18 |         # x is already feasible, so no thresholding needed
 19 |         return x
 20 | 
 21 |     # Check some "discrete" levels of shrinkage, e.g. [s(2:end),0]
 22 |     # This lets us discover which indices will be nonzero
 23 |     n = x.size
 24 |     i_tau = np.where(cs -
 25 |         np.arange(1,n+1)*np.concatenate((s[1:],0), axis=None) >= tau)[0][0]
 26 | 
 27 |     # Now that we know which indices are involved, it's a very simple problem
 28 |     thresh = (cs[i_tau]-tau) / (i_tau+1)
 29 | 
 30 |     # Shrink x by the amount "thresh"
 31 |     return np.sign(x)*np.maximum(absx - thresh, 0)
 32 | 
 33 | def forwardShortTimeDCT(y, win=None):
 34 |     """
 35 |     forwardShortTimeDCT(y, win=None) -> coeff (, win)
 36 | 
 37 |     Applies the Modified DCT to the signal y
 38 |     This is a linear function.
 39 |     Assumes y is a vector of length N
 40 |     This code then uses a lapped (50% overlapping)
 41 |     DCT on segments of y of length blockSize.
 42 | 
 43 |     Note: this function zero-pads y to be an even multiple of blockSize.
 44 | 
 45 |     An example window that we recommend, so that the transpose of this
 46 |     function is its pseudo-inverse is
 47 | 
 48 |     win = np.sin(np.pi*(np.arange(1,blockSize+1)+0.5)/blockSize)
 49 |     (a typical value of blockSize = 1024)
 50 | 
 51 |     On input, if win is not specified, we return (coeff, win) so the
 52 |     caller has access to the window we used.
 53 | 
 54 |     This satisfies the Princen-Bradley conditions, meaning that we can
 55 |     guarantee  win**2+np.roll(win, int(blockSize/2))**2 == 1.
 56 | 
 57 |     Stephen Becker, 3/18/2017
 58 |     See also adjointShortTimeDCT
 59 |     """
 60 |     # Make a window if not provided by the user
 61 |     if win is None:
 62 |         blockSize = 1024
 63 |         win = np.sin(np.pi*(np.arange(1,blockSize+1)+0.5)/blockSize)
 64 |         return_win = True
 65 |     else:
 66 |         blockSize = win.size
 67 |         return_win = False
 68 | 
 69 |     # Zero-pad y so it is a multiple of blockSize
 70 |     N = y.size
 71 |     nBlocks = int(np.ceil(float(N)/blockSize))
 72 |     y = np.concatenate((y, np.zeros(nBlocks*blockSize-N)))
 73 | 
 74 |     # Apply DCT to aligned blocks
 75 |     Win = scipy.sparse.spdiags(win, [0], blockSize, blockSize)
 76 |     Y = np.reshape(y, (blockSize, nBlocks), order='f')
 77 |     C = scipy.fftpack.dct(Win*Y, axis=0, norm='ortho')
 78 | 
 79 |     # Apply DCT to 50% shifted blockSize
 80 |     Y = np.reshape(np.roll(y, int(-blockSize/2)), (blockSize, nBlocks),
 81 |         order='f')
 82 |     C2 = scipy.fftpack.dct(Win*Y, axis=0, norm='ortho')
 83 | 
 84 |     coeff = np.concatenate((C.ravel(order='f'), C2.ravel(order='f')))
 85 | 
 86 |     if return_win: return coeff, win
 87 |     else: return coeff
 88 | 
 89 | def adjointShortTimeDCT(coeff, win, Ntrue=None):
 90 |     """
 91 |     adjointShortTimeDCT(coeff, win, Ntrue=None) -> y
 92 | 
 93 |     Applies the adjoint/transpose Modified DCT to the coefficients coeff.
 94 |     This is also the pseudo-inverse of the forward MDCT.
 95 | 
 96 |     If Ntrue=N_original, where N_original is the original length of
 97 |     the signal y (i.e., before zero-padding in forwardShortTimeDCT),
 98 |     we truncate the padded zeros and return the original y.
 99 | 
100 |     See forwardShortTimeDCT for an example of the window win.
101 | 
102 |     Stephen Becker, 3/18/2017
103 |     See also forwardShortTimeDCT
104 |     """
105 |     if coeff.size % 2:
106 |         raise ValueError("""coeff should have an even number of elements.
107 |             Did you compute coeff with forwardShortTimeDCT?""")
108 |     N = int(coeff.size/2)
109 | 
110 |     blockSize = win.size
111 |     nBlocks = int(np.ceil(float(N)/blockSize))
112 | 
113 |     if Ntrue is not None and Ntrue > N:
114 |         raise ValueError("""The specified value of Ntrue ({}) is too big
115 |             for the number of coefficients in coeff ({})""".format(
116 |             Ntrue, N))
117 | 
118 |     Win = scipy.sparse.spdiags(win, [0], blockSize, blockSize)
119 | 
120 |     C = np.reshape(coeff[0:N], (blockSize, nBlocks), order='f')
121 |     Y = Win*scipy.fftpack.idct(C, axis=0, norm='ortho')
122 |     y = Y.ravel(order='f')
123 | 
124 |     C2 = np.reshape(coeff[N:], (blockSize, nBlocks), order='f')
125 |     Y2 = Win*scipy.fftpack.idct(C2, axis=0, norm='ortho')
126 |     y2 = np.roll(Y2.ravel(order='f'), int(blockSize/2))
127 |     y += y2
128 | 
129 |     if Ntrue:
130 |         y = y[0:Ntrue]
131 | 
132 |     return y
133 | 
def my_upsample(y, sampleSet, n):
    """
    my_upsample(y, sampleSet, n) -> x

    Scatter y into a zero array with n rows so that x[sampleSet] = y.
    For 1-D y the result has shape (n,); otherwise it has shape
    (n, y.shape[1]). Entries not indexed by sampleSet are zero.
    """
    is_vector = (y.ndim == 1)
    out_shape = n if is_vector else (n, y.shape[1])
    x = np.zeros(out_shape)
    if is_vector:
        x[sampleSet] = y
    else:
        x[sampleSet, :] = y
    return x
146 | 


--------------------------------------------------------------------------------
/Homeworks/HW10/test_python_functions.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This is just a simple script to help test the python routines and ensure they
 3 | match the MATLAB routines.
 4 | """
 5 | import numpy as np
 6 | import scipy, scipy.io
 7 | import pickle
 8 | 
 9 | from python_functions import *
10 | 
def convert_handel():
    """
    Convert handel.mat into pickle files.

    Reads the variables 'y' and 'Fs' from handel.mat and writes the tuple
    (y, Fs) to handel.pkl (default pickle protocol) and to handel2.pkl
    (protocol 2), then reads handel.pkl back in.
    """
    import pickle
    mat = scipy.io.loadmat('handel.mat')
    y = mat['y']
    Fs = mat['Fs']

    # Use context managers so the files are flushed and closed promptly
    with open('handel.pkl', 'wb') as f:
        pickle.dump((y,Fs), f) # NOT bw compatible w/ python2
    with open('handel2.pkl', 'wb') as f:
        pickle.dump((y,Fs), f, protocol=2)

    # load with
    with open('handel.pkl', 'rb') as f:
        y,Fs = pickle.load(f)
22 | 
def test_project_l1():
    """
    Project the array 'x' stored in x.mat onto the l1 ball of radius tau = 1
    and check that the result has the same shape and is feasible.
    """
    # JMF 25/03/2017: tested with row vec, col vec, and mats; matches project_l1.m
    mat = scipy.io.loadmat('x.mat')
    x = mat['x']

    tau = 1
    y = project_l1(x, tau)

    # Previously the result was computed but never checked
    assert y.shape == x.shape
    # Allow a small relative tolerance for floating-point rounding
    assert np.abs(y).sum() <= tau*(1 + 1e-10)
29 | 
def test_STDCT():
    """
    Run forwardShortTimeDCT / adjointShortTimeDCT on the signal in x.mat.

    Prints the reconstruction error ||x - adjoint(forward(x))|| and the
    distance between the computed window/coefficients and the 'win' and
    'coeff' arrays stored in x.mat.
    """
    mat = scipy.io.loadmat('x.mat')
    x = mat['x'].ravel()
    # NOTE(review): presumably the MATLAB reference outputs -- confirm
    coeff_ref = mat['coeff'].ravel()
    win_ref = mat['win'].ravel()

    coeff, win = forwardShortTimeDCT(x)

    xrec = adjointShortTimeDCT(coeff, win, x.size)
    print(np.linalg.norm(x-xrec))

    # The references used to be loaded but never compared. Only report the
    # discrepancy (no assert) and guard on shape so a mismatch is reported
    # instead of raising.
    for name, val, ref in (('win', win, win_ref), ('coeff', coeff, coeff_ref)):
        if val.shape == ref.shape:
            print('{}: ||computed - reference|| = {}'.format(
                name, np.linalg.norm(val-ref)))
        else:
            print('{}: shape mismatch, computed {} vs reference {}'.format(
                name, val.shape, ref.shape))
40 | 
def test_my_upsample():
    """
    Sample four rows of a random 10x2 array, scatter them back with
    my_upsample, and print both arrays for visual comparison.
    """
    x = np.random.randn(10, 2)
    sampleSet = np.array([0, 2, 3, 4])
    y = x[sampleSet]
    print(x)
    # n is the number of rows of x; x.size would be rows*cols (20) and give
    # an output with twice as many rows as x.
    print(my_upsample(y, sampleSet, x.shape[0]))
47 | 
def Manuel():
    """
    Load the 'y' and 'Fs' variables from handel.mat, print the signal
    length, and run forwardShortTimeDCT on the flattened signal.
    """
    mat = scipy.io.loadmat('handel.mat')
    y = mat['y']
    Fs = mat['Fs']
#   y,Fs = pickle.load(open('handel2.pkl', 'rb'))
    y = y.ravel()
    N = y.size
    # print() call: the Python 2 statement form `print N` is a SyntaxError
    # under Python 3 and prevents the whole script from being loaded.
    print(N)

    coeff, win = forwardShortTimeDCT(y)
58 | 
59 | 
# Script entry point: uncomment the routine(s) you want to run.
if __name__ == '__main__':
    #Manuel()
    #convert_handel()
    test_project_l1()
    #test_STDCT()
    #test_my_upsample()
66 | 


--------------------------------------------------------------------------------
/Homeworks/ProjectRubric.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Homeworks/ProjectRubric.pdf


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Stephen Becker
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Notes/00_IntroToOptProblems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/00_IntroToOptProblems.pdf


--------------------------------------------------------------------------------
/Notes/00a_metaRules.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/00a_metaRules.pdf


--------------------------------------------------------------------------------
/Notes/01_TypesOfMinimizers_IntroConvexity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/01_TypesOfMinimizers_IntroConvexity.pdf


--------------------------------------------------------------------------------
/Notes/02_ConvexSets_part1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/02_ConvexSets_part1.pdf


--------------------------------------------------------------------------------
/Notes/03_ConvexSets_part2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/03_ConvexSets_part2.pdf


--------------------------------------------------------------------------------
/Notes/04_SeparatingHyperplanes_Farkas.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/04_SeparatingHyperplanes_Farkas.pdf


--------------------------------------------------------------------------------
/Notes/05_ConvexFunctions_part1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part1.pdf


--------------------------------------------------------------------------------
/Notes/05_ConvexFunctions_part2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part2.pdf


--------------------------------------------------------------------------------
/Notes/05_ConvexFunctions_part3_LipschitzGradient.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part3_LipschitzGradient.pdf


--------------------------------------------------------------------------------
/Notes/05_ConvexFunctions_part4_examples.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part4_examples.pdf


--------------------------------------------------------------------------------
/Notes/05_ConvexFunctions_part5_preservingConvexity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/05_ConvexFunctions_part5_preservingConvexity.pdf


--------------------------------------------------------------------------------
/Notes/06_ConjugateFunctions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/06_ConjugateFunctions.pdf


--------------------------------------------------------------------------------
/Notes/07_GradientDescent_intro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/07_GradientDescent_intro.pdf


--------------------------------------------------------------------------------
/Notes/08_ExistenceUniquenessMinimizers.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/08_ExistenceUniquenessMinimizers.pdf


--------------------------------------------------------------------------------
/Notes/09_ProximityOperators.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/09_ProximityOperators.pdf


--------------------------------------------------------------------------------
/Notes/10_OptimizationProblems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/10_OptimizationProblems.pdf


--------------------------------------------------------------------------------
/Notes/11_FirstViewLagrangeMultipliers.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/11_FirstViewLagrangeMultipliers.pdf


--------------------------------------------------------------------------------
/Notes/12_ConicOptimizationProblems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/12_ConicOptimizationProblems.pdf


--------------------------------------------------------------------------------
/Notes/13_moreOnSDPs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/13_moreOnSDPs.pdf


--------------------------------------------------------------------------------
/Notes/14_LagrangianAndDualProblem.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/14_LagrangianAndDualProblem.pdf


--------------------------------------------------------------------------------
/Notes/15_MoreDuality.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/15_MoreDuality.pdf


--------------------------------------------------------------------------------
/Notes/16_SaddlePtsSharedLagrangians.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/16_SaddlePtsSharedLagrangians.pdf


--------------------------------------------------------------------------------
/Notes/17_GameTheoryConnections.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/17_GameTheoryConnections.pdf


--------------------------------------------------------------------------------
/Notes/18_FenchelRockafellarDuality.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/18_FenchelRockafellarDuality.pdf


--------------------------------------------------------------------------------
/Notes/19_KKT_and_complementarySlackness.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/19_KKT_and_complementarySlackness.pdf


--------------------------------------------------------------------------------
/Notes/22_ProximalGradientDescent_convergenceAnalysis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/22_ProximalGradientDescent_convergenceAnalysis.pdf


--------------------------------------------------------------------------------
/Notes/22a_ProximalGradientDescent_motivation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/22a_ProximalGradientDescent_motivation.pdf


--------------------------------------------------------------------------------
/Notes/23_NesterovAcceleration_convergenceAnalysis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/23_NesterovAcceleration_convergenceAnalysis.pdf


--------------------------------------------------------------------------------
/Notes/24_ConvergenceRates.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/24_ConvergenceRates.pdf


--------------------------------------------------------------------------------
/Notes/25_ConjugateGradientMethod.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/25_ConjugateGradientMethod.pdf


--------------------------------------------------------------------------------
/Notes/26_QuasiNewtonMethods.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/26_QuasiNewtonMethods.pdf


--------------------------------------------------------------------------------
/Notes/28_FindingGradientsByHand.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/28_FindingGradientsByHand.pdf


--------------------------------------------------------------------------------
/Notes/29_AutomaticDifferentiation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/29_AutomaticDifferentiation.pdf


--------------------------------------------------------------------------------
/Notes/29a_AdjointStateMethod.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/29a_AdjointStateMethod.pdf


--------------------------------------------------------------------------------
/Notes/30_GradientsParameterizedFunctions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/30_GradientsParameterizedFunctions.pdf


--------------------------------------------------------------------------------
/Notes/31_NewtonAndIPM.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/31_NewtonAndIPM.pdf


--------------------------------------------------------------------------------
/Notes/32_ADMM_DRS_PrimalDual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/32_ADMM_DRS_PrimalDual.pdf


--------------------------------------------------------------------------------
/Notes/33_LinearPrograms.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/33_LinearPrograms.pdf


--------------------------------------------------------------------------------
/Notes/34_IntegerLinearPrograms.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/34_IntegerLinearPrograms.pdf


--------------------------------------------------------------------------------
/Notes/README.md:
--------------------------------------------------------------------------------
1 | # Handwritten notes
2 | 
3 | Created via Microsoft OneNote with a Wacom One tablet, then exporting to PDF and using ghostscript/gs to reduce file-size using /screen preset
4 | 


--------------------------------------------------------------------------------
/Notes/appendix_notes_01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/appendix_notes_01.pdf


--------------------------------------------------------------------------------
/Notes/supplement_Geometry_Differentiability.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_Geometry_Differentiability.pdf


--------------------------------------------------------------------------------
/Notes/supplement_LagrangeMultipliersIn2D.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_LagrangeMultipliersIn2D.pdf


--------------------------------------------------------------------------------
/Notes/supplement_Slater_PrimalNotAchieved.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_Slater_PrimalNotAchieved.pdf


--------------------------------------------------------------------------------
/Notes/supplement_VariationalInequalities_and_LCP.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_VariationalInequalities_and_LCP.pdf


--------------------------------------------------------------------------------
/Notes/supplement_convergenceIteratesGradientDescent.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_convergenceIteratesGradientDescent.pdf


--------------------------------------------------------------------------------
/Notes/supplement_dualityPracticeProblem.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/Notes/supplement_dualityPracticeProblem.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Advanced Convex Optimization class
 2 | APPM 5630 at CU Boulder
 3 | Prof. Becker, Spring 2025. Meeting time: MWF 11:15 -- 12:05 PM, DUAN G2B21
 4 | 
 5 | Office hours:
 6 | - Mon 3-4:30 and Thursday 1-2:30, hybrid (in my office or via zoom; see Canvas for zoom link)
 7 | - Our TA Nic Rummel's office hours are Thurs 4:30-5:30
 8 | 
 9 | This repo contains in-class demos and some homework solutions (much of it is from Spring 2023 and 2021 or even Fall 2018 when this class was taught as a special topics course 4720/5720)
10 | 
11 | - Here is the [class policy/procedures](policies.md) document (and if you need it for some reason, the [2018 policies/syllabus (PDF)](APPM4720_5720_Fall2018_Syllabus.pdf)).
12 | - Here is the [syllabus (details on the content)](syllabus.md) document (and if you need it, the [day-by-day schedule](Fall2018_day-by-day_schedule.pdf) from Fall 2018 has even more details on what we covered)
13 | 
14 | List of previous semesters' class projects
15 | - [2023 class projects](SlideshowAllPresentations_5630_Spring23.pdf)
16 | - [2021 class projects](SlideshowAllPresentations_5630_Spring21.pdf)
17 | - [2018 class projects](SlideshowAllPresentations_4720Fall18.pdf)
18 | - [2017 class projects](https://amath.colorado.edu/faculty/becker/SlideshowAllPresentations_4720Spr17.pdf).  
19 | 
20 | We will follow instructors' notes. The supplementary texts we used most often:
21 | - [Convex Optimization by Boyd and Vandenberghe](http://www.stanford.edu/~boyd/cvxbook/), Cambridge University Press 2004
22 | - [First-Order Methods in Optimization by Amir Beck](https://epubs.siam.org/doi/book/10.1137/1.9781611974997), SIAM 2017, see also [SIAM website](http://bookstore.siam.org/mo25/)
23 | - [Convex Analysis and Monotone Operator Theory in Hilbert Spaces by Bauschke and Combettes](https://link.springer.com/book/10.1007%2F978-3-319-48311-5), 2nd edition, Springer 2017
24 | 
25 | More details on textbooks in the [syllabus](syllabus.md).
26 | 


--------------------------------------------------------------------------------
/SlideshowAllPresentations_4720Fall18.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/SlideshowAllPresentations_4720Fall18.pdf


--------------------------------------------------------------------------------
/SlideshowAllPresentations_5630_Spring21.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/SlideshowAllPresentations_5630_Spring21.pdf


--------------------------------------------------------------------------------
/TypedNotes/APPM5720Notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/TypedNotes/APPM5720Notes.pdf


--------------------------------------------------------------------------------
/TypedNotes/README.md:
--------------------------------------------------------------------------------
 1 | # APPM 5630 at CU Boulder: typed up notes
 2 | 
 3 | 
 4 | These are latex notes from the Fall 2018 class typed up by Mitchell Krock.
 5 | 
 6 | For future semesters, these notes might be modified/added/corrected (you can make a pull request if you have a change, or if you want to be very active, we can add you on the repo)
 7 | 
 8 | Note: the compiled pdf might not always be up-to-date
 9 | 
10 | In addition, a verbose version of lecture notes for Spring 2021 is available at lecture_notes.pdf by Jaden Wang. It is perhaps most useful for lectures not posted on github and ease of locating information in a single typed PDF, although occasional extra insights might be helpful. Source files are in the lecture_notes_tex folder.
11 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/TypedNotes/lecture_notes.pdf


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_01.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \chapter{Theoretical Foundation}
  6 | \newpage
  7 | \section{Introduction}
  8 | An optimization problem looks like
  9 | \[
 10 | 	\min_{x \in C} f(x)
 11 | \]
 12 | where $ f(x)$ is the  \allbold{objective function} and $ C \subseteq \rr^n$ is the \allbold{constraint set}. $ C$ might look like
 13 |  \[
 14 | 	 C=\{x: g_i(x) \leq 0 \ \forall \ i=1,\ldots,m\} 
 15 | .\] 
 16 | 
 17 | \begin{remark}
 18 | We can always turn a maximization problem into a minimization problem as the following:
 19 | \[
 20 | 	\min_x f(x) = -\max_x -f(x)
 21 | .\] 
 22 | Therefore, WLOG, we will stick with minimization.
 23 | 
 24 | \end{remark}
 25 | 	
 26 | \begin{eg}
 27 | 	An assistant professor earns \$100 per day, and they enjoy both ice cream and cake. The optimization problem aims to maximize the utility ( \emph{e.g.} happiness) of ice cream $ f_1(x_1)$ and of cake $ f_2(x_2)$. The constraints we have are that $ x_1\geq 0, x_2 \geq 0$, and $ x_1+x_2 \leq 100$.
 28 | 
 29 | 	To maximize both utilities, it might be natural to define
 30 | 	\[
 31 | 		F(\vec{x}) = \begin{pmatrix} f_1(x_1)\\f_2(x_2) \end{pmatrix}, \vec{ x} = \begin{pmatrix} x_1\\x_2 \end{pmatrix}  
 32 | 	\]
 33 | and maximize $ F$. However, this isn't a well-defined problem, because  \emph{there is no total order on $ \rr^n$}! That is, we don't have a good way to compare whether a vector is bigger than another vector, except in the cases when the same direction of inequality can be achieved for all components of two vectors and a partial order can be established. For this kind of \allbold{multi-objective} optimization problem, we can look for Pareto-optimal points in these special cases. We can also try to convert the output into a scalar as the following:
 34 | \[
 35 | 	\max_x f_1(x_1) + \lambda \cdot f_2(x_2)
 36 | \]
 37 | for some $ \lambda>0$ that reflects our preference for cake vs ice cream. But this can be subjective.
 38 | 
 39 | \end{eg}
 40 | 
 41 | 
 42 | Thus, for the remainder of this class, we are only going to consider scalar-valued objectives $ f: \rr^n \to \rr$. 
 43 | \\
 44 | 
 45 | Moreover, for $ f: \rr \to \rr$, it's very easy to solve by using root finding algorithms or grid search. So since interesting problems occur with vector inputs, we will simply use $ x$ to represent vectors.
 46 | 
 47 | \begin{notation}
 48 | 	$ \min$ asks for the minimum value, whereas $ \arg\min$ asks for the minimizer that yields the minimum value.
 49 | \end{notation}
 50 | \newpage
 51 | \subsection{Lipschitz continuity}
 52 | \begin{eg}
 53 | Let's consider a variant of the Dirichlet function, $ f: \rr \to \rr$
 54 | \begin{equation*}
 55 | 	f(x)=
 56 | \begin{cases}
 57 | 	x & \text{ if } x \in \qq\\  
 58 | 	1 & \text{ if } x \in \rr \setminus \qq 
 59 | \end{cases}
 60 | \end{equation*}
 61 | Then the solution to the problem
 62 | \[
 63 | 	\min_{x \in [0,1]} f(x) = 0
 64 | \] 
 65 | is $ x=0$ by observation. However, the function is not smooth and a small perturbation can yield wildly different values. Thus, it is not tractable to solve this numerically.
 66 | \end{eg}
 67 | 
 68 | This requires us to add a smoothness assumption:
 69 | \begin{defn}
 70 | 	$ f: \rr^{n} \to \rr$ is \allbold{$L$-Lipschitz continuous} with respect to a norm $ \norm{ \cdot } $ if for all $ x, y \in \rr^{n}$,
 71 | \[
 72 | 	|f(x) - f(y)|\leq L \cdot \norm{x-y} 
 73 | .\]
 74 | \end{defn}
 75 | 
 76 | \begin{note}
 77 | 	Lipschitz continuity implies continuity and uniform continuity. It is a stronger statement because it tells us \emph{how} the function is (uniformly) continuous. However, it doesn't require differentiability. 
 78 | \end{note}
 79 | 
 80 | \begin{defn}
 81 | For $ 1\leq p < \infty$,
 82 |  \[
 83 | 	 \norm{x}_p = \left( \sum_{ i= 1}^{ n} |x_i|^p \right)^{\frac{1}{p}}  
 84 | .\] 
 85 | For $ p = \infty$,
 86 | \[
 87 | \norm{x}_{\infty} = \max_{1\leq i\leq n} |x_i|
 88 | .\]
 89 | \end{defn}
 90 | 
 91 | \begin{remark}
 92 | $ \norm{x}_1 $ and $ \norm{x}_2^2 $ have separable terms as they are sums of their components. $ \norm{x}_2^2 $ is also differentiable which makes it the nicest norm to optimize.
 93 | \end{remark}
 94 | 
 95 | \begin{eg}
 96 | 	Let $ f: \rr^{n} \to \rr$ be $ L$-Lipschitz continuous w.r.t.  $ \norm{ \cdot }_{\infty} $. Let $ C = [0,1]^{n}$, \emph{i.e.} in $ \rr^{2}$, $ C$ is a square. To solve the problem
 97 | 	\[
 98 | 		\min_{x \in C} f(x)
 99 | 	,\]
100 | 	since we have so few assumptions, there is no better method (in the worst-case sense) than the \allbold{uniform grid method}. The idea is that we pick $ p+1$ points in each dimension,  \emph{i.e.} $ \{0,\frac{1}{p},\frac{2}{p},\ldots,1\} $, so we would have $ (p+1)^{n}$ points in total.
101 | 
102 | 	Let $ x^* $ be a global optimal point, then there exists a grid point $ \widetilde{ x}$ s.t.  \[
103 | 	\norm{ x^* -\widetilde{ x}}_{\infty} \leq \frac{1}{2} \cdot  \frac{1}{p} 
104 | 	.\]
105 | 	Thus by Lipschitz continuity, 
106 | 	\begin{align*}
107 | 		|f(x^* ) - f(\widetilde{ x})| &\leq L \cdot \norm{ x^* -\widetilde{ x}}_{\infty} \\
108 | 		&\leq \frac{1}{2} \frac{L}{p}
109 | 	\end{align*} 
110 | 	So we can find $ \widetilde{ x}$ by taking the discrete minimum of all $ (p+1)^{n}$ grid points.\\
111 | 
112 | 	In (non-discrete) optimization, we usually can't exactly find the minimizer, but rather find something very close.
113 | 
114 | \begin{defn}
115 | 	$ x$ is an \allbold{$ \epsilon$-optimal solution} to $ \min_{x \in C} f(x)$ if $ x \in C$ and
116 | 	\[
117 | 		f(x)-f^*  \leq \epsilon
118 | 	\]
119 | 	where $ f^* = \min_{x \in C} f(x)$.
120 | \end{defn}
121 | 
122 | Our uniform grid method gives us an $ \epsilon$-optimal solution with $ \epsilon = \frac{L}{2p}$, and requires $ (p+1)^{n}$ function evaluations. Writing $ p$ in terms of  $ \epsilon$, we have $ p=\frac{L}{2 \epsilon}$ so equivalently it requires $ \left( \frac{L}{2 \epsilon} + 1 \right)^{n} $ function evaluations, which scales like $ \epsilon^{-n}$. 
123 | 
124 | For $ \epsilon = 10^{-6}$, $ n=100$, it requires  $ 10^{600}$ function evaluations. This is really bad!
125 | 
126 | Take-aways from this example:
127 | \begin{itemize}
128 | 	\item curse-of-dimensionality: there can be trillions of variables in a Google Neural Network. It would be intractable using the grid method.
129 | 	\item we need more assumptions to allow us to use more powerful methods.
130 | \end{itemize}
131 | \end{eg}
132 | 
133 | \subsection{Categorization}
134 | \begin{figure}[H]
135 | 	\hspace*{-4cm}
136 | 	\includegraphics[width=1.6\textwidth]{./figures/categorization.jpg}
137 | \end{figure}
138 | \newpage
139 | \end{document}
140 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_02.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | 
 6 | \subsection{Minimizers}
 7 | 
 8 | We are given a generic problem $ \min_{x \in C} f(x), C \subseteq \rr^{n}$. Then a \allbold{feasible point} $ x$ means  $ x \in C$. A \allbold{solution} or \allbold{minimizer} or \allbold{global minimizer}  $ x^* $ means
 9 | \begin{enumerate}[label=\arabic*)]
10 | 	\item $ x^* \in C$
11 | 	\item $ \ \forall \ y \in C, f(x^* )\leq f(y)$
12 | \end{enumerate}
13 | It might not be unique, \emph{i.e.} $ x^* \in \arg \min_{x \in C} f(x)$.
14 | \begin{eg}
15 | \[
16 | 	\min_{x \in \rr} f(x) \text{ where } f(x)=0 \ \forall \ x 
17 | .\] 
18 | 
19 | \end{eg}
20 | Sometimes the solution may not exist (even for convex problems).
21 | \begin{eg}
22 | \[
23 | 	\min_{ x \in (0,1)} x^2 
24 | .\] 
25 | \end{eg}
26 | 
27 | $ x^* $ is a \allbold{local minimizer} if $ x^* $ is feasible and there exists an $ \epsilon>0$ s.t. $ f(x^* )\leq f(y)$ $ \ \forall \ y \in C \cap B_{ \epsilon} ( x^*  )$, where $ B_{ \epsilon} ( x^*  ) \coloneqq \{y: \norm{ y-x^* } < \epsilon \}$. A \allbold{strict local minimizer} simply doesn't achieve equality (for $ y \neq x^* $). $ x^* $ is an \allbold{isolated local minimum} if it is a local minimum and no other local minima are nearby. Notice that isolated implies strict but the converse is false.
28 | 
29 | \begin{eg}[strict but not isolated]
30 | \begin{equation*}
31 | 	f(x)=
32 | \begin{cases}
33 | 	x^{4} \cos \left( \frac{1}{x} \right) +2x^{4} & x\neq 0\\
34 | 	0 & x=0\\
35 | \end{cases}
36 | \end{equation*}
37 | $ x^* =0 $ is strict but not isolated due to the rapid oscillation near $ x=0$.
38 | \end{eg}
39 | 
40 | \begin{notation}
41 | 	$ f \in \mathcal{ C}^3$ means $f,f',f'',f''' $ all exist and are continuous. $ f \in \mathcal{ C}^3( \rr^{n})$ means $ f, \nabla f, \nabla^2 f, \nabla^3 f$ all exist and are continuous.
42 | \end{notation}
43 | 
44 | \subsubsection{Connections with Calculus 1}
45 | Recall that in Cal 1, we first find the stationary/critical points in the domain. Then we add the boundary points and minimize over the small (finite) set of candidates.
46 | 
47 | In high-dimensional optimization, we cannot check critical points and the boundary separately because the set of points in the boundary becomes infinite. Moreover, there can be infinitely many critical points too.
48 | 
49 | Necessary condition: if $ x^* $ is a local or global minimizer and $ C = \rr^{n}$, then $ x^* $ is a \allbold{critical point}. But the converse is false.
50 | 
51 | \begin{notation}
52 | 	The boundary of $ C$ is denoted as  $ \partial C \coloneqq \overline{C} \setminus \inte C$.
53 | \end{notation}
54 | 
55 | If $ x^* $ is a critical point but is not a local or global minimizer (nor a local or global maximizer), then it's a \allbold{saddle point}.
56 | 
57 | \begin{thm}[Weierstrass]
58 | If $ f$ is continuous and  $ C$ is compact, then  $ f$ achieves its infimum over  $ C$. That is,
59 |  \[
60 | 	 \inf_{x \in C} f(x) = \min_{x \in C} f(x)
61 | .\] 
62 | \end{thm}
63 | \begin{note}
64 | This is pretty much the same as the Extreme Value Theorem.
65 | \end{note}
66 | 
67 | \begin{proof}
68 | 	First let's prove a claim.
69 | 	\begin{claim}
70 | 		Every compact set $ K$ is closed and bounded. 
71 | 	\end{claim}
72 | 
73 | 	Closed: suppose not, the compact set $ K$ doesn't contain all its limit points. That is, there exists a limit point $ x \not\in K$ s.t. a sequence  $ (x_n) \subseteq K$ converges to $ x$. But that also means that all subsequences of $ (x_n)$ converge to $ x \not\in K$ as well, contradicting the definition of compactness that for every sequence in $ K$ there exists a subsequence that converges inside $ K$. 
74 | 
75 | 	Bounded: suppose not, for all $ M > 0$, there exists an  $ x \in K$ s.t. $ \norm{ x} > M $. This allows us to find a sequence $ (x_n) \subseteq K$ s.t. $ \norm{ x_n} > n$. This way every subsequence is also unbounded and cannot converge, contradicting the definition of sequential compactness.
76 | 	
77 | 	Now let's begin proof proper. Since $ C$ is compact and  $ f$ is continuous, the image of  $ C$ under  $ f$,  $ f(C)$, is also compact (this follows from sequential definition of continuity). By the claim $f(C) $ is bounded and closed, meaning that it has an infimum (completeness axiom) and contains the infimum (closed). Thus, $ f$ achieves its infimum over $ C$.
78 | \end{proof}
79 | \begin{remark}
80 | It would be nice if our constraints $ C$ are compact. But at the very least, we want our constraint sets to be closed. For example,  $ \norm{ Ax -b} \leq \epsilon $ instead of $ \norm{ Ax -b} < \epsilon $.
81 | \end{remark}
82 | 
83 | Several things to note about the feasible set $ C$:
84 | 
85 | If $ C = \O$, the problem is infeasible. This is not always easy to spot.
86 | 
87 | In this class, $ C$ will usually be convex and not integral (\emph{i.e.} not a subset of $ \zz^{n}$).
88 | 
89 | An integral constraint is problematic because the optimal integer solution might not be at all close to the optimal real solution, so we cannot obtain it by solving for the real solution first and then rounding it.
90 | 
91 | \end{document}
92 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_05.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | 
  6 | \subsection{separating and supporting hyperplanes}
  7 | ~\begin{thm}[separating hyperplane]
  8 | Let $ C,D$ be convex, non-intersecting sets in  $ \rr^{n}$, then there exists $ a \in \rr^{n} \setminus \{0\} $ and $ \mu \in \rr$ s.t. 
  9 | \begin{align*}
 10 | 	a^{T}x \leq \mu \ \forall \ x \in C\\
 11 | 	a^{T} x \geq \mu \ \forall \ x \in D
 12 | \end{align*}
 13 | \end{thm}
 14 | \begin{note}
 15 | This reads as there exists a hyperplane that separates the two convex sets. It is clearly not true if the sets aren't convex. $ a$ is the normal to the hyperplane.
 16 | \end{note}
 17 | 
 18 | \begin{defn}[Chebyshev set]
 19 | A set $ S$ is a  \allbold{Chebyshev set} if for all $ x_0$, there exists a unique $ x \in S$ s.t. 
 20 | \[
 21 | x = \argmin_{y \in S} \norm{ y-x_0} 
 22 | .\] 
 23 | \end{defn}
 24 | \begin{note}
 25 | This reads as there exists a unique best approximation point in the set $ S$ for any $ x_0$.
 26 | \end{note}
 27 | \begin{eg}
 28 | Open unit ball isn't Chebyshev because it doesn't reach infimum.
 29 | \end{eg}
 30 | \begin{eg}
 31 | A nonconvex set isn't Chebyshev because there exists an $ x_0$ where we have at least two best approximation points.
 32 | \end{eg}
 33 | \begin{thm}
 34 | Any nonempty, closed, convex set in a Hilbert space is Chebyshev.
 35 | \end{thm}
 36 | 
 37 | \begin{thm}[supporting hyperplanes]
 38 | ~\begin{enumerate}[label=(\roman*)]
 39 | 	\item If $ C$ is convex, closed and  $ D = \{x_0\}, x_0 \not\in C $, then there exists $ a \in \rr^{n}$ s.t. $ a^{T}x< a^{T}x_0 \ \forall \ x \in C$.
 40 | 	\item Same but $ C$ needs not be closed,  $ x_0 \not\in  \overline{C}$.
 41 | 	\item as in (ii) but allow $ x_0 \in \overline{C}\setminus C$.
 42 | \end{enumerate}
 43 | \end{thm}
 44 | \begin{proof}
 45 | 	(i): WLOG let $ x_0 = 0$ (since we can always translate $ C$). $ C$ is Chebyshev so let  $ y$ be the unique closest point to  $ 0$, and define  $ a=-y$ (normal of the hyperplane). We wish to show that $ a^{T}x< a^{T} x_0 =0 \ \forall \ x \in C$. That is, $ y^{T} x>0 \ \forall \ x \in C$.
 46 | 
 47 | 	Given $ x \in C$, $ y + \epsilon (x-y) \in C$ for every $ \epsilon \in (0,1]$ by convexity. Since $ y$ is the best approximation point,
 48 | 	\begin{align*}
 49 | 		\norm{ y}^2 &\leq \norm{ y + \epsilon(x-y)}^2 \\
 50 | 		&= \norm{ y}^2 + 2 \epsilon\langle y,x-y \rangle+ \epsilon^2 \norm{ x-y}^2   \\
 51 | 		0&\leq 2 \langle y,x \rangle - 2 \langle y,y \rangle + \epsilon \norm{ x-y}^2  \\
 52 | 		\langle y,x \rangle &\geq \norm{ y}^2 -\frac{ \epsilon}{2} \norm{ x-y}^2 
 53 | 	\end{align*}
 54 | 	Take $ \epsilon \to 0$, since $ y \neq 0 \implies \norm{ y}>0 $, we obtain $ y^{T}x>0$ as required.
 55 | \end{proof}
 56 | 
 57 | \begin{remark}
 58 | 	This is related to \allbold{Theorems of Alternatives}. Generally, they are stated as the following:
 59 | 
 60 | Either $ A$ is true or $ B$ is true, but not both.
 61 | \end{remark}
 62 | \begin{eg}[Fredholm alternative, finite-dim]
 63 | 
 64 | Either $ \{x: Ax=b\} $ is non-empty, or $ \{\lambda: A^{T} \lambda =0, \lambda^{T}b \neq 0\} $ is non-empty, but not both.
 65 | 
 66 | Why do we care? To prove that there is a solution to $ Ax=b$, we can simply find a solution  $ x$. This is a "certificate". But if the professor asks you to prove there isn't a solution to  $ Ax=b$, we can try to show that  $ A$ is singular, but this is not enough: if  $ b=0$, even a singular  $ A$ admits a solution. Another way is to find a "certificate"  $ \lambda$. This is the first task of duality.
 67 | \end{eg}
 68 | 
 69 | 
 70 | \begin{eg}[Farkas Lemma]
 71 | Either $ \{Ax=b,x\geq 0\} $ is non-empty, or $ \{\lambda: A^{T} \lambda\geq 0, \lambda^{T}b<0\} $ is non-empty, but not both.
 72 | \end{eg}
 73 | \begin{thm}[Theorem of Alternatives for strict linear inequalities]
 74 | The following statements are equivalent:
 75 | \begin{enumerate}[label=(\roman*)]
 76 | 	\item The set $ \{x: Ax<b\} $ is empty.
 77 | 	\item The sets $ C = \{b-Ax : x \in \rr^{n}\} $ and $ D = \rr_{++}^{m}$ do not intersect.
 78 | 	\item The hyperplane separation theorem and its converse hold. That is,
 79 | 		\[
 80 | 			\ \exists \ \lambda \geq 0 ( \lambda \neq 0) \text{ s.t. } A^{T} \lambda = 0, \lambda ^{T} b \leq 0 
 81 | 		.\] 
 82 | \end{enumerate}
 83 | \end{thm}
 84 | \begin{intuition}
 85 | 	(ii) is just rephrasing (i). No intersection from (ii) can then be established by finding something that separates $ C,D$ in (iii).
 86 | \end{intuition}
 87 | \begin{proof}(converse of hyperplane separation)
 88 | 
 89 | 	(iii) $ \implies$ (i): suppose such $ \lambda$ exists, and for contradiction, assume there exists $ x$  s.t. $ Ax < b$. Then since $ \lambda \geq 0$,
 90 | \[
 91 | 	0= (A^{T} \lambda)^{T} x = \lambda^{T} Ax < \lambda ^{T} b
 92 | .\] 
 93 | So we obtain $ 0< \lambda^{T} b \leq 0$, a contradiction.
 94 | 
 95 | (i) $ \implies$ (iii): By the separation theorem, we know there exists $ \lambda \neq 0$ s.t. 
 96 | \begin{align*}
 97 | 	\lambda^{T}(b-Ax) &\leq \mu, x \in \rr^{n}\\
 98 | 	\lambda^{T} y &\geq \mu, y \in \rr_{++}^{m}
 99 | \end{align*}
100 | It follows from the first condition that $ \lambda^{T}Ax =0$ because otherwise we can just choose a large negative $ x$ to exceed $ \mu$ and get a contradiction. Since this is true for all $ x$, it must be that  $ \lambda^{T} A = A^{T} \lambda =0$. From the second condition we have $ \lambda \geq 0$, because otherwise if $ \lambda_i<0$, we can choose $ y_i \to \infty$ to get a contradiction. Moreover, we need $ \mu \leq 0$ since if $ \mu>0$, we can take all components of $ y $ to $ 0^{+} $, so $ \lambda^{T} y \to 0^{+} $. Then $ \lambda^{T}(b-Ax)\leq \mu \leq 0$ together with $ \lambda^{T}Ax = 0$ implies that $ \lambda^{T} b \leq 0$.
101 | 
102 | Taken together, we have $ \lambda \geq 0, \lambda \neq 0$, $ A^{T} \lambda =0,$ and $ \lambda^{T}b \leq 0$.
103 | \end{proof}
104 | \newpage
105 | \end{document}
106 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_07.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsection{First-order conditions}
  6 | 
  7 | ~\begin{thm}
  8 | 	If $ f: \rr^{n} \to \rr$ is differentiable on $ \dom(f)$ and if $ \dom(f)$ is open and convex, then $ f$ is convex iff for all  $ x,y \in \dom(f)$,
  9 | 	\[
 10 | 		f(y) \geq f(x) + \langle \nabla f(x),y-x \rangle
 11 | 	.\] 
 12 | \end{thm}
 13 | \begin{note}
 14 | 	This is the 1st order Taylor approximation (tangent line). The line is supporting the epigraph of $ f$.
 15 | \end{note}
 16 | ~\begin{figure}[H]
 17 | 	\centering
 18 | 	\includegraphics[width=\textwidth]{./figures/cvx_tan.png}
 19 | \end{figure}
 20 | \begin{thm}
 21 | 	Under the same assumption, $ f$ is convex iff $\nabla  f$ is monotone. That is, for all $ x,y \in \dom(f)$,
 22 | 	\[
 23 | 		\langle x-y, \nabla f(x) - \nabla f(y) \rangle \geq 0
 24 | 	.\] 
 25 | \end{thm}
 26 | \begin{intuition}
 27 | 	Recall in 1D, $ f$ is convex if slope is non-decreasing. That is, if $ x-y\geq 0$, then  $ f'(x) - f'(y) \geq 0$ and if $ x-y \leq 0$ then  $ f'(x)-f'(y)\leq 0$. A concise way to express that is $ (x-y)(f'(x) - f'(y)) \geq 0$. Here we generalize this to higher dimensions.
 28 | \end{intuition}
 29 | 
 30 | \begin{thm}[2nd-order condition]
 31 | 	$ f: \rr^{n} \to \rr$. If the Hessian $ \nabla^2 f(x)$ exists for all $ x \in \dom(f)$, then
 32 | 	\begin{enumerate}[label=\alph*)]
 33 | 		\item $ f$ is convex iff  $ \nabla ^2 f(x) \succeq 0 \ \forall \ x \in \dom(f)$.
 34 | 		\item $ f$ is  $ \mu$-strongly convex ( w.r.t. $ \norm{ \cdot }_2 $ ) iff $ \nabla ^2 f(x) \succeq \mu I$.
 35 | 	\end{enumerate}
 36 | 	If $ \nabla ^2 f(x) \succ 0$, then $ f$ is \allbold{strictly convex}. 
 37 | \end{thm}
 38 | \begin{remark}
 39 | $ f$ can be convex but  $ \nabla f, \nabla ^2 f$ need not exist!
 40 | \end{remark}
 41 | What if $ f$ isn't differentiable?
 42 | 
 43 |  \begin{defn}[subdifferential]
 44 | 	 Let $ f: \rr^{n} \to (-\infty, \infty]$ be proper, then we define the \allbold{subdifferential} of $ f$ at $ x$ to be
 45 | 	 \[
 46 | 		 \partial f(x) = \{d \in \rr^{n}: \ \forall \ y \in \rr^{n}, f(y) \geq f(x) + \langle d,y-x \rangle\} 
 47 | 	 .\] 
 48 | \end{defn}
 49 | \begin{note}
 50 | $ d$ here is called a  \allbold{subgradient}. 
 51 | \end{note}
 52 | \begin{thm}
 53 | If $ f$ is proper and convex then
 54 |  \[
 55 | 	 x \in \ri(\dom(f)) \implies \partial f(x) \neq \O
 56 | .\] 
 57 | \end{thm}
 58 | \begin{note}
 59 | The proof is related to separating/supporting hyperplanes.
 60 | \end{note}
 61 | \begin{prop}
 62 | 	$ \partial f(x) $ is a singleton  iff  $ f$ is differentiable at  $ x$.
 63 | \end{prop}
 64 | 
 65 | \begin{eg}
 66 | 	$ f(x) = |x|$. Then if  $ x\neq 0$,  $ f'(x) = \sgn(x)$ and  $ \partial f(x) = \{f'(x)\} $. If $ x=0$,  $ f'(0)$ DNE. But $ \partial f(0) = [-1,1] $.
 67 | \end{eg}
 68 | 
 69 | \begin{thm}[Fermat's Rule]
 70 | If $ f$ is a proper function, then
 71 |  \[
 72 | 	 \argmin_{x} f(x) = \{x: 0 \in \partial f(x)\} 
 73 | .\] 
 74 | \end{thm}
 75 | \begin{proof}
 76 | This just means that we can plug $ 0$ into the definition of subdifferential and get
 77 |  \[
 78 | 	 f(y) \geq f(x) + \langle 0, y-x \rangle = f(x) \ \forall \ y
 79 | .\] 
 80 | This clearly shows that $ x$ is a global minimizer. 
 81 | \end{proof}
 82 | 
 83 | \begin{note}
 84 | This generalizes the calculus idea of critical points for smooth functions.
 85 | \end{note}
 86 | 
 87 | \begin{remark}
 88 | 	Subdifferentials are a global notion (for all $ y$) whereas gradients are a local notion. How do we reconcile that subdifferential can be the gradient? The answer is that the global property of convexity links the two.
 89 | \end{remark}
 90 | \begin{remark}
 91 | So all we need to do is to invert $ \partial f$. That is,
 92 |  \[
 93 | 	 \argmin f(x) = (\partial f)^{-1}(0)
 94 | .\]
 95 | In fact, this is usually not practical or even possible especially for interesting problems. It may be possible for subproblems.
 96 | \end{remark}
 97 | \begin{defn}[normal cone]
 98 | The \allbold{normal cone} to a set $ C$ at point  $ x$ is
 99 | \begin{equation*}
100 | 	N_C(x)=
101 | \begin{cases}
102 | 	\{d: \langle d,y-x \rangle \leq 0 \ \forall \ y \in C\} & \text{ if } x \in C\\
103 | 	\O & \text{ if } x \not\in C 
104 | \end{cases}
105 | \end{equation*}
106 | \end{defn}
107 | \begin{eg}
108 | Let $ C \neq \O$ be convex, so $ I_C$ is a proper convex function. Then  $ \partial I_C = N_C$.
109 | \end{eg}
110 | \begin{eg}
111 | 	$ x \in \inte C \implies N_C(x) = \{0\} $. Why? WLOG, shift $ C$ so  $ x=0$. Suppose  $ \langle d,y \rangle\leq 0 \ \forall \ y \in C$. Since $ x = 0 \in \inte C$, we can choose $y = \epsilon d \in C$ for sufficiently small $ \epsilon >0$. Then $ \epsilon \norm{ d}^2 \leq 0 \implies d=0 $.
112 | \end{eg}
113 | \begin{eg}
114 | 	$ x \in \partial C$ (the boundary). Again shifting $ C$ so that $ x=0$, we want $ d$  s.t. $ \langle d,y \rangle\leq 0 \ \forall \ y \in C$. Geometrically this means we want the angle between $ d,y$ to be perpendicular or obtuse. If the boundary is smooth, since  $ d$ needs to be at least perpendicular to any  $ y$ immediately to the left and right of  $ x$, it must be the normal ray of the tangent plane.
115 | \end{eg}
116 | 
117 | \begin{eg}
118 | 	~\begin{figure}[H]
119 | 		\centering
120 | 		\includegraphics[width=0.8\textwidth]{./figures/normal_cone.png}
121 | 		\caption{The normal cone at non-smooth boundary looks indeed like a cone.}
122 | 	\end{figure}
123 | \end{eg}
124 | \begin{remark}
125 | An equivalent definition of normal cone is the set of all vectors that define a supporting hyperplane to $ C$, passing through  $ x$.
126 | \end{remark}
127 | 
128 | \begin{eg}
129 | If $ C$ is a vector space, since $ C$ is closed under inverses, if we use  $ -y$ in addition to $ y$ in the definition we will get an equality which implies orthogonality. Hence
130 |  \begin{equation*}
131 | 	 N_C(x)=
132 | \begin{cases}
133 | 	C^{\perp} & x \in C\\
134 | 	\O & x \not\in C
135 | \end{cases}
136 | \end{equation*}
137 | \end{eg}
138 | 
139 | \begin{prop}[6.47 BC17]
140 | 	If $ C \neq \O$ is closed and convex, then $ x=P_C(y)$ iff  $ y-x \in N_C(x)$, where $ P_C(y)$ denotes the orthogonal projection of  $ y$ onto  $ C$.
141 | \end{prop}
142 | \end{document}
143 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_08.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \newpage
 6 | \subsection{Calculus}
 7 | 
 8 | \begin{remark}
 9 | Calculus is a set of rules we can use to calculate.
10 | \end{remark}
11 | 
12 | One such rule is that derivatives/gradients are linear.
13 | 
14 | Is it true that $ \partial (f+g) = \partial f+\partial g$, where $ +$ is the Minkowski sum? No! Although it's often true.
15 | 
16 | \begin{eg}
17 | $ f=I_C, g= I_D$ with $ C, D \subseteq \rr^{2}$. 
18 | ~\begin{figure}[H]
19 | 	\centering
20 | 	\includegraphics[width=\textwidth]{./figures/counter_linear.png}
21 | \end{figure}
22 | 
23 | Then $ \partial (f+g)(x) = \partial f(x) + \partial g(x)$ for all $ x$ except at  $ x=0$. At  $ x=0$, recall that 
24 |  \begin{align*}
25 | 	 \partial f(0) = N_C(0) &= \rr_{+} \times \{0\} \\
26 | 	 \partial g(0) = N_D(0) &= \rr_{-} \times \{0\} 
27 | \end{align*}
28 | So $ \partial f(0) + \partial g(0) = \rr \times \{0\} $.
29 | But
30 | \begin{align*}
31 | 	\partial (f+g) (0) &= N_{C \cap D}(0) \qquad \qquad \qquad  \text{ by def of indicator} \\
32 | 	&= N_{\{0\}}(0) \\
33 | 	&= \{d: \langle d,y-0 \rangle \leq 0 \ \forall \ y \in \{0\} \} \quad  \text{ vacuous constraint} \\
34 | 	&= \rr^2 
35 | \end{align*}
36 | We can see this counterexample is somewhat contrived, so linearity is often true.
37 | \end{eg}
38 | 
39 | \begin{remark}
40 | 	Sufficient conditions to guarantee when this linearity is true are called \allbold{constraint qualifications (CQ)}.
41 | \end{remark}
42 | \begin{coro}[16.48 (iv) BC17]
43 | 	If $ f,g \in \Gamma_0(\mathcal{H})$, and $\mathcal{H} = \rr^{n} $, then $ \partial (f+g) = \partial f + \partial g$ provided one of the following holds:
44 | 	\begin{enumerate}[label=(\roman*)]
45 | 		\item $ \ri(\dom(f)) \cap \ri(\dom(g)) \neq \O $.
46 | 		\item $ \dom(f) \cap \inte (\dom(g)) \neq \O$.
47 | 		\item either $ f$ or  $ g$ has full domain (all of $ \rr^{n}$).
48 | 	\end{enumerate}
49 | \end{coro}
50 | \begin{note}
51 | 	(iii) is most commonly used.
52 | \end{note}
53 | Since the previous example didn't satisfy a CQ, the linearity didn't hold. That is, $ \dom f = C, \dom g =D, \inte C \cap \inte D = \O$.
54 | \begin{remark}
55 | There are other cones including \allbold{tangent, polar, recession/asymptotic, and barrier cones}. 
56 | \end{remark}
57 | \newpage
58 | \subsection{Lipschitz gradient}
59 | An easier way to show $ F$ is Lipschitz-continuous: if $ F'$ exists, then  $ |\norm{ F'}| \leq L \implies F$ is Lipschitz continuous (by the definition of derivative/Jacobian and some manipulation).
60 | \begin{notation}
61 | 	$ |\norm{ \cdot } |$ denotes the appropriate operator norm, usually spectral norm if the original norm is Euclidean.
62 | \end{notation}
63 | 
64 | \begin{remark}
65 | 	In optimization, "Jacobian" is often confusing, since it's unclear what $ F$ is. Of the objective function or of the gradient? Instead we prefer to say the Jacobian of the objective is the gradient (transposed). The Jacobian of the gradient is the Hessian.
66 | \end{remark}
67 | \begin{remark}
68 | 	The Hessian can be thought of as a bilinear operator $ (d_1,d_2) \mapsto \langle d_1, \nabla ^2 f(x) d_2 \rangle$, with associated quadratic form $ \langle d, \nabla ^2 f(x) d \rangle$.
69 | \end{remark}
70 | 
71 | \begin{thm}
72 | 	Suppose convex $ f \in \mathcal{ C}^2(U)$ for some open set $ U \subseteq \rr^{n}$, then
73 | 	\[
74 | 		\nabla f \text{ is $L$-Lipschitz continuous on } U \iff \ \forall \ x \in U, \nabla ^2 f(x) \preceq L I 
75 | 	.\] 
76 | 	That is, all eigenvalues of $ \nabla ^2f(x) \leq L \implies |\norm{ \nabla ^2 f(x)} | \leq L$.
77 | \end{thm}
78 | \begin{thm}
79 | Same setup, then 
80 | 
81 | $ f$ is  $ \mu$-strongly convex on $ U \iff \ \forall \ x \in U, \mu I \preceq \nabla ^2 f(x)$.
82 | \end{thm}
83 | \begin{note}
84 | We assume $ \mu>0$ since $ \mu=0$ would give us plain old convexity.
85 | \end{note}
86 | \begin{remark}
87 | 	One of our common assumptions will be that $ \nabla f$ is $L$-Lipschitz continuous ($ \nabla ^2 f \preceq LI$) and, a bit less commonly, we will also assume strong convexity ($ \mu I \preceq \nabla ^2 f$).
88 | \end{remark}
89 | 
90 | \end{document}
91 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_09.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \begin{eg}[best function ever]
  6 | 	Consider $ f(x) = \frac{1}{2} \norm{ x}_2^2, \nabla f(x) = x, \nabla ^2f(x) = I $. So $ L=1, \mu=1$. This is the only function with this property.
  7 | 	
  8 | 	This is the nicest function ever for optimization!
  9 | \end{eg}
 10 | \begin{defn}[condition number]
 11 | The \allbold{condition number} of $ f$ is  $ k_f = \frac{L}{\mu}$. $ k_f \approx 1$ is good. Larger is bad.
 12 | \end{defn}
 13 | Why do we care about these assumptions?
 14 | 
 15 | Recall from calculus, Taylor's theorem states that
 16 | \[
 17 | 	f(y) = f(x) + f'(x) (y-x) + \frac{1}{2} f''(\xi) (y-x)^2
 18 | ,\] 
 19 | where $ \xi \in [x,y]$. If $ f''(\xi) \leq L \ \forall \ \xi$, then
 20 | \[
 21 | 	f(y) \leq f(x) + f'(x)(y-x) + \frac{L}{2} (y-x)^2
 22 | .\] 
 23 | \begin{thm}
 24 | 	If $ \nabla f$ is $L$-Lipschitz continuous and $ f$ is  $ \mu$-strongly convex, then for all $ x,y \in \dom(f)$,
 25 | 	\[
 26 | 		\frac{\mu}{2} \norm{ y-x}^2 \leq f(y) - (f(x) + \langle \nabla f(x), y-x \rangle) \leq \frac{L}{2} \norm{ y-x}^2 
 27 | 	.\] 
 28 | \end{thm}
 29 | ~\begin{figure}[H]
 30 | 	\centering
 31 | 	\includegraphics[width=\textwidth]{./figures/quad_bounds.png}
 32 | 	\caption{If $ f$ is complicated but we can "sandwich" it between a quadratic upper bound and a quadratic lower bound ( $ \mu>0$ ) or a linear lower bound ($ \mu=0$), then we can work with the quadratics to understand the behavior of $ f$ since quadratics are much easier to deal with.}
 33 | \end{figure}
 34 | 
 35 | See more properties from this section in the Github handout StrongConvexityLipschitz.pdf.
 36 | \newpage
 37 | 
 38 | \subsection{Examples [BV04 Ch.3.1.5]}
 39 | Examples of convex functions $ f: \rr \to \rr$ :
 40 | \begin{itemize}
 41 | 	\item $ e^{ax}, a \in \rr$.
 42 | 	\item $ x^{a}$ on $ x \in \rr_{++}$ if $ a\leq 0$ or  $ a\geq 1$. (It's concave for  $ 0\leq a \leq 1$).
 43 | 	\item  $ |x|^{a}$ on all of $ \rr$, if $ a\geq 1$.
 44 | 	\item  $ - \log_b(x)$ on $ \rr_{++}$ if $ b>1$.
 45 | 	\item On $ \rr_{+}$,
 46 | 		\begin{equation*}
 47 | 		\begin{cases}
 48 | 			x \cdot \log(x) & x>0\\
 49 | 			0 & x=0
 50 | 		\end{cases}
 51 | 		\end{equation*}
 52 | 		since $ f''(x) = \frac{1}{x} >0$.
 53 | \end{itemize}
 54 | 
 55 | Examples of convex functions $ f: \rr^{n} \to \rr$:
 56 | \begin{itemize}
 57 | 	\item any norm/seminorm (follows directly from triangle inequality).
 58 | 	\item $ f(x) = \max\{x_1,\ldots,x_n\} $.
 59 | 	\item $ f(x,y) = x^2 /y$, $ \dom(f) = \rr \times \rr_{++}$. "Quadratic over linear".
 60 | 
 61 | 		$ f(x,y) = \norm{ x}_2^2 /y $, $ \dom(f) = \rr^{n-1} \times \rr_{++}$.
 62 | 
 63 | 		$ f(x,Y) = x^{T} Y^{-1} x$, $ \dom(f) = \rr^{n} \times S_{++}^{n}$. "Matrix fractional function".
 64 | 		\begin{note}
 65 | 		"Linear fractional function"
 66 | 		 \[
 67 | 			 g(x) = \frac{Ax+b}{c^{T}x+ d }, \quad \dom(g) = \{x: c^{T}x+d >0\} 
 68 | 		\] 
 69 | 		is not convex but it is \allbold{quasi-convex}. It is defined by having all convex sub-level sets $ \{x: f(x) \leq \alpha\} $.  
 70 | 		\end{note}
 71 | 		~\begin{figure}[H]
 72 | 			\centering
 73 | 			\hspace*{-3cm}
 74 | 			\includegraphics[width=1.4\textwidth]{./figures/quasi_cvx.png}
 75 | 		\end{figure}
 76 | 	\item "log-sum exp" aka "soft-max"
 77 | 		\[
 78 | 			f(x) = \frac{1}{\alpha} \log \left( e^{\alpha x_1} + \ldots + e^{\alpha x_n} \right) , \alpha > 0
 79 | 		.\]
 80 | 		This is differentiable, but one needs to be careful about numerical under/overflow.
 81 | 	\item geometric mean $ f(x) = \left( \prod_{ i= 1}^{ n} x_i \right)^{\frac{1}{n}} $ on $ \rr_{++}^{n}$ is concave, so its negative $ -f$ is convex.
 82 | 	\item $ - \log \det(X) =-\log\left( \prod \lambda_i \right) = - \sum \log(\lambda_i) $ on $ S_{++}^{n}$.
 83 | \end{itemize}
 84 | 
 85 | 
 86 | \begin{thm}[Jensen's Inequality]
 87 | 	\[
 88 | 	f(\ev [x]) \leq \ev [f(x)]
 89 | 	.\] 
 90 | \end{thm}
 91 | \begin{remark}
 92 | 	Let $ X$ be a random variable that outputs points in  $ \dom(f)$ with probability in $ [0,1]$, then the inequality follows from definition of convex function.
 93 | \end{remark}
 94 | \begin{eg}
 95 | In machine learning, we often prove something like
 96 | \[
 97 | 	\ev[\norm{ \text{ error} }^2 ] \leq \epsilon
 98 | .\]
 99 | Let $ f(x) = x^2$. So by Jensen's inequality:
100 | \begin{align*}
101 | 	\left( \ev[ \norm{ \text{ error} } ] \right) ^2 &\leq \ev\left[ \norm{ \text{ error} }^2  \right]\leq \epsilon \\
102 | 	\ev[\norm{ \text{ error} } ] &\leq \sqrt{\ev \left[ \norm{ \text{ error} }^2  \right] } \leq \sqrt{ \epsilon} 
103 | \end{align*}
104 | Recall that $ \norm{ \text{ error} }^2 $ is the nicest function ever.
105 | \end{eg}
106 | 
107 | \begin{remark}
108 | 	H\"{o}lder's inequality/Cauchy-Schwarz can also be proved via Jensen.
109 | \end{remark}
110 | \begin{thm}[H\"{o}lder's inequality]
111 | If $ \frac{1}{p} + \frac{1}{q} = 1$,
112 | \[
113 | |\langle x,y \rangle| \leq \norm{ x}_p \cdot \norm{ y}_q  
114 | .\] 
115 | \end{thm}
116 | 
117 | \begin{remark}
118 | We can use Jensen's inequality to prove H\"{o}lder's inequality.
119 | \end{remark}
120 | \end{document}
121 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_10.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsection{Preserving convexity}
  6 | 
  7 | \subsubsection{Rule 0: non-negative (weighted) sums}
  8 | If $ f_1, \ldots, f_m$ are convex, $ \alpha_i \geq 0$, then
  9 | $ x \mapsto \sum \alpha_i f_i(x)$ is convex too.
 10 | 
 11 | Subtraction (negative weights) doesn't work. 
 12 | 
 13 | It works for integrals too:
 14 | 
 15 | If for all $ y$,  $ f( \cdot ,y)$ is convex, and $ w(y) \geq 0$. Then
 16 | \[
 17 | 	x \mapsto \int_{\Omega} f(x,y) w(y)\ dy
 18 | \]
 19 | is convex.
 20 | 
 21 | \subsubsection{Rule 1: perspective function}
 22 | 
 23 | ~\begin{defn}[perspective]
 24 | Let $ f: \rr^{n} \to \rr$, then its \allbold{perspective} is $ g: \rr^{n+1} \to \rr$,
 25 | \[
 26 | 	g(x,t) = t \cdot f\left(\frac{x}{t}\right), \quad \dom(g) = \{(x,t): x / t \in \dom(f), t>0\} 
 27 | .\] 
 28 | \end{defn}
 29 | \begin{prop}
 30 | $ f: \rr^{n} \to \rr$ is convex $ \implies$ its perspective is convex.
 31 | \end{prop}
 32 | 
 33 | \begin{eg}
 34 | 	$ f(x) = \norm{ x}^2 $ is convex. Its perspective is
 35 | 	\[
 36 | 	t \cdot \norm{ \frac{x}{t}}^2 = t \cdot \frac{\norm{ x}^2 }{t^2 }  = \frac{\norm{ x}^2 }{t } 
 37 | 	.\] 
 38 | 	This is the quadratic-over-linear example we saw earlier. This is the proof that it is convex.
 39 | \end{eg}
 40 | \begin{eg}
 41 | 	$ f(x) = -\log(x)$ is convex. Its perspective is
 42 | 	\[
 43 | 		-t \cdot \log\left( \frac{x}{t} \right) = t \cdot \log(t) - t \cdot \log(x), x,t>0
 44 | 	.\]
 45 | 	This is the \allbold{relative entropy} of $ t,x$. More generally, the \allbold{Kullback-Leibler divergence} is
 46 | 	\[
 47 | 		D_{KL} (u,v) = \sum_{ i= 1}^{ n} u_i \log\left( \frac{u_i}{v_i } \right) -u_i+v_i
 48 | 	.\] 
 49 | 
 50 | This is an example of a \allbold{Bregman divergence}, which we often use to measure ``distance'' as an alternative to a metric. It's especially good for probability distributions.
 51 | \end{eg}
 52 | 
 53 | 
 54 | \subsubsection{Rule 2: special types of compositions}
 55 | 
 56 | Composition of convex functions typically doesn't preserve convexity!
 57 | 
 58 | \begin{thm}
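 59 | Let $ f(x) = h(g(x)) = h(g_1(x), \ldots, g_k(x))$ with $ h: \rr^{k} \to \rr$ and $ g_i: \rr^{n} \to \rr$. Then $ f$ is convex if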
 60 |  \begin{enumerate}[label=(\roman*)]
 61 | 	\item $ h$ is convex and
 62 | 	\item if $ k=1$, $ h$ is nondecreasing and  $ g$ is convex or $ h$ is nonincreasing and  $ g$ is concave.
 63 | 	\item if  $ k>1$, we enforce (ii) to each argument of  $ h$ and each $ g_i$.
 64 | \end{enumerate}
 65 | \end{thm}
 66 | \begin{note}
 67 | 	For nonincreasing/nondecreasing, we must take into account $ \pm \infty$, since in convex analysis we assign infinity to any point not in the domain. So although $ h(x) = x$ is nondecreasing on  $ \rr$, if we restrict $ \dom(h) = [0,1]$ then it is not nondecreasing anymore.
 68 | \end{note}
 69 | \begin{thm}[tattoo-worthy]
 70 | $ f= h\circ g$ is convex if  $ h$ is convex and  $ g$ is affine.
 71 | \end{thm}
 72 | 
 73 | \begin{eg}
 74 | 	$ f(x) = \norm{ Ax-b}^2 $ is convex by this theorem.
 75 | \end{eg}
 76 | \subsubsection{Rule 3: min/max}
 77 | 
 78 | \begin{prop}
 79 | 	If $ f,g$ both convex, then $ x\mapsto \max \{f(x),g(x)\} $ is convex.
 80 | \end{prop}
 81 | \begin{proof}
 82 | The epigraph of the maximum is the intersection of two convex epigraphs. Convex sets are closed under arbitrary intersections.
 83 | \end{proof}
 84 | ~\begin{figure}[H]
 85 | 	\centering
 86 | 	\includegraphics[width=0.8\textwidth]{./figures/max_cvx.png}
 87 | \end{figure}
 88 | \begin{note}
 89 | This works for supremum too due to closure under arbitrary intersections.
 90 | \end{note}
 91 | \begin{eg}
 92 | \[
 93 | 	f(x) = \sup_{y \in \mathcal{ A}} f(x;y) 
 94 | \] 
 95 | is convex as long as $ f( \cdot ; y)$ is convex $ \ \forall \ y \in \mathcal{ A}$, where $ \mathcal{ A}$ is an arbitrary set that can be uncountable.
 96 | \end{eg}
 97 | 
 98 | \begin{eg}[spectral norm]
 99 | 	\[ f(A) = \norm{ A}_{\infty} = \sup_{\norm{ x}_2=1 } \norm{ Ax}_2 \] is convex since $ \ \forall \ x, A \mapsto \norm{ Ax}_2 $ is convex (composition of convex and affine).
100 | \end{eg}
101 | 
102 | \begin{figure}[H]
103 | 	\centering
104 | 	\includegraphics[width=0.8\textwidth]{./figures/min_cvx.png}
105 | \end{figure}
106 | It's easy to see that min doesn't necessarily preserve convexity because it unions epigraphs instead. We need to impose more restrictions to make it work:
107 | \begin{thm}
108 | 	If $ f: \rr^{n} \times \rr^{m} \to \rr$ is (jointly) convex and if $ C \neq \O$ is a convex set, then
109 | 	\[
110 | 		g(x) = \inf_{y \in C} f(x,y) \text{ is convex} 
111 | 	.\] 
112 | \end{thm}
113 | 
114 | \begin{eg}
115 | 	$ \min \{f_1(x), f_2(x)\} $ is not usually convex since this is like taking
116 | 	\begin{equation*}
117 | 		f(x,y)=
118 | 	\begin{cases}
119 | 		f_1(x), &y=1\\
120 | 		f_2(x), &y=2
121 | 	\end{cases}
122 | 	\end{equation*}
123 | 	and constraint $ C = \{1,2\} $ is not convex.
124 | \end{eg}
125 | \begin{eg}
126 | The distance to a convex set is a convex function. Let $ C \neq \O$ be convex,
127 | \[
128 | 	f(x) = \inf_{y \in C} \norm{ x-y} 
129 | .\] 
130 | 	
131 | Prove $ (x,y) \mapsto \norm{ x-y} $ is convex.
132 | \begin{proof}
133 | 	We know $ z \mapsto \norm{ z} $ is convex. Consider the linear operator $ A(x,y) = x-y$. That is,
134 | 	 \[
135 | 		 A \begin{pmatrix} x\\y \end{pmatrix} = \begin{pmatrix} I & -I \end{pmatrix} \begin{pmatrix} x\\y \end{pmatrix} = x-y
136 | 	.\] 
137 | 	Then the composition of convex and affine is still convex.
138 | \end{proof}
139 | \end{eg}
140 | 
141 | \end{document}
142 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_11.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsection{Gradient descent}
  6 | 
  7 | Problem: we want to solve $ \min_{x} f(x)$, $ f:\rr^{n} \to \rr$, $ f \in \Gamma_0 (\rr^{n})$ (proper, lsc, convex) and $ \nabla f$ is $L$-Lipschitz continuous (strongly smooth).
  8 | 
  9 | \subsubsection{Attempt 1}
 10 | \[
 11 | 	x_{k+1} = \argmin_{x} \left[ f(x_k) + \underbrace{ \langle \nabla f(x_k), x- x_k \rangle}_{q_k (x) \text{ 1st order surrogate} } \right]
 12 | .\]
 13 | Linearization is a common trick to simplify problems. However, this fails because $ \min_{x} q_k(x) = -\infty$ for a linear function (unless it's already optimal). We can fix this by adding a compact constraint. Then it's called \allbold{Frank-Wolfe} or \allbold{conditional gradient}. We omit this discussion as it's a bit niche.
 14 | 
 15 | \subsubsection{Attempt 2}
 16 | Consider the 2nd order Taylor series:
 17 | \[
 18 | 	x_{k+1} = \argmin_{x} \underbrace{ f(x_k) + \langle \nabla f(x_k) , x-x_k \rangle + \frac{1}{2} \langle x-x_k, \nabla ^2 f(x_k) (x-x_k) \rangle}_{q_k(x) \text{ quadratic surrogate} }
 19 | .\] 
 20 | Since $ f$ is convex,  $ \nabla ^2 f(x) \succeq 0 \implies q_k(x)$ is a convex quadratic (sum of convex functions). 
 21 | 
 22 | To minimize $ q_k(x)$, we use Fermat's rule:
 23 | \begin{align*}
 24 | 	0 &= \nabla q_k(x) \\
 25 | 	  &= \nabla f(x_k) + \nabla ^2 f(x_k) (x-x_k) &&\text{ gradients of linear and quadratic terms} \\
 26 | 	x_{k+1} &= x_k - \nabla ^2 f(x_k)^{-1} \nabla f(x_k) 
 27 | \end{align*}
 28 | 
 29 | This is \allbold{Newton's method}, a generalization of the "Newton-Raphson" for 1D root-finding, applied to the gradient. It is a \allbold{2nd-order method} because it involves the derivative of the gradient which is the second derivative (Hessian). 
 30 | 
 31 | \begin{remark}
 32 | 	Unlike 1D root-finding, 2nd order methods in higher dimensions converge quickly but each iteration may be costly because we need to invert the Hessian, i.e.\ solve a system of equations. This is about $ \mathcal{ O}(n^3)$. 1st-order methods only use $ \nabla f(x)$ and usually converge more slowly but each step is cheap at about $ \mathcal{ O}(n)$.
 33 | \end{remark}
 34 | 
 35 | \subsubsection{What to use?}
 36 | It depends:
 37 | \begin{itemize}
 38 | 	\item Structure matters (is $ \nabla ^2 f$ easy to invert? Is it ill-conditioned (which hurts 1st order more)?)
 39 | 	\item For small/medium problem size, high accuracy, we use 2nd order. This is default for cvx/cvxpy.
 40 | 	\item In between problems: try both?
 41 | \end{itemize}
 42 | 
 43 | \subsubsection{Other types}
 44 | \begin{itemize}
 45 | 	\item 3rd order: usually not worth the complexity. See recent Nesterov work for a plausible implementation.
 46 | 	\item 0th order: Extremely slow and finding gradient is cheap anyway, usually not worth it.
 47 | 	\item coordinate descent: heavily depends on the structure.
 48 | \end{itemize}
 49 | 
 50 | \subsubsection{Attempt 3}
 51 | By assumption, $ 0 \preceq \nabla ^2 f(x) \preceq L I$. Thus, for all $ y$,
 52 | \[
 53 | 	\frac{1}{2} \langle y, \nabla ^2 f(x)\ y \rangle \leq \frac{1}{2} L \norm{ y}^2 
 54 | .\] 
 55 | This allows us to upper bound the quadratic surrogate and simplify it further by removing the Hessian. Notice $ (LI) ^{-1} = \frac{1}{L} I$ which replaces $ (\nabla ^2 f)^{-1}$. So we can modify Newton's method as
 56 | \begin{align*}
 57 | 	x_{k+1} &= \argmin_{x} \underbrace{ f(x_k) + \langle \nabla f(x_k) , x- x_k \rangle + \frac{1}{2} L \norm{ x -x_k}^2}_{q_k(x)}\\ 
 58 | 		&= x_k - \frac{1}{L} \nabla f(x_k)
 59 | \end{align*}
 60 | This is $ \mathcal{ O}(n)$. Here $ q_k(x) \geq f(x) \ \forall \ x$ is more than a linearization but is less than the full 2nd order Taylor expansion. It is a \allbold{majorizer} of $ f$. 
 61 | 
 62 | fig
 63 | 
 64 | \subsubsection{Majorization-minimization (MM)} 
 65 | MM can always guarantee making progress on the minimization. The framework is
 66 | \begin{enumerate}[label=\arabic*)]
 67 | 	\item Assume we can always construct a majorizer $ q_k$ s.t.
 68 | 		\begin{enumerate}[label=(\roman*)]
 69 | 			\item $ \ \forall \ x, f(x) \leq q_k(x)$
 70 | 			\item $ f(x_k) = q_k(x_k)$ 
 71 | 		\end{enumerate}
 72 | 	\item Iterate: $ x_{k+1} \in \argmin_{x} q_k(x)$.
 73 | \end{enumerate}
 74 | This algorithm is a \allbold{descent algorithm}. That is, it never makes things worse.
 75 | \begin{proof}
 76 | \begin{align*}
 77 | 	f(x_{k+1}) &\leq q_k (x_{k+1}) \qquad  \text{ by (i)} \\
 78 | 		   &\leq q_k(x_k) \qquad  \text{ by 2)} \\
 79 | 		   &= f(x_k) \qquad \text{ by (ii)}  
 80 | \end{align*}
 81 | \end{proof}
 82 | 
 83 | Usually we might eventually show that (no convexity needed):
 84 | \begin{itemize}
 85 | 	\item If $ f(x)$ is bounded below, then  $ f(x_k)$ converges by MCT.
 86 | 	\item If $ (x_k)$ converges and $ f$ is lsc, then the limit  $ x_k \to x$ is a stationary point \emph{i.e.} $ \nabla f(x) =0$.
 87 | \end{itemize}
 88 | \begin{eg}[usually non-convex]
 89 | 	~\begin{enumerate}[label=\arabic*)]
 90 | 		\item Expectation maximization (EM) for maximum-likelihood estimation.
 91 | 		\item Difference of convex functions (DC) or convex + concave:
 92 | 			\[
 93 | 				f(x) = g(x) - h(x)
 94 | 			,\]
 95 | 			where $ g,h$ are both convex. Although $ -h$ is concave, $ -h$ is majorized by its tangent line which is convex. Then
 96 | 			\[
 97 | 				q_k(x) = g(x) - \underbrace{ (h(x_k) + \langle \nabla h(x_k),x-x_k \rangle)}_{ \text{ affine in }x } 
 98 | 			\]
 99 | 			is a majorizer, and $ q_k(x)$ is convex.
100 | 	\end{enumerate}
101 | The takeaway is that not all non-convex problems are equally hard.
102 | \end{eg}
103 | \end{document}
104 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_14.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \newpage
  6 | \section{Convex optimization problems}
  7 | \subsection{Tricks}
  8 | The standard form of an optimization problem looks like:
  9 | \begin{align*}
 10 | \min\ &f_0(x)\\
 11 | \text{subject to } &f_i(x) \leq 0, i = 1,\ldots,m \\
 12 | &h_i(x) = 0 , i = 1,\ldots,p
 13 | \end{align*}
 14 | \begin{enumerate}[label=\arabic*)]
 15 | \item Max to min:
 16 | \[
 17 | 	 \max_{x \in C} f(x) = - \min_{x \in C} -f(x)
 18 | .\] 
 19 | \item Equivalent problems:
 20 | \begin{eg}
 21 | 	$ f(x) = \sqrt{|x|} $. Then $ \argmin f(x) = \argmin f(x)^2$ and we turn it into a convex problem.
 22 | \end{eg}
 23 | 
 24 | \begin{eg}
 25 | 	$ f(x) = \norm{ Ax-b} + \lambda \norm{ x}^2 $. Equivalently, we can solve
 26 | 	\[
 27 | 	\min \norm{ Ax- b}^2 + \sigma \norm{ x}^2  
 28 | 	.\] 
 29 | 	So we need to adjust the constant (Lagrange multipliers).
 30 | \end{eg}
 31 | 
 32 | \item Change of variables:
 33 | This works especially well for affine transformation because it doesn't change convexity.
 34 | 	\item Eliminate equality constraints:
 35 | 		\begin{eg}
 36 | 		\[
 37 | 			\underset{ Ax=b}{ \min} f(x)
 38 | 		.\] 
 39 | 		We can decompose any solution as $ x = x_p + x_h$ with $ x_h \in \ker A$ (particular solution + homogeneous solution). Let the columns of $ F$ form a basis of  $ \ker(A)$, then  $ x= x_p + F z$. This way we change the problem to
 40 | 		\[
 41 | 			\min_z f(x_p +F z)
 42 | 		.\]
 43 | 		Notice this eliminates the constraint by (affine) change of variable.
 44 | 		\end{eg}
 45 | 	\item Slack variables:
 46 | 
 47 | 		$ f_i(x) \leq 0$ iff there exists an  $ s_i \geq 0$ s.t. $ f_i(x) + s_i =0$.
 48 | 		Then we turn $ \min_x f_0(x), \text{ subject to }  f_1(x) \leq 0$ into
 49 | 		\begin{align*}
 50 | 			\min_{x,s}\ &f_0(x)\\
 51 | 				   \text{ subject to }  &f_1(x) + s= 0\\ 
 52 | 				    &s\geq 0
 53 | 		\end{align*}
 54 | 		This is less important nowadays since software is less constrained by the form we give it.
 55 | 	\item Epigraph:
 56 | 		\[
 57 | 			\min_{x \in \rr^{n}} f(x) \iff \min_{x \in \rr^{n}, t \in \rr} t, \qquad  f(x) \leq t
 58 | 		.\] 
 59 | 		\begin{eg}
 60 | 			\begin{align*}
 61 | 		\min \norm{ Ax -b}_1 &= \min \sum_{ i= 1}^{ m} \left| a_i^{T} x - b_i \right|  \\ 
 62 | 				     &= \min_{t \in \rr^{m}, x \in \rr^{n}} \mathbbm{1}^{T} t\\ 
 63 | 				     & \qquad \qquad \qquad  a_i^{T}x - b_i \leq t_i\\
 64 | 				     & \qquad \qquad \qquad  a_i^{T} x - b_i \geq -t_i
 65 | 			\end{align*} 
 66 | 		\end{eg}
 67 | 	\item Solve coupled functions $ \min f(x) +g(x)$. This is equivalent to
 68 | 		\begin{align*}
 69 | 			\min_{x,z} f(x) + g(z) \text{ subject to }  x=z
 70 | 		\end{align*}
 71 | 		This way we decouple the functions and make it easier to solve.
 72 | 	\item Marginalization:
 73 | 		\begin{align*}
 74 | 			\min_{x,y} f(x,y) &= \min_x \left( \min_y f(x,y) \right)  \\
 75 | 					   &= \min_x g(x)
 76 | 		\end{align*}
 77 | 		\begin{note}
 78 | 		We can always commute extremization of the same type.
 79 | 		\end{note}
 80 | \end{enumerate}
 81 | 
 82 | \subsection{Convex optimization problems [BV04 Ch.4.2]}
 83 | 
 84 | We wish to make both the function and the constraint sets to be convex. 
 85 | A typical problem:
 86 | \begin{align*}
 87 | \min &f_0(x)\\
 88 | \text{subject to } &f_i(x) \leq 0, i = 1,\ldots,m\\
 89 | &h_i(x) = 0 , i = 1,\ldots,p
 90 | \end{align*}
 91 | A convex problem would be
 92 | \begin{align*}
 93 | 	\min\ &f_0 (x) \\
 94 | 	     \text{ subject to }  &f_i(x) \leq 0 \\
 95 | 				  &a_i^{T} x =b_i
 96 | \end{align*}
 97 | where $ f_0, \ldots, f_m$ are convex functions and the equality constraints are affine.
 98 | \begin{thm}
 99 | Consider the convex problem, $\min f(x), x \in C$. Assume $ f \in \mathcal{ C}^{1}$. Then $ x$ is optimal iff
100 | \begin{enumerate}[label=\arabic*)]
101 | 	\item $ x \in C$
102 | 	\item $ \ \forall \ y \in C$, $ \langle \nabla f(x), y-x \rangle \geq 0$ (Euler inequality).
103 | \end{enumerate}
104 | 
105 | \end{thm}
106 | 
107 | \begin{proof}
108 | 	$ (\impliedby)$: 
109 | 	\begin{align*}
110 | 		f(y) \geq  f(x) + \langle \nabla f(x) , y-x \rangle \geq f(x)
111 | 	\end{align*}
112 | 	$ (\implies)$:
113 | 	Suppose $ x$ is optimal but there exists a $ y \in C$ s.t. $\langle \nabla f(x),y-x \rangle < 0$. Then for $ t \in (0,1]$, the 1D parameterization yields:
114 | 	\begin{align*}
115 | 		\phi(t) &=f(x+t(y-x))\\
116 | 			&= \phi(0) + \phi'( 0) t + o(t) \qquad \qquad  \text{ first-order Taylor, } f \in \mathcal{ C}^{1} \\
117 | 			&= f(x) + t \left( \langle \nabla f(x), y-x \rangle + o(1) \right) \qquad \phi'(0) = \langle \nabla f(x), y-x \rangle  \\
118 | 			&< f(x) \qquad \text{ for all sufficiently small } t>0
119 | 	\end{align*}
120 | 	Since $ C$ is convex, $ x+t(y-x) \in C$ is feasible, and this contradicts that $ x$ is optimal. 
121 | \end{proof}
122 | \begin{coro}
123 | If $ \langle d,z \rangle \geq 0$ for all $ z$, then $ d=0$.
124 | \end{coro}
125 | \begin{proof}
126 | Take $ z=-d$: then $ -\norm{ d}^2 \geq 0$, so $ d=0$ by positive definiteness of the norm.
127 | \end{proof}
128 | \end{document}
129 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_16.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsubsection{Linear matrix inequalities (LMI): dual problem of SDPs}
 6 | \begin{align*}
 7 | \min_{x \in \rr^{n}}\ & \langle c,x \rangle \\
 8 | \text{subject to } &\sum_{ i= 1}^{ n} x_i F_i + G \preceq 0, \quad F_i, G \in S^{k}, i = 1,\ldots,n \\
 9 | & Ax=b
10 | \end{align*}
11 | \begin{note}
12 | 	$ \sum_{ i= 1}^{ n} x_i F_i$ is like $ A^* y = \sum_{ i= 1}^{ m} y_i \ve{a}_i$ in the case when $ Ax=b \implies \ve{a}_i^{T}x=b_i$.
13 | \end{note}
14 | 
15 | \begin{remark}
16 | We can recover linear programs by letting $ F_i,G$ be diagonal matrices.
17 | \end{remark}
18 | 
19 | \begin{remark}
20 | We can also recover SOCP's, details omitted. Let $ A \in S_{++}^{r}, C \in S^{s}, B \in \rr^{r \times s}$. Then
21 | \[
22 | 	\begin{pmatrix} A&B\\B^{T}&C \end{pmatrix} \succeq 0 \iff \underbrace{ C-B^{T}A^{-1}B}_{ \text{ Schur complement} } \succeq 0
23 | .\]
24 | Schur complement might be computationally cheaper especially for example when $ C =0$.
25 | \end{remark}
26 | 
27 | Let $ K_1, K_2$ be proper cones, then $ K_1 \times K_2$ is also a proper cone.
28 | \begin{eg}
29 | $ X \succeq 0, Y \succeq 0$, we can write
30 | \[
31 | 	\begin{pmatrix} X&Z\\Z^{T}&Y\\ \end{pmatrix} \succeq 0, Z=0 \text{ (linear constraint)} 
32 | .\] 
33 | However, this is horrible for computation. For example, in the case of negative log barrier, we can separate each constraint and projecting onto $ \rr_{+}^{n}$ is easy. We can also project onto $ S_{+}^{n}$ by making the eigenvalues nonnegative. But doing this on a bigger matrix is expensive since finding eigenvalues is super-linear.
34 | \end{eg}
35 | 
36 | \newpage
37 | \section{Duality [BV04 Ch.5]}
38 | \subsection{Lagrange dual function/problem}
39 | 
40 | Consider $ p^* = \min_{x \in C} f(x)$. Here $ x \in C$ is \allbold{primal feasible} and we can find it by finding the smallest upper bound. We wish to find a dual feasible point s.t. it is the largest lower bound on  $ p^* $. 
41 | 
42 | \end{document}
43 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_17.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | 
 6 | First start with the primal problem without assuming convexity:
 7 | \begin{align*}
 8 | \min\ &f_0(x) \\
 9 | \text{subject to } &f_i(x) \leq 0, i = 1,\ldots,m \\
10 | &h_i(x) = 0 , i = 1,\ldots,p
11 | \end{align*}
12 | Given a non-empty domain $ D = \bigcap_{i=0}^m \dom(f_i) \cap \bigcap_{ i=1}^{p} \dom(h_i)$. Note that $ \dom (f_0) $ is an implicit constraint, \emph{e.g.} $ f_0(x) = x^2 + I_{ \{x\geq 0\} }$.
13 | 
14 | \begin{defn}[lagrangian]
15 | \[
16 | 	\mathscr{L}(x, \lambda, \nu) = f_0(x) + \sum_{ i= 1}^{ m} \lambda_i f_i(x) + \sum_{ i= 1}^{ p} \nu_i h_i(x)
17 | ,\]
18 | where $ \lambda,\nu$ are \allbold{dual variables}, $ \lambda$ is associated with inequality constraints whereas $ \nu$ is associated with equality constraints.
19 | \end{defn}
20 | \begin{remark}
21 | 	We can be creative about whether we put the constraints explicitly in $ f_i, h_i$ or implicitly in $ f_0$.
22 | \end{remark}
23 | 
24 | \begin{defn}[dual function]
25 | \[
26 | 	g( \lambda, \nu) = \inf_{x \in D} \mathscr{L}(x, \lambda, \nu)
27 | .\] 
28 | \end{defn}
29 | \begin{note}
30 | This problem is usually easier than the primal problem because it doesn't have constraints.
31 | \end{note}
32 | 
33 | \begin{ques}
34 | Is $ g$ convex?
35 | \end{ques}
36 | Recall that the infimum preserves convexity if  $ C$ is convex and  $ f(x,y)$ is jointly convex, whereas the supremum over $ x$ preserves convexity if $ f(x,y)$ is convex in $ y$ for every fixed  $ x$. The Lagrangian is most likely not jointly convex, but it is in fact affine (hence both convex and concave) in terms of the dual variables for each fixed $ x$. Therefore,
37 |  \[
38 | 	 g(\lambda,\nu) = \inf_{x \in D} \mathscr{L}(x, \lambda,\nu) = - \sup_{x \in D} \left( -\mathscr{L}(x, \lambda,\nu) \right)
39 | \]
40 | is concave.
41 | 
42 | 
43 | Dual problem (D):
44 | \[
45 | 	d^* = \max_{\lambda \geq 0, \nu} g(\lambda,\nu)
46 | .\] 
47 | \begin{prop}[weak duality]
48 | Define $ p^*$ to be the optimal value for the primal problem and $ d^* $ be the maximum value for the dual problem. Then if $ \lambda \geq 0$ and $ \nu$ is anything, then
49 | \[
50 | 	g(\lambda,\nu) \leq p^* 
51 | .\]
52 | Hence $ d^* \leq p^* $. 
53 | \end{prop}
54 | 
55 | \begin{proof}
56 | \begin{align*}
57 | 	g(\lambda,\nu) &= \inf_{x} \mathscr{L}(x, \lambda,\nu) \\
58 | 		       &\leq \mathscr{L}(x, \lambda, \nu)  \ \forall \ x \in D \text{ and feasible} \\
59 | 		       &= f_0(x) + \sum_{ i= 1}^{ m} \underbrace{\lambda_i}_{\geq 0} \underbrace{f_i(x)}_{\leq 0}+ \sum_{ i= 1}^{ p} \nu_i \underbrace{h_i(x)}_{=0} \qquad  \text{ x is feasible}  \\
60 | 		       &\leq f_0(x)
61 | \end{align*}
62 | \end{proof}
63 | \begin{remark}
64 | 	``Strong duality'' $ d^* =p^* $ tends to happen if $ (P)$ is convex.
65 | \end{remark}
66 | \begin{eg}[dual of a LP]
67 | \begin{align*}
68 | \min\ & \langle c,x \rangle \\
69 | \text{subject to } &x\geq 0 \iff -x_i \leq 0\\
70 | &Ax=b
71 | \end{align*}
72 | Then
73 | \begin{align*}
74 | 	\mathscr{L}(x,\lambda,\nu) &= \langle c,x \rangle - \langle \lambda,x \rangle + \nu^{T} (Ax-b)\\
75 | 	&= \langle c,x \rangle - \langle \lambda,x \rangle + \langle A^{T} \nu,x \rangle - \langle \nu,b \rangle \\
76 | 	g(\lambda,\nu) &= \inf_x \mathscr{L}(x, \lambda,\nu)  \\
77 | 	&= -\langle \nu,b \rangle + \inf_{x} \langle c-\lambda+ A^{T}\nu,x \rangle =
78 | 	\begin{cases}
79 | 		-\langle \nu,b \rangle &c-\lambda+A^{T} \nu = 0 \\
80 | 		-\infty & \text{ else} 
81 | 	\end{cases}
82 | \end{align*}
83 | Thus the dual problem $ \max_{\lambda\geq 0} g(\lambda,\nu)$ becomes
84 | \[
85 | \max_{\lambda\geq 0} - \langle \nu,b \rangle, \lambda= c+A^{T} \nu
86 | \] 
87 | or
88 | \[
89 | -\min\langle \nu,b \rangle, c+A^{T} \nu \geq 0
90 | .\]
91 | This is a LP!
92 | \end{eg}
93 | \end{document}
94 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_18.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsubsection{Dual of linear programs}
  6 | The primal (P) is:
  7 | \begin{align*}
  8 | \max_{x \in \rr^{n}}\quad & \langle c,x \rangle \\
  9 | \text{subject to } \quad &x \geq 0\\
 10 | y:\quad  &Ax \leq b
 11 | \end{align*}
 12 | The dual (D) is:
 13 | \begin{align*}
 14 | \min\quad &\langle b,y \rangle \\
 15 | \text{subject to } \quad & y\geq 0\\
 16 | x:\quad & A^{T} y \geq c
 17 | \end{align*}
 18 | 
 19 | Rules to transform (P) to (D):
 20 | \begin{enumerate}[label=(\arabic*)]
 21 | 	\item $ \max \to \min$ and vice versa.
 22 | 	\item variables $ \to$ constraints and vice versa.
 23 | 	\item objective and RHS of inequality flip places.
 24 | 	\item matrices transpose.
 25 | \end{enumerate}
 26 | "SOB" mnemonic: sensible, odd, and bizarre from a business perspective
 27 | \begin{align*}
 28 | \text{ primal variable } x_i \quad &x_i\geq 0 \text{: sensible}  \\
 29 | & \text{ no constraint: odd}   \\
 30 | &x_i \leq 0 \text{: bizarre} 
 31 | \end{align*}
 32 | \begin{align*}
 33 | 	\text{ constraints in primal when maximizing} \quad & a_i^{T}x\leq b_i \text{: sensible (think budget)}  \\
 34 | & a_i^{T}x=b_i\text{: odd}   \\
 35 | &a_i^{T}x \geq b_i \text{: bizarre} 
 36 | \end{align*}
 37 | The rule: a dual constraint is S/O/B if primal variable is S/O/B. And vice versa. 
 38 | 
 39 | \begin{eg}
 40 | \begin{align*}
 41 | 	(P) \qquad \qquad \min_{\substack{x\geq 0\\ x \in \rr^{2}}} \quad &3x_1+2x_2\\
 42 | 						 y_1: \quad &x_1 + 2x_2 \geq 5 \qquad S\\
 43 | 						 y_2:\quad & \underbrace{x_2 \leq 2 \qquad}\qquad B\\
 44 | 							   & \begin{pmatrix} 1&2\\0&1 \end{pmatrix} \begin{pmatrix} x_1\\x_2 \end{pmatrix} 
 45 | \end{align*}
 46 | \begin{align*}
 47 | 	(D) \qquad \qquad \max_{\substack{y \in \rr^{2}\\y_1\geq 0\ S\\ y_2 \leq 0 \ B}} \quad &5y_1+2y_2\\
 48 | 						 x_1: \quad &y_1 + 0y_2 \leq 3 \qquad S\\
 49 | 						 x_2:\quad & 2y_1+y_2 \leq 2 \qquad  S
 50 | \end{align*}
 51 | 
 52 | 
 53 | Observe: $ f(x) = 3x_1 + 2 x_2 = \underbrace{2x_1}_{\geq 0} + \underbrace{(x_1 + 2x_2)}_{\geq 5} \geq 5$. We proved $ 5\leq p^* $. However, this is not the tightest bound. The dual variables give us the tightest: 3 times the first constraint and -4 times the second constraint yields $ 7\leq p^* $.
 54 | \end{eg}
 55 | \allbold{Duality gap}: $ x, \lambda$ feasible, $ f_0(x) - g(\lambda,\nu)$.
 56 | 
 57 | \subsubsection{Strong duality results}
 58 | \begin{itemize}
 59 | 	\item If $ (P)$ isn't convex, strong duality is unlikely except certain nonconvex QP: s-lemma/s-procedure (see Appendix of BV).
 60 | 	\item  If (P) is convex, strong duality holds under certain constraint qualifications (CQ) such as Slater's condition.
 61 | 		\begin{align*}
 62 | 		\min\quad &f_0(x) \\
 63 | 		\text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m \\
 64 | 		&Ax = b
 65 | 		\end{align*}
 66 | 		\begin{defn}[Slater's conditions]
 67 | 			They hold if there exists a strictly feasible point, $ x \in \ri(\dom(f_0))$ and
 68 | 
 69 | 			if $ f_i$ is affine, $ f_i(x)\leq 0$ (feasible)
 70 | 
 71 | 			if $ f_i$ isn't affine, $ f_i(x)<0$ (strictly feasible)
 72 | 
 73 | 			and $ Ax=b$.
 74 | 		\end{defn}
 75 | 		\begin{thm}
 76 | 			If (P) is convex and Slater's conditions hold, then
 77 | 			\begin{enumerate}[label=(\roman*)]
 78 | 				\item we have strong duality, $ d^* =p^* <\infty$
 79 | 				\item there exists an optimal solution to the dual problem.
 80 | 			\end{enumerate}
 81 | 		\end{thm}
 82 | 		\begin{note}
 83 | 			Slater's does NOT imply there exists an optimal \emph{primal} solution.
 84 | 		\begin{eg}
 85 | 		$ \inf_{x \in \rr} e^{x}$. It is convex, lsc, proper. But it is not coercive so it doesn't have an optimal primal solution.
 86 | 		\end{eg}
 87 | 		\end{note}
 88 | 		\begin{remark}
 89 | 		Often we want Slater's condition on the dual. Since the dual of the dual is the primal, then we guarantee an optimal solution.
 90 | 		\end{remark}
 91 | 		\begin{coro}[Slater for LP]
 92 | 		Slater's conditions hold iff the LP is feasible \emph{i.e.} $ p^* < \infty$.
 93 | 
 94 | 		$ p^* < \infty \implies d^* =p^* $ and dual optimal solution exists.
 95 | 
 96 | 		$ d^* >- \infty \implies d^* =p^* $ and primal optimal solution exists.
 97 | 
 98 | 		Hence if either $ p^* $ or $ d^* \in \rr$ (not $ \pm \infty$), then optimal primal and dual solutions exist.
 99 | 		\end{coro}
100 | 		\begin{note}
101 | 		$ d^* = - \infty, p^* = +\infty$ (both problems infeasible) is possible but rare. This is not strong duality.
102 | 		\end{note}
103 | \end{itemize}
104 | \end{document}
105 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_19.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsection{Saddle point interpretation [BV04 Ch.5.4.2]}
  6 | Here we want to find the saddle points as we want to minimize the primal but maximize the dual.
  7 | \begin{align*}
  8 | 	p^* = \min\quad &f_0(x)  \\
  9 | 	\text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m  \\
 10 | &Ax = b  
 11 | \end{align*}
 12 | This is equivalent to
 13 | \begin{align*}
 14 | 	&\min_{x}f_0(x) + \sup_{\lambda\geq 0, \nu}\left\{ \sum \lambda_i f_i(x) + \nu^{T} (Ax-b)\right\} \\
 15 | 		 =\ & \min_{x \in D} \sup_{\lambda\geq 0, \nu} \mathscr{L}(x,\lambda,\nu)
 16 | \end{align*}
 17 | This is because if $ f_i(x)>0$ or  $ a_i^{T} x -b_i \neq 0$ for some $ i$, then we get  $ \infty$ in the supremum, encoding it as infeasible.
 18 | 
 19 | Then the dual is
 20 | \[
 21 | 	d^* = \max_{\lambda\geq 0, \nu } g(\lambda,\nu) = \max_{\lambda\geq 0, \nu} \min_{x \in D} \mathscr{L}(x,\lambda,\nu)
 22 | .\] 
 23 | Weak duality is equivalent to the ``max-min'' inequality:
 24 | \[
 25 | 	d^* =\max_{\lambda\geq 0,\nu} \min_{x \in D} \mathscr{L}(x,\lambda,\nu) \leq \min_{x \in D} \max_{\lambda\geq 0, \nu} \mathscr{L}(x,\lambda,\nu) = p^* 
 26 | .\]
 27 | And equality is achieved if strong-duality holds.
 28 | \begin{note}
 29 | All "min/max" should be "inf/sup" until proven.
 30 | \end{note}
 31 | 
 32 | \begin{thm}
 33 | Saddle point occurs when
 34 | \begin{enumerate}[label=(\arabic*)]
 35 | 	\item strong-duality/strong max/min
 36 | 	\item inf/sup are achieved.
 37 | 		
 38 | 		That is, $ (x^* ,(\lambda^* ,\nu^* ))$ is a saddle point of $ \mathscr{L}(x,(\lambda,\nu))$ if
 39 | 		\begin{align*}
 40 | 			\mathscr{L}(x^* ,(\lambda^* ,\nu^* )) &= \inf_x \mathscr{L}(x,(\lambda^* ,\nu^* )) \\
 41 | 			\mathscr{L}(x^* ,(\lambda^* ,\nu^* )) &= \sup_{\lambda\geq 0,\nu} \mathscr{L}(x^* ,(\lambda,\nu))
 42 | 		\end{align*}
 43 | \end{enumerate}
 44 | \end{thm}
 45 | 
 46 | \begin{coro}
 47 | 	If we know $ \lambda^* ,\nu^* $, then we can find $ x^* $ by solving the unconstrained problem
 48 | 	\[
 49 | 		\min_x \mathscr{L}(x,(\lambda^* ,\nu^* ))
 50 | 	.\] 
 51 | \end{coro}
 52 | This allows us to solve problems with shared Lagrangians.
 53 | \subsubsection{Shared Lagrangian}
 54 | \begin{eg}
 55 | \begin{align*}
 56 | \min\quad &\norm{ x}_1  \\ 
 57 | \text{subject to } \quad & \norm{ Ax-b}_2 \leq \epsilon \quad \iff \quad \norm{ Ax-b}_2^2 - \epsilon^2 \leq 0 
 58 | \end{align*}
 59 | Let
 60 | \[
 61 | 	\mathscr{L}(x,\lambda) = \norm{ x}_1 + \lambda \left( \norm{ Ax-b}_2^2 - \epsilon^2  \right) 
 62 | .\]
 63 | With the correct $ \lambda^* $, this is equivalent to
 64 | \[
 65 | \min_x \norm{ x}_1 + \lambda^* \norm{ Ax-b}_2^2
 66 | ,\]
 67 | because dropping the constant doesn't affect minimizer. This unconstrained problem is much nicer because the least squares is differentiable, whereas the original constraint is hard to project.
 68 | 
 69 | \end{eg}
 70 | 
 71 | Even if we don't know $ \lambda^* $,
 72 | \begin{enumerate}[label=(\arabic*)]
 73 | 	\item guess $ \lambda$, solve $ x = x(\lambda)$, check if the constraint is active, update $ \lambda$ (solve the dual problem).
 74 | 	\item often $ \epsilon$ is not known (hyper-parameter) and set via cross-validation so we can do cross-validation on $ \lambda$ directly (evaluate trade-off in modeling).
 75 | \end{enumerate}
 76 | We assume existence of saddle points here, which is given by the following:
 77 | \begin{prop}
 78 | Slater's on both primal and dual $ \implies$ existence of saddle points. 
 79 | \end{prop}
 80 | 
 81 | \subsection{Game Theory connection}
 82 | Consider a finite, 2-person, 0-sum game: "matrix game" (not Prisoner's dilemma). 
 83 | 
 84 | This involves the Minimax Theorem of Von Neumann.
 85 | \begin{eg}[rock-paper-scissors]
 86 | Player 1 wants to minimize and Player 2 wants to maximize utility. The payoff matrix looks like
 87 | \begin{table}[H]
 88 | 	\centering
 89 | 	\begin{tabular}{c||c|c|c}
 90 | 		&P&S&R\\
 91 | 		\hline
 92 | 		\hline
 93 | 		P&0&1&-1\\
 94 | 		\hline
 95 | 		S&-1&0&1\\
 96 | 		\hline
 97 | 		R&1&-1&0
 98 | 	\end{tabular}
 99 | 	\caption*{Row: Player 1; Column: Player 2}
100 | \end{table}
101 | 
102 | $ u^{T}Pv$ is the payoff, intuitively it means player 1 chooses a row and player 2 chooses a column. For a fair game, the payoff value is 0. Since $ P=-P^{T}$ is antisymmetric, it's fair. But in reality, $ u$ and  $ v$ actually encode the probabilities of choosing each row/column, which sum up to 1.
103 | 
104 | Define probability simplex $ \Delta = \{u:u\geq 0, \sum u_i = 1\} $.
105 | 
106 | \begin{case}[Player 2 knows player 1's strategy]
107 | If $ u$ is known, 
108 | Then the decision is easy: choose $ v \in \argmax_{v \in \Delta} u^{T}Pv$.
109 | 
110 | If Player 1 knows Player 2 knows Player 1's strategy, then Player 1 should select $ u$ to minimize Player 2's payoff:
111 |  \[
112 | p_1^* = \min_{u \in \Delta} \max_{v \in \Delta} u^{T}Pv
113 | .\]
114 | This is in fact a LP.
115 | \end{case}
116 | \begin{case}[Player 1 knows Player 2's strategy]
117 | \[
118 | p_2^* = \max_{v \in \Delta} \min_{u \in \Delta} u^{T}Pv
119 | .\] 
120 | \end{case}
121 | Intuitively, whoever has knowledge of opponent's move gets an edge, so the payoff when Player 2 has an edge in maximizing will be at least the payoff when Player 1 has an edge in minimizing. That is, $ p_1^* \geq p_2^* $. This is weak duality. Slater's condition for LP requires only a feasible point. Since $ \Delta$ is nonempty, we have strong duality $ p_1^* =p_2^* $.
122 | \end{eg}
123 | 
124 | \end{document}
125 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_20.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsection{Fenchel-Rockafellar Duality [BC17]}
 6 | 
 7 | \[
 8 | 	(P) \qquad \qquad \min_x f(x) + g(Ax)
 9 | ,\]
10 | where $ f,g \in \Gamma_0$ and allows $ +\infty$ values, and $ A$ is a  $ m\times n$ matrix. 
11 | \[
12 | 	(D) \qquad \qquad \min_{v} f^* (A^* v) + g^* (-v)
13 | .\]
14 | \subsubsection{Connections to Lagrangian duality}
15 | Recall
16 | \[
17 | 	f^* (y) = \sup_{x} \langle x,y \rangle -f(x)
18 | .\]
19 | 
20 | Take the same $ (P)$, recast as
21 |  \[
22 | 	 \min_{x,z} f(x) + g(z)\ s.t.\ z=Ax \quad  \implies \quad  \mathscr{L}(x,z,v)=f(x)+g(z)+ \langle z-Ax,v \rangle
23 | .\]
24 | Then the Lagrangian dual function $ h(v)$ is
25 | \begin{align*}
26 | 	h(v) = \inf_{x,z} \mathscr{L}(x,z,v) &= \inf_{x} (f(x)- \langle Ax,v \rangle) + \inf_{z} (g(z) + \langle z,v \rangle)\\
27 | 					     &= - \sup_{x} (\langle x,A^* v \rangle -f(x)) - \sup_{z}( - \langle z,v \rangle -g(z)) \\
28 | 					     &= -(f^* (A^* v) +g^* (-v))
29 | \end{align*}
30 | Thus, the Lagrangian and F-R dual problems only differ by a minus sign: \[ \max_v h(v) = -\min_v -h(v) = - \min_v f^* (A^* v) +g^* (-v).\]
31 | \subsubsection{Saddle-point interpretation}
32 | 
33 | If $ g \in \Gamma_0$, then $ g=g^{**}$, so $ g(x) = \sup_y \langle x,y \rangle -g^* (y)$. Using this fact we can rewrite the primal problem as
34 | \begin{align*}
35 | 	(P)\qquad  \min_x f(x) + g(Ax) &= \min_x \sup_v f(x) + \langle Ax,v \rangle - g^* (v) \\
36 | 				 &= \sup_v \min_x f(x) + \langle x,A^* v \rangle -g^* (v) \qquad \text{ if saddle point exists} \\
37 | 				 &= \sup_v -f^* (-A^* v) -g^* (v)\\
38 | 				 &= \sup_v -f^* (A^* v) - g^* (-v) \qquad \qquad (D)
39 | \end{align*}
40 | 
41 | \begin{prop}[18.9]
42 | 	Let $ f \in \Gamma_0( \mathcal{ H})$, if $ f^* $ is strictly convex, then $ f$ is (Gateaux) differentiable on  $ \inte \dom(f)$.
43 | \end{prop}
44 | \begin{prop}[18.15]
45 | If $ f$ is continuous and convex, then
46 | 
47 | $ f$ is (Frechet) differentiable and  $ \nabla f$ is $L$-Lipschitz continuous if and only if $ f^* $ is $ L^{-1}$ strongly convex,
48 | 
49 | and $ f \in \Gamma_0, f=f^{**} $.
50 | \end{prop}
51 | 
52 | \subsubsection{Algorithms}
53 | \begin{enumerate}[label=(\arabic*)]
54 | 	\item gradients: If I know $ \nabla g$, can I find $ \nabla (g \circ A)$? Yes. It's $ A^* (\nabla g \circ A)$.
55 | 	\item projections/proximity operators: Let $ C = \{x: \norm{ x}_2 \leq 1\} $ and $ C \circ A = \{x: \norm{ Ax}_2 \leq 1\} $. In general, if I know $ \prox_{g}$, I don't know $ \prox_{g \circ A}$. Here there is no chain rule nor linearity.
56 | \end{enumerate}
57 | \begin{remark}
58 | We can use the dual to shift the linear operator from the proximity term to the differentiable term.
59 | \end{remark}
60 | 
61 | \begin{thm}[15.23 generalized Slater]
62 | 	If $ 0 \in \ri (\dom g - A(\dom f))$ (CQ), then strong duality holds. That is,
63 | 	\[
64 | 		\inf_x f(x) + g(Ax) = -\min_{v} f^* (A^* v) + g^* (-v)
65 | 	,\]
66 | 	and the dual solution is attained.
67 | \end{thm}
68 | \begin{note}
69 | 	In finite dimensions, for CQ we just need to show $ \ri(\dom g) \cap A (\ri (\dom f))\neq \O$. Or, if $ f,g$ are polyhedral,  $ \dom g \cap A(\dom f) \neq \O$. This is essentially saying we want a strictly feasible point.
70 | \end{note}
71 | \end{document}
72 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_22.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \begin{remark}
  6 | 	Complementary slackness means if $ f_j(x^* )<0$, then this is an inactive constraint, since $ \lambda_j^* =0$, and
  7 | 	\begin{align*}
  8 | 		\mathscr{L}(x,\lambda^* ,\nu^* ) &= f_0(x)+ \sum_{i\neq j} \lambda_i^* f_i(x) + \sum \nu_i^* h_i(x) 
  9 | 	\end{align*}
 10 | This is only true if we have strong duality. In particular, it usually isn't true for non-convex problems.
 11 | \end{remark}
 12 | \begin{eg}[1]
 13 | Consider a convex case with inactive constraints:
 14 | \begin{align*}
 15 | \min_{x \in \rr}\quad &x \\
 16 | \text{subject to } \quad &x\geq 0 \\
 17 | &x\leq 1 \text{ this is not tight/active constraint} 
 18 | \end{align*}
 19 | We can just remove the inactive constraint and still get the same solution.
 20 | \end{eg}
 21 | \begin{eg}[2]
 22 | Consider a non-convex case with non-tight constraints:
 23 | \begin{align*}
 24 | \min_{x \in \rr}\quad &x \\
 25 | \text{subject to } \quad &x\geq 0 \qquad \text{ not tight} \\
 26 | &x^2 \geq 1
 27 | \end{align*}
 28 | as solution is $ x^* =1$. But if we remove the non-tight constraint $ x\geq 0$ here, we would get $ -\infty$ as the solution instead, so we can't just drop non-tight constraint for non-convex problems.
 29 | \end{eg}
 30 | 
 31 | \subsection{Meta-rules}
 32 | Suppose $ C \subseteq \rr^{n}$, possibly nonconvex.
 33 | \begin{enumerate}[label=(\arabic*)]
 34 | 	\item switch between min and max with double minus signs or between argmin and argmax with single minus sign (since we don't care about function value).
 35 | 	\item If $ \phi$ is strictly increasing on $ \im(f)$, then
 36 | 		\[
 37 | 			\argmin_{x \in C} \phi(f(x)) = \argmin_{x \in C} f(x)
 38 | 		.\]
 39 | 		\begin{eg}
 40 | 		$ \frac{1}{2} \norm{ Ax-b}^2 $ and $ \norm{ Ax-b} $.
 41 | 		\end{eg}
 42 | 	\item If we have all mins or all maxs, we can swap order
 43 | 		\[
 44 | 			\min_x \min_y f(x,y) = \min_y \min_x f(x,y) = \min_{x,y} f(x,y)
 45 | 		.\] 
 46 | 	\item If $ D \subseteq C$ where $ C$ can be seen as a relaxation, then
 47 | 		\[
 48 | 			\min_{x \in C} f(x) \leq \min_{x \in D} f(x)
 49 | 		.\]
 50 | 		And we can obtain a lower bound this way.
 51 | 	\item "superadditivity":
 52 | 		\[
 53 | 			\min_{x \in C} f(x)+g(x) \geq \min_{x \in C} f(x) + \min_{x \in C} g(x)
 54 | 		.\] 
 55 | \end{enumerate}
 56 | 
 57 | \begin{eg}[solving convex problems using KKT]
 58 | 	Recall that the solution to the least squares problem $ \min_x \frac{1}{2} \norm{ Ax-b}^2 $ when $ A$ has more rows than columns ($ m\geq n$) is
 59 |  \[
 60 | 	 x^* = \left( A^{T}A \right) ^{-1} A^{T}b
 61 | .\]
 62 | In the case when $ m<n$, the system is underdetermined so $ Ax=b$ has many solutions, so we instead want to find the solution with the least Euclidean norm (since we can add any vector from $ \ker A$ to arbitrarily inflate the norm of the solution):
 63 | \begin{align*}
 64 | \min\quad &\frac{1}{2}\norm{ x}^2  \\
 65 | \text{subject to } \quad &Ax=b
 66 | \end{align*}
 67 | And the solution is
 68 | \[
 69 | 	x^*  = A^{T}\left( AA^{T} \right) ^{-1} b
 70 | .\] 
 71 | To see this, consider the more general quadratic problem
 72 | \begin{align*}
 73 | 	\min\quad & \frac{1}{2} \langle x,Px \rangle + \langle q,x \rangle+ r, P\succeq 0  \\
 74 | 		  &Ax = b
 75 | \end{align*}
 76 | Note that we recover the problem when $ P=I, q=r=0$.
 77 | The Lagrangian is
 78 | \[
 79 | 	\mathscr{L}(x,\nu) = \frac{1}{2} \langle x,Px \rangle+ \langle q,x \rangle+ r+ \nu^{T}(Ax-b)
 80 | .\] 
 81 | The KKT conditions are the following:
 82 | \begin{enumerate}[label=(\arabic*)]
 83 | 	\item stationarity:
 84 | 		\[
 85 | 			\nabla _x \mathscr{L}(x,\nu) = Px+q+A^{T}\nu = 0
 86 | 		.\] 
 87 | 	\item primal feasibility:
 88 | 		\[
 89 | 		Ax=b
 90 | 		.\] 
 91 | 	\item dual feasibility: N/A.
 92 | 	\item Complementary slackness: N/A.
 93 | \end{enumerate}
 94 | Since the conditions are linear equations, we can combine them into a larger system of equations:
 95 | \[
 96 | 	\begin{pmatrix} P&A^{T}\\A&0 \end{pmatrix} \begin{pmatrix} x\\ \nu \end{pmatrix} = \begin{pmatrix} -q\\ b \end{pmatrix}  
 97 | .\]
 98 | So when $ P=I,q=r=0$, we have
 99 | \begin{align*}
100 | 	\begin{pmatrix} I&A^{T} \\ A&0 \end{pmatrix} \begin{pmatrix} x\\ \nu \end{pmatrix}&= \begin{pmatrix} 0\\ b \end{pmatrix}  \\
101 | 	\begin{pmatrix} x \\ \nu \end{pmatrix} &= \begin{pmatrix} A^{T}\left(A A^{T} \right) ^{-1} b \\-\left( AA^{T} \right)^{-1} b \end{pmatrix}  
102 | \end{align*}
103 | \end{eg}
104 | \begin{eg}
105 | Consider the problem
106 | \begin{align*}
107 | \min\quad &\frac{1}{2} \norm{ Ax-b}^2  \\
108 | \text{subject to } \quad & \mathbbm{1}^{T} x \leq \tau 
109 | \end{align*}
110 | The Lagrangian is
111 | \[
112 | 	\mathscr{L}(x,\lambda) = \frac{1}{2}\norm{ Ax-b}^2 + \lambda (\mathbbm{1}^{T}x - \tau) 
113 | .\]
114 | The KKT conditions are
115 | \begin{enumerate}[label=(\arabic*)]
116 | 	\item 
117 | 		\[
118 | 			\nabla _x \mathscr{L}(x,\lambda) = A^{T}(Ax-b) + \lambda \mathbbm{1} = 0
119 | 		.\] 
120 | 	\item $ \mathbbm{1}^{T}x - \tau \leq 0$.
121 | 	\item $ \lambda\geq 0$.
122 | 	\item $ \lambda = 0$ or $ \mathbbm{1}^{T} x -\tau = 0$.
123 | \end{enumerate}
124 | In the case when $ \lambda=0$, the problem reduces to least squares and we've already solved it. When $ \lambda \neq 0$ and $ \mathbbm{1}^{T}x  = \tau$ instead, we can solve it the following way:
125 | \begin{align*}
126 | 	x &= \left( A^{T}A \right) ^{-1} (A^{T}b - \lambda\mathbbm{1}) \\
127 | 	\tau=\mathbbm{1}^{T}x &= \mathbbm{1}^{T} \left( A^{T}A \right) ^{-1} (A^{T}b- \lambda \mathbbm{1})\\
128 | 	\lambda &= \frac{\mathbbm{1}^{T} \left( A^{T}A \right)^{-1} A^{T}b - \tau }{ \mathbbm{1}^{T} \left( A^{T}A \right)^{-1} \mathbbm{1} } 
129 | \end{align*}
130 | Note that $ \lambda$ is just a scalar.
131 | \end{eg}
132 | \end{document}
133 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_23.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsection{Perturbation and Sensitivity Analysis [BV 5.6]}
 6 | Given the primal problem
 7 | \begin{align*}
 8 | 	(P) \qquad p^* =\min\quad &f_0(x) \\
 9 | \text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m \\
10 | &h_i(x) = 0 , i = 1,\ldots,p
11 | \end{align*}
12 | and let's perturb this a little:
13 | \begin{align*}
14 | 	(P_{u,v}) \qquad p(u,v)=\min\quad &f_0(x) \\
15 | \text{subject to } \quad &f_i(x) \leq u_i, i = 1,\ldots,m \\
16 | &h_i(x) = v_i , i = 1,\ldots,p
17 | \end{align*}
18 | So $ p^* =p(0,0)$, what is $ p(u,v)$?
19 | 
20 | We can do a local analysis and find out the derivatives or a global analysis using a bound.
21 | 
22 | \subsubsection{global analysis}
23 | We can use Slater's condition (strict feasibility) to check strong duality and existence of optimal dual points.
24 | \begin{align*}
25 | 	p^* =p(0,0)&= d^*  \\
26 | 		   &= \sup_{\lambda\geq 0} g(\lambda,\nu)\\
27 | 		   &= g(\lambda^* ,\nu^* ) \\
28 | 		   &= \inf_{x} \mathscr{L}(x,\lambda^* ,\nu^* ) \\
29 | 		   &= \inf_{x} f_0(x) + \sum \underbrace{ \lambda_i^* }_{\geq 0} \underbrace{f_i(x)}_{\leq u_i} + \sum \nu_i^* \underbrace{ h_i(x)}_{v_i} \\
30 | 		   &\leq f_0(\overline{x})+\langle \lambda^* ,u \rangle + \langle \nu^* ,v \rangle\\
31 | 	p(u,v) = f_0(\overline{x}) &\geq p^* -\langle \lambda^* ,u \rangle - \langle \nu^* ,v \rangle 
32 | \end{align*}
33 | where $ \overline{x}$ is any point feasible for $ (P_{u,v})$; taking $ \overline{x}$ to be optimal for $ (P_{u,v})$ gives the last line.
34 | 
35 | \begin{case}[1]
36 | 	$ \lambda_i^* $ is large, then if we tighten $ i$th inequality constraint  $ (u_i<0)$,
37 | 	\begin{align*}
38 | 		p(u,0) \geq p^* - \lambda_i^* u_i
39 | 	\end{align*}
40 | 	So $ p(u,0)$ is much larger than  $ p^* $.
41 | \end{case}
42 | \begin{case}[2]
43 | $ \lambda_i^* $ is small, and we loosen $ i$th inequality constraint  $ u_i>0$, then
44 | \[
45 | 	p^* \geq p(u,0)\geq p^* - \underbrace{\lambda_i^* u_i}_{>0}
46 | .\] 
47 | So loosening constraint doesn't help much to reduce the minimum.
48 | \end{case}
49 | \begin{case}[3]
50 | 	$ \lambda_i^* =0$, for example when $ f_i(x)\leq 10^{6}$, the constraint is inactive.
51 | \end{case}
52 | \begin{case}[4]
53 | 	When $ \lambda_i^* $ is large, loosen $ (u_i)>0$, or if $ \lambda_i^* =0$, tighten $ (u_i<0)$, then the analysis can't help us.
54 | \end{case}
55 | \begin{case}[5]
56 | 	If $ \nu_i^* \gg 0, v_i<0$ or $ \nu_i^* \ll 0, v_i>0$, then $ p(0,v) \gg p^* $ and we have a big change.
57 | \end{case}
58 | \begin{case}[6]
59 | 	If $ |\nu_i^* | \ll 1$ and either $ \nu_i^* >0,v_i>0$ or $ \nu_i^* <0, v_i<0$, then $ p(0,v)$ doesn't decrease much.
60 | \end{case}
61 | \subsubsection{local sensitivity analysis}
62 | We can show that $ p(u,v)$ is a convex function of $ (u,v)$ (when $ (P)$ is convex) using the minimizing conditions. Recall that for a convex function,
63 |  \[
64 | 	 f(y) \geq f(x) + \langle \partial f(x) , y-x \rangle
65 | .\] 
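66 | Comparing with the global bound $ p(u,v) \geq p^* - \langle \lambda^* ,u \rangle - \langle \nu^* ,v \rangle$ above, we see that the negated dual variables $ (-\lambda^* ,-\nu^* )$ form a subgradient of $ p(u,v)$ at $ (0,0)$!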
67 | If $ p(u,v)$ is differentiable,
68 |  \[
69 | 	 \frac{\partial p(0,0)}{\partial u_i} = -\lambda_i^* , \quad \frac{\partial p(0,0)}{\partial v_i} =-\nu_i^*  
70 | .\] 
71 | This is symmetric! Now we can write the Taylor expansion:
72 | \[
73 | 	p(u,v) = p(0,0) +\langle \frac{\partial p}{\partial u} ,u \rangle + \langle \frac{\partial p}{\partial v} ,v \rangle + \text{ higher-order terms} 
74 | .\]
75 | This is only accurate with small perturbation.
76 | 
77 | In economics, dual variables are referred to as "shadow prices". In statistics, this is called the "score test".
78 | 
79 | \subsection{Generalized Inequalities [BV 5.9]}
80 | \begin{align*}
81 | \min\quad &f_0(x) \\
82 | \text{subject to } \quad &f_i(x) \preceq 0, f_i:\rr^{n} \to S^{m} \\
83 | &h_i(x) = 0 , i = 1,\ldots,p
84 | \end{align*}
85 | \begin{eg}[SDP]
86 | 	Punchline: we get analogous KKT conditions. Instead of $ \lambda_i\geq 0$ now we require $ \Lambda_i \succeq 0$.
87 | 
88 | 	Caveat: Before, if $ \lambda\geq 0, y \geq 0$ and $ \langle \lambda,y \rangle =0$, then $ \ \forall \ i, \lambda_i =0$ or $ y_i = 0$.
89 | 	However, in the matrix case, if $ \Lambda \succeq 0, Y=-f_i(x) \succeq 0$ and $ \langle \Lambda,Y \rangle = 0$, it does NOT mean $ Y=0$ or  $ \Lambda =0$.
90 | 
91 | 	But if $ \Lambda \succ 0, Y \succeq 0, \langle \Lambda,Y \rangle = 0 $, then $ Y=0$
92 | \end{eg}
93 | \end{document}
94 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_24.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \newpage
  6 | \chapter{Algorithms}
  7 | \newpage
  8 | \section{Unconstrained Optimization}
  9 | We assume reasonable smoothness of the objective. Here is an overview of the algorithms:
 10 | \begin{enumerate}[label=(\arabic*)]
 11 | 	\item gradient descent
 12 | 		\[
 13 | 			x_{k+1} = x_k - t \cdot  \nabla f(x_k), \quad t= \frac{1}{L}
 14 | 		.\] 
 15 | 		where $ L$ is the Lipschitz constant of the gradient.
 16 | 	\item Newton's method
 17 | 		\[
 18 | 			x_{k+1} = x_k - \left( \nabla ^2 f(x_k) \right)^{-1} \nabla f(x_k)
 19 | 		.\] 
 20 | 		This can reduce to gradient descent when we have $ \nabla ^2 f(x) \preceq L \cdot I$ and we just bound the Hessian with $ L \cdot I$.
 21 | 	\item Quasi-Newton
 22 | \end{enumerate}
 23 | \newpage
 24 | \subsection{Proximal gradient descent}
 25 | \begin{align*}
 26 | 	\min \underbrace{ f(x)}_{ \text{ smooth, strongly convex} } + \underbrace{ g(x)}_{ \text{simple, convex}}  
 27 | \end{align*}
 28 | Note we can add indicator function to $ g$:
 29 | \[
 30 | 	g(x) = I_{C}(x) + h(x)
 31 | ,\] 
 32 | \emph{i.e.} when we have constraint $ x \in C$.
 33 | \subsubsection{motivation}
 34 | We can try the first-order Taylor approximation of $ f$. However, recall minimizing a linear function would go to negative infinity, so we need to go to 2nd order.
 35 | \begin{align*}
 36 | 	x_{k+1} &= \argmin_{x} f(x_k) + \langle \nabla f(x_k),x-x_k \rangle + \frac{1}{2} L\norm{ x-x_k}_2^2 + g(x)\\
 37 | 	&= \argmin_x \frac{1}{L} \left(  \langle \nabla f(x_k),x-x_k \rangle + \frac{1}{2} L\norm{ x-x_k}_2^2 + g(x)\right)  \\
 38 | 	&= \argmin_x \frac{1}{2} \norm{ x-\left(x_k - \frac{1}{L} \nabla f(x_k)\right)}_2^2 + \frac{1}{L} \cdot g(x) \text{ complete the square and ignore constants}  \\
 39 | 	&= \argmin_x \frac{1}{2} \norm{ x-\widetilde{ x}}_2^2 + \frac{1}{L} \cdot g(x)  \\
 40 | 	&= \prox_{\frac{1}{L} \cdot g} (\widetilde{ x})
 41 | \end{align*}
 42 | Note that this solution is unique because we have strong convexity.
 43 | \subsubsection{algorithm}
 44 | \begin{align*}
 45 | 	x_{k+1} = \prox_{t g} (x_k - t \cdot \nabla f(x_k))\qquad t\text{ via line search or } t=\frac{1}{L} 
 46 | \end{align*}
 47 | \begin{remark}
 48 | 	If $ g(x)=0$, proximal operator is the identity function so it reduces to gradient descent.
 49 | \end{remark}
 50 | 
 51 | \begin{eg}
 52 | 	$ g(x) = I_C$. Then
 53 | 	 \begin{align*}
 54 | 		 \prox_{t g} (\widetilde{ x}) = \Proj_C(\widetilde{ x})
 55 | 	\end{align*}
 56 | 	Recall from linear algebra: if $ \Proj_V(\widetilde{ x})$ is the projection of $ \widetilde{ x} \to V$, then
 57 | 	\[
 58 | 		\widetilde{ x} =\Proj_V(\widetilde{ x}) + \Proj_{V^{\perp}}(\widetilde{ x})
 59 | 	.\]
 60 | 	We can generalize this result to \allbold{Moreau's decomposition}: 
 61 | 	\begin{align*}
 62 | 		\widetilde{ x} = \prox_g (\widetilde{ x}) + \prox_{g^* } (\widetilde{ x})
 63 | 	\end{align*}
 64 | \end{eg}
 65 | 	\begin{eg}
 66 | 	\[
 67 | 		\Proj_{\norm{ x}_{\infty} \leq 1}(\widetilde{ x}) = \widetilde{ x} - \prox_{\norm{ \cdot }_1 } (\widetilde{ x})
 68 | 	.\] 
 69 | 
 70 | 	\begin{align*}
 71 | 		\prox_{t \norm{ \cdot }_1 } (y) = \argmin_x \frac{1}{2} \norm{ x-y}_2 ^2 + t \norm{ x}_1 \text{ this is separable!}
 72 | 	\end{align*}
 73 | 	By Fermat's rule,
 74 | 	\begin{align*}
 75 | 		\prox_g (y) = \argmin \frac{1}{2}\norm{ x-y}^2 + g(x)\\
 76 | 		\implies 0 &\in x-y + \partial g(x)\\
 77 | 		y&\in x + \partial g(x)  \\
 78 | 		y & \in (I+\partial g)(x)  \\
 79 | 		x&\in \left( I+ \partial g \right)^{-1} (y)   \\
 80 | 		x&= \left( I+ \partial \norm{ \cdot }_1  \right)^{-1} y  \text{ unique solution by strong convexity}
 81 | 	\end{align*}
 82 | 	We derived earlier that the solution to $ \prox_{t \cdot \norm{ \cdot }_1 }$ is
 83 | 	\[
 84 | 		x= \sgn(y) \cdot \lfloor |y| - t\rfloor_{+}
 85 | 	.\] 
 86 | 	\end{eg}
 87 | 
 88 | \subsubsection{alternative derivation}
 89 | By Fermat
 90 | \begin{align*}
 91 | 	0 &\in \partial (f+g)(x)  \\
 92 | 	0 &\in \nabla  f(x) + \partial g(x) \text{ under CQ} \\
 93 | 	x &\in x +  \nabla  f(x) + \partial g(x) \\
 94 | 	x-\nabla f(x) &\in x + \partial g(x) = (I + \partial g)(x)  \\
 95 | 	x &= \left( I+ \partial g \right) ^{-1} (I - \nabla f)(x) \text{ fixed point eqn} \\
 96 | 	x_{k+1} &= \left( I+ \partial g \right) ^{-1} (I - \nabla f)(x_k)\\
 97 | 		&= \prox_g (x_k - \nabla f(x_k)) 
 98 | \end{align*}
 99 | If $ f = 0$, we get
100 |  \begin{align*}
101 | 	 x_{k+1} &= \prox_{t g}(x_k) \text{ here t is anything we want since }f=0 \\
102 | 		 &= \argmin t \cdot g(x) + \frac{1}{2} \norm{ x-x_k}^2  \\
103 | \end{align*}
104 | \begin{remark}
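105 | Forward Euler exactly corresponds to gradient descent, whereas backward Euler exactly corresponds to the proximal point step $ x_{k+1} = \prox_{t g}(x_k)$. Proximal gradient descent takes a forward (explicit) step on $ f$ followed by a backward (implicit) step on $ g$, and is thus also called the "forward-backward method".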
106 | \end{remark}
107 | \end{document}
108 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_25.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsubsection{Convergence of gradient descent}
 6 | 
 7 | ~\begin{thm}
 8 | 	Consider the problem 
 9 | 	\[
10 | 		f^* = \min_x f(x)
11 | 	.\] 
12 | 	$ f \in \Gamma_0(\rr^{n})$. We assume $ \nabla f$ is $L$-Lipschitz continuous. Choose $ t = \frac{1}{L}$. Then gradient descent with step size $ t$ converges with rate  $ \mathcal{ O}\left( \frac{1}{k} \right) $.
13 | \end{thm}
14 | \begin{proof}
15 | 	We wish to bound $ f(x_{k+1}) - f^* $ by the local linear and quadratic lower and upper bounds. $L$-Lipschitz continuous implies that $ \nabla ^2 f(x) \preceq L \cdot I$. Recall that $ x_{k+1} = x_k - \frac{1}{L} \nabla f(x_k)$.
16 | 	\begin{align*}
17 | 		f(x_{k+1}) &\leq f(x_k) + \langle \nabla f(x_k), x_{k+1}-x_k \rangle + \frac{L}{2} \norm{ x_{k+1}-x_k}^2 \\
18 | 			   &= f(x_k) - \frac{1}{2L} \norm{ \nabla f(x_k)}^2 \text{ descent method guarantees progress} \\
19 | 			   &\leq f^* +\langle \nabla f(x_k),x_k-x^*  \rangle -\frac{1}{2L} \norm{ \nabla f(x_k)}^2   \text{ by convexity} \\
20 | 			   &= f^* + \frac{L}{2} \left( \norm{ x_k-x^* } ^2 - \norm{ x_k -x^* -\frac{1}{L} \nabla f(x_k)}^2  \right)   \\
21 | 			   &= f^* + \frac{L}{2} \left( \norm{ x_k-x^* }^2 - \norm{ x_{k+1} - x^* }^2   \right)  \\
22 | 		\sum_{ i= 1}^{ k}  f(x_i) - f^* &\leq \frac{L}{2} \sum_{ i= 1}^{ k} \norm{ x_{i-1} - x^* } ^2 - \norm{ x_i - x^* }^2  \\
23 | 						&= \frac{L}{2} (\norm{ x_0 - x^* }^2 - \norm{ x_k - x^* }^2  ) \text{ telescope} \\
24 | 						&\leq \frac{L}{2} \norm{ x_0 - x^* }^2   \\
25 | 		f(x_{k}) - f^* &\leq \frac{1}{k} \sum_{ i= 1}^{ k} f(x_i) - f^* \leq \frac{L}{2k} \norm{ x_0-x^* }^2  \\
26 | 			       &= \mathcal{ O}\left( \frac{1}{k} \right) 
27 | 	\end{align*}
28 | \end{proof}
29 | 
30 | Question: is this the best we can do?
31 | \begin{enumerate}[label=(\arabic*)]
32 | 	\item Is our analysis tight? Yes.
33 | 	\item This is worst-case complexity.
34 | 	\item Are there similar methods (\emph{i.e.} first-order) with faster rates? More precisely, first-order method satisfies (Lanczos/CG):
35 | 		\[
36 | 			x_k \in \mathscr{L}_k = \text{ span} \{x_0, \nabla f(x_0), \nabla f(x_1), \ldots , \nabla f(x_{k-1})\} 
37 | 		.\]
38 | 		The answer is yes, by Nesterov 1983.
39 | \end{enumerate}
40 | 
41 | \begin{thm}[Nesterov 1983]
42 | 	For any 1st order method, there exists an $ f \in \Gamma_0(\rr^{n})$ with $ \nabla f$ $L$-Lipschitz continuous and \[ f(x_k)-f^*  \geq \frac{3}{32} \cdot \frac{L}{(k+1)^2} \cdot \norm{ x_0-x^* }^2 \text{ for } k\leq \frac{1}{2}(n-1)  \] and \[ \norm{ x_k - x^* }^2 \geq \frac{1}{8} \norm{ x_0 - x^* }^2 \]
43 | \end{thm}
44 | \begin{proof}
45 | \emph{Sketch}: The adversarial function is
46 | \begin{align*}
47 | 	f(x) &= \frac{L}{4} \left( \frac{1}{2} \langle x,Ax \rangle - \langle e_1,x \rangle \right), A= \begin{pmatrix} 2&-1&0&\ldots\\-1&2&-1&0\\ \ldots\\0&0&-1&2 \end{pmatrix}\\
48 | 	\nabla f(x) &= \frac{L}{4} (Ax - e_1)\\
49 | 	x^* &= A^{-1} e_1
50 | \end{align*}
51 | Assume $ x_0 = 0$ (we can shift). At $ x_k$, only the first $ k$ coordinates can be nonzero. Since $ A^{-1}$ is a dense matrix, $ x^* $ has all entries nonzero, so the difference $ \norm{ x_k - x^* } $ remains large.
52 | \end{proof}
53 | 
54 | \begin{thm}[Nesterov]
55 | \begin{align*}
56 | 	y_0 &= x_0\\
57 | 	x_{k+1} &= y_k-t_k \nabla f(y_k)\\
58 | 	y_{k+1} & = x_{k+1} + \frac{k}{k+3} (x_{k+1}-x_k)
59 | \end{align*}
60 | This has convergence rate of $ \mathcal{ O}\left(\frac{1}{k^2}\right)$.
61 | \end{thm}
62 | \begin{remark}
63 | 	Since we cannot do better than $ O\left( \frac{1}{k^2} \right) $ and this algorithm achieves it, it is optimal.
64 | \end{remark}
65 | 
66 | \end{document}
67 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_26.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsubsection{Gradient descent analysis with strong convexity}
 6 | 
 7 | What do we want to analyze? Error metrics.
 8 | 
 9 | \begin{enumerate}[label=(\arabic*)]
10 | 	\item $ f(x_{k+1}) - f(x_k), \norm{ x_{k+1} - x_k}, \norm{ \nabla f(x_k)} $: can always be practical termination criteria, although they might not be good.
11 | 	\item $ f(x_k) -f^* $: we can use this sometimes if we know $ f^* =0 $ or in the primal/dual problem when we can squeeze the gap between bounds.
12 | 	\item $ \norm{ x_k - x^* } $ : often can't use.
13 | \end{enumerate}
14 | 
15 | \begin{remark}
16 | 	If $ f(x_k) = \sum_{ j= 1}^{ k} \frac{1}{j}$, then $ f(x_{k+1}) - f(x_k) \to 0$, but $ f(x_k)$ does not converge since the harmonic series diverges! So a small change in objective value alone does not certify convergence.
17 | \end{remark}
18 | 
19 | \subsubsection{suboptimality bounds (see PDF handout)}
20 | If $ \nabla f$ is $L$-Lipschitz continuous, then 
21 | \begin{enumerate}[label=(\arabic*)]
22 | 	\item $ \norm{ \nabla f(x)} \leq L \norm{ x-x^* }_2  $.
23 | 	\item $ f(x) - f^* \leq \frac{L}{2} \norm{ x - x^* }^2 $ by the quadratic upper bound (using $ \nabla f(x^* )=0$).
24 | 	\item $ \norm{ \nabla f(x)}^2 \leq 2L (f(x) - f^* ) $.
25 | \end{enumerate}
26 | \begin{remark}
27 | 	By these bounds, bounding $ \norm{ x-x^* } $ is the nicest if possible but usually out of reach. The next nicest to bound is $ f(x) - f^* $. "$ x$ is an $ \epsilon$-sub-optimal point" means $ f(x) - f^* \leq \epsilon$. 
28 | \end{remark}
29 | 
30 | If $ f$ is  $ \mu$-strongly convex, then
31 | \begin{enumerate}[label=(\arabic*)]
32 | 	\item $ \norm{ x-x^* }^2 \leq \frac{2}{\mu} (f(x) - f^* ) $.
33 | 	\item $ \norm{ x-x^* } \leq \frac{1}{\mu} \norm{ \nabla f(x)} $.
34 | 	\item Polyak-Lojasiewicz (PL): $ f(x) - f^* \leq \frac{1}{2\mu} \norm{ \nabla f(x)}^2 $.
35 | \end{enumerate}
36 | 
37 | Recall from last time, we derive
38 | \[
39 | 	f(x_{k+1}) \leq f(x_k) - \frac{1}{2L} \norm{ \nabla f(x_k)}^2 
40 | .\] 
41 | So if we add $ \mu$-strongly convex to the assumption of gradient descent analysis, then
42 | \begin{align*}
43 | 	f(x_{k+1}) - f(x_k) &\leq - \frac{1}{2L} \norm{ \nabla f(x_k)}^2 \leq -\frac{\mu}{L} (f(x_k) -f^*) \text{ by PL}     \\
44 | 	f(x_{k+1}) &\leq f(x_k) - \frac{\mu}{L} (f(x_k) - f^* )\\ 
45 | 	f(x_{k+1}) - f^* &\leq \left( 1 - \frac{\mu}{L} \right) (f(x_k) - f^* )
46 | \end{align*}
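47 | Since $ \mu I \preceq \nabla^2 f \preceq LI$, we have $ 0 \leq \rho:= 1-\frac{\mu}{L} < 1$, so $ f(x_k) - f^* \leq \rho^{k} (f(x_0)-f^* )$. As in the contraction mapping theorem, this converges (linearly), and combining with the strong convexity bound gives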
48 | \begin{align*}
49 | 	\norm{ x_k-x^* }^2 \leq \frac{2}{\mu} \rho^{k} (f(x_0) - f^* )
50 | \end{align*}
51 | \begin{remark}
52 | $ \kappa = \frac{L}{\mu}$ is the condition number of the Hessian, \emph{i.e.} the largest singular value over the smallest.
53 | \end{remark}
54 | \subsubsection{Convergence rate}
55 | \begin{table}[H]
56 | 	\centering
57 | 	\begin{tabular}{c|c|c}
58 | 		rate& iteration number &example\\
59 | 		\hline
60 | 		\hline
61 | 		$ \mathcal{ O}\left( \frac{1}{k^{1 /4}} \right) $ & $ \mathcal{ O}\left( \frac{1}{ \epsilon^{4}} \right) $ & non-convex subgradient method\\
62 | 		\hline
63 | 		$ \mathcal{ O}\left( \frac{1}{\sqrt{k} } \right) $ & $ \mathcal{ O}\left( \frac{1}{ \epsilon^2} \right) $ & subgradient descent or SGD\\
64 | 		\hline
65 | 		$ \mathcal{ O}\left( \frac{1}{k} \right) $& $ \mathcal{ O}\left( \frac{1}{ \epsilon} \right) $ & gradient-descent with Lipschitz\\
66 | 		\hline
67 | 		$ \mathcal{ O} \left( \frac{1}{k^2} \right) $ & $ \mathcal{ O}\left( \frac{1}{ \sqrt{ \epsilon} } \right) $ & Nesterov acceleration\\
68 | 		\hline
69 | 		$ \mathcal{ O}(\rho^{k})$ & $ \mathcal{ O}\left( \log\left( \frac{1}{ \epsilon} \right)  \right) $& gradient descent with Lipschitz and strong convexity\\
70 | 		\hline
71 | 		$ \mathcal{ O}\left( \rho^{2^{k}} \right) $ & $ \mathcal{ O}\left( \log \log \left( \frac{1}{ \epsilon} \right)  \right) $ & Newton's method locally\\
72 | 	\end{tabular}
73 | \end{table}
74 | 
75 | \end{document}
76 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_27.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \newpage
 6 | \subsection{Linear conjugate gradient method}
 7 | CG solves (usually approximately) $ Ax =b$ if  $ A \succ 0$. More details and intuition can be found at \url{cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf}.
 8 | 
 9 | \begin{eg}
10 | 	Consider the least squares problem. Let $ \phi(x) = \frac{1}{2} x^{T} \widetilde{ A}^{T} \widetilde{ A}x - \widetilde{ b}^{T} \widetilde{ A}x + \frac{1}{2} \widetilde{ b}^{T}\widetilde{ b} =: \frac{1}{2} x^{T} A x - b^{T} x + \text{ const.} $. Here $ A \succeq 0$ always and $ A \succ 0$ if $ m\geq n$ and $ \widetilde{ A}$ has full column rank. Then minimizing $ \phi$ amounts to solving $ \nabla \phi(x) = Ax-b = 0$.
11 | 	\begin{note}
12 | 	Don't form the Gram matrix. Instead use LSQR method.
13 | 	\end{note}
14 | 
15 | 	One idea to $ \min \phi(x)$ is coordinate descent/alternating minimization. This slowly converges to the solution via a zigzag path. If we change to the eigenvector basis, it is guaranteed to converge in $ n$ steps. However, finding the eigenvector basis is just as expensive as solving the normal equation directly at  $ \mathcal{ O}(n^3)$.
16 | \begin{defn}[conjugate directions]
17 | $ \{p_i\} $ are conjugate directions if they are $ A$-orthogonal. That is, 
18 | \[
19 | \langle p_i |A|p_j \rangle := \langle p_i, A p_j \rangle = 0 \text{ if } i\neq j 
20 | .\] 
21 | \end{defn}
22 | 
23 | \begin{note}
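24 | If $ A \succ 0$ and we have $ n$ nonzero conjugate directions $ \{p_i\}_{i=0}^{n-1} $, they are linearly independent and hence form a basis. If $ p_i$ are orthogonal eigenvectors of a symmetric matrix  $ A$, then they are  $ A$-orthogonal.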
25 | \end{note}
26 | 
27 | Our goal is to find $ \{p_i\} $ more cheaply than eigenvectors.
28 | \end{eg}
29 | 
30 | \begin{thm}[conjugate direction method (abstract)]
31 | Assume $ \{p_i\}_{i=0}^{n-1} $ are conjugate directions. Then
32 | \begin{align*}
33 | 	x_{k+1} = x_k + \alpha_k p_k 
34 | \end{align*}
35 | where $ \alpha_k$ solves $ \min_{\alpha} \phi(x_k + \alpha p_k)$ which is exact line search. The solution to this 1D problem has a closed form:
36 | \begin{align*}
37 | 	\alpha_k = - \frac{\langle r_k | p_k \rangle}{ \langle p_k|A|p_k \rangle}
38 | \end{align*}
39 | where $ r_k = A x_k-b$. Then $ x_n = x^* $.
40 | \end{thm}
41 | 
42 | \begin{proof}
43 | Since $ \{ p_i\} $ is a basis, we can write
44 | \begin{align*}
45 | 	x^* -x_0 &= \sum_{ i= 0}^{ n-1} \sigma_i p_i \\
46 | 	p_k^{T} A(x^* -x_0)&= \sum_{ i= 0}^{ n-1} \sigma_i \langle p_k|A|p_i \rangle \\
47 | 	&= \sigma_k \langle p_k|A|p_k \rangle \\
48 | 	\sigma_k &= \frac{\langle p_k|A|x^* -x_0 \rangle}{ \langle p_k|A|p_k \rangle} \ \forall \ k
49 | \end{align*}
50 | Moreover,
51 | \begin{align*}
52 | 	x_k - x_0 &= \sum_{ i= 0}^{ k-1} \alpha_i p_i \\
53 | 	p_k^{T} A(x_k - x_0) &= \sum_{ i= 0}^{ k-1} \alpha_i \langle p_k|A|p_i \rangle \\
54 | 	\langle p_k|A|x_k-x_0 \rangle &= 0
55 | \end{align*}
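56 | Hence we may replace $ x_0$ by $ x_k$ in the numerator of $ \sigma_k$ without changing its value, and since $ A(x^* -x_k) = b - Ax_k = -r_k$,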
57 | \begin{align*}
58 | 	\sigma_k = \frac{\langle p_k|A|x^* -x_k \rangle}{ \langle p_k|A|p_k \rangle} = \alpha_k
59 | \end{align*}
60 | Therefore, $ x_n = x^* $ since they have the same expression in the basis.
61 | \end{proof}
62 | \begin{remark}
63 | We can think of this process as either building up $ x^* $ component-by-component or cutting the error  $ x^* -x_k$ component-by-component.
64 | \end{remark}
65 | Facts:
66 | \begin{itemize}
67 | 	\item $ r_{k+1} = r_k + \alpha_k A p_k$ 
68 | 	\item $ \langle r_k, p_i \rangle=0, i<k$.
69 | 	\item $ x_k$ minimizes $ \phi$ over $ x_0 + K(r_0,k)$, where $ K(r_0,k) = \text{ span}\{r_0, A r_0,\ldots,A^{k-1} r_0\}  $ is the Krylov subspace. Note that, generically, $ K(r_0,n) = \rr^{n}$.
70 | 	\item $ \langle r_k,r_i \rangle=0, i<k$.
71 | 	\item $ p_k, r_k \in K(r_0,k+1)$.
72 | \end{itemize}
73 | 
74 | \begin{thm}[conjugate gradient]
75 | Given arbitrary $ x_0$, $ r_0 = A x_0 - b$, $ p_0 = -r_0$. Compute iteratively
76 | \begin{align*}
77 | 	\beta_k &= \frac{\langle r_k|A|p_{k-1} \rangle}{ \langle p_{k-1}|A|p_{k-1} \rangle}, \text{ chosen s.t. }  \langle p_k|A|p_{k-1} \rangle=0 \\
78 | p_k &= -r_k + \beta_k p_{k-1}\\
79 | \alpha_k &= - \frac{\langle r_k|p_k \rangle}{ \langle p_k|A|p_k \rangle} \\
80 | 	x_{k+1} &= x_k + \alpha_k p_k \\
81 | 	r_{k+1} &= r_k + \alpha_k A p_k
82 | \end{align*}
83 | The magic is that $ \langle p_k|A|p_i \rangle=0 \ \forall \ i\leq k-1$ (see Nocedal and Wright for proof). 
84 | 
85 | The cost is one matrix-vector multiply per step.
86 | \end{thm}
87 | \begin{thm}[convergence of CG]
88 | \begin{align*}
89 | 	\norm{ x_k - x^* }_A \leq 2 \left( \frac{\sqrt{\kappa}-1 }{\sqrt{\kappa} +1 } \right)^{k} \norm{ x_0 - x^* }_A 
90 | \end{align*}
91 | where $ \kappa = \kappa(A)$ is the condition number of  $ A$.
92 | \end{thm}
93 | \end{document}
94 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_28.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsection{non-linear conjugate gradient}
 6 | Recall that in linear CG, $ \phi(x)$ is quadratic and $ \nabla \phi(x) = Ax-b = r$ the residual. In non-linear, neither holds true, but we can replace the parts that no longer hold with more general expressions:
 7 | \begin{thm}[non-linear CG]
 8 | \begin{align*}
 9 | 	\alpha_k &\approx \argmin_{\alpha} \phi(x_k+ \alpha p_k) &&\text{ linesearch} \\
10 | 	x_{k+1} &= x_k + \alpha_k p_k\\
11 | 	\beta_{k+1}^{FR} &= \frac{\norm{ \nabla \phi_{k+1}}^2 }{\norm{ \nabla \phi_k}^2  } \\
12 | 	p_{k+1} &= -\nabla \phi_{k+1} + \beta_{k+1}^{FR} p_k
13 | \end{align*}
14 | \end{thm}
15 | \begin{remark}
16 | 	~\begin{itemize}
17 | 		\item it can be fast, but has to be unconstrained optimization. It might work very well on "almost quadratic" functions.
18 | 		\item too sensitive to parameters: Hager+Zhang has the most advanced version which uses a lot of parameters.
19 | 		\item quasi-Newton methods can be just as fast and more stable.
20 | 	\end{itemize}
21 | \end{remark}
22 | \newpage
23 | \subsection{Quasi-Newton Methods}
24 | \begin{remark}
25 | This is the gold-standard for large-scale, smooth, unconstrained optimization.
26 | \end{remark}
27 | 
28 | \begin{defn}[quadratic approximation]
29 | \begin{align*}
30 | 	m_k(p) &= f_k + \langle \nabla f_k,p \rangle + \frac{1}{2} \langle p|B_k|p \rangle, \quad B_k \succ 0 \\
31 | 	\widetilde{ x}_{k+1} &= x_k +p_k , \quad p_k = \argmin_p m_k(p) = - B_k^{-1} \nabla f_k
32 | \end{align*}
33 | Sometimes we do a linesearch, $ x_{k+1} = x_k + \alpha_k p_k $.
34 | \end{defn}
35 | \begin{eg}
36 | Gradient descent is when $ B_k = L \cdot I.$
37 | \end{eg}
38 | \begin{eg}
39 | 	Newton's method is when $ B_k = \nabla^2 f(x_k)$.
40 | \end{eg}
41 | 
42 | \begin{thm}[Quasi-Newton]
43 | Using the quadratic approximation framework, choose $ B_k$ s.t. 
44 | \begin{enumerate}[label=(\arabic*)]
45 | 	\item $ B_k$ is more accurate than $ L \cdot I$
46 | 	\item $ B_k$ is cheaper than Newton (in terms of forming and inverting).
47 | \end{enumerate}
48 | \end{thm}
49 | 
50 | The main trick is to construct $ B_{k+1}$ by updating $ B_k$.
51 | 
52 | To find $ x_{k+2}$,
53 | \begin{align*}
54 | 	m_{k+1}(p) &= f_{k+1} + \langle \nabla f_{k+1},p \rangle + \frac{1}{2} \langle p|B_{k+1}|p \rangle\\
55 | 	\nabla m_{k+1} (p) &= \nabla f_{k+1} + B_{k+1} p  
56 | \end{align*}
57 | Thus, $ \nabla m_{k+1} (0) = \nabla f_{k+1}$.
58 | 
59 | Then the following conditions are automatically satisfied:
60 | \begin{enumerate}[label=(\arabic*)]
61 | 	\item $ m_{k+1} (0) = f_{k+1}$ 
62 | 	\item $ \nabla m_{k+1} (0) = \nabla f_{k+1}$
63 | \end{enumerate}
64 | This gives us freedom in choosing $ B_{k+1}$. Let $ s_k = x_{k+1} - x_k$ and $ y_k = \nabla f_{k+1} - \nabla f_k$. Moreover, let's impose
65 | \begin{align*}
66 | 	\nabla m_{k+1} (-s_k) &= \nabla f_{k+1} - B_{k+1} s_k = \nabla f_k\\
67 | B_{k+1} s_k &= y_k \qquad \text{ secant equation} 
68 | \end{align*}
69 | Most Quasi-Newton methods choose $ B_{k+1}$ to satisfy the secant equation.
70 | 
71 | Notice that we want $ B_{k+1} \succ 0$. Then the secant equation gives $ \langle s_k|B_{k+1}|s_k \rangle = \langle s_k, y_k \rangle >0$. This is a necessary condition called the \allbold{curvature condition}. That is,
72 | 		\[
73 | 		\langle x_{k+1} - x_k, \nabla f_{k+1} - \nabla f_k \rangle > 0
74 | 		.\] 
75 | 		This is strictly monotone, which is satisfied when $ f$ is strictly convex. If $ f$ isn't strictly convex, it would complicate quasi-Newton method ( \emph{e.g.} might need to add a line search).
76 | 
77 | 		$ x \in \rr^{n}, B \in \rr^{n \times n}$, so $ B$ has  $ n(n+1) /2$ degrees of freedom, and $ n$ constraints from the secant equation.
78 | 
79 | 		 \begin{eg}
80 | 			 When $ n=1$, degree of freedom and constraint are both 1, $ B_{k+1}$ is completely determined. This is called the \emph{secant method}.
81 | 		\end{eg}
82 | When $ n>1$,  $ B$ is underdetermined. The standard ways to choose  $ B$ are
83 |  \begin{align*}
84 | 	 B_{k+1} &= \argmin_{B \succ 0, B s_k = y_k} \norm{ B-B_k}_w^2 \\
85 | 	 \text{ or } B_{k+1}^{-1} &= \argmin_{B^{-1} \succ 0, B^{-1}y_k=s_k} \norm{ B^{-1}-B_k^{-1}}_w^2  
86 | \end{align*}
87 | where $ \norm{ \cdot }_w $ is some norm chosen so the problem has a closed-form solution. The class of methods on choosing $ B$ is called the  \allbold{Broyden class}. 
88 | 
89 | \begin{notation}
90 | 	$ B$ approximates  $ \nabla ^2 f$, and $ H$ approximates  $ (\nabla ^2 f)^{-1}$. That is, $ B_k ^{-1} = H_k$.
91 | \end{notation}
92 | 
93 | Observe that it's cheaper to just approximate the inverse Hessian, although it is actually not an issue because $ B_{k+1}$ is a low-rank update of $ B_k$, so we can use Sherman-Morrison-Woodbury formula to obtain the inverse very cheaply.
94 | \end{document}
95 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_29.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsubsection{BFGS}
 6 | Using a specific weighted norm that satisfies the secant equation:
 7 | \begin{thm}[BFGS]
 8 | \begin{align*}
 9 | 	H_{k+1} = \left( I- \rho_k s_k y_k^{T}  \right) H_k(I- \rho_k y_k s_k^{T}) + \rho_k s_k s_k^{T}, \quad \rho_k = \frac{1}{\langle y_k,s_k \rangle}
10 | \end{align*}
11 | \end{thm}
12 | 
13 | \begin{remark}
14 | 	~\begin{itemize}
15 | 		\item Iteration count: win (compared to gradient descent)
16 | 		\item Flops: $ \mathcal{ O}(n^2) < \mathcal{ O} (n^3)$ : win (compared to Newton)
17 | 		\item Memory: $ \mathcal{ O}(n^2)$ : loss (same as Newton)
18 | 	\end{itemize}
19 | \end{remark}
20 | 
21 | \begin{thm}[L-BFGS]
22 | \begin{align*}
23 | 	H_{k+1} &= V_k^{T} H_k V_k + \rho_k s_k s_k^{T}\\
24 | 	H_{k+1} (w) &= V_k^{T} H_{k} (V_k w)+ \rho_k s_k s_k^{T} w
25 | \end{align*}
26 | where $ V_k = I - \rho_k y_k s_k^{T}$. Computing $ z :=V_k w$ uses only $ y_k, s_k$, and $ \rho_k s_k s_k^{T} w$ likewise uses only $ y_k, s_k$. Both are cheap. Then $ H_k (z)$ depends on $ y_{k-1}, s_{k-1}, H_{k-1}$. We could do this recursively all the way down to $ H_0$. Instead we stop at the $ (k-m)$th term. That is,
27 |  \begin{align*}
28 | 	H_{k-m} = \frac{\langle y_{k-m},s_{k-m} \rangle}{ \norm{ y_{k-m}}^2 } \cdot I
29 | \end{align*}
30 | We can start with gradient descent to initialize. That is, $ H_0 = \frac{1}{L} \cdot I$.
31 | \end{thm}
32 | Then the storage becomes $ 2(m+1) n $. Commonly choose $ m$ between 3 and 20, \emph{i.e.} $ m \in \{3,\ldots,20\} $.
33 | 
34 | \begin{remark}
35 | 	Usually $ B_k \not\to \nabla ^2f(x^* )$. 
36 | \end{remark}
37 | 
38 | \begin{thm}[convergence]
39 | 	If $ 0 \prec \mu I \preceq \nabla ^2 f(x) \preceq L \cdot I$, then BFGS converges, and usually superlinearly.
40 | \end{thm}
41 | 
42 | Open question: if $ f$ is non-convex, does BFGS converge to a stationary point?
43 | 
44 | \begin{remark}
45 | If $ m = 0$, ``memoryless'' BFGS plus exact linesearch yields nonlinear CG.
46 | \end{remark}
47 | 
48 | \begin{remark}
49 | What if we have constraints? Recall that for gradient descent we can do proximal/projected gradient descent. That is,
50 | \[
51 | 	x_{k+1}= \Proj(x_k - t \cdot \nabla f(x_k))
52 | .\] 
53 | Can we do the same thing for any quasi-Newton method? 
54 | \[
55 | 	x_{k+1} = \Proj_{B_k} (x_k - B_k^{-1} \nabla f_k)
56 | .\] 
57 | This is usually not feasible since the scaled projection is hard to compute.
58 | \end{remark}
59 | \newpage
60 | \subsection{Newton's methods}
61 | Let $ \Delta x = \nabla ^2 f(x_k) ^{-1} \nabla f(x_k)$.
62 | \begin{enumerate}[label=(\arabic*)]
63 | 	\item computational: "inexact-Newton", "matrix-free", "truncated-Newton", or "Newton-CG" mean approximate $ \Delta x$. That is, we wish to solve
64 | 		\begin{align*}
65 | 			\nabla ^2 f(x_k) \cdot \Delta x = \nabla f(x_k)
66 | 		\end{align*}
67 | 		We can solve this with linear CG with only a few steps (adaptive). We can use preconditioners such as incomplete Cholesky or BFGS. 
68 | 
69 | 		Often the Hessian is structured and we can exploit that in computing the Hessian-vector products that CG needs.
70 | 	\item convergence:
71 | 
72 | 		In practice, we use a linesearch or even better a trust-region to "globalize". We wish to avoid bad saddle points.
73 | 
74 | 		For trust-region, we minimize a quadratic model.
75 | \end{enumerate}
76 | \end{document}
77 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_30.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsubsection{Trust-region for nonconvex}
 6 | \begin{align*}
 7 | 	x_{k+1} &= x_k + p_k \\
 8 | 	&= x_k + \argmin_{p} \langle \nabla f_k, p \rangle + \frac{1}{2} \langle p|B_k|p \rangle
 9 | \end{align*}
10 | If $ B_k \succ 0$, then this is a convex quadratic, so setting the gradient to 0 is a sufficient condition for optimality. Then the minimizer is the Newton step $ p_k = -B_k^{-1} \nabla f_k$. If not, the Newton step isn't the minimizer (the model may even be unbounded below), and we have to use a trust region.
11 | \begin{align*}
12 | 	x_{k+1} &= x_k + \argmin_{p} \langle \nabla f_k, p \rangle + \frac{1}{2} \langle p|B_k|p \rangle\\
13 | 	s.t. \quad  & \norm{ p} \leq \Delta \iff \frac{1}{2}\norm{ p}^2 \leq \frac{1}{2} \Delta^2 
14 | \end{align*}
15 | \begin{remark}
16 | If $ B_k$ is indefinite, we see that if we pretend the gradient term isn't there, then the quadratic form is minimized by the leftmost eigenvector (associated with the negative-most eigenvalue) of $ B_k$, scaled to the trust region radius. See N+W for tricks to solve.
17 | \end{remark}
18 | The KKT conditions are necessary.
19 | \begin{align*}
20 | 	\mathscr{L}(p, \lambda) = \langle \nabla f_k,p \rangle + \frac{1}{2} \langle p|B_k|p \rangle + \frac{1}{2} \lambda (\norm{ p}^2 - \Delta^2 )
21 | \end{align*}
22 | Stationarity:
23 | 		\begin{align*}
24 | 			\nabla f_k + B_k p + \lambda p &= 0 \\
25 | 			p &= -\underbrace{ (B_k + \lambda I)^{-1} }_{ \text{regularization} }  \nabla f_k
26 | 		\end{align*}
27 | \begin{remark}
28 | Typical: trust-region methods can sometimes guarantee a local minimizer even if the problem isn't convex.
29 | \end{remark}
30 | \begin{remark}
31 | Alternatively, we can use
32 | \begin{itemize}
33 | 	\item cubic regularization
34 | 	\item perturbed gradient descent
35 | \end{itemize}
36 | \end{remark}
37 | \newpage
38 | \subsection{Nonlinear least squares}
39 | The objective is
40 | \begin{align*}
41 | 	f(x) &= \frac{1}{2} \sum_{ j= 1}^{ m} r_j^2 (x), \quad  r_j: \rr^{n} \to \rr \\
42 | 	     &:= \frac{1}{2} \norm{ R(x)} ^2, \quad R: \rr^{n} \to \rr^{m} 
43 | \end{align*}
44 | This is perhaps the most common in engineering and sciences. We use squares here not only because it's easier to differentiate but also because if our data have Gaussian noise, then this becomes the maximum likelihood estimation.
45 | 
46 | Let the Jacobian of $ R$ be  $ J(x)$.
47 |  \begin{align*}
48 | 	 J(x)_{i,j} &= \frac{\partial r_i}{\partial x_j}  \\
49 | 	 J(x) &= \begin{pmatrix} \nabla r_1(x)^{T}\\ \vdots\\ \nabla r_m(x)^{T} \end{pmatrix} \\
50 | 	 \nabla f(x) &= \nabla \left( \frac{1}{2} \sum_{ j= 1}^{ m} r_j^2(x) \right)  \\
51 | 		     &= \frac{1}{2} \sum_{ j= 1}^{ m} \nabla (r_j^2(x)) && \text{ linearity}  \\
52 | 		     &= \frac{1}{2} \sum_{ j= 1}^{ m} 2 r_j(x) \nabla r_j(x) \\
53 | 		     &= J(x)^{T} R(x) \\
54 | 	 \nabla ^2f(x) &= J(x)^{T}J(x) + \sum_{ j= 1}^{ m} r_j(x) \cdot  \nabla ^2 r_j(x)
55 | \end{align*}
56 | In the linear least squares case $ r_j(x) = a_j^{T} x -b_j$, we have $ \nabla ^2 r_j = 0$, so $ \nabla ^2f(x) = A^{T}A$ just as we expect.
57 | 
58 | \subsubsection{Gauss-Newton method}
59 | \begin{align*}
60 | 	x_{k+1} = x_k - B_k^{-1} \nabla f_k
61 | \end{align*}
62 | where $ B_k = J_k^{T} J_k$. So we ignore the sum term to approximate the Hessian. This is worse than Newton but better than gradient descent's constant $ L \cdot I$ approximation, and we get the approximation "for free" as we need to compute $ J(x)$ for the gradient anyway. Although inverting it can get expensive.
63 | 
64 | Another derivation:
65 | \begin{align*}
66 | 	R(x) &\approx R(x_k) + J(x_k) (x-x_k) && \text{ 1st order Taylor}\\ 
67 | 	f(x) &= \frac{1}{2} \norm{ R(x)} ^2 \\
68 | 	     &\approx \frac{1}{2} \norm{ R_k + J_k(x-x_k)}^2  && \text{ linear ls model!} \\
69 | 	x_{k+1}&= (J_k^{T} J_k)^{-1} J_k^{T} (J_k x_k - R_k) &&\text{ normal eq} \\
70 | 	       &= x_k - (J_k^{T} J_k)^{-1} J_k^{T} R_k\\
71 | 	       &= x_k- (J_k^{T} J_k)^{-1} \nabla f_k \\
72 | \end{align*}
73 | \subsubsection{Levenberg-Marquardt}
74 | 
75 | This is Gauss-Newton with a trust-region.
76 | 
77 | Softwares:
78 | 
79 | \begin{itemize}
80 | 	\item Matlab: lsqnonlin
81 | 	\item python: scipy.optimize.least\_squares, lmfit (modeling)
82 | \end{itemize}
83 | \end{document}
84 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_31.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \section{Methods for constrained problems}
  6 | \subsubsection{Special nice constraints }
  7 | \emph{e.g.} $ x\geq 0, \ell_i \leq x_i \leq u_i, \norm{ x-x_0}\leq 3 $
  8 | \begin{enumerate}[label=(\arabic*)]
  9 | 	\item projected gradient descent (with Nesterov acceleration).
 10 | 	\item active-set methods ( \emph{e.g.} L-BFGS-B).
 11 | \end{enumerate}
 12 | \begin{eg}
 13 | $ x \in \rr^{2}, 0\leq x_i \leq 1$. Suppose $ x_1^{k} = 1, x_2^{k} = .3 $. Then for step $ k+1$, keep $ x_1^{k+1} = x_1^{k}$ unchanged (its bound is active), ignore the constraint on $ x_2$, take an L-BFGS step in $ x_2$, and check that the result satisfies the constraint.
 14 | \end{eg}
 15 | 
 16 | \subsubsection{not-so-nice constraints}
 17 | \emph{e.g.} $ g(x) \leq 0$ where $ g$ is non-linear.
 18 |  \begin{enumerate}[label=(\arabic*)]
 19 | 	\item penalty methods: both convex and non-convex
 20 | 	\item augmented Lagrangian: both
 21 | 	\item sequential quadratic programming (SQP): both
 22 | 	\item ADMM (alternating direction method of multipliers, also applied to non-convex) and DR (Douglas-Rachford): convex
 23 | 	\item Primal Dual methods: mostly convex
 24 | 	\item Interior-point methods (IPM): convex. IPOPT: non-convex.
 25 | \end{enumerate}
 26 | 
 27 | \subsection{penalty methods}
 28 | 
 29 | \begin{align*}
 30 | \min\quad &f_0(x) \\
 31 | \text{subject to } \quad &f_i(x) \leq 0, i = 1,\ldots,m \\
 32 | &h_i(x) = 0 , i = 1,\ldots,p
 33 | \end{align*}
 34 | \begin{eg}
 35 | \begin{align*}
 36 | \min\quad &\norm{ x}_1  \\
 37 | \text{subject to } \quad & \norm{ Ax-b}_2 ^2 \leq \epsilon^2 
 38 | \end{align*}
 39 | Then
 40 | \begin{align*}
 41 | 	\mathscr{L}(x,\lambda) = \norm{ x}_1 + \lambda ( \norm{ Ax-b}^2 - \epsilon^2 ) 
 42 | \end{align*}
 43 | If strong-duality holds and there exists saddle points,
 44 | \begin{align*}
 45 | 	x^*  \in \argmin_{x} \mathscr{L}(x, \lambda^* )
 46 | \end{align*}
 47 | \end{eg}
 48 | We typically use the \allbold{quadratic penalty}:
 49 | 
 50 | For equality constraints, define
 51 | \begin{align*}
 52 | 	Q_{\mu} (x) = f_0(x) + \frac{\mu}{2} \sum_{ i= 1}^{ p} h_i(x)^2
 53 | \end{align*}
 54 | Solve
 55 | \begin{align*}
 56 | 	x^{(k)} &= \argmin Q_{\mu_k}(x)\\
 57 | 	\text{ update } &\mu_{k+1} \text{ increasing}\\
 58 | 	x^{(k+1)} &= \argmin_{x} Q_{\mu_{k+1}} (x)
 59 | \end{align*}
 60 | This is a warm-start with $ x^{(k)}$.
 61 | \begin{thm}
 62 | 	Suppose $ \mu_k \to \infty$. If $ (x^{(k)})$ has a limit point $ x^* $, then $ x^* $ is optimal.
 63 | \end{thm}
 64 | \begin{note}
 65 | 	No convexity is needed but usually need convexity to update $ x^{(k+1)}$.
 66 | \end{note}
 67 | 
 68 | For inequality constraints, define
 69 | \begin{align*}
 70 | 	Q_{\mu}(x) &= f_0(x) + \frac{\mu}{2} \left( \sum_{ i= 1}^{ p} h_i^2(x) + \sum_{ i= 1}^{ m} \lfloor f_i(x) \rfloor_{+}^2 \right) 
 71 | \end{align*}
 72 | \begin{note}
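 73 | Here $ \lfloor t \rfloor_{+} = \max(t,0)$ denotes the positive part, not the floor function. It usually makes $ Q_{\mu}$ less smooth than $ f_0$ and $ f_i$, since the squared positive part is differentiable only once.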
 74 | \end{note}
 75 | \begin{remark}
 76 | The general idea is to put constraints into the objective:
 77 | \begin{align*}
 78 | 	\min f_0(x), \quad x \in C \implies \min f_0(x) + g(x)
 79 | \end{align*}
 80 | \end{remark}
 81 | Methods
 82 | \begin{enumerate}[label=(\arabic*)]
 83 | 	\item $ g(x) = I_{C}(x)$ : this is mathematically equivalent but no computational benefit
 84 | 	\item penalty: 
 85 | 		\begin{align*}
 86 | 			g_{\mu}(x) =
 87 | 			\begin{cases}
 88 | 				\mu \cdot x^2 & x<0\\
 89 | 				0 & x\geq 0
 90 | 			\end{cases}
 91 | 		\end{align*}
 92 | 		When $ \mu \to \infty$, the smooth quadratic penalty converges to the non-smooth infinite barrier.
 93 | 	\item barrier:
 94 | 		\begin{align*}
 95 | 			g_{\mu}(x) = -\frac{1}{\mu} \cdot \log x
 96 | 		\end{align*}
 97 | 		The barrier is not defined for $ x\leq 0$, so it forces the solution to stay strictly feasible.
 98 | \end{enumerate}
 99 | 
100 | \begin{remark}
101 | Drawback: QP is often ill-conditioned as $ \mu \to \infty$.
102 | \end{remark}
103 | \begin{eg}
104 | \begin{align*}
105 | \min\quad &\frac{1}{2} x^{T}Px \\
106 | &Ax=b, A \in \rr^{m\times n}, m<n\\
107 | Q_{\mu}(x) &= \frac{1}{2} x^{T}Px + \frac{\mu}{2} \norm{ Ax-b}^2\\ 
108 | 0 = \nabla Q &= Px + \mu A^{T}(Ax-b) \\
109 | x&= \mu (P+ \mu A^{T}A)^{-1} A^{T}b
110 | \end{align*}
111 | Since the condition number of the matrix usually depends on $ \mu$, as $ \mu \to \infty$ the condition number also becomes very large.
112 | \end{eg}
113 | 
114 | In addition to quadratic penalty, we can use exact penalty: use $ |h_i(x)|$ instead. It destroys smoothness.
115 | 
116 | \end{document}
117 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_32.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsection{Augmented Lagrangian}
  6 | 
  7 | Assume we only have equality constraints.
  8 | \begin{align*}
  9 | 	(P) \qquad \qquad \min\quad &f_0(x) \\
 10 | &h_i(x) = 0 , i = 1,\ldots,p
 11 | \end{align*}
 12 | 
 13 | If I knew optimal Lagrange multipliers, then 
 14 | \begin{align*}
 15 | 	x^* \in \argmin_x \mathscr{L}(x, \nu^* ) = \argmin_x f_0(x) + \sum_i \nu_i^* h_i(x)
 16 | \end{align*}
 17 | and turn this into an unconstrained problem. We can try searching for $  \nu^* $:
 18 | \begin{align*}
 19 | 	g(\nu) &= \min_{x} \mathscr{L}(x,\nu)
 20 | \end{align*}
 21 | and we try to maximize $ g(\nu)$/dual ascent. We can use gradient ascent: 
 22 | \begin{align*}
 23 | 	\nu_{k+1} = \nu_k + \eta_k \cdot \nabla g(\nu_k)
 24 | \end{align*}
 25 | \begin{prop}
 26 | 	If $ g(\nu) = \inf_x \mathscr{L}(x,\nu)$ and $ x_{\nu} = \argmin \mathscr{L}(x,\nu)$ is a single point under some reasonable conditions,
 27 | 	\begin{align*}
 28 | 		\nabla g(\nu) = \nabla _\nu \mathscr{L}(x_{\nu}, \nu)
 29 | 	\end{align*}
 30 | \end{prop}
 31 | 
 32 | However, this problem might not have a minimizer unless $ \nu$ is optimal (can go to negative infinity \emph{e.g.} linear objective). Then there is no gradient. Augmented Lagrangian aims to fix this. The problem becomes:
 33 | 
 34 | \begin{align*}
 35 | 	(P_{\mu}) \qquad \qquad \min\quad &f_0(x) + \frac{\mu}{2} \sum_{ i= 1}^{ p} h_i^2(x) \\
 36 | \text{ subject to}\quad  &h_i(x) = 0
 37 | \end{align*}
 38 | $ (P_{\mu})$ is equivalent to $ (P)$, since the added squared terms vanish at every feasible point. Then the dual ascent becomes
 39 |  \begin{align*}
 40 | 	 \max g(\nu)\\
 41 | 	 g(\nu) &= \min_{x} \mathscr{L}_{\mu}(x,\nu) =\min_x f_0(x) + \sum_i \nu_i h_i(x) + \frac{\mu}{2} \sum_i h_i^2(x)
 42 | \end{align*}
 43 | The quadratic penalty term regularizes this objective so it won't go to negative infinity. Then the algorithm becomes
 44 | \begin{align*}
 45 | 	\text{ primal update: }& x_k \in \argmin_{x} \mathscr{L}_{\mu} (x, \nu_k) \\
 46 | 	\text{ dual update: }& (\nu_{k+1})_i = (\nu_k)_i + \mu h_i(x_k) && \text{ gradient ascent} 
 47 | \end{align*}
 48 | In practice we often provide $ \mu$ heuristically.
 49 | 
 50 | Notice that this is adding a regularization term to the gradient ascent, or we can understand it as adding a linear term and thus not requiring $ \mu$ to go to infinity as the penalty method, which fixes the ill-conditioned problem of penalty method. The convergence result is however fairly weak.
 51 | 
 52 | What if we add inequality constraints? We have some tricks and we only mention one here:
 53 | \begin{enumerate}[label=(\arabic*)]
 54 | 	\item slack variables: $ f_i(x) \leq 0 \iff f_i(x) + s_i=0 , s_i\geq 0$ and we keep the simpler inequality constraint implicit in the domain (hard constraints). Then we hope the solver can solve it.
 55 | \end{enumerate}
 56 | See the rest in Nocedal and Wright. LANCELOT is a software that uses this.
 57 | 
 58 | \subsubsection{SQP}
 59 | Sequential quadratic programming, \emph{i.e.} fancy Newton. Packages that use this include SNOPT, KNITRO, LANCELOT, TRON.
 60 | 
 61 | Again let's first consider only equality constraints:
 62 | \begin{align*}
 63 | \min\quad &f_0(x) \\
 64 | 			 &H(x) = 0
 65 | \end{align*}
 66 | The Lagrangian is
 67 | \begin{align*}
 68 | 	\mathscr{L}(x,\nu) = f_0(x) + \nu^{T} H(x)
 69 | \end{align*}
 70 | Let's list the KKT conditions:
 71 | \begin{enumerate}[label=(\arabic*)]
 72 | 	\item stationarity: 
 73 | 		\begin{align*}
 74 | 			0 = \nabla _x \mathscr{L} &= \nabla f_0(x) + \mathbf{J}(x)^{T} \nu
 75 | 		\end{align*}
 76 | 		where $ \mathbf{J}(x)$ is the Jacobian of  $ H(x)$.
 77 | 	\item primal feasibility: $ H(x) = 0$.
 78 | \end{enumerate}
 79 | There are no complementary slackness or dual feasibility conditions since $ \lambda$ doesn't exist.
 80 | 
 81 | So we have two vector equations and we can just stack them together as one vector equation $ F(x,\nu) = 0$ and treat it as a root-finding problem using Newton's method.
 82 | 
 83 | \begin{align*}
 84 | 	\begin{pmatrix} x_{k+1}\\ \nu_{k+1} \end{pmatrix} = \begin{pmatrix} x_k\\ \nu_k \end{pmatrix} - \left( F'(x_k,\nu_k) \right) ^{-1} F(x_k,\nu_k)
 85 | \end{align*}
 86 | where
 87 | \begin{align*}
 88 | 	F'(x,\nu) = \begin{pmatrix} \nabla _{x x}^2 \mathscr{L}& \mathbf{J}(x)^{T}\\ \mathbf{J}(x)&0 \end{pmatrix} 
 89 | \end{align*}
 90 | Let $ p = x_{k+1} - x_k$, then the problem becomes
 91 | \begin{align*}
 92 | 	(QP) \qquad \qquad 	\min \quad & f_k + \langle \nabla f_k, p \rangle + \frac{1}{2} \langle p|\nabla _{x x}^2 \mathscr{L}|p\rangle\\
 93 | 	\text{ subject to} \quad & \mathbf{J}(x_k) p + H(x_k) = 0 
 94 | \end{align*}
 95 | 
 96 | This is a primal-dual Newton method. We can also add in inequality constraints and solve it via Quadratic Programming. We can add in tricks:
 97 | \begin{enumerate}[label=(\arabic*)]
 98 | 	\item active- sets
 99 | 	\item line search or trust region
100 | \end{enumerate}
101 | 
102 | \end{document}
103 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_33.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | 
 6 | \section{Bonus: find gradients}
 7 | 
 8 | \begin{enumerate}[label=(\arabic*)]
 9 | 	\item don't: DFO (derivative-free optimization). 	
10 | 	\item finite differences
11 | 	\item analytic
12 | 	\item automatic:
13 | 		\begin{enumerate}[label=(\alph*)]
14 | 			\item automatic-differentiation (AD)
15 | 			\item adjoint-state method
16 | 		\end{enumerate}	 
17 | \end{enumerate}
18 | 
19 | \subsection{DFO}
20 | 
21 | In small dimensions, the gradient doesn't give much more information than multiple function evaluations. In large dimensions DFO fails.
22 | 
23 | Let $ x \in \rr^{n}$. DFO gets $ \{f(x_i)\}_{i=1}^k$ and gradient method gets $ \{f(x_i), \nabla f(x_i)\}_{i=1}^k $. 
24 | 
25 | \begin{eg}
26 | \begin{align*}
27 | 	\min f(x) = \frac{1}{2} x^{T} H x + q^{T} x
28 | \end{align*}
29 | Consider the analogy of a game of guessing numbers but with an adversarial kid judging if your guesses are right or wrong. DFO only has $ k$-degrees of freedom, but the function has  $ n^2$-degree of freedom. So we need $ k =n^2$ to get it right. Although in low-dimensions we can use a grid-search method (just like the number guessing game).
30 | 
31 | \end{eg}
32 | The usual setup of DFO: $ \nabla f$ exists but hard to find or $ \nabla f$ doesn't exist. This often happens when $ f$ is very noisy.
33 | 
34 | The algorithms are:
35 |  \begin{enumerate}[label=(\arabic*)]
36 | 	 \item model-based: \emph{e.g.} DFO-TR (trust-region) with $ \dim \approx 200$. We interpolate a polynomial in a trust region iteratively. We usually do linear or quadratic polynomials in high-dimensions because we need too many degrees of freedom in cubics or more.
37 | 	 \item coordinate-descent, ``pattern search''.
38 | 	 \item Nelder-Mead simplex-reflection (unrelated to the simplex method)
39 | 	 \item implicit filtering: gradient descent with large-stepsize finite-diff.
40 | 	 \item Bayesian optimization: use Gaussian processes. 
41 | 	 \item Evolutionary search: pick a random direction.
42 | 	 \item particle swarm: meta-heuristics. They are good for finding global minimum but have poor accuracy.
43 | \end{enumerate}
44 | 
45 | \subsection{Finite differences}
46 | \begin{eg}
47 | 
48 | \begin{align*}
49 | 	f(x) &= \cos x\\
50 | 	f'(x) &= -\sin x\\
51 | 	      &= \lim_{ h \to 0} \frac{\cos(x+h)- \cos(x)}{ h}&& \text{ forward-diff } \mathcal{ O}(h)\\
52 | 	      &= \lim_{ h \to 0} \frac{\cos(x+h)-\cos(x-h)}{2h } &&\text{ central-diff } \mathcal{ O}(h^2)
53 | \end{align*}
54 | 
55 | \end{eg}
56 | In $ \rr^{n}$,
57 | \begin{align*}
58 | 	(\nabla f(x))_i &\approx \frac{f(x+he_i)-f(x)}{ h} && n+1 \text{ function eval} \\
59 | 			&\approx \frac{f(x+he_i)-f(x-he_i)}{ 2h} && 2n \text{ function eval}
60 | \end{align*}
61 | This costs $ \mathcal{ O}(n)$ function evaluations per gradient, which becomes expensive as $ n$ grows.
62 | 
63 | Although often the error isn't a big deal, this is a bad idea if $ f$ is noisy. This is great for prototyping. 
64 | 
65 | Is the expense necessary?
66 | \begin{eg}
67 | \begin{align*}
68 | 	f(x) &= \frac{1}{2} \norm{ Ax-b}^2, A \text{ square } && \mathcal{ O}(n^2)\\  
69 | 	\nabla f(x) &= A^{T}(Ax-b) && \mathcal{ O}(n^2)
70 | \end{align*}
71 | But finite-difference here is $ \mathcal{ O}(n^3)$. So it's possible to beat finite-difference.
72 | \end{eg}
73 | 
74 | \subsection{analytic solutions}
75 | Chain rule for vector fields:
76 | 
77 | Let $ h = g \circ f$, where  $ f: \rr^{n} \to \rr^{m}, g: \rr^{m} \to \rr^{p}$, so $ h: \rr^{n} \to \rr^{p}$.
78 | 
79 | \begin{align*}
80 | 	\left[ Jf(x) \right]_{ij} &= \frac{\partial f_i(x)}{ \partial x_j} \\
81 | 	Jh(x) &= Jg(f(x)) Jf(x)\\
82 | \end{align*}
83 | \begin{eg}
84 | If $ p=1$, then
85 |  \begin{align*}
86 | 	 \nabla h(x) = (Jh(x))^{T} = (Jf(x))^{T} \nabla g(f(x))
87 | \end{align*}
88 | \end{eg}
89 | \begin{eg}
90 | \begin{align*}
91 | 	f(x) &= Ax+b, p =1&& Jf(x) = A\\
92 | 	\nabla h(x) &= A^{T} \nabla g(Ax+b)
93 | \end{align*}
94 | \end{eg}
95 | 
96 | \subsection{Automatic differentiation}
97 | See hand-written notes. I'm too lazy to write up this one.
98 | \end{document}
99 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_34.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsection{Automatic Differentiation}
 6 | \begin{eg}
 7 | \begin{align*}
 8 | 	g(x) &= \prod_{i=1}^m (x-t_i)\\
 9 | 	g'(x) &= \sum_{ i= 1}^{ m} \prod_{j\neq i}(x-t_j) && \text{ symbolic} \\
10 | 	g'(x) &= \sum_{ i= 1}^{ m} \frac{g(x)}{x-t_i } \\
11 | \end{align*}
12 | 
13 | In AD:
14 | \begin{align*}
15 | 	f^{(k)} &= (x-t_k) f^{(k-1)} \\
16 | 	r^{(k)} &= (x-t_k) r^{(k+1)} \\
17 | 	g'(x) &= \sum_{ i= 1}^{ m} f^{(i-1)} \cdot r^{(i+1)}, \quad f^{(0)} = r^{(m+1)} = 1
18 | \end{align*}
19 | \end{eg}
20 | 
21 | \subsubsection{Modes}
22 | Let $ f: \rr^{n} \to \rr^{p}$. Then
23 | 
24 | Forward: $ O(n)$ so it's great if  $ f:\rr \to \rr^{p}$.
25 | 
26 | Reverse: $ O(p)$ so great if  $ f: \rr^{n} \to \rr$. This is usually the setting in optimization.
27 | 
28 | \begin{eg}
29 | 	$ f: \rr^{2} \to \rr$, $ f(x_1, x_2) = x_1 x_2 + \sin(x_1)$
30 | \end{eg}
31 | \end{document}
32 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_35.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \subsection{by hand}
  6 | \subsubsection{implicit differentiation}
  7 | 
  8 | $ F(x,y) = 0$ often implicitly defines a function  $ y = f(x)$. Find  $ dy /dx$.
  9 | 
 10 |  $ F = 0 \implies dF / dx =  d0 / dx =0$. Thus,
 11 |  \begin{align*}
 12 |  	0=\frac{d F}{d x} &= \frac{\partial F}{\partial x} \frac{d x}{d x} + \frac{\partial F}{\partial y} \frac{d y}{d x}  \\
 13 | 	0&= \frac{\partial F}{\partial x} + \frac{\partial F}{\partial y} \frac{dy}{dx} \\
 14 | 	\frac{d y}{d x} &= - \frac{ \frac{\partial F}{\partial x} }{ \frac{\partial F}{\partial y} }
 15 |  \end{align*}
 16 | 
 17 | \subsubsection{matrix variables}
 18 | See BV04, or A Matrix Handbook for Statisticians Seber 08, or Matrix Cookbook, for examples.
 19 | \begin{eg}
 20 | \begin{align*}
 21 | 	\nabla (\log \det (X)) = X^{-1}, X \succ 0
 22 | \end{align*}
 23 | \end{eg}
 24 | \subsubsection{parametric functions}
 25 | See notes for details.
 26 | \begin{align*}
 27 | 	f(x) = \max_{z \in Z} \phi(x,z)
 28 | \end{align*}
 29 | \begin{thm}[Danskin]
 30 | 	Suppose $ Z$ compact,  $ \phi$ jointly continuous and $ \phi( \cdot ,z)$ convex. Define
 31 | 	\begin{align*}
 32 | 		Z(x) = \argmax_{z \in Z} \phi(x,z).
 33 | 	\end{align*}
 34 | 	Then
 35 | 	\begin{enumerate}[label=(\arabic*)]
 36 | 		\item The directional derivative $ D_d$ satisfies
 37 | 			 \begin{align*}
 38 | 				 D_d f(x) = \max_{z \in Z(x)} D_d \phi(x,z)
 39 | 			\end{align*}
 40 | 		\item If $ \phi( \cdot ,z)$ is differentiable, $ \nabla _x \phi$ is continuous, then
 41 | 			\begin{align*}
 42 | 				\partial f(x) = \conv \{\nabla _x \phi(x,z): z \in Z(x)\} 
 43 | 			\end{align*}
 44 | 			and $ Z(x)$ a singleton  $ \implies$ $ f$ is differentiable.
 45 | 	\end{enumerate}
 46 | \end{thm}
 47 | \begin{note}
 48 | This theorem doesn't apply to the discrete case.
 49 | \end{note}
 50 | \begin{thm}[Dubovitskii and Milyutin]
 51 | 	If $ |Z|$ is finite,  $ \phi( \cdot ,z)$ is convex $ \ \forall \ z \in Z$, then
 52 | 	\begin{align*}
 53 | 		\partial f(x) = \conv \left\{ \bigcup_{ z \in Z(x)} \partial \phi(x,z) \right\} 
 54 | 	\end{align*}
 55 | \end{thm}
 56 | \begin{eg}
 57 | 	$ f(x) = |x| = \max \{x,-x\} $. Then $ \partial f(0) = \conv \{1,-1\} = [-1,1] $.
 58 | \end{eg}
 59 | 
 60 | \begin{align*}
 61 | 	f(x) = \min_{z \in Z} \phi(x,z)
 62 | \end{align*}
 63 | \begin{thm}
 64 | Under the same conditions,
 65 | \begin{align*}
 66 | 	\partial f(x) = \partial \phi(x,z) \ \forall \ z \in Z(x)
 67 | \end{align*}
 68 | \end{thm}
 69 | 
 70 | \begin{align*}
 71 | 	f(x) = \int \phi(x,z) dz
 72 | \end{align*}
 73 | \begin{thm}
 74 | Sometimes,
 75 | \begin{align*}
 76 | 	f'(x) = \int \frac{d}{dx} \phi(x,z) dz 
 77 | \end{align*}
 78 | \end{thm}
 79 | \subsection{Adjoint state method}
 80 | \begin{enumerate}[label=(\arabic*)]
 81 | 	\item implicit differentiation and careful parentheses.
 82 | 		\begin{align*}
 83 | 			\min_{p} g(u(p),p)\ s.t.\ f(u(p),p) = 0, u \in \mathcal{ H}
 84 | 		\end{align*}
 85 | 		\begin{eg}
 86 | 			$ u_{tt} = c^2(x) u_{x x}$. Let $p= c^2(x)$, \emph{e.g.} varying speed of sound. Or $ p$ could be other parameters like ICs and BCs. Applications include inverse problems such as oil detection. This is often called a ``PDE-constrained optimization problem''.
 87 | 			To solve this, we ignore the constraint first. Then we wish to compute $ dg /dp$.
 88 | 		\end{eg}
 89 | 		\begin{eg}
 90 | 		\begin{align*}
 91 | 			\min g(u,p) &\ s.t.\ A u = b, A= V \text{ diag}  (p) V^{T}, p \in \rr^{n}, u \in \rr^{m}, A \in \rr^{m \times m}\\
 92 | 			g(u,p) &= \frac{1}{2} \sum_i (u_i - y_i)^2 + \frac{1}{2} \norm{ p}^2 \\
 93 | 		\end{align*}
 94 | 		\end{eg}
 95 | 		\begin{eg}
 96 | 			Let $ \mathcal{ H} = \rr^{m}$. $ f(u,p) = A(p) \cdot  u -b(p)=0$, linear in $ u$, so $ u(p) = A(p)^{-1} b(p)$. The goal is to find
 97 | 			\begin{align*}
 98 | 				(\nabla _p g)^{T} &= g_p + g_u u_p \\
 99 | 				0=f_p &= A_p u + Au_p -b_p \\
100 | 				u_p &= A^{-1}(b_p - A_p u)\\
101 | 				(\nabla _p g)^{T} &= g_p + g_u (A^{-1}(b_p - A_p u))\\
102 | 						  &= g_p + (g_u A^{-1})(b_p - A_p u) \\
103 | 			\end{align*}
104 | 			Let $ \lambda^* = g_u A^{-1} \implies \lambda = A^{-*} g_u^* $, then we have the adjoint-state equation
105 | 			\begin{align*}
106 | 				A^* \lambda = g_u^* 
107 | 			\end{align*}
108 | 			Now by clever grouping we are only solving one RHS instead of $ n$ RHS.
109 | 		\end{eg}
110 | 	\item adjoints of (bounded/unbounded) linear operators.
111 | 		What about adjoints if $ \dim \mathcal{ H} = \infty$?
112 | 		\begin{eg}
113 | 			Let $ L : \mathcal{ H} \to \mathcal{ H}$, $ \mathcal{ H} = L^2[0,T]$:
114 | 		\begin{align*}
115 | 			L(u) = 3u'+4u,u(0)=0, t \in [0,T].
116 | 		\end{align*}
117 | 		Let $ f(u,p) = L(u) -h(t)$. To get the formal adjoint (the adjoint doesn't exist since $ L$ doesn't have full domain),
118 | 		 \begin{align*}
119 | 			 \int_0^T (3u'+4u) v dt &= 3\int u'vdt + 4\int uv dt\\
120 | 			 \int u(-3v'+ 4v) dt&= 3uv\big|_0^T - 3\int uv' dt + 4 \int uv dt && \text{ set }  v(T)=0
121 | 		\end{align*}
122 | 		Thus,
123 | 		\begin{align*}
124 | 			L^* (v) = -3v'+4v, v(T)=0, L^* (v) = h
125 | 		\end{align*}
126 | 		\end{eg}
127 | 		\begin{remark}
128 | 		~\begin{enumerate}[label=(\arabic*)]
129 | 			\item doesn't require linear PDE's
130 | 			\item not always a good idea: memory issues, consistency from the order of applying discretization or optimization
131 | 			\item software: dolfin-adjoint, FEniCS
132 | 		\end{enumerate}
133 | 		\end{remark}
134 | \end{enumerate}
135 | \end{document}
136 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_37.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[class=article,crop=false]{standalone} 
  2 | \input{../preamble.tex}
  3 | 
  4 | \begin{document}
  5 | \begin{remark}
  6 | 	IPM are state-of-the-art on problems (used by cvxpy) that are
  7 | \begin{enumerate}[label=(\arabic*)]
  8 | 	\item medium size or smaller (maybe 10000)
  9 | 	\item conic problems: LP, QP, SOCP, SDP:
 10 | 		\begin{align*}
 11 | 		\min\quad & \langle C,X \rangle \\
 12 | 		\text{subject to } &X \succeq 0 \\
 13 | 			  & \mathcal{ A}(X) =b
 14 | 		\end{align*}
 15 | 		and we can use $ -\log \det(X)$ to satisfy $ X \succeq 0$.
 16 | \end{enumerate}
 17 | \end{remark}
 18 | 
 19 | \subsubsection{(Block) Coordinate Descent, Alternating Minimization, Gauss-Seidel}
 20 | This method exploits certain structure of the problem. It's also a ``column-action'' method.
 21 | \begin{eg}[Gauss-Seidel]
 22 | Consider solving the least-squares problem with $ x \in \rr^{n}$ and let $ G$ be the Gram matrix. The normal equation becomes
 23 | \begin{align*}
 24 | 	Gx &= \widetilde{ b}\\
 25 | 	\begin{pmatrix} g_1 & \ldots& g_n \end{pmatrix} \begin{pmatrix} x_1\\ \vdots\\ x_n \end{pmatrix} &= \widetilde{ b} \\
 26 | 	g_i \alpha &= \widetilde{ b} - \left( \sum_{j<i} g_j x_j^{(k+1)}+ \sum_{j>i} g_j x_j^{(k)} \right) \\
 27 | 	x_i^{(k+1)} &= \alpha \\
 28 | \end{align*}
 29 | \end{eg}
 30 | \begin{remark}
 31 | 	Jacobi only uses $ x^{(k)}$ for each $ k$, allowing parallelization and randomized order.
 32 | \end{remark}
 33 | 
 34 | If we do this row-wise, it's called ART (algebraic reconstruction technique) or Kaczmarz algorithm, or POCS (projection onto convex sets).
 35 | 
 36 | Consider
 37 | \begin{align*}
 38 | 	\min \quad & f(x), x = \begin{pmatrix} x_1 \\ \vdots\\x_n \end{pmatrix}, x_i  \in C_i \text{ can be blocks} \\
 39 | 	x_i^{(k+1)} &\in \argmin_{\alpha \in C_i} f\left( x_1^{(k+1)}, \ldots, x_{i-1}^{(k+1)}, \alpha, x_{i+1}^{(k)},\ldots, x_n^{(k)} \right) \text{ or } \\
 40 | 	x_i^{(k+1)} &= x_i^{(k)} -\eta \frac{\partial f}{\partial x_i} \left( x_1^{(k+1)},\ldots,x_{i-1}^{(k+1)}, x_i^{(k)},\ldots, x_n^{(k)} \right)  
 41 | \end{align*}
 42 | The last step is that if it's too hard to find $ \argmin$, we instead just take a gradient at that step.
 43 | 
 44 | If we have two variables $ \min f(x,y)$, then
 45 | \begin{align*}
 46 | 	x^{(k+1)} &\in \argmin_{x} f(x,y^{(k)})\\
 47 | 	y^{(k+1)} & \in \argmin_{y} f(x^{(k+1)},y)
 48 | \end{align*}
 49 | We can modify it to PALM (proximal alternating linearized minimization) for non-convex problems:
 50 | \begin{align*}
 51 | 	x^{(k+1)} &\in \argmin_{x} f(x,y^{(k)}) + \frac{\mu}{2} \norm{ x - x^{(k)}}^2 \\
 52 | 	y^{(k+1)} & \in \argmin_{y} f(x^{(k+1)},y) + \frac{\mu}{2} \norm{ y-y^{(k)}}^2 
 53 | \end{align*}
 54 | 
 55 | \subsubsection{ADMM (Alternating Direction Method of Multipliers)}
 56 | See 2011 Boyd et al monograph.
 57 | 
 58 | \begin{align*}
 59 | \min\quad &f(x) \\
 60 | \text{subject to } \quad & Ax=b 
 61 | \end{align*}
 62 | Attempt 1: Let's try with dual ascent, using $ y$ as the dual variable:
 63 | \begin{align*}
 64 | 	\mathscr{L}(x,y) &= f(x) + \langle y,Ax-b \rangle\\
 65 | 	g(y) &= \inf_{x} \mathscr{L}(x,y) \\
 66 | 	x_{k+1} &\in \argmin \mathscr{L}(x,y_k)\\ 
 67 | 	y_{k+1} &= y_k + t (\underbrace{ Ax_{k+1}-b}_{\nabla g(y_k) } ) 
 68 | \end{align*}
 69 | This allows us to exploit the separable structure of the original problem if available \emph{e.g.} $ f(x) = \sum f_i(x_i)$, since we need to relax the linear constraint in the original problem, and the dual allows us to make the Lagrangian separable \emph{i.e.} $ \langle y,Ax-b \rangle \implies \langle A^* y,x \rangle - \langle y,b \rangle$. However, the downside is that it may not converge.
 70 | 
 71 | Attempt 2: Let's try the augmented Lagrangian which is equivalent to the original problem:
 72 | \begin{align*}
 73 | 	\min \quad &f(x) + \frac{\rho}{2} \norm{ Ax-b}^2 \\
 74 | 		   &Ax=b
 75 | \end{align*}
 76 | Unfortunately the Lagrangian is no longer separable due to the quadratic term:
 77 | \begin{align*}
 78 | 	\mathscr{L}(x,y) = f(x) + \langle y,Ax-b \rangle + \frac{\rho}{2} \norm{ Ax-b}^2 
 79 | \end{align*}
 80 | Would it be possible to combine the two methods?
 81 | 
 82 | Attempt 3 (ADMM): let $ F(x) = \sum_{ i= 1}^{ n} f_i(x_i)$ or $ F(v) =  f(x) + g(z)$ if $ n=2$. 
 83 | \begin{align*}
 84 | 	\min \quad & f(x) + g(z)\\
 85 | 		   & Ax+ Bz = c
 86 | \end{align*}
 87 | The algorithm is:
 88 | \begin{align*}
 89 | 	x^{(k+1)} &\in \argmin_{x} \mathscr{L}_{\rho} \left( \begin{pmatrix} x\\z^{(k)} \end{pmatrix}, y^{(k)}  \right) \\
 90 | 	z^{(k+1)} &\in \argmin_{z} \mathscr{L}_{\rho} \left( \begin{pmatrix} x^{(k+1)}\\z \end{pmatrix}, y^{(k)}  \right) \\
 91 | 	\text{ update } y^{(k+1)} &= y^{(k)} + \rho (A x^{(k+1)}+ B z^{(k+1)}-c)
 92 | \end{align*}
 93 | \begin{note}
 94 | 	If we jointly minimize the first two lines, it becomes the augmented Lagrangian method.
 95 | \end{note}
 96 | 
 97 | What if $ n>2$, \emph{i.e.} $ \min_x \sum_{ i= 1}^{ n} f_i(x)$, where $ x$ is a block vector of $ x_i$?
 98 | 
 99 | One idea is $ \min_{x_i} \sum f_i(x_i)$ s.t.\ linear constraints that enforce $ x_i = x_j$. 
100 | Naive generalization from $ n=2$ doesn't converge very well. Instead we use a consensus trick:
101 |  \begin{align*}
102 | 	 F(v) &= G(x) + H(z)
103 | \end{align*}
104 | where $ x= \begin{pmatrix} x_1\\ \vdots \\ x_n \end{pmatrix} $, $ z$ has the same size as  $ x_i$, and $ v = \begin{pmatrix} x\\z \end{pmatrix} $.
105 | \begin{align*}
106 | 	\min_{x,z} \quad & G(x) + H(z)\\
107 | 			 &\begin{pmatrix} I&&&&-I\\&I&&&-I\\&& \ddots && \vdots \\&&&I&-I \end{pmatrix} \begin{pmatrix} x\\z \end{pmatrix} =0
108 | \end{align*}
109 | This enforces $ x_i = z \implies x_i = x_j$. We see that $ A = I$ and  $ B = \begin{pmatrix} -I\\ \vdots \\ -I \end{pmatrix} $ from the $ n=2$ linear constraint. Now we see $ x_i$ is decoupled at each update step:
110 | \begin{align*}
111 | 	x_{k+1} &\in \argmin_{x} \mathscr{L}_{\rho}(x,z,y_k) && \text{ decoupled}\\
112 | 	z_{k+1} &\in \argmin_z \mathscr{L}_{\rho} (x_{k+1},z,y_k) = \frac{1}{n} \sum_{ i= 1}^{ n} x_i && \text{ consensus}\\
113 | 	\text{ update } &y_{k+1} \text{ as usual} 
114 | \end{align*}
115 | \begin{remark}
116 | 	This is a common trick in optimization. In a coupled system, we relax it to be decoupled first and let them recouple later.
117 | \end{remark}
118 | \end{document}
119 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_38.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \subsubsection{Douglas-Rachford}
 6 | It is equivalent to ADMM in certain senses. See [BC17 28.3].
 7 | 
 8 | Algorithm: $ 0< \lambda<2, \rho>0, y_0$
 9 | \begin{align*}
10 | 	x_k&= \prox_{\rho g}(y_k) \\
11 | 	z_k&= \prox_{\rho f}(2x_k -y_k) \\
12 | 	y_{k+1} &= y_k + \lambda(z_k -y_k)
13 | \end{align*}
14 | The proximity operator is equivalent to the Lagrangian in ADMM.
15 | 
16 | Notice
17 | \begin{align*}
18 | 	\min \sum_{ i= 1}^{ n} f_i(x) \iff \min \sum_{ i= 1}^{ n} f_i(x_i)\ s.t.\ x_i=x_j \ \forall \ i,j
19 | \end{align*}
20 | \begin{remark}
21 | In signal processing, we can parallelize this algorithm and only require communications among all workers in the consensus step. We can also enforce consensus in different ways and achieve consensus faster in a graph.
22 | \end{remark}
23 | \subsection{Primal Dual Methods}
24 | ADMM has some issues: if we want to $ \min_{x}\ g(x)+ \widetilde{ h}(Ax)$, $ h(x) = \widetilde{ h}(Ax)$ (DR form). Rewrite as $ \min_{x,z}\ g(x) + \widetilde{ h}(z)$, $ Ax-z=0$ (ADMM form). Finding $ \prox_{h}$ is often hard due to $ A$.  $ \prox_{\widetilde{ h}}$ easy doesn't mean $ \prox_{h}$ is easy due to $ A$.
25 | 
26 | The trick is to use ADMM with a scaled norm in the quadratic term in augmented Lagrangian. A clever choice is
27 | \begin{align*}
28 | 	\norm{ z} _{M}^2 = \langle z|M|z \rangle,\ M = \frac{1}{\sigma} I - A^{T}A,\ \sigma< \frac{1}{\norm{ A}^2 }\implies M \succ 0
29 | \end{align*}
30 | 
31 | Chambolle and Pock, primal-dual hybrid gradient, preconditioned ADMM
32 | 
33 | \subsubsection{general primal-dual method (Condat)}
34 | Suppose $ f,g,h$ convex, proper, lsc,
35 | \begin{align*}
36 | 	\min_x \ \underbrace{ f(x)}_{ \text{ smooth}, \nabla f } + \underbrace{ g(x)}_{ \text{ easy } \prox_g} + \underbrace{ h(Ax)}_{ \text{ easy } \prox_{h} }  
37 | \end{align*}
38 | \begin{lem}
39 | 	$ x = \prox_h(x) + \prox_{h^* }(x)$.
40 | \end{lem}
41 | Since $ h$ is convex,
42 | \begin{align*}
43 | 	h(w) =h^{* *} (w) = \sup_y \langle w,y \rangle - h^* (y) 
44 | \end{align*}
45 | We solve
46 | \begin{align*}
47 | 	\min_x \max_y f(x) + g(x) + \underbrace{ \langle Ax,y \rangle }_{ \text{ links primal-dual} } -h^* (y) && \text{ saddle pt problem} 
48 | \end{align*}
49 | Optimality: use Fenchel-Rockafellar.
50 | 
51 | Assume CQ hold,
52 | \begin{align*}
53 | 	0 &\in \partial (f+g+h \circ A)(x)\\
54 | 	0 &\in \nabla f(x)+ \partial g(x) + A^{T} \underbrace{ \partial h(Ax) }_{ y} && \text{ CQ} \\
55 | 	  &\begin{cases}
56 | 		0 \in \nabla f(x) + \partial g(x) + A^{T}y\\
57 | 		Ax \in \partial h^* (y)\qquad  \text{ since } y \in \partial h(Ax) \\
58 | 	\end{cases}
59 | \end{align*}
60 | Rewrite the two equations in matrix form (although they are operators)
61 | \begin{align*}
62 | 	\underbrace{ - \begin{pmatrix} \nabla f &0\\0&0 \end{pmatrix}}_{T_2} \begin{pmatrix} x\\y \end{pmatrix} \in \underbrace{ \begin{pmatrix} \partial g& A^{T}\\-A& \partial h^*  \end{pmatrix}}_{T_1} \begin{pmatrix} x\\y \end{pmatrix} 
63 | \end{align*}
64 | This yields
65 | \begin{align*}
66 | 	- T_2 z &\in T_1 z,\qquad  z = \begin{pmatrix} x\\y \end{pmatrix} \\
67 | 	z-T_2 z & \in z+T_1 z &&\text{ add }z \text{ on both sides}  \\
68 | 	(I-T_2)z  & \in (I+T_1)z
69 | \end{align*}
70 | We will solve via forward-backward (proximal descent). WLOG assume $ T_2$ is $ 1$-Lipschitz, $ (I + \partial g)^{-1}=\prox_g$ easy and $ (I + \partial h^* )^{-1}$ easy, then
71 |  \begin{align*}
72 | 	 z_{k+1} = \underbrace{ (I + T_1)^{-1} }_{ \text{ backward/implicit} } \underbrace{ (I-T_2) }_{ \text{ forward/explicit} } z_k
73 | \end{align*}
74 | Instead of adding $ z$, we do a trick and add $ Vz$:
75 | \begin{align*}
76 | 	Vz-T_2 z &\in Vz+ T_1 z
77 | \end{align*}
78 | where we choose $ V$ to be
79 | \begin{align*}
80 | 	V = \begin{pmatrix} \tau^{-1}I& -A^{T}\\-A& \sigma^{-1}I \end{pmatrix} 
81 | \end{align*}
82 | This guarantees that $ V \succ 0$ if $ \sigma \tau < \norm{ A}^{-2} $.
83 | \begin{align*}
84 | 	z_{k+1} &= (V+T_1)^{-1}(V-T_2) z_k\\
85 | 	V+T_1 &= \begin{pmatrix} \tau^{-1}I + \partial g&0\\-2A& \sigma^{-1}I + \partial h^*  \end{pmatrix} && \text{ lower triangular} 
86 | \end{align*}
87 | We can invert this using forward substitution:
88 | \begin{align*}
89 | 	\begin{pmatrix} \tau^{-1}I + \partial g&0\\-2A & \sigma^{-1}I + \partial h^*  \end{pmatrix} \begin{pmatrix} x_{k+1}\\y_{k+1} \end{pmatrix} &= \begin{pmatrix} v\\w \end{pmatrix}\\
90 | 	x_{k+1} &= (\tau^{-1} I + \partial g)^{-1}v && \text{ via } \prox_g \\
91 | 	\text{ then solve for } &y_{k+1} 
92 | \end{align*}
93 | \end{document}
94 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/lec_39.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[class=article,crop=false]{standalone} 
 2 | \input{../preamble.tex}
 3 | 
 4 | \begin{document}
 5 | \section{Linear Programs}
 6 | \subsection{Simplex Method}
 7 | \begin{defn}[face]
 8 | \begin{align*}
 9 | 	\{x: a_i^{T}x=b_i, i \in I; a_i^{T}x\leq b_i, i \in I^{c}\} 
10 | \end{align*}
11 | \end{defn}
12 | 
13 | \begin{defn}[vertex]
14 | A \allbold{vertex} of a polyhedral set $ \{x: Ax\leq b\} $ is a face that has just 1 point. 
15 | \end{defn}
16 | 
17 | There exists an optimal solution that is a vertex (optimal might not be unique \emph{e.g.} when level set is parallel to an edge).
18 | 
19 | Simplex method hops from one vertex to another. We also call vertex a basic feasible point.
20 | 
21 | The constraints are $ Ax=b, x\geq 0$, where $ A$ is an $ m\times n$ matrix, and at some points equality is achieved in the inequality constraints. Let $ B$ be the list of indices, called the basis, where $ |B| = m$; the remaining $ n-m$ (nonbasic) variables are set to zero.
22 | 
23 | We start at a basis $ B$, use duality to choose 1 index to leave. Adjust until a new index enters.
24 | 
25 | Problems:
26 |  \begin{enumerate}[label=(\arabic*)]
27 | 	\item lots of linear algebra, $ A_B^{-1}$ needs to be updated/downdated. Usually we use LU. If $ A$ is sparse, it's more complicated.
28 | 	\item there are many pivot rules
29 | 	\item finding a starting point: 2-phase approach
30 | 	\item degenerate bases and cycling
31 | 	\item presolving
32 | 	\item variants: dual simplex, self-dual
33 | \end{enumerate}
34 | 
35 | \subsubsection{complexity}
36 | In CS, complexity means: time in input. What does the input mean?
37 | \begin{enumerate}[label=(\arabic*)]
38 | 	\item real arithmetic, numbers are ``cts'', encode real numbers. A unit storage cost for a $ \rr$ number. Operations/flops have unit cost. Thus the cost depends only on the dimension.
39 | 	\item rational model, or combinatorial model: $ b_i \in \qq$. Now the cost of operations depends on the value of the integers. Larger value is more costly.
40 | \end{enumerate}
41 | 
42 | Practically simplex is great. However, Klee and Minty constructed an LP in $ \rr^{n}$ whose feasible set has $ 2^{n}$ vertices, all of which simplex visits. Thus simplex is not in $ P$ using the usual pivot rule. The open question is whether this is true for all pivot rules. If we prove the polynomial Hirsch conjecture we will know the answer.
43 | 
44 | \begin{eg}[solving linear equations]
45 | 	In real arithmetic, this is polynomial $ \mathcal{ O}(n^3)$. In rational model, it is also polynomial.
46 | 
47 | 	For LPs, in rational model, the answer is again yes. Khachiyan proposed the ellipsoid method, a generalization of the bisection method. Then Karmarkar followed with an IPM, which is practical.
48 | \end{eg}
49 | What is the answer for real arithmetic model, is it polynomial time (strongly polynomial)?
50 | Are there other algorithms to solve LPs in polynomial time?
51 | \end{document}
52 | 


--------------------------------------------------------------------------------
/TypedNotes/lecture_notes_tex/master.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[a4paper]{report}
 2 | 
 3 | \input{../preamble.tex}
 4 | 
 5 | \title{Convex Optimization by Prof. Stephen Becker}
 6 | 
 7 | \begin{document}
 8 | 	\maketitle
 9 | 	\tableofcontents
10 | 	\newpage
11 | 	%start lectures
12 | 	\input{lec_01}
13 | 	\input{lec_02}
14 | 	\input{lec_03}
15 | 	\input{lec_04}
16 | 	\input{lec_05}
17 | 	\input{lec_06}
18 | 	\input{lec_07}
19 | 	\input{lec_08}
20 | 	\input{lec_09}
21 | 	\input{lec_10}
22 | 	\input{lec_12}
23 | 	\input{lec_13}
24 | 	\input{lec_14}
25 | 	\input{lec_15}
26 | 	\input{lec_16}
27 | 	\input{lec_17}
28 | 	\input{lec_18}
29 | 	\input{lec_19}
30 | 	\input{lec_20}
31 | 	\input{lec_21}
32 | 	\input{lec_22}
33 | 	\input{lec_23}
34 | 	\input{lec_24}
35 | 	\input{lec_11}
36 | 	\input{lec_25}
37 | 	\input{lec_26}
38 | 	\input{lec_27}
39 | 	\input{lec_28}
40 | 	\input{lec_29}
41 | 	\input{lec_30}
42 | 	\input{lec_31}
43 | 	\input{lec_32}
44 | 	\input{lec_36}
45 | 	\input{lec_37}
46 | 	\input{lec_38}
47 | 	\input{lec_39}
48 | 	\input{lec_33}
49 | 	\input{lec_35}
50 | 	%end lectures
51 | \end{document}
52 | 


--------------------------------------------------------------------------------
/policies_CU.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stephenbeckr/convex-optimization-class/f81c65de2b41549f3af01e3af47a160116a21460/policies_CU.pdf


--------------------------------------------------------------------------------
/utilities/README.md:
--------------------------------------------------------------------------------
 1 | The most useful module here is `firstOrderMethods.py`
 2 | 
 3 | The main routine is this:
 4 | - `gradientDescent` is proximal gradient descent
 5 |   - You can turn on "acceleration" (Nesterov acceleration, aka FISTA)
 6 |   - You can also enable line searches instead of a constant stepsize
 7 |   - It has a few features, like it can record error information
 8 | 
 9 | There are three high-level functions:
10 | - `lassoSolver` is a wrapper to `gradientDescent` specialized for the lasso problem $\min_x .5\|\|Ax-b\|\|^2 + \tau\|\|x\|\|_1$
11 | - `createTestProblem` creates some test problems for unit tests
12 |   - it has 3 types of problems: (1) plain least-squares, (2) lasso, (3) logistic regression
13 |   - it can use `cvxpy` to compute the exact solution, and also return the objects needed to test the code in this package
14 | - you can run `runAllTestProblems` to run all the problems
15 | 
16 | And there are a few misc utility functions, such as:
17 | - `backtrackingLinesearch` which uses the Armijo conditions
18 | - `LipschitzLinesearch` suitable when the function is Lipschitz-gradient and convex
19 | - `LipschitzLinesearch_stabler` is a variant based on the ideas we did in the TFOCS paper (it's more stable numerically)
20 | - `powerMethod` for estimating the spectral norm of a matrix (useful for estimating Lipschitz constants)
21 | 
22 | TODO:
23 | - Add an option for exact linesearch for lasso (following my [tech report](https://github.com/stephenbeckr/exactLASSOlinesearch))
24 | - Add a solver for non-negative least squares
25 | - Incorporate into something like [benchOpt](https://github.com/benchopt/)
26 | 


--------------------------------------------------------------------------------
/utilities/fminunc_wrapper_simple.m:
--------------------------------------------------------------------------------
 1 | function [f,g,h] = fminunc_wrapper_simple(x,F,G, H)
 2 | % [f,g] = fminunc_wrapper_simple( x, F, G  )
 3 | % for use with Matlab's "fminunc"
 4 | % and also compatible with Mark Schmidt's minFunc package
 5 | %
 6 | % Example usage:
 7 | %   F = @(x) norm(A*x-b)^2/2  % this is our objective
 8 | %   G = @(x) A'*(A*x-b)     % this is the gradient of F(x)
 9 | %   options = optimoptions('fminunc','SpecifyObjectiveGradient',true);
10 | %   func    =  @(x)fminunc_wrapper_simple(x,F,G);
11 | %   x0      = randn(size(A,2),1);
12 | % 
13 | %   fminunc_wrapper_simple();        % zero-out history
14 | %   x = fminunc( func, x0, options ) % Matlab's solver
15 | %   hist1   = fminunc_wrapper_simple();     % record history
16 | %   x = minFunc(func, x0 )           % Mark Schmidt's minFunc solver
17 | %   hist2   = fminunc_wrapper_simple();     % record history
18 | %   xTrue   = A\b;                   % true solution known in closed-form
19 | %   fTrue   = F(xTrue);
20 | 
21 | %   semilogy( hist1 - fTrue ); hold all
22 | %   semilogy( hist2 - fTrue ); legend('fminunc','minFunc');
23 | %
24 | % [fHist] = fminunc_wrapper()
25 | %       will return the function history
26 | %       and reset the history to zero.
27 | %
28 | % ... = fminunc_wrapper( x, F, G, H )
29 | %   will also compute the Hessian H if provided and requested
30 | %
31 | % Stephen Becker, Stephen.Becker@Colorado.edu  2/17/2017
32 | 
33 | persistent fcnHist
34 | if nargin == 0
35 |    f = fcnHist;
36 |    fcnHist = [];
37 |    return;
38 | end
39 | 
40 | 
41 | f = F(x);
42 | fcnHist(end+1)  = f; % not efficient in terms of memory allocation
43 | 
44 | if nargout > 1
45 |     g = G(x);
46 | end
47 | if nargin > 3 && ~isempty(H) && nargout > 2
48 |     h = H(x);
49 | end
50 | 


--------------------------------------------------------------------------------
/utilities/secondOrderMethods.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | secondOrderMethods module
  4 |     Mostly for APPM 5630 at CU Boulder, but others may find it useful too
  5 |     The main routine is NewtonsMethod(...)
  6 |     Calls firstOrderMethods.py for linesearch and such
  7 |     Note: not very well documented, but hopefully simple enough that you can figure
  8 |         things out
  9 |         I spent about 5 minutes testing this, so it's not very robust code! use at your own risk!
 10 |     The main module depends heavily on numpy
 11 |     
 12 |     Stephen Becker, March 3 2023, stephen.becker@colorado.edu
 13 |     
 14 |     Released under the Modified BSD License:
 15 | Copyright (c) 2023, Stephen Becker. All rights reserved.
 16 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 17 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 18 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 19 | 3. Neither the name of the Stephen Becker nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL STEPHEN BECKER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
 21 | """
 22 | import numpy as np
 23 | from scipy import linalg
 24 | import firstOrderMethods
 25 | 
 26 | def NewtonsMethod(f,grad,Hess,x0,tol=1e-6,maxIters=1e2,printEvery=1,
 27 |                      errorFunction=None, saveHistory=False,stronglyConvex=True):
 28 |   """
 29 |   NewtonsMethod with either fixed stepsize or backtracking linesearch
 30 |   f         is objective function
 31 |   grad      returns gradient of objective function
 32 |   x0        is initial starting point
 33 |   tol       stopping tolerance
 34 |   maxIters  maximum number of iterations
 35 |   printEvery        prints out information every printEvery steps; set to 0 for quiet
 36 |   errorFunction     if provided, will evaluate errorFunction(x) at every iteration
 37 |   saveHistory       whether to save function and error history
 38 |   stronglyConvex            if True, then assumes Hessian matrix is positive definite
 39 | 
 40 |   Outputs:
 41 |   x         final iterate
 42 |   data      dictionary with detailed info. Keys include: 
 43 |     'steps', 'fcnHistory', 'errHistory', 'flag', 'fx'
 44 |   """
 45 |   x   = np.asarray(x0).copy()
 46 |   fx  = f(x)
 47 |   t   = 1 # initial guess for stepsize used for linesearch
 48 |   maxIters = int(maxIters)
 49 |   fcnHistory = []
 50 |   errHistory = []
 51 | 
 52 |   if stronglyConvex is True:
 53 |     HessType = 'pos'
 54 |     # For some reason, scipy.linalg.solve doesn't have a semidefinite option
 55 |   else:
 56 |     HessType = 'sym' # if complex, would need to change to 'her'
 57 | 
 58 |   ## Fancy stuff, not essential
 59 |   if printEvery == 0  or  np.isinf(printEvery):
 60 |     # Users has requested no output
 61 |     # The "pprint" function does nothing
 62 |     def pprint(*args, **kwargs):
 63 |       pass
 64 |     display = False
 65 |   else:
 66 |     display = True
 67 |     pprint = print  # pprint is now a synonym for "print" function
 68 |     if errorFunction is not None:
 69 |       pprint("Iter.  Objective Stepsize  Error")
 70 |       pprint("-----  --------- --------  -------")
 71 |     else:
 72 |       pprint("Iter.  Objective Stepsize")
 73 |       pprint("-----  --------- --------")
 74 | 
 75 |   ## Main loop
 76 |   flag = "Quitting due to reaching max iterations"
 77 |   for k in range(maxIters+1):
 78 |     ## Actual math:
 79 |     g   = grad(x)
 80 |     H   = Hess(x)
 81 |     p   = - linalg.solve(H,g,assume_a=HessType) # Newton step
 82 |     
 83 |     xNew,t,fNew, linesearchIter = firstOrderMethods.backtrackingLinesearch(f,x,p,g,1,fx)
 84 |     if t == 0:
 85 |         flag = "Quitting at iter",k,"since linesearch failed"
 86 |         pprint(flag)
 87 |         break
 88 |     
 89 |     ### Now book-keeping, etc.
 90 | 
 91 |     # Save data, record error
 92 |     if errorFunction is not None:
 93 |       err = errorFunction(xNew)
 94 |       if saveHistory:
 95 |         errHistory.append(err)
 96 |     if saveHistory:
 97 |       fcnHistory.append(fNew)
 98 |     
 99 |     if display and (not k % printEvery) :  # modulo
100 |       if errorFunction is not None:
101 |         print(f"{k:5d}  {fNew:7.2e}  {t:6.2e}  {err:.2e}")
102 |       else:
103 |         print(f"{k:5d}  {fNew:7.2e}  {t:6.2e}")
104 |     
105 |     # Check for convergence
106 |     # If we wanted to get fancier, we could have separate tolerance variables
107 |     #   for each kind of check.
108 |     if np.abs(fx-fNew) < tol:
109 |       flag = "Quitting due to stagnating objective value"
110 |       pprint(flag)
111 |       break
112 |     if np.linalg.norm(g) < tol:
113 |       flag = "Quitting due to norm of gradient being small"
114 |       pprint(flag)
115 |       break
116 |     # since xNew - x = stepsize*g, the following check is very similar
117 |     #   to the norm(g) check. The main difference is that it uses both
118 |     #   relative and absolute tolerances; another difference is that it
119 |     #   checks each entry (like l_inf norm) rather than
120 |     #   Euclidean norm.  Suggested by Cooper
121 |     if np.allclose(xNew,x,rtol=tol, atol=1e-3*tol):
122 |       flag = "Quitting due to successive iterates being close together"
123 |       pprint(flag)
124 |       break
125 |     
126 |     # Get ready for next iteration
127 |     fx = fNew
128 |     x  = xNew
129 |   
130 |   data = {'steps':k, 'fcnHistory':np.asarray(fcnHistory), 
131 |           'errHistory':np.asarray(errHistory),
132 |           'flag':flag, 'fx':fx }
133 |   return xNew, data


--------------------------------------------------------------------------------