├── .gitignore
├── LICENSE
├── README.md
├── matlab
│   ├── affine_fit.m
│   ├── empirical_risk_minimization
│   │   ├── plot_binary_classification_scores.m
│   │   └── plot_losses_theory_class.m
│   ├── intro_supervised_learning
│   │   ├── polynomial_regression.m
│   │   └── polynomial_regression_with_replications.m
│   ├── kernels
│   │   ├── interpolate_kernel.m
│   │   └── kernel_simulations_1d_single_plot.m
│   ├── least_squares
│   │   ├── OLS_polynomals_plots.m
│   │   ├── OLS_polynomals_rates.m
│   │   └── ridge_regression.m
│   ├── local_averaging
│   │   ├── all_learning_curves.m
│   │   ├── knn.m
│   │   ├── nadaraya.m
│   │   ├── regressogram.m
│   │   └── regressogram_poly.m
│   ├── mathematical_preliminaries
│   │   └── expectation_of_max.m
│   ├── model_selection
│   │   ├── model_selection.m
│   │   ├── path_lasso.m
│   │   ├── script_model_selection.m
│   │   └── script_model_selectionROT.m
│   ├── neural_networks
│   │   ├── launch_training_relu_nn.m
│   │   ├── neural_networks_1d.m
│   │   ├── neural_networks_1d_testerrors.m
│   │   └── random_features_interpolation.m
│   ├── optimization
│   │   ├── grad_descent_comparison.m
│   │   ├── hinge_sgd.m
│   │   └── logistic_sgd_saga.m
│   └── sq_dist.m
└── python
├── 1_mathematical_preliminaries.ipynb
├── 2_introduction_supervised_learning.ipynb
├── 3_least_squares.ipynb
├── 4_empirical_risk_minimization.ipynb
├── 5_optimization.ipynb
├── 6_local_averaging.ipynb
├── 7_kernels.ipynb
├── 8_model_selection.ipynb
└── 9_neural_networks.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright in this Work has been licensed exclusively to The MIT Press, http://mitpress.mit.edu, which will be releasing the final version to the public in 2023. All inquiries regarding rights should be addressed to The MIT Press, Rights and Permissions Department.
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Theory from First Principles
2 | Python & Matlab code for the figures from the book "Learning Theory from First Principles" by Francis Bach (in preparation)
3 |
4 |
5 | See available draft of the book [here](https://www.di.ens.fr/%7Efbach/ltfp_book.pdf).
6 |
7 | **Contributors** :
8 | - Matlab Figures : Francis Bach
9 | - Reproduction in Python : Maria Bastiani, Gabriel Fiastre, Shane Hoeberichts, Camille Leempoels, Berné Nortier
10 |
11 | ## Table of Contents
12 | [To Do](#to-do) | [Contribution guidelines](#contribution-guidelines) | [Python code](#python-code) | [Matlab code](#matlab-code)
13 |
14 |
15 |
16 | ## To Do
17 |
18 | **Figures still to be done in python** :
19 |
20 | - [ ] Figure 5.3
21 | - [ ] Figure 7.3
22 | - [ ] Figure 8.2
23 | - [ ] Figures 9.1, 9.2
24 |
25 |
26 |
27 | ## Contribution Guidelines
28 | You can edit the Python notebooks to reproduce missing figures (all figures are done in Matlab, but a couple of them are not yet reproduced in Python; see [To Do](#to-do)).
29 |
30 | Make sure you have a Github account and git installed on your local machine.
31 |
32 | ### 0. Contact Me
33 | [Contact me](https://www.di.ens.fr/~fbach/) beforehand so we can discuss any potential contribution, what is needed, etc.
34 |
35 | ### 1. Fork the repository & clone it
36 | Go to the repository page & click on the "Fork" button in the top-right corner of the page. This creates a personal copy of the repository under your GitHub account.
37 |
38 | After creating your personal copy, you can clone the repository :
39 | ```
40 | git clone https://github.com/your-username/Learning_Theory_from_First_Principles/
41 | cd Learning_Theory_from_First_Principles/
42 | ```
43 | Replace **your-username** with your Github username.
44 |
45 | ### 2. Make changes, commit & push
46 | When you are satisfied with your changes, you can commit and push them. Please use a descriptive commit message.
47 | ```
48 | git add . && git commit -m "your descriptive commit message" && git push
49 | ```
50 |
51 | ### 3. Create a pull request
52 | Once you are done with all your changes and would like to submit your contribution to the main repository, you can **open a Pull Request** :
53 | - Visit your fork on GitHub.
54 | - Click on the **"Compare & pull request"** button.
55 | - Provide a meaningful title and description for your pull request (e.g. "Adding missing figure fig-3-2")
56 | - Click on the "Create pull request" button.
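
Alternatively, if you have the GitHub CLI installed and authenticated (an optional shortcut, assuming `gh` is available on your machine), you can open the pull request from the command line:
```
gh pr create --title "Adding missing figure fig-3-2" --body "Reproduces the missing figure in Python"
```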
57 |
58 |
59 | ## Python Code
60 | The Python code is organized into individual notebooks for each chapter :
61 | - Chapter 1: [Mathematical preliminaries](python/1_mathematical_preliminaries.ipynb)
62 | - Chapter 2: [Introduction to supervised learning](python/2_introduction_supervised_learning.ipynb)
63 | - Chapter 3: [Linear least-squares regression](python/3_least_squares.ipynb)
64 | - Chapter 4: [Empirical risk minimization](python/4_empirical_risk_minimization.ipynb)
65 | - Chapter 5: [Optimization](python/5_optimization.ipynb)
66 | - Chapter 6: [Local averaging methods](python/6_local_averaging.ipynb)
67 | - Chapter 7: [Kernel methods](python/7_kernels.ipynb)
68 | - Chapter 8: [Sparse methods](python/8_model_selection.ipynb)
69 | - Chapter 9: [Neural networks](python/9_neural_networks.ipynb)
70 |
71 |
72 |
73 | ## Matlab code
74 | ### Generic helper functions
75 | - [affine_fit.m](matlab/affine_fit.m)
76 | - [sq_dist.m](matlab/sq_dist.m)
77 |
78 |
79 | ### Chapter 1: Mathematical preliminaries
80 |
81 | - [Figure 1.1](matlab/mathematical_preliminaries/expectation_of_max.m) (expectation of maximum of Gaussian random variables)
82 |
83 |
84 | ### Chapter 2: Introduction to supervised learning
85 |
86 | - [Figure 2.1](matlab/intro_supervised_learning/polynomial_regression.m) (polynomial regression with increasing orders - predictions)
87 | - [Figure 2.2](matlab/intro_supervised_learning/polynomial_regression_with_replications.m) (polynomial regression with increasing orders - errors)
88 |
89 |
90 | ### Chapter 3: Linear least-squares regression
91 |
92 | - [Figure 3.1](matlab/least_squares/OLS_polynomals_plots.m) (polynomial regression with varying number of observations)
93 | - [Figure 3.2](matlab/least_squares/OLS_polynomals_rates.m) (convergence rate for polynomial regression)
94 | - [Figure 3.3](matlab/least_squares/ridge_regression.m) (polynomial ridge regression)
95 |
96 |
97 | ### Chapter 4: Empirical risk minimization
98 |
99 | - [Figure 4.1](matlab/empirical_risk_minimization/plot_losses_theory_class.m) (convex surrogates)
100 | - [Figure 4.2](matlab/empirical_risk_minimization/plot_binary_classification_scores.m) (optimal score functions for Gaussian class-conditional densities)
101 |
102 |
103 | ### Chapter 5: Optimization
104 |
105 | - [Figure 5.1](matlab/optimization/grad_descent_comparison.m) (gradient descent on two least-squares problems)
106 | - [Figure 5.2](matlab/optimization/hinge_sgd.m) (comparison of step-sizes for SGD for the support vector machine)
107 | - [Figure 5.3](matlab/optimization/logistic_sgd_saga.m) (comparison of step-sizes for SGD for logistic regression)
108 |
109 |
110 | ### Chapter 6: Local averaging methods
111 |
112 | - [Figure 6.2](matlab/local_averaging/regressogram.m) (regressogram in one dimension)
113 | - [Figure 6.3](matlab/local_averaging/knn.m) (k-nearest neighbor in one dimension)
114 | - [Figure 6.4](matlab/local_averaging/nadaraya.m) (Nadaraya-Watson in one dimension)
115 | - [Figure 6.5](matlab/local_averaging/all_learning_curves.m) (learning curves for local averaging)
116 | - [Figure 6.6](matlab/local_averaging/regressogram_poly.m) (locally linear partitioning estimate)
117 |
118 |
119 | ### Chapter 7: Kernel methods
120 |
121 | - [Figure 7.2](matlab/kernels/interpolate_kernel.m) (minimum norm interpolator)
122 | - [Figure 7.3](matlab/kernels/kernel_simulations_1d_single_plot.m) (comparison of kernels)
123 |
124 |
125 | ### Chapter 8: Sparse methods
126 |
127 | - [Figure 8.1](matlab/model_selection/path_lasso.m) (regularization path)
128 | - [Figure 8.2](matlab/model_selection/model_selection.m) (comparison of estimators) + [script_model_selection.m](matlab/model_selection/script_model_selection.m) + [script_model_selectionROT.m](matlab/model_selection/script_model_selectionROT.m)
129 |
130 |
131 | ### Chapter 9: Neural networks
132 |
133 | - [Figure 9.1](matlab/neural_networks/neural_networks_1d_testerrors.m) (global convergence for different numbers of neurons) + [launch_training_relu_nn.m](matlab/neural_networks/launch_training_relu_nn.m)
134 | - [Figure 9.2](matlab/neural_networks/random_features_interpolation.m) (random features - kernels)
135 | - [Figure 9.3](matlab/neural_networks/neural_networks_1d.m) (neural networks fitting)
136 |
--------------------------------------------------------------------------------
/matlab/affine_fit.m:
--------------------------------------------------------------------------------
1 | function [a,b] = affine_fit(x,y);
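% affine_fit - least-squares fit of the affine model y ~ a*x + b, returning
% the slope a and intercept b (used e.g. to estimate convergence rates from
% log-log learning curves in kernel_simulations_1d_single_plot.m)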
2 |
3 | % making sure we have column vectors
4 | if size(x,1)==1,
5 | x = x';
6 | end
7 | if size(y,1)==1,
8 | y = y';
9 | end
10 | n = length(x);
11 | x = [ x, ones(n,1) ];
12 |
13 | M = (x'*x) \ (x'*y);
14 | a = M(1);
15 | b = M(2);
16 |
17 |
--------------------------------------------------------------------------------
/matlab/empirical_risk_minimization/plot_binary_classification_scores.m:
--------------------------------------------------------------------------------
1 | try
2 | ccc=openfig('plot_binary_classification_scores.fig');
3 | catch
4 | disp('missing figure file')
5 | end
6 |
7 |
8 | x = -5:.01:5;
9 |
10 | mu1 = 2;
11 | sigma1 = 1;
12 | pdf1 = 1/sqrt(2*pi)/sigma1 * exp( - (x-mu1).^2 / 2 / sigma1^2);
13 | mu2 = -2;
14 | sigma2 = 1;
15 | pdf2 = 1/sqrt(2*pi)/sigma2 * exp( - (x-mu2).^2 / 2 / sigma2^2);
16 |
17 | subplot(1,2,1);
18 | plot(x,pdf1,'b','linewidth',2)
19 | hold on
20 | plot(x,pdf2,'r','linewidth',2);
21 | hold off
22 | set(gca,'fontsize',20)
23 | legend('class 1','class -1');
24 | title('class conditional densities','FontWeight','normal')
25 | axis([-5 5 0 .41])
26 |
27 | subplot(1,2,2)
28 | eta = pdf1 ./ (pdf1 + pdf2);
29 | plot(x,2*eta-1,'k','linewidth',2)
30 | hold on;
31 | plot(x,sign(2*eta-1),'b','linewidth',2)
32 | plot(x,atanh(2*eta-1),'r','linewidth',2)
33 | hold off
34 | set(gca,'fontsize',20)
35 | legend('2 \eta(x) - 1','sign( 2 \eta(x) - 1 )','atanh( 2 \eta(x) - 1 )','location','NorthWest');
36 | title('optimal scores','FontWeight','normal')
37 | axis([-5 5 -3 3])
38 |
39 |
40 | try
41 | print('-depsc', 'plot_binary_classification_scores.eps');
42 | close(ccc)
43 | catch
44 | disp('missing figure file')
45 | end
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/matlab/empirical_risk_minimization/plot_losses_theory_class.m:
--------------------------------------------------------------------------------
1 | try
2 | ccc=openfig('losses_class.fig');
3 | catch
4 | disp('missing figure file')
5 | end
6 |
7 |
8 | u=-3:.01:4;
9 | set(gca, 'fontsize',20);
10 | plot(u,-.5*sign(u)+.5,'b','linewidth',2); hold on
11 | plot(u,max(1-u,u*0),'r','linewidth',2);
12 | plot(u,(u-1).^2,'k','linewidth',2);
13 | plot(u,log(1+exp(-u)),'g','linewidth',2);
14 |
15 | hold off
16 | axis([-3 4 0 4 ])
17 | legend(' 0-1 : 1_{u \leq 0}',' hinge : max(1-u,0)',' square : (1-u)^2',' logistic : log(1 +e^{-u})')
18 | set(gca,'fontsize',32)
19 |
20 | try
21 | print('-depsc', 'losses_class.eps');
22 | close(ccc)
23 | catch
24 | disp('missing figure file')
25 | end
26 |
--------------------------------------------------------------------------------
/matlab/intro_supervised_learning/polynomial_regression.m:
--------------------------------------------------------------------------------
1 | addpath ..
2 | clear all
3 | seed=1;
4 | randn('state',seed);
5 | rand('state',seed);
6 |
7 | try
8 | ccc=openfig('polynomial_regression.fig');
9 | catch
10 | disp('missing figure file')
11 | end
12 | Xgrid = (-1:.001:1)';
13 | n = 20;
14 | std_noise = .25;
15 | Xsample = rand(n,1)*2-1;
16 | Ygrid = Xgrid.^2 - .5;
17 | Ygrid_with_noise = Xgrid.^2 - .5 + randn(length(Ygrid),1) * std_noise;
18 | Ysample = Xsample.^2 - .5 + randn(n,1) * std_noise;
19 |
20 |
21 | kmax = 14;
22 | Xdata = ones(n,1);
23 | Xgriddata = ones(length(Xgrid),1);
24 |
25 | for i=0:kmax
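% ordinary least squares on polynomial features of degree up to i,
% solved through the normal equations (Xdata'*Xdata) w = Xdata'*Ysample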
26 | wdata = (Xdata' * Xdata) \ ( Xdata' * Ysample);
27 | Ygrid_prediction = Xgriddata * wdata;
28 | subplot(3,5,i+1)
29 | plot(Xgrid,Ygrid_prediction,'r','linewidth',2); hold on;
30 | plot(Xgrid,Ygrid,'b','linewidth',2);
31 | plot(Xsample,Ysample,'kx','markersize',10);
32 | hold off;
33 | axis([-1 1 -1 1.1])
34 | title(sprintf('train = %1.2f, test = %1.2f',mean((Xdata*wdata-Ysample).^2),std_noise^2+mean((Ygrid_prediction-Ygrid).^2)),'FontWeight','normal');
35 | Xdata = [ Xdata, Xsample.^(i+1) ];
36 | Xgriddata = [ Xgriddata, Xgrid.^(i+1) ];
37 | set(gca,'fontsize',16)
38 | legend(sprintf('k = %d',i))
39 | end
40 |
41 | try
42 | print('-depsc', 'polynomial_regression.eps');
43 | close(ccc)
44 | catch
45 | disp('missing figure file')
46 | end
47 |
--------------------------------------------------------------------------------
/matlab/intro_supervised_learning/polynomial_regression_with_replications.m:
--------------------------------------------------------------------------------
1 | addpath ..
2 | clear all
3 | seed=1;
4 | randn('state',seed);
5 | rand('state',seed);
6 |
7 | try
8 | ccc=openfig('polynomial_regression_with_replications.fig');
9 | catch
10 | disp('missing figure file')
11 | end
12 |
13 | Xgrid = (-1:.001:1)';
14 | n = 20;
15 | std_noise = .25;
16 |
17 | nrep = 32;
18 |
19 | for irep = 1:nrep
20 | irep
21 | Xsample = rand(n,1)*2-1;
22 | Ygrid = Xgrid.^2 - .5;
23 | Ygrid_with_noise = Xgrid.^2 - .5 + randn(length(Ygrid),1) * std_noise;
24 | Ysample = Xsample.^2 - .5 + randn(n,1) * std_noise;
25 |
26 |
27 | kmax = 7;
28 | Xdata = ones(n,1);
29 | Xgriddata = ones(length(Xgrid),1);
30 |
31 | for i=0:kmax
32 | wdata = (Xdata' * Xdata + n * eye(size(Xdata,2)) * 1e-12) \ ( Xdata' * Ysample);
33 | Ygrid_prediction = Xgriddata * wdata;
34 | training_errors(irep,i+1) = mean((Xdata*wdata-Ysample).^2);
35 | testing_errors(irep,i+1) = std_noise^2+mean((Ygrid_prediction-Ygrid).^2);
36 | Xdata = [ Xdata, Xsample.^(i+1) ];
37 | Xgriddata = [ Xgriddata, Xgrid.^(i+1) ];
38 |
39 | end
40 | end
41 | plot(0:kmax,mean(testing_errors),'-rx','linewidth',2); hold on;
42 | plot(0:kmax,mean(training_errors),'-bx','linewidth',2); hold off;
43 |
44 | errorbar(0:kmax,mean(testing_errors),std(testing_errors),'-rx','linewidth',2); hold on;
45 | errorbar(0:kmax,mean(training_errors),std(training_errors),'-bx','linewidth',2);
46 | plot(0:kmax,std_noise^2 * ones(1,kmax+1),'k--','linewidth',1); hold off;
47 |
48 |
49 | axis([ 0 kmax 0 1])
50 | set(gca,'fontsize',20);
51 | xlabel('polynomial order');
52 | ylabel('errors');
53 | legend('testing','training','Bayes error');
54 |
55 | try
56 | print('-depsc', 'polynomial_regression_with_replications.eps');
57 | close(ccc)
58 | catch
59 | disp('missing figure file')
60 | end
61 |
62 |
--------------------------------------------------------------------------------
/matlab/kernels/interpolate_kernel.m:
--------------------------------------------------------------------------------
1 | addpath ..
2 | clear all
3 | seed=0;
4 | randn('state',seed);
5 | rand('state',seed);
6 | try
7 | ccc=openfig('interpolate_kernel.fig');
8 | catch
9 | disp('missing figure file')
10 | end
11 |
12 |
13 | n = 10;
14 | x = rand(n,1)*2-1;
15 | y = abs(x) -1 + randn(n,1)*.1;
16 | y = randn(n,1);
17 | xtest = 2*(0:.001:1)'-1;
18 |
19 |
20 | % gaussian kernel
21 | alphak = 6;
22 |
23 | K = exp( -sq_dist(x',x') * alphak );
24 | Ktest = exp( -sq_dist(xtest',x') * alphak );
25 |
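% "ridgeless" kernel regression: alpha = K \ y yields the minimum-RKHS-norm
% interpolator f(x) = sum_i alpha_i k(x, x_i) of the data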
26 | alpha = K \ y;
27 | yest1 = Ktest*alpha;
28 |
29 | plot(xtest,yest1,'r','linewidth',2); hold on;
30 | plot(x,y,'kx','markersize',8); hold off;
31 |
32 |
33 | % exponential kernel
34 | alphak = 2;
35 | K = exp( -sqrt(sq_dist(x',x')) * alphak );
36 | Ktest = exp( -sqrt(sq_dist(xtest',x')) * alphak );
37 |
38 | alpha = K \ y;
39 | yest2 = Ktest*alpha;
40 |
41 | plot(xtest,yest1,'r','linewidth',2); hold on;
42 | plot(xtest,yest2,'b','linewidth',2); hold on;
43 | plot(x,y,'kx','markersize',8); hold off;
44 |
45 |
46 | % exponential kernel - 2
47 | alphak = 2;
48 | temp = sqrt(sq_dist(x',x'));
49 | K = (1 + sqrt(sq_dist(x',x')) * alphak ).* exp( -sqrt(sq_dist(x',x')) * alphak );
50 | Ktest = (1 +sqrt(sq_dist(xtest',x')) * alphak ).* exp( -sqrt(sq_dist(xtest',x')) * alphak );
51 |
52 | alpha = K \ y;
53 | yest3 = Ktest*alpha;
54 |
55 | plot(xtest,yest1,'r','linewidth',2); hold on;
56 | plot(xtest,yest2,'b','linewidth',2); hold on;
57 | plot(xtest,yest3,'k','linewidth',2); hold on;
58 | plot(x,y,'kx','markersize',16); hold off;
59 |
60 |
61 | % exponential kernel - 3
62 | alphak = 2;
63 | temp = sqrt(sq_dist(x',x'));
64 | K = (1 + sqrt(sq_dist(x',x')) * alphak + sq_dist(x',x') * alphak^2 / 3 ).* exp( -sqrt(sq_dist(x',x')) * alphak );
65 | Ktest = (1 + sqrt(sq_dist(xtest',x')) * alphak + sq_dist(xtest',x') * alphak^2 / 3 ).* exp( -sqrt(sq_dist(xtest',x')) * alphak );
66 |
67 | alpha = K \ y;
68 | yest4 = Ktest*alpha;
69 |
70 | plot(xtest,yest2,'m','linewidth',2); hold on;
71 | plot(xtest,yest3,'k','linewidth',2); hold on;
72 | plot(xtest,yest4,'r','linewidth',2); hold on;
73 | plot(xtest,yest1,'b','linewidth',2); hold on;
74 |
75 | plot(x,y,'g.','markersize',30); hold off;
76 | legend('Exponential (s=1)','Matern (s=2)','Matern (s=3)','Gaussian')
77 | set(gca,'fontsize',24)
78 | axis([-1 1 -10 10])
79 | xlabel('x');
80 | ylabel('y');
81 |
82 |
83 | try
84 | print('-depsc', 'interpolate_kernel.eps');
85 | close(ccc)
86 | catch
87 | disp('missing figure file')
88 | end
89 |
--------------------------------------------------------------------------------
/matlab/kernels/kernel_simulations_1d_single_plot.m:
--------------------------------------------------------------------------------
1 | clear all
2 | seed=1;
3 | randn('state',seed);
4 | rand('state',seed);
5 |
6 | ns = round(2.^[2:.25:10]);
7 | nrep = 20;
8 | try
9 | ccc=openfig('kernel_simulations_1d_single_plot.fig');
10 | catch
11 | disp('missing figure file')
12 | end
13 |
14 | kkk = 0;
15 | for kernel_type=[1 2 4]
16 | kkk = kkk + 1;
17 | seed=1;
18 | randn('state',seed);
19 | rand('state',seed);
20 |
21 | for idata=1:2
22 | for irep=1:nrep
23 | irep
24 | n = max(ns);
25 | ntest = max(ns)*4;
26 |
27 | Xfull = rand(n,1);
28 | Xtest = (0:(ntest-1) )'/ (ntest-1);
29 | std_noise = .2;
30 |
31 |
32 |
33 | switch idata
34 |
35 | case 1
36 | yfull = sin(4*pi*Xfull) + std_noise * randn(n,1);
37 | ytest = sin(4*pi*Xtest);
38 | case 2
39 | yfull = sign(sin(4*pi*Xfull)) + std_noise * randn(n,1);
40 | ytest = sign(sin(4*pi*Xtest));
41 | end
42 |
43 |
44 |
45 | for in=1:length(ns)
46 | n = ns(in);
47 | X = Xfull(1:n,:);
48 |
49 | y = yfull(1:n);
50 |
51 |
52 | alphak = 2 ;
53 | switch kernel_type
54 |
55 | case 1
56 | K = exp( -sqrt(sq_dist(X',X')) * alphak );
57 | Ktest = exp( -sqrt(sq_dist(Xtest',X')) * alphak );
58 |
59 | case 2
60 | temp = sqrt(sq_dist(X',X'));
61 | K = (1 + sqrt(sq_dist(X',X')) * alphak ).* exp( -sqrt(sq_dist(X',X')) * alphak );
62 | Ktest = (1 +sqrt(sq_dist(Xtest',X')) * alphak ).* exp( -sqrt(sq_dist(Xtest',X')) * alphak );
63 | case 3
64 | temp = sqrt(sq_dist(X',X'));
65 | K = (1 + sqrt(sq_dist(X',X')) * alphak + sq_dist(X',X') * alphak^2 / 3 ).* exp( -sqrt(sq_dist(X',X')) * alphak );
66 | Ktest = (1 + sqrt(sq_dist(Xtest',X')) * alphak + sq_dist(Xtest',X') * alphak^2 / 3 ).* exp( -sqrt(sq_dist(Xtest',X')) * alphak );
67 |
68 | case 4
69 | K = exp( - (sq_dist(X',X')) * alphak*3 );
70 | Ktest = exp( - (sq_dist(Xtest',X')) * alphak*3 );
71 |
72 | end
73 |
74 | [u,e] = eig(K);
75 | e = diag(e);
76 |
77 | lambdas = 10.^[2:-.25:-12];
78 |
79 | for ilambda = 1:length(lambdas)
80 | lambda = lambdas(ilambda);
81 |
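% kernel ridge regression: with the eigendecomposition K = u*diag(e)*u',
% alpha = (K + n*lambda*I)^{-1} y can be recomputed cheaply for every lambda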
82 | alpha = u * ( 1./(e + n*lambda) .* ( u' * y ) ) ;
83 | ytest_pred = Ktest * alpha;
84 | valtest(ilambda) = 1/ntest*sum( (ytest_pred - ytest).^2 );
85 | end
86 | valtests(:,in,irep,idata) = valtest;
87 | end
88 | end
89 | end
90 |
91 | for idata=1:2
92 | subplot(3,3,(kkk-1)*3+idata);
93 | n = 128;
94 |
95 |
96 | X = rand(n,1);
97 | Xtest = (0:(ntest-1) )'/ (ntest-1);
98 | std_noise = .2;
99 | switch idata
100 |
101 | case 1
102 | y = sin(4*pi*X) + std_noise * randn(n,1);
103 | ytest = sin(4*pi*Xtest);
104 | case 2
105 | y = sign(sin(4*pi*X)) + std_noise * randn(n,1);
106 | ytest = sign(sin(4*pi*Xtest));
107 | end
108 |
109 |
110 | alphak = 2 ;
111 | switch kernel_type
112 |
113 | case 1
114 | K = exp( -sqrt(sq_dist(X',X')) * alphak );
115 | Ktest = exp( -sqrt(sq_dist(Xtest',X')) * alphak );
116 |
117 | case 2
118 | temp = sqrt(sq_dist(X',X'));
119 | K = (1 + sqrt(sq_dist(X',X')) * alphak ).* exp( -sqrt(sq_dist(X',X')) * alphak );
120 | Ktest = (1 +sqrt(sq_dist(Xtest',X')) * alphak ).* exp( -sqrt(sq_dist(Xtest',X')) * alphak );
121 | case 3
122 | temp = sqrt(sq_dist(X',X'));
123 | K = (1 + sqrt(sq_dist(X',X')) * alphak + sq_dist(X',X') * alphak^2 / 3 ).* exp( -sqrt(sq_dist(X',X')) * alphak );
124 | Ktest = (1 + sqrt(sq_dist(Xtest',X')) * alphak + sq_dist(Xtest',X') * alphak^2 / 3 ).* exp( -sqrt(sq_dist(Xtest',X')) * alphak );
125 | case 4
126 | K = exp( - (sq_dist(X',X')) * alphak*3 );
127 | Ktest = exp( - (sq_dist(Xtest',X')) * alphak*3 );
128 |
129 |
130 | end
131 | [u,e] = eig(K);
132 | e = diag(e);
133 |
134 | lambdas = 10.^[2:-.25:-12];
135 |
136 | for ilambda = 1:length(lambdas)
137 | lambda = lambdas(ilambda);
138 |
139 | alpha = u * ( 1./(e + n*lambda) .* ( u' * y ) ) ;
140 | ytest_pred = Ktest * alpha;
141 | valtest(ilambda) = 1/ntest*sum( (ytest_pred - ytest).^2 );
142 | end
143 | [a,ilambda] = min(valtest);
144 | lambda = lambdas(ilambda);
145 |
146 |
147 | alpha = u * ( 1./(e + n*lambda) .* ( u' * y ) ) ;
148 | ytest_pred = Ktest * alpha;
149 | plot(Xtest,ytest,'r','linewidth',2); hold on
150 | hold on;
151 | plot(Xtest,ytest_pred,'k','linewidth',2);
152 | plot(X,y,'kx');
153 | hold off
154 | legend('target','prediction');
155 | set(gca,'fontsize',16);
156 | xlabel('x');
157 | ylabel('y');
158 | axis([0 1 -1.5 2])
159 | switch idata
160 | case 1
161 | title('Smooth target','FontWeight','normal')
162 | case 2
163 | title('Non-smooth target','FontWeight','normal')
164 | end
165 | end
166 |
167 | subplot(3,3,(kkk-1)*3+3);
168 | plot(log2(ns),log2(min(mean(valtests(:,:,:,1),3))),'b','linewidth',2); hold on
169 | plot(log2(ns),log2(min(mean(valtests(:,:,:,2),3))),'r','linewidth',2);
170 |
171 | [a,b] = affine_fit(log2(ns),log2(min(mean(valtests(:,:,:,1),3))));
172 | plot(log2(ns),a*log2(ns)+b,'b:','linewidth',2);
173 | [a,b] = affine_fit(log2(ns),log2(min(mean(valtests(:,:,:,2),3))));
174 | plot(log2(ns),a*log2(ns)+b,'r:','linewidth',2);
175 | hold off
176 |
177 |
178 | set(gca,'fontsize',16);
179 | xlabel('log_2(n)');
180 | ylabel('log_2(excess risk)');
181 | legend('smooth target','non-smooth target','location','southwest');
182 | title('Convergence rates','FontWeight','normal')
183 | axis([2 10 -11 0.5])
184 |
185 |
186 |
187 |
188 |
189 | end
190 | try
191 | print('-depsc', 'rates_1d_all_kernels.eps');
192 | catch
193 | disp('missing figure file')
194 | end
195 |
196 | close(ccc)
197 |
198 |
199 |
200 |
--------------------------------------------------------------------------------
/matlab/least_squares/OLS_polynomals_plots.m:
--------------------------------------------------------------------------------
1 | addpath ..
2 |
3 | clear all
4 | seed=1;
5 | randn('state',seed);
6 | rand('state',seed);
7 | std_noise = .25;
8 |
9 | Xgrid = (-1:.001:1)';
10 | ns = round(10.^[1:.25:3]);
11 | Ygrid = Xgrid.^2 - .5;
12 | Ygrid_with_noise = Xgrid.^2 - .5 + randn(length(Ygrid),1) * std_noise;
13 |
14 | try
15 | ccc=openfig('polynomial_regression_varying_n.fig');
16 | catch
17 | disp('missing figure file')
18 | end
19 |
20 | for in=1:length(ns)
21 | n = ns(in);
22 |
23 | Xsample = rand(n,1)*2-1;
24 | Ysample = Xsample.^2 - .5 + randn(n,1) * std_noise;
25 |
26 |
27 |
28 | kmax = 5;
29 | Xdata = ones(n,1);
30 | Xgriddata = ones(length(Xgrid),1);
31 |
32 | for i=0:kmax
33 | wdata = (Xdata' * Xdata) \ ( Xdata' * Ysample);
34 | Ygrid_prediction = Xgriddata * wdata;
35 | subplot(3,3,in)
36 | plot(Xgrid,Ygrid_prediction,'r','linewidth',3); hold on;
37 | plot(Xgrid,Ygrid,'b','linewidth',3);
38 | plot(Xsample,Ysample,'kx','markersize',10);
39 | plot(Xgrid,Ygrid_prediction,'r','linewidth',3);
40 | plot(Xgrid,Ygrid,'b','linewidth',3);
41 | hold off;
42 | axis([-1 1 -1 1.1])
43 | title(sprintf('n = %d, train = %1.2f, test = %1.2f',n,mean((Xdata*wdata-Ysample).^2),std_noise^2+mean((Ygrid_prediction-Ygrid).^2)),'FontWeight','normal');
44 | Xdata = [ Xdata, Xsample.^(i+1) ];
45 | Xgriddata = [ Xgriddata, Xgrid.^(i+1) ];
46 | end
47 | end
--------------------------------------------------------------------------------
/matlab/model_selection/path_lasso.m:
--------------------------------------------------------------------------------
44 | nactive = sum(abs(ws)>1e-4);
45 | ind = [ find(abs(nactive(2:end)-nactive(1:end-1))>0) length(lambdas) ];
46 |
47 |
48 | plot(lambdas(ind),ws(:,ind)','-x','linewidth',2)
49 | set(gca,'fontsize',20);
50 | xlabel('regularization parameter');
51 | ylabel('weights');
52 | % seed
53 | % pause
54 | % end
55 |
56 | try
57 | print('-depsc', 'path_lasso.eps');
58 | close(ccc)
59 | catch
60 | disp('missing figure file')
61 | end
62 |
63 |
64 |
--------------------------------------------------------------------------------
/matlab/model_selection/script_model_selection.m:
--------------------------------------------------------------------------------
1 | function [performance_lasso,performance_ridge,performance_OMP,performance_oracle,performance_zero] = script_model_selection(n,d,k,std_noise,seeds,dmax);
2 |
3 |
4 | nrep = length(seeds);
5 | for iseed = 1:nrep;
6 | seed = seeds(iseed);
7 | randn('state',seed);
8 | rand('state',seed);
9 |
10 | X = randn(n,dmax);
11 | X = X(:,1:d);
12 | wast = zeros(d,1);
13 | wast(1:k) = sign(randn(k,1));
14 | y = X * wast + std_noise * sqrt(k) * randn(n,1);
15 |
16 |
17 | % zero prediction
18 | w = zeros(d,1);
19 | performance_zero(iseed) = sum( 1/n * ( X * ( w - wast) ).^2 );
20 |
21 | % lasso
22 | w = zeros(d,1);
23 |
24 | lambdas = 10.^[1:-.1:-7];
25 | L = max(eig(X'*X/n));
26 |
27 | for ilambda = 1:length(lambdas)
28 | lambda = lambdas(ilambda);
29 | maxiter = 20;
30 | for iter=1:maxiter
31 | %vals(iter) = 1/n * sum( ( X*w-y).^2 ) + lambda * sum(abs(w));
32 | grad = 1/n * X' * ( X*w-y );
33 |
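% ISTA (proximal gradient): a gradient step on the smooth quadratic part,
% followed by soft-thresholding, the proximal operator of the l1 penalty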
34 | w = w - 1/L * grad;
35 | w = sign(w) .* max( abs(w) - lambda / L, 0);
36 | end
37 | ws(:,ilambda) = w;
38 | vals(ilambda) = sum( 1/n * ( X * ( w - wast) ).^2 );
39 | valests(ilambda) = sum( ( w - wast ).^2 );
40 | end
41 | performance_lasso(:,iseed) = vals;
42 |
43 |
44 | % ridge
45 | w = zeros(d,1);
46 |
47 | lambdas = 10.^[1:-.1:-7];
48 |
49 | for ilambda = 1:length(lambdas)
50 | lambda = lambdas(ilambda);
51 | w_ridge = (X'*X + n*lambda*eye(d))\(X'*y);
52 |
53 | vals(ilambda) = sum( 1/n * ( X * ( w_ridge - wast) ).^2 );
54 | end
55 | performance_ridge(:,iseed) = vals;
56 |
57 |
58 | % % omp - inefficient
59 | % I = [];
60 | % Ic = 1:d;
61 | % for i = 1:d
62 | % perfloc=[];
63 | % for j=1:length(Ic)
64 | % icand = Ic(j);
65 | % wcand = (X(:,[I icand])'*X(:,[I icand])+n*1e-14*eye(i))\(X(:,[I icand])'*y);
66 | % perfloc(j) = 1/n * sum( ( X(:,[I icand])*wcand-y).^2 );
67 | % end
68 | % [a,b] = min(perfloc);
69 | % I = [I Ic(b) ];
70 | % Ic(b) = [];
71 | % wcand = (X(:,I)'*X(:,I)+n*1e-14*eye(i))\(X(:,I)'*y);
72 | % w = zeros(d,1);
73 | % w(I) = wcand;
74 | %
75 | %
76 | % vals_OMP_inefficient(i) = sum( 1/n * ( X * ( w - wast) ).^2 );
77 | % end
78 | % performance_OMP_inefficient = min(vals_OMP)
79 |
80 |
81 | % omp - efficient
82 | I = [];
83 | Ic = 1:d;
84 | Xorth = X;
85 | yorth = y;
86 |
87 | for i = 1:d
88 | perfloc=[];
89 | for j=1:length(Ic)
90 | icand = Ic(j);
91 | wcand = (Xorth(:,icand)'*Xorth(:,icand)+n*1e-14)\(Xorth(:,icand)'*yorth);
92 | perfloc(j) = 1/n * sum( ( Xorth(:,icand)*wcand-yorth).^2 );
93 | end
94 | [a,b] = min(perfloc);
95 | I = [I Ic(b) ];
96 | inew = Ic(b);
97 | Ic(b) = [];
98 | wcand = (X(:,I)'*X(:,I)+n*1e-12*eye(i))\(X(:,I)'*y);
99 | w = zeros(d,1);
100 | w(I) = wcand;
101 | vals_OMP(i) = sum( 1/n * ( X * ( w - wast) ).^2 );
102 |
103 | % temp = ( eye(n) - X(:,I) * inv(X(:,I)'*X(:,I)+n*1e-14*eye(i)) * X(:,I)' );
104 | % Xorth = temp * y ;
105 | % yorth = temp * y;
106 |
107 | temp = ( eye(n) - Xorth(:,inew) * inv(Xorth(:,inew)'*Xorth(:,inew)+n*1e-14 ) * Xorth(:,inew)' );
108 | Xorth = temp * Xorth ;
109 | yorth = temp * yorth;
110 |
111 | end
112 | performance_OMP(:,iseed) = vals_OMP;
113 |
114 |
115 |
116 | w_ols = (X'*X+n*1e-14*eye(d))\(X'*y);
117 | performance_ols = sum( 1/n * ( X * ( w_ols - wast) ).^2 );
118 | I = 1:k;
119 | wcand = (X(:,I)'*X(:,I)+n*1e-12*eye(k))\(X(:,I)'*y);
120 | w = zeros(d,1);
121 | w(I) = wcand;
122 | performance_oracle(iseed) = sum( 1/n * ( X * ( w - wast) ).^2 );
123 |
124 | end
125 |
126 |
127 |
--------------------------------------------------------------------------------
/matlab/model_selection/script_model_selectionROT.m:
--------------------------------------------------------------------------------
1 | function [performance_lasso,performance_ridge,performance_OMP,performance_oracle,performance_zero] = script_model_selectionROT(n,d,k,std_noise,seeds,dmax);
2 |
3 |
4 | nrep = length(seeds);
5 | for iseed = 1:nrep;
6 | seed = seeds(iseed);
7 | randn('state',seed);
8 | rand('state',seed);
9 |
10 | X = randn(n,dmax);
11 | X = X(:,1:d);
12 | wast = zeros(d,1);
13 | wast(1:k) = sign(randn(k,1));
14 | y = X * wast + std_noise * sqrt(k) * randn(n,1);
15 |
16 | % random rotation
17 | [u,s,v] = svd(randn(d,d));
18 | X = X * u;
19 | wast = u' * wast;
20 |
21 | % zero prediction
22 | w = zeros(d,1);
23 | performance_zero(iseed) = sum( 1/n * ( X * ( w - wast) ).^2 );
24 |
25 |
26 | % lasso
27 | w = zeros(d,1);
28 |
29 | lambdas = 10.^[1:-.1:-7];
30 | L = max(eig(X'*X/n));
31 |
32 | for ilambda = 1:length(lambdas)
33 | lambda = lambdas(ilambda);
34 | maxiter = 20;
35 | for iter=1:maxiter
36 | %vals(iter) = 1/n * sum( ( X*w-y).^2 ) + lambda * sum(abs(w));
37 | grad = 1/n * X' * ( X*w-y );
38 |
39 | w = w - 1/L * grad;
40 | w = sign(w) .* max( abs(w) - lambda / L, 0);
41 | end
42 | ws(:,ilambda) = w;
43 | vals(ilambda) = sum( 1/n * ( X * ( w - wast) ).^2 );
44 | valests(ilambda) = sum( ( w - wast ).^2 );
45 | end
46 | performance_lasso(:,iseed) = vals;
47 |
48 |
49 | % ridge
50 | w = zeros(d,1);
51 |
52 | lambdas = 10.^[1:-.1:-7];
53 |
54 | for ilambda = 1:length(lambdas)
55 | lambda = lambdas(ilambda);
56 | w_ridge = (X'*X + n*lambda*eye(d))\(X'*y);
57 |
58 | vals(ilambda) = sum( 1/n * ( X * ( w_ridge - wast) ).^2 );
59 | end
60 | performance_ridge(:,iseed) = vals;
61 |
62 |
63 | % % omp - inefficient
64 | % I = [];
65 | % Ic = 1:d;
66 | % for i = 1:d
67 | % perfloc=[];
68 | % for j=1:length(Ic)
69 | % icand = Ic(j);
70 | % wcand = (X(:,[I icand])'*X(:,[I icand])+n*1e-14*eye(i))\(X(:,[I icand])'*y);
71 | % perfloc(j) = 1/n * sum( ( X(:,[I icand])*wcand-y).^2 );
72 | % end
73 | % [a,b] = min(perfloc);
74 | % I = [I Ic(b) ];
75 | % Ic(b) = [];
76 | % wcand = (X(:,I)'*X(:,I)+n*1e-14*eye(i))\(X(:,I)'*y);
77 | % w = zeros(d,1);
78 | % w(I) = wcand;
79 | %
80 | %
81 | % vals_OMP_inefficient(i) = sum( 1/n * ( X * ( w - wast) ).^2 );
82 | % end
83 | % performance_OMP_inefficient = min(vals_OMP)
84 |
85 |
86 | % omp - efficient
87 | I = [];
88 | Ic = 1:d;
89 | Xorth = X;
90 | yorth = y;
91 |
92 | for i = 1:d
93 | perfloc=[];
94 | for j=1:length(Ic)
95 | icand = Ic(j);
96 | wcand = (Xorth(:,icand)'*Xorth(:,icand)+n*1e-14)\(Xorth(:,icand)'*yorth);
97 | perfloc(j) = 1/n * sum( ( Xorth(:,icand)*wcand-yorth).^2 );
98 | end
99 | [a,b] = min(perfloc);
100 | I = [I Ic(b) ];
101 | inew = Ic(b);
102 | Ic(b) = [];
103 | wcand = (X(:,I)'*X(:,I)+n*1e-12*eye(i))\(X(:,I)'*y);
104 | w = zeros(d,1);
105 | w(I) = wcand;
106 | vals_OMP(i) = sum( 1/n * ( X * ( w - wast) ).^2 );
107 |
108 | % temp = ( eye(n) - X(:,I) * inv(X(:,I)'*X(:,I)+n*1e-14*eye(i)) * X(:,I)' );
109 | % Xorth = temp * y ;
110 | % yorth = temp * y;
111 |
112 | temp = ( eye(n) - Xorth(:,inew) * inv(Xorth(:,inew)'*Xorth(:,inew)+n*1e-14 ) * Xorth(:,inew)' );
113 | Xorth = temp * Xorth ;
114 | yorth = temp * yorth;
115 |
116 | end
117 | performance_OMP(:,iseed) = vals_OMP;
118 |
119 |
120 |
121 | w_ols = (X'*X+n*1e-14*eye(d))\(X'*y);
122 | performance_ols = sum( 1/n * ( X * ( w_ols - wast) ).^2 );
123 |
124 | end
125 |
126 |
127 |
--------------------------------------------------------------------------------
/matlab/neural_networks/launch_training_relu_nn.m:
--------------------------------------------------------------------------------
1 | function [w,b,eta,eta_bias,test_errors,train_errors] = launch_training_relu_nn(X,y,Xtest,ytest,m,batch_size,maxiter,gamma)
2 |
3 | [n d] = size(X);
4 | [ntest d] = size(Xtest);
5 |
6 | % random on the sphere
7 | w = randn(d,m)/sqrt(d/2);
8 | w = randn(d,m); w = w ./ repmat(sqrt(sum(w.^2,1)),d,1);
9 | b = rand(1,m)*2-1;
10 | eta = randn(1,m)/sqrt(m/2);
11 | eta_bias = 0;
12 |
13 | % training
14 | train_errors = zeros(1,floor(maxiter/100));
15 | test_errors = zeros(1,floor(maxiter/100));
16 |
17 | for iter = 1:maxiter
18 |
19 | if mod(iter,100)==1
20 | test_errors(1+(iter-1)/100) = mean( (max(Xtest*w + repmat(b,ntest,1),0) * eta' + eta_bias - ytest).^2);
21 | end
22 | ind = mod( ((iter-1)*batch_size+1:iter*batch_size) - 1, n)+1;
23 | Xbatch = X(ind,:);
24 | ybatch = y(ind);
25 | hidden = max(Xbatch*w + repmat(b,batch_size,1),0);
26 | hiddender = double((Xbatch*w + repmat(b,batch_size,1)) > 0 );
27 |
28 | ypred = hidden * eta' + eta_bias;
29 | if mod(iter,100)==1
30 | train_errors(1+(iter-1)/100) = mean( (ypred - ybatch).^2);
31 | end
32 |
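% backpropagation for the squared loss (1/2)*||ypred - ybatch||^2:
% hiddender is the ReLU derivative 1_{Xw+b > 0}, and the chain rule gives
% the gradients with respect to eta, b, and w below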
33 | gradeta = (ypred - ybatch)' * hidden;
34 | gradeta_bias = sum((ypred - ybatch)');
35 | gradb = (ypred - ybatch)' * ( hiddender .* repmat(eta, batch_size,1) ) ;
36 | gradw = Xbatch' * ( (ypred - ybatch) .* ( hiddender .* repmat(eta, batch_size,1) ) );
37 |
38 | w = w - (gamma/batch_size) * gradw;
39 | b = b - (gamma/batch_size) * gradb;
40 | eta = eta - (gamma/batch_size) * gradeta;
41 | % no constant term in the last layer (like presented)
42 | % eta_bias = eta_bias - (gamma/batch_size) * gradeta_bias;
43 |
44 |
45 | end
46 |
--------------------------------------------------------------------------------
/matlab/neural_networks/neural_networks_1d.m:
--------------------------------------------------------------------------------
1 | clear all
2 | seed=1;
3 | randn('state',seed);
4 | rand('state',seed);
5 |
6 | n = 128
7 | ntest = 1024;
8 | nrep = 1;
9 |
10 |
11 | try
12 | ccc=openfig('neural_networks_1d.fig');
13 | catch
14 | disp('missing figure file')
15 | end
16 |
17 |
18 |
19 |
20 |
21 |
22 | for idata=1:4
23 |
24 |
25 | X = rand(n,1)*2-1;
26 | Xtest = (0:(ntest-1) )'/ (ntest-1) * 2 - 1;
27 | std_noise = .2;
28 | % std_noise = 0;
29 | switch idata
30 |
31 | case 1
32 | y = sin(2*pi*X) + std_noise * randn(n,1);
33 | ytest = sin(2*pi*Xtest);
34 |
35 | case 2
36 | y = sign(sin(2*pi*X)) + std_noise * randn(n,1);
37 | ytest = sign(sin(2*pi*Xtest));
38 |
39 | case 3
40 | y = ( max(X/2,0)-.25 ) *4 + std_noise * randn(n,1);
41 | ytest = ( max(Xtest/2,0) -.25)* 4;
42 |
43 | case 4
44 | y = 4*abs( X+1-.25-floor(X+1-.25) -1/2)-1 + std_noise * randn(n,1);
45 | ytest = 4*abs( Xtest+1-.25-floor(Xtest+1-.25) -1/2)-1 ;
46 |
47 |
48 | end
49 |
50 |
51 | ms = [ 5 32 100];
52 | for im = 1:length(ms)
53 | m = ms(im);
54 |
55 | m
56 | maxiter = 400000;
57 | gamma = 0.005;
58 | batch_size = 16;
59 |
60 | % training
61 | [w,b,eta,eta_bias,test_errors,train_errors] = launch_training_relu_nn(X,y,Xtest,ytest,m,batch_size,maxiter,gamma);
62 |
63 | % testing
64 | ytest_pred = max(Xtest*w + repmat(b,ntest,1),0) * eta'+ eta_bias ;
65 |
66 | % plotting
67 |
68 | subplot(3,4,idata+(im-1)*4);
69 |
70 | plot(Xtest,ytest,'r','linewidth',2); hold on
71 | hold on;
72 | plot(Xtest,ytest_pred,'b','linewidth',2);
73 | plot(X,y,'kx');
74 | hold off
75 | legend('target','prediction','linewidth',2);
76 | set(gca,'fontsize',20);
77 | xlabel('x');
78 | ylabel('y');
79 | axis([-1 1 -1.5 2])
80 | title(sprintf('m = %d',m),'FontWeight','normal')
81 | end
82 | end
83 |
84 | try
85 | print('-depsc', 'neural_networks_1d.eps');
86 | close(ccc)
87 |
88 | catch
89 | disp('missing figure file')
90 | end
91 |
92 |
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/matlab/neural_networks/neural_networks_1d_testerrors.m:
--------------------------------------------------------------------------------
1 | clear all
2 | seed=0;
3 | randn('state',seed);
4 | rand('state',seed);
5 |
6 | maxiter = 1000000;
7 | batch_size = 10;
8 | n = maxiter* batch_size;
9 | ntest = 1024;
10 | nrep = 1;
11 |
12 | try
13 | ccc=openfig('neural_networks_1d_testerrors.fig');
14 | catch
15 | disp('missing figure file')
16 | end
17 |
18 |
19 |
20 |
21 | idata=4;
22 |
23 |
24 | X = rand(n,1)*2-1;
25 | Xtest = (0:(ntest-1) )'/ (ntest-1) * 2 - 1;
26 | std_noise = .2;
27 | std_noise = 0;
28 | switch idata
29 |
30 | case 1
31 | y = sin(2*pi*X) + std_noise * randn(n,1);
32 | ytest = sin(2*pi*Xtest);
33 |
34 | case 2
35 | y = sign(sin(2*pi*X)) + std_noise * randn(n,1);
36 | ytest = sign(sin(2*pi*Xtest));
37 |
38 | case 3
39 | y = ( max(X/2,0)-.25 ) *4 + std_noise * randn(n,1);
40 | ytest = ( max(Xtest/2,0) -.25)* 4;
41 |
42 | case 4
43 | y = 4*abs( X+1-.25-floor(X+1-.25) -1/2)-1 + std_noise * randn(n,1);
44 | ytest = 4*abs( Xtest+1-.25-floor(Xtest+1-.25) -1/2)-1 ;
45 |
46 |
47 | end
48 |
49 |
50 | ms = [ 5 20 100];
51 | for im = 1:length(ms)
52 | m = ms(im);
53 |
54 | m
55 | restarts = 20;
56 |
57 | for irestart = 1:restarts
58 | irestart
59 | maxiter = 400000;
60 | gamma = 0.005;
61 | batch_size = 16;
62 |
63 | [w,b,eta,eta_bias,test_errors,train_errors] = launch_training_relu_nn(X,y,Xtest,ytest,m,batch_size,maxiter,gamma);
64 |
65 | test_errors_restarts(irestart,:) = test_errors;
66 |
67 | ytest_pred(irestart,:) = max(Xtest*w + repmat(b,ntest,1),0) * eta'+ eta_bias ;
68 |
69 | end
70 |
71 | subplot(2,3,im);
72 |
73 | plot(Xtest,ytest,'r','linewidth',4); hold on
74 | hold on;
75 | plot(Xtest,ytest_pred,'b','linewidth',2);
76 | hold off
77 |
78 | legend('target','prediction','linewidth',2);
79 | set(gca,'fontsize',20);
80 | xlabel('x');
81 | ylabel('y');
82 | axis([0 1 -1.5 1.9])
83 | title(sprintf('prediction functions - m = %d',m),'FontWeight','normal')
84 |
85 | subplot(2,3,im+3);
86 | plot(log10(test_errors_restarts'),'b','linewidth',2)
87 | xlabel('iterations/100');
88 | ylabel('log_{10}(test error)');
89 | set(gca,'fontsize',20);
90 | axis([0 maxiter/100 -4 0])
91 | title(sprintf('test errors - m = %d',m),'FontWeight','normal')
92 |
93 | end
94 |
95 |
96 | try
97 | print('-depsc', 'neural_networks_1d_testerrors.eps');
98 | close(ccc)
99 |
100 | catch
101 | disp('missing figure file')
102 | end
103 |
104 |
105 |
--------------------------------------------------------------------------------
/matlab/neural_networks/random_features_interpolation.m:
--------------------------------------------------------------------------------
1 | addpath ..
2 | clear all
3 | seed=0;
4 | randn('state',seed);
5 | rand('state',seed);
6 | try
7 | ccc=openfig('random_features_interpolation.fig');
8 | catch
9 | disp('missing figure file')
10 | end
11 |
12 | d = 1;
13 | n = 10;
14 | x = rand(n,1)*2-1;
15 | y = abs(x) -1 + randn(n,1)*.1;
16 | y = randn(n,1);
17 | xtest = 2*(0:.001:1)'-1;
18 | ntest = length(xtest);
19 |
20 | % multivariate splines kernel
21 | alphak = 6;
22 |
23 | K = 1/6 + x*x'/2/d + sq_dist(x',x').^1.5 / 24 * gamma(2) * gamma(d/2) / gamma(d/2+3/2) / gamma(1/2);
24 | Ktest = 1/6 + xtest*x'/2/d + sq_dist(xtest',x').^1.5 / 24 * gamma(2) * gamma(d/2) / gamma(d/2+3/2) / gamma(1/2);
25 |
26 | alpha = K \ y;
27 | yest1 = Ktest*alpha;
28 |
29 |
30 | % random features
31 | subplot(1,3,1)
32 | nrep = 10;
33 | for irep = 1:nrep
34 | m = 20;
35 | W = randn(m,d); W = W ./ repmat(sqrt(sum(W.^2,2)),1,d);
36 | b = rand(m,1)*2 - 1;
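% with m random ReLU features, Phi * Phi' / m is a Monte Carlo approximation
% of the spline kernel computed in closed form above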
37 | Phi = max(x * W' + repmat(b',n,1),0);
38 | Phitest = max(xtest * W' + repmat(b',ntest,1),0);
39 | K = Phi * Phi' / m;
40 | Ktest = Phitest * Phi' / m ;
41 | alpha = (K + 1e-12*eye(n)) \ y;
42 | yests(:,irep) = Ktest*alpha;
43 |
44 | end
45 |
46 | plot(xtest,yest1,'b','linewidth',3); hold on;
47 | plot(xtest,yests,'linewidth',1); hold on;
48 | plot(x,y,'g.','markersize',30); hold off;
49 | legend('spline kernel')
50 | set(gca,'fontsize',24)
51 | axis([-1 1 -10 10])
52 | xlabel('x');
53 | ylabel('y');
54 | title('m = 20','FontWeight','normal')
55 |
56 |
57 | % random features
58 | subplot(1,3,2)
59 | nrep = 10;
60 | for irep = 1:nrep
61 | m = 100;
62 | W = randn(m,d); W = W ./ repmat(sqrt(sum(W.^2,2)),1,d);
63 | b = rand(m,1)*2 - 1;
64 | Phi = max(x * W' + repmat(b',n,1),0);
65 | Phitest = max(xtest * W' + repmat(b',ntest,1),0);
66 | K = Phi * Phi' / m;
67 | Ktest = Phitest * Phi' / m ;
68 | alpha = (K + 1e-12*eye(n)) \ y;
69 | yests(:,irep) = Ktest*alpha;
70 |
71 | end
72 |
73 | plot(xtest,yest1,'b','linewidth',3); hold on;
74 | plot(xtest,yests,'linewidth',1); hold on;
75 | plot(x,y,'g.','markersize',30); hold off;
76 | legend('spline kernel')
77 | set(gca,'fontsize',24)
78 | axis([-1 1 -10 10])
79 | xlabel('x');
80 | ylabel('y');
81 | title('m = 100','FontWeight','normal')
82 |
83 |
84 | % random features
85 | subplot(1,3,3)
86 | nrep = 10;
87 | for irep = 1:nrep
88 | m = 200;
89 | W = randn(m,d); W = W ./ repmat(sqrt(sum(W.^2,2)),1,d);
90 | b = rand(m,1)*2 - 1;
91 | Phi = max(x * W' + repmat(b',n,1),0);
92 | Phitest = max(xtest * W' + repmat(b',ntest,1),0);
93 | K = Phi * Phi' / m;
94 | Ktest = Phitest * Phi' / m ;
95 | alpha = (K + 1e-12*eye(n)) \ y;
96 | yests(:,irep) = Ktest*alpha;
97 |
98 | end
99 |
100 | plot(xtest,yest1,'b','linewidth',3); hold on;
101 | plot(xtest,yests,'linewidth',1); hold on;
102 | plot(x,y,'g.','markersize',30); hold off;
103 | legend('spline kernel')
104 | set(gca,'fontsize',24)
105 | axis([-1 1 -10 10])
106 | xlabel('x');
107 | ylabel('y');
108 | title('m = 200','FontWeight','normal')
109 |
110 |
111 | try
112 | print('-depsc', 'random_features_interpolation.eps');
113 | close(ccc)
114 | catch
115 | disp('missing figure file')
116 | end
117 |
--------------------------------------------------------------------------------
/matlab/optimization/grad_descent_comparison.m:
--------------------------------------------------------------------------------
1 | clear all
2 | seed=1;
3 | randn('state',seed);
4 | rand('state',seed);
5 |
6 | try
7 | ccc=openfig('grad_descent_comparison.fig');
8 | catch
9 | disp('missing figure file')
10 | end
11 |
12 |
13 |
14 | % fixed matrix with planted singular values
15 | d = 1000;
16 |
17 | H = randn(d,d);
18 | [u,s,v] = svd(H);
19 | H = u * diag(1./(1:d)) * u';
20 | Hsqrt = sqrtm(H);
21 | L = max(eig(H));
22 | wstar = u*randn(d,1);
23 | wstar = wstar / sqrt(wstar'*H*wstar);
24 |
25 | w = zeros(d,1);
26 | maxiter = 5000;
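% gradient descent with constant step-size 1/L on the quadratic objective
% F(w) = (1/2)*(w-wstar)'*H*(w-wstar), whose gradient is H*(w-wstar)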
27 | for iter=1:maxiter
28 | Fw(iter) = .5 * ( w - wstar)' * H * ( w - wstar);
29 | w = w - 1/L * H * ( w - wstar);
30 |
31 |
32 | end
33 |
34 |
35 | % fixed matrix with planted singular values
36 | d = 1000;
37 |
38 | H = randn(d,d);
39 | [u,s,v] = svd(H);
40 | H = u * diag(1./(1:d).^2) * u';
41 | Hsqrt = sqrtm(H);
42 | L = max(eig(H));
43 | wstar = u*randn(d,1);
44 | wstar = wstar / sqrt(wstar'*H*wstar);
45 |
46 | w = zeros(d,1);
47 | maxiter = 5000;
48 | for iter=1:maxiter
49 | Fw2(iter) = .5 * ( w - wstar)' * H * ( w - wstar);
50 | w = w - 1/L * H * ( w - wstar);
51 |
52 |
53 | end
54 |
55 |
56 |
57 | subplot(1,2,1);
58 | plot(0:maxiter-1,log10(Fw),'b-','linewidth',2); hold on
59 | plot(0:maxiter-1,log10(Fw2),'r-','linewidth',2);
60 | plot(0:maxiter-1,log10(1/8*sum(wstar.^2)./(0:maxiter-1)),'k--','linewidth',2); hold off
61 | ylabel('log_{10}[ F(\theta_t) - F(\theta_\ast) ]');
62 | title('semi-log plot','FontWeight','normal')
63 | xlabel('t')
64 | set(gca,'fontsize',20);
65 | legend('\lambda_k \sim 1/k','\lambda_k \sim 1/k^2','bound','location','southwest');
66 | axis([0 5000 -6 0])
67 | subplot(1,2,2);
68 | plot(log10(0:maxiter-1),log10(Fw),'b-','linewidth',2); hold on
69 | plot(log10(0:maxiter-1),log10(Fw2),'r-','linewidth',2);
70 | plot(log10(0:maxiter-1),log10(1/8*sum(wstar.^2)./(0:maxiter-1)),'k--','linewidth',2); hold off
71 | ylabel('log_{10}[ F(\theta_t) - F(\theta_\ast) ]');
72 | title('log-log plot','FontWeight','normal')
73 | xlabel('log_{10}(t)')
74 | set(gca,'fontsize',20);
75 | legend('\lambda_k \sim 1/k','\lambda_k \sim 1/k^2','bound','location','southwest');
76 | axis([0 log10(5000) -6 0])
77 |
78 | try
79 | print('-depsc', 'grad_descent_comparison.eps');
80 | close(ccc)
81 | catch
82 | disp('missing figure file')
83 | end
84 |
85 |
86 |
--------------------------------------------------------------------------------
/matlab/optimization/hinge_sgd.m:
--------------------------------------------------------------------------------
1 | clear all
2 | seed=1;
3 | randn('state',seed);
4 | rand('state',seed);
5 |
6 | try
7 | ccc=openfig('grad_descent_comparison.fig');
8 | catch
9 | disp('missing figure file')
10 | end
11 |
12 |
13 |
14 |
15 | % fixed matrix with planted singular values
16 | d = 40;
17 | n = 400;
18 |
19 | H = randn(d,d);
20 | [u,s,v] = svd(H);
21 | H = u * diag(1./(1:d)) * u';
22 | % H = u * diag(1./(1:d).^2) * u';
23 | Hsqrt = sqrtm(H);
24 |
25 |
26 | X = randn(n,d) * Hsqrt;
27 | w0 = randn(d,1);
28 | w0 = w0 / sqrt(w0'*H*w0);
29 | noise_std = 1;
30 | y = sign(X * w0 + noise_std * randn(n,1));
31 | R2 = max(sum(X.^2,2));
32 |
33 | w = zeros(d,1);
34 |
35 | mu = 1e-1;
36 |
37 |
38 | % SGD 1/sqrt(t) step
39 | w = zeros(d,1);
40 | wave = zeros(d,1);
41 |
42 | maxiter = 10000;
43 | for iter=1:maxiter
44 | functionval_c(iter) = mean( max( 1 - y .* ( X * w ),0)) + mu/2 * sum(w.^2);
45 | functionval_c_ave(iter) = mean( max( 1 - y .* ( X * wave ),0)) + mu/2 * sum(wave.^2);
46 | it = ceil(n*rand);
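% step along a stochastic subgradient of the regularized hinge loss at a
% single random sample, with decaying step-size gamma_t = 1/(R^2 sqrt(t))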
47 | w = w - 1/sqrt(iter)/R2 * ( mu * w + double( 1 - y(it) .* ( X(it,:) * w ) > 0 ) * ( - y(it) * X(it,:)') );
48 | wave = ( 1 - 1/iter) * wave + w/iter;
49 | end
50 |
51 |
52 | % SGD 1/mu t step
53 | w = zeros(d,1);
54 | wave = zeros(d,1);
55 |
56 | maxiter = 10000;
57 | for iter=1:maxiter
58 | functionval_sc(iter) = mean( max( 1 - y .* ( X * w ),0)) + mu/2 * sum(w.^2);
59 | functionval_sc_ave(iter) = mean( max( 1 - y .* ( X * wave ),0)) + mu/2 * sum(wave.^2);
60 | it = ceil(n*rand);
61 | w = w - 1/(mu*(iter+1)) * ( mu * w + double( 1 - y(it) .* ( X(it,:) * w ) > 0 ) * ( - y(it) * X(it,:)') );
62 | wave = ( 1 - 1/iter) * wave + w/iter;
63 | end
64 |
65 |
66 | % GD 1/mu t step
67 | w = zeros(d,1);
68 | maxiter_det = 100000;
69 | for iter=1:maxiter_det
70 | functionval_sc_det(iter) = mean( max( 1 - y .* ( X * w ),0)) + mu/2 * sum(w.^2);
71 | it = ceil(n*rand);
72 | w = w - 1/(mu*(iter+1)) * ( mu * w - 1/n * X' * ( y .* double( 1 - y .* ( X * w ) > 0 ) ) );
73 | end
74 |
75 |
76 | subplot(1,2,2);
77 | plot(log10(1:maxiter),log10(functionval_c - functionval_sc_det(end)),'r','linewidth',2); hold on
78 | plot(log10(1:maxiter),log10(functionval_sc - functionval_sc_det(end)),'b','linewidth',2); hold off
79 | xlabel('log_{10}(t)');
80 | ylabel('log_{10}[F(\theta_t) - F(\theta_\ast)]');
81 | set(gca,'fontsize',18)
82 | title('log-log plot - \mu = 0.1','FontWeight','normal')
83 | legend('\gamma_t = 1/(R^2 t^{1/2})','\gamma_t = 1/(\mu t)');
84 |
85 | subplot(1,2,1);
86 | plot( (1:maxiter), (functionval_c(1:maxiter) - functionval_sc_det(end)),'r','linewidth',2); hold on
87 | plot( (1:maxiter), (functionval_sc(1:maxiter) - functionval_sc_det(end)),'b','linewidth',2); hold off
88 | axis([1 maxiter 0 .5])
89 | xlabel('t');
90 | ylabel('F(\theta_t) - F(\theta_\ast)');
91 | set(gca,'fontsize',18)
92 | title('semi-log plot - \mu = 0.1','FontWeight','normal')
93 | legend('\gamma_t = 1/(R^2 t^{1/2})','\gamma_t = 1/(\mu t)');
94 |
95 |
96 | try
97 | print('-depsc', 'svm_largemu.eps');
98 | close(ccc)
99 | catch
100 | disp('missing figure file')
101 | end
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 | clear all
113 | seed=1;
114 | randn('state',seed);
115 | rand('state',seed);
116 |
117 |
118 | try
119 | ccc=openfig('grad_descent_comparison.fig');
120 | catch
121 | disp('missing figure file')
122 | end
123 |
124 |
125 |
126 |
127 | % fixed matrix with planted singular values
128 | d = 40;
129 | n = 400;
130 |
131 | H = randn(d,d);
132 | [u,s,v] = svd(H);
133 | H = u * diag(1./(1:d)) * u';
134 | % H = u * diag(1./(1:d).^2) * u';
135 | Hsqrt = sqrtm(H);
136 |
137 |
138 | X = randn(n,d) * Hsqrt;
139 | w0 = randn(d,1);
140 | w0 = w0 / sqrt(w0'*H*w0);
141 | noise_std = 1;
142 | y = sign(X * w0 + noise_std * randn(n,1));
143 | R2 = max(sum(X.^2,2));
144 |
145 | w = zeros(d,1);
146 |
147 | mu = 1e-3;
148 |
149 |
150 | % SGD 1/sqrt(t) step
151 | w = zeros(d,1);
152 | wave = zeros(d,1);
153 |
154 | maxiter = 10000;
155 | for iter=1:maxiter
156 | functionval_c(iter) = mean( max( 1 - y .* ( X * w ),0)) + mu/2 * sum(w.^2);
157 | functionval_c_ave(iter) = mean( max( 1 - y .* ( X * wave ),0)) + mu/2 * sum(wave.^2);
158 | it = ceil(n*rand);
159 | w = w - 1/sqrt(iter)/R2 * ( mu * w + double( 1 - y(it) .* ( X(it,:) * w ) > 0 ) * ( - y(it) * X(it,:)') );
160 | wave = ( 1 - 1/iter) * wave + w/iter;
161 | end
162 |
163 |
164 | % SGD 1/mu t step
165 | w = zeros(d,1);
166 | wave = zeros(d,1);
167 |
168 | maxiter = 10000;
169 | for iter=1:maxiter
170 | functionval_sc(iter) = mean( max( 1 - y .* ( X * w ),0)) + mu/2 * sum(w.^2);
171 | functionval_sc_ave(iter) = mean( max( 1 - y .* ( X * wave ),0)) + mu/2 * sum(wave.^2);
172 | it = ceil(n*rand);
173 | w = w - 1/(mu*(iter+1)) * ( mu * w + double( 1 - y(it) .* ( X(it,:) * w ) > 0 ) * ( - y(it) * X(it,:)') );
174 | wave = ( 1 - 1/iter) * wave + w/iter;
175 | end
176 |
177 |
178 | % GD 1/mu t step
179 | w = zeros(d,1);
180 | maxiter_det = 100000;
181 | for iter=1:maxiter_det
182 | functionval_sc_det(iter) = mean( max( 1 - y .* ( X * w ),0)) + mu/2 * sum(w.^2);
183 | it = ceil(n*rand);
184 | w = w - 1/(mu*(iter+1)) * ( mu * w - 1/n * X' * ( y .* double( 1 - y .* ( X * w ) > 0 ) ) );
185 | end
186 |
187 |
188 |
189 | subplot(1,2,2);
190 | plot(log10(1:maxiter),log10(functionval_c - functionval_sc_det(end)),'r','linewidth',2); hold on
191 | plot(log10(1:maxiter),log10(functionval_sc - functionval_sc_det(end)),'b','linewidth',2); hold off
192 | xlabel('log_{10}(t)');
193 | ylabel('log_{10}[F(\theta_t) - F(\theta_\ast)]');
194 | set(gca,'fontsize',18)
195 | title('log-log plot - \mu = 0.001','FontWeight','normal')
196 | legend('\gamma_t = 1/(R^2 t^{1/2})','\gamma_t = 1/(\mu t)');
197 |
198 | subplot(1,2,1);
199 | plot( (1:maxiter), (functionval_c(1:maxiter) - functionval_sc_det(end)),'r','linewidth',2); hold on
200 | plot( (1:maxiter), (functionval_sc(1:maxiter) - functionval_sc_det(end)),'b','linewidth',2); hold off
201 | axis([1 maxiter 0 .75])
202 | xlabel('t');
203 | ylabel('F(\theta_t) - F(\theta_\ast)');
204 | set(gca,'fontsize',18)
205 | title('semi-log plot - \mu = 0.001','FontWeight','normal')
206 | legend('\gamma_t = 1/(R^2 t^{1/2})','\gamma_t = 1/(\mu t)');
207 |
208 | try
209 | print('-depsc', 'svm_smallmu.eps');
210 | close(ccc)
211 | catch
212 | disp('missing figure file')
213 | end
214 |
215 |
216 |
217 |
218 |
--------------------------------------------------------------------------------
/matlab/optimization/logistic_sgd_saga.m:
--------------------------------------------------------------------------------
1 | clear all
2 | seed=1;
3 | randn('state',seed);
4 | rand('state',seed);
5 |
6 | try
7 | ccc=openfig('grad_descent_comparison.fig');
8 | catch
9 | disp('missing figure file')
10 | end
11 |
12 |
13 |
14 |
15 | % fixed matrix with planted singular values
16 | d = 40;
17 | n = 2000;
18 |
19 | H = randn(d,d);
20 | [u,s,v] = svd(H);
21 | H = u * diag(1./(1:d)) * u';
22 | % H = u * diag(1./(1:d).^2) * u';
23 | Hsqrt = sqrtm(H);
24 |
25 |
26 | X = randn(n,d) * Hsqrt;
27 | w0 = randn(d,1);
28 | w0 = w0 / sqrt(w0'*H*w0);
29 | noise_std = 1;
30 | y = sign(X * w0 + noise_std * randn(n,1));
31 | R2 = max(sum(X.^2,2));
32 | L = max(eig(X'*X)/n);
33 | w = zeros(d,1);
34 |
35 | mu = 1e-2;
36 | mu = R2/ n;
37 | n = n/2;
38 | Xtest = X(n+1:2*n,:);
39 | ytest = y(n+1:2*n);
40 | X = X(1:n,:);
41 | y = y(1:n);
42 |
43 |
44 |
45 | % SGD step
46 | w = zeros(d,1);
47 | wave = zeros(d,1);
48 | number_of_passes = 50;
49 | tostore = 1:n/10:number_of_passes*n;
50 | tostore_ind = zeros(1,number_of_passes*n);
51 | tostore_ind(tostore) = 1;
52 | maxiter_sto = number_of_passes*n;
53 | istore=1;
54 | for iter=1:maxiter_sto
55 | if tostore_ind(iter),
56 | functionval_sc(istore) = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
57 | functionval_sc_ave(istore) = mean( log( 1 + exp(- y .* ( X*wave ) ) ) ) + mu/2 * sum(wave.^2);
58 | functionval_sc_test(istore) = mean( log( 1 + exp(- ytest .* ( Xtest*w ) ) ) ) ;
59 | functionval_sc_ave_test(istore) = mean( log( 1 + exp(- ytest .* ( Xtest*wave ) ) ) ) ;
60 |
61 | istore = istore+1;
62 | end
63 |
64 |
65 | it = ceil(n*rand);
66 | temp = ( X(it,:) * w ) .* y(it);
67 | w = w - 1/(R2*sqrt(iter+1)+mu*(iter+1)) * ( mu * w + X(it,:)' * ( y(it) .* ( - 1./( 1 + exp(temp) ) ) ) );
68 | wave = ( 1 - 1/iter) * wave + w/iter;
69 | end
70 |
71 |
72 | % GD
73 | w = zeros(d,1);
74 | maxiter_det = number_of_passes+1;
75 | for iter=1:maxiter_det
76 | functionval_sc_det(iter) = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
77 | functionval_sc_det_test(iter) = mean( log( 1 + exp(- ytest .* ( Xtest*w ) ) ) ) ;
78 |
79 |
80 | temp = ( X * w ) .* y;
81 | grad = mu* w + 1/n * X' * ( y .* ( - 1./( 1 + exp(temp) ) ) ) ;
82 | w = w - 1/(L) * grad;
83 | end
84 |
85 |
86 | % SAGA
87 | w = zeros(d,1);
88 | tostore = 1:n/10:number_of_passes*n;
89 | tostore_ind = zeros(1,number_of_passes*n*4);
90 | tostore_ind(tostore) = 1;
91 |
92 | temp = ( X * w ) .* y;
93 | grad = mu* w + 1/n * X' * ( y .* ( - 1./( 1 + exp(temp) ) ) ) ;
94 | zs = X' .* repmat( (y .* ( - 1./( 1 + exp(temp) )))',d,1) + repmat(mu * w,1,n);
95 | meanzs = mean(zs,2);
96 | wave = zeros(d,1);
97 |
98 | maxiter_saga = number_of_passes*n*2;
99 | istore = 1;
100 | for iter=1:maxiter_saga
101 |
102 | if tostore_ind(iter),
103 | functionval_saga(istore) = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
104 | functionval_saga_test(istore) = mean( log( 1 + exp(- ytest .* ( Xtest*w ) ) ) ) ;
105 |
106 | istore = istore+1;
107 | end
108 |
109 | if iter==maxiter_saga
110 | optvalue = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
111 | end
112 |
113 |
114 |
115 |
116 |
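% SAGA update: combine the fresh gradient of one random sample with its
% stored gradient zs(:,it) and the running average meanzs, reducing the
% variance of the step while keeping it unbiased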
117 | it = ceil(n*rand);
118 | temp = ( X(it,:) * w ) .* y(it);
119 | gradloc = mu * w + X(it,:)' * ( y(it) .* ( - 1./( 1 + exp(temp) ) ) );
120 | w = w - 1/3/R2 * ( gradloc - zs(:,it) + meanzs );
121 | meanzs = meanzs + gradloc/n - zs(:,it)/n;
122 | zs(:,it) = gradloc;
123 | end
124 |
125 | subplot(1,2,2);
126 | plot( (tostore)/n, functionval_sc_test ,'b','linewidth',2); hold on
127 | % plot( (tostore)/n, functionval_sc_ave_test ,'b:','linewidth',2); hold on
128 | plot( (tostore)/n,functionval_saga_test,'r','linewidth',2); hold on
129 | plot( (0:maxiter_det-1),functionval_sc_det_test,'k','linewidth',2); hold off
130 | legend('SGD','SAGA','GD','location','northeast');
131 | xlabel('effective passes');
132 | ylabel('F_{test}(\theta_t)');
133 | set(gca,'fontsize',18)
134 | title('Expected risk - n = 1000 ','FontWeight','normal')
135 | axis([0 number_of_passes 0.4 0.7])
136 |
137 |
138 | subplot(1,2,1);
139 | plot( (tostore)/n,log10(functionval_sc - optvalue),'b','linewidth',2); hold on
140 | % plot( (tostore)/n,log10(functionval_sc_ave - optvalue),'b:','linewidth',2); hold on
141 | plot( (tostore)/n,log10(functionval_saga - optvalue),'r','linewidth',2); hold on
142 | plot( (0:maxiter_det-1),log10(functionval_sc_det - optvalue),'k','linewidth',2); hold off
143 | legend('SGD','SAGA','GD','location','southeast');
144 | xlabel('effective passes');
145 | ylabel('log_{10}[F(\theta_t) - F(\theta_\ast)]');
146 | set(gca,'fontsize',18)
147 | title('Training objective - n = 1000','FontWeight','normal')
148 | axis([0 number_of_passes -10 -0.5])
149 |
150 |
151 | try
152 | print('-depsc', 'saga_lown.eps');
153 | catch
154 | disp('missing figure file')
155 | end
156 |
157 |
158 |
159 |
160 |
161 |
162 | clear all
163 | seed=1;
164 | randn('state',seed);
165 | rand('state',seed);
166 |
167 | try
168 | ccc=openfig('grad_descent_comparison.fig');
169 | catch
170 | disp('missing figure file')
171 | end
172 |
173 |
174 |
175 | % fixed matrix with planted singular values
176 | d = 40;
177 | n = 20000;
178 |
179 | H = randn(d,d);
180 | [u,s,v] = svd(H);
181 | H = u * diag(1./(1:d)) * u';
182 | % H = u * diag(1./(1:d).^2) * u';
183 | Hsqrt = sqrtm(H);
184 |
185 |
186 | X = randn(n,d) * Hsqrt;
187 | w0 = randn(d,1);
188 | w0 = w0 / sqrt(w0'*H*w0);
189 | noise_std = 1;
190 | y = sign(X * w0 + noise_std * randn(n,1));
191 | R2 = max(sum(X.^2,2));
192 | L = max(eig(X'*X)/n);
193 | w = zeros(d,1);
194 |
195 | mu = 1e-2;
196 | mu = R2/ n;
197 | n = n/2;
198 | Xtest = X(n+1:2*n,:);
199 | ytest = y(n+1:2*n);
200 | X = X(1:n,:);
201 | y = y(1:n);
202 |
203 |
204 |
205 | % SGD step
206 | w = zeros(d,1);
207 | wave = zeros(d,1);
208 | number_of_passes = 50;
209 | tostore = 1:n/10:number_of_passes*n;
210 | tostore_ind = zeros(1,number_of_passes*n);
211 | tostore_ind(tostore) = 1;
212 | maxiter_sto = number_of_passes*n;
213 | istore=1;
214 | for iter=1:maxiter_sto
215 | if tostore_ind(iter),
216 | functionval_sc(istore) = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
217 | functionval_sc_ave(istore) = mean( log( 1 + exp(- y .* ( X*wave ) ) ) ) + mu/2 * sum(wave.^2);
218 | functionval_sc_test(istore) = mean( log( 1 + exp(- ytest .* ( Xtest*w ) ) ) ) ;
219 | functionval_sc_ave_test(istore) = mean( log( 1 + exp(- ytest .* ( Xtest*wave ) ) ) ) ;
220 |
221 | istore = istore+1;
222 | end
223 |
224 |
225 | it = ceil(n*rand);
226 | temp = ( X(it,:) * w ) .* y(it);
227 | w = w - 1/(R2*sqrt(iter+1)+mu*(iter+1)) * ( mu * w + X(it,:)' * ( y(it) .* ( - 1./( 1 + exp(temp) ) ) ) );
228 | wave = ( 1 - 1/iter) * wave + w/iter;
229 | end
230 |
231 |
232 | % GD
233 | w = zeros(d,1);
234 | maxiter_det = number_of_passes+1;
235 | for iter=1:maxiter_det
236 | functionval_sc_det(iter) = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
237 | functionval_sc_det_test(iter) = mean( log( 1 + exp(- ytest .* ( Xtest*w ) ) ) ) ;
238 |
239 |
240 | temp = ( X * w ) .* y;
241 | grad = mu* w + 1/n * X' * ( y .* ( - 1./( 1 + exp(temp) ) ) ) ;
242 | w = w - 1/(L) * grad;
243 | end
244 |
245 |
246 | % SAGA
247 | w = zeros(d,1);
248 | tostore = 1:n/10:number_of_passes*n;
249 | tostore_ind = zeros(1,number_of_passes*n*4);
250 | tostore_ind(tostore) = 1;
251 |
252 | temp = ( X * w ) .* y;
253 | grad = mu* w + 1/n * X' * ( y .* ( - 1./( 1 + exp(temp) ) ) ) ;
254 | zs = X' .* repmat( (y .* ( - 1./( 1 + exp(temp) )))',d,1) + repmat(mu * w,1,n);
255 | meanzs = mean(zs,2);
256 | wave = zeros(d,1);
257 |
258 | maxiter_saga = number_of_passes*n*2;
259 | istore = 1;
260 | for iter=1:maxiter_saga
261 |
262 | if tostore_ind(iter),
263 | functionval_saga(istore) = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
264 | functionval_saga_test(istore) = mean( log( 1 + exp(- ytest .* ( Xtest*w ) ) ) ) ;
265 |
266 | istore = istore+1;
267 | end
268 |
269 | if iter==maxiter_saga
270 | optvalue = mean( log( 1 + exp(- y .* ( X*w ) ) ) ) + mu/2 * sum(w.^2);
271 | end
272 |
273 |
274 |
275 |
276 |
277 | it = ceil(n*rand);
278 | temp = ( X(it,:) * w ) .* y(it);
279 | gradloc = mu * w + X(it,:)' * ( y(it) .* ( - 1./( 1 + exp(temp) ) ) );
280 | w = w - 1/3/R2 * ( gradloc - zs(:,it) + meanzs );
281 | meanzs = meanzs + gradloc/n - zs(:,it)/n;
282 | zs(:,it) = gradloc;
283 | end
284 |
285 | subplot(1,2,2);
286 | plot( (tostore)/n, functionval_sc_test ,'b','linewidth',2); hold on
287 | plot( (tostore)/n,functionval_saga_test,'r','linewidth',2); hold on
288 | plot( (0:maxiter_det-1),functionval_sc_det_test,'k','linewidth',2); hold off
289 | legend('SGD','SAGA','GD','location','northeast');
290 | xlabel('effective passes');
291 | ylabel('F_{test}(\theta_t)');
292 | set(gca,'fontsize',18)
293 | title('Expected risk - n = 10000 ','FontWeight','normal')
294 | axis([0 number_of_passes 0.4 0.7])
295 |
296 |
297 | subplot(1,2,1);
298 | plot( (tostore)/n,log10(functionval_sc - optvalue),'b','linewidth',2); hold on
299 | plot( (tostore)/n,log10(functionval_saga - optvalue),'r','linewidth',2); hold on
300 | plot( (0:maxiter_det-1),log10(functionval_sc_det - optvalue),'k','linewidth',2); hold off
301 | legend('SGD','SAGA','GD','location','southeast');
302 | xlabel('effective passes');
303 | ylabel('log_{10}[F(\theta_t) - F(\theta_\ast)]');
304 | set(gca,'fontsize',18)
305 | title('Training objective - n = 10000','FontWeight','normal')
306 | axis([0 number_of_passes -10 -0.5])
307 |
308 |
309 | try
310 | print('-depsc', 'saga_highn.eps');
311 | close(ccc)
312 | catch
313 | disp('missing figure file')
314 | end
315 |
316 |
317 |
--------------------------------------------------------------------------------
/matlab/sq_dist.m:
--------------------------------------------------------------------------------
1 | % sq_dist - a function to compute a matrix of all pairwise squared distances
2 | % between two sets of vectors, stored in the columns of the two matrices, a
3 | % (of size D by n) and b (of size D by m). If only a single argument is given
4 | % or the second matrix is empty, the missing matrix is taken to be identical
5 | % to the first.
6 | %
7 | % Special functionality: If an optional third matrix argument Q is given, it
8 | % must be of size n by m, and in this case a vector of the traces of the
9 | % product of Q' and the coordinatewise squared distances is returned.
10 | %
11 | % NOTE: The program code is written in the C language for efficiency and is
12 | % contained in the file sq_dist.c, and should be compiled using MATLAB's mex
13 | % facility. However, this file also contains a (less efficient) MATLAB
14 | % implementation, supplied only as a help to people unfamiliar with mex. If
15 | % the C code has been properly compiled and is available, it automatically
16 | % takes precedence over the MATLAB code in this file.
17 | %
18 | % Usage: C = sq_dist(a, b)
19 | % or: C = sq_dist(a) or equiv.: C = sq_dist(a, [])
20 | % or: c = sq_dist(a, b, Q)
21 | % where the b matrix may be empty.
22 | %
23 | % where a is of size D by n, b is of size D by m (or empty), C and Q are of
24 | % size n by m and c is of size D by 1.
25 | %
26 | % Copyright (c) 2003, 2004, 2005 and 2006 Carl Edward Rasmussen. 2006-03-09.
27 |
28 | function C = sq_dist(a, b, Q);
29 |
30 | if nargin < 1 | nargin > 3 | nargout > 1
31 | error('Wrong number of arguments.');
32 | end
33 |
34 | if nargin == 1 | isempty(b) % input arguments are taken to be
35 | b = a; % identical if b is missing or empty
36 | end
37 |
38 | [D, n] = size(a);
39 | [d, m] = size(b);
40 | if d ~= D
41 | error('Error: column lengths must agree.');
42 | end
43 |
44 | if nargin < 3
45 | C = zeros(n,m);
46 | for d = 1:D
47 | C = C + (repmat(b(d,:), n, 1) - repmat(a(d,:)', 1, m)).^2;
48 | end
49 | % C = repmat(sum(a.*a)',1,m)+repmat(sum(b.*b),n,1)-2*a'*b could be used to
50 | % replace the 3 lines above; it would be faster, but numerically less stable.
51 | else
52 | if [n m] == size(Q)
53 | C = zeros(D,1);
54 | for d = 1:D
55 | C(d) = sum(sum((repmat(b(d,:), n, 1) - repmat(a(d,:)', 1, m)).^2.*Q));
56 | end
57 | else
58 | error('Third argument has wrong size.');
59 | end
60 | end
61 |
--------------------------------------------------------------------------------
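Example usage (a minimal sketch mirroring matlab/kernels/interpolate_kernel.m): sq_dist expects one point per column, so a column vector x of scalar inputs is transposed before building the n-by-n Gaussian kernel matrix.

    x = rand(10,1)*2-1;                    % n = 10 inputs in [-1,1], one per row
    alphak = 6;                            % inverse squared bandwidth
    K = exp( -sq_dist(x',x') * alphak );   % n-by-n Gaussian kernel matrix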