├── Matlab ├── Data │ ├── BikeSharingDeepGLM.mat │ ├── DataSimulationBinary.mat │ ├── DirectMarketing.mat │ ├── SchoolingData.mat │ └── abalone.mat ├── DeepGLM │ ├── nnfun │ │ ├── nnActivation.m │ │ ├── nnActivationGrad.m │ │ ├── nnBackPropagation.m │ │ ├── nnFeedForward.m │ │ ├── nnGradLogLikelihood.m │ │ ├── nnInitialize.m │ │ └── nnSumResidualSquare.m │ ├── plotfun │ │ ├── deepGLMplot.m │ │ ├── plotInterval.m │ │ ├── plotMSE.m │ │ ├── plotPPS.m │ │ ├── plotROC.m │ │ └── plotShrinkage.m │ ├── stafun │ │ ├── gen_Sobol.m │ │ ├── normrnd_qmc.m │ │ └── rqmc_rnd.m │ ├── train │ │ ├── deepGLMTrain.m │ │ ├── deepGLMTrainTest.m │ │ ├── deepGLMfit.m │ │ ├── deepGLMlogitPoisson.m │ │ ├── deepGLMnormalCV.m │ │ ├── deepGLMpoisson.m │ │ ├── deepGLMpredict.m │ │ └── deepGLMpredictLoss.m │ ├── utils │ │ ├── checkInput.m │ │ ├── deepGLMmsg.m │ │ ├── deepGLMout.m │ │ ├── isBinomial.m │ │ ├── predictionInterval.m │ │ ├── splitData.m │ │ └── sumResidualSquared.m │ └── vbfun │ │ ├── vbGradientLogLB.m │ │ ├── vbGradientLogq.m │ │ ├── vbLowerBound.m │ │ └── vbNaturalGradient.m ├── Document │ ├── deepGLM.pdf │ ├── deepGLMNormalExample.pdf │ ├── ~WRL1562.tmp │ └── ~WRL3227.tmp └── Examples │ ├── deepGLMBinomialExample.mlx │ ├── deepGLMBinomialExampleScript.m │ ├── deepGLMNormalExample.mlx │ ├── deepGLMNormalExampleScript.m │ └── deepGLMPoissonExampleScript.m ├── Python ├── .ipynb_checkpoints │ └── Example notebook-checkpoint.ipynb ├── DirectMarketing.mat ├── Example notebook.ipynb ├── __pycache__ │ └── deepGLM.cpython-37.pyc ├── deepGLM.pdf └── deepGLM.py ├── R ├── 01_data │ └── abalone.csv ├── 02_libs │ ├── checkInput.R │ ├── deepGLMTrain.R │ ├── deepGLMfit.R │ ├── deepGLMpredict.R │ ├── nnActivation.R │ ├── nnActivationGrad.R │ ├── nnBackPropagation.R │ ├── nnFeedForward.R │ ├── nnGradLogLikelihood.R │ ├── nnInitialize.R │ ├── nnSumResidualSquare.R │ ├── predictionInterval.R │ ├── vbGradientLogLB.R │ ├── vbGradientLogq.R │ ├── vbLowerBound.R │ └── vbNaturalGradient.R ├── deepGLMNormalExample.R └── dependencies.R ├── README.html └── README.md /Matlab/Data/BikeSharingDeepGLM.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/BikeSharingDeepGLM.mat -------------------------------------------------------------------------------- /Matlab/Data/DataSimulationBinary.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/DataSimulationBinary.mat -------------------------------------------------------------------------------- /Matlab/Data/DirectMarketing.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/DirectMarketing.mat -------------------------------------------------------------------------------- /Matlab/Data/SchoolingData.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/SchoolingData.mat -------------------------------------------------------------------------------- /Matlab/Data/abalone.mat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/abalone.mat -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnActivation.m: -------------------------------------------------------------------------------- 1 | function out = nnActivation(z,func) 2 | %NNACTIVATION Calculate activation output at nodes in each forward pass 3 | 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | if nargin < 2 13 | error(deepGLMmsg('deepglm:MustSpecifyActivationFunction')); 14 | end 15 | 16 | switch func 17 | case 'Linear' 18 | out = z; 19 | case 'Sigmoid' 20 | out = 1.0 ./ (1.0 + exp(-z)); 21 | case 'Tanh' 22 | out = tanh(z); 23 | case 'ReLU' 24 | out = max(0,z); 25 | case 'LeakyReLU' 26 | out = max(0,z)+ alpha*min(0,z); 27 | end 28 | end 29 | 30 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnActivationGrad.m: -------------------------------------------------------------------------------- 1 | function out = nnActivationGrad(z,func) 2 | %NNACTIVATIONGRAD Calculate derivative of activation output at hidden nodes 3 | %in each backward pass 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | 14 | switch func 15 | case 'Linear' 16 | out = ones(size(z)); 17 | case 'Sigmoid' 18 | temp = activation(z,text); 19 | out = temp.*(1-temp); 20 | case 'Tanh' 21 | temp = activation(z,text); 22 | out = 1 - temp^2; 23 | case 'ReLU' 24 | out = z>0; 25 | case 'LeakyReLU' 26 | if z > 0 27 | out = 1; 28 | else 29 | out = alpha; 30 | end 31 | end 32 | 33 | end 34 | 35 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnBackPropagation.m: -------------------------------------------------------------------------------- 1 | function [gradient,nnOut] = nnBackPropagation(X,y,W_seq,beta,distr) 2 | %NNBACKPROPAGATION Compute gradient of weights in a neural net using 3 | % backpropagation algorithm 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | n_train = size(X,1); 14 | L = length(W_seq); 15 | a_seq = cell(1,L); 16 | Z_seq = cell(1,L); 17 | 18 | a_seq{1} = W_seq{1}*X'; 19 | Z_seq{1} = [ones(1,n_train);nnActivation(a_seq{1},'ReLU')]; 20 | for j=2:L 21 | a_seq{j} = W_seq{j}*Z_seq{j-1}; 22 | Z_seq{j} = [ones(1,n_train);nnActivation(a_seq{j},'ReLU')]; 23 | end 24 | delta_seq = cell(1,L+1); 25 | 26 | % Calculate error at the output layers according to distribution family of 27 | % response 28 | nnOut = beta'*Z_seq{L}; 29 | switch distr 30 | case 'normal' 31 | delta_seq{L+1} = y' - nnOut; 32 | case 'binomial' 33 | p_i = 1./(1+exp(-nnOut)); 34 | delta_seq{L+1} = y' - p_i; 35 | case 'poisson' 36 | delta_seq{L+1} = y' - exp(nnOut); 37 | end 38 | delta_seq{L} = (beta(2:end)*delta_seq{L+1}).*nnActivationGrad(a_seq{L},'ReLU'); 39 | for j=L-1:-1:1 40 | Wj_tilde = W_seq{j+1}; 41 | Wj_tilde = Wj_tilde(:,2:end); 42 | delta_seq{j} = (nnActivationGrad(a_seq{j},'ReLU')).*(Wj_tilde'*delta_seq{j+1}); 43 | end 44 | gradient_W1 = 
delta_seq{1}*X; 45 | gradient = gradient_W1(:); 46 | for j = 2:L 47 | gradient_Wj = delta_seq{j}*(Z_seq{j-1})'; 48 | gradient = [gradient;gradient_Wj(:)]; 49 | end 50 | gradient = [gradient;Z_seq{L}*delta_seq{L+1}']; 51 | end 52 | 53 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnFeedForward.m: -------------------------------------------------------------------------------- 1 | function nnOutput = nnFeedForward(X,W_seq,beta) 2 | %NNFEEDFORWARD Compute the output of a neural net 3 | 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Number of observations in dataset 13 | n_train = size(X,1); 14 | 15 | % Make forward passes to all layers 16 | a = W_seq{1}*X'; 17 | Z = [ones(1,n_train);nnActivation(a,'ReLU')]; 18 | L = length(W_seq); 19 | for j=2:L 20 | a = W_seq{j}*Z; 21 | Z = [ones(1,n_train);nnActivation(a,'ReLU')]; % Add biases 22 | end 23 | % a = W_seq{L}*Z; 24 | % Z = [ones(1,n_train);nnActivation(a,'ReLU')]; 25 | nnOutput = Z'*beta; 26 | end 27 | 28 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnGradLogLikelihood.m: -------------------------------------------------------------------------------- 1 | function [gradient,nnOut] = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse) 2 | %NNGRADIENTLLH Calculate gradient of log likelihood 3 | % Detailed explanation goes here 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | n = length(y); 14 | [back_prop,nnOut] = nnBackPropagation(X,y,W_seq,beta,distr); 15 | nnOut = nnOut'; 16 | switch distr 17 | case 'normal' 18 | gradient_theta = mean_sigma2_inverse*back_prop; 19 | gradient = datasize/n*gradient_theta; % To compensate the variation 20 | case 'binomial' 21 | gradient = datasize/n*back_prop; 22 | case 'poisson' 23 | gradient = datasize/n*back_prop; 24 | end 25 | end 26 | 27 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnInitialize.m: -------------------------------------------------------------------------------- 1 | function weights = nnInitialize(layers) 2 | %NNINITIALIZE Summary of this function goes here 3 | % layers: vector of doubles, each number specifing the amount of 4 | % nodes in a layer of the network. 5 | % 6 | % weights: cell array of weight matrices specifing the 7 | % translation from one layer of the network to the next. 
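%
% Illustrative example (added for clarity; not part of the original help, and
% the layer sizes are hypothetical):
%   w = nnInitialize([3 5 5 1]);
% returns a 1x3 cell with w{1} of size 5x3, w{2} of size 5x6 and w{3} of
% size 1x6 (hidden layers get one extra input column for their bias; the
% input layer does not, since X is assumed to already contain a column of
% ones). Entries are drawn uniformly from [-b,b] with b = sqrt(6)/(n_in+n_out).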
8 | % 9 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 10 | % Nguyen (nghia.nguyen@sydney.edu.au) 11 | % 12 | % http://www.xxx.com 13 | % 14 | % Version: 1.0 15 | % LAST UPDATE: April, 2018 16 | 17 | weights = cell(1, length(layers)-1); 18 | 19 | for i = 1:length(layers)-1 20 | % Using random weights from -b to b 21 | b = sqrt(6)/(layers(i)+layers(i+1)); 22 | if i==1 23 | weights{i} = rand(layers(i+1),layers(i))*2*b - b; % Input layer already have bias 24 | else 25 | weights{i} = rand(layers(i+1),layers(i)+1)*2*b - b; % 1 bias in input layer 26 | end 27 | end 28 | 29 | end 30 | 31 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnSumResidualSquare.m: -------------------------------------------------------------------------------- 1 | function out = nnSumResidualSquare(y,X,W_seq,beta) 2 | %NNSUMRESIDUALSQUARE Calculate sum of square of residuals 3 | % 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au). 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | nnet_output = nnFeedForward(X,W_seq,beta); 14 | out = sum((y-nnet_output).^2); 15 | end 16 | 17 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/deepGLMplot.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/DeepGLM/plotfun/deepGLMplot.m -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotInterval.m: -------------------------------------------------------------------------------- 1 | function plotInterval(predMean,predInterval,opt,varargin) 2 | %PLOTINTERVAL Plot prediction interval for test data 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | if (nargin<2) 13 | disp('ERROR: not enough input arguments!'); 14 | return; 15 | end 16 | 17 | textTitle = opt.title; 18 | labelX = opt.labelX; 19 | labelY = opt.labelY; 20 | linewidth = opt.linewidth; 21 | 22 | % Define some default texts 23 | if(isempty(textTitle)) 24 | textTitle = 'Prediction Interval on Test Data'; 25 | end 26 | if(isempty(labelX)) 27 | labelX = 'Observation'; 28 | end 29 | 30 | % Parse additional options 31 | paramNames = {'Color' 'Style' 'ytrue'}; 32 | paramDflts = {'red' 'shade' []}; 33 | [color,style,ytrue] = internal.stats.parseArgs(paramNames,... 
34 | paramDflts, varargin{:}); 35 | 36 | lower = predInterval(:,1); 37 | upper = predInterval(:,2); 38 | t = 1:1:length(predMean); 39 | switch style 40 | case 'shade' % Plot prediction interval in shade style 41 | p = plot(t,predMean,t,upper,t,lower); 42 | YLIM = get(gca,'YLim'); 43 | delete(p); 44 | a1 = area(t,upper,min(YLIM)); 45 | hold on; 46 | set(a1,'LineStyle','none'); 47 | set(a1,'FaceColor',[0.9 0.9 0.9]); 48 | a2 = area(t,lower,min(YLIM)); 49 | set(a2,'LineStyle','none'); 50 | set(a2,'FaceColor',[1 1 1]); 51 | p2 = scatter(t,predMean,40,'MarkerEdgeColor',[1 0 0]); 52 | if(~isempty(ytrue)) 53 | p1 = scatter(t,ytrue,40,'MarkerEdgeColor',[0 0 1]); 54 | legend([p1,p2],{'True values','Prediction values'}); 55 | end 56 | title(textTitle, 'FontSize',18) 57 | xlabel(labelX) 58 | ylabel(labelY) 59 | hold off; 60 | set(gca,'Layer','top','XGrid','on','YGrid','on'); 61 | case 'boundary' % Plot prediction interval in boundary style 62 | plot(t,predMean,'LineWidth',linewidth,'Color',color); 63 | hold on 64 | plot(t,upper,'--r',t,lower,'--r'); 65 | grid on 66 | title('Prediction Interval on Test Data', 'FontSize',18) 67 | xlabel('Observation') 68 | hold off 69 | case 'bar' % Plot prediction interval in bar style 70 | err = (upper-lower)/2; 71 | errorbar(predMean,err); 72 | grid on 73 | hold on 74 | plot(predMean,'Color','red','LineWidth',2); 75 | title('Prediction Interval on Test Data', 'FontSize',18) 76 | xlabel('Observation') 77 | hold off 78 | end 79 | end 80 | 81 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotMSE.m: -------------------------------------------------------------------------------- 1 | function [outputArg1,outputArg2] = plotMSE(inputArg1,inputArg2) 2 | %PLOTMSE Summary of this function goes here 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | outputArg1 = inputArg1; 13 | outputArg2 = inputArg2; 14 | end 15 | 16 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotPPS.m: -------------------------------------------------------------------------------- 1 | function plotPPS(loss,data) 2 | %PLOTPPS Plot prediction loss 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | plot(loss); 13 | grid on; 14 | title(['Prediction Loss on ',data,' set']); 15 | xlabel('Iterations'); 16 | ylabel('PPS') 17 | end 18 | 19 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotROC.m: -------------------------------------------------------------------------------- 1 | function plotROC(y_true,y_pred) 2 | %PLOTROC Plot ROC curve and AUC 3 | 4 | if nargin<2 5 | disp('Too few input arguments'); 6 | return 7 | end 8 | 9 | if(size(y_true)~=size(y_pred)) 10 | disp('Target and output must have same size') 11 | return 12 | elseif(size(y_true,1)~=1) 13 | disp('Target and output must be row vectors with same length') 14 | return 15 | else 16 | plotroc(y_true,y_pred) 17 | grid on 18 | end 19 | 20 | end 21 | 22 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotShrinkage.m: 
-------------------------------------------------------------------------------- 1 | function plotShrinkage(ShrinkageCoef,opt) 2 | %PLOTSHRINKAGE Plot shrinkage coefficient of Group Lasso regularization 3 | % 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | % Do not plot intercept coefficient 14 | % ShrinkageCoef = ShrinkageCoef(2:end,:); 15 | 16 | TextTitle = opt.title; 17 | labelX = opt.labelX; 18 | labelY = opt.labelY; 19 | linewidth = opt.linewidth; 20 | color = opt.color; 21 | 22 | numCoeff = size(ShrinkageCoef,1); % Number of shrinkage coefficients 23 | fontsize = 13; 24 | 25 | % Define default settings 26 | if(isempty(TextTitle)) 27 | TextTitle = 'Shrinakge Coefficients'; 28 | end 29 | if(isempty(labelX)) 30 | labelX = 'Iteration'; 31 | end 32 | 33 | % Plot 34 | plot(ShrinkageCoef','LineWidth',linewidth); 35 | grid on 36 | title(TextTitle,'FontSize', 20) 37 | xlabel(labelX,'FontSize', 15) 38 | ylabel(labelY,'FontSize', 15) 39 | Ytext = ShrinkageCoef(:,end); % Y coordination of text, different for coefficients 40 | Xtext = size(ShrinkageCoef,2); % X coordination of text, same for all coefficients 41 | for i=1:numCoeff 42 | text(Xtext,Ytext(i),['\gamma_{',num2str(i),'}'],'fontsize',fontsize) 43 | end 44 | end 45 | 46 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/stafun/gen_Sobol.m: -------------------------------------------------------------------------------- 1 | %genertate Sobol Sequence 2 | function [X1]=gen_Sobol(m,s) 3 | N = pow2(m); % Number of points; 4 | cmax = 52; % number of digits of generated points 5 | 6 | 7 | N = pow2(m); % Number of points; 8 | P = sobolset(s); % Get Sobol sequence; 9 | P = scramble(P,'MatousekAffineOwen'); % Scramble Sobol points; 10 | X1 = net(P,N); 11 | 12 | X1=X1'; 13 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/stafun/normrnd_qmc.m: -------------------------------------------------------------------------------- 1 | function x = normrnd_qmc(S,d) 2 | % generate Sxd matrix of standard normal numbers by RQMC 3 | rqmc = rqmc_rnd(S,d); 4 | rqmc = rqmc(1:S,:); 5 | x = norminv(rqmc); 6 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/stafun/rqmc_rnd.m: -------------------------------------------------------------------------------- 1 | function f = rqmc_rnd(S,d) 2 | % generate a matrix of RQMC of size S times d 3 | max_sobol = 1111; 4 | r = floor(d/max_sobol); 5 | s = d-r*max_sobol; 6 | if r>=1 7 | f = gen_Sobol(ceil(log2(S)),max_sobol)'; 8 | for i = 2:r 9 | f = [f,gen_Sobol(ceil(log2(S)),max_sobol)']; 10 | end 11 | f = [f,gen_Sobol(ceil(log2(S)),s)']; 12 | else 13 | f = gen_Sobol(ceil(log2(S)),d)'; 14 | end 15 | 16 | end 17 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMTrain.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMTrain(X_train,y_train,est) 2 | % Traing a deepGLM model with continuous reponse y. 3 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 4 | % regularization is put on the rest. sigma2 and tau are updated by 5 | % mean-field VB. 
Inverse gamma prior is used for sigma2 6 | % INPUT 7 | % X_train, y_train: Training data (continuous response) 8 | % X_validation, y_validation: Validation data 9 | % n_units: Vector specifying the numbers of units in 10 | % each layer 11 | % batchsize: Mini-batch size used in each iteration 12 | % eps0: Constant learning rate 13 | % isotropic: True if isotropic structure on Sigma is 14 | % used, otherwise rank-1 structure is used 15 | % OUTPUT 16 | % W_seq: The optimal weights upto the last hidden 17 | % layer 18 | % beta The optimal weights that connect last hidden layer to the output 19 | % mean_sigma2 Estimate of sigma2 20 | % shrinkage_gamma_seq Update of shrinkage parameters over 21 | % iteration 22 | % MSE_DL Mean squared error over iteration 23 | % 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | % Extract training data and settings from input struct 34 | X_val = est.data.Xval; 35 | y_val = est.data.yval; 36 | n_units = est.network; 37 | batchsize = est.batchsize; 38 | lrate = est.lrate; 39 | isotropic = est.isIsotropic; 40 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 41 | tau = est.tau; % Threshold before reducing constant learning rate eps0 42 | grad_weight = est.momentum; % Weight in the momentum 43 | cScale = est.c; % Random scale factor to initialize b,c 44 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 45 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 46 | verbose = est.verbose; 47 | distr = est.dist; 48 | lbFlag = est.lowerbound; % Lowerbound flag 49 | LBwindow = est.windowSize; 50 | seed = est.seed; 51 | 52 | if(~isnan(seed)) 53 | rng(seed) 54 | end 55 | 56 | % Data merge for mini-batch sampling 57 | data = [y_train,X_train]; 58 | datasize = length(y_train); 59 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 60 | 61 | % Network parameters 62 | L = length(n_units); % Number of hidden layers 63 | p = size(X_train,2)-1; % Number of covariates 64 | W_seq = cell(1,L); % Cells to store weight matrices 65 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
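% Worked example (added for clarity; not in the original code, and the sizes
% are hypothetical). With p = 4 covariates and n_units = [10 10]:
%   index_track(1) = 10*(4+1)       = 50    (elements of W1; its first column holds the biases)
%   index_track(2) = 50 + 10*(10+1) = 160   (elements of W1 and W2 together)
% so d_w = 160, d_beta = 10+1 = 11 and d_theta = 171, and the variational
% parameter vector lambda = [mu;b;c] built below has 3*d_theta = 513 entries
% in the non-isotropic case.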
66 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 67 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 68 | w_tilde_index = []; % indices of non-biase weights, excluding W1, for l2-regulization prior 69 | for j = 2:L 70 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 71 | w_tilde_index = [w_tilde_index,(index_track(j-1)+n_units(j)+1):index_track(j)]; 72 | end 73 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 74 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 75 | d_theta = d_w+d_beta; % Total number of parameters 76 | w_tilde_index = [w_tilde_index,(d_w+2:d_theta)]; 77 | d_w_tilde = length(w_tilde_index); 78 | 79 | % Initialise weights and set initial mu equal to initial weights 80 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 81 | weights = nnInitialize(layers); 82 | mu=[]; 83 | for i=1:length(layers)-1 84 | mu=[mu;weights{i}(:)]; 85 | end 86 | % Initialize b and c 87 | % b = normrnd(0,cScale,d_theta,1); 88 | b = cScale*rand(d_theta,1); 89 | if isotropic 90 | c = cScale; 91 | else 92 | c = cScale*ones(d_theta,1); 93 | end 94 | % Initialize lambda 95 | lambda=[mu;b;c]; 96 | 97 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 98 | W_seq{1} = W1; 99 | for j = 2:L 100 | index = index_track(j-1)+1:index_track(j); 101 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 102 | W_seq{j} = Wj; 103 | end 104 | beta = mu(d_w+1:d_theta); 105 | 106 | % Get mini-batch 107 | idx = randperm(datasize,batchsize); 108 | minibatch = data(idx,:); 109 | y = minibatch(:,1); 110 | X = minibatch(:,2:end); 111 | 112 | % Remove this after doing R verison 113 | % X = X_train; 114 | % y = y_train; 115 | 116 | % minibatch = datasample(data,batchsize); 117 | % y = minibatch(:,1); 118 | % X = minibatch(:,2:end); 119 | 120 | % Hyperparameters for inverse-Gamma prior on sigma2 if y~Nomal(0,sigma2) 121 | if(strcmp(distr,'normal')) 122 | alpha0_sigma2 = 10; 123 | beta0_sigma2 = (alpha0_sigma2-1)*std(y); 124 | alpha_sigma2 = alpha0_sigma2 + length(y_train)/2; % Optimal VB parameter for updating sigma2 125 | beta_sigma2 = alpha_sigma2; % Mean_sigma2 and mean_sigma2_inverse are 126 | % Initialised at small values 1/2 and 1 respectively 127 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 128 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 129 | mean_sigma2_save(1) = mean_sigma2; 130 | end 131 | 132 | % Compute prediction loss if not using lowerbound for validation 133 | if(~lbFlag) 134 | if(strcmp(distr,'normal')) 135 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 136 | disp(['Initial MSE: ',num2str(MSE_current)]); 137 | else 138 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 139 | disp(['Initial PPS: ',num2str(PPS_current)]); 140 | end 141 | MSE_DL(1) = MSE_current; 142 | PPS_DL(1) = PPS_current; 143 | end 144 | 145 | % Calculations for group Lasso coefficients 146 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 147 | shrinkage_l2 = .01; % Hype-parameter for L2 prior 148 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 149 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 150 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 151 | if isotropic 152 | for j = 1:p 153 | mean_column_j_tilde = 
mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 154 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 155 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 156 | end 157 | lambda_tau = shrinkage_gamma.^2; 158 | else 159 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 160 | for j = 1:p 161 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 162 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 163 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 164 | end 165 | lambda_tau = shrinkage_gamma.^2; 166 | end 167 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 168 | shrinkage_gamma_seq = shrinkage_gamma; % 169 | mean_tau = 1./mu_tau+1./lambda_tau; 170 | m = n_units(1); 171 | 172 | % Prepare to calculate lowerbound 173 | if(lbFlag) 174 | if(strcmp(distr,'normal')) 175 | const = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)... 176 | -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 177 | -p*gammaln((n_units(1)+1)/2)-0.5*datasize*log(2*pi)... 178 | +p/2*log(2*pi)+0.5*d_theta*log(2*pi)+d_theta/2; 179 | else 180 | const = -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 181 | -p*gammaln((n_units(1)+1)/2)+p/2*log(2*pi)... 182 | +0.5*d_theta*log(2*pi)+d_theta/2; 183 | end 184 | 185 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 186 | W_seq{1} = W1; 187 | for j = 2:L 188 | index = index_track(j-1)+1:index_track(j); 189 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 190 | W_seq{j} = Wj; 191 | end 192 | beta = mu(d_w+1:d_theta); 193 | mu_w_tilde = mu(w_tilde_index); 194 | b_w_tilde = b(w_tilde_index); 195 | c_w_tilde = c(w_tilde_index); 196 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 197 | iter = 1; 198 | vbLowerBound; 199 | % disp(['Initial LB: ',num2str(lb(iter))]); 200 | end 201 | 202 | %% Calcualte for the first iteration 203 | grad_g_lik_store = zeros(S,3*d_theta); 204 | lb_iter = zeros(1,S); 205 | %----------------------------Narutal Gradient (1st Iteration)-------------- 206 | vbGradientLogLB 207 | gradient_bar = gradient_lambda; 208 | if(lbFlag) 209 | lb(iter) = mean(lb_iter)/datasize; 210 | disp(['Initial LB: ',num2str(lb(iter))]); 211 | end 212 | %-------------------------------------------------------------------------- 213 | 214 | 215 | %% Training Phase 216 | % Prepare parameters for training 217 | idxEpoch = 0; % Index of current epoch 218 | iter = 1; % Index of current iteration 219 | stop = false; % Stop flag for early stopping 220 | lambda_best = lambda; % Store optimal lambda for output 221 | idxPatience = 0; % Index of number of consequent non-decreasing iterations 222 | % for early stopping 223 | disp('---------- Training Phase ----------') 224 | while ~stop 225 | iter = iter+1; 226 | 227 | %% ------------------Natural Gradient Calculation---------------------- 228 | % Get mini-batch 229 | idx = randperm(datasize,batchsize); 230 | minibatch = data(idx,:); 231 | y = minibatch(:,1); 232 | X = minibatch(:,2:end); 233 | 234 | % Remove this after doing R verison 235 | % X = X_train; 236 | % y = y_train; 237 | 238 | % minibatch = datasample(data,batchsize); 239 | % y = minibatch(:,1); 240 | % X = minibatch(:,2:end); 241 | 242 | % Calculate expected terms of lowerbound 243 | if(lbFlag) 244 | vbLowerBound; 245 | end 246 | 247 | % Calculate Natural Gradient 248 | vbGradientLogLB 249 | 250 | % Get lowerbound in the current iteration 251 | if(lbFlag) 252 | lb(iter) = mean(lb_iter)/datasize; 253 | end 254 | 
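% Descriptive note (added; not in the original source). The update block that
% follows performs one stochastic gradient-ascent step on the variational
% parameters using the momentum-smoothed natural gradient:
%   gradient_lambda is rescaled whenever its norm exceeds 100 (gradient clipping),
%   gradient_bar = momentum*gradient_bar + (1-momentum)*gradient_lambda,
%   stepsize     = lrate             while iter <= tau,
%                  lrate*tau/iter    afterwards (decaying learning rate),
%   lambda       = lambda + stepsize*gradient_bar.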
%---------------------------------------------------------------------- 255 | 256 | %% ------------------Stochastic gradient ascend update----------------- 257 | % Prevent exploding Gradient 258 | grad_norm = norm(gradient_lambda); 259 | norm_gradient_threshold = 100; 260 | if norm(gradient_lambda)>norm_gradient_threshold 261 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 262 | end 263 | 264 | % Momentum gradient 265 | gradient_bar_old = gradient_bar; 266 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 267 | 268 | % Adaptive learning rate 269 | if iter>tau 270 | stepsize=lrate*tau/iter; 271 | else 272 | stepsize=lrate; 273 | end 274 | 275 | % Gradient ascend 276 | lambda = lambda + stepsize*gradient_bar; 277 | 278 | % Restore model parameters from variational parameter lambda 279 | mu=lambda(1:d_theta,1); 280 | b=lambda(d_theta+1:2*d_theta,1); 281 | c=lambda(2*d_theta+1:end); 282 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 283 | W_seq{1} = W1; 284 | for j = 2:L 285 | index = index_track(j-1)+1:index_track(j); 286 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 287 | W_seq{j} = Wj; 288 | end 289 | beta = mu(d_w+1:d_theta); 290 | %---------------------------------------------------------------------- 291 | 292 | %% ---------------- Update tau and shrinkage parameters---------------- 293 | if mod(iter,1) == 0 294 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 295 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 296 | if isotropic 297 | for j = 1:p 298 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 299 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 300 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 301 | lambda_tau(j) = shrinkage_gamma(j)^2; 302 | end 303 | else 304 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 305 | for j = 1:p 306 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
307 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 308 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 309 | lambda_tau(j) = shrinkage_gamma(j)^2; 310 | end 311 | end 312 | mean_inverse_tau = mu_tau; 313 | mean_tau = 1./mu_tau+1./lambda_tau; 314 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 315 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 316 | 317 | mu_w_tilde = mu(w_tilde_index); 318 | b_w_tilde = b(w_tilde_index); 319 | c_w_tilde = c(w_tilde_index); 320 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 321 | % shrinkage_l2 = length(w_tilde_index)/mean_w_tilde; 322 | end 323 | %---------------------------------------------------------------------- 324 | 325 | %% ------Update VB posterior for sigma2, which is inverse Gamma ------- 326 | % if y ~ N(0,sigma2) 327 | if(strcmp(distr,'normal')) 328 | if (mod(iter,1) == 0) 329 | sum_squared = sumResidualSquared(y_train,X_train,W_seq,beta); 330 | beta_sigma2 = beta0_sigma2+sum_squared/2; 331 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 332 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 333 | mean_sigma2_save = [mean_sigma2_save,mean_sigma2]; 334 | end 335 | end 336 | %---------------------------------------------------------------------- 337 | 338 | %% ----------------------------Validation------------------------------ 339 | % If using lowerbound for validation 340 | if(lbFlag) 341 | % Storing lowerbound moving average values 342 | if (iter>LBwindow) 343 | lb_bar(iter-LBwindow) = mean(lb(iter-LBwindow+1:iter)); 344 | if lb_bar(end)>=max(lb_bar) 345 | lambda_best = lambda; 346 | idxPatience = 0; 347 | else 348 | idxPatience = idxPatience+1; 349 | % disp(['idxPatience: ',num2str(idxPatience)]) 350 | end 351 | end 352 | 353 | % If using MSE/Accuracy for validation 354 | else 355 | if(strcmp(distr,'normal')) 356 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 357 | else 358 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 359 | end 360 | 361 | MSE_DL(iter) = MSE_current; 362 | PPS_DL(iter) = PPS_current; 363 | 364 | if PPS_DL(iter)>=PPS_DL(iter-1) 365 | gradient_bar = gradient_bar_old; 366 | end 367 | 368 | if PPS_DL(iter)<=min(PPS_DL) 369 | lambda_best = lambda; 370 | idxPatience = 0; 371 | else 372 | idxPatience = idxPatience+1; 373 | % disp(['idxPatience: ',num2str(idxPatience)]) 374 | end 375 | end 376 | 377 | % Early stopping 378 | if (idxPatience>patience)||(idxEpoch>epoch) 379 | stop = true; 380 | end 381 | %---------------------------------------------------------------------- 382 | 383 | %% ------------------------------Display------------------------------- 384 | % Display epoch index whenever an epoch is finished 385 | if(~mod(iter,num1Epoch)) 386 | idxEpoch = idxEpoch + 1; 387 | end 388 | 389 | % Display training results after each 'verbose' iteration 390 | if (verbose && ~mod(iter,verbose)) 391 | if(lbFlag) % Display lowerbound 392 | % disp(['Epoch: ',num2str(idxEpoch)]); 393 | 394 | if (iter>LBwindow) 395 | disp(['Epoch: ',num2str(idxEpoch),' - ',... 396 | 'Current LB: ',num2str(lb_bar(iter-LBwindow))]); 397 | else 398 | disp(['Epoch: ',num2str(idxEpoch),' - ',... 
399 | 'Current LB: ',num2str(lb(iter))]); 400 | end 401 | else % Or display MSE/Accuracy 402 | if(strcmp(distr,'binomial')) 403 | disp(['Current PPS: ',num2str(PPS_current)]); 404 | else 405 | disp(['Current MSE: ',num2str(MSE_current)]); 406 | end 407 | end 408 | end 409 | %---------------------------------------------------------------------- 410 | 411 | end 412 | 413 | %% --------------------------Display Training Results---------------------- 414 | disp('---------- Training Completed! ----------') 415 | disp(['Number of iteration:',num2str(iter)]); 416 | if(lbFlag) 417 | disp(['LBBar best: ',num2str(max(lb_bar))]); 418 | else 419 | disp(['PPS best: ',num2str(min(PPS_DL))]); 420 | disp(['MSE best: ',num2str(min(MSE_DL))]); 421 | end 422 | 423 | %% ----------------------Store training output----------------------------- 424 | lambda = lambda_best; 425 | mu = lambda(1:d_theta,1); 426 | b = lambda(d_theta+1:2*d_theta,1); 427 | c = lambda(2*d_theta+1:end); 428 | if isotropic % For isotropic structure 429 | SIGMA = b*b' + c^2*eyes(d_theta); 430 | else 431 | SIGMA = b*b' + diag(c.^2); 432 | end 433 | 434 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 435 | W_seq{1} = W1; 436 | for j = 2:L 437 | index = index_track(j-1)+1:index_track(j); 438 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 439 | W_seq{j} = Wj; 440 | end 441 | beta = mu(d_w+1:d_w+d_beta); 442 | 443 | % Store output in a struct 444 | est.out.weights = W_seq; 445 | est.out.beta = beta; 446 | est.out.shrinkage = shrinkage_gamma_seq; 447 | est.out.iteration = iter; 448 | est.out.vbMU = mu; % Mean of variational distribution of weights 449 | est.out.b = b; 450 | est.out.c = c; 451 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 452 | % of weights 453 | est.out.nparams = d_theta; % Number of parameters 454 | est.out.indexTrack = index_track; 455 | est.out.muTau = mu_tau; 456 | 457 | if(strcmp(distr,'normal')) 458 | est.out.sigma2Alpha = alpha_sigma2; 459 | est.out.sigma2Beta = beta_sigma2; 460 | est.out.sigma2Mean = mean_sigma2_save(end); 461 | est.out.sigma2MeanIter = mean_sigma2_save; 462 | end 463 | 464 | if(lbFlag) 465 | est.out.lbBar = lb_bar(2:end); 466 | est.out.lb = lb; 467 | else 468 | if(strcmp(distr,'binomial')) 469 | est.out.accuracy = MSE_DL; 470 | else 471 | est.out.mse = MSE_DL; 472 | end 473 | est.out.pps = PPS_DL; 474 | end 475 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMTrainTest.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMTrainTest(X_train,y_train,est) 2 | % Traing a fGLM model with continuous reponse y. 3 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 4 | % regularization is put on the rest. sigma2 and tau are updated by 5 | % mean-field VB. 
Inverse gamma prior is used for sigma2 6 | % INPUT 7 | % X_train, y_train: Training data (continuous response) 8 | % X_validation, y_validation: Validation data 9 | % n_units: Vector specifying the numbers of units in 10 | % each layer 11 | % batchsize: Mini-batch size used in each iteration 12 | % eps0: Constant learning rate 13 | % isotropic: True if isotropic structure on Sigma is 14 | % used, otherwise rank-1 structure is used 15 | % OUTPUT 16 | % W_seq: The optimal weights upto the last hidden 17 | % layer 18 | % beta The optimal weights that connect last hidden layer to the output 19 | % mean_sigma2 Estimate of sigma2 20 | % shrinkage_gamma_seq Update of shrinkage parameters over 21 | % iteration 22 | % MSE_DL Mean squared error over iteration 23 | % 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | % Extract training data and settings from input struct 34 | X_val = est.data.Xval; 35 | y_val = est.data.yval; 36 | n_units = est.network; 37 | batchsize = est.batchsize; 38 | lrate = est.lrate; 39 | isotropic = est.isIsotropic; 40 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 41 | tau = est.tau; % Threshold before reducing constant learning rate eps0 42 | grad_weight = est.momentum; % Weight in the momentum 43 | cScale = est.c; % Random scale factor to initialize b,c 44 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 45 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 46 | verbose = est.verbose; 47 | distr = est.dist; 48 | lbFlag = est.lowerbound; % Lowerbound flag 49 | LBwindow = 20; 50 | seed = est.seed; 51 | 52 | if(~isnan(seed)) 53 | rng(seed); 54 | end 55 | 56 | % mu_tau = est.muTau; % Parameters for the auxiliary tau_j 57 | 58 | % Data merge for mini-batch sampling 59 | data = [y_train,X_train]; 60 | datasize = length(y_train); 61 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 62 | 63 | % Network parameters 64 | L = length(n_units); % Number of hidden layers 65 | p = size(X_train,2)-1; % Number of covariates 66 | W_seq = cell(1,L); % Cells to store weight matrices 67 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
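% Descriptive note (added; not in the original source). The variational
% parameter vector used below is lambda = [mu; b; c], where mu (length d_theta)
% is the mean of the Gaussian variational posterior over all network weights
% and the pair (b,c) parameterizes its covariance:
%   SIGMA = b*b' + c^2*I         if isotropic is true (c is a scalar),
%   SIGMA = b*b' + diag(c.^2)    otherwise (c has length d_theta).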
68 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 69 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 70 | w_tilde_index = []; % indices of non-biase weights, excluding W1, for l2-regulization prior 71 | for j = 2:L 72 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 73 | w_tilde_index = [w_tilde_index,(index_track(j-1)+n_units(j)+1):index_track(j)]; 74 | end 75 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 76 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 77 | d_theta = d_w+d_beta; % Total number of parameters 78 | w_tilde_index = [w_tilde_index,(d_w+2:d_theta)]; 79 | d_w_tilde = length(w_tilde_index); 80 | 81 | % Initialise weights and set initial mu equal to initial weights 82 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 83 | weights = nnInitialize(layers); 84 | mu=[]; 85 | for i=1:length(layers)-1 86 | mu=[mu;weights{i}(:)]; 87 | end 88 | % Initialize b and c 89 | b = normrnd(0,cScale,d_theta,1); 90 | if isotropic 91 | c = cScale; 92 | else 93 | c = cScale*ones(d_theta,1); 94 | end 95 | % Initialize lambda 96 | lambda=[mu;b;c]; 97 | 98 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 99 | W_seq{1} = W1; 100 | for j = 2:L 101 | index = index_track(j-1)+1:index_track(j); 102 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 103 | W_seq{j} = Wj; 104 | end 105 | beta = mu(d_w+1:d_theta); 106 | 107 | % if(batchsize~=datasize) 108 | % % Get mini-batch 109 | % idx = randperm(datasize,batchsize); 110 | % minibatch = data(idx,:); 111 | % y = minibatch(:,1); 112 | % X = minibatch(:,2:end); 113 | % else 114 | % y = y_train; 115 | % X = X_train; 116 | % end 117 | 118 | 119 | % % Hyperparameters for inverse-Gamma prior on sigma2 if y~Nomal(0,sigma2) 120 | % if(strcmp(distr,'normal')) 121 | % alpha0_sigma2 = 10; 122 | % beta0_sigma2 = (alpha0_sigma2-1)*std(y); 123 | % alpha_sigma2 = alpha0_sigma2 + length(y_train)/2; % Optimal VB parameter for updating sigma2 124 | % beta_sigma2 = alpha_sigma2; % Mean_sigma2 and mean_sigma2_inverse are 125 | % % Initialised at small values 1/2 and 1 respectively 126 | % mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 127 | % mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 128 | % mean_sigma2_save(1) = mean_sigma2; 129 | % end 130 | 131 | % Compute prediction loss if not using lowerbound for validation 132 | if(~lbFlag) 133 | if(strcmp(distr,'normal')) 134 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 135 | disp(['Initial MSE: ',num2str(MSE_current)]); 136 | else 137 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 138 | disp(['Initial PPS: ',num2str(PPS_current)]); 139 | end 140 | MSE_DL(1) = MSE_current; 141 | PPS_DL(1) = PPS_current; 142 | end 143 | 144 | % Calculations for group Lasso coefficients 145 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 146 | shrinkage_l2 = .01; % Hype-parameter for L2 prior 147 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 148 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 149 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 150 | if isotropic 151 | for j = 1:p 152 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
153 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 154 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 155 | end 156 | lambda_tau = shrinkage_gamma.^2; 157 | else 158 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 159 | for j = 1:p 160 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 161 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 162 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 163 | end 164 | lambda_tau = shrinkage_gamma.^2; 165 | end 166 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 167 | shrinkage_gamma_seq = shrinkage_gamma; % 168 | mean_tau = 1./mu_tau+1./lambda_tau; 169 | m = n_units(1); 170 | 171 | % Prepare to calculate lowerbound 172 | if(lbFlag) 173 | if(strcmp(distr,'normal')) 174 | const = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)... 175 | -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 176 | -p*gammaln((n_units(1)+1)/2)-0.5*batchsize*log(2*pi)... 177 | +p/2*log(2*pi)+0.5*d_theta*log(2*pi)+d_theta/2; 178 | else 179 | const = -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 180 | -p*gammaln((n_units(1)+1)/2)+p/2*log(2*pi)... 181 | +0.5*d_theta*log(2*pi)+d_theta/2; 182 | end 183 | 184 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 185 | W_seq{1} = W1; 186 | for j = 2:L 187 | index = index_track(j-1)+1:index_track(j); 188 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 189 | W_seq{j} = Wj; 190 | end 191 | beta = mu(d_w+1:d_theta); 192 | mu_w_tilde = mu(w_tilde_index); 193 | b_w_tilde = b(w_tilde_index); 194 | c_w_tilde = c(w_tilde_index); 195 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 196 | iter = 1; 197 | vbLowerBound; 198 | % disp(['Initial LB: ',num2str(lb(iter))]); 199 | end 200 | 201 | %% Calcualte for the first iteration 202 | grad_g_lik_store = zeros(S,3*d_theta); 203 | lb_iter = zeros(1,S); 204 | %----------------------------Narutal Gradient (1st Iteration)-------------- 205 | if(batchsize~=datasize) 206 | % Get mini-batch 207 | idx = randperm(datasize,batchsize); 208 | minibatch = data(idx,:); 209 | y = minibatch(:,1); 210 | X = minibatch(:,2:end); 211 | else 212 | y = y_train; 213 | X = X_train; 214 | end 215 | 216 | vbGradientLogLB 217 | gradient_bar = gradient_lambda; 218 | if(lbFlag) 219 | lb(iter) = mean(lb_iter)/batchsize; 220 | disp(['Initial LB: ',num2str(lb(iter))]); 221 | end 222 | %-------------------------------------------------------------------------- 223 | 224 | 225 | %% Training Phase 226 | % Prepare parameters for training 227 | idxEpoch = 0; % Index of current epoch 228 | iter = 1; % Index of current iteration 229 | stop = false; % Stop flag for early stopping 230 | lambda_best = lambda; % Store optimal lambda for output 231 | idxPatience = 0; % Index of number of consequent non-decreasing iterations 232 | % for early stopping 233 | disp('---------- Training Phase ----------') 234 | while ~stop 235 | iter = iter+1; 236 | 237 | %% ------------------Natural Gradient Calculation---------------------- 238 | if(batchsize~=datasize) 239 | % Get mini-batch 240 | idx = randperm(datasize,batchsize); 241 | minibatch = data(idx,:); 242 | y = minibatch(:,1); 243 | X = minibatch(:,2:end); 244 | else 245 | y = y_train; 246 | X = X_train; 247 | end 248 | 249 | % Calculate expected terms of lowerbound 250 | if(lbFlag) 251 | vbLowerBound; 252 | end 253 | 254 | % Calculate Natural Gradient 255 | vbGradientLogLB 256 | 257 | % Get lowerbound in the current iteration 258 | 
if(lbFlag) 259 | lb(iter) = mean(lb_iter)/batchsize; 260 | end 261 | %---------------------------------------------------------------------- 262 | 263 | %% ------------------Stochastic gradient ascend update----------------- 264 | % Prevent exploding Gradient 265 | grad_norm = norm(gradient_lambda); 266 | norm_gradient_threshold = 100; 267 | if norm(gradient_lambda)>norm_gradient_threshold 268 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 269 | end 270 | 271 | % Momentum gradient 272 | gradient_bar_old = gradient_bar; 273 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 274 | 275 | % Adaptive learning rate 276 | if iter>tau 277 | stepsize=lrate*tau/iter; 278 | else 279 | stepsize=lrate; 280 | end 281 | 282 | % Gradient ascend 283 | lambda=lambda+stepsize*gradient_bar; 284 | 285 | % Restore model parameters from variational parameter lambda 286 | mu=lambda(1:d_theta,1); 287 | b=lambda(d_theta+1:2*d_theta,1); 288 | c=lambda(2*d_theta+1:end); 289 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 290 | W_seq{1} = W1; 291 | for j = 2:L 292 | index = index_track(j-1)+1:index_track(j); 293 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 294 | W_seq{j} = Wj; 295 | end 296 | beta = mu(d_w+1:d_theta); 297 | %---------------------------------------------------------------------- 298 | 299 | %% ---------------- Update tau and shrinkage parameters---------------- 300 | if mod(iter,1) == 0 301 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 302 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 303 | if isotropic 304 | for j = 1:p 305 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 306 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 307 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 308 | lambda_tau(j) = shrinkage_gamma(j)^2; 309 | end 310 | else 311 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 312 | for j = 1:p 313 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
314 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 315 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 316 | lambda_tau(j) = shrinkage_gamma(j)^2; 317 | end 318 | end 319 | mean_inverse_tau = mu_tau; 320 | mean_tau = 1./mu_tau+1./lambda_tau; 321 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 322 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 323 | 324 | mu_w_tilde = mu(w_tilde_index); 325 | b_w_tilde = b(w_tilde_index); 326 | c_w_tilde = c(w_tilde_index); 327 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 328 | shrinkage_l2 = length(w_tilde_index)/mean_w_tilde; 329 | end 330 | %---------------------------------------------------------------------- 331 | 332 | %% ------Update VB posterior for sigma2, which is inverse Gamma ------- 333 | % if y ~ N(0,sigma2) 334 | if(strcmp(distr,'normal')) 335 | if (mod(iter,1) == 0) 336 | sum_squared = sumResidualSquared(y_train,X_train,W_seq,beta); 337 | beta_sigma2 = beta0_sigma2+sum_squared/2; 338 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 339 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 340 | mean_sigma2_save = [mean_sigma2_save,mean_sigma2]; 341 | end 342 | end 343 | %---------------------------------------------------------------------- 344 | 345 | %% ----------------------------Validation------------------------------ 346 | % Update lowerbound 347 | % vbLowerBound; 348 | 349 | % If using lowerbound for validation 350 | if(lbFlag) 351 | % Storing lowerbound moving average values 352 | if (iter>LBwindow) 353 | lb_bar(iter-LBwindow) = mean(lb(iter-LBwindow+1:iter)); 354 | if lb_bar(end)>=max(lb_bar) 355 | lambda_best = lambda; 356 | idxPatience = 0; 357 | else 358 | idxPatience = idxPatience+1; 359 | disp(['idxPatience: ',num2str(idxPatience)]) 360 | end 361 | end 362 | 363 | % If using MSE/Accuracy for validation 364 | else 365 | if(strcmp(distr,'normal')) 366 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 367 | else 368 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 369 | end 370 | 371 | MSE_DL(iter) = MSE_current; 372 | PPS_DL(iter) = PPS_current; 373 | 374 | if PPS_DL(iter)>=PPS_DL(iter-1) 375 | gradient_bar = gradient_bar_old; 376 | end 377 | 378 | if PPS_DL(iter)<=min(PPS_DL) 379 | lambda_best = lambda; 380 | idxPatience = 0; 381 | else 382 | idxPatience = idxPatience+1; 383 | disp(['idxPatience: ',num2str(idxPatience)]) 384 | end 385 | end 386 | 387 | % Early stopping 388 | if (idxPatience>patience)||(idxEpoch>epoch) 389 | stop = true; 390 | end 391 | %---------------------------------------------------------------------- 392 | 393 | %% ------------------------------Display------------------------------- 394 | % Display epoch index whenever an epoch is finished 395 | if(~mod(iter,num1Epoch)) 396 | idxEpoch = idxEpoch + 1; 397 | disp(['Epoch: ',num2str(idxEpoch)]); 398 | end 399 | 400 | % Display training results after each 'verbose' iteration 401 | if (verbose && ~mod(iter,verbose)) 402 | if(lbFlag) % Display lowerbound 403 | disp(['Current LB: ',num2str(lb(iter))]); 404 | if (iter>LBwindow) 405 | disp(['Current LB: ',num2str(lb_bar(iter-LBwindow))]); 406 | end 407 | else % Or display MSE/Accuracy 408 | if(strcmp(distr,'binomial')) 409 | disp(['Current PPS: ',num2str(PPS_current)]); 410 | else 411 | disp(['Current MSE: ',num2str(MSE_current)]); 412 | end 413 | end 414 | end 415 | %---------------------------------------------------------------------- 416 | 417 | 
end 418 | 419 | %% --------------------------Display Training Results---------------------- 420 | disp('---------- Training Completed! ----------') 421 | disp(['Number of iteration:',num2str(iter)]); 422 | if(lbFlag) 423 | disp(['LBBar best: ',num2str(max(lb_bar))]); 424 | else 425 | disp(['PPS best: ',num2str(min(PPS_DL))]); 426 | disp(['MSE best: ',num2str(min(MSE_DL))]); 427 | end 428 | 429 | %% ----------------------Store training output----------------------------- 430 | lambda = lambda_best; 431 | mu = lambda(1:d_theta,1); 432 | b = lambda(d_theta+1:2*d_theta,1); 433 | c = lambda(2*d_theta+1:end); 434 | if isotropic % For isotropic structure 435 | SIGMA = b*b' + c^2*eyes(d_theta); 436 | else 437 | SIGMA = b*b' + diag(c.^2); 438 | end 439 | 440 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 441 | W_seq{1} = W1; 442 | for j = 2:L 443 | index = index_track(j-1)+1:index_track(j); 444 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 445 | W_seq{j} = Wj; 446 | end 447 | beta = mu(d_w+1:d_w+d_beta); 448 | 449 | % Store output in a struct 450 | est.out.weights = W_seq; 451 | est.out.beta = beta; 452 | est.out.shrinkage = shrinkage_gamma_seq; 453 | est.out.iteration = iter; 454 | est.out.vbMU = mu; % Mean of variational distribution of weights 455 | est.out.b = b; 456 | est.out.c = c; 457 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 458 | % of weights 459 | est.out.nparams = d_theta; % Number of parameters 460 | est.out.indexTrack = index_track; 461 | est.out.muTau = mu_tau; 462 | 463 | if(strcmp(distr,'normal')) 464 | est.out.sigma2Alpha = alpha_sigma2; 465 | est.out.sigma2Beta = beta_sigma2; 466 | est.out.sigma2Mean = mean_sigma2_save(end); 467 | est.out.sigma2MeanIter = mean_sigma2_save; 468 | end 469 | 470 | if(lbFlag) 471 | est.out.lbBar = lb_bar(2:end); 472 | est.out.lb = lb; 473 | else 474 | if(strcmp(distr,'binomial')) 475 | est.out.accuracy = MSE_DL; 476 | else 477 | est.out.mse = MSE_DL; 478 | end 479 | est.out.pps = PPS_DL; 480 | end 481 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMfit.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/DeepGLM/train/deepGLMfit.m -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMlogitPoisson.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMlogitPoisson(X_train,y_train,est) 2 | %DEEPGLMBINOMIAL Summary of this function goes here 3 | % Traing a fGLM model with binary reponse y. 4 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 5 | % regularization is put on the rest. sigma2 and tau are updated by 6 | % mean-field VB. 
Inverse gamma prior is used for sigma2 7 | % INPUT 8 | % X_train, y_train: training data (continuous response) 9 | % X_validation, y_validation: validation data 10 | % n_units: vector specifying the numbers of units in 11 | % each layer 12 | % batchsize: mini-batch size used in each iteration 13 | % eps0: constant learning rate 14 | % isotropic: true if isotropic structure on Sigma is 15 | % used, otherwise rank-1 structure is used 16 | % OUTPUT 17 | % W_seq: the optimal weights upto the last hidden 18 | % layer 19 | % beta the optimal weights that connect last hidden layer to the output 20 | % mean_sigma2 estimate of sigma2 21 | % shrinkage_gamma_seq update of shrinkage parameters over 22 | % iteration 23 | % 24 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 25 | % Nguyen (nghia.nguyen@sydney.edu.au) 26 | % 27 | % http://www.xxx.com 28 | % 29 | % Version: 1.0 30 | % LAST UPDATE: April, 2018 31 | 32 | % Extract training data and settings from input struct 33 | Xval = est.data.Xval; 34 | yval = est.data.yval; 35 | n_units = est.network; 36 | batchsize = est.batchsize; 37 | lrate = est.lrate; 38 | isotropic = est.isIsotropic; 39 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 40 | tau = est.tau; % Threshold before reducing constant learning rate eps0 41 | grad_weight = est.momentum; % Weight in the momentum 42 | cScale = est.c; % Random scale factor to initialize b,c 43 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 44 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 45 | verbose = est.verbose; 46 | distr = est.dist; 47 | 48 | % Data merge for mini-batch sampling 49 | data = [y_train,X_train]; 50 | datasize = length(y_train); 51 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 52 | 53 | % Network parameters 54 | L = length(n_units); % Number of hidden layers 55 | p = size(X_train,2)-1; % Number of covariates 56 | W_seq = cell(1,L); % Cells to store weight matrices 57 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
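% Descriptive note (added; not in the original source). In the Monte Carlo
% loops further down, each of the S samples draws a weight vector by the
% reparameterization
%   theta = mu + epsilon1*b + c.*epsilon2,  epsilon1 ~ N(0,1), epsilon2 ~ N(0,I),
% so that theta ~ N(mu, b*b' + diag(c.^2)) in the non-isotropic case; the S
% per-sample gradients are then averaged to estimate the lower-bound gradient.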
58 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 59 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 60 | w_tilde_index = []; % indices of non-biase weights, excluding W1, for l2-regulization prior 61 | for j = 2:L 62 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 63 | w_tilde_index = [w_tilde_index,(index_track(j-1)+n_units(j)+1):index_track(j)]; 64 | end 65 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 66 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 67 | d_theta = d_w+d_beta; % Total number of parameters 68 | w_tilde_index = [w_tilde_index,(d_w+2:d_theta)]; 69 | d_w_tilde = length(w_tilde_index); 70 | 71 | % Initialise weights and set initial mu equal to initial weights 72 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 73 | weights = nnInitialize(layers); 74 | mu=[]; 75 | for i=1:length(layers)-1 76 | mu=[mu;weights{i}(:)]; 77 | end 78 | % Initialize b and c 79 | b = normrnd(0,cScale,d_theta,1); 80 | if isotropic 81 | c = cScale; 82 | else 83 | c = cScale*ones(d_theta,1); 84 | end 85 | % Initialize lambda 86 | lambda=[mu;b;c]; 87 | 88 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 89 | W_seq{1} = W1; 90 | for j = 2:L 91 | index = index_track(j-1)+1:index_track(j); 92 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 93 | W_seq{j} = Wj; 94 | end 95 | beta = mu(d_w+1:d_theta); 96 | 97 | [Loss_current,Accuracy_current] = deepGLMpredictLoss(Xval,yval,W_seq,beta,distr); % compute prediction loss 98 | disp(['Initial loss: ',num2str(Loss_current)]); 99 | PPS_DL(1) = Loss_current; 100 | Accuracy_DL(1) = Accuracy_current; 101 | 102 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 103 | shrinkage_l2 = .01; % Hype-parameter for L2 prior 104 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 105 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 106 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 107 | if isotropic 108 | for j = 1:p 109 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 110 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 111 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 112 | end 113 | lambda_tau = shrinkage_gamma.^2; 114 | else 115 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 116 | for j = 1:p 117 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
118 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 119 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 120 | end 121 | lambda_tau = shrinkage_gamma.^2; 122 | end 123 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 124 | shrinkage_gamma_seq = shrinkage_gamma; % 125 | 126 | minibatch = datasample(data,batchsize); 127 | y = minibatch(:,1); 128 | X = minibatch(:,2:end); 129 | 130 | rqmc = normrnd(0,1,S,d_theta+1); % using quasi MC random numbers 131 | for s=1:S 132 | U_normal = rqmc(s,:)'; 133 | epsilon1=U_normal(1); 134 | epsilon2=U_normal(2:end); 135 | theta=mu+epsilon1*b+c.*epsilon2; 136 | 137 | W_seq = cell(1,L); 138 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 139 | W_seq{1} = W1; 140 | W1_tilde = W1(:,2:end); % weights without biases 141 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 142 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:)]; 143 | for j = 2:L 144 | index = index_track(j-1)+1:index_track(j); 145 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 146 | W_seq{j} = Wj; 147 | Wj_tilde = Wj(:,2:end); 148 | grad_prior_Wj = [zeros(n_units(j),1);-shrinkage_l2*Wj_tilde(:)]; 149 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_Wj]; 150 | end 151 | beta = theta(d_w+1:d_theta); 152 | beta_tilde = beta(2:end); % vector beta without intercept 153 | grad_prior_beta = [0;-shrinkage_l2*beta_tilde]; 154 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_beta]; 155 | 156 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 157 | 158 | grad_h = grad_prior_w_beta+grad_llh; % gradient of log prior plus log-likelihood 159 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 160 | grad_theta = grad_h-grad_log_q; 161 | 162 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 163 | end 164 | grad_lb = (mean(grad_g_lik_store))'; 165 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 166 | gradient_bar = gradient_lambda; 167 | 168 | % Prepare parameters for training 169 | idxEpoch = 0; % index of current epoch 170 | iter = 1; % index of current iteration 171 | stop = false; % Stop flag for early stopping 172 | lambda_best = lambda; % Store optimal lambda for output 173 | idxPatience = 0; % index of number of consequent non-decreasing iterations 174 | % for early stopping 175 | disp('---------- Training Phase ----------') 176 | while ~stop 177 | iter = iter+1; 178 | % Display training results after each 'verbose' iteration 179 | if (verbose && ~mod(iter,verbose)) 180 | if(~mod(iter,num1Epoch)) 181 | idxEpoch = idxEpoch + 1; 182 | disp(['Epoch: ',num2str(idxEpoch)]); 183 | end 184 | if(strcmp(distr,'binomial')) 185 | disp(['Current PPS: ',num2str(Loss_current)]); 186 | else 187 | disp(['Current MSE: ',num2str(Accuracy_current)]); 188 | end 189 | end 190 | 191 | minibatch = datasample(data,batchsize); 192 | y = minibatch(:,1); 193 | X = minibatch(:,2:end); 194 | 195 | rqmc = normrnd(0,1,S,d_theta+1); % using quasi MC random numbers 196 | for s=1:S 197 | U_normal = rqmc(s,:)'; 198 | epsilon1=U_normal(1); 199 | epsilon2=U_normal(2:end); 200 | theta=mu+b*epsilon1+c.*epsilon2; 201 | 202 | W_seq = cell(1,L); 203 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 204 | W_seq{1} = W1; 205 | W1_tilde = W1(:,2:end); % weights without biases 206 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 207 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:)]; 208 | for j = 2:L 209 | index = index_track(j-1)+1:index_track(j); 210 | Wj = 
reshape(theta(index),n_units(j),n_units(j-1)+1); 211 | W_seq{j} = Wj; 212 | Wj_tilde = Wj(:,2:end); 213 | grad_prior_Wj = [zeros(n_units(j),1);-shrinkage_l2*Wj_tilde(:)]; 214 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_Wj]; 215 | end 216 | beta = theta(d_w+1:d_w+d_beta); 217 | beta_tilde = beta(2:end); % vector beta without intercept 218 | grad_prior_beta = [0;-shrinkage_l2*beta_tilde]; 219 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_beta]; 220 | 221 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 222 | grad_h = grad_prior_w_beta+grad_llh; 223 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 224 | grad_theta = grad_h-grad_log_q; 225 | 226 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 227 | end 228 | grad_lb = (mean(grad_g_lik_store))'; 229 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 230 | 231 | grad_norm = norm(gradient_lambda); 232 | norm_gradient_threshold = 100; 233 | if norm(gradient_lambda)>norm_gradient_threshold 234 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 235 | end 236 | 237 | gradient_bar_old = gradient_bar; 238 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 239 | 240 | if iter>tau 241 | stepsize=lrate*tau/iter; 242 | else 243 | stepsize=lrate; 244 | end 245 | 246 | lambda=lambda+stepsize*gradient_bar; 247 | 248 | mu=lambda(1:d_theta,1); 249 | b=lambda(d_theta+1:2*d_theta,1); 250 | c=lambda(2*d_theta+1:end); 251 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 252 | W_seq{1} = W1; 253 | for j = 2:L 254 | index = index_track(j-1)+1:index_track(j); 255 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 256 | W_seq{j} = Wj; 257 | end 258 | beta = mu(d_w+1:d_theta); 259 | 260 | % update tau and shrinkage parameters 261 | if mod(iter,5) == 0 262 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 263 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 264 | if isotropic 265 | for j = 1:p 266 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 267 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 268 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 269 | lambda_tau(j) = shrinkage_gamma(j)^2; 270 | end 271 | else 272 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 273 | for j = 1:p 274 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
275 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 276 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 277 | lambda_tau(j) = shrinkage_gamma(j)^2; 278 | end 279 | end 280 | mean_inverse_tau = mu_tau; 281 | mean_tau = 1./mu_tau+1./lambda_tau; 282 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 283 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 284 | 285 | mu_w_tilde = mu(w_tilde_index); 286 | b_w_tilde = b(w_tilde_index); 287 | c_w_tilde = c(w_tilde_index); 288 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 289 | % shrinkage_l2 = length(w_tilde_index)/mean_w_tilde; 290 | end 291 | 292 | 293 | [Loss_current,Accuracy_current] = deepGLMpredictLoss(Xval,yval,W_seq,beta,distr); % compute prediction loss 294 | PPS_DL(iter) = Loss_current; 295 | Accuracy_DL(iter) = Accuracy_current; 296 | 297 | if PPS_DL(iter)>=PPS_DL(iter-1) 298 | gradient_bar = gradient_bar_old; 299 | end 300 | 301 | if PPS_DL(iter)<=min(PPS_DL) 302 | lambda_best = lambda; 303 | idxPatience = 0; 304 | idxbest = iter; 305 | else 306 | idxPatience = idxPatience+1; 307 | end 308 | 309 | if (idxPatience>patience)||(idxEpoch>epoch) 310 | stop = true; 311 | end 312 | end 313 | 314 | % Showing that training phase has completed 315 | disp('---------- Training Completed! ----------') 316 | disp(['Number of iteration:',num2str(iter)]); 317 | disp(['PPS best: ',num2str(min(PPS_DL))]); 318 | if (strcmp(distr,'poisson')) 319 | disp(['MSE best: ',num2str(min(Accuracy_DL))]); 320 | est.out.mse = Accuracy_DL; 321 | else 322 | disp(['Accuracy best: ',num2str(max(Accuracy_DL))]); 323 | est.out.accuracy = Accuracy_DL; 324 | end 325 | % Extract mode of weights to make prediction 326 | lambda = lambda_best; 327 | mu = lambda(1:d_theta,1); 328 | b = lambda(d_theta+1:2*d_theta,1); 329 | c = lambda(2*d_theta+1:end); 330 | if isotropic % For isotropic structure 331 | SIGMA = b*b' + c^2*eyes(d_theta); 332 | else 333 | SIGMA = b*b' + diag(c.^2); 334 | end 335 | 336 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 337 | W_seq{1} = W1; 338 | for j = 2:L 339 | index = index_track(j-1)+1:index_track(j); 340 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 341 | W_seq{j} = Wj; 342 | end 343 | beta = mu(d_w+1:d_w+d_beta); 344 | 345 | % Store output in a struct 346 | est.out.weights = W_seq; 347 | est.out.beta = beta; 348 | est.out.shrinkage = shrinkage_gamma_seq; 349 | est.out.iteration = iter; 350 | est.out.pps = PPS_DL; 351 | est.out.vbMU = mu; % Mean of variational distribution of weights 352 | est.out.b = b; 353 | est.out.c = c; 354 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 355 | % of weights 356 | est.out.nparams = d_theta; % Number of parameters 357 | est.out.indexTrack = index_track; 358 | est.out.idxBest = idxbest; 359 | end 360 | 361 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMnormalCV.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMnormalCV(X_train,y_train,est) 2 | % Traing a fGLM model with continuous reponse y. 3 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 4 | % regularization is put on the rest. sigma2 and tau are updated by 5 | % mean-field VB. 
Inverse gamma prior is used for sigma2 6 | % INPUT 7 | % X_train, y_train: Training data (continuous response) 8 | % X_validation, y_validation: Validation data 9 | % n_units: Vector specifying the numbers of units in 10 | % each layer 11 | % batchsize: Mini-batch size used in each iteration 12 | % eps0: Constant learning rate 13 | % isotropic: True if isotropic structure on Sigma is 14 | % used, otherwise rank-1 structure is used 15 | % OUTPUT 16 | % W_seq: The optimal weights upto the last hidden 17 | % layer 18 | % beta The optimal weights that connect last hidden layer to the output 19 | % mean_sigma2 Estimate of sigma2 20 | % shrinkage_gamma_seq Update of shrinkage parameters over 21 | % iteration 22 | % MSE_DL Mean squared error over iteration 23 | % 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | % Extract training data and settings from input struct 34 | X_val = est.data.Xval; 35 | y_val = est.data.yval; 36 | n_units = est.network; 37 | batchsize = est.batchsize; 38 | lrate = est.lrate; 39 | isotropic = est.isIsotropic; 40 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 41 | tau = est.tau; % Threshold before reducing constant learning rate eps0 42 | grad_weight = est.momentum; % Weight in the momentum 43 | cScale = est.c; % Random scale factor to initialize b,c 44 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 45 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 46 | verbose = est.verbose; 47 | distr = est.dist; 48 | 49 | % Data merge for mini-batch sampling 50 | data = [y_train,X_train]; 51 | datasize = length(y_train); 52 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 53 | 54 | % Network parameters 55 | L = length(n_units); % Number of hidden layers 56 | p = size(X_train,2)-1; % Number of covariates 57 | W_seq = cell(1,L); % Cells to store weight matrices 58 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
59 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 60 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 61 | for j = 2:L 62 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 63 | end 64 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 65 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 66 | d_theta = d_w+d_beta; % Total number of parameters 67 | 68 | % Initialise weights and set initial mu equal to initial weights 69 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 70 | weights = nnInitialize(layers); 71 | mu=[]; 72 | for i=1:length(layers)-1 73 | mu=[mu;weights{i}(:)]; 74 | end 75 | % Initialize b and c 76 | b = normrnd(0,cScale,d_theta,1); 77 | if isotropic 78 | c = cScale; 79 | else 80 | c = cScale*ones(d_theta,1); 81 | end 82 | % Initialize lambda 83 | lambda=[mu;b;c]; 84 | 85 | % Hyperparameters for inverse-Gamma prior on sigma2 86 | alpha0_sigma2 = 0; 87 | beta0_sigma2 = 0; 88 | 89 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 90 | W_seq{1} = W1; 91 | for j = 2:L 92 | index = index_track(j-1)+1:index_track(j); 93 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 94 | W_seq{j} = Wj; 95 | end 96 | beta = mu(d_w+1:d_theta); 97 | alpha_sigma2 = alpha0_sigma2+length(y_train)/2; % Optimal VB parameter for updating sigma2 98 | beta_sigma2 = alpha_sigma2; % Mean_sigma2 and mean_sigma2_inverse are 99 | % Initialised at small values 1/2 and 1 respectively 100 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 101 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 102 | mean_sigma2_save(1) = mean_sigma2; 103 | % Compute prediction loss 104 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 105 | disp(['Initial MSE: ',num2str(MSE_current)]); 106 | MSE_DL(1) = MSE_current; 107 | PPS_DL(1) = PPS_current; 108 | 109 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 110 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 111 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 112 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 113 | if isotropic 114 | for j = 1:p 115 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 116 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 117 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 118 | end 119 | lambda_tau = shrinkage_gamma.^2; 120 | else 121 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 122 | for j = 1:p 123 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
124 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 125 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 126 | end 127 | lambda_tau = shrinkage_gamma.^2; 128 | end 129 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 130 | shrinkage_gamma_seq = shrinkage_gamma; % 131 | 132 | minibatch = datasample(data,batchsize); 133 | y = minibatch(:,1); 134 | X = minibatch(:,2:end); 135 | 136 | rqmc = normrnd_qmc(S,d_theta+1); % Using quasi MC random numbers 137 | % grad_g_lik_store = zeros(S,) 138 | for s=1:S 139 | U_normal = rqmc(s,:)'; 140 | epsilon1=U_normal(1); 141 | epsilon2=U_normal(2:end); 142 | theta=mu+epsilon1*b+c.*epsilon2; 143 | 144 | W_seq = cell(1,L); 145 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 146 | W_seq{1} = W1; 147 | for j = 2:L 148 | index = index_track(j-1)+1:index_track(j); 149 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 150 | W_seq{j} = Wj; 151 | end 152 | beta = theta(d_w+1:d_theta); 153 | 154 | W1_tilde = W1(:,2:end); % Weights without biases 155 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 156 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 157 | 158 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse); 159 | 160 | grad_h = grad_prior_w_beta+grad_llh; % Gradient of log prior plus log-likelihood 161 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 162 | grad_theta = grad_h-grad_log_q; 163 | 164 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 165 | end 166 | grad_lb = (mean(grad_g_lik_store))'; 167 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 168 | gradient_bar = gradient_lambda; 169 | 170 | % Prepare parameters for training 171 | idxEpoch = 0; % Index of current epoch 172 | iter = 1; % Index of current iteration 173 | stop = false; % Stop flag for early stopping 174 | lambda_best = lambda; % Store optimal lambda for output 175 | idxPatience = 0; % Index of number of consequent non-decreasing iterations 176 | % for early stopping 177 | disp('---------- Training Phase ----------') 178 | while ~stop 179 | iter = iter+1; 180 | % Display training results after each 'verbose' iteration 181 | if (verbose && ~mod(iter,verbose)) 182 | if(~mod(iter,num1Epoch)) 183 | idxEpoch = idxEpoch + 1; 184 | disp(['Epoch: ',num2str(idxEpoch)]); 185 | end 186 | disp(['Current MSE: ',num2str(MSE_current)]); 187 | end 188 | % minibatch = datasample(data,batchsize); 189 | for k=1:5 190 | [X,y,X_val,y_val] = splitData(X,y,k,5); 191 | rqmc = normrnd_qmc(S,d_theta+1); 192 | for s=1:S 193 | U_normal = rqmc(s,:)'; 194 | epsilon1=U_normal(1); 195 | epsilon2=U_normal(2:end); 196 | theta=mu+b*epsilon1+c.*epsilon2; 197 | 198 | W_seq = cell(1,L); 199 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 200 | W_seq{1} = W1; 201 | for j = 2:L 202 | index = index_track(j-1)+1:index_track(j); 203 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 204 | W_seq{j} = Wj; 205 | end 206 | beta = theta(d_w+1:d_w+d_beta); 207 | 208 | W1_tilde = W1(:,2:end); % Weights without biases 209 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 210 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 211 | 212 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse); 213 | 214 | grad_h = grad_prior_w_beta+grad_llh; 215 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 216 | grad_theta = grad_h-grad_log_q; 217 | 218 | grad_g_lik_store(s,:) 
= [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 219 | end 220 | grad_lb = (mean(grad_g_lik_store))'; 221 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 222 | 223 | grad_norm = norm(gradient_lambda); 224 | norm_gradient_threshold = 100; 225 | if norm(gradient_lambda)>norm_gradient_threshold 226 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 227 | end 228 | 229 | gradient_bar_old = gradient_bar; 230 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 231 | 232 | if iter>tau 233 | stepsize=lrate*tau/iter; 234 | else 235 | stepsize=lrate; 236 | end 237 | 238 | lambda=lambda+stepsize*gradient_bar; 239 | 240 | mu=lambda(1:d_theta,1); 241 | b=lambda(d_theta+1:2*d_theta,1); 242 | c=lambda(2*d_theta+1:end); 243 | 244 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 245 | W_seq{1} = W1; 246 | for j = 2:L 247 | index = index_track(j-1)+1:index_track(j); 248 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 249 | W_seq{j} = Wj; 250 | end 251 | beta = mu(d_w+1:d_theta); 252 | 253 | % Update tau and shrinkage parameters 254 | % if mod(iter,10) == 0 255 | if mod(iter,1) == 0 256 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 257 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 258 | if isotropic 259 | for j = 1:p 260 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 261 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 262 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 263 | lambda_tau(j) = shrinkage_gamma(j)^2; 264 | end 265 | else 266 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 267 | for j = 1:p 268 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 269 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 270 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 271 | lambda_tau(j) = shrinkage_gamma(j)^2; 272 | end 273 | end 274 | mean_inverse_tau = mu_tau; 275 | mean_tau = 1./mu_tau+1./lambda_tau; 276 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 277 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 278 | end 279 | 280 | % Update VB posterior for sigma2, which is inverse Gamma 281 | % if (iter >= 1000)&&(mod(iter,100) == 0) 282 | if (iter >= 1)&&(mod(iter,1) == 0) 283 | beta_sigma2 = beta0_sigma2+sumResidualSquared(y_train,X_train,W_seq,beta)/2; 284 | % beta_sigma2 = beta0_sigma2+sumResidualSquared(y,X,W_seq,beta)/2; 285 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 286 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 287 | mean_sigma2_save = [mean_sigma2_save,mean_sigma2]; 288 | end 289 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 290 | end 291 | 292 | MSE_DL(iter) = MSE_current; 293 | PPS_DL(iter) = PPS_current; 294 | 295 | if MSE_DL(iter)>=MSE_DL(iter-1) 296 | gradient_bar = gradient_bar_old; 297 | end 298 | 299 | if MSE_DL(iter)<=min(MSE_DL) 300 | lambda_best = lambda; 301 | idxPatience = 0; 302 | idxbest = iter; 303 | else 304 | idxPatience = idxPatience+1; 305 | end 306 | 307 | if (idxPatience>patience)||(idxEpoch>epoch) 308 | stop = true; 309 | end 310 | end 311 | disp('---------- Training Completed! 
----------') 312 | disp(['Number of iteration:',num2str(iter)]); 313 | disp(['PPS best: ',num2str(min(PPS_DL))]); 314 | disp(['MSE best: ',num2str(min(MSE_DL))]); 315 | % disp(['Sigma best: ',num2str(sqrt(mean_sigma2_save(idxbest)))]); 316 | 317 | % Extract mode of weights to make prediction 318 | lambda = lambda_best; 319 | mu = lambda(1:d_theta,1); 320 | b = lambda(d_theta+1:2*d_theta,1); 321 | c = lambda(2*d_theta+1:end); 322 | if isotropic % For isotropic structure 323 | SIGMA = b*b' + c^2*eyes(d_theta); 324 | else 325 | SIGMA = b*b' + diag(c.^2); 326 | end 327 | 328 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 329 | W_seq{1} = W1; 330 | for j = 2:L 331 | index = index_track(j-1)+1:index_track(j); 332 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 333 | W_seq{j} = Wj; 334 | end 335 | beta = mu(d_w+1:d_w+d_beta); 336 | 337 | % Store output in a struct 338 | est.out.weights = W_seq; 339 | est.out.beta = beta; 340 | est.out.shrinkage = shrinkage_gamma_seq; 341 | est.out.iteration = iter; 342 | est.out.mse = MSE_DL; 343 | est.out.pps = PPS_DL; 344 | est.out.vbMU = mu; % Mean of variational distribution of weights 345 | est.out.b = b; 346 | est.out.c = c; 347 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 348 | % of weights 349 | est.out.nparams = d_theta; % Number of parameters 350 | est.out.indexTrack = index_track; 351 | est.out.sigma2Alpha = alpha_sigma2; 352 | est.out.sigma2Beta = beta_sigma2; 353 | % est.out.idxBest = idxbest; 354 | est.out.sigma2Mean = mean_sigma2_save(end); 355 | est.out.sigma2MeanIter = mean_sigma2_save; 356 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMpoisson.m: -------------------------------------------------------------------------------- 1 | function out = deepGLMpoisson(X_train,y_train,est) 2 | %DEEPGLMPOISSON Summary of this function goes here 3 | % 4 | % Traing a fGLM model with binary reponse y. 5 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 6 | % regularization is put on the rest. sigma2 and tau are updated by 7 | % mean-field VB. 
Inverse gamma prior is used for sigma2 8 | % INPUT 9 | % X_train, y_train: training data (continuous response) 10 | % X_validation, y_validation: validation data 11 | % n_units: vector specifying the numbers of units in 12 | % each layer 13 | % batchsize: mini-batch size used in each iteration 14 | % eps0: constant learning rate 15 | % isotropic: true if isotropic structure on Sigma is 16 | % used, otherwise rank-1 structure is used 17 | % OUTPUT 18 | % W_seq: the optimal weights upto the last hidden 19 | % layer 20 | % beta the optimal weights that connect last hidden layer to the output 21 | % mean_sigma2 estimate of sigma2 22 | % shrinkage_gamma_seq update of shrinkage parameters over 23 | % iteration 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | 34 | % Extract training data and settings from input struct 35 | Xval = est.data.Xval; 36 | yval = est.data.yval; 37 | n_units = est.network; 38 | batchsize = est.batchsize; 39 | lrate = est.lrate; 40 | isotropic = est.isIsotropic; 41 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 42 | tau = est.tau; % Threshold before reducing constant learning rate eps0 43 | grad_weight = est.momentum; % Weight in the momentum 44 | cScale = est.c; % Random scale factor to initialize b,c 45 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 46 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 47 | verbose = est.verbose; 48 | distr = est.dist; 49 | 50 | % Data merge for mini-batch sampling 51 | data = [y_train,X_train]; 52 | datasize = length(y_train); 53 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 54 | 55 | % Network parameters 56 | L = length(n_units); % Number of hidden layers 57 | p = size(X_train,2)-1; % Number of covariates 58 | W_seq = cell(1,L); % Cells to store weight matrices 59 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
60 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 61 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 62 | for j = 2:L 63 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 64 | end 65 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 66 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 67 | d_theta = d_w+d_beta; % Total number of parameters 68 | 69 | % Initialise weights and set initial mu equal to initial weights 70 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 71 | weights = nnInitialize(layers); 72 | mu=[]; 73 | for i=1:length(layers)-1 74 | mu=[mu;weights{i}(:)]; 75 | end 76 | % Initialize b and c 77 | b = normrnd(0,cScale,d_theta,1); 78 | if isotropic 79 | c = cScale; 80 | else 81 | c = cScale*ones(d_theta,1); 82 | end 83 | % Initialize lambda 84 | lambda=[mu;b;c]; 85 | 86 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 87 | W_seq{1} = W1; 88 | for j = 2:L 89 | index = index_track(j-1)+1:index_track(j); 90 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 91 | W_seq{j} = Wj; 92 | end 93 | beta = mu(d_w+1:d_theta); 94 | 95 | [Loss_current,~] = deepGLMpredict(Xval,yval,W_seq,beta,distr); % compute prediction loss 96 | disp(['Initial loss on validation set: ',num2str(Loss_current)]); 97 | Loss_DL(1) = Loss_current; 98 | 99 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 100 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 101 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 102 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 103 | if isotropic 104 | for j = 1:p 105 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 106 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 107 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 108 | end 109 | lambda_tau = shrinkage_gamma.^2; 110 | else 111 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 112 | for j = 1:p 113 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
114 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 115 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 116 | end 117 | lambda_tau = shrinkage_gamma.^2; 118 | end 119 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 120 | shrinkage_gamma_seq = shrinkage_gamma; % 121 | 122 | minibatch = datasample(data,batchsize); 123 | y = minibatch(:,1); 124 | X = minibatch(:,2:end); 125 | 126 | rqmc = normrnd_qmc(S,d_theta+1); % using quasi MC random numbers 127 | for s=1:S 128 | U_normal = rqmc(s,:)'; 129 | epsilon1=U_normal(1); 130 | epsilon2=U_normal(2:end); 131 | theta=mu+epsilon1*b+c.*epsilon2; 132 | 133 | W_seq = cell(1,L); 134 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 135 | W_seq{1} = W1; 136 | for j = 2:L 137 | index = index_track(j-1)+1:index_track(j); 138 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 139 | W_seq{j} = Wj; 140 | end 141 | beta = theta(d_w+1:d_theta); 142 | 143 | W1_tilde = W1(:,2:end); % weights without biases 144 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 145 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 146 | 147 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 148 | 149 | grad_h = grad_prior_w_beta+grad_llh; % gradient of log prior plus log-likelihood 150 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 151 | grad_theta = grad_h-grad_log_q; 152 | 153 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 154 | end 155 | grad_lb = (mean(grad_g_lik_store))'; 156 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 157 | gradient_bar = gradient_lambda; 158 | 159 | % Prepare parameters for training 160 | idxEpoch = 0; % index of current epoch 161 | iter = 1; % index of current iteration 162 | stop = false; % Stop flag for early stopping 163 | lambda_best = lambda; % Store optimal lambda for output 164 | idxPatience = 0; % index of number of consequent non-decreasing iterations 165 | % for early stopping 166 | disp('---------- Training Phase ----------') 167 | while ~stop 168 | iter = iter+1; 169 | % Display training results after each 'verbose' iteration 170 | if (verbose && ~mod(iter,verbose)) 171 | if(~mod(iter,num1Epoch)) 172 | idxEpoch = idxEpoch + 1; 173 | disp(['Epoch: ',num2str(idxEpoch)]); 174 | end 175 | disp(['Current PPS: ',num2str(Loss_current)]); 176 | end 177 | 178 | minibatch = datasample(data,batchsize); 179 | y = minibatch(:,1); 180 | X = minibatch(:,2:end); 181 | rqmc = normrnd_qmc(S,d_theta+1); 182 | for s=1:S 183 | U_normal = rqmc(s,:)'; 184 | epsilon1=U_normal(1); 185 | epsilon2=U_normal(2:end); 186 | theta=mu+b*epsilon1+c.*epsilon2; 187 | 188 | W_seq = cell(1,L); 189 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 190 | W_seq{1} = W1; 191 | for j = 2:L 192 | index = index_track(j-1)+1:index_track(j); 193 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 194 | W_seq{j} = Wj; 195 | end 196 | beta = theta(d_w+1:d_w+d_beta); 197 | 198 | W1_tilde = W1(:,2:end); % weights without biases 199 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 200 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 201 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 202 | grad_h = grad_prior_w_beta+grad_llh; 203 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 204 | grad_theta = grad_h-grad_log_q; 205 | 206 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 207 | end 208 | grad_lb = 
(mean(grad_g_lik_store))'; 209 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 210 | 211 | grad_norm = norm(gradient_lambda); 212 | norm_gradient_threshold = 100; 213 | if norm(gradient_lambda)>norm_gradient_threshold 214 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 215 | end 216 | 217 | gradient_bar_old = gradient_bar; 218 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 219 | 220 | if iter>tau 221 | stepsize=lrate*tau/iter; 222 | else 223 | stepsize=lrate; 224 | end 225 | 226 | lambda=lambda+stepsize*gradient_bar; 227 | 228 | mu=lambda(1:d_theta,1); 229 | b=lambda(d_theta+1:2*d_theta,1); 230 | c=lambda(2*d_theta+1:end); 231 | 232 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 233 | W_seq{1} = W1; 234 | for j = 2:L 235 | index = index_track(j-1)+1:index_track(j); 236 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 237 | W_seq{j} = Wj; 238 | end 239 | beta = mu(d_w+1:d_theta); 240 | 241 | % update tau and shrinkage parameters 242 | if mod(iter,10) == 0 243 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 244 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 245 | if isotropic 246 | for j = 1:p 247 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 248 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 249 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 250 | lambda_tau(j) = shrinkage_gamma(j)^2; 251 | end 252 | else 253 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 254 | for j = 1:p 255 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 256 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 257 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 258 | lambda_tau(j) = shrinkage_gamma(j)^2; 259 | end 260 | end 261 | mean_inverse_tau = mu_tau; 262 | mean_tau = 1./mu_tau+1./lambda_tau; 263 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 264 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 265 | end 266 | 267 | 268 | [Loss_current,~] = deepGLMpredict(Xval,yval,W_seq,beta,distr); % compute prediction loss 269 | Loss_DL(iter) = Loss_current; 270 | 271 | if Loss_DL(iter)>=Loss_DL(iter-1) 272 | gradient_bar = gradient_bar_old; 273 | end 274 | 275 | if Loss_DL(iter)<=min(Loss_DL) 276 | lambda_best = lambda; 277 | idxPatience = 0; 278 | else 279 | idxPatience = idxPatience+1; 280 | end 281 | 282 | if (idxPatience>patience)||(idxEpoch>epoch) 283 | stop = true; 284 | end 285 | end 286 | disp('---------- Training Completed! 
----------') 287 | disp(['Number of iteration:',num2str(iter)]); 288 | disp(['PPS best: ',num2str(min(Loss_DL))]); 289 | 290 | lambda = lambda_best; 291 | mu=lambda(1:d_theta,1); 292 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 293 | W_seq{1} = W1; 294 | for j = 2:L 295 | index = index_track(j-1)+1:index_track(j); 296 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 297 | W_seq{j} = Wj; 298 | end 299 | beta = mu(d_w+1:d_w+d_beta); 300 | 301 | % [Loss_PPS,TP_MCR,MCR] = prediction_loss(y_validation,X_validation,W_seq,beta); % compute prediction loss 302 | % Store output in a struct 303 | out.weights = W_seq; 304 | out.beta = beta; 305 | out.shrinkage = shrinkage_gamma_seq; 306 | out.pps = Loss_DL; 307 | out.iteration = iter; 308 | end 309 | 310 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMpredict.m: -------------------------------------------------------------------------------- 1 | function out = deepGLMpredict(mdl,X,varargin) 2 | %DEEPGLMPREDICT Make prediction from a trained deepGLM model 3 | % 4 | % OUT = DEEPGLMPREDICT(MDL,XTEST) predict responses for new data XTEST using 5 | % trained deepGLM structure MDL (output from DEEPGLMFIT) 6 | % 7 | % OUT = DEEPGLMPREDICT(MDL,XTEST,NAME,VALUE) predicts responses with additional 8 | % options specified by one or more of the following name/value pairs: 9 | % 10 | % 'ytest' Specify column of test responses. If this option 11 | % is specified with true response column of new 12 | % observations, deepGLMpredict will return prediction 13 | % scores (PPS, MSE or Classification Rate) using true 14 | % responses column vector ytest 15 | % 'Interval' Return prediction interval estimation for observations 16 | % in test data Xtest. By default, this predictive 17 | % interval capability is disable ('Interval' is 0). 18 | % Must be an positive number. 19 | % 'Nsample' Number of samples generated from posterior distribution 20 | % of model parameters used to make prediction interval 21 | % estimation for test data. Must be a positive integer 22 | % Example: 23 | % Fit a deepGLM model for Direcmarketing data set. All of the 24 | % exampled data are located inside /Data folder of installed package. 25 | % In order to use the sample dataset, user must add this Data folder 26 | % to Matlab path or explicitly direct to Data folder in 'load' 27 | % function 28 | % 29 | % load('DirectMarketing.mat') 30 | % mdl = deepGLMfit(X,y,... % Training data 31 | % 'Network',[5,5],... % Use 2 hidden layers 32 | % 'Lrate',0.01,... % Specify learning rate 33 | % 'Verbose',10,... % Display training result each 10 iteration 34 | % 'BatchSize',size(X,1),... % Use entire training data as mini-batch 35 | % 'MaxEpoch',10000,... % Maximum number of epoch 36 | % 'Patience',50,... % Higher patience values could lead to overfitting 37 | % 'Seed',100); % Set random seed to 100 38 | % 39 | % Pred = deepGLMpredict(mdl,X_test,... 
40 | % 'ytest',y_test); 41 | % disp(['PPS on test data: ',num2str(Pred.pps)]) 42 | % disp(['MSE on test data: ',num2str(Pred.mse)]) 43 | % 44 | % For more examples, check EXAMPLES folder 45 | % 46 | % See also DEEPGLMFIT, DEEPGLMPLOT 47 | % 48 | % Copyright 2018: 49 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 50 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 51 | % 52 | % https://github.com/VBayesLab/deepGLM 53 | % 54 | % Version: 1.0 55 | % LAST UPDATE: May, 2018 56 | 57 | % Check errors input arguments 58 | if nargin < 2 59 | error(deepGLMmsg('deepglm:TooFewInputs')); 60 | end 61 | 62 | % Load deepGLM params from struct 63 | W_seq = mdl.out.weights; 64 | beta = mdl.out.beta; 65 | distr = mdl.dist; 66 | 67 | % Parse additional options 68 | paramNames = {'ytest' 'Interval' 'Nsample' 'Intercept'}; 69 | paramDflts = {[] 0 1000 true}; 70 | [y,alpha,Nsample,intercept] = internal.stats.parseArgs(paramNames, paramDflts, varargin{:}); 71 | 72 | % If y test is specified, check input 73 | if(~isempty(y)) 74 | if size(y,1) ~= size(X,1) 75 | error(deepGLMmsg('deepglm:InputSizeMismatchX')); 76 | end 77 | if size(y,2) ~= 1 78 | error(deepGLMmsg('deepglm:InputSizeMismatchY')); 79 | end 80 | end 81 | 82 | % Add column of 1 to X if intercept is true 83 | if(intercept) 84 | X = [ones(size(X,1),1),X]; 85 | end 86 | 87 | % Store Nsample to deepGLMfit 88 | mdl.Nsample = Nsample; 89 | 90 | % Calculate neuron network output 91 | nnet_output = nnFeedForward(X,W_seq,beta); 92 | 93 | switch distr 94 | case 'normal' 95 | out.yhat = nnet_output; % Prediction for continuous response 96 | % If ytest if provided, then calculate pps and mse 97 | if(~isempty(y)) 98 | sigma2 = mdl.out.sigma2Mean; 99 | mse = mean((y-nnet_output).^2); 100 | pps = 1/2*log(sigma2) + 1/2/sigma2*mse; 101 | out.mse = mse; 102 | out.pps = pps; 103 | end 104 | % Calculate confidence interval if required 105 | if(alpha~=0) 106 | interval = predictionInterval(mdl,X,alpha); 107 | out.interval = interval.interval; 108 | out.yhatMatrix = interval.yhatMC; 109 | end 110 | 111 | case 'binomial' 112 | out.yNN = nnet_output; 113 | out.yProb = exp(nnet_output)./(1+exp(nnet_output)); 114 | y_pred = double(nnet_output>0); % Prediction for binary response 115 | out.yhat = y_pred; 116 | % If ytest if provided, then calculate pps and mse 117 | if(~isempty(y)) 118 | pps = mean(-y.*nnet_output+log(1+exp(nnet_output))); 119 | cr = mean(y==y_pred); % Miss-classification rate 120 | out.pps = pps; 121 | out.accuracy = cr; 122 | end 123 | 124 | case 'poisson' 125 | out.yNN = nnet_output; 126 | y_pred = exp(nnet_output); % Prediction for poisson response 127 | out.yhat = y_pred; 128 | if(~isempty(y)) 129 | pps = mean(-y.*nnet_output+exp(nnet_output)); 130 | mse = mean((y-y_pred).^2); 131 | out.mse = mse; 132 | out.pps = pps; 133 | end 134 | end 135 | end 136 | 137 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMpredictLoss.m: -------------------------------------------------------------------------------- 1 | function [out1,out2] = deepGLMpredictLoss(X,y,W_seq,beta,distr,sigma2) 2 | %DEEPGLMPREDICTION Make prediction from estimated deepGLM model 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Calculate neuron network output 13 | nnet_output = nnFeedForward(X,W_seq,beta); 14 | 15 | switch distr 16 | case 'normal' 17 | mse = 
mean((y-nnet_output).^2); 18 | pps = 1/2*log(sigma2) + 1/2/sigma2*mse; 19 | out2 = mse; 20 | case 'binomial' 21 | pps = mean(-y.*nnet_output+log(1+exp(nnet_output))); 22 | y_pred = nnet_output>0; 23 | mcr = mean(abs(y-y_pred)); % Miss-classification rate 24 | out2 = 1 - mcr; % Report output in classification rate 25 | case 'poisson' 26 | pps = mean(-y.*nnet_output+exp(nnet_output)); 27 | mse = mean((y-exp(nnet_output)).^2); 28 | out2 = mse; 29 | end 30 | out1 = pps; 31 | 32 | end 33 | 34 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/checkInput.m: -------------------------------------------------------------------------------- 1 | function checkInput(est) 2 | %CHECKDATA Check if user input correct model settings 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au). 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | dist = est.dist; 12 | network = est.network; 13 | lrate = est.lrate; 14 | momentum = est.momentum; 15 | batchsize = est.batchsize; 16 | epoch = est.epoch; 17 | patience = est.patience; 18 | tau = est.tau; 19 | S = est.S; 20 | windowSize = est.windowSize; 21 | icept = est.icept; 22 | verbose = est.verbose; 23 | monitor = est.monitor; 24 | isotropic = est.isIsotropic; 25 | seed = est.seed; 26 | 27 | if(~strcmpi(dist,'normal') && ~strcmpi(dist,'binomial') && ~strcmpi(dist,'poisson')) 28 | error(['Distribution must be one of the followings: ','normal,','binomial,','poisson']); 29 | end 30 | 31 | if(sum(network==0)>0) 32 | error('Network must be an array of positive integers') 33 | end 34 | 35 | if(sum(network==0)>0) 36 | error('Network must be an array of positive integers') 37 | end 38 | 39 | if(~isnumeric(lrate) || lrate<=0) 40 | error('Learning rate must be a positive numerical value') 41 | end 42 | 43 | if(~isnumeric(momentum) || momentum<0 || momentum > 1) 44 | error('Momentum must be a numerical value from 0 to 1') 45 | end 46 | 47 | if(~isnumeric(batchsize) || floor(batchsize)~= batchsize || batchsize <= 0) 48 | error('Batch size must be an positive integer smaller than number of observations in training data'); 49 | end 50 | 51 | if(~isnumeric(epoch) || floor(epoch)~= epoch || epoch <= 0) 52 | error('Number of epoches must be a positive integer'); 53 | end 54 | 55 | if(~isnumeric(patience) || floor(patience)~= patience || patience <= 0) 56 | error('Patience must be a positive integer'); 57 | end 58 | 59 | if(~isnumeric(tau) || floor(tau)~= tau || tau <= 0) 60 | error('LrateFactor must be a positive integer'); 61 | end 62 | 63 | if(~isnumeric(S) || floor(S)~= S || S <= 0) 64 | error('S must be a positive integer'); 65 | end 66 | 67 | if(~isnumeric(windowSize) || floor(windowSize)~= windowSize || windowSize <= 0) 68 | error('WindowSize must be a positive integer'); 69 | end 70 | 71 | if(~islogical(icept)) 72 | error('Intercept option must be a logical value'); 73 | end 74 | 75 | if(~isnumeric(verbose) || floor(verbose)~= verbose || verbose <= 0) 76 | error('Verbose must be a positive integer'); 77 | end 78 | 79 | if(~islogical(monitor)) 80 | error('Monitor option must be a logical value'); 81 | end 82 | 83 | if(~islogical(isotropic)) 84 | error('Isotropic option must be a logical value'); 85 | end 86 | 87 | if (~isnan(seed)) 88 | if(~isnumeric(seed) || floor(seed)~= seed || seed <= 0) 89 | error('Seed must be a nonnegative integer less than 2^32'); 90 | end 91 | end 92 | end 93 | 94 | 
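% ---- Editor's note (not part of the original source): a minimal usage sketch for the
% validation above, to be run from a separate script. It assumes deepGLMout.m (listed
% further below), which returns the default settings struct, is on the MATLAB path.
% Note that checkInput reads est.icept, which deepGLMout stores under est.data.icept,
% so it is set explicitly here; field names and admissible values follow the checks in
% this file.
est = deepGLMout();          % default settings (dist, network, lrate, momentum, ...)
est.icept = true;            % intercept flag expected by checkInput
est.dist = 'binomial';       % must be 'normal', 'binomial' or 'poisson'
est.network = [5,5];         % two hidden layers, 5 units each
est.batchsize = 1000;        % positive integer
checkInput(est);             % throws a descriptive error if any setting is invalid
% ----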
-------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/deepGLMmsg.m: -------------------------------------------------------------------------------- 1 | function msg_out = deepGLMmsg(identifier) 2 | %DEEPGLMMSG Define custom error/warning messages for exceptions 3 | % MSG_OUT = DEEPGLMMSG(IDENTIFIER) extracts the message for the input identifier 4 | % 5 | % 6 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 7 | % Nguyen (nghia.nguyen@sydney.edu.au) 8 | % 9 | % http://www.xxx.com 10 | % 11 | % Version: 1.0 12 | % LAST UPDATE: April, 2018 13 | 14 | switch identifier 15 | case 'deepglm:TooFewInputs' 16 | msg_out = 'At least two arguments must be specified'; 17 | case 'deepglm:InputSizeMismatchX' 18 | msg_out = 'X and Y must have the same number of observations'; 19 | case 'deepglm:InputSizeMismatchY' 20 | msg_out = 'Y must be a single column vector'; 21 | case 'deepglm:ArgumentMustBePair' 22 | msg_out = 'Optional arguments must be pairs'; 23 | case 'deepglm:ResponseMustBeBinary' 24 | msg_out = 'Two level categorical variable required'; 25 | case 'deepglm:DistributionMustBeBinomial' 26 | msg_out = 'Binomial distribution option required'; 27 | case 'deepglm:MustSpecifyActivationFunction' 28 | msg_out = 'Activation function type required'; 29 | end 30 | end 31 | 32 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/deepGLMout.m: -------------------------------------------------------------------------------- 1 | function out = deepGLMout() 2 | %DEEPGLMOUT Generate default output structure for deepGLM training results 3 | 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Training method 13 | out.dist = 'normal'; % Default distribution of dependent variable.
14 | out.initialize = 'adaptive'; % Default initialize method 15 | out.isIsotropic = false; % Default structure of variational Covariance matrix 16 | out.ncore = 0; % Default parallel computing option 17 | 18 | % Optional settings 19 | out.seed = NaN; % No random seed by default 20 | out.nval = 0.2; % Default proportion of training data for validation 21 | out.verbose = 10; % Default number of iteration to display training results 22 | out.cutoff = 0.5; % Default Cutoff probability for sigmoid function 23 | out.stop = false; % Execution Flag 24 | out.quasiMC = true; % Using Quasi MC for random number generator 25 | out.monitor = false; % Display training progress window 26 | out.muTau = NaN; 27 | out.lowerbound = true; 28 | out.windowSize = 100; 29 | 30 | % Model hyper-parameters 31 | out.network = [10,10]; % Default network structure 32 | out.lrate = 0.01; % Default Learning rate 33 | out.S = 10; % Default Number of samples used to approximate gradient of likelihood 34 | out.batchsize = 5000; % Default Proportion of batch size over entire train set 35 | out.epoch = 1000; % Default Number of epoches in train phase 36 | out.tau = 10000; % Default Scale factor of learning rate 37 | out.patience = 100; % Default Number of consequence non-decreasing iterations (for early stopping checking) 38 | out.c = 0.01; % Default initial value of isotropic factor c 39 | out.bvar = 0.01; % Default initial variance of each element of b 40 | out.momentum = 0.6; % Default momentum weight 41 | 42 | % Variable names 43 | out.name.ynames = NaN; % y variables names 44 | out.name.xnames = NaN; % X variable names 45 | 46 | % Data properties 47 | out.data.y = NaN; % Dependent variable of training data 48 | out.data.X = NaN; % Independent variables of training data 49 | out.data.ytest = NaN; % Dependent variable of test data 50 | out.data.Xtest = NaN; % Independent variables of tets data 51 | out.data.nTrain = NaN; % Number of observation in training set 52 | out.data.nTest = NaN; % Number of observation in test set 53 | out.data.Xval = []; 54 | out.data.yval = []; 55 | out.data.icept = true; % Intercept option 56 | 57 | % Training results 58 | % out.out.mse = NaN; 59 | % out.out.accuracy = NaN; 60 | 61 | end 62 | 63 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/isBinomial.m: -------------------------------------------------------------------------------- 1 | function out = isBinomial(array) 2 | %ISBINOMIAL Check if an array are binary vector 3 | 4 | % Copyright 2018 5 | % http://www.xxx.com 6 | % 7 | % Version: 1.0 8 | % LAST UPDATE: April, 2018 9 | 10 | out = false; 11 | uniqueVal = unique(array); % Extract unique values in array 12 | if (length(uniqueVal)==2) && (uniqueVal(1)==0) && (uniqueVal(2)==1) 13 | out = true; 14 | end 15 | end 16 | 17 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/predictionInterval.m: -------------------------------------------------------------------------------- 1 | function predInterval = predictionInterval(mdl,X,zalpha) 2 | %CONFIDENTINTERVAL Interval estimation for test data using deepGLM 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Load deepGLM params from struct 13 | Nsample = mdl.Nsample; 14 | MU = mdl.out.vbMU; 15 | SIGMA = mdl.out.vbSIGMA; 16 | n_units = mdl.network; 17 | index_track = 
mdl.out.indexTrack; 18 | alpha_sigma2 = mdl.out.sigma2Alpha; 19 | beta_sigma2 = mdl.out.sigma2Beta; 20 | 21 | % Calculate network parameters 22 | L = length(n_units); % Number of hidden layers 23 | p = size(X,2)-1; % Number of covariates 24 | d_beta = n_units(L)+1; 25 | d_w = index_track(L); 26 | 27 | yhat = zeros(Nsample,size(X,1)); % Predicted values of test data 28 | nnOut = zeros(Nsample,size(X,1)); % Output of NN 29 | for i=1:Nsample 30 | % Generate samples of theta from Normal distribution 31 | theta_i = mvnrnd(MU,SIGMA); 32 | % Generate samples of sigma from IG distribution 33 | sigma2_i = 1/gamrnd(alpha_sigma2,1./beta_sigma2); 34 | 35 | % For each generated theta, restore neuron net structure 36 | W1 = reshape(theta_i(1:index_track(1)),n_units(1),p+1); 37 | W_seq{1} = W1; 38 | for j = 2:L 39 | index = index_track(j-1)+1:index_track(j); 40 | Wj = reshape(theta_i(index),n_units(j),n_units(j-1)+1); 41 | W_seq{j} = Wj; 42 | end 43 | beta = theta_i(d_w+1:d_w+d_beta)'; 44 | 45 | % Calculate neuron network output 46 | nnOut(i,:) = nnFeedForward(X,W_seq,beta); 47 | 48 | % Calculate p(y|theta_i,sigma_i,X) 49 | yhat(i,:) = normrnd(nnOut(i,:),sqrt(sigma2_i)); 50 | 51 | end 52 | 53 | % 95% confidence interval 54 | yhatLCL = mean(yhat) - zalpha*std(yhat); 55 | yhatUCL = mean(yhat) + zalpha*std(yhat); 56 | yhatInterval = [yhatLCL',yhatUCL']; 57 | predInterval.yhatMC = yhat; 58 | predInterval.interval = yhatInterval; 59 | end 60 | 61 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/splitData.m: -------------------------------------------------------------------------------- 1 | function [Xtr,ytr,Xval,yval] = splitData(X,y,ratio,kfold) 2 | %SPLITDATA Split training data for crossvalidation 3 | 4 | n = size(X,1); 5 | 6 | end 7 | 8 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/sumResidualSquared.m: -------------------------------------------------------------------------------- 1 | function S = sumResidualSquared(y,X,W_seq,beta) 2 | % compute the sum_residual_squared for normal-NN model 3 | 4 | nnet_output = nnFeedForward(X,W_seq,beta); 5 | S = sum((y-nnet_output).^2); 6 | end 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbGradientLogLB.m: -------------------------------------------------------------------------------- 1 | %% Script to calculate natural gradient of lowerbound 2 | %rqmc = normrnd_qmc(S,d_theta+1); % Using quasi MC random numbers 3 | rqmc = normrnd(0,1,S,d_theta+1); 4 | % rng(iter) 5 | % rqmc = rand(S,d_theta+1); 6 | for s=1:S 7 | U_normal = rqmc(s,:)'; 8 | epsilon1=U_normal(1); 9 | epsilon2=U_normal(2:end); 10 | theta=mu+epsilon1*b+c.*epsilon2; 11 | 12 | W_seq = cell(1,L); 13 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 14 | W_seq{1} = W1; 15 | W1_tilde = W1(:,2:end); % weights without biases 16 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 17 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:)]; 18 | for j = 2:L 19 | index = index_track(j-1)+1:index_track(j); 20 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 21 | W_seq{j} = Wj; 22 | Wj_tilde = Wj(:,2:end); 23 | grad_prior_Wj = [zeros(n_units(j),1);-shrinkage_l2*Wj_tilde(:)]; 24 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_Wj]; 25 | end 26 | beta = theta(d_w+1:d_theta); 27 | beta_tilde = beta(2:end); % vector beta without intercept 28 | grad_prior_beta = [0;-shrinkage_l2*beta_tilde]; 29 | 
grad_prior_w_beta = [grad_prior_w_beta;grad_prior_beta]; 30 | 31 | if(strcmp(distr,'normal')) 32 | [grad_llh,yNN] = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse); 33 | else 34 | [grad_llh,yNN] = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 35 | end 36 | 37 | grad_h = grad_prior_w_beta+grad_llh; % Gradient of log prior plus log-likelihood 38 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 39 | grad_theta = grad_h-grad_log_q; 40 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 41 | 42 | % ------------------ lower bound --------------------------------------- 43 | if(lbFlag) 44 | if(strcmp(distr,'normal')) 45 | lb_iter(s) = constMean... 46 | -0.5*mean_sigma2_inverse*sum((y-yNN).^2)*datasize/batchsize... 47 | +const; 48 | elseif(strcmp(distr,'binomial')) 49 | lb_iter(s) = constMean... 50 | +sum(y.*yNN - log(1+exp(yNN)))*datasize/batchsize... 51 | +const; 52 | else 53 | lb_iter(s) = constMean... 54 | +sum(y.*yNN - exp(yNN))*datasize/batchsize... 55 | +const; 56 | end 57 | end 58 | % ---------------------------------------------------------------------- 59 | end 60 | grad_lb = (mean(grad_g_lik_store))'; 61 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbGradientLogq.m: -------------------------------------------------------------------------------- 1 | function grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic) 2 | %VBGRADIENTLOGQ Summary of this function goes here 3 | % Detailed explanation goes here 4 | x = theta-mu; 5 | if isotropic 6 | grad_log_q = -x./c^2+(1/c^2)*((b'*x)/(c^2+(b'*b)))*b; 7 | else 8 | d = b./c.^2; 9 | grad_log_q = -x./c.^2+(d'*x)/(1+(d'*b))*d; 10 | end 11 | end 12 | 13 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbLowerBound.m: -------------------------------------------------------------------------------- 1 | %% Group Lasso + L2 prior on remaining weigths 2 | if(strcmp(distr,'normal')) 3 | mean_log_sig2 = log(beta_sigma2)-psi(alpha_sigma2); 4 | logdet = log(det(1 + (b./(c.^2))'*b)) + sum(log((c.^2))); 5 | constMean = -(alpha0_sigma2+1)*mean_log_sig2 - beta0_sigma2*mean_sigma2_inverse... 6 | +0.5*sum(2*(n_units(1)+1)*log(shrinkage_gamma)- (shrinkage_gamma.^2).*mean_tau)... 7 | -0.5*datasize*mean_log_sig2+gammaln(alpha_sigma2)... 8 | -alpha_sigma2*log(beta_sigma2)+(alpha_sigma2+1)*mean_log_sig2... 9 | +alpha_sigma2-0.5*(sum(log(lambda_tau))-p)+0.5*logdet... 10 | +0.5*d_w_tilde*log(shrinkage_l2)-0.5*shrinkage_l2*mean_w_tilde... 11 | -0.5*sum(mean_column_j_tilde'.*mean_inverse_tau); 12 | else 13 | logdet = log(det(1 + (b./(c.^2))'*b)) + sum(log((c.^2))); 14 | constMean = 0.5*sum(2*(n_units(1)+1)*log(shrinkage_gamma)-(shrinkage_gamma.^2).*mean_tau)... 15 | -0.5*(sum(log(lambda_tau))-p)+0.5*logdet+0.5*d_w_tilde*log(shrinkage_l2)... 16 | -0.5*shrinkage_l2*mean_w_tilde-0.5*sum(mean_column_j_tilde'.*mean_inverse_tau); 17 | end 18 | 19 | % lb(iter) = (a1+constMean+const)/datasize; 20 | 21 | % a3 = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)... 22 | % -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 23 | % -p*gammaln((n_units(1)+1)/2)-0.5*datasize*log(2*pi)... 24 | % +p/2*log(2*pi)+0.5*d_theta*log(2*pi)+d_theta/2; 25 | 26 | 27 | 28 | % lb(iter) = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)-... 29 | % (alpha0_sigma2+1)*mean_log_sig2 - beta0_sigma2*mean_sigma2_inverse... 
30 | % -0.5*p*n_units(1)*log(2*pi)-0.5*sum(mean_column_j_tilde'.*mean_inverse_tau)... 31 | % -0.5*d_w_tilde*log(2*pi)+0.5*d_w_tilde*log(shrinkage_l2)-... 32 | % 0.5*shrinkage_l2*mean_w_tilde-p*gammaln((n_units(1)+1)/2)+... 33 | % 0.5*sum(2*(n_units(1)+1)*log(shrinkage_gamma) - (shrinkage_gamma.^2).*mean_tau)... 34 | % -0.5*datasize*log(2*pi)-0.5*datasize*mean_log_sig2-... 35 | % 0.5*mean_sigma2_inverse*sum_squared+gammaln(alpha_sigma2)-... 36 | % alpha_sigma2*log(beta_sigma2)+(alpha_sigma2+1)*mean_log_sig2 + alpha_sigma2... 37 | % +p/2*log(2*pi)-0.5*(sum(log(lambda_tau))-p)+0.5*d_theta*log(2*pi)+0.5*logdet+... 38 | % d_theta/2 -0.5*((grad_llh'*b)^2+ sum((c.^2).*(grad_llh.^2))) 39 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbNaturalGradient.m: -------------------------------------------------------------------------------- 1 | function prod = vbNaturalGradient(b,c,grad,isotropic) 2 | %VBNATURALGRADIENT compute the product inverse_fisher times grad for two 3 | % cases: isotropic factor decompostion or rank-1 decomposition 4 | % INPUT: 5 | % grad: the traditional gradient 6 | % b,c: parameters in the factor decomposition 7 | % isotropic: true if isotropic structure is used, rand-1 otherwise 8 | % 9 | % OUTPUT: natural gradient 10 | % 11 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 12 | % Nguyen (nghia.nguyen@sydney.edu.au). 13 | % 14 | % http://www.xxx.com 15 | % 16 | % Version: 1.0 17 | % LAST UPDATE: April, 2018 18 | 19 | if isotropic 20 | d = length(b); 21 | bb = b'*b; 22 | alpha = 1/(c^2+bb); 23 | omega = (2/c^2)*(d-1+c^4*alpha^2); 24 | kappa = (1+c^2/bb-.5*(1+c^2/bb)^2)*2*c*bb*alpha^2+2*c^3*alpha/bb; 25 | c2 = omega-2*c*alpha^2*kappa*bb; 26 | 27 | grad1 = grad(1:d); 28 | grad2 = grad(d+1:2*d); 29 | grad3 = grad(end); 30 | 31 | b_grad2 = b'*grad2; 32 | const1 = (1+c^2/bb-.5*(1+c^2/bb)^2); 33 | const2 = c^2*(1+c^2/bb); 34 | Ainv_times_grad2 = (const1*b_grad2)*b+const2*grad2; 35 | 36 | prod = [(b'*grad1)*b+c^2*grad1;Ainv_times_grad2+(kappa^2/c2*b_grad2)*b-(kappa/c2*grad3)*b;-kappa/c2*b_grad2+grad3/c2]; 37 | else 38 | % Close-form method 39 | % d = length(b); 40 | % grad1 = grad(1:d); 41 | % grad2 = grad(d+1:2*d); 42 | % grad3 = grad(2*d+1:end); 43 | % 44 | % c2 = c.^2; 45 | % b2 = b.^2; 46 | % 47 | % prod1 = (b'*grad1)*b+(grad1.*c2); 48 | % 49 | % alpha = 1/(1+sum(b2./c2)); 50 | % Cminus = diag(1./c2); 51 | % Cminus_b = b./c2; 52 | % Sigma_inv = Cminus-alpha*(Cminus_b*Cminus_b'); 53 | % 54 | % A11_inv = (1/(1-alpha))*((1-1/(sum(b2)+1-alpha))*(b*b')+diag(c2)); 55 | % 56 | % C = diag(c); 57 | % A12 = 2*(C*Sigma_inv*b*ones(1,d)).*Sigma_inv; 58 | % A21 = A12'; 59 | % A22 = 2*C*(Sigma_inv.*Sigma_inv)*C; 60 | % D = A22-A21*A11_inv*A12; 61 | % prod2 = A11_inv*grad2+(A11_inv*A12)*(D\A21)*(A11_inv*grad2)-(A11_inv*A12)*(D\grad3); 62 | % prod3 = -(D\A21)*(A11_inv*grad2)+D\grad3; 63 | % prod = [prod1;prod2;prod3]; 64 | 65 | % % Approximation method 66 | d = length(b); 67 | grad1 = grad(1:d); 68 | grad2 = grad(d+1:2*d); 69 | grad3 = grad(2*d+1:end); 70 | 71 | c2 = c.^2; 72 | b2 = b.^2; 73 | 74 | prod1 = (b'*grad1)*b+(grad1.*c2); 75 | 76 | const = sum(b2./c2); 77 | const1 = 1/2+1/2/const; 78 | prod2 = (b'*grad2)*b+(grad2.*c2); 79 | prod2 = const1*prod2; 80 | alpha = 1/(1+const); 81 | x = alpha*b2./(c.^3); 82 | y = 1./c2 - 2*alpha*(b./c2).^2; 83 | aux = x./y; 84 | prod3 = grad3./y-(1/(1+sum(x.^2./y)))*(aux'*grad3)*aux; 85 | prod3 = prod3/2; 86 | prod = [prod1;prod2;prod3]; 87 | end 88 | end 89 | 90 | 
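% Usage sketch (illustrative only; the dimensions and random inputs below are made up,
% assuming the toolbox folder is on the MATLAB path). In the rank-1 case the variational
% covariance is SIGMA = b*b' + diag(c.^2) and grad stacks the gradients with respect to
% (mu, b, c), so it has length 3*length(b):
%
%   d    = 5;                                  % illustrative dimension
%   b    = 0.01*rand(d,1);  c = 0.01*ones(d,1);
%   grad = randn(3*d,1);                       % stand-in for the lower-bound gradient
%   nat  = vbNaturalGradient(b,c,grad,false);  % natural gradient, rank-1 (non-isotropic) case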
-------------------------------------------------------------------------------- /Matlab/Document/deepGLM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/deepGLM.pdf -------------------------------------------------------------------------------- /Matlab/Document/deepGLMNormalExample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/deepGLMNormalExample.pdf -------------------------------------------------------------------------------- /Matlab/Document/~WRL1562.tmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/~WRL1562.tmp -------------------------------------------------------------------------------- /Matlab/Document/~WRL3227.tmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/~WRL3227.tmp -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMBinomialExample.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Examples/deepGLMBinomialExample.mlx -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMBinomialExampleScript.m: -------------------------------------------------------------------------------- 1 | % Examples demonstate how to use deepGLM function to fit data with binomial 2 | % dependent variable 3 | % 4 | % Copyright 2018 5 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 6 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 7 | % 8 | % https://github.com/VBayesLab/deepGLM 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: May, 2018 12 | 13 | clear 14 | clc 15 | 16 | % load data 17 | % load('../Data/dataSimulationBinary.mat') 18 | load('../Data/DataSimulationBinary.mat') 19 | 20 | %% Fit deepGLM model using default setting 21 | nn = [10]; 22 | mdl = deepGLMfit(X,y,... 23 | 'Distribution','binomial',... 24 | 'Network',nn,... 25 | 'Lrate',0.01,... 26 | 'Verbose',1,... % Display training result each iteration 27 | 'BatchSize',size(X,1),... % Use entire training data as mini-batch 28 | 'MaxEpoch',10000,... 29 | 'Patience',50,... % Higher patience values could lead to overfitting 30 | 'Seed',100); 31 | %% Plot training output 32 | % Plot lowerbound 33 | figure 34 | plot(mdl.out.lbBar,'LineWidth',2) 35 | title('Lowerbound of Variational Approximation','FontSize',20) 36 | xlabel('Iterations','FontSize',14,'FontWeight','bold') 37 | ylabel('Lowerbound','FontSize',14,'FontWeight','bold') 38 | grid on 39 | 40 | % Plot shrinkage coefficients 41 | figure 42 | deepGLMplot('Shrinkage',mdl.out.shrinkage,... 43 | 'Title','Shrinkage Coefficients',... 44 | 'Xlabel','Iterations',... 
45 | 'LineWidth',2); 46 | 47 | %% Prediction on test data 48 | % Make prediction (point estimation) on a test set 49 | Pred1 = deepGLMpredict(mdl,X_test); 50 | 51 | % If ytest is specified (for model evaluation purpose) 52 | % then we can check PPS and MSE on test set 53 | Pred2 = deepGLMpredict(mdl,X_test,'ytest',y_test); 54 | disp(['PPS on test data: ',num2str(Pred2.pps)]) 55 | disp(['Classification rate on test data: ',num2str(Pred2.accuracy)]) 56 | 57 | % Plot ROC curve 58 | figure 59 | deepGLMplot('ROC',Pred2.yProb,... 60 | 'ytest',y_test,... 61 | 'Title','ROC',... 62 | 'Xlabel','False Positive Rate',... 63 | 'Ylabel','True Positive Rate') 64 | 65 | %% Compare to linear model 66 | figure 67 | mdlLR = fitglm(X,y,'Distribution','binomial','Link','logit'); 68 | yProb = predict(mdlLR,X_test); 69 | deepGLMplot('ROC',[Pred2.yProb,yProb],... 70 | 'ytest',y_test,... 71 | 'Title','ROC',... 72 | 'Xlabel','False Positive Rate',... 73 | 'Ylabel','True Positive Rate',... 74 | 'legend',{'deepGLM','Logistic Regression'}) 75 | 76 | -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMNormalExample.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Examples/deepGLMNormalExample.mlx -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMNormalExampleScript.m: -------------------------------------------------------------------------------- 1 | % Examples demonstate how to use deepGLM function to fit data with continuos 2 | % dependent variable 3 | % 4 | % Copyright 2018 5 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 6 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 7 | % 8 | % https://github.com/VBayesLab/deepGLM 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: May, 2018 12 | 13 | clear 14 | clc 15 | 16 | % load data 17 | % load('../Data/dataSimulationContinuous.mat') 18 | % load('../Data/dataSimulationContinuousEasy.mat') 19 | % load('../Data/DirectMarketing.mat') 20 | % load('../Data/SchoolingDataBART.mat') 21 | % load('../Data/SchoolingDataDeepGLM.mat') 22 | % load('../Data/OnlineBART.mat') 23 | % load('../Data/HILDABart.mat') 24 | load('../Data/abalone.mat') 25 | 26 | %% Fit deepGLM model using default setting 27 | % By default, if 'distribution' option is not specified then deepGLMfit 28 | % will assign the response variables as 'normal' 29 | nn = [10,10]; 30 | mdl = deepGLMfit(X,y,... 31 | 'Network',nn,... 32 | 'Lrate',0.008,... 33 | 'Verbose',1,... % Display training result each iteration 34 | 'BatchSize',1000,... % Use entire training data as mini-batch 35 | 'MaxEpoch',10000,... 36 | 'Patience',100,... % Higher patience values could lead to overfitting 37 | 'Seed',NaN,... 38 | 'WindowSize',100); 39 | 40 | %% Plot training output 41 | figure 42 | plot(mdl.out.lbBar,'LineWidth',2) 43 | title('Lowerbound of Variational Approximation','FontSize',0.5) 44 | xlabel('Iterations','FontSize',0.2,'FontWeight','bold') 45 | ylabel('Lowerbound','FontSize',0.2,'FontWeight','bold') 46 | grid on 47 | 48 | %% Plot shrinkage coefficients 49 | figure 50 | deepGLMplot('Shrinkage',mdl.out.shrinkage,... 51 | 'Title','Shrinkage Coefficients',... 52 | 'Xlabel','Iterations',... 
53 | 'LineWidth',2); 54 | 55 | %% Prediction on test data 56 | % Make prediction (point estimation) on a test set 57 | disp('---------- Prediction ----------') 58 | Pred1 = deepGLMpredict(mdl,X_test); 59 | 60 | % If ytest is specified (for model evaluation purpose) 61 | % then we can check PPS and MSE on test set 62 | Pred2 = deepGLMpredict(mdl,X_test,'ytest',y_test); 63 | disp(['PPS on test set using deepGLM is: ',num2str(Pred2.pps)]) 64 | disp(['MSE on test set using deepGLM is: ',num2str(Pred2.mse)]) 65 | 66 | % You can also perform point and interval estimation for a single test observation 67 | idx = randi(length(y_test)); % Pick a random test data observation 68 | dataTest = X_test(idx,:); 69 | Pred3 = deepGLMpredict(mdl,dataTest,... 70 | 'Interval',1,... 71 | 'Nsample',1000); 72 | disp(['Prediction Interval: [',num2str(Pred3.interval(1)),... 73 | ';',num2str(Pred3.interval(2)),']',]); 74 | disp(['True value: ',num2str(y_test(idx))]); 75 | 76 | 77 | % Estimate prediction interval for entire test data 78 | Pred4 = deepGLMpredict(mdl,X_test,... 79 | 'ytest',y_test,... 80 | 'Interval',1,... 81 | 'Nsample',1000); 82 | y_pred = mean(Pred4.yhatMatrix)'; 83 | mse2 = mean((y_test-y_pred).^2); 84 | accuracy = (y_testPred4.interval(:,1)); 85 | disp(['Prediction Interval accuracy: ',num2str(sum(accuracy)/length(accuracy))]); 86 | 87 | %% Plot prediction interval 88 | figure 89 | deepGLMplot('Interval',Pred4,... 90 | 'Title','Prediction Interval of Schooling Test Data',... 91 | 'Xlabel','Observations',... 92 | 'Ylabel','Wage($1000)',... 93 | 'Nsample',60); 94 | 95 | %% Plot prediction interval with true response 96 | figure 97 | deepGLMplot('Interval',Pred4,... 98 | 'ytest',y_test,... 99 | 'Title','Prediction Interval for Test Data',... 100 | 'Xlabel','Observations',... 101 | 'Ylabel','Wage($1000)',... 102 | 'Nsample',40); 103 | 104 | -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMPoissonExampleScript.m: -------------------------------------------------------------------------------- 1 | % Examples demonstate how to use deepGLM function to fit data with Poisson 2 | % dependent variable 3 | % 4 | % Copyright 2018 5 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 6 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 7 | % 8 | % https://github.com/VBayesLab/deepGLM 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: May, 2018 12 | 13 | clear 14 | clc 15 | 16 | clear 17 | clc 18 | 19 | % load data 20 | % load('../Data/BikeSharingDeepGLM.mat') 21 | load('../Data/abalone.mat') 22 | 23 | 24 | %% Fit deepGLM model using default setting 25 | nn = [10,10]; 26 | mdl = deepGLMfit(X,y,... 27 | 'Distribution','poisson',... 28 | 'Network',nn,... 29 | 'Lrate',0.005,... 30 | 'BatchSize',size(X,1),... 31 | 'MaxEpoch',2000,... 32 | 'Patience',50,... 33 | 'Verbose',10,... 34 | 'Seed',1000); 35 | 36 | %% Plot training output 37 | % Plot lowerbound 38 | figure 39 | plot(mdl.out.lbBar,'LineWidth',2) 40 | title('Lowerbound of Variational Approximation','FontSize',20) 41 | xlabel('Iterations','FontSize',14,'FontWeight','bold') 42 | ylabel('Lowerbound','FontSize',14,'FontWeight','bold') 43 | grid on 44 | 45 | % Plot shrinkage coefficients 46 | figure 47 | deepGLMplot('Shrinkage',mdl.out.shrinkage,... 48 | 'Title','Shrinkage Coefficients',... 49 | 'Xlabel','Iterations',... 
50 | 'LineWidth',2); 51 | 52 | 53 | %% Prediction on test data 54 | % Make prediction (point estimation) on a test set 55 | Pred1 = deepGLMpredict(mdl,X_test); 56 | 57 | % If ytest is specified (for model evaluation purpose) 58 | % then we can check PPS and MSE on test set 59 | Pred2 = deepGLMpredict(mdl,X_test,'ytest',y_test); 60 | disp(['PPS on test data: ',num2str(Pred2.pps)]) 61 | disp(['Mean Square Error on test data: ',num2str(Pred2.mse)]) 62 | 63 | %% Compare with GLM Poisson 64 | mdlGLM = glmfit(X,y,'poisson'); 65 | X_test = [ones(size(X_test,1),1) X_test]; 66 | y_pred = exp(X_test*mdlGLM); 67 | ppsGLM = mean(-y_test'*X_test*mdlGLM + sum(y_pred)); 68 | mseGLM = mean((y_test-y_pred).^2); 69 | 70 | 71 | -------------------------------------------------------------------------------- /Python/DirectMarketing.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Python/DirectMarketing.mat -------------------------------------------------------------------------------- /Python/__pycache__/deepGLM.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Python/__pycache__/deepGLM.cpython-37.pyc -------------------------------------------------------------------------------- /Python/deepGLM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Python/deepGLM.pdf -------------------------------------------------------------------------------- /R/02_libs/checkInput.R: -------------------------------------------------------------------------------- 1 | # Function to check if user input valid settings 2 | checkInput <- function(X,y,est){ 3 | # Check if X is null 4 | if(is.null(X)){ 5 | stop("'X' is missing") 6 | } 7 | 8 | # Check if y is null 9 | if(is.null(y)){ 10 | stop("'y' is missing") 11 | } 12 | 13 | # Check if there are NaN values in X 14 | 15 | # Check if there are NaN values in y 16 | 17 | # Check if S is a positive integer 18 | 19 | # Check if BatchSize is a positive integer 20 | 21 | # Check if Lrate is a positive numerical number 22 | 23 | # Check if MaxEpoch is positive integer 24 | 25 | # Check if LRateFactor is positive integer 26 | 27 | # Check if Patience is positive integer 28 | 29 | # Check if Network is a vector of integer 30 | 31 | # Check if Distribution is one of {"normal","binomial","poisson"} 32 | 33 | # Check if Seed is an integer 34 | 35 | # Check if Intercept is a logical number 36 | 37 | # Check if Momentum is a number from 0 to 1 38 | 39 | # Check if Verbose is positive integer 40 | 41 | # Check if WindowSize is positive integer 42 | 43 | 44 | } 45 | -------------------------------------------------------------------------------- /R/02_libs/deepGLMTrain.R: -------------------------------------------------------------------------------- 1 | # Function to train deepGLM model 2 | deepGLMTrain <- function(X_train,y_train,est){ 3 | 4 | # Extract model parameters provided by users 5 | n_units <- est$network 6 | batchsize <- est$batchsize 7 | lrate <- est$lrate 8 | S <- est$S # Number of Monte Carlo samples to estimate the gradient 9 | tau <- est$tau # Threshold before reducing constant learning rate eps0 10 | grad_weight <- est$momentum # Weight in the momentum 11 | cScale <- 0.01 # Random 
scale factor to initialize b,c 12 | patience <- est$patience # Stop if test error not improved after patience_parameter iterations 13 | epoch <- est$epoch # Number of times learning algorithm scan entire training data 14 | verbose <- est$verbose 15 | distr <- est$dist 16 | LBwindow <- est$windowSize 17 | seed <- est$seed 18 | 19 | # Set random seed if specified 20 | if(!is.nan(seed)){ 21 | set.seed(seed) 22 | # set.generator("MersenneTwister", initialization="init2002", resolution=53, seed=seed) 23 | } 24 | 25 | # Data merge for mini-batch sampling 26 | data <- cbind(y_train,X_train) 27 | datasize <- nrow(X_train) 28 | num1Epoch <- round(datasize/batchsize) # Number of iterations per epoch 29 | 30 | # Network parameters 31 | L <- length(n_units) # Number of hidden layers 32 | p <- ncol(X_train)-1 # Number of covariates 33 | W_seq <- vector("list",length = L) # Cells to store weight matrices 34 | index_track <- numeric(L) # Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 35 | index_track[1] <- n_units[1]*(p+1) # Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 36 | W1_tilde_index <- c((n_units[1]+1):index_track[1]) # Index of W1 without biases, as the first column if W1 are biases 37 | w_tilde_index <- c() # indices of non-biase weights, excluding W1, for l2-regulization prior 38 | for (j in 2:L) { 39 | index_track[j] <- index_track[j-1]+n_units[j]*(n_units[j-1]+1) 40 | w_tilde_index <- c(w_tilde_index,(index_track[j-1]+n_units[j]+1):index_track[j]) 41 | } 42 | d_w <- index_track[L] # Total number of weights up to (and including) the last layer 43 | d_beta <- n_units[L]+1 # Dimension of the weights beta connecting the last layer to the output 44 | d_theta <- d_w+d_beta # Total number of parameters 45 | w_tilde_index <- c(w_tilde_index,((d_w+2):d_theta)) 46 | d_w_tilde <- length(w_tilde_index) 47 | 48 | # Initialise weights and set initial mu equal to initial weights 49 | layers <- c(ncol(X_train),n_units,1) # Full structure of NN -> [input,hidden,output] 50 | weights <- nnInitialize(layers) 51 | mu <- c() # Mean of variational distribution 52 | for (i in 1:(length(layers)-1)) { 53 | temp <- weights[[i]] 54 | mu <- c(mu,c(temp)) 55 | } 56 | 57 | # Initialize b and c and lambda 58 | b <- runif(d_theta, min=0, max=cScale) 59 | c <- cScale*rep(1,d_theta) 60 | lambda <- c(mu,b,c) 61 | 62 | # Separate weigths to 2 list: one for last hidden layers to output layer and for the rest 63 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 64 | W_seq[[1]] <- W1 65 | for (j in 2:L) { 66 | index <- (index_track[j-1]+1):index_track[j] 67 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 68 | W_seq[[j]] <- Wj 69 | } 70 | beta <- mu[(d_w+1):d_theta] 71 | 72 | # Get mini-batch 73 | idx <- sample.int(datasize,batchsize,replace = T) 74 | y <- y_train[idx,] 75 | X <- X_train[idx,] 76 | # X <- X_train 77 | # y <- y_train 78 | 79 | # Hyperparameters for inverse-Gamma prior on sigma2 if y~Nomal(0,sigma2) 80 | mean_sigma2_save <- c() 81 | if(distr == "normal"){ 82 | alpha0_sigma2 <- 10 83 | beta0_sigma2 <- (alpha0_sigma2-1)*sd(y) 84 | alpha_sigma2 <- alpha0_sigma2 + 0.5*length(y_train) # Optimal VB parameter for updating sigma2 85 | beta_sigma2 <- alpha_sigma2 # Mean_sigma2 and mean_sigma2_inverse are 86 | # Initialised at small values 1/2 and 1 respectively 87 | mean_sigma2_inverse <- alpha_sigma2/beta_sigma2 88 | mean_sigma2 <- beta_sigma2/(alpha_sigma2-1) 89 | mean_sigma2_save[1] <- mean_sigma2 90 | } 
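  # With y ~ Normal(., sigma2) and an Inverse-Gamma(alpha0_sigma2, beta0_sigma2) prior,
  # the variational posterior for sigma2 is Inverse-Gamma with
  #   alpha_sigma2 = alpha0_sigma2 + length(y_train)/2   (fixed above)
  #   beta_sigma2  = beta0_sigma2 + SSE/2                (updated inside the training loop)
  # giving E[1/sigma2] = alpha_sigma2/beta_sigma2 and E[sigma2] = beta_sigma2/(alpha_sigma2-1).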
91 | 92 | # Calculations for group Lasso coefficients 93 | shrinkage_gamma <- .01*rep(1,p) # Initialise gamma_beta, the shrinkage parameters 94 | shrinkage_l2 <- .01 # Hype-parameter for L2 prior 95 | mu_tau <- rep(0,p) # Parameters for the auxiliary tau_j 96 | mu_matrixW1_tilde <- matrix(mu[W1_tilde_index],n_units[1],p) 97 | b_matrixW1_tilde <- matrix(b[W1_tilde_index],n_units[1],p) 98 | 99 | c_matrixW1_tilde <- matrix(c[W1_tilde_index],n_units[1],p) 100 | for (j in 1:p) { 101 | mean_column_j_tilde <- mu_matrixW1_tilde[,j] %*% mu_matrixW1_tilde[,j] + 102 | b_matrixW1_tilde[,j] %*% b_matrixW1_tilde[,j] + 103 | sum(c_matrixW1_tilde[,j]^2) 104 | mu_tau[j] <- shrinkage_gamma[j]/sqrt(mean_column_j_tilde) 105 | } 106 | lambda_tau <- shrinkage_gamma^2 107 | mean_inverse_tau <- mu_tau # VB mean <1/tau_j> 108 | shrinkage_gamma_seq <- shrinkage_gamma 109 | mean_tau <- 1/mu_tau + 1/lambda_tau 110 | m <- n_units[1] 111 | 112 | # Prepare to calculate lowerbound 113 | if(distr=="normal"){ 114 | const <- alpha0_sigma2*log(beta0_sigma2) - lgamma(alpha0_sigma2) - 115 | 0.5*p*n_units[1]*log(2*pi) - 0.5*d_w_tilde*log(2*pi) - 116 | p*lgamma((n_units[1]+1)/2) - 0.5*datasize*log(2*pi) + 117 | p/2*log(2*pi) + 0.5*d_theta*log(2*pi) + d_theta/2 118 | }else{ 119 | const <- -0.5*p*n_units[1]*log(2*pi) - 0.5*d_w_tilde*log(2*pi)- 120 | p*lgamma((n_units[1]+1)/2) + p/2*log(2*pi)+ 121 | 0.5*d_theta*log(2*pi) + d_theta/2 122 | } 123 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 124 | W_seq[[1]] <- W1 125 | for (j in 2:L) { 126 | index <- (index_track[j-1]+1):index_track[j] 127 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 128 | W_seq[[j]] <- Wj 129 | } 130 | beta <- mu[(d_w+1):d_theta] 131 | mu_w_tilde <- mu[w_tilde_index] 132 | b_w_tilde <- b[w_tilde_index] 133 | c_w_tilde <- c[w_tilde_index] 134 | mean_w_tilde <- c(mu_w_tilde %*% mu_w_tilde + b_w_tilde %*% b_w_tilde + sum(c_w_tilde^2)) 135 | iter <- 1 136 | 137 | # calculate analytical terms of lowerbound 138 | constMean <- vbLowerBound(b,c,distr,p,beta_sigma2,alpha_sigma2,alpha0_sigma2,beta0_sigma2, 139 | mean_sigma2_inverse,n_units,shrinkage_gamma,mean_tau,datasize, 140 | lambda_tau,d_w_tilde,shrinkage_l2,mean_w_tilde,mean_column_j_tilde, 141 | mean_inverse_tau) 142 | 143 | # Calculate gradient of lowerbound and lowerbound of the first iteration 144 | lb <- c() 145 | grad_g_lik_store <- matrix(0,S,3*d_theta) 146 | lb_iter <- matrix(0,1,S) 147 | iter <- 1 148 | gradient_lambda <- vbGradientLogLB(X,y,b,c,mu,S,p,L,d_theta,d_w,index_track,n_units,mean_inverse_tau, 149 | shrinkage_l2,datasize,distr,mean_sigma2_inverse,constMean, 150 | const,grad_g_lik_store,lb_iter,iter) 151 | gradient_bar <- gradient_lambda$gradient_lambda 152 | lb[iter] <- mean(gradient_lambda$lb_iter)/datasize 153 | cat("Initial LB: ", lb[iter],'\n') 154 | 155 | #--------------------------Training Phase----------------------------- 156 | # Prepare parameters for training 157 | idxEpoch <- 0 # Index of current epoch 158 | iter <- 1 # Index of current iteration 159 | stop <- FALSE # Stop flag for early stopping 160 | lambda_best <- lambda # Store optimal lambda for output 161 | idxPatience <- 0 # Index of number of consequent non-decreasing 162 | # iterations for early stopping 163 | mean_column_j_tilde <- matrix(0,1,p) 164 | lb_bar <- c() 165 | 166 | print("---------- Training Phase ----------") 167 | while (!stop) { 168 | iter <- iter+1 169 | 170 | # Extract mini-batch 171 | idx <- sample.int(datasize,batchsize,replace = T) 172 | y <- y_train[idx,] 173 | X <- X_train[idx,] 174 | # X <- 
X_train 175 | # y <- y_train 176 | 177 | # Calculate analytical terms of lowerbound 178 | constMean <- vbLowerBound(b,c,distr,p,beta_sigma2,alpha_sigma2,alpha0_sigma2,beta0_sigma2, 179 | mean_sigma2_inverse,n_units,shrinkage_gamma,mean_tau,datasize, 180 | lambda_tau,d_w_tilde,shrinkage_l2,mean_w_tilde,mean_column_j_tilde, 181 | mean_inverse_tau) 182 | 183 | # Calculate Natural Gradient 184 | grad_lb <- vbGradientLogLB(X,y,b,c,mu,S,p,L,d_theta,d_w,index_track,n_units,mean_inverse_tau, 185 | shrinkage_l2,datasize,distr,mean_sigma2_inverse,constMean, 186 | const,grad_g_lik_store,lb_iter,iter) 187 | gradient_lambda = grad_lb$gradient_lambda 188 | lb[iter] <- mean(grad_lb$lb_iter)/datasize 189 | 190 | # Prevent exploding Gradient 191 | grad_norm <- sqrt(sum(gradient_lambda^2)) 192 | norm_gradient_threshold <- 100 193 | if(grad_norm > norm_gradient_threshold){ 194 | gradient_lambda <- (norm_gradient_threshold/grad_norm)*gradient_lambda 195 | } 196 | 197 | # Momentum gradient 198 | gradient_bar_old <- gradient_bar 199 | gradient_bar <- grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda 200 | 201 | # Adaptive learning rate 202 | if(iter>tau){ 203 | stepsize <- lrate*tau/iter 204 | }else{ 205 | stepsize <- lrate 206 | } 207 | 208 | # Gradient ascend 209 | lambda <- lambda + stepsize*gradient_bar 210 | 211 | # Restore model parameters from variational parameter lambda 212 | mu <- lambda[1:d_theta] 213 | b <- lambda[(d_theta+1):(2*d_theta)] 214 | c <- lambda[(2*d_theta+1):length(lambda)] 215 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 216 | W_seq[[1]] <- W1 217 | for (j in 2:L){ 218 | index <- (index_track[j-1]+1):index_track[j] 219 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 220 | W_seq[[j]] <- Wj 221 | } 222 | beta <- mu[(d_w+1):d_theta] 223 | 224 | # Update tau and shrinkage parameters 225 | if(iter%%1 == 0){ 226 | mu_matrixW1_tilde <- matrix(mu[W1_tilde_index],n_units[1],p) 227 | b_matrixW1_tilde <- matrix(b[W1_tilde_index],n_units[1],p) 228 | c_matrixW1_tilde <- matrix(c[W1_tilde_index],n_units[1],p) 229 | for (j in 1:p) { 230 | mean_column_j_tilde[j] <- mu_matrixW1_tilde[,j] %*% mu_matrixW1_tilde[,j] + 231 | b_matrixW1_tilde[,j] %*% b_matrixW1_tilde[,j] + 232 | sum(c_matrixW1_tilde[,j]^2) 233 | mu_tau[j] <- shrinkage_gamma[j]/sqrt(mean_column_j_tilde[j]) 234 | lambda_tau[j] <- shrinkage_gamma[j]^2 235 | } 236 | mean_inverse_tau <- mu_tau 237 | mean_tau <- 1/mu_tau + 1/lambda_tau 238 | shrinkage_gamma <- sqrt((n_units[1]+1)/mean_tau) 239 | shrinkage_gamma_seq <- cbind(shrinkage_gamma_seq,shrinkage_gamma) 240 | 241 | mu_w_tilde <- mu[w_tilde_index] 242 | b_w_tilde <- b[w_tilde_index] 243 | c_w_tilde <- c[w_tilde_index] 244 | mean_w_tilde <- c(mu_w_tilde %*% mu_w_tilde + b_w_tilde %*% b_w_tilde + sum(c_w_tilde^2)) 245 | } 246 | 247 | # Update VB posterior for sigma2, which is inverse Gamma 248 | if(distr=="normal"){ 249 | if (iter%%1 == 0){ 250 | sum_squared <- nnSumResidualSquare(y_train,X_train,W_seq,beta) 251 | beta_sigma2 <- beta0_sigma2 + sum_squared/2 252 | mean_sigma2_inverse <- alpha_sigma2/beta_sigma2 253 | mean_sigma2 <- beta_sigma2/(alpha_sigma2-1) 254 | mean_sigma2_save <- c(mean_sigma2_save,mean_sigma2) 255 | } 256 | } 257 | 258 | # Using lowerbound for validation 259 | if(iter>LBwindow){ 260 | lb_bar[iter-LBwindow] <- mean(lb[(iter-LBwindow+1):iter]) 261 | if(lb_bar[length(lb_bar)]>=max(lb_bar)){ 262 | lambda_best <- lambda 263 | idxPatience <- 0 264 | }else{ 265 | idxPatience <- idxPatience + 1 266 | } 267 | } 268 | 269 | # Early stopping 270 | 
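    # (moving-average criterion): stop when the smoothed lower bound lb_bar has not
    # reached a new maximum for more than 'patience' consecutive iterations, or when
    # the maximum number of epochs is exceeded; lambda_best, the parameters at the best
    # smoothed lower bound seen so far, is returned as the final estimate.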
if((idxPatience>patience)||(idxEpoch>epoch)){ 271 | stop <- TRUE 272 | } 273 | 274 | # Display epoch index whenever an epoch is finished 275 | if(iter%%num1Epoch==0){ 276 | idxEpoch <- idxEpoch + 1 277 | } 278 | 279 | # Display training results after each 'verbose' iteration 280 | if (verbose && iter%%verbose==0){ 281 | if(iter>LBwindow){ 282 | cat("Epoch: ", idxEpoch, " - Current LB: ",lb_bar[iter-LBwindow],"\n") 283 | # message("Epoch: ", idxEpoch, " - Current LB: ",lb_bar[iter-LBwindow]) 284 | } 285 | else{ 286 | cat("Epoch: ", idxEpoch, "- Current LB: ",lb[iter],"\n") 287 | # message("Epoch: ", idxEpoch, " - Current LB: ",lb[iter]) 288 | } 289 | } 290 | } 291 | 292 | # Display Training Results 293 | print('---------- Training Completed! ----------') 294 | cat("Number of iteration: ",iter,'\n') 295 | cat("LBBar best: ",max(lb_bar),'\n') 296 | # message("Number of iteration: ",iter) 297 | # message("LBBar best: ",max(lb_bar)) 298 | 299 | # Store training output 300 | lambda <- lambda_best 301 | mu <- lambda[1:d_theta] 302 | b <- lambda[(d_theta+1):(2*d_theta)] 303 | c <- lambda[(2*d_theta+1):length(lambda)] 304 | SIGMA = cbind(b) %*% b + diag(c^2) 305 | 306 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 307 | W_seq[[1]] <- W1 308 | for (j in 2:L){ 309 | index <- (index_track[j-1]+1):index_track[j] 310 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 311 | W_seq[[j]] <- Wj 312 | } 313 | beta <- mu[(d_w+1):d_theta] 314 | 315 | # Store output in a struct 316 | est$out.weights <- W_seq 317 | est$out.beta <- beta 318 | est$out.shrinkage <- shrinkage_gamma_seq 319 | colnames(est$out.shrinkage) <- NULL 320 | est$out.iteration <- iter 321 | est$out.vbMU <- mu # Mean of variational distribution of weights 322 | est$out.b <- b 323 | est$out.c <- c 324 | est$out.vbSIGMA <- SIGMA # Covariance matrix of variational distribution of weights 325 | est$out.nparams <- d_theta # Number of parameters 326 | est$out.indexTrack <- index_track 327 | est$out.muTau <- mu_tau 328 | 329 | if(distr=="normal"){ 330 | est$out.sigma2Alpha <- alpha_sigma2 331 | est$out.sigma2Beta <- beta_sigma2 332 | est$out.sigma2Mean <- mean_sigma2_save[length(mean_sigma2_save)] 333 | est$out.sigma2MeanIter <- mean_sigma2_save 334 | } 335 | est$out.lbBar <- lb_bar[2:length(lb_bar)] 336 | est$out.lb <- lb 337 | 338 | return(est) 339 | } 340 | -------------------------------------------------------------------------------- /R/02_libs/deepGLMfit.R: -------------------------------------------------------------------------------- 1 | deepGLMfit <- function(X,y, Lrate=0.01, Network=c(10,10) , BatchSize=5000, 2 | S=10, LRateFactor=10000, Momentum=0.6, Patience=100, 3 | MaxEpoch = 100, Verbose=10, Distribution="normal", 4 | WindowSize=100, Seed=NaN, Intercept=TRUE){ 5 | 6 | # Store training settings in a list 7 | est <- list() 8 | est$S <- S 9 | est$lrate <- Lrate 10 | est$epoch <- MaxEpoch 11 | est$tau <- LRateFactor 12 | est$patience <- Patience 13 | est$network <- Network 14 | est$dist <- Distribution 15 | est$seed <- Seed 16 | est$icept <- Intercept 17 | est$momentum <- Momentum 18 | est$verbose <- Verbose 19 | est$windowSize <- WindowSize 20 | 21 | # Check if inputs are corrected 22 | checkInput(X,y,est) 23 | 24 | # Calculate batch size 25 | if(BatchSize<=1){ # If specified batchsize is a propotion 26 | BatchSize <- BatchSize * nrow(X) 27 | } 28 | if(BatchSize>=nrow(X)){ 29 | BatchSize <- nrow(X) 30 | } 31 | est$batchsize <- BatchSize 32 | 33 | # Insert intercepts if Intercept=TRUE 34 | if(Intercept){ 35 | X <- 
cbind(matrix(1,nrow(X),1),X) 36 | } 37 | 38 | #Start to train deepGLM 39 | y <- as.matrix(y) 40 | t_start <- Sys.time() 41 | est <- deepGLMTrain(X,y,est) 42 | t_stop <- Sys.time() 43 | est$out.CPU <- t_stop - t_start 44 | cat("Training time: ",est$out.CPU,'\n') 45 | 46 | return(est) 47 | } 48 | -------------------------------------------------------------------------------- /R/02_libs/deepGLMpredict.R: -------------------------------------------------------------------------------- 1 | # Function to make prediction on an unseen data using a trained DeepGLM model 2 | # Input: 3 | deepGLMpredict <- function(mdl,X,y=NULL,Interval=0,Nsample=1000,Intercept=TRUE){ 4 | 5 | # Transform X to a row matrix 6 | if(is.numeric(X)){ 7 | X <- rbind(X) 8 | } 9 | 10 | # If y is specify, check y 11 | 12 | # Store Nsample to mdl 13 | mdl$Nsample <- Nsample 14 | 15 | # If training data does not include intercepts, the add intercepts 16 | N <- nrow(X) # Number of observation in test data 17 | if(Intercept){ 18 | X <- cbind(matrix(1,N,1),X) 19 | } 20 | 21 | alpha <- Interval 22 | 23 | # Load deepGLM params from struct 24 | W_seq <- mdl$out.weights 25 | beta <- mdl$out.beta 26 | distr <- mdl$dist 27 | 28 | # Calculate Neuron Network output 29 | nnet_output <- nnFeedForward(X,W_seq,beta) # Output vector of NN 30 | out <- list() 31 | 32 | if(distr=="normal"){ 33 | out$yhat = nnet_output # Prediction for continuous response 34 | # If ytest if provided, then calculate pps and mse 35 | if(length(y)>0){ 36 | sigma2 <- mdl$out.sigma2Mean 37 | mse <- mean((y-nnet_output)^2) 38 | pps <- 1/2*log(sigma2) + 1/2/sigma2*mse 39 | out$mse <- mse 40 | out$pps <- pps 41 | } 42 | # Calculate confidence interval if required 43 | if(alpha!=0){ 44 | interval <- predictionInterval(mdl,X,alpha) 45 | out$interval <- interval$interval 46 | out$yhatMatrix <- interval$yhatMC 47 | } 48 | 49 | }else if(distr=="binomial"){ 50 | out$yNN <- nnet_output 51 | out$yProb <- exp(nnet_output)/(1+exp(nnet_output)) 52 | y_pred <- as.numeric(nnet_output>0) # Prediction for binary response 53 | out$yhat <- y_pred 54 | #If ytest if provided, then calculate pps and mse 55 | if(length(y)>0){ 56 | pps <- mean(-y*nnet_output+log(1+exp(nnet_output))) 57 | cr <- mean(y==y_pred) # Miss-classification rate 58 | out$pps <- pps 59 | out$accuracy <- cr 60 | } 61 | 62 | }else if(distr=="poisson"){ 63 | out$yNN <- nnet_output 64 | y_pred <- exp(nnet_output) # Prediction for poisson response 65 | out$yhat <- y_pred 66 | if(length(y)>0) 67 | pps <- mean(-y*nnet_output+exp(nnet_output)) 68 | mse <- mean((y-y_pred)^2) 69 | out$mse <- mse 70 | out$pps <- pps 71 | 72 | }else{ 73 | message("Distribution must be: normal, binomial, poisson") 74 | } 75 | return(out) 76 | } 77 | -------------------------------------------------------------------------------- /R/02_libs/nnActivation.R: -------------------------------------------------------------------------------- 1 | # Function to calculate activation function 2 | # Input must be a matrix 3 | nnActivation <- function(a,func){ 4 | switch (func, 5 | Linear = {out <- a}, 6 | Sigmoid = {out <- 1/(1+exp(-a))}, 7 | ReLU = {out <- pmax(a,0)}, 8 | defaut) 9 | return(out) 10 | } 11 | -------------------------------------------------------------------------------- /R/02_libs/nnActivationGrad.R: -------------------------------------------------------------------------------- 1 | # Function to calculate derivative of activation function 2 | # Input must be a matrix 3 | nnActivationGrad <- function(a,func){ 4 | switch (func, 5 | Linear = 
{out <- matrix(1,nrow(a),ncol(a))}, 6 | Sigmoid = {out <- 1/(1+exp(-a))}, 7 | ReLU = {out <- (a>0)*1}, 8 | defaut) 9 | return(out) 10 | } 11 | -------------------------------------------------------------------------------- /R/02_libs/nnBackPropagation.R: -------------------------------------------------------------------------------- 1 | # Function to calculate backbrop of a DFNN 2 | # Input: 3 | # X,y -> Matrix 4 | # W_seq -> list of matrices 5 | # beta -> vector 6 | # distr -> character 7 | nnBackPropagation <- function(X,y,W_seq,beta,distr){ 8 | output = list() 9 | n_train <- nrow(X) # Number of mini-batch training observation 10 | L <- length(W_seq) # Number of hidden layers until the last layer 11 | a_seq <- vector("list", length = L) 12 | Z_seq <- vector("list",length = L) 13 | 14 | a_seq[[1]] <- W_seq[[1]] %*% t(X) 15 | Z_seq[[1]] <- rbind(matrix(1,1,n_train),nnActivation(a_seq[[1]],"ReLU")) 16 | 17 | for(j in 2:L){ 18 | a_seq[[j]] <- W_seq[[j]] %*% Z_seq[[j-1]] 19 | Z_seq[[j]] <- rbind(matrix(1,1,n_train),nnActivation(a_seq[[j]],"ReLU")) 20 | } 21 | delta_seq = vector("list", length = L+1) 22 | 23 | # Calculate error at the output layers according to distribution family of response 24 | nnOut = beta %*% Z_seq[[L]] 25 | switch(distr, 26 | normal = {delta_seq[[L+1]] <- t(y) - nnOut}, 27 | binomial = {p_i <- 1/(1+exp(-nnOut)) 28 | delta_seq[[L+1]] <- t(y) - p_i}, 29 | poisson = {delta_seq[[L+1]] <- t(y) - exp(nnOut)}, 30 | default) 31 | delta_seq[[L]] <- (beta[2:length(beta)] %*% delta_seq[[L+1]]) * nnActivationGrad(a_seq[[L]],"ReLU") 32 | 33 | for (j in (L-1):1) { 34 | Wj_tilde <- W_seq[[j+1]] 35 | Wj_tilde <- Wj_tilde[,2:ncol(Wj_tilde)] 36 | delta_seq[[j]] <- nnActivationGrad(a_seq[[j]],"ReLU")*(t(Wj_tilde) %*% delta_seq[[j+1]]) 37 | } 38 | gradient_W1 <- delta_seq[[1]] %*% X 39 | gradient <- c(gradient_W1) 40 | # dim(gradient) <- c(ncol(gradient)*nrow(gradient),1) 41 | for (j in 2:L) { 42 | gradient_Wj <- c(delta_seq[[j]] %*% t(Z_seq[[j-1]])) 43 | # dim(gradient_Wj) <- c(ncol(gradient_Wj)*nrow(gradient_Wj),1) 44 | gradient <- c(gradient,gradient_Wj) 45 | } 46 | gradient <- c(gradient,c(Z_seq[[L]] %*% t(delta_seq[[L+1]]))) 47 | output$gradient <- gradient 48 | output$nnOut <- nnOut 49 | return(output) 50 | } 51 | -------------------------------------------------------------------------------- /R/02_libs/nnFeedForward.R: -------------------------------------------------------------------------------- 1 | # Function to calculate output of a DFNN 2 | nnFeedForward <- function(X,W_seq,beta){ 3 | n_train <- nrow(X) # Number of training observations 4 | # Make forward passes to all layers 5 | a <- W_seq[[1]] %*% t(X) 6 | Z <- rbind(matrix(1,1,n_train),nnActivation(a,"ReLU")) 7 | L <- length(W_seq) 8 | for (j in 2:L) { 9 | a <- W_seq[[j]] %*% Z 10 | Z <- rbind(matrix(1,1,n_train),nnActivation(a,"ReLU")) # Add biases 11 | } 12 | nnOutput <- t(Z) %*% beta 13 | return(nnOutput) 14 | } 15 | -------------------------------------------------------------------------------- /R/02_libs/nnGradLogLikelihood.R: -------------------------------------------------------------------------------- 1 | # Function to calculate gradient of log-likelihood 2 | nnGradLogLikelihood <- function(w_seq,beta,X,y,datasize,distr,mean_sigma2_inverse){ 3 | output <- list() 4 | n = nrow(X) 5 | out <- nnBackPropagation(X,y,w_seq,beta,distr) 6 | back_prop <- out$gradient 7 | nnOut <- t(out$nnOut) 8 | 9 | switch (distr, 10 | normal = {gradient_theta <- mean_sigma2_inverse*back_prop 11 | gradient <- datasize/n*gradient_theta 
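                       # scaling by datasize/n makes the mini-batch gradient an unbiased
                       # estimate of the full-data log-likelihood gradient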
}, # To compensate the variational lowerbound 12 | binomial = {gradient <- datasize/n*back_prop}, 13 | poisson = {gradient <- datasize/n*back_prop}, 14 | default) 15 | output$gradient <- gradient 16 | output$nnOut <- nnOut 17 | return(output) 18 | } 19 | -------------------------------------------------------------------------------- /R/02_libs/nnInitialize.R: -------------------------------------------------------------------------------- 1 | # Function to initialize weights for deepGLM 2 | # Input: layers is a data array specifying (input+hidden) layers 3 | # Ex: c(20,10,10) 4 | nnInitialize <- function(layers){ 5 | # stopifnot(is.integer(layers)) # layer must be array of interger 6 | num_layer <- length(layers)-1 7 | w <- vector("list",length = num_layer) # Initialize a list to store matrices of weights 8 | for (i in 1:num_layer) { 9 | b <- sqrt(6)/(layers[i]+layers[i+1]) 10 | if(i==1){ 11 | w[[i]] <- matrix(runif(layers[i+1]*(layers[i]),-b,b),layers[i+1],layers[i]) # Input layer already has bias 12 | } 13 | else{ 14 | w[[i]] <- matrix(runif(layers[i+1]*(layers[i]+1),-b,b),layers[i+1],layers[i]+1) 15 | } 16 | } 17 | return(w) 18 | } 19 | -------------------------------------------------------------------------------- /R/02_libs/nnSumResidualSquare.R: -------------------------------------------------------------------------------- 1 | # Function to calculate sum square error of 2 vector 2 | nnSumResidualSquare <- function(y,X,W_seq,beta){ 3 | nnet_output <- nnFeedForward(X,W_seq,beta) # Output vector of NN 4 | S <- sum((y-nnet_output)^2) 5 | return(S) 6 | } 7 | -------------------------------------------------------------------------------- /R/02_libs/predictionInterval.R: -------------------------------------------------------------------------------- 1 | # Calculate prediction interval for new observations 2 | predictionInterval <- function(mdl,X,zalpha){ 3 | predInterval <- list() 4 | # Load deepGLM params from struct 5 | Nsample <- mdl$Nsample 6 | mu <- mdl$out.vbMU 7 | SIGMA <- mdl$out.vbSIGMA 8 | n_units <- mdl$network 9 | index_track <- mdl$out.indexTrack 10 | alpha_sigma2 <- mdl$out.sigma2Alpha 11 | beta_sigma2 <- mdl$out.sigma2Beta 12 | 13 | # Calculate network parameters 14 | L <- length(n_units) # Number of hidden layers 15 | p <- ncol(X)-1 # Number of covariates 16 | d_beta <- n_units[L]+1 17 | d_w <- index_track[L] 18 | 19 | yhat <- matrix(0,Nsample,nrow(X)) # Predicted values of test data 20 | nnOut <- matrix(0,Nsample,nrow(X)) # Output of NN 21 | W_seq <- vector("list",length = L) 22 | for (i in 1:Nsample) { 23 | theta_i <- rmvnorm(1,mean=mu,sigma=SIGMA) # Generate samples of theta from Normal distribution 24 | sigma2_i <- 1/rgamma(1,alpha_sigma2,beta_sigma2) # Generate samples of sigma from IG distribution 25 | 26 | # For each generated theta, restore neuron net structure 27 | W1 <- matrix(theta_i[1:index_track[1]],n_units[1],p+1) 28 | W_seq[[1]] <- W1 29 | for (j in 2:L){ 30 | index <- (index_track[j-1]+1):index_track[j] 31 | Wj <- matrix(theta_i[index],n_units[j],n_units[j-1]+1) 32 | W_seq[[j]] <- Wj 33 | } 34 | beta <- theta_i[(d_w+1):(d_w+d_beta)] 35 | 36 | nnOut[i,] <- nnFeedForward(X,W_seq,beta) # Calculate neuron network output 37 | yhat[i,] <- rnorm(nrow(X),mean=nnOut[i,],sd=sqrt(sigma2_i)) # Calculate p(y|theta_i,sigma_i,X) 38 | } 39 | 40 | # 1-std prediction interval interval 41 | yhatLCL <- colMeans(yhat) - zalpha*apply(yhat, 2, sd) 42 | yhatUCL <- colMeans(yhat) + zalpha*apply(yhat, 2, sd) 43 | yhatInterval <- cbind(cbind(yhatLCL),cbind(yhatUCL)) 44 | 
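  # Each of the Nsample Monte Carlo draws above simulates theta ~ N(mu, SIGMA) and
  # sigma2 ~ Inverse-Gamma(alpha_sigma2, beta_sigma2), pushes X through the restored
  # network, and samples y | theta, sigma2; the interval bounds are
  # colMeans(yhat) -/+ zalpha*sd(yhat) taken over those draws.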
predInterval$yhatMC <- yhat 45 | predInterval$interval <- yhatInterval 46 | return(predInterval) 47 | } 48 | -------------------------------------------------------------------------------- /R/02_libs/vbGradientLogLB.R: -------------------------------------------------------------------------------- 1 | # Function to calculate the estimation of gradient of lowerbound 2 | vbGradientLogLB <- function(X,y,b,c,mu,S,p,L,d_theta,d_w,index_track,n_units,mean_inverse_tau, 3 | shrinkage_l2,datasize,distr,mean_sigma2_inverse,constMean, 4 | const,grad_g_lik_store,lb_iter,iter){ 5 | gradllh_out <- list() 6 | out <- list() 7 | batchsize <- nrow(X) 8 | # set.generator("MersenneTwister", initialization="init2002", resolution=53, seed=iter) 9 | rqmc <- matrix(rnorm(S*(d_theta+1),0,1),S,d_theta+1) 10 | for (s in 1:S) { 11 | # Calculate theta 12 | U_normal <- rqmc[s,] 13 | epsilon1 <- U_normal[1] 14 | epsilon2 <- U_normal[2:length(U_normal)] 15 | theta <- mu + epsilon1*b + c*epsilon2 16 | 17 | W_seq <- vector("list", length = L) 18 | W1 <- matrix(theta[1:index_track[1]],n_units[1],p+1) 19 | W_seq[[1]] <- W1 20 | W1_tilde <- W1[,2:ncol(W1)] # weights without biases 21 | W1_tilde_gamma <- W1_tilde %*% diag(c(mean_inverse_tau)) 22 | grad_prior_w_beta <- c(rep(0,n_units[1]),-c(W1_tilde_gamma)) 23 | for (j in 2:L) { 24 | index <- (index_track[j-1]+1):index_track[j] 25 | Wj <- matrix(theta[index],n_units[j],n_units[j-1]+1) 26 | W_seq[[j]] <- Wj 27 | Wj_tilde <- Wj[,2:ncol(Wj)] 28 | grad_prior_Wj <- c(rep(0,n_units[j]),-shrinkage_l2 %*% c(Wj_tilde)) 29 | grad_prior_w_beta <- c(grad_prior_w_beta,grad_prior_Wj) 30 | } 31 | beta <- theta[(d_w+1):d_theta] 32 | beta_tilde <- beta[2:length(beta)] # vector beta without intercept 33 | grad_prior_beta <- c(0,c(-shrinkage_l2 %*% beta_tilde)) 34 | grad_prior_w_beta <- c(grad_prior_w_beta,grad_prior_beta) 35 | 36 | if (distr=="normal"){ 37 | gradllh_out <- nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse) 38 | }else if(distr=="binomial"){ 39 | gradllh_out <- nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr) 40 | }else if(distr=="poisson"){ 41 | gradllh_out <- nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr) 42 | }else{ 43 | message("Distribution must be: normal, binomial, poisson") 44 | } 45 | grad_llh <- gradllh_out$gradient 46 | yNN <- gradllh_out$nnOut 47 | 48 | grad_h <- grad_prior_w_beta + grad_llh # Gradient of log prior plus log-likelihood 49 | grad_log_q <- vbGradientLogq(b,c,theta,mu) 50 | grad_theta <- grad_h - grad_log_q 51 | grad_g_lik_store[s,] <- c(grad_theta,epsilon1*grad_theta, epsilon2*grad_theta) 52 | 53 | # Calculate Lowerbound 54 | if(distr=="normal"){ 55 | lb_iter[s] <- constMean-0.5*mean_sigma2_inverse*sum((y-yNN)^2)*datasize/batchsize + const 56 | }else if(distr=="binomial"){ 57 | lb_iter[s] <- constMean + sum(y*yNN - log(1+exp(yNN)))*datasize/batchsize + const 58 | }else if(distr=="poisson"){ 59 | lb_iter[s] <- constMean + sum(y*yNN - exp(yNN))*datasize/batchsize + const 60 | }else{ 61 | message("Distribution must be: normal, binomial, poisson") 62 | } 63 | } 64 | grad_lb <- colMeans(grad_g_lik_store) 65 | gradient_lambda <- vbNaturalGradient(b,c,grad_lb) 66 | out$lb_iter <- lb_iter 67 | out$gradient_lambda <- gradient_lambda 68 | 69 | return(out) 70 | } 71 | -------------------------------------------------------------------------------- /R/02_libs/vbGradientLogq.R: -------------------------------------------------------------------------------- 1 | # Function vbGradientLogq 2 | # b,c,theta,mu -> vector 3 | vbGradientLogq <- 
function(b,c,theta,mu){ 4 | x <- theta-mu 5 | d <- b/c^2 6 | grad_log_q <- -x/c^2 + c((d%*%x)/(1+(d%*%b)))*d 7 | } 8 | -------------------------------------------------------------------------------- /R/02_libs/vbLowerBound.R: -------------------------------------------------------------------------------- 1 | # Function to calculate lowerbound of variational distribution 2 | vbLowerBound <- function(b,c,distr,p,beta_sigma2,alpha_sigma2,alpha0_sigma2,beta0_sigma2, 3 | mean_sigma2_inverse,n_units,shrinkage_gamma,mean_tau,datasize, 4 | lambda_tau,d_w_tilde,shrinkage_l2,mean_w_tilde,mean_column_j_tilde, 5 | mean_inverse_tau){ 6 | if(distr=="normal"){ 7 | mean_log_sig2 <- log(beta_sigma2)-digamma(alpha_sigma2) 8 | logdet <- log(det(1 + (b/(c^2)) %*% b)) + sum(log(c^2)) 9 | constMean <- -(alpha0_sigma2+1)*mean_log_sig2 - beta0_sigma2*mean_sigma2_inverse+ 10 | 0.5*sum(2*(n_units[1]+1)*log(shrinkage_gamma)-(shrinkage_gamma^2)*mean_tau)- 11 | 0.5*datasize*mean_log_sig2+ 12 | lgamma(alpha_sigma2)-alpha_sigma2*log(beta_sigma2)+ 13 | (alpha_sigma2+1)*mean_log_sig2+alpha_sigma2- 14 | 0.5*(sum(log(lambda_tau))-p) + 0.5*logdet + 15 | 0.5*d_w_tilde*log(shrinkage_l2) - 0.5*shrinkage_l2*mean_w_tilde - 16 | 0.5*sum(c(mean_column_j_tilde)*c(mean_inverse_tau)) 17 | }else{ 18 | logdet = log(det(1 + (b/(c^2)) %*% b)) + sum(log(c^2)) 19 | constMean = 0.5*sum(2*(n_units[1]+1)*log(shrinkage_gamma)- 20 | (shrinkage_gamma^2)*mean_tau)-0.5*(sum(log(lambda_tau))-p)+ 21 | 0.5*logdet+0.5*d_w_tilde*log(shrinkage_l2) - 22 | 0.5*shrinkage_l2*mean_w_tilde- 23 | 0.5*sum(c(mean_column_j_tilde)*c(mean_inverse_tau)) 24 | } 25 | return(constMean) 26 | } 27 | -------------------------------------------------------------------------------- /R/02_libs/vbNaturalGradient.R: -------------------------------------------------------------------------------- 1 | # Function to calculate natural gradient 2 | # Input: 3 | # b,c,grad -> vector 4 | vbNaturalGradient <- function(b,c,grad){ 5 | d <- length(b) 6 | grad1 <- grad[1:d] 7 | grad2 <- grad[(d+1):(2*d)] 8 | grad3 <- grad[(2*d+1):length(grad)] 9 | c2 <- c^2 10 | b2 <- b^2 11 | prod1 <- c(b %*% grad1)*b + (grad1*c2) 12 | const <- sum(b2/c2) 13 | const1 <- 0.5 + 0.5/const 14 | prod2 <- c(b %*% grad2)*b + (grad2*c2) 15 | prod2 <- const1*prod2 16 | alpha <- 1/(1+const) 17 | x <- alpha*b2/c^3 18 | y <- 1/c2 - 2*alpha*(b/c2)^2 19 | aux <- x/y 20 | prod3 <- 0.5*(grad3/y-c(1/((1+sum(x^2/y)))*(aux %*% grad3)) * aux) 21 | prod <- c(prod1,prod2,prod3) 22 | return(prod) 23 | } 24 | -------------------------------------------------------------------------------- /R/deepGLMNormalExample.R: -------------------------------------------------------------------------------- 1 | # Examples demonstrate how to use deepGLM function to fit data with continuous dependent variable 2 | # 3 | # Copyright 2018 4 | # Nghia Nguyen (nghia.nguyen@sydney.edu.au) 5 | # Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 6 | # 7 | # https://github.com/VBayesLab/deepGLM 8 | # 9 | # Version: 1.0 10 | # LAST UPDATE: May, 2018 11 | 12 | # Clear all variables 13 | rm(list=ls()) 14 | gc(reset=T) 15 | 16 | # Load libs 17 | library(mvtnorm) 18 | library(rstudioapi) 19 | 20 | RootDir <- dirname(rstudioapi::getSourceEditorContext()$path) 21 | setwd(RootDir) 22 | 23 | # Source external functions 24 | source('dependencies.R') 25 | 26 | # Read data file 27 | data <- read.csv(file = "01_data/abalone.csv",header = FALSE) 28 | 29 | 30 | # Divide data to training and test sets 31 | N <- nrow(data) # Total number of observations 32 | p <- 
ncol(data) - 1 # Number of variables 33 | Ntest <- round(0.15*N) # Number of test observations 34 | idx <- sample.int(N, size = Ntest, replace = FALSE) # Sampling indexes 35 | dataTest <- data[idx,] # Test data 36 | dataTrain <- data[-idx,] # Train data 37 | XTrain <- data.matrix(dataTrain[,1:p]) # X train 38 | y <- data.matrix(dataTrain[,p+1]) # y train 39 | XTest <- data.matrix(dataTest[,1:p]) # X test 40 | yTest <- data.matrix(dataTest[,p+1]) # y test 41 | 42 | # Normalize Train and Test data 43 | meanX <- colMeans(XTrain) 44 | stdX <- apply(XTrain, 2, sd) 45 | X <- sweep(sweep(XTrain,2,meanX,'-'),2,stdX,'/') 46 | XTest <- sweep(sweep(XTest,2,meanX,'-'),2,stdX,'/') 47 | 48 | # Fit a deepGLM model 49 | deepGLMout <-deepGLMfit(X,y,Network = c(5,5,5),Seed = 100,Verbose = 1, MaxEpoch = 500) 50 | 51 | # Make prediction (point estimation) on a test set, without true labels 52 | Pred1 <- deepGLMpredict(deepGLMout,XTest) 53 | 54 | # If ytest is specified (for model evaluation purpose) then we can check PPS and MSE on test set 55 | print('----------------Prediction---------------') 56 | Pred2 <- deepGLMpredict(deepGLMout,XTest,y = yTest) 57 | cat('PPS on test set using deepGLM is: ',Pred2$pps,'\n') 58 | cat('MSE on test set using deepGLM is: ',Pred2$mse,'\n') 59 | 60 | # You can also perform point and interval estimation for a single test observation 61 | idx <- nrow(XTest) # Pick a random unseen observation 62 | dataTest <- XTest[idx,] 63 | Pred3 <- deepGLMpredict(deepGLMout,dataTest,Interval=1,Nsample=1000) # Make 1-std prediction interval 64 | cat('Prediction Interval: [',Pred3$interval[1],';',Pred3$interval[2],']','\n') 65 | cat('True value: ',yTest[idx],'\n') 66 | 67 | # Estimate prediction interval for entire test data 68 | Pred4 <- deepGLMpredict(deepGLMout,XTest,y=yTest,Interval=1,Nsample=1000) 69 | y_pred <- colMeans(Pred4$yhatMatrix) 70 | mse2 <- mean((yTest-y_pred)^2) 71 | accuracy <- (yTestPred4$interval[,1]) 72 | cat('Prediction Interval accuracy: ',sum(accuracy)/length(accuracy),'\n') 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /R/dependencies.R: -------------------------------------------------------------------------------- 1 | # Source all files within libs 02_folder 2 | RootDir <- dirname(rstudioapi::getSourceEditorContext()$path) 3 | setwd(paste0(RootDir,'/02_libs')) 4 | 5 | files.sources = list.files() 6 | sapply(files.sources, source) 7 | 8 | setwd(RootDir) 9 | 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepGLM 2 | Version 0.0.0.9000
3 | 4 | ## Introduction 5 | DeepGLM is a flexible model that uses a Deep Feedforward Neural Network as the basis function for a Generalized Linear Model; that is, the usual linear predictor of a GLM is replaced by the output of a deep neural network. DeepGLM is designed to work with cross-sectional datasets such as real estate data, census data, etc.
6 | 7 | For more information about DeepGLM, please read the paper: Minh-Ngoc Tran, Nghia Nguyen, David J. Nott and Robert Kohn (2018) Bayesian Deep Net GLM and GLMM, https://arxiv.org/abs/1805.10157 8 | 9 | ## Authors 10 | Nghia Nguyen (nghia.nguyen@sydney.edu.au)
11 | Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 12 | 13 | ## Usage 14 | Users can choose either the Matlab, R or Python version to train and make predictions with deepGLM. 15 | ### MATLAB Version 16 | To use the toolbox, add the folder called "deepGLM" (with subfolders) to the MATLAB path. 17 | 18 | The toolbox contains the following folders: 19 | 20 | - Data: some datasets used in the examples. 21 | - Examples: examples of all the functions included in the toolbox. 22 | - Documents: documentation for the functions in the deepGLM toolbox. 23 | - deepGLM: all the functions of the toolbox are here. This is the folder you must add to the MATLAB path. 24 | 25 | ### R Version 26 | Install the *deepglm* package for R: 27 | - Clone the repository or directly download the zip file **deepglm_0.0.0.9000.zip** from the *deepGLM/R/* subdirectory on GitHub. 28 | - In RStudio, run the command:
29 | **install.packages("D:\\deepglm_0.0.0.9000.zip", repos = NULL, type="source")**
30 | where *D:\deepglm_0.0.0.9000.zip* is the location of the downloaded package file on my local machine 31 | - To use the package, run the command:
32 | **library(deepglm)** 33 | 34 | *deepglm* provides two functions: one to train a deepGLM model on training data (*deepGLMfit*) and one to make predictions with a trained deepGLM model on unseen data (*deepGLMpredict*). In RStudio, use the commands **?deepGLMfit** and **?deepGLMpredict** to read the documentation for the two functions. 35 | 36 | Use the command **example(deepGLMfit)** to run the example showing how to run *deepGLMfit* and *deepGLMpredict* on simulated data. 37 | 38 | Users can run additional examples using the scripts in the *demos* folder in the installation directory. For example, the installation directory for the *deepglm* package on my Windows machine is: *D:\Program Files\R\R-3.4.3\R-3.4.3\library\deepglm* 39 | 40 | ### Python Version 41 | Download the file **deepGLM.pyc** to your project folder. 42 | 43 | ## How to cite 44 | Please cite the toolbox as: 45 | 46 | Tran, M.-N., Nguyen, N., Kohn, R., and Nott, D. (2019) Bayesian Deep Net GLM and GLMM. Journal of Computational and Graphical Statistics, 29(1):97-113 47 | --------------------------------------------------------------------------------