├── Matlab ├── Data │ ├── BikeSharingDeepGLM.mat │ ├── DataSimulationBinary.mat │ ├── DirectMarketing.mat │ ├── SchoolingData.mat │ └── abalone.mat ├── DeepGLM │ ├── nnfun │ │ ├── nnActivation.m │ │ ├── nnActivationGrad.m │ │ ├── nnBackPropagation.m │ │ ├── nnFeedForward.m │ │ ├── nnGradLogLikelihood.m │ │ ├── nnInitialize.m │ │ └── nnSumResidualSquare.m │ ├── plotfun │ │ ├── deepGLMplot.m │ │ ├── plotInterval.m │ │ ├── plotMSE.m │ │ ├── plotPPS.m │ │ ├── plotROC.m │ │ └── plotShrinkage.m │ ├── stafun │ │ ├── gen_Sobol.m │ │ ├── normrnd_qmc.m │ │ └── rqmc_rnd.m │ ├── train │ │ ├── deepGLMTrain.m │ │ ├── deepGLMTrainTest.m │ │ ├── deepGLMfit.m │ │ ├── deepGLMlogitPoisson.m │ │ ├── deepGLMnormalCV.m │ │ ├── deepGLMpoisson.m │ │ ├── deepGLMpredict.m │ │ └── deepGLMpredictLoss.m │ ├── utils │ │ ├── checkInput.m │ │ ├── deepGLMmsg.m │ │ ├── deepGLMout.m │ │ ├── isBinomial.m │ │ ├── predictionInterval.m │ │ ├── splitData.m │ │ └── sumResidualSquared.m │ └── vbfun │ │ ├── vbGradientLogLB.m │ │ ├── vbGradientLogq.m │ │ ├── vbLowerBound.m │ │ └── vbNaturalGradient.m ├── Document │ ├── deepGLM.pdf │ ├── deepGLMNormalExample.pdf │ ├── ~WRL1562.tmp │ └── ~WRL3227.tmp └── Examples │ ├── deepGLMBinomialExample.mlx │ ├── deepGLMBinomialExampleScript.m │ ├── deepGLMNormalExample.mlx │ ├── deepGLMNormalExampleScript.m │ └── deepGLMPoissonExampleScript.m ├── Python ├── .ipynb_checkpoints │ └── Example notebook-checkpoint.ipynb ├── DirectMarketing.mat ├── Example notebook.ipynb ├── __pycache__ │ └── deepGLM.cpython-37.pyc ├── deepGLM.pdf └── deepGLM.py ├── R ├── 01_data │ └── abalone.csv ├── 02_libs │ ├── checkInput.R │ ├── deepGLMTrain.R │ ├── deepGLMfit.R │ ├── deepGLMpredict.R │ ├── nnActivation.R │ ├── nnActivationGrad.R │ ├── nnBackPropagation.R │ ├── nnFeedForward.R │ ├── nnGradLogLikelihood.R │ ├── nnInitialize.R │ ├── nnSumResidualSquare.R │ ├── predictionInterval.R │ ├── vbGradientLogLB.R │ ├── vbGradientLogq.R │ ├── vbLowerBound.R │ └── vbNaturalGradient.R ├── deepGLMNormalExample.R └── dependencies.R ├── README.html └── README.md /Matlab/Data/BikeSharingDeepGLM.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/BikeSharingDeepGLM.mat -------------------------------------------------------------------------------- /Matlab/Data/DataSimulationBinary.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/DataSimulationBinary.mat -------------------------------------------------------------------------------- /Matlab/Data/DirectMarketing.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/DirectMarketing.mat -------------------------------------------------------------------------------- /Matlab/Data/SchoolingData.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/SchoolingData.mat -------------------------------------------------------------------------------- /Matlab/Data/abalone.mat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Data/abalone.mat -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnActivation.m: -------------------------------------------------------------------------------- 1 | function out = nnActivation(z,func) 2 | %NNACTIVATION Calculate activation output at nodes in each forward pass 3 | 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | if nargin < 2 13 | error(deepGLMmsg('deepglm:MustSpecifyActivationFunction')); 14 | end 15 | 16 | switch func 17 | case 'Linear' 18 | out = z; 19 | case 'Sigmoid' 20 | out = 1.0 ./ (1.0 + exp(-z)); 21 | case 'Tanh' 22 | out = tanh(z); 23 | case 'ReLU' 24 | out = max(0,z); 25 | case 'LeakyReLU' 26 | out = max(0,z)+ alpha*min(0,z); 27 | end 28 | end 29 | 30 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnActivationGrad.m: -------------------------------------------------------------------------------- 1 | function out = nnActivationGrad(z,func) 2 | %NNACTIVATIONGRAD Calculate derivative of activation output at hidden nodes 3 | %in each backward pass 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | 14 | switch func 15 | case 'Linear' 16 | out = ones(size(z)); 17 | case 'Sigmoid' 18 | temp = activation(z,text); 19 | out = temp.*(1-temp); 20 | case 'Tanh' 21 | temp = activation(z,text); 22 | out = 1 - temp^2; 23 | case 'ReLU' 24 | out = z>0; 25 | case 'LeakyReLU' 26 | if z > 0 27 | out = 1; 28 | else 29 | out = alpha; 30 | end 31 | end 32 | 33 | end 34 | 35 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnBackPropagation.m: -------------------------------------------------------------------------------- 1 | function [gradient,nnOut] = nnBackPropagation(X,y,W_seq,beta,distr) 2 | %NNBACKPROPAGATION Compute gradient of weights in a neural net using 3 | % backpropagation algorithm 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | n_train = size(X,1); 14 | L = length(W_seq); 15 | a_seq = cell(1,L); 16 | Z_seq = cell(1,L); 17 | 18 | a_seq{1} = W_seq{1}*X'; 19 | Z_seq{1} = [ones(1,n_train);nnActivation(a_seq{1},'ReLU')]; 20 | for j=2:L 21 | a_seq{j} = W_seq{j}*Z_seq{j-1}; 22 | Z_seq{j} = [ones(1,n_train);nnActivation(a_seq{j},'ReLU')]; 23 | end 24 | delta_seq = cell(1,L+1); 25 | 26 | % Calculate error at the output layers according to distribution family of 27 | % response 28 | nnOut = beta'*Z_seq{L}; 29 | switch distr 30 | case 'normal' 31 | delta_seq{L+1} = y' - nnOut; 32 | case 'binomial' 33 | p_i = 1./(1+exp(-nnOut)); 34 | delta_seq{L+1} = y' - p_i; 35 | case 'poisson' 36 | delta_seq{L+1} = y' - exp(nnOut); 37 | end 38 | delta_seq{L} = (beta(2:end)*delta_seq{L+1}).*nnActivationGrad(a_seq{L},'ReLU'); 39 | for j=L-1:-1:1 40 | Wj_tilde = W_seq{j+1}; 41 | Wj_tilde = Wj_tilde(:,2:end); 42 | delta_seq{j} = (nnActivationGrad(a_seq{j},'ReLU')).*(Wj_tilde'*delta_seq{j+1}); 43 | end 44 | gradient_W1 = 
delta_seq{1}*X; 45 | gradient = gradient_W1(:); 46 | for j = 2:L 47 | gradient_Wj = delta_seq{j}*(Z_seq{j-1})'; 48 | gradient = [gradient;gradient_Wj(:)]; 49 | end 50 | gradient = [gradient;Z_seq{L}*delta_seq{L+1}']; 51 | end 52 | 53 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnFeedForward.m: -------------------------------------------------------------------------------- 1 | function nnOutput = nnFeedForward(X,W_seq,beta) 2 | %NNFEEDFORWARD Compute the output of a neural net 3 | 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Number of observations in dataset 13 | n_train = size(X,1); 14 | 15 | % Make forward passes to all layers 16 | a = W_seq{1}*X'; 17 | Z = [ones(1,n_train);nnActivation(a,'ReLU')]; 18 | L = length(W_seq); 19 | for j=2:L 20 | a = W_seq{j}*Z; 21 | Z = [ones(1,n_train);nnActivation(a,'ReLU')]; % Add biases 22 | end 23 | % a = W_seq{L}*Z; 24 | % Z = [ones(1,n_train);nnActivation(a,'ReLU')]; 25 | nnOutput = Z'*beta; 26 | end 27 | 28 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnGradLogLikelihood.m: -------------------------------------------------------------------------------- 1 | function [gradient,nnOut] = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse) 2 | %NNGRADIENTLLH Calculate gradient of log likelihood 3 | % Detailed explanation goes here 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | n = length(y); 14 | [back_prop,nnOut] = nnBackPropagation(X,y,W_seq,beta,distr); 15 | nnOut = nnOut'; 16 | switch distr 17 | case 'normal' 18 | gradient_theta = mean_sigma2_inverse*back_prop; 19 | gradient = datasize/n*gradient_theta; % To compensate the variation 20 | case 'binomial' 21 | gradient = datasize/n*back_prop; 22 | case 'poisson' 23 | gradient = datasize/n*back_prop; 24 | end 25 | end 26 | 27 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnInitialize.m: -------------------------------------------------------------------------------- 1 | function weights = nnInitialize(layers) 2 | %NNINITIALIZE Summary of this function goes here 3 | % layers: vector of doubles, each number specifing the amount of 4 | % nodes in a layer of the network. 5 | % 6 | % weights: cell array of weight matrices specifing the 7 | % translation from one layer of the network to the next. 
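%
% Illustrative example (added for clarity; not part of the original help, and
% the layer sizes are hypothetical):
%   w = nnInitialize([3 5 5 1]);
% returns a 1x3 cell with w{1} of size 5x3, w{2} of size 5x6 and w{3} of
% size 1x6 (hidden layers get one extra input column for their bias; the
% input layer does not, since X is assumed to already contain a column of
% ones). Entries are drawn uniformly from [-b,b] with b = sqrt(6)/(n_in+n_out).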
8 | % 9 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 10 | % Nguyen (nghia.nguyen@sydney.edu.au) 11 | % 12 | % http://www.xxx.com 13 | % 14 | % Version: 1.0 15 | % LAST UPDATE: April, 2018 16 | 17 | weights = cell(1, length(layers)-1); 18 | 19 | for i = 1:length(layers)-1 20 | % Using random weights from -b to b 21 | b = sqrt(6)/(layers(i)+layers(i+1)); 22 | if i==1 23 | weights{i} = rand(layers(i+1),layers(i))*2*b - b; % Input layer already have bias 24 | else 25 | weights{i} = rand(layers(i+1),layers(i)+1)*2*b - b; % 1 bias in input layer 26 | end 27 | end 28 | 29 | end 30 | 31 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/nnfun/nnSumResidualSquare.m: -------------------------------------------------------------------------------- 1 | function out = nnSumResidualSquare(y,X,W_seq,beta) 2 | %NNSUMRESIDUALSQUARE Calculate sum of square of residuals 3 | % 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au). 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | nnet_output = nnFeedForward(X,W_seq,beta); 14 | out = sum((y-nnet_output).^2); 15 | end 16 | 17 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/deepGLMplot.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/DeepGLM/plotfun/deepGLMplot.m -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotInterval.m: -------------------------------------------------------------------------------- 1 | function plotInterval(predMean,predInterval,opt,varargin) 2 | %PLOTINTERVAL Plot prediction interval for test data 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | if (nargin<2) 13 | disp('ERROR: not enough input arguments!'); 14 | return; 15 | end 16 | 17 | textTitle = opt.title; 18 | labelX = opt.labelX; 19 | labelY = opt.labelY; 20 | linewidth = opt.linewidth; 21 | 22 | % Define some default texts 23 | if(isempty(textTitle)) 24 | textTitle = 'Prediction Interval on Test Data'; 25 | end 26 | if(isempty(labelX)) 27 | labelX = 'Observation'; 28 | end 29 | 30 | % Parse additional options 31 | paramNames = {'Color' 'Style' 'ytrue'}; 32 | paramDflts = {'red' 'shade' []}; 33 | [color,style,ytrue] = internal.stats.parseArgs(paramNames,... 
34 | paramDflts, varargin{:}); 35 | 36 | lower = predInterval(:,1); 37 | upper = predInterval(:,2); 38 | t = 1:1:length(predMean); 39 | switch style 40 | case 'shade' % Plot prediction interval in shade style 41 | p = plot(t,predMean,t,upper,t,lower); 42 | YLIM = get(gca,'YLim'); 43 | delete(p); 44 | a1 = area(t,upper,min(YLIM)); 45 | hold on; 46 | set(a1,'LineStyle','none'); 47 | set(a1,'FaceColor',[0.9 0.9 0.9]); 48 | a2 = area(t,lower,min(YLIM)); 49 | set(a2,'LineStyle','none'); 50 | set(a2,'FaceColor',[1 1 1]); 51 | p2 = scatter(t,predMean,40,'MarkerEdgeColor',[1 0 0]); 52 | if(~isempty(ytrue)) 53 | p1 = scatter(t,ytrue,40,'MarkerEdgeColor',[0 0 1]); 54 | legend([p1,p2],{'True values','Prediction values'}); 55 | end 56 | title(textTitle, 'FontSize',18) 57 | xlabel(labelX) 58 | ylabel(labelY) 59 | hold off; 60 | set(gca,'Layer','top','XGrid','on','YGrid','on'); 61 | case 'boundary' % Plot prediction interval in boundary style 62 | plot(t,predMean,'LineWidth',linewidth,'Color',color); 63 | hold on 64 | plot(t,upper,'--r',t,lower,'--r'); 65 | grid on 66 | title('Prediction Interval on Test Data', 'FontSize',18) 67 | xlabel('Observation') 68 | hold off 69 | case 'bar' % Plot prediction interval in bar style 70 | err = (upper-lower)/2; 71 | errorbar(predMean,err); 72 | grid on 73 | hold on 74 | plot(predMean,'Color','red','LineWidth',2); 75 | title('Prediction Interval on Test Data', 'FontSize',18) 76 | xlabel('Observation') 77 | hold off 78 | end 79 | end 80 | 81 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotMSE.m: -------------------------------------------------------------------------------- 1 | function [outputArg1,outputArg2] = plotMSE(inputArg1,inputArg2) 2 | %PLOTMSE Summary of this function goes here 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | outputArg1 = inputArg1; 13 | outputArg2 = inputArg2; 14 | end 15 | 16 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotPPS.m: -------------------------------------------------------------------------------- 1 | function plotPPS(loss,data) 2 | %PLOTPPS Plot prediction loss 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | plot(loss); 13 | grid on; 14 | title(['Prediction Loss on ',data,' set']); 15 | xlabel('Iterations'); 16 | ylabel('PPS') 17 | end 18 | 19 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotROC.m: -------------------------------------------------------------------------------- 1 | function plotROC(y_true,y_pred) 2 | %PLOTROC Plot ROC curve and AUC 3 | 4 | if nargin<2 5 | disp('Too few input arguments'); 6 | return 7 | end 8 | 9 | if(size(y_true)~=size(y_pred)) 10 | disp('Target and output must have same size') 11 | return 12 | elseif(size(y_true,1)~=1) 13 | disp('Target and output must be row vectors with same length') 14 | return 15 | else 16 | plotroc(y_true,y_pred) 17 | grid on 18 | end 19 | 20 | end 21 | 22 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/plotfun/plotShrinkage.m: 
-------------------------------------------------------------------------------- 1 | function plotShrinkage(ShrinkageCoef,opt) 2 | %PLOTSHRINKAGE Plot shrinkage coefficient of Group Lasso regularization 3 | % 4 | % 5 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 6 | % Nguyen (nghia.nguyen@sydney.edu.au) 7 | % 8 | % http://www.xxx.com 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: April, 2018 12 | 13 | % Do not plot intercept coefficient 14 | % ShrinkageCoef = ShrinkageCoef(2:end,:); 15 | 16 | TextTitle = opt.title; 17 | labelX = opt.labelX; 18 | labelY = opt.labelY; 19 | linewidth = opt.linewidth; 20 | color = opt.color; 21 | 22 | numCoeff = size(ShrinkageCoef,1); % Number of shrinkage coefficients 23 | fontsize = 13; 24 | 25 | % Define default settings 26 | if(isempty(TextTitle)) 27 | TextTitle = 'Shrinakge Coefficients'; 28 | end 29 | if(isempty(labelX)) 30 | labelX = 'Iteration'; 31 | end 32 | 33 | % Plot 34 | plot(ShrinkageCoef','LineWidth',linewidth); 35 | grid on 36 | title(TextTitle,'FontSize', 20) 37 | xlabel(labelX,'FontSize', 15) 38 | ylabel(labelY,'FontSize', 15) 39 | Ytext = ShrinkageCoef(:,end); % Y coordination of text, different for coefficients 40 | Xtext = size(ShrinkageCoef,2); % X coordination of text, same for all coefficients 41 | for i=1:numCoeff 42 | text(Xtext,Ytext(i),['\gamma_{',num2str(i),'}'],'fontsize',fontsize) 43 | end 44 | end 45 | 46 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/stafun/gen_Sobol.m: -------------------------------------------------------------------------------- 1 | %genertate Sobol Sequence 2 | function [X1]=gen_Sobol(m,s) 3 | N = pow2(m); % Number of points; 4 | cmax = 52; % number of digits of generated points 5 | 6 | 7 | N = pow2(m); % Number of points; 8 | P = sobolset(s); % Get Sobol sequence; 9 | P = scramble(P,'MatousekAffineOwen'); % Scramble Sobol points; 10 | X1 = net(P,N); 11 | 12 | X1=X1'; 13 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/stafun/normrnd_qmc.m: -------------------------------------------------------------------------------- 1 | function x = normrnd_qmc(S,d) 2 | % generate Sxd matrix of standard normal numbers by RQMC 3 | rqmc = rqmc_rnd(S,d); 4 | rqmc = rqmc(1:S,:); 5 | x = norminv(rqmc); 6 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/stafun/rqmc_rnd.m: -------------------------------------------------------------------------------- 1 | function f = rqmc_rnd(S,d) 2 | % generate a matrix of RQMC of size S times d 3 | max_sobol = 1111; 4 | r = floor(d/max_sobol); 5 | s = d-r*max_sobol; 6 | if r>=1 7 | f = gen_Sobol(ceil(log2(S)),max_sobol)'; 8 | for i = 2:r 9 | f = [f,gen_Sobol(ceil(log2(S)),max_sobol)']; 10 | end 11 | f = [f,gen_Sobol(ceil(log2(S)),s)']; 12 | else 13 | f = gen_Sobol(ceil(log2(S)),d)'; 14 | end 15 | 16 | end 17 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMTrain.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMTrain(X_train,y_train,est) 2 | % Traing a deepGLM model with continuous reponse y. 3 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 4 | % regularization is put on the rest. sigma2 and tau are updated by 5 | % mean-field VB. 
Inverse gamma prior is used for sigma2 6 | % INPUT 7 | % X_train, y_train: Training data (continuous response) 8 | % X_validation, y_validation: Validation data 9 | % n_units: Vector specifying the numbers of units in 10 | % each layer 11 | % batchsize: Mini-batch size used in each iteration 12 | % eps0: Constant learning rate 13 | % isotropic: True if isotropic structure on Sigma is 14 | % used, otherwise rank-1 structure is used 15 | % OUTPUT 16 | % W_seq: The optimal weights upto the last hidden 17 | % layer 18 | % beta The optimal weights that connect last hidden layer to the output 19 | % mean_sigma2 Estimate of sigma2 20 | % shrinkage_gamma_seq Update of shrinkage parameters over 21 | % iteration 22 | % MSE_DL Mean squared error over iteration 23 | % 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | % Extract training data and settings from input struct 34 | X_val = est.data.Xval; 35 | y_val = est.data.yval; 36 | n_units = est.network; 37 | batchsize = est.batchsize; 38 | lrate = est.lrate; 39 | isotropic = est.isIsotropic; 40 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 41 | tau = est.tau; % Threshold before reducing constant learning rate eps0 42 | grad_weight = est.momentum; % Weight in the momentum 43 | cScale = est.c; % Random scale factor to initialize b,c 44 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 45 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 46 | verbose = est.verbose; 47 | distr = est.dist; 48 | lbFlag = est.lowerbound; % Lowerbound flag 49 | LBwindow = est.windowSize; 50 | seed = est.seed; 51 | 52 | if(~isnan(seed)) 53 | rng(seed) 54 | end 55 | 56 | % Data merge for mini-batch sampling 57 | data = [y_train,X_train]; 58 | datasize = length(y_train); 59 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 60 | 61 | % Network parameters 62 | L = length(n_units); % Number of hidden layers 63 | p = size(X_train,2)-1; % Number of covariates 64 | W_seq = cell(1,L); % Cells to store weight matrices 65 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
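% Worked example (added for clarity; not in the original code, and the sizes
% are hypothetical). With p = 4 covariates and n_units = [10 10]:
%   index_track(1) = 10*(4+1)       = 50    (elements of W1; its first column holds the biases)
%   index_track(2) = 50 + 10*(10+1) = 160   (elements of W1 and W2 together)
% so d_w = 160, d_beta = 10+1 = 11 and d_theta = 171, and the variational
% parameter vector lambda = [mu;b;c] built below has 3*d_theta = 513 entries
% in the non-isotropic case.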
66 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 67 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 68 | w_tilde_index = []; % indices of non-biase weights, excluding W1, for l2-regulization prior 69 | for j = 2:L 70 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 71 | w_tilde_index = [w_tilde_index,(index_track(j-1)+n_units(j)+1):index_track(j)]; 72 | end 73 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 74 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 75 | d_theta = d_w+d_beta; % Total number of parameters 76 | w_tilde_index = [w_tilde_index,(d_w+2:d_theta)]; 77 | d_w_tilde = length(w_tilde_index); 78 | 79 | % Initialise weights and set initial mu equal to initial weights 80 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 81 | weights = nnInitialize(layers); 82 | mu=[]; 83 | for i=1:length(layers)-1 84 | mu=[mu;weights{i}(:)]; 85 | end 86 | % Initialize b and c 87 | % b = normrnd(0,cScale,d_theta,1); 88 | b = cScale*rand(d_theta,1); 89 | if isotropic 90 | c = cScale; 91 | else 92 | c = cScale*ones(d_theta,1); 93 | end 94 | % Initialize lambda 95 | lambda=[mu;b;c]; 96 | 97 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 98 | W_seq{1} = W1; 99 | for j = 2:L 100 | index = index_track(j-1)+1:index_track(j); 101 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 102 | W_seq{j} = Wj; 103 | end 104 | beta = mu(d_w+1:d_theta); 105 | 106 | % Get mini-batch 107 | idx = randperm(datasize,batchsize); 108 | minibatch = data(idx,:); 109 | y = minibatch(:,1); 110 | X = minibatch(:,2:end); 111 | 112 | % Remove this after doing R verison 113 | % X = X_train; 114 | % y = y_train; 115 | 116 | % minibatch = datasample(data,batchsize); 117 | % y = minibatch(:,1); 118 | % X = minibatch(:,2:end); 119 | 120 | % Hyperparameters for inverse-Gamma prior on sigma2 if y~Nomal(0,sigma2) 121 | if(strcmp(distr,'normal')) 122 | alpha0_sigma2 = 10; 123 | beta0_sigma2 = (alpha0_sigma2-1)*std(y); 124 | alpha_sigma2 = alpha0_sigma2 + length(y_train)/2; % Optimal VB parameter for updating sigma2 125 | beta_sigma2 = alpha_sigma2; % Mean_sigma2 and mean_sigma2_inverse are 126 | % Initialised at small values 1/2 and 1 respectively 127 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 128 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 129 | mean_sigma2_save(1) = mean_sigma2; 130 | end 131 | 132 | % Compute prediction loss if not using lowerbound for validation 133 | if(~lbFlag) 134 | if(strcmp(distr,'normal')) 135 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 136 | disp(['Initial MSE: ',num2str(MSE_current)]); 137 | else 138 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 139 | disp(['Initial PPS: ',num2str(PPS_current)]); 140 | end 141 | MSE_DL(1) = MSE_current; 142 | PPS_DL(1) = PPS_current; 143 | end 144 | 145 | % Calculations for group Lasso coefficients 146 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 147 | shrinkage_l2 = .01; % Hype-parameter for L2 prior 148 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 149 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 150 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 151 | if isotropic 152 | for j = 1:p 153 | mean_column_j_tilde = 
mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 154 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 155 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 156 | end 157 | lambda_tau = shrinkage_gamma.^2; 158 | else 159 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 160 | for j = 1:p 161 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 162 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 163 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 164 | end 165 | lambda_tau = shrinkage_gamma.^2; 166 | end 167 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 168 | shrinkage_gamma_seq = shrinkage_gamma; % 169 | mean_tau = 1./mu_tau+1./lambda_tau; 170 | m = n_units(1); 171 | 172 | % Prepare to calculate lowerbound 173 | if(lbFlag) 174 | if(strcmp(distr,'normal')) 175 | const = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)... 176 | -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 177 | -p*gammaln((n_units(1)+1)/2)-0.5*datasize*log(2*pi)... 178 | +p/2*log(2*pi)+0.5*d_theta*log(2*pi)+d_theta/2; 179 | else 180 | const = -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 181 | -p*gammaln((n_units(1)+1)/2)+p/2*log(2*pi)... 182 | +0.5*d_theta*log(2*pi)+d_theta/2; 183 | end 184 | 185 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 186 | W_seq{1} = W1; 187 | for j = 2:L 188 | index = index_track(j-1)+1:index_track(j); 189 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 190 | W_seq{j} = Wj; 191 | end 192 | beta = mu(d_w+1:d_theta); 193 | mu_w_tilde = mu(w_tilde_index); 194 | b_w_tilde = b(w_tilde_index); 195 | c_w_tilde = c(w_tilde_index); 196 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 197 | iter = 1; 198 | vbLowerBound; 199 | % disp(['Initial LB: ',num2str(lb(iter))]); 200 | end 201 | 202 | %% Calcualte for the first iteration 203 | grad_g_lik_store = zeros(S,3*d_theta); 204 | lb_iter = zeros(1,S); 205 | %----------------------------Narutal Gradient (1st Iteration)-------------- 206 | vbGradientLogLB 207 | gradient_bar = gradient_lambda; 208 | if(lbFlag) 209 | lb(iter) = mean(lb_iter)/datasize; 210 | disp(['Initial LB: ',num2str(lb(iter))]); 211 | end 212 | %-------------------------------------------------------------------------- 213 | 214 | 215 | %% Training Phase 216 | % Prepare parameters for training 217 | idxEpoch = 0; % Index of current epoch 218 | iter = 1; % Index of current iteration 219 | stop = false; % Stop flag for early stopping 220 | lambda_best = lambda; % Store optimal lambda for output 221 | idxPatience = 0; % Index of number of consequent non-decreasing iterations 222 | % for early stopping 223 | disp('---------- Training Phase ----------') 224 | while ~stop 225 | iter = iter+1; 226 | 227 | %% ------------------Natural Gradient Calculation---------------------- 228 | % Get mini-batch 229 | idx = randperm(datasize,batchsize); 230 | minibatch = data(idx,:); 231 | y = minibatch(:,1); 232 | X = minibatch(:,2:end); 233 | 234 | % Remove this after doing R verison 235 | % X = X_train; 236 | % y = y_train; 237 | 238 | % minibatch = datasample(data,batchsize); 239 | % y = minibatch(:,1); 240 | % X = minibatch(:,2:end); 241 | 242 | % Calculate expected terms of lowerbound 243 | if(lbFlag) 244 | vbLowerBound; 245 | end 246 | 247 | % Calculate Natural Gradient 248 | vbGradientLogLB 249 | 250 | % Get lowerbound in the current iteration 251 | if(lbFlag) 252 | lb(iter) = mean(lb_iter)/datasize; 253 | end 254 | 
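% Descriptive note (added; not in the original source). The update block that
% follows performs one stochastic gradient-ascent step on the variational
% parameters using the momentum-smoothed natural gradient:
%   gradient_lambda is rescaled whenever its norm exceeds 100 (gradient clipping),
%   gradient_bar = momentum*gradient_bar + (1-momentum)*gradient_lambda,
%   stepsize     = lrate             while iter <= tau,
%                  lrate*tau/iter    afterwards (decaying learning rate),
%   lambda       = lambda + stepsize*gradient_bar.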
%---------------------------------------------------------------------- 255 | 256 | %% ------------------Stochastic gradient ascend update----------------- 257 | % Prevent exploding Gradient 258 | grad_norm = norm(gradient_lambda); 259 | norm_gradient_threshold = 100; 260 | if norm(gradient_lambda)>norm_gradient_threshold 261 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 262 | end 263 | 264 | % Momentum gradient 265 | gradient_bar_old = gradient_bar; 266 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 267 | 268 | % Adaptive learning rate 269 | if iter>tau 270 | stepsize=lrate*tau/iter; 271 | else 272 | stepsize=lrate; 273 | end 274 | 275 | % Gradient ascend 276 | lambda = lambda + stepsize*gradient_bar; 277 | 278 | % Restore model parameters from variational parameter lambda 279 | mu=lambda(1:d_theta,1); 280 | b=lambda(d_theta+1:2*d_theta,1); 281 | c=lambda(2*d_theta+1:end); 282 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 283 | W_seq{1} = W1; 284 | for j = 2:L 285 | index = index_track(j-1)+1:index_track(j); 286 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 287 | W_seq{j} = Wj; 288 | end 289 | beta = mu(d_w+1:d_theta); 290 | %---------------------------------------------------------------------- 291 | 292 | %% ---------------- Update tau and shrinkage parameters---------------- 293 | if mod(iter,1) == 0 294 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 295 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 296 | if isotropic 297 | for j = 1:p 298 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 299 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 300 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 301 | lambda_tau(j) = shrinkage_gamma(j)^2; 302 | end 303 | else 304 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 305 | for j = 1:p 306 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
307 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 308 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 309 | lambda_tau(j) = shrinkage_gamma(j)^2; 310 | end 311 | end 312 | mean_inverse_tau = mu_tau; 313 | mean_tau = 1./mu_tau+1./lambda_tau; 314 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 315 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 316 | 317 | mu_w_tilde = mu(w_tilde_index); 318 | b_w_tilde = b(w_tilde_index); 319 | c_w_tilde = c(w_tilde_index); 320 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 321 | % shrinkage_l2 = length(w_tilde_index)/mean_w_tilde; 322 | end 323 | %---------------------------------------------------------------------- 324 | 325 | %% ------Update VB posterior for sigma2, which is inverse Gamma ------- 326 | % if y ~ N(0,sigma2) 327 | if(strcmp(distr,'normal')) 328 | if (mod(iter,1) == 0) 329 | sum_squared = sumResidualSquared(y_train,X_train,W_seq,beta); 330 | beta_sigma2 = beta0_sigma2+sum_squared/2; 331 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 332 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 333 | mean_sigma2_save = [mean_sigma2_save,mean_sigma2]; 334 | end 335 | end 336 | %---------------------------------------------------------------------- 337 | 338 | %% ----------------------------Validation------------------------------ 339 | % If using lowerbound for validation 340 | if(lbFlag) 341 | % Storing lowerbound moving average values 342 | if (iter>LBwindow) 343 | lb_bar(iter-LBwindow) = mean(lb(iter-LBwindow+1:iter)); 344 | if lb_bar(end)>=max(lb_bar) 345 | lambda_best = lambda; 346 | idxPatience = 0; 347 | else 348 | idxPatience = idxPatience+1; 349 | % disp(['idxPatience: ',num2str(idxPatience)]) 350 | end 351 | end 352 | 353 | % If using MSE/Accuracy for validation 354 | else 355 | if(strcmp(distr,'normal')) 356 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 357 | else 358 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 359 | end 360 | 361 | MSE_DL(iter) = MSE_current; 362 | PPS_DL(iter) = PPS_current; 363 | 364 | if PPS_DL(iter)>=PPS_DL(iter-1) 365 | gradient_bar = gradient_bar_old; 366 | end 367 | 368 | if PPS_DL(iter)<=min(PPS_DL) 369 | lambda_best = lambda; 370 | idxPatience = 0; 371 | else 372 | idxPatience = idxPatience+1; 373 | % disp(['idxPatience: ',num2str(idxPatience)]) 374 | end 375 | end 376 | 377 | % Early stopping 378 | if (idxPatience>patience)||(idxEpoch>epoch) 379 | stop = true; 380 | end 381 | %---------------------------------------------------------------------- 382 | 383 | %% ------------------------------Display------------------------------- 384 | % Display epoch index whenever an epoch is finished 385 | if(~mod(iter,num1Epoch)) 386 | idxEpoch = idxEpoch + 1; 387 | end 388 | 389 | % Display training results after each 'verbose' iteration 390 | if (verbose && ~mod(iter,verbose)) 391 | if(lbFlag) % Display lowerbound 392 | % disp(['Epoch: ',num2str(idxEpoch)]); 393 | 394 | if (iter>LBwindow) 395 | disp(['Epoch: ',num2str(idxEpoch),' - ',... 396 | 'Current LB: ',num2str(lb_bar(iter-LBwindow))]); 397 | else 398 | disp(['Epoch: ',num2str(idxEpoch),' - ',... 
399 | 'Current LB: ',num2str(lb(iter))]); 400 | end 401 | else % Or display MSE/Accuracy 402 | if(strcmp(distr,'binomial')) 403 | disp(['Current PPS: ',num2str(PPS_current)]); 404 | else 405 | disp(['Current MSE: ',num2str(MSE_current)]); 406 | end 407 | end 408 | end 409 | %---------------------------------------------------------------------- 410 | 411 | end 412 | 413 | %% --------------------------Display Training Results---------------------- 414 | disp('---------- Training Completed! ----------') 415 | disp(['Number of iteration:',num2str(iter)]); 416 | if(lbFlag) 417 | disp(['LBBar best: ',num2str(max(lb_bar))]); 418 | else 419 | disp(['PPS best: ',num2str(min(PPS_DL))]); 420 | disp(['MSE best: ',num2str(min(MSE_DL))]); 421 | end 422 | 423 | %% ----------------------Store training output----------------------------- 424 | lambda = lambda_best; 425 | mu = lambda(1:d_theta,1); 426 | b = lambda(d_theta+1:2*d_theta,1); 427 | c = lambda(2*d_theta+1:end); 428 | if isotropic % For isotropic structure 429 | SIGMA = b*b' + c^2*eyes(d_theta); 430 | else 431 | SIGMA = b*b' + diag(c.^2); 432 | end 433 | 434 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 435 | W_seq{1} = W1; 436 | for j = 2:L 437 | index = index_track(j-1)+1:index_track(j); 438 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 439 | W_seq{j} = Wj; 440 | end 441 | beta = mu(d_w+1:d_w+d_beta); 442 | 443 | % Store output in a struct 444 | est.out.weights = W_seq; 445 | est.out.beta = beta; 446 | est.out.shrinkage = shrinkage_gamma_seq; 447 | est.out.iteration = iter; 448 | est.out.vbMU = mu; % Mean of variational distribution of weights 449 | est.out.b = b; 450 | est.out.c = c; 451 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 452 | % of weights 453 | est.out.nparams = d_theta; % Number of parameters 454 | est.out.indexTrack = index_track; 455 | est.out.muTau = mu_tau; 456 | 457 | if(strcmp(distr,'normal')) 458 | est.out.sigma2Alpha = alpha_sigma2; 459 | est.out.sigma2Beta = beta_sigma2; 460 | est.out.sigma2Mean = mean_sigma2_save(end); 461 | est.out.sigma2MeanIter = mean_sigma2_save; 462 | end 463 | 464 | if(lbFlag) 465 | est.out.lbBar = lb_bar(2:end); 466 | est.out.lb = lb; 467 | else 468 | if(strcmp(distr,'binomial')) 469 | est.out.accuracy = MSE_DL; 470 | else 471 | est.out.mse = MSE_DL; 472 | end 473 | est.out.pps = PPS_DL; 474 | end 475 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMTrainTest.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMTrainTest(X_train,y_train,est) 2 | % Traing a fGLM model with continuous reponse y. 3 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 4 | % regularization is put on the rest. sigma2 and tau are updated by 5 | % mean-field VB. 
Inverse gamma prior is used for sigma2 6 | % INPUT 7 | % X_train, y_train: Training data (continuous response) 8 | % X_validation, y_validation: Validation data 9 | % n_units: Vector specifying the numbers of units in 10 | % each layer 11 | % batchsize: Mini-batch size used in each iteration 12 | % eps0: Constant learning rate 13 | % isotropic: True if isotropic structure on Sigma is 14 | % used, otherwise rank-1 structure is used 15 | % OUTPUT 16 | % W_seq: The optimal weights upto the last hidden 17 | % layer 18 | % beta The optimal weights that connect last hidden layer to the output 19 | % mean_sigma2 Estimate of sigma2 20 | % shrinkage_gamma_seq Update of shrinkage parameters over 21 | % iteration 22 | % MSE_DL Mean squared error over iteration 23 | % 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | % Extract training data and settings from input struct 34 | X_val = est.data.Xval; 35 | y_val = est.data.yval; 36 | n_units = est.network; 37 | batchsize = est.batchsize; 38 | lrate = est.lrate; 39 | isotropic = est.isIsotropic; 40 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 41 | tau = est.tau; % Threshold before reducing constant learning rate eps0 42 | grad_weight = est.momentum; % Weight in the momentum 43 | cScale = est.c; % Random scale factor to initialize b,c 44 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 45 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 46 | verbose = est.verbose; 47 | distr = est.dist; 48 | lbFlag = est.lowerbound; % Lowerbound flag 49 | LBwindow = 20; 50 | seed = est.seed; 51 | 52 | if(~isnan(seed)) 53 | rng(seed); 54 | end 55 | 56 | % mu_tau = est.muTau; % Parameters for the auxiliary tau_j 57 | 58 | % Data merge for mini-batch sampling 59 | data = [y_train,X_train]; 60 | datasize = length(y_train); 61 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 62 | 63 | % Network parameters 64 | L = length(n_units); % Number of hidden layers 65 | p = size(X_train,2)-1; % Number of covariates 66 | W_seq = cell(1,L); % Cells to store weight matrices 67 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
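% Descriptive note (added; not in the original source). The variational
% parameter vector used below is lambda = [mu; b; c], where mu (length d_theta)
% is the mean of the Gaussian variational posterior over all network weights
% and the pair (b,c) parameterizes its covariance:
%   SIGMA = b*b' + c^2*I         if isotropic is true (c is a scalar),
%   SIGMA = b*b' + diag(c.^2)    otherwise (c has length d_theta).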
68 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 69 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 70 | w_tilde_index = []; % indices of non-biase weights, excluding W1, for l2-regulization prior 71 | for j = 2:L 72 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 73 | w_tilde_index = [w_tilde_index,(index_track(j-1)+n_units(j)+1):index_track(j)]; 74 | end 75 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 76 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 77 | d_theta = d_w+d_beta; % Total number of parameters 78 | w_tilde_index = [w_tilde_index,(d_w+2:d_theta)]; 79 | d_w_tilde = length(w_tilde_index); 80 | 81 | % Initialise weights and set initial mu equal to initial weights 82 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 83 | weights = nnInitialize(layers); 84 | mu=[]; 85 | for i=1:length(layers)-1 86 | mu=[mu;weights{i}(:)]; 87 | end 88 | % Initialize b and c 89 | b = normrnd(0,cScale,d_theta,1); 90 | if isotropic 91 | c = cScale; 92 | else 93 | c = cScale*ones(d_theta,1); 94 | end 95 | % Initialize lambda 96 | lambda=[mu;b;c]; 97 | 98 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 99 | W_seq{1} = W1; 100 | for j = 2:L 101 | index = index_track(j-1)+1:index_track(j); 102 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 103 | W_seq{j} = Wj; 104 | end 105 | beta = mu(d_w+1:d_theta); 106 | 107 | % if(batchsize~=datasize) 108 | % % Get mini-batch 109 | % idx = randperm(datasize,batchsize); 110 | % minibatch = data(idx,:); 111 | % y = minibatch(:,1); 112 | % X = minibatch(:,2:end); 113 | % else 114 | % y = y_train; 115 | % X = X_train; 116 | % end 117 | 118 | 119 | % % Hyperparameters for inverse-Gamma prior on sigma2 if y~Nomal(0,sigma2) 120 | % if(strcmp(distr,'normal')) 121 | % alpha0_sigma2 = 10; 122 | % beta0_sigma2 = (alpha0_sigma2-1)*std(y); 123 | % alpha_sigma2 = alpha0_sigma2 + length(y_train)/2; % Optimal VB parameter for updating sigma2 124 | % beta_sigma2 = alpha_sigma2; % Mean_sigma2 and mean_sigma2_inverse are 125 | % % Initialised at small values 1/2 and 1 respectively 126 | % mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 127 | % mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 128 | % mean_sigma2_save(1) = mean_sigma2; 129 | % end 130 | 131 | % Compute prediction loss if not using lowerbound for validation 132 | if(~lbFlag) 133 | if(strcmp(distr,'normal')) 134 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 135 | disp(['Initial MSE: ',num2str(MSE_current)]); 136 | else 137 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 138 | disp(['Initial PPS: ',num2str(PPS_current)]); 139 | end 140 | MSE_DL(1) = MSE_current; 141 | PPS_DL(1) = PPS_current; 142 | end 143 | 144 | % Calculations for group Lasso coefficients 145 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 146 | shrinkage_l2 = .01; % Hype-parameter for L2 prior 147 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 148 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 149 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 150 | if isotropic 151 | for j = 1:p 152 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
153 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 154 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 155 | end 156 | lambda_tau = shrinkage_gamma.^2; 157 | else 158 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 159 | for j = 1:p 160 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 161 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 162 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 163 | end 164 | lambda_tau = shrinkage_gamma.^2; 165 | end 166 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 167 | shrinkage_gamma_seq = shrinkage_gamma; % 168 | mean_tau = 1./mu_tau+1./lambda_tau; 169 | m = n_units(1); 170 | 171 | % Prepare to calculate lowerbound 172 | if(lbFlag) 173 | if(strcmp(distr,'normal')) 174 | const = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)... 175 | -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 176 | -p*gammaln((n_units(1)+1)/2)-0.5*batchsize*log(2*pi)... 177 | +p/2*log(2*pi)+0.5*d_theta*log(2*pi)+d_theta/2; 178 | else 179 | const = -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 180 | -p*gammaln((n_units(1)+1)/2)+p/2*log(2*pi)... 181 | +0.5*d_theta*log(2*pi)+d_theta/2; 182 | end 183 | 184 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 185 | W_seq{1} = W1; 186 | for j = 2:L 187 | index = index_track(j-1)+1:index_track(j); 188 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 189 | W_seq{j} = Wj; 190 | end 191 | beta = mu(d_w+1:d_theta); 192 | mu_w_tilde = mu(w_tilde_index); 193 | b_w_tilde = b(w_tilde_index); 194 | c_w_tilde = c(w_tilde_index); 195 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 196 | iter = 1; 197 | vbLowerBound; 198 | % disp(['Initial LB: ',num2str(lb(iter))]); 199 | end 200 | 201 | %% Calcualte for the first iteration 202 | grad_g_lik_store = zeros(S,3*d_theta); 203 | lb_iter = zeros(1,S); 204 | %----------------------------Narutal Gradient (1st Iteration)-------------- 205 | if(batchsize~=datasize) 206 | % Get mini-batch 207 | idx = randperm(datasize,batchsize); 208 | minibatch = data(idx,:); 209 | y = minibatch(:,1); 210 | X = minibatch(:,2:end); 211 | else 212 | y = y_train; 213 | X = X_train; 214 | end 215 | 216 | vbGradientLogLB 217 | gradient_bar = gradient_lambda; 218 | if(lbFlag) 219 | lb(iter) = mean(lb_iter)/batchsize; 220 | disp(['Initial LB: ',num2str(lb(iter))]); 221 | end 222 | %-------------------------------------------------------------------------- 223 | 224 | 225 | %% Training Phase 226 | % Prepare parameters for training 227 | idxEpoch = 0; % Index of current epoch 228 | iter = 1; % Index of current iteration 229 | stop = false; % Stop flag for early stopping 230 | lambda_best = lambda; % Store optimal lambda for output 231 | idxPatience = 0; % Index of number of consequent non-decreasing iterations 232 | % for early stopping 233 | disp('---------- Training Phase ----------') 234 | while ~stop 235 | iter = iter+1; 236 | 237 | %% ------------------Natural Gradient Calculation---------------------- 238 | if(batchsize~=datasize) 239 | % Get mini-batch 240 | idx = randperm(datasize,batchsize); 241 | minibatch = data(idx,:); 242 | y = minibatch(:,1); 243 | X = minibatch(:,2:end); 244 | else 245 | y = y_train; 246 | X = X_train; 247 | end 248 | 249 | % Calculate expected terms of lowerbound 250 | if(lbFlag) 251 | vbLowerBound; 252 | end 253 | 254 | % Calculate Natural Gradient 255 | vbGradientLogLB 256 | 257 | % Get lowerbound in the current iteration 258 | 
if(lbFlag) 259 | lb(iter) = mean(lb_iter)/batchsize; 260 | end 261 | %---------------------------------------------------------------------- 262 | 263 | %% ------------------Stochastic gradient ascend update----------------- 264 | % Prevent exploding Gradient 265 | grad_norm = norm(gradient_lambda); 266 | norm_gradient_threshold = 100; 267 | if norm(gradient_lambda)>norm_gradient_threshold 268 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 269 | end 270 | 271 | % Momentum gradient 272 | gradient_bar_old = gradient_bar; 273 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 274 | 275 | % Adaptive learning rate 276 | if iter>tau 277 | stepsize=lrate*tau/iter; 278 | else 279 | stepsize=lrate; 280 | end 281 | 282 | % Gradient ascend 283 | lambda=lambda+stepsize*gradient_bar; 284 | 285 | % Restore model parameters from variational parameter lambda 286 | mu=lambda(1:d_theta,1); 287 | b=lambda(d_theta+1:2*d_theta,1); 288 | c=lambda(2*d_theta+1:end); 289 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 290 | W_seq{1} = W1; 291 | for j = 2:L 292 | index = index_track(j-1)+1:index_track(j); 293 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 294 | W_seq{j} = Wj; 295 | end 296 | beta = mu(d_w+1:d_theta); 297 | %---------------------------------------------------------------------- 298 | 299 | %% ---------------- Update tau and shrinkage parameters---------------- 300 | if mod(iter,1) == 0 301 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 302 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 303 | if isotropic 304 | for j = 1:p 305 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 306 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 307 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 308 | lambda_tau(j) = shrinkage_gamma(j)^2; 309 | end 310 | else 311 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 312 | for j = 1:p 313 | mean_column_j_tilde(j) = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
314 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 315 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde(j)); 316 | lambda_tau(j) = shrinkage_gamma(j)^2; 317 | end 318 | end 319 | mean_inverse_tau = mu_tau; 320 | mean_tau = 1./mu_tau+1./lambda_tau; 321 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 322 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 323 | 324 | mu_w_tilde = mu(w_tilde_index); 325 | b_w_tilde = b(w_tilde_index); 326 | c_w_tilde = c(w_tilde_index); 327 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 328 | shrinkage_l2 = length(w_tilde_index)/mean_w_tilde; 329 | end 330 | %---------------------------------------------------------------------- 331 | 332 | %% ------Update VB posterior for sigma2, which is inverse Gamma ------- 333 | % if y ~ N(0,sigma2) 334 | if(strcmp(distr,'normal')) 335 | if (mod(iter,1) == 0) 336 | sum_squared = sumResidualSquared(y_train,X_train,W_seq,beta); 337 | beta_sigma2 = beta0_sigma2+sum_squared/2; 338 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 339 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 340 | mean_sigma2_save = [mean_sigma2_save,mean_sigma2]; 341 | end 342 | end 343 | %---------------------------------------------------------------------- 344 | 345 | %% ----------------------------Validation------------------------------ 346 | % Update lowerbound 347 | % vbLowerBound; 348 | 349 | % If using lowerbound for validation 350 | if(lbFlag) 351 | % Storing lowerbound moving average values 352 | if (iter>LBwindow) 353 | lb_bar(iter-LBwindow) = mean(lb(iter-LBwindow+1:iter)); 354 | if lb_bar(end)>=max(lb_bar) 355 | lambda_best = lambda; 356 | idxPatience = 0; 357 | else 358 | idxPatience = idxPatience+1; 359 | disp(['idxPatience: ',num2str(idxPatience)]) 360 | end 361 | end 362 | 363 | % If using MSE/Accuracy for validation 364 | else 365 | if(strcmp(distr,'normal')) 366 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 367 | else 368 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr); 369 | end 370 | 371 | MSE_DL(iter) = MSE_current; 372 | PPS_DL(iter) = PPS_current; 373 | 374 | if PPS_DL(iter)>=PPS_DL(iter-1) 375 | gradient_bar = gradient_bar_old; 376 | end 377 | 378 | if PPS_DL(iter)<=min(PPS_DL) 379 | lambda_best = lambda; 380 | idxPatience = 0; 381 | else 382 | idxPatience = idxPatience+1; 383 | disp(['idxPatience: ',num2str(idxPatience)]) 384 | end 385 | end 386 | 387 | % Early stopping 388 | if (idxPatience>patience)||(idxEpoch>epoch) 389 | stop = true; 390 | end 391 | %---------------------------------------------------------------------- 392 | 393 | %% ------------------------------Display------------------------------- 394 | % Display epoch index whenever an epoch is finished 395 | if(~mod(iter,num1Epoch)) 396 | idxEpoch = idxEpoch + 1; 397 | disp(['Epoch: ',num2str(idxEpoch)]); 398 | end 399 | 400 | % Display training results after each 'verbose' iteration 401 | if (verbose && ~mod(iter,verbose)) 402 | if(lbFlag) % Display lowerbound 403 | disp(['Current LB: ',num2str(lb(iter))]); 404 | if (iter>LBwindow) 405 | disp(['Current LB: ',num2str(lb_bar(iter-LBwindow))]); 406 | end 407 | else % Or display MSE/Accuracy 408 | if(strcmp(distr,'binomial')) 409 | disp(['Current PPS: ',num2str(PPS_current)]); 410 | else 411 | disp(['Current MSE: ',num2str(MSE_current)]); 412 | end 413 | end 414 | end 415 | %---------------------------------------------------------------------- 416 | 417 | 
end 418 | 419 | %% --------------------------Display Training Results---------------------- 420 | disp('---------- Training Completed! ----------') 421 | disp(['Number of iteration:',num2str(iter)]); 422 | if(lbFlag) 423 | disp(['LBBar best: ',num2str(max(lb_bar))]); 424 | else 425 | disp(['PPS best: ',num2str(min(PPS_DL))]); 426 | disp(['MSE best: ',num2str(min(MSE_DL))]); 427 | end 428 | 429 | %% ----------------------Store training output----------------------------- 430 | lambda = lambda_best; 431 | mu = lambda(1:d_theta,1); 432 | b = lambda(d_theta+1:2*d_theta,1); 433 | c = lambda(2*d_theta+1:end); 434 | if isotropic % For isotropic structure 435 | SIGMA = b*b' + c^2*eyes(d_theta); 436 | else 437 | SIGMA = b*b' + diag(c.^2); 438 | end 439 | 440 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 441 | W_seq{1} = W1; 442 | for j = 2:L 443 | index = index_track(j-1)+1:index_track(j); 444 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 445 | W_seq{j} = Wj; 446 | end 447 | beta = mu(d_w+1:d_w+d_beta); 448 | 449 | % Store output in a struct 450 | est.out.weights = W_seq; 451 | est.out.beta = beta; 452 | est.out.shrinkage = shrinkage_gamma_seq; 453 | est.out.iteration = iter; 454 | est.out.vbMU = mu; % Mean of variational distribution of weights 455 | est.out.b = b; 456 | est.out.c = c; 457 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 458 | % of weights 459 | est.out.nparams = d_theta; % Number of parameters 460 | est.out.indexTrack = index_track; 461 | est.out.muTau = mu_tau; 462 | 463 | if(strcmp(distr,'normal')) 464 | est.out.sigma2Alpha = alpha_sigma2; 465 | est.out.sigma2Beta = beta_sigma2; 466 | est.out.sigma2Mean = mean_sigma2_save(end); 467 | est.out.sigma2MeanIter = mean_sigma2_save; 468 | end 469 | 470 | if(lbFlag) 471 | est.out.lbBar = lb_bar(2:end); 472 | est.out.lb = lb; 473 | else 474 | if(strcmp(distr,'binomial')) 475 | est.out.accuracy = MSE_DL; 476 | else 477 | est.out.mse = MSE_DL; 478 | end 479 | est.out.pps = PPS_DL; 480 | end 481 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMfit.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/DeepGLM/train/deepGLMfit.m -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMlogitPoisson.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMlogitPoisson(X_train,y_train,est) 2 | %DEEPGLMBINOMIAL Summary of this function goes here 3 | % Traing a fGLM model with binary reponse y. 4 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 5 | % regularization is put on the rest. sigma2 and tau are updated by 6 | % mean-field VB. 
Inverse gamma prior is used for sigma2 7 | % INPUT 8 | % X_train, y_train: training data (continuous response) 9 | % X_validation, y_validation: validation data 10 | % n_units: vector specifying the numbers of units in 11 | % each layer 12 | % batchsize: mini-batch size used in each iteration 13 | % eps0: constant learning rate 14 | % isotropic: true if isotropic structure on Sigma is 15 | % used, otherwise rank-1 structure is used 16 | % OUTPUT 17 | % W_seq: the optimal weights upto the last hidden 18 | % layer 19 | % beta the optimal weights that connect last hidden layer to the output 20 | % mean_sigma2 estimate of sigma2 21 | % shrinkage_gamma_seq update of shrinkage parameters over 22 | % iteration 23 | % 24 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 25 | % Nguyen (nghia.nguyen@sydney.edu.au) 26 | % 27 | % http://www.xxx.com 28 | % 29 | % Version: 1.0 30 | % LAST UPDATE: April, 2018 31 | 32 | % Extract training data and settings from input struct 33 | Xval = est.data.Xval; 34 | yval = est.data.yval; 35 | n_units = est.network; 36 | batchsize = est.batchsize; 37 | lrate = est.lrate; 38 | isotropic = est.isIsotropic; 39 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 40 | tau = est.tau; % Threshold before reducing constant learning rate eps0 41 | grad_weight = est.momentum; % Weight in the momentum 42 | cScale = est.c; % Random scale factor to initialize b,c 43 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 44 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 45 | verbose = est.verbose; 46 | distr = est.dist; 47 | 48 | % Data merge for mini-batch sampling 49 | data = [y_train,X_train]; 50 | datasize = length(y_train); 51 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 52 | 53 | % Network parameters 54 | L = length(n_units); % Number of hidden layers 55 | p = size(X_train,2)-1; % Number of covariates 56 | W_seq = cell(1,L); % Cells to store weight matrices 57 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
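% Descriptive note (added; not in the original source). In the Monte Carlo
% loops further down, each of the S samples draws a weight vector by the
% reparameterization
%   theta = mu + epsilon1*b + c.*epsilon2,  epsilon1 ~ N(0,1), epsilon2 ~ N(0,I),
% so that theta ~ N(mu, b*b' + diag(c.^2)) in the non-isotropic case; the S
% per-sample gradients are then averaged to estimate the lower-bound gradient.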
58 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 59 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 60 | w_tilde_index = []; % indices of non-biase weights, excluding W1, for l2-regulization prior 61 | for j = 2:L 62 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 63 | w_tilde_index = [w_tilde_index,(index_track(j-1)+n_units(j)+1):index_track(j)]; 64 | end 65 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 66 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 67 | d_theta = d_w+d_beta; % Total number of parameters 68 | w_tilde_index = [w_tilde_index,(d_w+2:d_theta)]; 69 | d_w_tilde = length(w_tilde_index); 70 | 71 | % Initialise weights and set initial mu equal to initial weights 72 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 73 | weights = nnInitialize(layers); 74 | mu=[]; 75 | for i=1:length(layers)-1 76 | mu=[mu;weights{i}(:)]; 77 | end 78 | % Initialize b and c 79 | b = normrnd(0,cScale,d_theta,1); 80 | if isotropic 81 | c = cScale; 82 | else 83 | c = cScale*ones(d_theta,1); 84 | end 85 | % Initialize lambda 86 | lambda=[mu;b;c]; 87 | 88 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 89 | W_seq{1} = W1; 90 | for j = 2:L 91 | index = index_track(j-1)+1:index_track(j); 92 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 93 | W_seq{j} = Wj; 94 | end 95 | beta = mu(d_w+1:d_theta); 96 | 97 | [Loss_current,Accuracy_current] = deepGLMpredictLoss(Xval,yval,W_seq,beta,distr); % compute prediction loss 98 | disp(['Initial loss: ',num2str(Loss_current)]); 99 | PPS_DL(1) = Loss_current; 100 | Accuracy_DL(1) = Accuracy_current; 101 | 102 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 103 | shrinkage_l2 = .01; % Hype-parameter for L2 prior 104 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 105 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 106 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 107 | if isotropic 108 | for j = 1:p 109 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 110 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 111 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 112 | end 113 | lambda_tau = shrinkage_gamma.^2; 114 | else 115 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 116 | for j = 1:p 117 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
118 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 119 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 120 | end 121 | lambda_tau = shrinkage_gamma.^2; 122 | end 123 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 124 | shrinkage_gamma_seq = shrinkage_gamma; % 125 | 126 | minibatch = datasample(data,batchsize); 127 | y = minibatch(:,1); 128 | X = minibatch(:,2:end); 129 | 130 | rqmc = normrnd(0,1,S,d_theta+1); % using quasi MC random numbers 131 | for s=1:S 132 | U_normal = rqmc(s,:)'; 133 | epsilon1=U_normal(1); 134 | epsilon2=U_normal(2:end); 135 | theta=mu+epsilon1*b+c.*epsilon2; 136 | 137 | W_seq = cell(1,L); 138 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 139 | W_seq{1} = W1; 140 | W1_tilde = W1(:,2:end); % weights without biases 141 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 142 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:)]; 143 | for j = 2:L 144 | index = index_track(j-1)+1:index_track(j); 145 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 146 | W_seq{j} = Wj; 147 | Wj_tilde = Wj(:,2:end); 148 | grad_prior_Wj = [zeros(n_units(j),1);-shrinkage_l2*Wj_tilde(:)]; 149 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_Wj]; 150 | end 151 | beta = theta(d_w+1:d_theta); 152 | beta_tilde = beta(2:end); % vector beta without intercept 153 | grad_prior_beta = [0;-shrinkage_l2*beta_tilde]; 154 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_beta]; 155 | 156 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 157 | 158 | grad_h = grad_prior_w_beta+grad_llh; % gradient of log prior plus log-likelihood 159 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 160 | grad_theta = grad_h-grad_log_q; 161 | 162 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 163 | end 164 | grad_lb = (mean(grad_g_lik_store))'; 165 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 166 | gradient_bar = gradient_lambda; 167 | 168 | % Prepare parameters for training 169 | idxEpoch = 0; % index of current epoch 170 | iter = 1; % index of current iteration 171 | stop = false; % Stop flag for early stopping 172 | lambda_best = lambda; % Store optimal lambda for output 173 | idxPatience = 0; % index of number of consequent non-decreasing iterations 174 | % for early stopping 175 | disp('---------- Training Phase ----------') 176 | while ~stop 177 | iter = iter+1; 178 | % Display training results after each 'verbose' iteration 179 | if (verbose && ~mod(iter,verbose)) 180 | if(~mod(iter,num1Epoch)) 181 | idxEpoch = idxEpoch + 1; 182 | disp(['Epoch: ',num2str(idxEpoch)]); 183 | end 184 | if(strcmp(distr,'binomial')) 185 | disp(['Current PPS: ',num2str(Loss_current)]); 186 | else 187 | disp(['Current MSE: ',num2str(Accuracy_current)]); 188 | end 189 | end 190 | 191 | minibatch = datasample(data,batchsize); 192 | y = minibatch(:,1); 193 | X = minibatch(:,2:end); 194 | 195 | rqmc = normrnd(0,1,S,d_theta+1); % using quasi MC random numbers 196 | for s=1:S 197 | U_normal = rqmc(s,:)'; 198 | epsilon1=U_normal(1); 199 | epsilon2=U_normal(2:end); 200 | theta=mu+b*epsilon1+c.*epsilon2; 201 | 202 | W_seq = cell(1,L); 203 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 204 | W_seq{1} = W1; 205 | W1_tilde = W1(:,2:end); % weights without biases 206 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 207 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:)]; 208 | for j = 2:L 209 | index = index_track(j-1)+1:index_track(j); 210 | Wj = 
reshape(theta(index),n_units(j),n_units(j-1)+1); 211 | W_seq{j} = Wj; 212 | Wj_tilde = Wj(:,2:end); 213 | grad_prior_Wj = [zeros(n_units(j),1);-shrinkage_l2*Wj_tilde(:)]; 214 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_Wj]; 215 | end 216 | beta = theta(d_w+1:d_w+d_beta); 217 | beta_tilde = beta(2:end); % vector beta without intercept 218 | grad_prior_beta = [0;-shrinkage_l2*beta_tilde]; 219 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_beta]; 220 | 221 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 222 | grad_h = grad_prior_w_beta+grad_llh; 223 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 224 | grad_theta = grad_h-grad_log_q; 225 | 226 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 227 | end 228 | grad_lb = (mean(grad_g_lik_store))'; 229 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 230 | 231 | grad_norm = norm(gradient_lambda); 232 | norm_gradient_threshold = 100; 233 | if norm(gradient_lambda)>norm_gradient_threshold 234 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 235 | end 236 | 237 | gradient_bar_old = gradient_bar; 238 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 239 | 240 | if iter>tau 241 | stepsize=lrate*tau/iter; 242 | else 243 | stepsize=lrate; 244 | end 245 | 246 | lambda=lambda+stepsize*gradient_bar; 247 | 248 | mu=lambda(1:d_theta,1); 249 | b=lambda(d_theta+1:2*d_theta,1); 250 | c=lambda(2*d_theta+1:end); 251 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 252 | W_seq{1} = W1; 253 | for j = 2:L 254 | index = index_track(j-1)+1:index_track(j); 255 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 256 | W_seq{j} = Wj; 257 | end 258 | beta = mu(d_w+1:d_theta); 259 | 260 | % update tau and shrinkage parameters 261 | if mod(iter,5) == 0 262 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 263 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 264 | if isotropic 265 | for j = 1:p 266 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 267 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 268 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 269 | lambda_tau(j) = shrinkage_gamma(j)^2; 270 | end 271 | else 272 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 273 | for j = 1:p 274 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
275 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 276 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 277 | lambda_tau(j) = shrinkage_gamma(j)^2; 278 | end 279 | end 280 | mean_inverse_tau = mu_tau; 281 | mean_tau = 1./mu_tau+1./lambda_tau; 282 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 283 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 284 | 285 | mu_w_tilde = mu(w_tilde_index); 286 | b_w_tilde = b(w_tilde_index); 287 | c_w_tilde = c(w_tilde_index); 288 | mean_w_tilde = mu_w_tilde'*mu_w_tilde+b_w_tilde'*b_w_tilde+sum(c_w_tilde.^2); 289 | % shrinkage_l2 = length(w_tilde_index)/mean_w_tilde; 290 | end 291 | 292 | 293 | [Loss_current,Accuracy_current] = deepGLMpredictLoss(Xval,yval,W_seq,beta,distr); % compute prediction loss 294 | PPS_DL(iter) = Loss_current; 295 | Accuracy_DL(iter) = Accuracy_current; 296 | 297 | if PPS_DL(iter)>=PPS_DL(iter-1) 298 | gradient_bar = gradient_bar_old; 299 | end 300 | 301 | if PPS_DL(iter)<=min(PPS_DL) 302 | lambda_best = lambda; 303 | idxPatience = 0; 304 | idxbest = iter; 305 | else 306 | idxPatience = idxPatience+1; 307 | end 308 | 309 | if (idxPatience>patience)||(idxEpoch>epoch) 310 | stop = true; 311 | end 312 | end 313 | 314 | % Showing that training phase has completed 315 | disp('---------- Training Completed! ----------') 316 | disp(['Number of iteration:',num2str(iter)]); 317 | disp(['PPS best: ',num2str(min(PPS_DL))]); 318 | if (strcmp(distr,'poisson')) 319 | disp(['MSE best: ',num2str(min(Accuracy_DL))]); 320 | est.out.mse = Accuracy_DL; 321 | else 322 | disp(['Accuracy best: ',num2str(max(Accuracy_DL))]); 323 | est.out.accuracy = Accuracy_DL; 324 | end 325 | % Extract mode of weights to make prediction 326 | lambda = lambda_best; 327 | mu = lambda(1:d_theta,1); 328 | b = lambda(d_theta+1:2*d_theta,1); 329 | c = lambda(2*d_theta+1:end); 330 | if isotropic % For isotropic structure 331 | SIGMA = b*b' + c^2*eyes(d_theta); 332 | else 333 | SIGMA = b*b' + diag(c.^2); 334 | end 335 | 336 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 337 | W_seq{1} = W1; 338 | for j = 2:L 339 | index = index_track(j-1)+1:index_track(j); 340 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 341 | W_seq{j} = Wj; 342 | end 343 | beta = mu(d_w+1:d_w+d_beta); 344 | 345 | % Store output in a struct 346 | est.out.weights = W_seq; 347 | est.out.beta = beta; 348 | est.out.shrinkage = shrinkage_gamma_seq; 349 | est.out.iteration = iter; 350 | est.out.pps = PPS_DL; 351 | est.out.vbMU = mu; % Mean of variational distribution of weights 352 | est.out.b = b; 353 | est.out.c = c; 354 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 355 | % of weights 356 | est.out.nparams = d_theta; % Number of parameters 357 | est.out.indexTrack = index_track; 358 | est.out.idxBest = idxbest; 359 | end 360 | 361 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMnormalCV.m: -------------------------------------------------------------------------------- 1 | function est = deepGLMnormalCV(X_train,y_train,est) 2 | % Traing a fGLM model with continuous reponse y. 3 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 4 | % regularization is put on the rest. sigma2 and tau are updated by 5 | % mean-field VB. 
Inverse gamma prior is used for sigma2 6 | % INPUT 7 | % X_train, y_train: Training data (continuous response) 8 | % X_validation, y_validation: Validation data 9 | % n_units: Vector specifying the numbers of units in 10 | % each layer 11 | % batchsize: Mini-batch size used in each iteration 12 | % eps0: Constant learning rate 13 | % isotropic: True if isotropic structure on Sigma is 14 | % used, otherwise rank-1 structure is used 15 | % OUTPUT 16 | % W_seq: The optimal weights upto the last hidden 17 | % layer 18 | % beta The optimal weights that connect last hidden layer to the output 19 | % mean_sigma2 Estimate of sigma2 20 | % shrinkage_gamma_seq Update of shrinkage parameters over 21 | % iteration 22 | % MSE_DL Mean squared error over iteration 23 | % 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | % Extract training data and settings from input struct 34 | X_val = est.data.Xval; 35 | y_val = est.data.yval; 36 | n_units = est.network; 37 | batchsize = est.batchsize; 38 | lrate = est.lrate; 39 | isotropic = est.isIsotropic; 40 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 41 | tau = est.tau; % Threshold before reducing constant learning rate eps0 42 | grad_weight = est.momentum; % Weight in the momentum 43 | cScale = est.c; % Random scale factor to initialize b,c 44 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 45 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 46 | verbose = est.verbose; 47 | distr = est.dist; 48 | 49 | % Data merge for mini-batch sampling 50 | data = [y_train,X_train]; 51 | datasize = length(y_train); 52 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 53 | 54 | % Network parameters 55 | L = length(n_units); % Number of hidden layers 56 | p = size(X_train,2)-1; % Number of covariates 57 | W_seq = cell(1,L); % Cells to store weight matrices 58 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
59 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 60 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 61 | for j = 2:L 62 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 63 | end 64 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 65 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 66 | d_theta = d_w+d_beta; % Total number of parameters 67 | 68 | % Initialise weights and set initial mu equal to initial weights 69 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 70 | weights = nnInitialize(layers); 71 | mu=[]; 72 | for i=1:length(layers)-1 73 | mu=[mu;weights{i}(:)]; 74 | end 75 | % Initialize b and c 76 | b = normrnd(0,cScale,d_theta,1); 77 | if isotropic 78 | c = cScale; 79 | else 80 | c = cScale*ones(d_theta,1); 81 | end 82 | % Initialize lambda 83 | lambda=[mu;b;c]; 84 | 85 | % Hyperparameters for inverse-Gamma prior on sigma2 86 | alpha0_sigma2 = 0; 87 | beta0_sigma2 = 0; 88 | 89 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 90 | W_seq{1} = W1; 91 | for j = 2:L 92 | index = index_track(j-1)+1:index_track(j); 93 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 94 | W_seq{j} = Wj; 95 | end 96 | beta = mu(d_w+1:d_theta); 97 | alpha_sigma2 = alpha0_sigma2+length(y_train)/2; % Optimal VB parameter for updating sigma2 98 | beta_sigma2 = alpha_sigma2; % Mean_sigma2 and mean_sigma2_inverse are 99 | % Initialised at small values 1/2 and 1 respectively 100 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 101 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 102 | mean_sigma2_save(1) = mean_sigma2; 103 | % Compute prediction loss 104 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 105 | disp(['Initial MSE: ',num2str(MSE_current)]); 106 | MSE_DL(1) = MSE_current; 107 | PPS_DL(1) = PPS_current; 108 | 109 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 110 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 111 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 112 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 113 | if isotropic 114 | for j = 1:p 115 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 116 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 117 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 118 | end 119 | lambda_tau = shrinkage_gamma.^2; 120 | else 121 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 122 | for j = 1:p 123 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
124 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 125 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 126 | end 127 | lambda_tau = shrinkage_gamma.^2; 128 | end 129 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 130 | shrinkage_gamma_seq = shrinkage_gamma; % 131 | 132 | minibatch = datasample(data,batchsize); 133 | y = minibatch(:,1); 134 | X = minibatch(:,2:end); 135 | 136 | rqmc = normrnd_qmc(S,d_theta+1); % Using quasi MC random numbers 137 | % grad_g_lik_store = zeros(S,) 138 | for s=1:S 139 | U_normal = rqmc(s,:)'; 140 | epsilon1=U_normal(1); 141 | epsilon2=U_normal(2:end); 142 | theta=mu+epsilon1*b+c.*epsilon2; 143 | 144 | W_seq = cell(1,L); 145 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 146 | W_seq{1} = W1; 147 | for j = 2:L 148 | index = index_track(j-1)+1:index_track(j); 149 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 150 | W_seq{j} = Wj; 151 | end 152 | beta = theta(d_w+1:d_theta); 153 | 154 | W1_tilde = W1(:,2:end); % Weights without biases 155 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 156 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 157 | 158 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse); 159 | 160 | grad_h = grad_prior_w_beta+grad_llh; % Gradient of log prior plus log-likelihood 161 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 162 | grad_theta = grad_h-grad_log_q; 163 | 164 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 165 | end 166 | grad_lb = (mean(grad_g_lik_store))'; 167 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 168 | gradient_bar = gradient_lambda; 169 | 170 | % Prepare parameters for training 171 | idxEpoch = 0; % Index of current epoch 172 | iter = 1; % Index of current iteration 173 | stop = false; % Stop flag for early stopping 174 | lambda_best = lambda; % Store optimal lambda for output 175 | idxPatience = 0; % Index of number of consequent non-decreasing iterations 176 | % for early stopping 177 | disp('---------- Training Phase ----------') 178 | while ~stop 179 | iter = iter+1; 180 | % Display training results after each 'verbose' iteration 181 | if (verbose && ~mod(iter,verbose)) 182 | if(~mod(iter,num1Epoch)) 183 | idxEpoch = idxEpoch + 1; 184 | disp(['Epoch: ',num2str(idxEpoch)]); 185 | end 186 | disp(['Current MSE: ',num2str(MSE_current)]); 187 | end 188 | % minibatch = datasample(data,batchsize); 189 | for k=1:5 190 | [X,y,X_val,y_val] = splitData(X,y,k,5); 191 | rqmc = normrnd_qmc(S,d_theta+1); 192 | for s=1:S 193 | U_normal = rqmc(s,:)'; 194 | epsilon1=U_normal(1); 195 | epsilon2=U_normal(2:end); 196 | theta=mu+b*epsilon1+c.*epsilon2; 197 | 198 | W_seq = cell(1,L); 199 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 200 | W_seq{1} = W1; 201 | for j = 2:L 202 | index = index_track(j-1)+1:index_track(j); 203 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 204 | W_seq{j} = Wj; 205 | end 206 | beta = theta(d_w+1:d_w+d_beta); 207 | 208 | W1_tilde = W1(:,2:end); % Weights without biases 209 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 210 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 211 | 212 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse); 213 | 214 | grad_h = grad_prior_w_beta+grad_llh; 215 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 216 | grad_theta = grad_h-grad_log_q; 217 | 218 | grad_g_lik_store(s,:) 
= [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 219 | end 220 | grad_lb = (mean(grad_g_lik_store))'; 221 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 222 | 223 | grad_norm = norm(gradient_lambda); 224 | norm_gradient_threshold = 100; 225 | if norm(gradient_lambda)>norm_gradient_threshold 226 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 227 | end 228 | 229 | gradient_bar_old = gradient_bar; 230 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 231 | 232 | if iter>tau 233 | stepsize=lrate*tau/iter; 234 | else 235 | stepsize=lrate; 236 | end 237 | 238 | lambda=lambda+stepsize*gradient_bar; 239 | 240 | mu=lambda(1:d_theta,1); 241 | b=lambda(d_theta+1:2*d_theta,1); 242 | c=lambda(2*d_theta+1:end); 243 | 244 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 245 | W_seq{1} = W1; 246 | for j = 2:L 247 | index = index_track(j-1)+1:index_track(j); 248 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 249 | W_seq{j} = Wj; 250 | end 251 | beta = mu(d_w+1:d_theta); 252 | 253 | % Update tau and shrinkage parameters 254 | % if mod(iter,10) == 0 255 | if mod(iter,1) == 0 256 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 257 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 258 | if isotropic 259 | for j = 1:p 260 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 261 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 262 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 263 | lambda_tau(j) = shrinkage_gamma(j)^2; 264 | end 265 | else 266 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 267 | for j = 1:p 268 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 269 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 270 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 271 | lambda_tau(j) = shrinkage_gamma(j)^2; 272 | end 273 | end 274 | mean_inverse_tau = mu_tau; 275 | mean_tau = 1./mu_tau+1./lambda_tau; 276 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 277 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 278 | end 279 | 280 | % Update VB posterior for sigma2, which is inverse Gamma 281 | % if (iter >= 1000)&&(mod(iter,100) == 0) 282 | if (iter >= 1)&&(mod(iter,1) == 0) 283 | beta_sigma2 = beta0_sigma2+sumResidualSquared(y_train,X_train,W_seq,beta)/2; 284 | % beta_sigma2 = beta0_sigma2+sumResidualSquared(y,X,W_seq,beta)/2; 285 | mean_sigma2_inverse = alpha_sigma2/beta_sigma2; 286 | mean_sigma2 = beta_sigma2/(alpha_sigma2-1); 287 | mean_sigma2_save = [mean_sigma2_save,mean_sigma2]; 288 | end 289 | [PPS_current,MSE_current] = deepGLMpredictLoss(X_val,y_val,W_seq,beta,distr,mean_sigma2); 290 | end 291 | 292 | MSE_DL(iter) = MSE_current; 293 | PPS_DL(iter) = PPS_current; 294 | 295 | if MSE_DL(iter)>=MSE_DL(iter-1) 296 | gradient_bar = gradient_bar_old; 297 | end 298 | 299 | if MSE_DL(iter)<=min(MSE_DL) 300 | lambda_best = lambda; 301 | idxPatience = 0; 302 | idxbest = iter; 303 | else 304 | idxPatience = idxPatience+1; 305 | end 306 | 307 | if (idxPatience>patience)||(idxEpoch>epoch) 308 | stop = true; 309 | end 310 | end 311 | disp('---------- Training Completed! 
----------') 312 | disp(['Number of iteration:',num2str(iter)]); 313 | disp(['PPS best: ',num2str(min(PPS_DL))]); 314 | disp(['MSE best: ',num2str(min(MSE_DL))]); 315 | % disp(['Sigma best: ',num2str(sqrt(mean_sigma2_save(idxbest)))]); 316 | 317 | % Extract mode of weights to make prediction 318 | lambda = lambda_best; 319 | mu = lambda(1:d_theta,1); 320 | b = lambda(d_theta+1:2*d_theta,1); 321 | c = lambda(2*d_theta+1:end); 322 | if isotropic % For isotropic structure 323 | SIGMA = b*b' + c^2*eyes(d_theta); 324 | else 325 | SIGMA = b*b' + diag(c.^2); 326 | end 327 | 328 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 329 | W_seq{1} = W1; 330 | for j = 2:L 331 | index = index_track(j-1)+1:index_track(j); 332 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 333 | W_seq{j} = Wj; 334 | end 335 | beta = mu(d_w+1:d_w+d_beta); 336 | 337 | % Store output in a struct 338 | est.out.weights = W_seq; 339 | est.out.beta = beta; 340 | est.out.shrinkage = shrinkage_gamma_seq; 341 | est.out.iteration = iter; 342 | est.out.mse = MSE_DL; 343 | est.out.pps = PPS_DL; 344 | est.out.vbMU = mu; % Mean of variational distribution of weights 345 | est.out.b = b; 346 | est.out.c = c; 347 | est.out.vbSIGMA = SIGMA; % Covariance matrix of variational distribution 348 | % of weights 349 | est.out.nparams = d_theta; % Number of parameters 350 | est.out.indexTrack = index_track; 351 | est.out.sigma2Alpha = alpha_sigma2; 352 | est.out.sigma2Beta = beta_sigma2; 353 | % est.out.idxBest = idxbest; 354 | est.out.sigma2Mean = mean_sigma2_save(end); 355 | est.out.sigma2MeanIter = mean_sigma2_save; 356 | end -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMpoisson.m: -------------------------------------------------------------------------------- 1 | function out = deepGLMpoisson(X_train,y_train,est) 2 | %DEEPGLMPOISSON Summary of this function goes here 3 | % 4 | % Traing a fGLM model with binary reponse y. 5 | % Bayesian Adaptive Group Lasso is used on the first-layer weights; no 6 | % regularization is put on the rest. sigma2 and tau are updated by 7 | % mean-field VB. 
Inverse gamma prior is used for sigma2 8 | % INPUT 9 | % X_train, y_train: training data (continuous response) 10 | % X_validation, y_validation: validation data 11 | % n_units: vector specifying the numbers of units in 12 | % each layer 13 | % batchsize: mini-batch size used in each iteration 14 | % eps0: constant learning rate 15 | % isotropic: true if isotropic structure on Sigma is 16 | % used, otherwise rank-1 structure is used 17 | % OUTPUT 18 | % W_seq: the optimal weights upto the last hidden 19 | % layer 20 | % beta the optimal weights that connect last hidden layer to the output 21 | % mean_sigma2 estimate of sigma2 22 | % shrinkage_gamma_seq update of shrinkage parameters over 23 | % iteration 24 | % 25 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 26 | % Nguyen (nghia.nguyen@sydney.edu.au) 27 | % 28 | % http://www.xxx.com 29 | % 30 | % Version: 1.0 31 | % LAST UPDATE: April, 2018 32 | 33 | 34 | % Extract training data and settings from input struct 35 | Xval = est.data.Xval; 36 | yval = est.data.yval; 37 | n_units = est.network; 38 | batchsize = est.batchsize; 39 | lrate = est.lrate; 40 | isotropic = est.isIsotropic; 41 | S = est.S; % Number of Monte Carlo samples to estimate the gradient 42 | tau = est.tau; % Threshold before reducing constant learning rate eps0 43 | grad_weight = est.momentum; % Weight in the momentum 44 | cScale = est.c; % Random scale factor to initialize b,c 45 | patience = est.patience; % Stop if test error not improved after patience_parameter iterations 46 | epoch = est.epoch; % Number of times learning algorithm scan entire training data 47 | verbose = est.verbose; 48 | distr = est.dist; 49 | 50 | % Data merge for mini-batch sampling 51 | data = [y_train,X_train]; 52 | datasize = length(y_train); 53 | num1Epoch = round(datasize/batchsize); % Number of iterations per epoch 54 | 55 | % Network parameters 56 | L = length(n_units); % Number of hidden layers 57 | p = size(X_train,2)-1; % Number of covariates 58 | W_seq = cell(1,L); % Cells to store weight matrices 59 | index_track = zeros(1,L); % Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 
60 | index_track(1) = n_units(1)*(p+1); % Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 61 | W1_tilde_index = n_units(1)+1:index_track(1); % Index of W1 without biases, as the first column if W1 are biases 62 | for j = 2:L 63 | index_track(j) = index_track(j-1)+n_units(j)*(n_units(j-1)+1); 64 | end 65 | d_w = index_track(L); % Total number of weights up to (and including) the last layer 66 | d_beta = n_units(L)+1; % Dimension of the weights beta connecting the last layer to the output 67 | d_theta = d_w+d_beta; % Total number of parameters 68 | 69 | % Initialise weights and set initial mu equal to initial weights 70 | layers = [size(X_train,2) n_units 1]; % Full structure of NN -> [input,hidden,output] 71 | weights = nnInitialize(layers); 72 | mu=[]; 73 | for i=1:length(layers)-1 74 | mu=[mu;weights{i}(:)]; 75 | end 76 | % Initialize b and c 77 | b = normrnd(0,cScale,d_theta,1); 78 | if isotropic 79 | c = cScale; 80 | else 81 | c = cScale*ones(d_theta,1); 82 | end 83 | % Initialize lambda 84 | lambda=[mu;b;c]; 85 | 86 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 87 | W_seq{1} = W1; 88 | for j = 2:L 89 | index = index_track(j-1)+1:index_track(j); 90 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 91 | W_seq{j} = Wj; 92 | end 93 | beta = mu(d_w+1:d_theta); 94 | 95 | [Loss_current,~] = deepGLMpredict(Xval,yval,W_seq,beta,distr); % compute prediction loss 96 | disp(['Initial loss on validation set: ',num2str(Loss_current)]); 97 | Loss_DL(1) = Loss_current; 98 | 99 | shrinkage_gamma = .01*ones(p,1); % Initialise gamma_beta, the shrinkage parameters 100 | mu_tau = zeros(p,1); % Parameters for the auxiliary tau_j 101 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 102 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 103 | if isotropic 104 | for j = 1:p 105 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 106 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 107 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 108 | end 109 | lambda_tau = shrinkage_gamma.^2; 110 | else 111 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 112 | for j = 1:p 113 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 
114 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 115 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 116 | end 117 | lambda_tau = shrinkage_gamma.^2; 118 | end 119 | mean_inverse_tau = mu_tau; % VB mean <1/tau_j> 120 | shrinkage_gamma_seq = shrinkage_gamma; % 121 | 122 | minibatch = datasample(data,batchsize); 123 | y = minibatch(:,1); 124 | X = minibatch(:,2:end); 125 | 126 | rqmc = normrnd_qmc(S,d_theta+1); % using quasi MC random numbers 127 | for s=1:S 128 | U_normal = rqmc(s,:)'; 129 | epsilon1=U_normal(1); 130 | epsilon2=U_normal(2:end); 131 | theta=mu+epsilon1*b+c.*epsilon2; 132 | 133 | W_seq = cell(1,L); 134 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 135 | W_seq{1} = W1; 136 | for j = 2:L 137 | index = index_track(j-1)+1:index_track(j); 138 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 139 | W_seq{j} = Wj; 140 | end 141 | beta = theta(d_w+1:d_theta); 142 | 143 | W1_tilde = W1(:,2:end); % weights without biases 144 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 145 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 146 | 147 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 148 | 149 | grad_h = grad_prior_w_beta+grad_llh; % gradient of log prior plus log-likelihood 150 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 151 | grad_theta = grad_h-grad_log_q; 152 | 153 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 154 | end 155 | grad_lb = (mean(grad_g_lik_store))'; 156 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 157 | gradient_bar = gradient_lambda; 158 | 159 | % Prepare parameters for training 160 | idxEpoch = 0; % index of current epoch 161 | iter = 1; % index of current iteration 162 | stop = false; % Stop flag for early stopping 163 | lambda_best = lambda; % Store optimal lambda for output 164 | idxPatience = 0; % index of number of consequent non-decreasing iterations 165 | % for early stopping 166 | disp('---------- Training Phase ----------') 167 | while ~stop 168 | iter = iter+1; 169 | % Display training results after each 'verbose' iteration 170 | if (verbose && ~mod(iter,verbose)) 171 | if(~mod(iter,num1Epoch)) 172 | idxEpoch = idxEpoch + 1; 173 | disp(['Epoch: ',num2str(idxEpoch)]); 174 | end 175 | disp(['Current PPS: ',num2str(Loss_current)]); 176 | end 177 | 178 | minibatch = datasample(data,batchsize); 179 | y = minibatch(:,1); 180 | X = minibatch(:,2:end); 181 | rqmc = normrnd_qmc(S,d_theta+1); 182 | for s=1:S 183 | U_normal = rqmc(s,:)'; 184 | epsilon1=U_normal(1); 185 | epsilon2=U_normal(2:end); 186 | theta=mu+b*epsilon1+c.*epsilon2; 187 | 188 | W_seq = cell(1,L); 189 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 190 | W_seq{1} = W1; 191 | for j = 2:L 192 | index = index_track(j-1)+1:index_track(j); 193 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 194 | W_seq{j} = Wj; 195 | end 196 | beta = theta(d_w+1:d_w+d_beta); 197 | 198 | W1_tilde = W1(:,2:end); % weights without biases 199 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 200 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:);zeros(d_w+d_beta-index_track(1),1)]; 201 | grad_llh = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 202 | grad_h = grad_prior_w_beta+grad_llh; 203 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 204 | grad_theta = grad_h-grad_log_q; 205 | 206 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 207 | end 208 | grad_lb = 
(mean(grad_g_lik_store))'; 209 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); 210 | 211 | grad_norm = norm(gradient_lambda); 212 | norm_gradient_threshold = 100; 213 | if norm(gradient_lambda)>norm_gradient_threshold 214 | gradient_lambda = (norm_gradient_threshold/grad_norm)*gradient_lambda; 215 | end 216 | 217 | gradient_bar_old = gradient_bar; 218 | gradient_bar = grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda; 219 | 220 | if iter>tau 221 | stepsize=lrate*tau/iter; 222 | else 223 | stepsize=lrate; 224 | end 225 | 226 | lambda=lambda+stepsize*gradient_bar; 227 | 228 | mu=lambda(1:d_theta,1); 229 | b=lambda(d_theta+1:2*d_theta,1); 230 | c=lambda(2*d_theta+1:end); 231 | 232 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 233 | W_seq{1} = W1; 234 | for j = 2:L 235 | index = index_track(j-1)+1:index_track(j); 236 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 237 | W_seq{j} = Wj; 238 | end 239 | beta = mu(d_w+1:d_theta); 240 | 241 | % update tau and shrinkage parameters 242 | if mod(iter,10) == 0 243 | mu_matrixW1_tilde = reshape(mu(W1_tilde_index),n_units(1),p); 244 | b_matrixW1_tilde = reshape(b(W1_tilde_index),n_units(1),p); 245 | if isotropic 246 | for j = 1:p 247 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 248 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+c^2*n_units(1); 249 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 250 | lambda_tau(j) = shrinkage_gamma(j)^2; 251 | end 252 | else 253 | c_matrixW1_tilde = reshape(c(W1_tilde_index),n_units(1),p); 254 | for j = 1:p 255 | mean_column_j_tilde = mu_matrixW1_tilde(:,j)'*mu_matrixW1_tilde(:,j)+... 256 | b_matrixW1_tilde(:,j)'*b_matrixW1_tilde(:,j)+sum(c_matrixW1_tilde(:,j).^2); 257 | mu_tau(j) = shrinkage_gamma(j)/sqrt(mean_column_j_tilde); 258 | lambda_tau(j) = shrinkage_gamma(j)^2; 259 | end 260 | end 261 | mean_inverse_tau = mu_tau; 262 | mean_tau = 1./mu_tau+1./lambda_tau; 263 | shrinkage_gamma = sqrt((n_units(1)+1)./mean_tau); 264 | shrinkage_gamma_seq = [shrinkage_gamma_seq,shrinkage_gamma]; 265 | end 266 | 267 | 268 | [Loss_current,~] = deepGLMpredict(Xval,yval,W_seq,beta,distr); % compute prediction loss 269 | Loss_DL(iter) = Loss_current; 270 | 271 | if Loss_DL(iter)>=Loss_DL(iter-1) 272 | gradient_bar = gradient_bar_old; 273 | end 274 | 275 | if Loss_DL(iter)<=min(Loss_DL) 276 | lambda_best = lambda; 277 | idxPatience = 0; 278 | else 279 | idxPatience = idxPatience+1; 280 | end 281 | 282 | if (idxPatience>patience)||(idxEpoch>epoch) 283 | stop = true; 284 | end 285 | end 286 | disp('---------- Training Completed! 
----------') 287 | disp(['Number of iteration:',num2str(iter)]); 288 | disp(['PPS best: ',num2str(min(Loss_DL))]); 289 | 290 | lambda = lambda_best; 291 | mu=lambda(1:d_theta,1); 292 | W1 = reshape(mu(1:index_track(1)),n_units(1),p+1); 293 | W_seq{1} = W1; 294 | for j = 2:L 295 | index = index_track(j-1)+1:index_track(j); 296 | Wj = reshape(mu(index),n_units(j),n_units(j-1)+1); 297 | W_seq{j} = Wj; 298 | end 299 | beta = mu(d_w+1:d_w+d_beta); 300 | 301 | % [Loss_PPS,TP_MCR,MCR] = prediction_loss(y_validation,X_validation,W_seq,beta); % compute prediction loss 302 | % Store output in a struct 303 | out.weights = W_seq; 304 | out.beta = beta; 305 | out.shrinkage = shrinkage_gamma_seq; 306 | out.pps = Loss_DL; 307 | out.iteration = iter; 308 | end 309 | 310 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMpredict.m: -------------------------------------------------------------------------------- 1 | function out = deepGLMpredict(mdl,X,varargin) 2 | %DEEPGLMPREDICT Make prediction from a trained deepGLM model 3 | % 4 | % OUT = DEEPGLMPREDICT(MDL,XTEST) predict responses for new data XTEST using 5 | % trained deepGLM structure MDL (output from DEEPGLMFIT) 6 | % 7 | % OUT = DEEPGLMPREDICT(MDL,XTEST,NAME,VALUE) predicts responses with additional 8 | % options specified by one or more of the following name/value pairs: 9 | % 10 | % 'ytest' Specify column of test responses. If this option 11 | % is specified with true response column of new 12 | % observations, deepGLMpredict will return prediction 13 | % scores (PPS, MSE or Classification Rate) using true 14 | % responses column vector ytest 15 | % 'Interval' Return prediction interval estimation for observations 16 | % in test data Xtest. By default, this predictive 17 | % interval capability is disable ('Interval' is 0). 18 | % Must be an positive number. 19 | % 'Nsample' Number of samples generated from posterior distribution 20 | % of model parameters used to make prediction interval 21 | % estimation for test data. Must be a positive integer 22 | % Example: 23 | % Fit a deepGLM model for Direcmarketing data set. All of the 24 | % exampled data are located inside /Data folder of installed package. 25 | % In order to use the sample dataset, user must add this Data folder 26 | % to Matlab path or explicitly direct to Data folder in 'load' 27 | % function 28 | % 29 | % load('DirectMarketing.mat') 30 | % mdl = deepGLMfit(X,y,... % Training data 31 | % 'Network',[5,5],... % Use 2 hidden layers 32 | % 'Lrate',0.01,... % Specify learning rate 33 | % 'Verbose',10,... % Display training result each 10 iteration 34 | % 'BatchSize',size(X,1),... % Use entire training data as mini-batch 35 | % 'MaxEpoch',10000,... % Maximum number of epoch 36 | % 'Patience',50,... % Higher patience values could lead to overfitting 37 | % 'Seed',100); % Set random seed to 100 38 | % 39 | % Pred = deepGLMpredict(mdl,X_test,... 
40 | % 'ytest',y_test); 41 | % disp(['PPS on test data: ',num2str(Pred.pps)]) 42 | % disp(['MSE on test data: ',num2str(Pred.mse)]) 43 | % 44 | % For more examples, check EXAMPLES folder 45 | % 46 | % See also DEEPGLMFIT, DEEPGLMPLOT 47 | % 48 | % Copyright 2018: 49 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 50 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 51 | % 52 | % https://github.com/VBayesLab/deepGLM 53 | % 54 | % Version: 1.0 55 | % LAST UPDATE: May, 2018 56 | 57 | % Check errors input arguments 58 | if nargin < 2 59 | error(deepGLMmsg('deepglm:TooFewInputs')); 60 | end 61 | 62 | % Load deepGLM params from struct 63 | W_seq = mdl.out.weights; 64 | beta = mdl.out.beta; 65 | distr = mdl.dist; 66 | 67 | % Parse additional options 68 | paramNames = {'ytest' 'Interval' 'Nsample' 'Intercept'}; 69 | paramDflts = {[] 0 1000 true}; 70 | [y,alpha,Nsample,intercept] = internal.stats.parseArgs(paramNames, paramDflts, varargin{:}); 71 | 72 | % If y test is specified, check input 73 | if(~isempty(y)) 74 | if size(y,1) ~= size(X,1) 75 | error(deepGLMmsg('deepglm:InputSizeMismatchX')); 76 | end 77 | if size(y,2) ~= 1 78 | error(deepGLMmsg('deepglm:InputSizeMismatchY')); 79 | end 80 | end 81 | 82 | % Add column of 1 to X if intercept is true 83 | if(intercept) 84 | X = [ones(size(X,1),1),X]; 85 | end 86 | 87 | % Store Nsample to deepGLMfit 88 | mdl.Nsample = Nsample; 89 | 90 | % Calculate neuron network output 91 | nnet_output = nnFeedForward(X,W_seq,beta); 92 | 93 | switch distr 94 | case 'normal' 95 | out.yhat = nnet_output; % Prediction for continuous response 96 | % If ytest if provided, then calculate pps and mse 97 | if(~isempty(y)) 98 | sigma2 = mdl.out.sigma2Mean; 99 | mse = mean((y-nnet_output).^2); 100 | pps = 1/2*log(sigma2) + 1/2/sigma2*mse; 101 | out.mse = mse; 102 | out.pps = pps; 103 | end 104 | % Calculate confidence interval if required 105 | if(alpha~=0) 106 | interval = predictionInterval(mdl,X,alpha); 107 | out.interval = interval.interval; 108 | out.yhatMatrix = interval.yhatMC; 109 | end 110 | 111 | case 'binomial' 112 | out.yNN = nnet_output; 113 | out.yProb = exp(nnet_output)./(1+exp(nnet_output)); 114 | y_pred = double(nnet_output>0); % Prediction for binary response 115 | out.yhat = y_pred; 116 | % If ytest if provided, then calculate pps and mse 117 | if(~isempty(y)) 118 | pps = mean(-y.*nnet_output+log(1+exp(nnet_output))); 119 | cr = mean(y==y_pred); % Miss-classification rate 120 | out.pps = pps; 121 | out.accuracy = cr; 122 | end 123 | 124 | case 'poisson' 125 | out.yNN = nnet_output; 126 | y_pred = exp(nnet_output); % Prediction for poisson response 127 | out.yhat = y_pred; 128 | if(~isempty(y)) 129 | pps = mean(-y.*nnet_output+exp(nnet_output)); 130 | mse = mean((y-y_pred).^2); 131 | out.mse = mse; 132 | out.pps = pps; 133 | end 134 | end 135 | end 136 | 137 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/train/deepGLMpredictLoss.m: -------------------------------------------------------------------------------- 1 | function [out1,out2] = deepGLMpredictLoss(X,y,W_seq,beta,distr,sigma2) 2 | %DEEPGLMPREDICTION Make prediction from estimated deepGLM model 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Calculate neuron network output 13 | nnet_output = nnFeedForward(X,W_seq,beta); 14 | 15 | switch distr 16 | case 'normal' 17 | mse = 
mean((y-nnet_output).^2); 18 | pps = 1/2*log(sigma2) + 1/2/sigma2*mse; 19 | out2 = mse; 20 | case 'binomial' 21 | pps = mean(-y.*nnet_output+log(1+exp(nnet_output))); 22 | y_pred = nnet_output>0; 23 | mcr = mean(abs(y-y_pred)); % Miss-classification rate 24 | out2 = 1 - mcr; % Report output in classification rate 25 | case 'poisson' 26 | pps = mean(-y.*nnet_output+exp(nnet_output)); 27 | mse = mean((y-exp(nnet_output)).^2); 28 | out2 = mse; 29 | end 30 | out1 = pps; 31 | 32 | end 33 | 34 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/checkInput.m: -------------------------------------------------------------------------------- 1 | function checkInput(est) 2 | %CHECKDATA Check if user input correct model settings 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au). 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | dist = est.dist; 12 | network = est.network; 13 | lrate = est.lrate; 14 | momentum = est.momentum; 15 | batchsize = est.batchsize; 16 | epoch = est.epoch; 17 | patience = est.patience; 18 | tau = est.tau; 19 | S = est.S; 20 | windowSize = est.windowSize; 21 | icept = est.icept; 22 | verbose = est.verbose; 23 | monitor = est.monitor; 24 | isotropic = est.isIsotropic; 25 | seed = est.seed; 26 | 27 | if(~strcmpi(dist,'normal') && ~strcmpi(dist,'binomial') && ~strcmpi(dist,'poisson')) 28 | error(['Distribution must be one of the followings: ','normal,','binomial,','poisson']); 29 | end 30 | 31 | if(sum(network==0)>0) 32 | error('Network must be an array of positive integers') 33 | end 34 | 35 | if(sum(network==0)>0) 36 | error('Network must be an array of positive integers') 37 | end 38 | 39 | if(~isnumeric(lrate) || lrate<=0) 40 | error('Learning rate must be a positive numerical value') 41 | end 42 | 43 | if(~isnumeric(momentum) || momentum<0 || momentum > 1) 44 | error('Momentum must be a numerical value from 0 to 1') 45 | end 46 | 47 | if(~isnumeric(batchsize) || floor(batchsize)~= batchsize || batchsize <= 0) 48 | error('Batch size must be an positive integer smaller than number of observations in training data'); 49 | end 50 | 51 | if(~isnumeric(epoch) || floor(epoch)~= epoch || epoch <= 0) 52 | error('Number of epoches must be a positive integer'); 53 | end 54 | 55 | if(~isnumeric(patience) || floor(patience)~= patience || patience <= 0) 56 | error('Patience must be a positive integer'); 57 | end 58 | 59 | if(~isnumeric(tau) || floor(tau)~= tau || tau <= 0) 60 | error('LrateFactor must be a positive integer'); 61 | end 62 | 63 | if(~isnumeric(S) || floor(S)~= S || S <= 0) 64 | error('S must be a positive integer'); 65 | end 66 | 67 | if(~isnumeric(windowSize) || floor(windowSize)~= windowSize || windowSize <= 0) 68 | error('WindowSize must be a positive integer'); 69 | end 70 | 71 | if(~islogical(icept)) 72 | error('Intercept option must be a logical value'); 73 | end 74 | 75 | if(~isnumeric(verbose) || floor(verbose)~= verbose || verbose <= 0) 76 | error('Verbose must be a positive integer'); 77 | end 78 | 79 | if(~islogical(monitor)) 80 | error('Monitor option must be a logical value'); 81 | end 82 | 83 | if(~islogical(isotropic)) 84 | error('Isotropic option must be a logical value'); 85 | end 86 | 87 | if (~isnan(seed)) 88 | if(~isnumeric(seed) || floor(seed)~= seed || seed <= 0) 89 | error('Seed must be a nonnegative integer less than 2^32'); 90 | end 91 | end 92 | end 93 | 94 | 
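% ---- Editor's note (not part of the original source): a minimal usage sketch for the
% validation above, to be run from a separate script. It assumes deepGLMout.m (listed
% further below), which returns the default settings struct, is on the MATLAB path.
% Note that checkInput reads est.icept, which deepGLMout stores under est.data.icept,
% so it is set explicitly here; field names and admissible values follow the checks in
% this file.
est = deepGLMout();          % default settings (dist, network, lrate, momentum, ...)
est.icept = true;            % intercept flag expected by checkInput
est.dist = 'binomial';       % must be 'normal', 'binomial' or 'poisson'
est.network = [5,5];         % two hidden layers, 5 units each
est.batchsize = 1000;        % positive integer
checkInput(est);             % throws a descriptive error if any setting is invalid
% ----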
-------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/deepGLMmsg.m: -------------------------------------------------------------------------------- 1 | function msg_out = deepGLMmsg(identifier) 2 | %DEEPGLMMSG Define custom error/warning messages for exceptions 3 | % MSG_OUT = DEEPGLMMSG(IDENTIFIER) extracts the message for the input identifier 4 | % 5 | % 6 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 7 | % Nguyen (nghia.nguyen@sydney.edu.au) 8 | % 9 | % http://www.xxx.com 10 | % 11 | % Version: 1.0 12 | % LAST UPDATE: April, 2018 13 | 14 | switch identifier 15 | case 'deepglm:TooFewInputs' 16 | msg_out = 'At least two arguments must be specified'; 17 | case 'deepglm:InputSizeMismatchX' 18 | msg_out = 'X and Y must have the same number of observations'; 19 | case 'deepglm:InputSizeMismatchY' 20 | msg_out = 'Y must be a single column vector'; 21 | case 'deepglm:ArgumentMustBePair' 22 | msg_out = 'Optional arguments must be pairs'; 23 | case 'deepglm:ResponseMustBeBinary' 24 | msg_out = 'Two level categorical variable required'; 25 | case 'deepglm:DistributionMustBeBinomial' 26 | msg_out = 'Binomial distribution option required'; 27 | case 'deepglm:MustSpecifyActivationFunction' 28 | msg_out = 'Activation function type required'; 29 | end 30 | end 31 | 32 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/deepGLMout.m: -------------------------------------------------------------------------------- 1 | function out = deepGLMout() 2 | %DEEPGLMOUT Generate default output structure for deepGLM training results 3 | 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Training method 13 | out.dist = 'normal'; % Default distribution of dependent variable.
14 | out.initialize = 'adaptive'; % Default initialize method 15 | out.isIsotropic = false; % Default structure of variational Covariance matrix 16 | out.ncore = 0; % Default parallel computing option 17 | 18 | % Optional settings 19 | out.seed = NaN; % No random seed by default 20 | out.nval = 0.2; % Default proportion of training data for validation 21 | out.verbose = 10; % Default number of iteration to display training results 22 | out.cutoff = 0.5; % Default Cutoff probability for sigmoid function 23 | out.stop = false; % Execution Flag 24 | out.quasiMC = true; % Using Quasi MC for random number generator 25 | out.monitor = false; % Display training progress window 26 | out.muTau = NaN; 27 | out.lowerbound = true; 28 | out.windowSize = 100; 29 | 30 | % Model hyper-parameters 31 | out.network = [10,10]; % Default network structure 32 | out.lrate = 0.01; % Default Learning rate 33 | out.S = 10; % Default Number of samples used to approximate gradient of likelihood 34 | out.batchsize = 5000; % Default Proportion of batch size over entire train set 35 | out.epoch = 1000; % Default Number of epoches in train phase 36 | out.tau = 10000; % Default Scale factor of learning rate 37 | out.patience = 100; % Default Number of consequence non-decreasing iterations (for early stopping checking) 38 | out.c = 0.01; % Default initial value of isotropic factor c 39 | out.bvar = 0.01; % Default initial variance of each element of b 40 | out.momentum = 0.6; % Default momentum weight 41 | 42 | % Variable names 43 | out.name.ynames = NaN; % y variables names 44 | out.name.xnames = NaN; % X variable names 45 | 46 | % Data properties 47 | out.data.y = NaN; % Dependent variable of training data 48 | out.data.X = NaN; % Independent variables of training data 49 | out.data.ytest = NaN; % Dependent variable of test data 50 | out.data.Xtest = NaN; % Independent variables of tets data 51 | out.data.nTrain = NaN; % Number of observation in training set 52 | out.data.nTest = NaN; % Number of observation in test set 53 | out.data.Xval = []; 54 | out.data.yval = []; 55 | out.data.icept = true; % Intercept option 56 | 57 | % Training results 58 | % out.out.mse = NaN; 59 | % out.out.accuracy = NaN; 60 | 61 | end 62 | 63 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/isBinomial.m: -------------------------------------------------------------------------------- 1 | function out = isBinomial(array) 2 | %ISBINOMIAL Check if an array are binary vector 3 | 4 | % Copyright 2018 5 | % http://www.xxx.com 6 | % 7 | % Version: 1.0 8 | % LAST UPDATE: April, 2018 9 | 10 | out = false; 11 | uniqueVal = unique(array); % Extract unique values in array 12 | if (length(uniqueVal)==2) && (uniqueVal(1)==0) && (uniqueVal(2)==1) 13 | out = true; 14 | end 15 | end 16 | 17 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/predictionInterval.m: -------------------------------------------------------------------------------- 1 | function predInterval = predictionInterval(mdl,X,zalpha) 2 | %CONFIDENTINTERVAL Interval estimation for test data using deepGLM 3 | % 4 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 5 | % Nguyen (nghia.nguyen@sydney.edu.au) 6 | % 7 | % http://www.xxx.com 8 | % 9 | % Version: 1.0 10 | % LAST UPDATE: April, 2018 11 | 12 | % Load deepGLM params from struct 13 | Nsample = mdl.Nsample; 14 | MU = mdl.out.vbMU; 15 | SIGMA = mdl.out.vbSIGMA; 16 | n_units = mdl.network; 17 | index_track = 
mdl.out.indexTrack; 18 | alpha_sigma2 = mdl.out.sigma2Alpha; 19 | beta_sigma2 = mdl.out.sigma2Beta; 20 | 21 | % Calculate network parameters 22 | L = length(n_units); % Number of hidden layers 23 | p = size(X,2)-1; % Number of covariates 24 | d_beta = n_units(L)+1; 25 | d_w = index_track(L); 26 | 27 | yhat = zeros(Nsample,size(X,1)); % Predicted values of test data 28 | nnOut = zeros(Nsample,size(X,1)); % Output of NN 29 | for i=1:Nsample 30 | % Generate samples of theta from Normal distribution 31 | theta_i = mvnrnd(MU,SIGMA); 32 | % Generate samples of sigma from IG distribution 33 | sigma2_i = 1/gamrnd(alpha_sigma2,1./beta_sigma2); 34 | 35 | % For each generated theta, restore neuron net structure 36 | W1 = reshape(theta_i(1:index_track(1)),n_units(1),p+1); 37 | W_seq{1} = W1; 38 | for j = 2:L 39 | index = index_track(j-1)+1:index_track(j); 40 | Wj = reshape(theta_i(index),n_units(j),n_units(j-1)+1); 41 | W_seq{j} = Wj; 42 | end 43 | beta = theta_i(d_w+1:d_w+d_beta)'; 44 | 45 | % Calculate neuron network output 46 | nnOut(i,:) = nnFeedForward(X,W_seq,beta); 47 | 48 | % Calculate p(y|theta_i,sigma_i,X) 49 | yhat(i,:) = normrnd(nnOut(i,:),sqrt(sigma2_i)); 50 | 51 | end 52 | 53 | % 95% confidence interval 54 | yhatLCL = mean(yhat) - zalpha*std(yhat); 55 | yhatUCL = mean(yhat) + zalpha*std(yhat); 56 | yhatInterval = [yhatLCL',yhatUCL']; 57 | predInterval.yhatMC = yhat; 58 | predInterval.interval = yhatInterval; 59 | end 60 | 61 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/splitData.m: -------------------------------------------------------------------------------- 1 | function [Xtr,ytr,Xval,yval] = splitData(X,y,ratio,kfold) 2 | %SPLITDATA Split training data for crossvalidation 3 | 4 | n = size(X,1); 5 | 6 | end 7 | 8 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/utils/sumResidualSquared.m: -------------------------------------------------------------------------------- 1 | function S = sumResidualSquared(y,X,W_seq,beta) 2 | % compute the sum_residual_squared for normal-NN model 3 | 4 | nnet_output = nnFeedForward(X,W_seq,beta); 5 | S = sum((y-nnet_output).^2); 6 | end 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbGradientLogLB.m: -------------------------------------------------------------------------------- 1 | %% Script to calculate natural gradient of lowerbound 2 | %rqmc = normrnd_qmc(S,d_theta+1); % Using quasi MC random numbers 3 | rqmc = normrnd(0,1,S,d_theta+1); 4 | % rng(iter) 5 | % rqmc = rand(S,d_theta+1); 6 | for s=1:S 7 | U_normal = rqmc(s,:)'; 8 | epsilon1=U_normal(1); 9 | epsilon2=U_normal(2:end); 10 | theta=mu+epsilon1*b+c.*epsilon2; 11 | 12 | W_seq = cell(1,L); 13 | W1 = reshape(theta(1:index_track(1)),n_units(1),p+1); 14 | W_seq{1} = W1; 15 | W1_tilde = W1(:,2:end); % weights without biases 16 | W1_tilde_gamma = W1_tilde*diag(mean_inverse_tau); 17 | grad_prior_w_beta = [zeros(n_units(1),1);-W1_tilde_gamma(:)]; 18 | for j = 2:L 19 | index = index_track(j-1)+1:index_track(j); 20 | Wj = reshape(theta(index),n_units(j),n_units(j-1)+1); 21 | W_seq{j} = Wj; 22 | Wj_tilde = Wj(:,2:end); 23 | grad_prior_Wj = [zeros(n_units(j),1);-shrinkage_l2*Wj_tilde(:)]; 24 | grad_prior_w_beta = [grad_prior_w_beta;grad_prior_Wj]; 25 | end 26 | beta = theta(d_w+1:d_theta); 27 | beta_tilde = beta(2:end); % vector beta without intercept 28 | grad_prior_beta = [0;-shrinkage_l2*beta_tilde]; 29 | 
grad_prior_w_beta = [grad_prior_w_beta;grad_prior_beta]; 30 | 31 | if(strcmp(distr,'normal')) 32 | [grad_llh,yNN] = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse); 33 | else 34 | [grad_llh,yNN] = nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr); 35 | end 36 | 37 | grad_h = grad_prior_w_beta+grad_llh; % Gradient of log prior plus log-likelihood 38 | grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic); 39 | grad_theta = grad_h-grad_log_q; 40 | grad_g_lik_store(s,:) = [grad_theta;epsilon1*grad_theta;epsilon2.*grad_theta]'; 41 | 42 | % ------------------ lower bound --------------------------------------- 43 | if(lbFlag) 44 | if(strcmp(distr,'normal')) 45 | lb_iter(s) = constMean... 46 | -0.5*mean_sigma2_inverse*sum((y-yNN).^2)*datasize/batchsize... 47 | +const; 48 | elseif(strcmp(distr,'binomial')) 49 | lb_iter(s) = constMean... 50 | +sum(y.*yNN - log(1+exp(yNN)))*datasize/batchsize... 51 | +const; 52 | else 53 | lb_iter(s) = constMean... 54 | +sum(y.*yNN - exp(yNN))*datasize/batchsize... 55 | +const; 56 | end 57 | end 58 | % ---------------------------------------------------------------------- 59 | end 60 | grad_lb = (mean(grad_g_lik_store))'; 61 | gradient_lambda = vbNaturalGradient(b,c,grad_lb,isotropic); -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbGradientLogq.m: -------------------------------------------------------------------------------- 1 | function grad_log_q = vbGradientLogq(b,c,theta,mu,isotropic) 2 | %VBGRADIENTLOGQ Summary of this function goes here 3 | % Detailed explanation goes here 4 | x = theta-mu; 5 | if isotropic 6 | grad_log_q = -x./c^2+(1/c^2)*((b'*x)/(c^2+(b'*b)))*b; 7 | else 8 | d = b./c.^2; 9 | grad_log_q = -x./c.^2+(d'*x)/(1+(d'*b))*d; 10 | end 11 | end 12 | 13 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbLowerBound.m: -------------------------------------------------------------------------------- 1 | %% Group Lasso + L2 prior on remaining weigths 2 | if(strcmp(distr,'normal')) 3 | mean_log_sig2 = log(beta_sigma2)-psi(alpha_sigma2); 4 | logdet = log(det(1 + (b./(c.^2))'*b)) + sum(log((c.^2))); 5 | constMean = -(alpha0_sigma2+1)*mean_log_sig2 - beta0_sigma2*mean_sigma2_inverse... 6 | +0.5*sum(2*(n_units(1)+1)*log(shrinkage_gamma)- (shrinkage_gamma.^2).*mean_tau)... 7 | -0.5*datasize*mean_log_sig2+gammaln(alpha_sigma2)... 8 | -alpha_sigma2*log(beta_sigma2)+(alpha_sigma2+1)*mean_log_sig2... 9 | +alpha_sigma2-0.5*(sum(log(lambda_tau))-p)+0.5*logdet... 10 | +0.5*d_w_tilde*log(shrinkage_l2)-0.5*shrinkage_l2*mean_w_tilde... 11 | -0.5*sum(mean_column_j_tilde'.*mean_inverse_tau); 12 | else 13 | logdet = log(det(1 + (b./(c.^2))'*b)) + sum(log((c.^2))); 14 | constMean = 0.5*sum(2*(n_units(1)+1)*log(shrinkage_gamma)-(shrinkage_gamma.^2).*mean_tau)... 15 | -0.5*(sum(log(lambda_tau))-p)+0.5*logdet+0.5*d_w_tilde*log(shrinkage_l2)... 16 | -0.5*shrinkage_l2*mean_w_tilde-0.5*sum(mean_column_j_tilde'.*mean_inverse_tau); 17 | end 18 | 19 | % lb(iter) = (a1+constMean+const)/datasize; 20 | 21 | % a3 = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)... 22 | % -0.5*p*n_units(1)*log(2*pi)-0.5*d_w_tilde*log(2*pi)... 23 | % -p*gammaln((n_units(1)+1)/2)-0.5*datasize*log(2*pi)... 24 | % +p/2*log(2*pi)+0.5*d_theta*log(2*pi)+d_theta/2; 25 | 26 | 27 | 28 | % lb(iter) = alpha0_sigma2*log(beta0_sigma2)-gammaln(alpha0_sigma2)-... 29 | % (alpha0_sigma2+1)*mean_log_sig2 - beta0_sigma2*mean_sigma2_inverse... 
30 | % -0.5*p*n_units(1)*log(2*pi)-0.5*sum(mean_column_j_tilde'.*mean_inverse_tau)... 31 | % -0.5*d_w_tilde*log(2*pi)+0.5*d_w_tilde*log(shrinkage_l2)-... 32 | % 0.5*shrinkage_l2*mean_w_tilde-p*gammaln((n_units(1)+1)/2)+... 33 | % 0.5*sum(2*(n_units(1)+1)*log(shrinkage_gamma) - (shrinkage_gamma.^2).*mean_tau)... 34 | % -0.5*datasize*log(2*pi)-0.5*datasize*mean_log_sig2-... 35 | % 0.5*mean_sigma2_inverse*sum_squared+gammaln(alpha_sigma2)-... 36 | % alpha_sigma2*log(beta_sigma2)+(alpha_sigma2+1)*mean_log_sig2 + alpha_sigma2... 37 | % +p/2*log(2*pi)-0.5*(sum(log(lambda_tau))-p)+0.5*d_theta*log(2*pi)+0.5*logdet+... 38 | % d_theta/2 -0.5*((grad_llh'*b)^2+ sum((c.^2).*(grad_llh.^2))) 39 | -------------------------------------------------------------------------------- /Matlab/DeepGLM/vbfun/vbNaturalGradient.m: -------------------------------------------------------------------------------- 1 | function prod = vbNaturalGradient(b,c,grad,isotropic) 2 | %VBNATURALGRADIENT compute the product inverse_fisher times grad for two 3 | % cases: isotropic factor decompostion or rank-1 decomposition 4 | % INPUT: 5 | % grad: the traditional gradient 6 | % b,c: parameters in the factor decomposition 7 | % isotropic: true if isotropic structure is used, rand-1 otherwise 8 | % 9 | % OUTPUT: natural gradient 10 | % 11 | % Copyright 2018 Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) and Nghia 12 | % Nguyen (nghia.nguyen@sydney.edu.au). 13 | % 14 | % http://www.xxx.com 15 | % 16 | % Version: 1.0 17 | % LAST UPDATE: April, 2018 18 | 19 | if isotropic 20 | d = length(b); 21 | bb = b'*b; 22 | alpha = 1/(c^2+bb); 23 | omega = (2/c^2)*(d-1+c^4*alpha^2); 24 | kappa = (1+c^2/bb-.5*(1+c^2/bb)^2)*2*c*bb*alpha^2+2*c^3*alpha/bb; 25 | c2 = omega-2*c*alpha^2*kappa*bb; 26 | 27 | grad1 = grad(1:d); 28 | grad2 = grad(d+1:2*d); 29 | grad3 = grad(end); 30 | 31 | b_grad2 = b'*grad2; 32 | const1 = (1+c^2/bb-.5*(1+c^2/bb)^2); 33 | const2 = c^2*(1+c^2/bb); 34 | Ainv_times_grad2 = (const1*b_grad2)*b+const2*grad2; 35 | 36 | prod = [(b'*grad1)*b+c^2*grad1;Ainv_times_grad2+(kappa^2/c2*b_grad2)*b-(kappa/c2*grad3)*b;-kappa/c2*b_grad2+grad3/c2]; 37 | else 38 | % Close-form method 39 | % d = length(b); 40 | % grad1 = grad(1:d); 41 | % grad2 = grad(d+1:2*d); 42 | % grad3 = grad(2*d+1:end); 43 | % 44 | % c2 = c.^2; 45 | % b2 = b.^2; 46 | % 47 | % prod1 = (b'*grad1)*b+(grad1.*c2); 48 | % 49 | % alpha = 1/(1+sum(b2./c2)); 50 | % Cminus = diag(1./c2); 51 | % Cminus_b = b./c2; 52 | % Sigma_inv = Cminus-alpha*(Cminus_b*Cminus_b'); 53 | % 54 | % A11_inv = (1/(1-alpha))*((1-1/(sum(b2)+1-alpha))*(b*b')+diag(c2)); 55 | % 56 | % C = diag(c); 57 | % A12 = 2*(C*Sigma_inv*b*ones(1,d)).*Sigma_inv; 58 | % A21 = A12'; 59 | % A22 = 2*C*(Sigma_inv.*Sigma_inv)*C; 60 | % D = A22-A21*A11_inv*A12; 61 | % prod2 = A11_inv*grad2+(A11_inv*A12)*(D\A21)*(A11_inv*grad2)-(A11_inv*A12)*(D\grad3); 62 | % prod3 = -(D\A21)*(A11_inv*grad2)+D\grad3; 63 | % prod = [prod1;prod2;prod3]; 64 | 65 | % % Approximation method 66 | d = length(b); 67 | grad1 = grad(1:d); 68 | grad2 = grad(d+1:2*d); 69 | grad3 = grad(2*d+1:end); 70 | 71 | c2 = c.^2; 72 | b2 = b.^2; 73 | 74 | prod1 = (b'*grad1)*b+(grad1.*c2); 75 | 76 | const = sum(b2./c2); 77 | const1 = 1/2+1/2/const; 78 | prod2 = (b'*grad2)*b+(grad2.*c2); 79 | prod2 = const1*prod2; 80 | alpha = 1/(1+const); 81 | x = alpha*b2./(c.^3); 82 | y = 1./c2 - 2*alpha*(b./c2).^2; 83 | aux = x./y; 84 | prod3 = grad3./y-(1/(1+sum(x.^2./y)))*(aux'*grad3)*aux; 85 | prod3 = prod3/2; 86 | prod = [prod1;prod2;prod3]; 87 | end 88 | end 89 | 90 | 
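% Usage sketch (illustrative only; the dimensions and random inputs below are made up,
% assuming the toolbox folder is on the MATLAB path). In the rank-1 case the variational
% covariance is SIGMA = b*b' + diag(c.^2) and grad stacks the gradients with respect to
% (mu, b, c), so it has length 3*length(b):
%
%   d    = 5;                                  % illustrative dimension
%   b    = 0.01*rand(d,1);  c = 0.01*ones(d,1);
%   grad = randn(3*d,1);                       % stand-in for the lower-bound gradient
%   nat  = vbNaturalGradient(b,c,grad,false);  % natural gradient, rank-1 (non-isotropic) case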
-------------------------------------------------------------------------------- /Matlab/Document/deepGLM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/deepGLM.pdf -------------------------------------------------------------------------------- /Matlab/Document/deepGLMNormalExample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/deepGLMNormalExample.pdf -------------------------------------------------------------------------------- /Matlab/Document/~WRL1562.tmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/~WRL1562.tmp -------------------------------------------------------------------------------- /Matlab/Document/~WRL3227.tmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Document/~WRL3227.tmp -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMBinomialExample.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Examples/deepGLMBinomialExample.mlx -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMBinomialExampleScript.m: -------------------------------------------------------------------------------- 1 | % Examples demonstate how to use deepGLM function to fit data with binomial 2 | % dependent variable 3 | % 4 | % Copyright 2018 5 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 6 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 7 | % 8 | % https://github.com/VBayesLab/deepGLM 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: May, 2018 12 | 13 | clear 14 | clc 15 | 16 | % load data 17 | % load('../Data/dataSimulationBinary.mat') 18 | load('../Data/DataSimulationBinary.mat') 19 | 20 | %% Fit deepGLM model using default setting 21 | nn = [10]; 22 | mdl = deepGLMfit(X,y,... 23 | 'Distribution','binomial',... 24 | 'Network',nn,... 25 | 'Lrate',0.01,... 26 | 'Verbose',1,... % Display training result each iteration 27 | 'BatchSize',size(X,1),... % Use entire training data as mini-batch 28 | 'MaxEpoch',10000,... 29 | 'Patience',50,... % Higher patience values could lead to overfitting 30 | 'Seed',100); 31 | %% Plot training output 32 | % Plot lowerbound 33 | figure 34 | plot(mdl.out.lbBar,'LineWidth',2) 35 | title('Lowerbound of Variational Approximation','FontSize',20) 36 | xlabel('Iterations','FontSize',14,'FontWeight','bold') 37 | ylabel('Lowerbound','FontSize',14,'FontWeight','bold') 38 | grid on 39 | 40 | % Plot shrinkage coefficients 41 | figure 42 | deepGLMplot('Shrinkage',mdl.out.shrinkage,... 43 | 'Title','Shrinkage Coefficients',... 44 | 'Xlabel','Iterations',... 
45 | 'LineWidth',2); 46 | 47 | %% Prediction on test data 48 | % Make prediction (point estimation) on a test set 49 | Pred1 = deepGLMpredict(mdl,X_test); 50 | 51 | % If ytest is specified (for model evaluation purpose) 52 | % then we can check PPS and MSE on test set 53 | Pred2 = deepGLMpredict(mdl,X_test,'ytest',y_test); 54 | disp(['PPS on test data: ',num2str(Pred2.pps)]) 55 | disp(['Classification rate on test data: ',num2str(Pred2.accuracy)]) 56 | 57 | % Plot ROC curve 58 | figure 59 | deepGLMplot('ROC',Pred2.yProb,... 60 | 'ytest',y_test,... 61 | 'Title','ROC',... 62 | 'Xlabel','False Positive Rate',... 63 | 'Ylabel','True Positive Rate') 64 | 65 | %% Compare to linear model 66 | figure 67 | mdlLR = fitglm(X,y,'Distribution','binomial','Link','logit'); 68 | yProb = predict(mdlLR,X_test); 69 | deepGLMplot('ROC',[Pred2.yProb,yProb],... 70 | 'ytest',y_test,... 71 | 'Title','ROC',... 72 | 'Xlabel','False Positive Rate',... 73 | 'Ylabel','True Positive Rate',... 74 | 'legend',{'deepGLM','Logistic Regression'}) 75 | 76 | -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMNormalExample.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Matlab/Examples/deepGLMNormalExample.mlx -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMNormalExampleScript.m: -------------------------------------------------------------------------------- 1 | % Examples demonstate how to use deepGLM function to fit data with continuos 2 | % dependent variable 3 | % 4 | % Copyright 2018 5 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 6 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 7 | % 8 | % https://github.com/VBayesLab/deepGLM 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: May, 2018 12 | 13 | clear 14 | clc 15 | 16 | % load data 17 | % load('../Data/dataSimulationContinuous.mat') 18 | % load('../Data/dataSimulationContinuousEasy.mat') 19 | % load('../Data/DirectMarketing.mat') 20 | % load('../Data/SchoolingDataBART.mat') 21 | % load('../Data/SchoolingDataDeepGLM.mat') 22 | % load('../Data/OnlineBART.mat') 23 | % load('../Data/HILDABart.mat') 24 | load('../Data/abalone.mat') 25 | 26 | %% Fit deepGLM model using default setting 27 | % By default, if 'distribution' option is not specified then deepGLMfit 28 | % will assign the response variables as 'normal' 29 | nn = [10,10]; 30 | mdl = deepGLMfit(X,y,... 31 | 'Network',nn,... 32 | 'Lrate',0.008,... 33 | 'Verbose',1,... % Display training result each iteration 34 | 'BatchSize',1000,... % Use entire training data as mini-batch 35 | 'MaxEpoch',10000,... 36 | 'Patience',100,... % Higher patience values could lead to overfitting 37 | 'Seed',NaN,... 38 | 'WindowSize',100); 39 | 40 | %% Plot training output 41 | figure 42 | plot(mdl.out.lbBar,'LineWidth',2) 43 | title('Lowerbound of Variational Approximation','FontSize',0.5) 44 | xlabel('Iterations','FontSize',0.2,'FontWeight','bold') 45 | ylabel('Lowerbound','FontSize',0.2,'FontWeight','bold') 46 | grid on 47 | 48 | %% Plot shrinkage coefficients 49 | figure 50 | deepGLMplot('Shrinkage',mdl.out.shrinkage,... 51 | 'Title','Shrinkage Coefficients',... 52 | 'Xlabel','Iterations',... 
53 | 'LineWidth',2); 54 | 55 | %% Prediction on test data 56 | % Make prediction (point estimation) on a test set 57 | disp('---------- Prediction ----------') 58 | Pred1 = deepGLMpredict(mdl,X_test); 59 | 60 | % If ytest is specified (for model evaluation purpose) 61 | % then we can check PPS and MSE on test set 62 | Pred2 = deepGLMpredict(mdl,X_test,'ytest',y_test); 63 | disp(['PPS on test set using deepGLM is: ',num2str(Pred2.pps)]) 64 | disp(['MSE on test set using deepGLM is: ',num2str(Pred2.mse)]) 65 | 66 | % You can also perform point and interval estimation for a single test observation 67 | idx = randi(length(y_test)); % Pick a random test data observation 68 | dataTest = X_test(idx,:); 69 | Pred3 = deepGLMpredict(mdl,dataTest,... 70 | 'Interval',1,... 71 | 'Nsample',1000); 72 | disp(['Prediction Interval: [',num2str(Pred3.interval(1)),... 73 | ';',num2str(Pred3.interval(2)),']',]); 74 | disp(['True value: ',num2str(y_test(idx))]); 75 | 76 | 77 | % Estimate prediction interval for entire test data 78 | Pred4 = deepGLMpredict(mdl,X_test,... 79 | 'ytest',y_test,... 80 | 'Interval',1,... 81 | 'Nsample',1000); 82 | y_pred = mean(Pred4.yhatMatrix)'; 83 | mse2 = mean((y_test-y_pred).^2); 84 | accuracy = (y_testPred4.interval(:,1)); 85 | disp(['Prediction Interval accuracy: ',num2str(sum(accuracy)/length(accuracy))]); 86 | 87 | %% Plot prediction interval 88 | figure 89 | deepGLMplot('Interval',Pred4,... 90 | 'Title','Prediction Interval of Schooling Test Data',... 91 | 'Xlabel','Observations',... 92 | 'Ylabel','Wage($1000)',... 93 | 'Nsample',60); 94 | 95 | %% Plot prediction interval with true response 96 | figure 97 | deepGLMplot('Interval',Pred4,... 98 | 'ytest',y_test,... 99 | 'Title','Prediction Interval for Test Data',... 100 | 'Xlabel','Observations',... 101 | 'Ylabel','Wage($1000)',... 102 | 'Nsample',40); 103 | 104 | -------------------------------------------------------------------------------- /Matlab/Examples/deepGLMPoissonExampleScript.m: -------------------------------------------------------------------------------- 1 | % Examples demonstate how to use deepGLM function to fit data with Poisson 2 | % dependent variable 3 | % 4 | % Copyright 2018 5 | % Nghia Nguyen (nghia.nguyen@sydney.edu.au) 6 | % Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 7 | % 8 | % https://github.com/VBayesLab/deepGLM 9 | % 10 | % Version: 1.0 11 | % LAST UPDATE: May, 2018 12 | 13 | clear 14 | clc 15 | 16 | clear 17 | clc 18 | 19 | % load data 20 | % load('../Data/BikeSharingDeepGLM.mat') 21 | load('../Data/abalone.mat') 22 | 23 | 24 | %% Fit deepGLM model using default setting 25 | nn = [10,10]; 26 | mdl = deepGLMfit(X,y,... 27 | 'Distribution','poisson',... 28 | 'Network',nn,... 29 | 'Lrate',0.005,... 30 | 'BatchSize',size(X,1),... 31 | 'MaxEpoch',2000,... 32 | 'Patience',50,... 33 | 'Verbose',10,... 34 | 'Seed',1000); 35 | 36 | %% Plot training output 37 | % Plot lowerbound 38 | figure 39 | plot(mdl.out.lbBar,'LineWidth',2) 40 | title('Lowerbound of Variational Approximation','FontSize',20) 41 | xlabel('Iterations','FontSize',14,'FontWeight','bold') 42 | ylabel('Lowerbound','FontSize',14,'FontWeight','bold') 43 | grid on 44 | 45 | % Plot shrinkage coefficients 46 | figure 47 | deepGLMplot('Shrinkage',mdl.out.shrinkage,... 48 | 'Title','Shrinkage Coefficients',... 49 | 'Xlabel','Iterations',... 
50 | 'LineWidth',2); 51 | 52 | 53 | %% Prediction on test data 54 | % Make prediction (point estimation) on a test set 55 | Pred1 = deepGLMpredict(mdl,X_test); 56 | 57 | % If ytest is specified (for model evaluation purpose) 58 | % then we can check PPS and MSE on test set 59 | Pred2 = deepGLMpredict(mdl,X_test,'ytest',y_test); 60 | disp(['PPS on test data: ',num2str(Pred2.pps)]) 61 | disp(['Mean Square Error on test data: ',num2str(Pred2.mse)]) 62 | 63 | %% Compare with GLM Poisson 64 | mdlGLM = glmfit(X,y,'poisson'); 65 | X_test = [ones(size(X_test,1),1) X_test]; 66 | y_pred = exp(X_test*mdlGLM); 67 | ppsGLM = mean(-y_test'*X_test*mdlGLM + sum(y_pred)); 68 | mseGLM = mean((y_test-y_pred).^2); 69 | 70 | 71 | -------------------------------------------------------------------------------- /Python/DirectMarketing.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Python/DirectMarketing.mat -------------------------------------------------------------------------------- /Python/__pycache__/deepGLM.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Python/__pycache__/deepGLM.cpython-37.pyc -------------------------------------------------------------------------------- /Python/deepGLM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VBayesLab/deepGLM/9fd8c4277e8b30956fa9e191fa05e8aa79970691/Python/deepGLM.pdf -------------------------------------------------------------------------------- /R/02_libs/checkInput.R: -------------------------------------------------------------------------------- 1 | # Function to check if user input valid settings 2 | checkInput <- function(X,y,est){ 3 | # Check if X is null 4 | if(is.null(X)){ 5 | stop("'X' is missing") 6 | } 7 | 8 | # Check if y is null 9 | if(is.null(y)){ 10 | stop("'y' is missing") 11 | } 12 | 13 | # Check if there are NaN values in X 14 | 15 | # Check if there are NaN values in y 16 | 17 | # Check if S is a positive integer 18 | 19 | # Check if BatchSize is a positive integer 20 | 21 | # Check if Lrate is a positive numerical number 22 | 23 | # Check if MaxEpoch is positive integer 24 | 25 | # Check if LRateFactor is positive integer 26 | 27 | # Check if Patience is positive integer 28 | 29 | # Check if Network is a vector of integer 30 | 31 | # Check if Distribution is one of {"normal","binomial","poisson"} 32 | 33 | # Check if Seed is an integer 34 | 35 | # Check if Intercept is a logical number 36 | 37 | # Check if Momentum is a number from 0 to 1 38 | 39 | # Check if Verbose is positive integer 40 | 41 | # Check if WindowSize is positive integer 42 | 43 | 44 | } 45 | -------------------------------------------------------------------------------- /R/02_libs/deepGLMTrain.R: -------------------------------------------------------------------------------- 1 | # Function to train deepGLM model 2 | deepGLMTrain <- function(X_train,y_train,est){ 3 | 4 | # Extract model parameters provided by users 5 | n_units <- est$network 6 | batchsize <- est$batchsize 7 | lrate <- est$lrate 8 | S <- est$S # Number of Monte Carlo samples to estimate the gradient 9 | tau <- est$tau # Threshold before reducing constant learning rate eps0 10 | grad_weight <- est$momentum # Weight in the momentum 11 | cScale <- 0.01 # Random 
scale factor to initialize b,c 12 | patience <- est$patience # Stop if test error not improved after patience_parameter iterations 13 | epoch <- est$epoch # Number of times learning algorithm scan entire training data 14 | verbose <- est$verbose 15 | distr <- est$dist 16 | LBwindow <- est$windowSize 17 | seed <- est$seed 18 | 19 | # Set random seed if specified 20 | if(!is.nan(seed)){ 21 | set.seed(seed) 22 | # set.generator("MersenneTwister", initialization="init2002", resolution=53, seed=seed) 23 | } 24 | 25 | # Data merge for mini-batch sampling 26 | data <- cbind(y_train,X_train) 27 | datasize <- nrow(X_train) 28 | num1Epoch <- round(datasize/batchsize) # Number of iterations per epoch 29 | 30 | # Network parameters 31 | L <- length(n_units) # Number of hidden layers 32 | p <- ncol(X_train)-1 # Number of covariates 33 | W_seq <- vector("list",length = L) # Cells to store weight matrices 34 | index_track <- numeric(L) # Keep track of indices of Wj matrices: index_track(1) is the total elements in W1, index_track(2) is the total elements in W1 & W2,... 35 | index_track[1] <- n_units[1]*(p+1) # Size of W1 is m1 x (p+1) with m1 number of units in the 1st hidden layer 36 | W1_tilde_index <- c((n_units[1]+1):index_track[1]) # Index of W1 without biases, as the first column if W1 are biases 37 | w_tilde_index <- c() # indices of non-biase weights, excluding W1, for l2-regulization prior 38 | for (j in 2:L) { 39 | index_track[j] <- index_track[j-1]+n_units[j]*(n_units[j-1]+1) 40 | w_tilde_index <- c(w_tilde_index,(index_track[j-1]+n_units[j]+1):index_track[j]) 41 | } 42 | d_w <- index_track[L] # Total number of weights up to (and including) the last layer 43 | d_beta <- n_units[L]+1 # Dimension of the weights beta connecting the last layer to the output 44 | d_theta <- d_w+d_beta # Total number of parameters 45 | w_tilde_index <- c(w_tilde_index,((d_w+2):d_theta)) 46 | d_w_tilde <- length(w_tilde_index) 47 | 48 | # Initialise weights and set initial mu equal to initial weights 49 | layers <- c(ncol(X_train),n_units,1) # Full structure of NN -> [input,hidden,output] 50 | weights <- nnInitialize(layers) 51 | mu <- c() # Mean of variational distribution 52 | for (i in 1:(length(layers)-1)) { 53 | temp <- weights[[i]] 54 | mu <- c(mu,c(temp)) 55 | } 56 | 57 | # Initialize b and c and lambda 58 | b <- runif(d_theta, min=0, max=cScale) 59 | c <- cScale*rep(1,d_theta) 60 | lambda <- c(mu,b,c) 61 | 62 | # Separate weigths to 2 list: one for last hidden layers to output layer and for the rest 63 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 64 | W_seq[[1]] <- W1 65 | for (j in 2:L) { 66 | index <- (index_track[j-1]+1):index_track[j] 67 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 68 | W_seq[[j]] <- Wj 69 | } 70 | beta <- mu[(d_w+1):d_theta] 71 | 72 | # Get mini-batch 73 | idx <- sample.int(datasize,batchsize,replace = T) 74 | y <- y_train[idx,] 75 | X <- X_train[idx,] 76 | # X <- X_train 77 | # y <- y_train 78 | 79 | # Hyperparameters for inverse-Gamma prior on sigma2 if y~Nomal(0,sigma2) 80 | mean_sigma2_save <- c() 81 | if(distr == "normal"){ 82 | alpha0_sigma2 <- 10 83 | beta0_sigma2 <- (alpha0_sigma2-1)*sd(y) 84 | alpha_sigma2 <- alpha0_sigma2 + 0.5*length(y_train) # Optimal VB parameter for updating sigma2 85 | beta_sigma2 <- alpha_sigma2 # Mean_sigma2 and mean_sigma2_inverse are 86 | # Initialised at small values 1/2 and 1 respectively 87 | mean_sigma2_inverse <- alpha_sigma2/beta_sigma2 88 | mean_sigma2 <- beta_sigma2/(alpha_sigma2-1) 89 | mean_sigma2_save[1] <- mean_sigma2 90 | } 
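  # With y ~ Normal(., sigma2) and an Inverse-Gamma(alpha0_sigma2, beta0_sigma2) prior,
  # the variational posterior for sigma2 is Inverse-Gamma with
  #   alpha_sigma2 = alpha0_sigma2 + length(y_train)/2   (fixed above)
  #   beta_sigma2  = beta0_sigma2 + SSE/2                (updated inside the training loop)
  # giving E[1/sigma2] = alpha_sigma2/beta_sigma2 and E[sigma2] = beta_sigma2/(alpha_sigma2-1).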
91 | 92 | # Calculations for group Lasso coefficients 93 | shrinkage_gamma <- .01*rep(1,p) # Initialise gamma_beta, the shrinkage parameters 94 | shrinkage_l2 <- .01 # Hype-parameter for L2 prior 95 | mu_tau <- rep(0,p) # Parameters for the auxiliary tau_j 96 | mu_matrixW1_tilde <- matrix(mu[W1_tilde_index],n_units[1],p) 97 | b_matrixW1_tilde <- matrix(b[W1_tilde_index],n_units[1],p) 98 | 99 | c_matrixW1_tilde <- matrix(c[W1_tilde_index],n_units[1],p) 100 | for (j in 1:p) { 101 | mean_column_j_tilde <- mu_matrixW1_tilde[,j] %*% mu_matrixW1_tilde[,j] + 102 | b_matrixW1_tilde[,j] %*% b_matrixW1_tilde[,j] + 103 | sum(c_matrixW1_tilde[,j]^2) 104 | mu_tau[j] <- shrinkage_gamma[j]/sqrt(mean_column_j_tilde) 105 | } 106 | lambda_tau <- shrinkage_gamma^2 107 | mean_inverse_tau <- mu_tau # VB mean <1/tau_j> 108 | shrinkage_gamma_seq <- shrinkage_gamma 109 | mean_tau <- 1/mu_tau + 1/lambda_tau 110 | m <- n_units[1] 111 | 112 | # Prepare to calculate lowerbound 113 | if(distr=="normal"){ 114 | const <- alpha0_sigma2*log(beta0_sigma2) - lgamma(alpha0_sigma2) - 115 | 0.5*p*n_units[1]*log(2*pi) - 0.5*d_w_tilde*log(2*pi) - 116 | p*lgamma((n_units[1]+1)/2) - 0.5*datasize*log(2*pi) + 117 | p/2*log(2*pi) + 0.5*d_theta*log(2*pi) + d_theta/2 118 | }else{ 119 | const <- -0.5*p*n_units[1]*log(2*pi) - 0.5*d_w_tilde*log(2*pi)- 120 | p*lgamma((n_units[1]+1)/2) + p/2*log(2*pi)+ 121 | 0.5*d_theta*log(2*pi) + d_theta/2 122 | } 123 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 124 | W_seq[[1]] <- W1 125 | for (j in 2:L) { 126 | index <- (index_track[j-1]+1):index_track[j] 127 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 128 | W_seq[[j]] <- Wj 129 | } 130 | beta <- mu[(d_w+1):d_theta] 131 | mu_w_tilde <- mu[w_tilde_index] 132 | b_w_tilde <- b[w_tilde_index] 133 | c_w_tilde <- c[w_tilde_index] 134 | mean_w_tilde <- c(mu_w_tilde %*% mu_w_tilde + b_w_tilde %*% b_w_tilde + sum(c_w_tilde^2)) 135 | iter <- 1 136 | 137 | # calculate analytical terms of lowerbound 138 | constMean <- vbLowerBound(b,c,distr,p,beta_sigma2,alpha_sigma2,alpha0_sigma2,beta0_sigma2, 139 | mean_sigma2_inverse,n_units,shrinkage_gamma,mean_tau,datasize, 140 | lambda_tau,d_w_tilde,shrinkage_l2,mean_w_tilde,mean_column_j_tilde, 141 | mean_inverse_tau) 142 | 143 | # Calculate gradient of lowerbound and lowerbound of the first iteration 144 | lb <- c() 145 | grad_g_lik_store <- matrix(0,S,3*d_theta) 146 | lb_iter <- matrix(0,1,S) 147 | iter <- 1 148 | gradient_lambda <- vbGradientLogLB(X,y,b,c,mu,S,p,L,d_theta,d_w,index_track,n_units,mean_inverse_tau, 149 | shrinkage_l2,datasize,distr,mean_sigma2_inverse,constMean, 150 | const,grad_g_lik_store,lb_iter,iter) 151 | gradient_bar <- gradient_lambda$gradient_lambda 152 | lb[iter] <- mean(gradient_lambda$lb_iter)/datasize 153 | cat("Initial LB: ", lb[iter],'\n') 154 | 155 | #--------------------------Training Phase----------------------------- 156 | # Prepare parameters for training 157 | idxEpoch <- 0 # Index of current epoch 158 | iter <- 1 # Index of current iteration 159 | stop <- FALSE # Stop flag for early stopping 160 | lambda_best <- lambda # Store optimal lambda for output 161 | idxPatience <- 0 # Index of number of consequent non-decreasing 162 | # iterations for early stopping 163 | mean_column_j_tilde <- matrix(0,1,p) 164 | lb_bar <- c() 165 | 166 | print("---------- Training Phase ----------") 167 | while (!stop) { 168 | iter <- iter+1 169 | 170 | # Extract mini-batch 171 | idx <- sample.int(datasize,batchsize,replace = T) 172 | y <- y_train[idx,] 173 | X <- X_train[idx,] 174 | # X <- 
X_train 175 | # y <- y_train 176 | 177 | # Calculate analytical terms of lowerbound 178 | constMean <- vbLowerBound(b,c,distr,p,beta_sigma2,alpha_sigma2,alpha0_sigma2,beta0_sigma2, 179 | mean_sigma2_inverse,n_units,shrinkage_gamma,mean_tau,datasize, 180 | lambda_tau,d_w_tilde,shrinkage_l2,mean_w_tilde,mean_column_j_tilde, 181 | mean_inverse_tau) 182 | 183 | # Calculate Natural Gradient 184 | grad_lb <- vbGradientLogLB(X,y,b,c,mu,S,p,L,d_theta,d_w,index_track,n_units,mean_inverse_tau, 185 | shrinkage_l2,datasize,distr,mean_sigma2_inverse,constMean, 186 | const,grad_g_lik_store,lb_iter,iter) 187 | gradient_lambda = grad_lb$gradient_lambda 188 | lb[iter] <- mean(grad_lb$lb_iter)/datasize 189 | 190 | # Prevent exploding Gradient 191 | grad_norm <- sqrt(sum(gradient_lambda^2)) 192 | norm_gradient_threshold <- 100 193 | if(grad_norm > norm_gradient_threshold){ 194 | gradient_lambda <- (norm_gradient_threshold/grad_norm)*gradient_lambda 195 | } 196 | 197 | # Momentum gradient 198 | gradient_bar_old <- gradient_bar 199 | gradient_bar <- grad_weight*gradient_bar+(1-grad_weight)*gradient_lambda 200 | 201 | # Adaptive learning rate 202 | if(iter>tau){ 203 | stepsize <- lrate*tau/iter 204 | }else{ 205 | stepsize <- lrate 206 | } 207 | 208 | # Gradient ascend 209 | lambda <- lambda + stepsize*gradient_bar 210 | 211 | # Restore model parameters from variational parameter lambda 212 | mu <- lambda[1:d_theta] 213 | b <- lambda[(d_theta+1):(2*d_theta)] 214 | c <- lambda[(2*d_theta+1):length(lambda)] 215 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 216 | W_seq[[1]] <- W1 217 | for (j in 2:L){ 218 | index <- (index_track[j-1]+1):index_track[j] 219 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 220 | W_seq[[j]] <- Wj 221 | } 222 | beta <- mu[(d_w+1):d_theta] 223 | 224 | # Update tau and shrinkage parameters 225 | if(iter%%1 == 0){ 226 | mu_matrixW1_tilde <- matrix(mu[W1_tilde_index],n_units[1],p) 227 | b_matrixW1_tilde <- matrix(b[W1_tilde_index],n_units[1],p) 228 | c_matrixW1_tilde <- matrix(c[W1_tilde_index],n_units[1],p) 229 | for (j in 1:p) { 230 | mean_column_j_tilde[j] <- mu_matrixW1_tilde[,j] %*% mu_matrixW1_tilde[,j] + 231 | b_matrixW1_tilde[,j] %*% b_matrixW1_tilde[,j] + 232 | sum(c_matrixW1_tilde[,j]^2) 233 | mu_tau[j] <- shrinkage_gamma[j]/sqrt(mean_column_j_tilde[j]) 234 | lambda_tau[j] <- shrinkage_gamma[j]^2 235 | } 236 | mean_inverse_tau <- mu_tau 237 | mean_tau <- 1/mu_tau + 1/lambda_tau 238 | shrinkage_gamma <- sqrt((n_units[1]+1)/mean_tau) 239 | shrinkage_gamma_seq <- cbind(shrinkage_gamma_seq,shrinkage_gamma) 240 | 241 | mu_w_tilde <- mu[w_tilde_index] 242 | b_w_tilde <- b[w_tilde_index] 243 | c_w_tilde <- c[w_tilde_index] 244 | mean_w_tilde <- c(mu_w_tilde %*% mu_w_tilde + b_w_tilde %*% b_w_tilde + sum(c_w_tilde^2)) 245 | } 246 | 247 | # Update VB posterior for sigma2, which is inverse Gamma 248 | if(distr=="normal"){ 249 | if (iter%%1 == 0){ 250 | sum_squared <- nnSumResidualSquare(y_train,X_train,W_seq,beta) 251 | beta_sigma2 <- beta0_sigma2 + sum_squared/2 252 | mean_sigma2_inverse <- alpha_sigma2/beta_sigma2 253 | mean_sigma2 <- beta_sigma2/(alpha_sigma2-1) 254 | mean_sigma2_save <- c(mean_sigma2_save,mean_sigma2) 255 | } 256 | } 257 | 258 | # Using lowerbound for validation 259 | if(iter>LBwindow){ 260 | lb_bar[iter-LBwindow] <- mean(lb[(iter-LBwindow+1):iter]) 261 | if(lb_bar[length(lb_bar)]>=max(lb_bar)){ 262 | lambda_best <- lambda 263 | idxPatience <- 0 264 | }else{ 265 | idxPatience <- idxPatience + 1 266 | } 267 | } 268 | 269 | # Early stopping 270 | 
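    # (moving-average criterion): stop when the smoothed lower bound lb_bar has not
    # reached a new maximum for more than 'patience' consecutive iterations, or when
    # the maximum number of epochs is exceeded; lambda_best, the parameters at the best
    # smoothed lower bound seen so far, is returned as the final estimate.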
if((idxPatience>patience)||(idxEpoch>epoch)){ 271 | stop <- TRUE 272 | } 273 | 274 | # Display epoch index whenever an epoch is finished 275 | if(iter%%num1Epoch==0){ 276 | idxEpoch <- idxEpoch + 1 277 | } 278 | 279 | # Display training results after each 'verbose' iteration 280 | if (verbose && iter%%verbose==0){ 281 | if(iter>LBwindow){ 282 | cat("Epoch: ", idxEpoch, " - Current LB: ",lb_bar[iter-LBwindow],"\n") 283 | # message("Epoch: ", idxEpoch, " - Current LB: ",lb_bar[iter-LBwindow]) 284 | } 285 | else{ 286 | cat("Epoch: ", idxEpoch, "- Current LB: ",lb[iter],"\n") 287 | # message("Epoch: ", idxEpoch, " - Current LB: ",lb[iter]) 288 | } 289 | } 290 | } 291 | 292 | # Display Training Results 293 | print('---------- Training Completed! ----------') 294 | cat("Number of iteration: ",iter,'\n') 295 | cat("LBBar best: ",max(lb_bar),'\n') 296 | # message("Number of iteration: ",iter) 297 | # message("LBBar best: ",max(lb_bar)) 298 | 299 | # Store training output 300 | lambda <- lambda_best 301 | mu <- lambda[1:d_theta] 302 | b <- lambda[(d_theta+1):(2*d_theta)] 303 | c <- lambda[(2*d_theta+1):length(lambda)] 304 | SIGMA = cbind(b) %*% b + diag(c^2) 305 | 306 | W1 <- matrix(mu[1:index_track[1]],n_units[1],p+1) 307 | W_seq[[1]] <- W1 308 | for (j in 2:L){ 309 | index <- (index_track[j-1]+1):index_track[j] 310 | Wj <- matrix(mu[index],n_units[j],n_units[j-1]+1) 311 | W_seq[[j]] <- Wj 312 | } 313 | beta <- mu[(d_w+1):d_theta] 314 | 315 | # Store output in a struct 316 | est$out.weights <- W_seq 317 | est$out.beta <- beta 318 | est$out.shrinkage <- shrinkage_gamma_seq 319 | colnames(est$out.shrinkage) <- NULL 320 | est$out.iteration <- iter 321 | est$out.vbMU <- mu # Mean of variational distribution of weights 322 | est$out.b <- b 323 | est$out.c <- c 324 | est$out.vbSIGMA <- SIGMA # Covariance matrix of variational distribution of weights 325 | est$out.nparams <- d_theta # Number of parameters 326 | est$out.indexTrack <- index_track 327 | est$out.muTau <- mu_tau 328 | 329 | if(distr=="normal"){ 330 | est$out.sigma2Alpha <- alpha_sigma2 331 | est$out.sigma2Beta <- beta_sigma2 332 | est$out.sigma2Mean <- mean_sigma2_save[length(mean_sigma2_save)] 333 | est$out.sigma2MeanIter <- mean_sigma2_save 334 | } 335 | est$out.lbBar <- lb_bar[2:length(lb_bar)] 336 | est$out.lb <- lb 337 | 338 | return(est) 339 | } 340 | -------------------------------------------------------------------------------- /R/02_libs/deepGLMfit.R: -------------------------------------------------------------------------------- 1 | deepGLMfit <- function(X,y, Lrate=0.01, Network=c(10,10) , BatchSize=5000, 2 | S=10, LRateFactor=10000, Momentum=0.6, Patience=100, 3 | MaxEpoch = 100, Verbose=10, Distribution="normal", 4 | WindowSize=100, Seed=NaN, Intercept=TRUE){ 5 | 6 | # Store training settings in a list 7 | est <- list() 8 | est$S <- S 9 | est$lrate <- Lrate 10 | est$epoch <- MaxEpoch 11 | est$tau <- LRateFactor 12 | est$patience <- Patience 13 | est$network <- Network 14 | est$dist <- Distribution 15 | est$seed <- Seed 16 | est$icept <- Intercept 17 | est$momentum <- Momentum 18 | est$verbose <- Verbose 19 | est$windowSize <- WindowSize 20 | 21 | # Check if inputs are corrected 22 | checkInput(X,y,est) 23 | 24 | # Calculate batch size 25 | if(BatchSize<=1){ # If specified batchsize is a propotion 26 | BatchSize <- BatchSize * nrow(X) 27 | } 28 | if(BatchSize>=nrow(X)){ 29 | BatchSize <- nrow(X) 30 | } 31 | est$batchsize <- BatchSize 32 | 33 | # Insert intercepts if Intercept=TRUE 34 | if(Intercept){ 35 | X <- 
cbind(matrix(1,nrow(X),1),X) 36 | } 37 | 38 | #Start to train deepGLM 39 | y <- as.matrix(y) 40 | t_start <- Sys.time() 41 | est <- deepGLMTrain(X,y,est) 42 | t_stop <- Sys.time() 43 | est$out.CPU <- t_stop - t_start 44 | cat("Training time: ",est$out.CPU,'\n') 45 | 46 | return(est) 47 | } 48 | -------------------------------------------------------------------------------- /R/02_libs/deepGLMpredict.R: -------------------------------------------------------------------------------- 1 | # Function to make prediction on an unseen data using a trained DeepGLM model 2 | # Input: 3 | deepGLMpredict <- function(mdl,X,y=NULL,Interval=0,Nsample=1000,Intercept=TRUE){ 4 | 5 | # Transform X to a row matrix 6 | if(is.numeric(X)){ 7 | X <- rbind(X) 8 | } 9 | 10 | # If y is specify, check y 11 | 12 | # Store Nsample to mdl 13 | mdl$Nsample <- Nsample 14 | 15 | # If training data does not include intercepts, the add intercepts 16 | N <- nrow(X) # Number of observation in test data 17 | if(Intercept){ 18 | X <- cbind(matrix(1,N,1),X) 19 | } 20 | 21 | alpha <- Interval 22 | 23 | # Load deepGLM params from struct 24 | W_seq <- mdl$out.weights 25 | beta <- mdl$out.beta 26 | distr <- mdl$dist 27 | 28 | # Calculate Neuron Network output 29 | nnet_output <- nnFeedForward(X,W_seq,beta) # Output vector of NN 30 | out <- list() 31 | 32 | if(distr=="normal"){ 33 | out$yhat = nnet_output # Prediction for continuous response 34 | # If ytest if provided, then calculate pps and mse 35 | if(length(y)>0){ 36 | sigma2 <- mdl$out.sigma2Mean 37 | mse <- mean((y-nnet_output)^2) 38 | pps <- 1/2*log(sigma2) + 1/2/sigma2*mse 39 | out$mse <- mse 40 | out$pps <- pps 41 | } 42 | # Calculate confidence interval if required 43 | if(alpha!=0){ 44 | interval <- predictionInterval(mdl,X,alpha) 45 | out$interval <- interval$interval 46 | out$yhatMatrix <- interval$yhatMC 47 | } 48 | 49 | }else if(distr=="binomial"){ 50 | out$yNN <- nnet_output 51 | out$yProb <- exp(nnet_output)/(1+exp(nnet_output)) 52 | y_pred <- as.numeric(nnet_output>0) # Prediction for binary response 53 | out$yhat <- y_pred 54 | #If ytest if provided, then calculate pps and mse 55 | if(length(y)>0){ 56 | pps <- mean(-y*nnet_output+log(1+exp(nnet_output))) 57 | cr <- mean(y==y_pred) # Miss-classification rate 58 | out$pps <- pps 59 | out$accuracy <- cr 60 | } 61 | 62 | }else if(distr=="poisson"){ 63 | out$yNN <- nnet_output 64 | y_pred <- exp(nnet_output) # Prediction for poisson response 65 | out$yhat <- y_pred 66 | if(length(y)>0) 67 | pps <- mean(-y*nnet_output+exp(nnet_output)) 68 | mse <- mean((y-y_pred)^2) 69 | out$mse <- mse 70 | out$pps <- pps 71 | 72 | }else{ 73 | message("Distribution must be: normal, binomial, poisson") 74 | } 75 | return(out) 76 | } 77 | -------------------------------------------------------------------------------- /R/02_libs/nnActivation.R: -------------------------------------------------------------------------------- 1 | # Function to calculate activation function 2 | # Input must be a matrix 3 | nnActivation <- function(a,func){ 4 | switch (func, 5 | Linear = {out <- a}, 6 | Sigmoid = {out <- 1/(1+exp(-a))}, 7 | ReLU = {out <- pmax(a,0)}, 8 | defaut) 9 | return(out) 10 | } 11 | -------------------------------------------------------------------------------- /R/02_libs/nnActivationGrad.R: -------------------------------------------------------------------------------- 1 | # Function to calculate derivative of activation function 2 | # Input must be a matrix 3 | nnActivationGrad <- function(a,func){ 4 | switch (func, 5 | Linear = 
{out <- matrix(1,nrow(a),ncol(a))}, 6 | Sigmoid = {out <- 1/(1+exp(-a))}, 7 | ReLU = {out <- (a>0)*1}, 8 | defaut) 9 | return(out) 10 | } 11 | -------------------------------------------------------------------------------- /R/02_libs/nnBackPropagation.R: -------------------------------------------------------------------------------- 1 | # Function to calculate backbrop of a DFNN 2 | # Input: 3 | # X,y -> Matrix 4 | # W_seq -> list of matrices 5 | # beta -> vector 6 | # distr -> character 7 | nnBackPropagation <- function(X,y,W_seq,beta,distr){ 8 | output = list() 9 | n_train <- nrow(X) # Number of mini-batch training observation 10 | L <- length(W_seq) # Number of hidden layers until the last layer 11 | a_seq <- vector("list", length = L) 12 | Z_seq <- vector("list",length = L) 13 | 14 | a_seq[[1]] <- W_seq[[1]] %*% t(X) 15 | Z_seq[[1]] <- rbind(matrix(1,1,n_train),nnActivation(a_seq[[1]],"ReLU")) 16 | 17 | for(j in 2:L){ 18 | a_seq[[j]] <- W_seq[[j]] %*% Z_seq[[j-1]] 19 | Z_seq[[j]] <- rbind(matrix(1,1,n_train),nnActivation(a_seq[[j]],"ReLU")) 20 | } 21 | delta_seq = vector("list", length = L+1) 22 | 23 | # Calculate error at the output layers according to distribution family of response 24 | nnOut = beta %*% Z_seq[[L]] 25 | switch(distr, 26 | normal = {delta_seq[[L+1]] <- t(y) - nnOut}, 27 | binomial = {p_i <- 1/(1+exp(-nnOut)) 28 | delta_seq[[L+1]] <- t(y) - p_i}, 29 | poisson = {delta_seq[[L+1]] <- t(y) - exp(nnOut)}, 30 | default) 31 | delta_seq[[L]] <- (beta[2:length(beta)] %*% delta_seq[[L+1]]) * nnActivationGrad(a_seq[[L]],"ReLU") 32 | 33 | for (j in (L-1):1) { 34 | Wj_tilde <- W_seq[[j+1]] 35 | Wj_tilde <- Wj_tilde[,2:ncol(Wj_tilde)] 36 | delta_seq[[j]] <- nnActivationGrad(a_seq[[j]],"ReLU")*(t(Wj_tilde) %*% delta_seq[[j+1]]) 37 | } 38 | gradient_W1 <- delta_seq[[1]] %*% X 39 | gradient <- c(gradient_W1) 40 | # dim(gradient) <- c(ncol(gradient)*nrow(gradient),1) 41 | for (j in 2:L) { 42 | gradient_Wj <- c(delta_seq[[j]] %*% t(Z_seq[[j-1]])) 43 | # dim(gradient_Wj) <- c(ncol(gradient_Wj)*nrow(gradient_Wj),1) 44 | gradient <- c(gradient,gradient_Wj) 45 | } 46 | gradient <- c(gradient,c(Z_seq[[L]] %*% t(delta_seq[[L+1]]))) 47 | output$gradient <- gradient 48 | output$nnOut <- nnOut 49 | return(output) 50 | } 51 | -------------------------------------------------------------------------------- /R/02_libs/nnFeedForward.R: -------------------------------------------------------------------------------- 1 | # Function to calculate output of a DFNN 2 | nnFeedForward <- function(X,W_seq,beta){ 3 | n_train <- nrow(X) # Number of training observations 4 | # Make forward passes to all layers 5 | a <- W_seq[[1]] %*% t(X) 6 | Z <- rbind(matrix(1,1,n_train),nnActivation(a,"ReLU")) 7 | L <- length(W_seq) 8 | for (j in 2:L) { 9 | a <- W_seq[[j]] %*% Z 10 | Z <- rbind(matrix(1,1,n_train),nnActivation(a,"ReLU")) # Add biases 11 | } 12 | nnOutput <- t(Z) %*% beta 13 | return(nnOutput) 14 | } 15 | -------------------------------------------------------------------------------- /R/02_libs/nnGradLogLikelihood.R: -------------------------------------------------------------------------------- 1 | # Function to calculate gradient of log-likelihood 2 | nnGradLogLikelihood <- function(w_seq,beta,X,y,datasize,distr,mean_sigma2_inverse){ 3 | output <- list() 4 | n = nrow(X) 5 | out <- nnBackPropagation(X,y,w_seq,beta,distr) 6 | back_prop <- out$gradient 7 | nnOut <- t(out$nnOut) 8 | 9 | switch (distr, 10 | normal = {gradient_theta <- mean_sigma2_inverse*back_prop 11 | gradient <- datasize/n*gradient_theta 
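                       # scaling by datasize/n makes the mini-batch gradient an unbiased
                       # estimate of the full-data log-likelihood gradient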
}, # To compensate the variational lowerbound 12 | binomial = {gradient <- datasize/n*back_prop}, 13 | poisson = {gradient <- datasize/n*back_prop}, 14 | default) 15 | output$gradient <- gradient 16 | output$nnOut <- nnOut 17 | return(output) 18 | } 19 | -------------------------------------------------------------------------------- /R/02_libs/nnInitialize.R: -------------------------------------------------------------------------------- 1 | # Function to initialize weights for deepGLM 2 | # Input: layers is a data array specifying (input+hidden) layers 3 | # Ex: c(20,10,10) 4 | nnInitialize <- function(layers){ 5 | # stopifnot(is.integer(layers)) # layer must be array of interger 6 | num_layer <- length(layers)-1 7 | w <- vector("list",length = num_layer) # Initialize a list to store matrices of weights 8 | for (i in 1:num_layer) { 9 | b <- sqrt(6)/(layers[i]+layers[i+1]) 10 | if(i==1){ 11 | w[[i]] <- matrix(runif(layers[i+1]*(layers[i]),-b,b),layers[i+1],layers[i]) # Input layer already has bias 12 | } 13 | else{ 14 | w[[i]] <- matrix(runif(layers[i+1]*(layers[i]+1),-b,b),layers[i+1],layers[i]+1) 15 | } 16 | } 17 | return(w) 18 | } 19 | -------------------------------------------------------------------------------- /R/02_libs/nnSumResidualSquare.R: -------------------------------------------------------------------------------- 1 | # Function to calculate sum square error of 2 vector 2 | nnSumResidualSquare <- function(y,X,W_seq,beta){ 3 | nnet_output <- nnFeedForward(X,W_seq,beta) # Output vector of NN 4 | S <- sum((y-nnet_output)^2) 5 | return(S) 6 | } 7 | -------------------------------------------------------------------------------- /R/02_libs/predictionInterval.R: -------------------------------------------------------------------------------- 1 | # Calculate prediction interval for new observations 2 | predictionInterval <- function(mdl,X,zalpha){ 3 | predInterval <- list() 4 | # Load deepGLM params from struct 5 | Nsample <- mdl$Nsample 6 | mu <- mdl$out.vbMU 7 | SIGMA <- mdl$out.vbSIGMA 8 | n_units <- mdl$network 9 | index_track <- mdl$out.indexTrack 10 | alpha_sigma2 <- mdl$out.sigma2Alpha 11 | beta_sigma2 <- mdl$out.sigma2Beta 12 | 13 | # Calculate network parameters 14 | L <- length(n_units) # Number of hidden layers 15 | p <- ncol(X)-1 # Number of covariates 16 | d_beta <- n_units[L]+1 17 | d_w <- index_track[L] 18 | 19 | yhat <- matrix(0,Nsample,nrow(X)) # Predicted values of test data 20 | nnOut <- matrix(0,Nsample,nrow(X)) # Output of NN 21 | W_seq <- vector("list",length = L) 22 | for (i in 1:Nsample) { 23 | theta_i <- rmvnorm(1,mean=mu,sigma=SIGMA) # Generate samples of theta from Normal distribution 24 | sigma2_i <- 1/rgamma(1,alpha_sigma2,beta_sigma2) # Generate samples of sigma from IG distribution 25 | 26 | # For each generated theta, restore neuron net structure 27 | W1 <- matrix(theta_i[1:index_track[1]],n_units[1],p+1) 28 | W_seq[[1]] <- W1 29 | for (j in 2:L){ 30 | index <- (index_track[j-1]+1):index_track[j] 31 | Wj <- matrix(theta_i[index],n_units[j],n_units[j-1]+1) 32 | W_seq[[j]] <- Wj 33 | } 34 | beta <- theta_i[(d_w+1):(d_w+d_beta)] 35 | 36 | nnOut[i,] <- nnFeedForward(X,W_seq,beta) # Calculate neuron network output 37 | yhat[i,] <- rnorm(nrow(X),mean=nnOut[i,],sd=sqrt(sigma2_i)) # Calculate p(y|theta_i,sigma_i,X) 38 | } 39 | 40 | # 1-std prediction interval interval 41 | yhatLCL <- colMeans(yhat) - zalpha*apply(yhat, 2, sd) 42 | yhatUCL <- colMeans(yhat) + zalpha*apply(yhat, 2, sd) 43 | yhatInterval <- cbind(cbind(yhatLCL),cbind(yhatUCL)) 44 | 
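  # Each of the Nsample Monte Carlo draws above simulates theta ~ N(mu, SIGMA) and
  # sigma2 ~ Inverse-Gamma(alpha_sigma2, beta_sigma2), pushes X through the restored
  # network, and samples y | theta, sigma2; the interval bounds are
  # colMeans(yhat) -/+ zalpha*sd(yhat) taken over those draws.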
predInterval$yhatMC <- yhat 45 | predInterval$interval <- yhatInterval 46 | return(predInterval) 47 | } 48 | -------------------------------------------------------------------------------- /R/02_libs/vbGradientLogLB.R: -------------------------------------------------------------------------------- 1 | # Function to calculate the estimation of gradient of lowerbound 2 | vbGradientLogLB <- function(X,y,b,c,mu,S,p,L,d_theta,d_w,index_track,n_units,mean_inverse_tau, 3 | shrinkage_l2,datasize,distr,mean_sigma2_inverse,constMean, 4 | const,grad_g_lik_store,lb_iter,iter){ 5 | gradllh_out <- list() 6 | out <- list() 7 | batchsize <- nrow(X) 8 | # set.generator("MersenneTwister", initialization="init2002", resolution=53, seed=iter) 9 | rqmc <- matrix(rnorm(S*(d_theta+1),0,1),S,d_theta+1) 10 | for (s in 1:S) { 11 | # Calculate theta 12 | U_normal <- rqmc[s,] 13 | epsilon1 <- U_normal[1] 14 | epsilon2 <- U_normal[2:length(U_normal)] 15 | theta <- mu + epsilon1*b + c*epsilon2 16 | 17 | W_seq <- vector("list", length = L) 18 | W1 <- matrix(theta[1:index_track[1]],n_units[1],p+1) 19 | W_seq[[1]] <- W1 20 | W1_tilde <- W1[,2:ncol(W1)] # weights without biases 21 | W1_tilde_gamma <- W1_tilde %*% diag(c(mean_inverse_tau)) 22 | grad_prior_w_beta <- c(rep(0,n_units[1]),-c(W1_tilde_gamma)) 23 | for (j in 2:L) { 24 | index <- (index_track[j-1]+1):index_track[j] 25 | Wj <- matrix(theta[index],n_units[j],n_units[j-1]+1) 26 | W_seq[[j]] <- Wj 27 | Wj_tilde <- Wj[,2:ncol(Wj)] 28 | grad_prior_Wj <- c(rep(0,n_units[j]),-shrinkage_l2 %*% c(Wj_tilde)) 29 | grad_prior_w_beta <- c(grad_prior_w_beta,grad_prior_Wj) 30 | } 31 | beta <- theta[(d_w+1):d_theta] 32 | beta_tilde <- beta[2:length(beta)] # vector beta without intercept 33 | grad_prior_beta <- c(0,c(-shrinkage_l2 %*% beta_tilde)) 34 | grad_prior_w_beta <- c(grad_prior_w_beta,grad_prior_beta) 35 | 36 | if (distr=="normal"){ 37 | gradllh_out <- nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr,mean_sigma2_inverse) 38 | }else if(distr=="binomial"){ 39 | gradllh_out <- nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr) 40 | }else if(distr=="poisson"){ 41 | gradllh_out <- nnGradLogLikelihood(W_seq,beta,X,y,datasize,distr) 42 | }else{ 43 | message("Distribution must be: normal, binomial, poisson") 44 | } 45 | grad_llh <- gradllh_out$gradient 46 | yNN <- gradllh_out$nnOut 47 | 48 | grad_h <- grad_prior_w_beta + grad_llh # Gradient of log prior plus log-likelihood 49 | grad_log_q <- vbGradientLogq(b,c,theta,mu) 50 | grad_theta <- grad_h - grad_log_q 51 | grad_g_lik_store[s,] <- c(grad_theta,epsilon1*grad_theta, epsilon2*grad_theta) 52 | 53 | # Calculate Lowerbound 54 | if(distr=="normal"){ 55 | lb_iter[s] <- constMean-0.5*mean_sigma2_inverse*sum((y-yNN)^2)*datasize/batchsize + const 56 | }else if(distr=="binomial"){ 57 | lb_iter[s] <- constMean + sum(y*yNN - log(1+exp(yNN)))*datasize/batchsize + const 58 | }else if(distr=="poisson"){ 59 | lb_iter[s] <- constMean + sum(y*yNN - exp(yNN))*datasize/batchsize + const 60 | }else{ 61 | message("Distribution must be: normal, binomial, poisson") 62 | } 63 | } 64 | grad_lb <- colMeans(grad_g_lik_store) 65 | gradient_lambda <- vbNaturalGradient(b,c,grad_lb) 66 | out$lb_iter <- lb_iter 67 | out$gradient_lambda <- gradient_lambda 68 | 69 | return(out) 70 | } 71 | -------------------------------------------------------------------------------- /R/02_libs/vbGradientLogq.R: -------------------------------------------------------------------------------- 1 | # Function vbGradientLogq 2 | # b,c,theta,mu -> vector 3 | vbGradientLogq <- 
function(b,c,theta,mu){ 4 | x <- theta-mu 5 | d <- b/c^2 6 | grad_log_q <- -x/c^2 + c((d%*%x)/(1+(d%*%b)))*d 7 | } 8 | -------------------------------------------------------------------------------- /R/02_libs/vbLowerBound.R: -------------------------------------------------------------------------------- 1 | # Function to calculate lowerbound of variational distribution 2 | vbLowerBound <- function(b,c,distr,p,beta_sigma2,alpha_sigma2,alpha0_sigma2,beta0_sigma2, 3 | mean_sigma2_inverse,n_units,shrinkage_gamma,mean_tau,datasize, 4 | lambda_tau,d_w_tilde,shrinkage_l2,mean_w_tilde,mean_column_j_tilde, 5 | mean_inverse_tau){ 6 | if(distr=="normal"){ 7 | mean_log_sig2 <- log(beta_sigma2)-digamma(alpha_sigma2) 8 | logdet <- log(det(1 + (b/(c^2)) %*% b)) + sum(log(c^2)) 9 | constMean <- -(alpha0_sigma2+1)*mean_log_sig2 - beta0_sigma2*mean_sigma2_inverse+ 10 | 0.5*sum(2*(n_units[1]+1)*log(shrinkage_gamma)-(shrinkage_gamma^2)*mean_tau)- 11 | 0.5*datasize*mean_log_sig2+ 12 | lgamma(alpha_sigma2)-alpha_sigma2*log(beta_sigma2)+ 13 | (alpha_sigma2+1)*mean_log_sig2+alpha_sigma2- 14 | 0.5*(sum(log(lambda_tau))-p) + 0.5*logdet + 15 | 0.5*d_w_tilde*log(shrinkage_l2) - 0.5*shrinkage_l2*mean_w_tilde - 16 | 0.5*sum(c(mean_column_j_tilde)*c(mean_inverse_tau)) 17 | }else{ 18 | logdet = log(det(1 + (b/(c^2)) %*% b)) + sum(log(c^2)) 19 | constMean = 0.5*sum(2*(n_units[1]+1)*log(shrinkage_gamma)- 20 | (shrinkage_gamma^2)*mean_tau)-0.5*(sum(log(lambda_tau))-p)+ 21 | 0.5*logdet+0.5*d_w_tilde*log(shrinkage_l2) - 22 | 0.5*shrinkage_l2*mean_w_tilde- 23 | 0.5*sum(c(mean_column_j_tilde)*c(mean_inverse_tau)) 24 | } 25 | return(constMean) 26 | } 27 | -------------------------------------------------------------------------------- /R/02_libs/vbNaturalGradient.R: -------------------------------------------------------------------------------- 1 | # Function to calculate natural gradient 2 | # Input: 3 | # b,c,grad -> vector 4 | vbNaturalGradient <- function(b,c,grad){ 5 | d <- length(b) 6 | grad1 <- grad[1:d] 7 | grad2 <- grad[(d+1):(2*d)] 8 | grad3 <- grad[(2*d+1):length(grad)] 9 | c2 <- c^2 10 | b2 <- b^2 11 | prod1 <- c(b %*% grad1)*b + (grad1*c2) 12 | const <- sum(b2/c2) 13 | const1 <- 0.5 + 0.5/const 14 | prod2 <- c(b %*% grad2)*b + (grad2*c2) 15 | prod2 <- const1*prod2 16 | alpha <- 1/(1+const) 17 | x <- alpha*b2/c^3 18 | y <- 1/c2 - 2*alpha*(b/c2)^2 19 | aux <- x/y 20 | prod3 <- 0.5*(grad3/y-c(1/((1+sum(x^2/y)))*(aux %*% grad3)) * aux) 21 | prod <- c(prod1,prod2,prod3) 22 | return(prod) 23 | } 24 | -------------------------------------------------------------------------------- /R/deepGLMNormalExample.R: -------------------------------------------------------------------------------- 1 | # Examples demonstrate how to use deepGLM function to fit data with continuous dependent variable 2 | # 3 | # Copyright 2018 4 | # Nghia Nguyen (nghia.nguyen@sydney.edu.au) 5 | # Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 6 | # 7 | # https://github.com/VBayesLab/deepGLM 8 | # 9 | # Version: 1.0 10 | # LAST UPDATE: May, 2018 11 | 12 | # Clear all variables 13 | rm(list=ls()) 14 | gc(reset=T) 15 | 16 | # Load libs 17 | library(mvtnorm) 18 | library(rstudioapi) 19 | 20 | RootDir <- dirname(rstudioapi::getSourceEditorContext()$path) 21 | setwd(RootDir) 22 | 23 | # Source external functions 24 | source('dependencies.R') 25 | 26 | # Read data file 27 | data <- read.csv(file = "01_data/abalone.csv",header = FALSE) 28 | 29 | 30 | # Divide data to training and test sets 31 | N <- nrow(data) # Total number of observations 32 | p <- 
ncol(data) - 1 # Number of variables 33 | Ntest <- round(0.15*N) # Number of test observations 34 | idx <- sample.int(N, size = Ntest, replace = FALSE) # Sampling indexes 35 | dataTest <- data[idx,] # Test data 36 | dataTrain <- data[-idx,] # Train data 37 | XTrain <- data.matrix(dataTrain[,1:p]) # X train 38 | y <- data.matrix(dataTrain[,p+1]) # y train 39 | XTest <- data.matrix(dataTest[,1:p]) # X test 40 | yTest <- data.matrix(dataTest[,p+1]) # y test 41 | 42 | # Normalize Train and Test data 43 | meanX <- colMeans(XTrain) 44 | stdX <- apply(XTrain, 2, sd) 45 | X <- sweep(sweep(XTrain,2,meanX,'-'),2,stdX,'/') 46 | XTest <- sweep(sweep(XTest,2,meanX,'-'),2,stdX,'/') 47 | 48 | # Fit a deepGLM model 49 | deepGLMout <-deepGLMfit(X,y,Network = c(5,5,5),Seed = 100,Verbose = 1, MaxEpoch = 500) 50 | 51 | # Make prediction (point estimation) on a test set, without true labels 52 | Pred1 <- deepGLMpredict(deepGLMout,XTest) 53 | 54 | # If ytest is specified (for model evaluation purpose) then we can check PPS and MSE on test set 55 | print('----------------Prediction---------------') 56 | Pred2 <- deepGLMpredict(deepGLMout,XTest,y = yTest) 57 | cat('PPS on test set using deepGLM is: ',Pred2$pps,'\n') 58 | cat('MSE on test set using deepGLM is: ',Pred2$mse,'\n') 59 | 60 | # You can also perform point and interval estimation for a single test observation 61 | idx <- nrow(XTest) # Pick a random unseen observation 62 | dataTest <- XTest[idx,] 63 | Pred3 <- deepGLMpredict(deepGLMout,dataTest,Interval=1,Nsample=1000) # Make 1-std prediction interval 64 | cat('Prediction Interval: [',Pred3$interval[1],';',Pred3$interval[2],']','\n') 65 | cat('True value: ',yTest[idx],'\n') 66 | 67 | # Estimate prediction interval for entire test data 68 | Pred4 <- deepGLMpredict(deepGLMout,XTest,y=yTest,Interval=1,Nsample=1000) 69 | y_pred <- colMeans(Pred4$yhatMatrix) 70 | mse2 <- mean((yTest-y_pred)^2) 71 | accuracy <- (yTestPred4$interval[,1]) 72 | cat('Prediction Interval accuracy: ',sum(accuracy)/length(accuracy),'\n') 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /R/dependencies.R: -------------------------------------------------------------------------------- 1 | # Source all files within libs 02_folder 2 | RootDir <- dirname(rstudioapi::getSourceEditorContext()$path) 3 | setwd(paste0(RootDir,'/02_libs')) 4 | 5 | files.sources = list.files() 6 | sapply(files.sources, source) 7 | 8 | setwd(RootDir) 9 | 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepGLM 2 | Version 0.0.0.9000
3 | 4 | ## Introduction 5 | DeepGLM is a flexible model that uses a Deep Feedforward Neural Network as the basis function for a Generalized Linear Model; that is, the usual linear predictor of a GLM is replaced by the output of a deep neural network. DeepGLM is designed to work with cross-sectional datasets such as real estate data, census data, etc.
6 | 7 | For more information about DeepGLM, please read the paper: Minh-Ngoc Tran, Nghia Nguyen, David J. Nott and Robert Kohn (2018) Bayesian Deep Net GLM and GLMM, https://arxiv.org/abs/1805.10157 8 | 9 | ## Authors 10 | Nghia Nguyen (nghia.nguyen@sydney.edu.au)
11 | Minh-Ngoc Tran (minh-ngoc.tran@sydney.edu.au) 12 | 13 | ## Usage 14 | Users can choose either the Matlab, R or Python version to train and make predictions with deepGLM. 15 | ### MATLAB Version 16 | To use the toolbox, add the folder called "deepGLM" (with subfolders) to the MATLAB path. 17 | 18 | The toolbox contains the following folders: 19 | 20 | - Data: some datasets used in the examples. 21 | - Examples: examples of all the functions included in the toolbox. 22 | - Documents: documentation for the functions in the deepGLM toolbox. 23 | - deepGLM: all the functions of the toolbox are here. This is the folder you must add to the MATLAB path. 24 | 25 | ### R Version 26 | Install the *deepglm* package for R: 27 | - Clone the repository or directly download the zip file **deepglm_0.0.0.9000.zip** from the *deepGLM/R/* subdirectory on GitHub. 28 | - In RStudio, run the command:
29 | **install.packages("D:\\deepglm_0.0.0.9000.zip", repos = NULL, type="source")**
30 | where *D:\deepglm_0.0.0.9000.zip* is the location of the downloaded package file on my local machine 31 | - To use the package, run the command:
32 | **library(deepglm)** 33 | 34 | *deepglm* provides two functions: one to train a deepGLM model on training data (*deepGLMfit*) and one to make predictions with a trained deepGLM model on unseen data (*deepGLMpredict*). In RStudio, use the commands **?deepGLMfit** and **?deepGLMpredict** to read the documentation for the two functions. 35 | 36 | Use the command **example(deepGLMfit)** to run the example showing how to run *deepGLMfit* and *deepGLMpredict* on simulated data. 37 | 38 | Users can run additional examples using the scripts in the *demos* folder in the installation directory. For example, the installation directory for the *deepglm* package on my Windows machine is: *D:\Program Files\R\R-3.4.3\R-3.4.3\library\deepglm* 39 | 40 | ### Python Version 41 | Download the file **deepGLM.pyc** to your project folder. 42 | 43 | ## How to cite 44 | Please cite the toolbox as: 45 | 46 | Tran, M.-N., Nguyen, N., Kohn, R., and Nott, D. (2019) Bayesian Deep Net GLM and GLMM. Journal of Computational and Graphical Statistics, 29(1):97-113 47 | --------------------------------------------------------------------------------