├── init.m ├── .gitignore ├── demo ├── ch08 │ ├── letterX.mat │ ├── nbGauss_demo.m │ ├── nbBern_demo.m │ └── mrf_demo.m ├── ch09 │ ├── linRegEm_demo.m │ ├── mixBernEm_demo.m │ ├── kmedoids_demo.m │ ├── kmeans_mixGaussEm_demo.m │ ├── mixGaussEm_demo.m │ └── kmeans_demo.m ├── ch14 │ ├── adaboostBin_demo.m │ ├── mixLogitBin_demo.m │ └── mixLinReg_demo.m ├── ch04 │ ├── logitMn_demo.m │ └── logitBin_demo.m ├── ch12 │ ├── fa_demo.m │ ├── ppcaVb_demo.m │ ├── ppcaEm_demo.m │ └── pca_demo.m ├── ch06 │ ├── knCenter_demo.m │ ├── knReg_demo.m │ ├── knKmeans_demo.m │ └── knLin_demo.m ├── ch07 │ ├── rvmBinEm_demo.m │ ├── rvmBinFp_demo.m │ ├── rvmRegEm_demo.m │ ├── rvmRegSeq_demo.m │ ├── rvmRegFp_demo.m │ ├── rvmRegEm_spSignal_demo.m │ ├── rvmRegSeq_spSignal_demo.m │ └── rvmRegFp_spSignal_demo.m ├── ch03 │ ├── linReg_demo.m │ ├── linRegEm_demo.m │ └── linRegFp_demo.m ├── ch11 │ ├── mixGaussGb_demo.m │ └── gauss_demo.m ├── ch13 │ ├── hmm_demo.m │ └── lds_demo.m ├── ch10 │ ├── rvmRegVb_demo.m │ ├── mixGaussVb_demo.m │ └── rvmRegVb_spSignal_demo.m ├── ch05 │ └── mlp_demo.m └── ch01 │ └── info_demo.m ├── chapter04 ├── sigmoid.m ├── softmax.m ├── logitBinPred.m ├── logitMnPred.m ├── binPlot.m ├── fda.m ├── logitBin.m └── logitMn.m ├── common ├── randp.m ├── maxdiff.m ├── ud.m ├── logdet.m ├── log1mexp.m ├── standardize.m ├── log1pexp.m ├── solvpd.m ├── isequalf.m ├── sqdist.m ├── invpd.m ├── lognormexp.m ├── normalize.m ├── unitize.m ├── logsumexp.m ├── gson.m ├── symeig.m ├── plotCurveBar.m ├── besseliLn.m ├── sub.m ├── mgson.m ├── slice.m ├── lattice.m ├── plotgm.m ├── loggmpdf.m ├── plotkde.m ├── ld.m └── plotClass.m ├── chapter06 ├── kn2sd.m ├── knLin.m ├── sd2kn.m ├── knPca.m ├── knPcaPred.m ├── knGauss.m ├── knPoly.m ├── knKmeansPred.m ├── knReg.m ├── knRegPred.m ├── knKmeans.m └── knCenter.m ├── chapter02 ├── logMn.m ├── logKde.m ├── logVmf.m ├── logWishart.m ├── logDirichlet.m ├── logMvGamma.m ├── logGauss.m └── logSt.m ├── chapter01 ├── entropy.m ├── jointEntropy.m ├── relatEntropy.m ├── condEntropy.m ├── mutInfo.m ├── nvi.m └── nmi.m ├── chapter11 ├── dirichletRnd.m ├── discreteRnd.m ├── gaussRnd.m ├── mixGaussSample.m ├── mixGaussGb.m ├── mixDpGbOl.m ├── mixDpGb.m ├── Gauss.m └── GaussWishart.m ├── chapter08 ├── MRF │ ├── mrfGibbs.m │ ├── mrfBethe.m │ ├── mrfIsGa.m │ ├── mrfMf.m │ └── mrfBp.m └── NaiveBayes │ ├── nbBernPred.m │ ├── nbBern.m │ ├── nbGauss.m │ └── nbGaussPred.m ├── chapter09 ├── kseeds.m ├── kmeansPred.m ├── mixBernRnd.m ├── kmeansRnd.m ├── kmedoids.m ├── mixGaussPred.m ├── kmeans.m ├── mixGaussRnd.m ├── mixBernEm.m ├── linRegEm.m ├── rvmRegEm.m ├── rvmBinEm.m └── mixGaussEm.m ├── chapter05 ├── mlpRegPred.m ├── mlpClassPred.m ├── mlpReg.m └── mlpClass.m ├── chapter07 ├── rvmBinPred.m ├── rvmRegPred.m ├── rvmRegFp.m ├── rvmBinFp.m └── rvmRegSeq.m ├── chapter14 ├── mixLogitBinPred.m ├── mixLinRnd.m ├── adaboostBinPred.m ├── mixLinPred.m ├── adaboostBin.m ├── mixLogitBin.m └── mixLinReg.m ├── chapter12 ├── ppcaRnd.m ├── pca.m ├── pcaEmC.m ├── pcaEm.m ├── fa.m ├── ppcaEm.m └── ppcaVb.m ├── chapter03 ├── linRnd.m ├── linRegPred.m ├── linReg.m └── linRegFp.m ├── chapter13 ├── HMM │ ├── hmmRnd.m │ ├── hmmViterbi.m │ ├── hmmFilter.m │ ├── hmmSmoother.m │ └── hmmEm.m └── LDS │ ├── ldsPca.m │ ├── ldsRnd.m │ ├── kalmanFilter.m │ ├── ldsEm.m │ └── kalmanSmoother.m ├── LICENSE ├── chapter10 ├── mixGaussVbPred.m ├── rvmRegVb.m ├── linRegVb.m ├── mixGaussEvidence.m └── mixGaussVb.m ├── README.md └── Contents.m /init.m: -------------------------------------------------------------------------------- 1 | addpath(genpath(pwd)); -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | reference/* 2 | *.m~ 3 | *.asv -------------------------------------------------------------------------------- /demo/ch08/letterX.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PRML/PRMLT/HEAD/demo/ch08/letterX.mat -------------------------------------------------------------------------------- /chapter04/sigmoid.m: -------------------------------------------------------------------------------- 1 | function y = sigmoid(x) 2 | % Sigmod function 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | y = exp(-log1pexp(-x)); -------------------------------------------------------------------------------- /common/randp.m: -------------------------------------------------------------------------------- 1 | function i = randp(p) 2 | % Sample a integer in [1:k] with given probability p 3 | i = find(rand 0 6 | y = -inf; 7 | else 8 | y = 2*sum(log(diag(U))); 9 | end -------------------------------------------------------------------------------- /demo/ch12/ppcaVb_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch12 2 | clear; close all; 3 | d = 3; 4 | m = 2; 5 | n = 1000; 6 | 7 | X = ppcaRnd(m,d,n); 8 | plotClass(X); 9 | %% Variational Bayesian probabilistic PCA 10 | [model, L] = ppcaVb(X); 11 | plot(L); 12 | -------------------------------------------------------------------------------- /demo/ch06/knCenter_demo.m: -------------------------------------------------------------------------------- 1 | %% demo for knCenter 2 | clear; close all; 3 | kn = @knGauss; 4 | X=rand(2,100); 5 | X1=rand(2,10); 6 | X2=rand(2,5); 7 | 8 | maxdiff(knCenter(kn,X,X1),diag(knCenter(kn,X,X1,X1))') 9 | maxdiff(knCenter(kn,X),knCenter(kn,X,X,X)) -------------------------------------------------------------------------------- /demo/ch12/ppcaEm_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch12 2 | 3 | clear; close all; 4 | d = 3; 5 | m = 2; 6 | n = 1000; 7 | 8 | X = ppcaRnd(m,d,n); 9 | plotClass(X); 10 | 11 | %% EM probabilistic PCA 12 | [W,mu,beta,llh] = ppcaEm(X,m); 13 | plot(llh) 14 | -------------------------------------------------------------------------------- /common/log1mexp.m: -------------------------------------------------------------------------------- 1 | function y = log1mexp(x) 2 | % Accurately compute y = log(1-exp(x)) 3 | % reference: Accurately Computing log(1-exp(-|a|)) Martin Machler 4 | y = x; 5 | i = x < -log(2); 6 | y(i) = log1p(-exp(x(i))); 7 | y(~i) = log(-expm1(x(~i))); 8 | -------------------------------------------------------------------------------- /demo/ch09/kmedoids_demo.m: -------------------------------------------------------------------------------- 1 | close all; clear; 2 | d = 2; 3 | k = 3; 4 | n = 5000; 5 | [X,label] = kmeansRnd(d,k,n); 6 | init = ceil(k*rand(1,n)); 7 | [y, idx, v] = kmedoids(X,init); 8 | plotClass(X,label); 9 | figure; 10 | plotClass(X,y); 11 | 12 | -------------------------------------------------------------------------------- /common/standardize.m: -------------------------------------------------------------------------------- 1 | function [Y, s] = standardize(X) 2 | % Unitize the vectors to be unit length 3 | % By default dim = 1 (columns). 4 | % Written by Mo Chen (sth4nth@gmail.com). 5 | X = bsxfun(@minux,X,mean(X,2)); 6 | s = sqrt(mean(sum(X.^2,1))); 7 | Y = X/s; -------------------------------------------------------------------------------- /common/log1pexp.m: -------------------------------------------------------------------------------- 1 | function y = log1pexp(x) 2 | % Accurately compute y = log(1+exp(x)) 3 | % reference: Accurately Computing log(1-exp(-|a|)) Martin Machler 4 | y = x; 5 | i = x > 18; 6 | j = i & (x <= 33.3); 7 | y(~i) = log1p(exp(x(~i))); 8 | y(j) = x(j)+exp(-x(j)); 9 | -------------------------------------------------------------------------------- /demo/ch07/rvmBinEm_demo.m: -------------------------------------------------------------------------------- 1 | %% RVM for classification 2 | clear; close all 3 | k = 2; 4 | d = 2; 5 | n = 1000; 6 | [X,t] = kmeansRnd(d,k,n); 7 | 8 | [model, llh] = rvmBinEm(X,t-1); 9 | plot(llh); 10 | y = rvmBinPred(model,X)+1; 11 | figure; 12 | plotClass(X,y); 13 | -------------------------------------------------------------------------------- /demo/ch07/rvmBinFp_demo.m: -------------------------------------------------------------------------------- 1 | %% RVM for classification 2 | clear; close all 3 | k = 2; 4 | d = 2; 5 | n = 1000; 6 | [X,t] = kmeansRnd(d,k,n); 7 | 8 | [model, llh] = rvmBinFp(X,t-1); 9 | plot(llh); 10 | y = rvmBinPred(model,X)+1; 11 | figure; 12 | plotClass(X,y); 13 | -------------------------------------------------------------------------------- /demo/ch08/nbGauss_demo.m: -------------------------------------------------------------------------------- 1 | d = 2; 2 | k = 3; 3 | n = 1000; 4 | [X, t] = kmeansRnd(d,k,n); 5 | plotClass(X,t); 6 | 7 | m = floor(n/2); 8 | X1 = X(:,1:m); 9 | X2 = X(:,(m+1):end); 10 | t1 = t(1:m); 11 | model = nbGauss(X1,t1); 12 | y2 = nbGaussPred(model,X2); 13 | plotClass(X2,y2); -------------------------------------------------------------------------------- /demo/ch03/linReg_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch03 2 | clear; close all; 3 | d = 1; 4 | n = 200; 5 | [x,t] = linRnd(d,n); 6 | %% Linear regression 7 | model = linReg(x,t); 8 | [y,sigma] = linRegPred(model,x,t); 9 | plotCurveBar( x, y, sigma ); 10 | hold on; 11 | plot(x,t,'o'); 12 | hold off; -------------------------------------------------------------------------------- /common/solvpd.m: -------------------------------------------------------------------------------- 1 | function V = solvpd(A,B) 2 | % Compute A\B where A is a positive definite matrix 3 | % A: a positive difinie matrix 4 | % Written by Mo Chen (sth4nth@gmail.com). 5 | [U,p] = chol(A); 6 | if p > 0 7 | error('ERROR: the matrix is not positive definite.'); 8 | end 9 | V = U\(U'\B); -------------------------------------------------------------------------------- /common/isequalf.m: -------------------------------------------------------------------------------- 1 | function z = isequalf(x, y, tol) 2 | % Determine whether two float number x and y are equal up to precision tol 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | if nargin < 3 5 | tol = 1e-8; 6 | end 7 | assert(all(size(x)==size(y))); 8 | z = max(abs(x(:)-y(:))) 0 8 | error('ERROR: the matrix is not positive definite.'); 9 | end 10 | V = inv(U); 11 | W = V*V'; -------------------------------------------------------------------------------- /demo/ch03/linRegEm_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch03 2 | clear; close all; 3 | d = 1; 4 | n = 200; 5 | [x,t] = linRnd(d,n); 6 | %% Empirical Bayesian linear regression via EM 7 | [model,llh] = linRegEm(x,t); 8 | plot(llh); 9 | [y,sigma] = linRegPred(model,x,t); 10 | figure 11 | plotCurveBar(x,y,sigma); 12 | hold on; 13 | plot(x,t,'o'); 14 | hold off; -------------------------------------------------------------------------------- /demo/ch14/mixLinReg_demo.m: -------------------------------------------------------------------------------- 1 | %% Mixture of linear regression 2 | close all; clear 3 | d = 1; 4 | k = 2; 5 | n = 500; 6 | [X,y] = mixLinRnd(d,k,n); 7 | plot(X,y,'.'); 8 | [label,model,llh] = mixLinReg(X, y, k); 9 | plotClass([X;y],label); 10 | figure 11 | plot(llh); 12 | [y_,z,p] = mixLinPred(model,X,y); 13 | figure; 14 | plotClass([X;y],label); -------------------------------------------------------------------------------- /demo/ch06/knReg_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch06 2 | 3 | 4 | %% Kernel regression with gaussian kernel 5 | clear; close all; 6 | n = 100; 7 | x = linspace(0,2*pi,n); % test data 8 | t = sin(x)+rand(1,n)/2; 9 | model = knReg(x,t,1e-4,@knGauss); 10 | [y,s] = knRegPred(model,x); 11 | plotCurveBar(x,y,s); 12 | hold on; 13 | plot(x,t,'o'); 14 | hold off; -------------------------------------------------------------------------------- /chapter02/logMn.m: -------------------------------------------------------------------------------- 1 | function z = logMn(x, p) 2 | % Compute log pdf of a multinomial distribution. 3 | % Input: 4 | % x: d x 1 integer vector 5 | % p: d x 1 probability 6 | % Output: 7 | % z: probability density in logrithm scale z=log p(x) 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | z = gammaln(sum(x)+1)-sum(gammaln(x+1))+dot(x,log(p)); 10 | -------------------------------------------------------------------------------- /demo/ch08/nbBern_demo.m: -------------------------------------------------------------------------------- 1 | %% Naive Bayes with independent Bernoulli 2 | close all; clear; 3 | d = 10; 4 | k = 2; 5 | n = 2000; 6 | [X,t,mu] = mixBernRnd(d,k,n); 7 | m = floor(n/2); 8 | X1 = X(:,1:m); 9 | X2 = X(:,(m+1):end); 10 | t1 = t(1:m); 11 | t2 = t((m+1):end); 12 | model = nbBern(X1,t1); 13 | y2 = nbBernPred(model,X2); 14 | err = sum(t2~=y2)/numel(t2); -------------------------------------------------------------------------------- /chapter01/entropy.m: -------------------------------------------------------------------------------- 1 | function z = entropy(x) 2 | % Compute entropy z=H(x) of a discrete variable x. 3 | % Input: 4 | % x: a integer vectors 5 | % Output: 6 | % z: entropy z=H(x) 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | n = numel(x); 9 | [~,~,x] = unique(x); 10 | Px = accumarray(x, 1)/n; 11 | Hx = -dot(Px,log2(Px)); 12 | z = max(0,Hx); -------------------------------------------------------------------------------- /common/lognormexp.m: -------------------------------------------------------------------------------- 1 | function [Y,s] = lognormexp(X, dim) 2 | % Compute log(normalize(exp(x),dim)) while avoiding numerical underflow. 3 | % By default dim = 1 (columns). 4 | % Written by Mo Chen (sth4nth@gmail.com). 5 | if nargin == 1 6 | dim = find(size(X)~=1,1); 7 | if isempty(dim), dim = 1; end 8 | end 9 | s = logsumexp(X,dim); 10 | Y = X-s; 11 | -------------------------------------------------------------------------------- /demo/ch09/kmeans_mixGaussEm_demo.m: -------------------------------------------------------------------------------- 1 | 2 | %% Gauss mixture initialized by kmeans 3 | close all; clear; 4 | d = 2; 5 | k = 3; 6 | n = 500; 7 | [X,label] = mixGaussRnd(d,k,n); 8 | init = kmeans(X,k); 9 | [z,model,llh] = mixGaussEm(X,init); 10 | plotClass(X,label); 11 | figure; 12 | plotClass(X,init); 13 | figure; 14 | plotClass(X,z); 15 | figure; 16 | plot(llh); 17 | 18 | -------------------------------------------------------------------------------- /demo/ch11/mixGaussGb_demo.m: -------------------------------------------------------------------------------- 1 | %% Collapse Gibbs sampling for Dirichelt process gaussian mixture model 2 | close all; clear; 3 | d = 2; 4 | k = 3; 5 | n = 500; 6 | [X,z] = mixGaussRnd(d,k,n); 7 | plotClass(X,z); 8 | 9 | [z,Theta,w,llh] = mixGaussGb(X); 10 | figure 11 | plotClass(X,z); 12 | 13 | [X,z] = mixGaussSample(Theta,w,n); 14 | figure 15 | plotClass(X,z); 16 | 17 | -------------------------------------------------------------------------------- /chapter06/knLin.m: -------------------------------------------------------------------------------- 1 | function K = knLin(X, Y) 2 | % Linear kernel (inner product) 3 | % Input: 4 | % X: d x nx data matrix 5 | % Y: d x ny data matrix 6 | % Ouput: 7 | % K: nx x ny kernel matrix 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | if nargin < 2 || isempty(Y) 10 | K = dot(X,X,1); % norm in kernel space 11 | else 12 | K = X'*Y; 13 | end 14 | -------------------------------------------------------------------------------- /common/normalize.m: -------------------------------------------------------------------------------- 1 | function [Y, s] = normalize(X, dim) 2 | % Normalize the vectors to be summing to one 3 | % By default dim = 1 (columns). 4 | % Written by Michael Chen (sth4nth@gmail.com). 5 | if nargin == 1 6 | % Determine which dimension sum will use 7 | dim = find(size(X)~=1,1); 8 | if isempty(dim), dim = 1; end 9 | end 10 | s = sum(X,dim); 11 | Y = X./s; -------------------------------------------------------------------------------- /common/unitize.m: -------------------------------------------------------------------------------- 1 | function [Y, s] = unitize(X, dim) 2 | % Unitize the vectors to be unit length 3 | % By default dim = 1 (columns). 4 | % Written by Mo Chen (sth4nth@gmail.com). 5 | if nargin == 1 6 | % Determine which dimension sum will use 7 | dim = find(size(X)~=1,1); 8 | if isempty(dim), dim = 1; end 9 | end 10 | s = sqrt(dot(X,X,dim)); 11 | Y = bsxfun(@times,X,1./s); -------------------------------------------------------------------------------- /demo/ch03/linRegFp_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch03 2 | clear; close all; 3 | d = 1; 4 | n = 200; 5 | [x,t] = linRnd(d,n); 6 | %% Empirical Bayesian linear regression via Mackay fix point iteration method 7 | [model,llh] = linRegFp(x,t); 8 | plot(llh); 9 | [y,sigma] = linRegPred(model,x,t); 10 | figure 11 | plotCurveBar(x,y,sigma); 12 | hold on; 13 | plot(x,t,'o'); 14 | hold off; 15 | %% 16 | 17 | -------------------------------------------------------------------------------- /chapter11/dirichletRnd.m: -------------------------------------------------------------------------------- 1 | function x = dirichletRnd(a, m) 2 | % Generate samples from a Dirichlet distribution. 3 | % Input: 4 | % a: k dimensional vector 5 | % m: k dimensional mean vector 6 | % Outpet: 7 | % x: generated sample x~Dir(a,m) 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | if nargin == 2 10 | a = a*m; 11 | end 12 | x = gamrnd(a,1); 13 | x = x/sum(x); 14 | -------------------------------------------------------------------------------- /chapter08/MRF/mrfGibbs.m: -------------------------------------------------------------------------------- 1 | function lnZ = mrfGibbs(A, nodePot, edgePot, nodeBel) 2 | % Compute Gibbs energy 3 | [s,t,e] = find(triu(A)); 4 | edgeBel = zeros(size(edgePot)); 5 | for l = 1:numel(e) 6 | edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))'; 7 | end 8 | Ex = dot(nodeBel(:),nodePot(:)); 9 | Exy = dot(edgeBel(:),edgePot(:)); 10 | Hx = -dot(nodeBel(:),log(nodeBel(:))); 11 | lnZ = Ex+Exy+Hx; -------------------------------------------------------------------------------- /chapter09/kseeds.m: -------------------------------------------------------------------------------- 1 | function mu = kseeds(X, k) 2 | % Perform kmeans++ seeding 3 | % Input: 4 | % X: d x n data matrix 5 | % k: number of seeds 6 | % Output: 7 | % mu: d x k seeds 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | n = size(X,2); 10 | D = inf(1,n); 11 | mu = X(:,ceil(n*rand)); 12 | for i = 2:k 13 | D = min(D,sum((X-mu(:,i-1)).^2,1)); 14 | mu(:,i) = X(:,randp(D)); 15 | end 16 | -------------------------------------------------------------------------------- /demo/ch07/rvmRegEm_demo.m: -------------------------------------------------------------------------------- 1 | %% regression 2 | d = 100; 3 | beta = 1e-1; 4 | X = rand(1,d); 5 | w = randn; 6 | b = randn; 7 | t = w'*X+b+beta*randn(1,d); 8 | x = linspace(min(X),max(X),d); % test data 9 | 10 | %% RVM regression by EM 11 | [model,llh] = rvmRegEm(X,t); 12 | plot(llh); 13 | [y, sigma] = linRegPred(model,x,t); 14 | figure 15 | plotCurveBar(x,y,sigma); 16 | hold on; 17 | plot(X,t,'o'); 18 | hold off -------------------------------------------------------------------------------- /common/logsumexp.m: -------------------------------------------------------------------------------- 1 | function s = logsumexp(X, dim) 2 | % Compute log(sum(exp(X),dim)) while avoiding numerical underflow. 3 | % By default dim = 1 (columns). 4 | % Written by Mo Chen (sth4nth@gmail.com). 5 | if nargin == 1 6 | dim = find(size(X)~=1,1); 7 | if isempty(dim), dim = 1; end 8 | end 9 | a = max(X,[],dim); 10 | s = a+log(sum(exp(X-a),dim)); % TODO: use log1p 11 | i = isinf(a); 12 | s(i) = a(i); -------------------------------------------------------------------------------- /chapter06/sd2kn.m: -------------------------------------------------------------------------------- 1 | function K = sd2kn(D) 2 | % Transform a squared distance matrix to a kernel matrix. 3 | % The data are assumed to be centered, i.e., H=eye(n)-ones(n)/n; K=-(H*D*H)/2. 4 | % Input: 5 | % D: n x n squared distance matrix 6 | % Ouput: 7 | % K: n x n kernel matrix 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | D = bsxfun(@minus,D,mean(D,1)); 10 | D = bsxfun(@minus,D,mean(D,2)); 11 | K = (D+D')/(-4); -------------------------------------------------------------------------------- /demo/ch07/rvmRegSeq_demo.m: -------------------------------------------------------------------------------- 1 | %% regression 2 | d = 100; 3 | beta = 1e-1; 4 | X = rand(1,d); 5 | w = randn; 6 | b = randn; 7 | t = w'*X+b+beta*randn(1,d); 8 | x = linspace(min(X),max(X),d); % test data 9 | %% RVM regression by sequential update 10 | [model,llh] = rvmRegSeq(X,t); 11 | plot(llh); 12 | [y, sigma] = linRegPred(model,x,t); 13 | figure 14 | plotCurveBar(x,y,sigma); 15 | hold on; 16 | plot(X,t,'o'); 17 | hold off -------------------------------------------------------------------------------- /chapter08/NaiveBayes/nbBernPred.m: -------------------------------------------------------------------------------- 1 | function y = nbBernPred(model, X) 2 | % Prediction of naive Bayes classifier with independent Bernoulli. 3 | % input: 4 | % model: trained model structure 5 | % X: d x n data matrix 6 | % output: 7 | % y: 1 x n predicted class label 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | mu = model.mu; 10 | w = model.w; 11 | [~,y] = max(log(mu)'*X+log(1-mu)'*(1-X)+log(w(:)),[],1); 12 | 13 | -------------------------------------------------------------------------------- /chapter09/kmeansPred.m: -------------------------------------------------------------------------------- 1 | function [label, energy] = kmeansPred(mu, X) 2 | % Prediction for kmeans clusterng 3 | % Input: 4 | % model: dx k cluster center matrix 5 | % X: d x n testing data 6 | % Output: 7 | % label: 1 x n cluster label 8 | % energy: optimization target value 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | [val,label] = min(dot(X,X,1)+dot(mu,mu,1)'-2*mu'*X,[],1); % assign labels 11 | energy = sum(val); -------------------------------------------------------------------------------- /chapter11/discreteRnd.m: -------------------------------------------------------------------------------- 1 | function x = discreteRnd(p, n) 2 | % Generate samples from a discrete distribution (multinomial). 3 | % Input: 4 | % p: k dimensional probability vector 5 | % n: number of samples 6 | % Ouput: 7 | % x: k x n generated samples x~Mul(p) 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | if nargin == 1 10 | n = 1; 11 | end 12 | [~,~,x] = histcounts(rand(1,n),[0;cumsum(p(:))]); 13 | -------------------------------------------------------------------------------- /common/gson.m: -------------------------------------------------------------------------------- 1 | function [Q, R] = gson(X) 2 | % Gram-Schmidt orthonormalization which produces the same result as [Q,R]=qr(X,0) 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | [d,n] = size(X); 5 | m = min(d,n); 6 | R = zeros(m,n); 7 | Q = zeros(d,0); 8 | for i = 1:m 9 | R(1:i-1,i) = Q'*X(:,i); 10 | v = X(:,i)-Q*R(1:i-1,i); 11 | R(i,i) = norm(v); 12 | Q(:,i) = v/R(i,i); 13 | end 14 | R(:,m+1:n) = Q'*X(:,m+1:n); -------------------------------------------------------------------------------- /demo/ch13/hmm_demo.m: -------------------------------------------------------------------------------- 1 | % demos for HMM in ch13 2 | d = 3; k = 2; n = 10000; 3 | [x,model] = hmmRnd(d,k,n); 4 | %% Viterbi algorithm 5 | [z, llh] = hmmViterbi(model, x); 6 | %% HMM filter (forward algorithm) 7 | [alpha, llh] = hmmFilter(model, x); 8 | %% HMM smoother (forward backward) 9 | [gamma,alpha,beta,c] = hmmSmoother(model, x); 10 | %% Baum-Welch algorithm 11 | [model, llh] = hmmEm(x,k); 12 | plot(llh) 13 | -------------------------------------------------------------------------------- /demo/ch10/rvmRegVb_demo.m: -------------------------------------------------------------------------------- 1 | clear; close all; 2 | 3 | d = 100; 4 | beta = 1e-1; 5 | X = rand(1,d); 6 | w = randn; 7 | b = randn; 8 | t = w'*X+b+beta*randn(1,d); 9 | x = linspace(min(X),max(X),d); % test data 10 | 11 | [model,llh] = linRegVb(X,t); 12 | % [model,llh] = rvmRegVb(X,t); 13 | plot(llh); 14 | [y, sigma] = linRegPred(model,x,t); 15 | figure 16 | plotCurveBar(x,y,sigma); 17 | hold on; 18 | plot(X,t,'o'); 19 | hold off -------------------------------------------------------------------------------- /chapter04/logitBinPred.m: -------------------------------------------------------------------------------- 1 | function [y, p] = logitBinPred(model, X) 2 | % Prediction of binary logistic regression model 3 | % Input: 4 | % model: trained model structure 5 | % X: d x n testing data 6 | % Output: 7 | % y: 1 x n predict label (0/1) 8 | % p: 1 x n predict probability [0,1] 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | X = [X;ones(1,size(X,2))]; 11 | w = model.w; 12 | p = sigmoid(w'*X); 13 | y = round(p); 14 | 15 | -------------------------------------------------------------------------------- /demo/ch07/rvmRegFp_demo.m: -------------------------------------------------------------------------------- 1 | %% regression 2 | d = 100; 3 | beta = 1e-1; 4 | X = rand(1,d); 5 | w = randn; 6 | b = randn; 7 | t = w'*X+b+beta*randn(1,d); 8 | x = linspace(min(X),max(X),d); % test data 9 | 10 | 11 | %% RVM regression by Mackay fix point update 12 | [model,llh] = rvmRegFp(X,t); 13 | plot(llh); 14 | [y, sigma] = linRegPred(model,x,t); 15 | figure 16 | plotCurveBar(x,y,sigma); 17 | hold on; 18 | plot(X,t,'o'); 19 | hold off -------------------------------------------------------------------------------- /demo/ch09/mixGaussEm_demo.m: -------------------------------------------------------------------------------- 1 | %% Gausssian Mixture via EM 2 | close all; clear; 3 | d = 2; 4 | k = 3; 5 | n = 1000; 6 | [X,label] = mixGaussRnd(d,k,n); 7 | plotClass(X,label); 8 | 9 | m = floor(n/2); 10 | X1 = X(:,1:m); 11 | X2 = X(:,(m+1):end); 12 | % train 13 | [z1,model,llh] = mixGaussEm(X1,k); 14 | figure; 15 | plot(llh); 16 | figure; 17 | plotClass(X1,z1); 18 | % predict 19 | z2 = mixGaussPred(model,X2); 20 | figure; 21 | plotClass(X2,z2); -------------------------------------------------------------------------------- /chapter02/logKde.m: -------------------------------------------------------------------------------- 1 | function z = logKde (X, Y, sigma) 2 | % Compute log pdf of kernel density estimator. 3 | % Input: 4 | % X: d x n data matrix to be evaluate 5 | % Y: d x k data matrix served as database 6 | % Output: 7 | % z: probability density in logrithm scale z=log p(x|y) 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | D = dot(X,X,1)+dot(Y,Y,1)'-2*(Y'*X); 10 | z = logsumexp(D/(-2*sigma^2),1)-0.5*log(2*pi)-log(sigma*size(Y,2),1); 11 | -------------------------------------------------------------------------------- /chapter08/NaiveBayes/nbBern.m: -------------------------------------------------------------------------------- 1 | function model = nbBern(X, t) 2 | % Naive bayes classifier with indepenet Bernoulli. 3 | % Input: 4 | % X: d x n data matrix 5 | % t: 1 x n label (1~k) 6 | % Output: 7 | % model: trained model structure 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | n = size(X,2); 10 | E = sparse(1:n,t,1); 11 | nk = sum(E,1); 12 | w = full(nk)/n; 13 | mu = X*(E./nk); 14 | 15 | model.mu = mu; % d x k means 16 | model.w = w; -------------------------------------------------------------------------------- /chapter05/mlpRegPred.m: -------------------------------------------------------------------------------- 1 | function Y = mlpRegPred(model, X) 2 | % Multilayer perceptron regression prediction 3 | % tanh activation function is used. 4 | % Input: 5 | % model: model structure 6 | % X: d x n data matrix 7 | % Ouput: 8 | % Y: p x n response matrix 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | W = model.W; 11 | b = model.b; 12 | T = length(W); 13 | Y = X; 14 | for t = 1:T-1 15 | Y = tanh(W{t}'*Y+b{t}); 16 | end 17 | Y = W{T}'*Y+b{T}; -------------------------------------------------------------------------------- /chapter04/logitMnPred.m: -------------------------------------------------------------------------------- 1 | function [y, P] = logitMnPred(model, X) 2 | % Prediction of multiclass (multinomial) logistic regression model 3 | % Input: 4 | % model: trained model structure 5 | % X: d x n testing data 6 | % Output: 7 | % y: 1 x n predict label (1~k) 8 | % P: k x n predict probability for each class 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | W = model.W; 11 | X = [X; ones(1,size(X,2))]; 12 | P = softmax(W'*X); 13 | [~, y] = max(P,[],1); -------------------------------------------------------------------------------- /common/symeig.m: -------------------------------------------------------------------------------- 1 | function [V,A,flag] = symeig(S,d,m) 2 | % Compute eigenvalues and eigenvectors of symmetric matrix 3 | % m == 's' smallest (default) 4 | % m == 'l' largest 5 | % Written by Mo Chen (sth4nth@gmail.com). 6 | if nargin == 2 7 | m = 's'; 8 | end 9 | opt.disp = 0; 10 | opt.issym = 1; 11 | opt.isreal = 1; 12 | if any(m == 'ls') 13 | [V,A,flag] = eigs(S,d,[m,'a'],opt); 14 | else 15 | error('The third parameter must be l or s.'); 16 | end 17 | -------------------------------------------------------------------------------- /common/plotCurveBar.m: -------------------------------------------------------------------------------- 1 | function plotCurveBar( x, y, sigma ) 2 | % Plot 1d curve and variance 3 | % Input: 4 | % x: 1 x n 5 | % y: 1 x n 6 | % sigma: 1 x n or scaler 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | color = [255,228,225]/255; %pink 9 | [x,idx] = sort(x); 10 | y = y(idx); 11 | sigma = sigma(idx); 12 | 13 | fill([x,fliplr(x)],[y+sigma,fliplr(y-sigma)],color); 14 | hold on; 15 | plot(x,y,'r-'); 16 | hold off 17 | axis([x(1),x(end),-inf,inf]) 18 | 19 | -------------------------------------------------------------------------------- /chapter08/MRF/mrfBethe.m: -------------------------------------------------------------------------------- 1 | function lnZ = mrfBethe(A, nodePot, edgePot, nodeBel, edgeBel) 2 | % Compute Bethe energy 3 | [s,t,e] = find(triu(A)); 4 | edgeCor = zeros(size(edgePot)); 5 | for l = 1:numel(e) 6 | edgeCor(:,:,e(l)) = edgeBel(:,:,e(l))./(nodeBel(:,s(l))*nodeBel(:,t(l))'); 7 | end 8 | Ex = dot(nodeBel(:),nodePot(:)); 9 | Exy = dot(edgeBel(:),edgePot(:)); 10 | Hx = -dot(nodeBel(:),log(nodeBel(:))); 11 | Ixy = dot(edgeBel(:),log(edgeCor(:))); 12 | lnZ = Ex+Exy+Hx-Ixy; -------------------------------------------------------------------------------- /chapter02/logVmf.m: -------------------------------------------------------------------------------- 1 | function y = logVmf(X, mu, kappa) 2 | % Compute log pdf of a von Mises-Fisher distribution. 3 | % Input: 4 | % X: d x n data matrix 5 | % mu: d x k mean 6 | % kappa: 1 x k variance 7 | % Output: 8 | % y: k x n probability density in logrithm scale y=log p(x) 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | d = size(X,1); 11 | c = (d/2-1)*log(kappa)-(d/2)*log(2*pi)-logbesseli(d/2-1,kappa); 12 | q = bsxfun(@times,mu,kappa)'*X; 13 | y = bsxfun(@plus,q,c'); 14 | -------------------------------------------------------------------------------- /chapter02/logWishart.m: -------------------------------------------------------------------------------- 1 | function y = logWishart(Sigma, W, v) 2 | % Compute log pdf of a Wishart distribution. 3 | % Input: 4 | % Sigma: d x d covariance matrix 5 | % W: d x d covariance parameter 6 | % v: degree of freedom 7 | % Output: 8 | % y: probability density in logrithm scale y=log p(Sigma) 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | d = length(Sigma); 11 | B = -0.5*v*logdet(W)-0.5*v*d*log(2)-logmvgamma(0.5*v,d); 12 | y = B+0.5*(v-d-1)*logdet(Sigma)-0.5*trace(W\Sigma); -------------------------------------------------------------------------------- /chapter07/rvmBinPred.m: -------------------------------------------------------------------------------- 1 | function [y, p] = rvmBinPred(model, X) 2 | % Prodict the label for binary logistic regression model 3 | % Input: 4 | % model: trained model structure 5 | % X: d x n testing data 6 | % Output: 7 | % y: 1 x n predict label (0/1) 8 | % p: 1 x n predict probability [0,1] 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | index = model.index; 11 | X = [X;ones(1,size(X,2))]; 12 | X = X(index,:); 13 | w = model.w; 14 | p = sigmoid(w'*X); 15 | y = round(p); 16 | -------------------------------------------------------------------------------- /chapter14/mixLogitBinPred.m: -------------------------------------------------------------------------------- 1 | function t = mixLogitBinPred(model, X) 2 | % Prediction function for mixture of logistic regression 3 | % input: 4 | % model: trained model structure 5 | % X: d x n data matrix 6 | % output: 7 | % t: 1 x n cluster label 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | alpha = model.alpha; % mixing coefficient 10 | W = model.W ; % logistic model coefficentalpha 11 | n = size(X,2); 12 | X = [X; ones(1,n)]; 13 | t = round(alpha*sigmoid(W'*X)); 14 | 15 | -------------------------------------------------------------------------------- /common/besseliLn.m: -------------------------------------------------------------------------------- 1 | function y = besseliLn(nu,x) 2 | % Compute logarithm of besseli function (modified Bessel function of first kind). 3 | % Written by Mo Chen (mochen80@gmail.com). 4 | % TODO: improve precision using the method in 5 | % Clustering on the Unit Hypersphere using von Mises-Fisher Distributions. A. Banerjee, I. S. Dhillon, J. Ghosh, and S. Sra 6 | [v,ierr] = besseli(nu,x); 7 | if any(ierr ~= 0) || any(v == Inf) 8 | error('ERROR: logbesseli'); 9 | end 10 | y = log(v); 11 | -------------------------------------------------------------------------------- /chapter06/knPca.m: -------------------------------------------------------------------------------- 1 | function model = knPca(X, q, kn) 2 | % Kernel PCA 3 | % Input: 4 | % X: d x n data matrix 5 | % q: target dimension 6 | % kn: kernel function 7 | % Ouput: 8 | % model: trained model structure 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | if nargin < 3 11 | kn = @knGauss; 12 | end 13 | K = knCenter(kn,X); 14 | [V,L] = eig(K); 15 | [L,idx] = sort(diag(L),'descend'); 16 | V = V(:,idx(1:q)); 17 | L = L(1:q); 18 | 19 | model.kn = kn; 20 | model.V = V; 21 | model.L = L; 22 | model.X = X; -------------------------------------------------------------------------------- /common/sub.m: -------------------------------------------------------------------------------- 1 | function B = sub(A, varargin) 2 | % sub(A,i,j,k) = A(i;j;k) 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | assert(ndims(A)==numel(varargin)); 5 | sz = cellfun(@numel,varargin); 6 | IDX = cell(1,ndims(A)); 7 | for i = 1:ndims(A) 8 | idx = varargin{i}; 9 | shape = ones(1,ndims(A)); 10 | shape(i) = sz(i); 11 | idx = reshape(idx,shape); 12 | shape = sz; 13 | shape(i) = 1; 14 | idx = repmat(idx,shape); 15 | IDX{i} = idx(:); 16 | end 17 | B = reshape(A(sub2ind(size(A),IDX{:})),sz); -------------------------------------------------------------------------------- /chapter12/ppcaRnd.m: -------------------------------------------------------------------------------- 1 | function [X, model] = ppcaRnd(m, d, n) 2 | % Generate data from probabilistic PCA model 3 | % Input: 4 | % m: dimension of latent space 5 | % d: dimension of data 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % model: model structure 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | beta = randg; 12 | Z = randn(m,n); 13 | W = randn(d,m); 14 | mu = randn(d,1); 15 | X = bsxfun(@times,W*Z,mu)+randn(d,n)/sqrt(beta); 16 | 17 | model.W = W; 18 | model.mu = mu; 19 | model.beta = beta; -------------------------------------------------------------------------------- /chapter06/knPcaPred.m: -------------------------------------------------------------------------------- 1 | function Y = knPcaPred(model, Xt, opt) 2 | % Prediction for kernel PCA 3 | % Input: 4 | % model: trained model structure 5 | % X: d x n testing data 6 | % t (optional): 1 x n testing response 7 | % Output: 8 | % Y: prejection result of Xt 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | kn = model.kn; 11 | V = model.V; 12 | L = model.L; 13 | X = model.X; 14 | Y = bsxfun(@times,V'*knCenter(kn,X,X,Xt),1./sqrt(L)); 15 | if nargin == 3 && opt.whiten 16 | Y = bsxfun(@times,Y,1./sqrt(L)); 17 | end 18 | 19 | -------------------------------------------------------------------------------- /chapter05/mlpClassPred.m: -------------------------------------------------------------------------------- 1 | function [y, P] = mlpClassPred(model, X) 2 | % Multilayer perceptron classification prediction 3 | % logistic activation function is used. 4 | % Input: 5 | % model: model structure 6 | % X: d x n data matrix 7 | % Ouput: 8 | % y: 1 x n label vector 9 | % P: k x n probability matrix 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | W = model.W; 12 | b = model.b; 13 | T = length(W); 14 | Z = X; 15 | for t = 1:T-1 16 | Z = sigmoid(W{t}'*Z+b{t}); 17 | end 18 | P = softmax(W{T}'*Z+b{T}); 19 | [~,y] = max(P,[],1); -------------------------------------------------------------------------------- /common/mgson.m: -------------------------------------------------------------------------------- 1 | function [Q, R] = mgson(X) 2 | % Modified Gram-Schmidt orthonormalization (numerical stable version of Gram-Schmidt algorithm) 3 | % which produces the same result as [Q,R]=qr(X,0) 4 | % Written by Mo Chen (sth4nth@gmail.com). 5 | [d,n] = size(X); 6 | m = min(d,n); 7 | R = zeros(m,n); 8 | Q = zeros(d,m); 9 | for i = 1:m 10 | v = X(:,i); 11 | for j = 1:i-1 12 | R(j,i) = Q(:,j)'*v; 13 | v = v-R(j,i)*Q(:,j); 14 | end 15 | R(i,i) = norm(v); 16 | Q(:,i) = v/R(i,i); 17 | end 18 | R(:,m+1:n) = Q'*X(:,m+1:n); -------------------------------------------------------------------------------- /demo/ch06/knKmeans_demo.m: -------------------------------------------------------------------------------- 1 | %% Kernel kmeans with linear kernel is equivalent to kmeans 2 | close all; clear; 3 | d = 2; 4 | k = 3; 5 | n = 200; 6 | [X, y] = kmeansRnd(d,k,n); 7 | init = ceil(k*rand(1,n)); 8 | 9 | label = knKmeans(X,init,@knLin); 10 | 11 | label0 = kmeans(X,init); 12 | maxdiff(label,label0) 13 | plotClass(X,label); 14 | %% Kernel kmeans with Gaussian Kernel for nonlinear data 15 | x1 = linspace(0,pi,n/2); 16 | x2 = sin(x1); 17 | X = [x1,x1+pi/2; 18 | x2,-x2]; 19 | 20 | label = knKmeans(X,2,@knGauss); 21 | figure; 22 | plotClass(X,label); -------------------------------------------------------------------------------- /chapter06/knGauss.m: -------------------------------------------------------------------------------- 1 | function K = knGauss(X, Y, s) 2 | % Gaussian (RBF) kernel K = exp(-|x-y|/(2s)); 3 | % Input: 4 | % X: d x nx data matrix 5 | % Y: d x ny data matrix 6 | % s: sigma of gaussian 7 | % Ouput: 8 | % K: nx x ny kernel matrix 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | if nargin < 3 11 | s = 0.4; 12 | end 13 | 14 | if nargin < 2 || isempty(Y) 15 | K = ones(1,size(X,2)); % norm in kernel space 16 | else 17 | D = bsxfun(@plus,dot(X,X,1)',dot(Y,Y,1))-2*(X'*Y); 18 | K = exp(D/(-2*s^2)); 19 | end 20 | 21 | -------------------------------------------------------------------------------- /chapter06/knPoly.m: -------------------------------------------------------------------------------- 1 | function K = knPoly(X, Y, o, c) 2 | % Polynomial kernel k(x,y)=(x'y+c)^o 3 | % Input: 4 | % X: d x nx data matrix 5 | % Y: d x ny data matrix 6 | % o: order of polynomial 7 | % c: constant 8 | % Ouput: 9 | % K: nx x ny kernel matrix 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | if nargin < 4 12 | c = 0; 13 | end 14 | 15 | if nargin < 3 16 | o = 3; 17 | end 18 | 19 | if nargin < 2 || isempty(Y) 20 | K = (dot(X,X,1)+c).^o; % norm in kernel space 21 | else 22 | K = (X'*Y+c).^o; 23 | end 24 | 25 | -------------------------------------------------------------------------------- /chapter11/gaussRnd.m: -------------------------------------------------------------------------------- 1 | function x = gaussRnd(mu, Sigma, n) 2 | % Generate samples from a Gaussian distribution. 3 | % Input: 4 | % mu: d x 1 mean vector 5 | % Sigma: d x d covariance matrix 6 | % n: number of samples 7 | % Outpet: 8 | % x: d x n generated sample x~Gauss(mu,Sigma) 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | if nargin == 2 11 | n = 1; 12 | end 13 | [V,err] = chol(Sigma); 14 | if err ~= 0 15 | error('ERROR: sigma must be a symmetric positive definite matrix.'); 16 | end 17 | x = V'*randn(size(V,1),n)+repmat(mu,1,n); -------------------------------------------------------------------------------- /chapter14/mixLinRnd.m: -------------------------------------------------------------------------------- 1 | function [X, y, W ] = mixLinRnd(d, k, n) 2 | % Generate data from mixture of linear model 3 | % Input: 4 | % d: dimension of data 5 | % k: number of components 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % y: 1 x n response variable 10 | % W: d+1 x k weight matrix 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | W = randn(d+1,k); 13 | [X, z] = kmeansRnd(d, k, n); 14 | y = zeros(1,n); 15 | for j = 1:k 16 | idx = (z == j); 17 | y(idx) = W(1:(end-1),j)'*X(:,idx)+W(end,j); 18 | end 19 | 20 | 21 | -------------------------------------------------------------------------------- /chapter14/adaboostBinPred.m: -------------------------------------------------------------------------------- 1 | function t = adaboostBinPred(model, X) 2 | % Prediction of binary Adaboost 3 | % input: 4 | % model: trained model structure 5 | % X: d x n data matrix 6 | % output: 7 | % t: 1 x n prediction 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | Alpha = model.alpha; 10 | Theta = model.theta; 11 | M = size(Alpha,2); 12 | t = zeros(1,size(X,2)); 13 | for m = 1:M 14 | c = Theta(:,:,m); 15 | [~,y] = min(sqdist(c,X),[],1); 16 | y(y==1) = -1; 17 | y(y==2) = 1; 18 | t = t+Alpha(m)*y; 19 | end 20 | t = sign(t); 21 | t(t==-1) = 0; -------------------------------------------------------------------------------- /common/slice.m: -------------------------------------------------------------------------------- 1 | function B = slice(A, dim, index) 2 | % slice(A,2,index) = A(:,index,:) 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | sz = size(A); 5 | sz(dim) = numel(index); 6 | IDX = cell(1,ndims(A)); 7 | for i = 1:ndims(A) 8 | if i == dim 9 | idx = index; 10 | else 11 | idx = 1:sz(i); 12 | end 13 | shape = ones(1,ndims(A)); 14 | shape(i) = sz(i); 15 | idx = reshape(idx,shape); 16 | shape = sz; 17 | shape(i) = 1; 18 | idx = repmat(idx,shape); 19 | IDX{i} = idx(:); 20 | end 21 | B = reshape(A(sub2ind(size(A),IDX{:})),sz); -------------------------------------------------------------------------------- /chapter03/linRnd.m: -------------------------------------------------------------------------------- 1 | function [X, t] = linRnd(d, n) 2 | % Generate data from a linear model p(t|w,x)=G(w'x+w0,sigma), sigma=sqrt(1/beta) 3 | % where w and w0 are generated from Gauss(0,1), beta is generated from 4 | % Gamma(1,1), X is generated form [0,1]. 5 | % Input: 6 | % d: dimension of data 7 | % n: number of data 8 | % Output: 9 | % X: d x n data matrix 10 | % t: 1 x n response variable 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | beta = randg; % need statistcs toolbox 13 | X = rand(d,n); 14 | w = randn(d,1); 15 | w0 = randn(1,1); 16 | t = w'*X+w0+randn(1,n)/sqrt(beta); -------------------------------------------------------------------------------- /chapter06/knKmeansPred.m: -------------------------------------------------------------------------------- 1 | function [label, energy] = knKmeansPred(model, Xt) 2 | % Prediction for kernel kmeans clusterng 3 | % Input: 4 | % model: trained model structure 5 | % Xt: d x n testing data 6 | % Ouput: 7 | % label: 1 x n predict label 8 | % engery: optimization target value 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | X = model.X; 11 | t = model.label; 12 | kn = model.kn; 13 | 14 | n = size(X,2); 15 | k = max(t); 16 | E = sparse(t,1:n,1,k,n,n); 17 | E = E./sum(E,2); 18 | Z = E*kn(X,Xt)-dot(E*kn(X,X),E,2)/2; 19 | [val, label] = max(Z,[],1); 20 | energy = sum(kn(Xt))-2*sum(val); 21 | -------------------------------------------------------------------------------- /chapter02/logDirichlet.m: -------------------------------------------------------------------------------- 1 | function y = logDirichlet(X, a) 2 | % Compute log pdf of a Dirichlet distribution. 3 | % Input: 4 | % X: d x n data matrix, each column sums to one (sum(X,1)==ones(1,n) && X>=0) 5 | % a: d x k parameter of Dirichlet 6 | % y: k x n probability density 7 | % Output: 8 | % y: k x n probability density in logrithm scale y=log p(x) 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | X = bsxfun(@times,X,1./sum(X,1)); 11 | if size(a,1) == 1 12 | a = repmat(a,size(X,1),1); 13 | end 14 | c = gammaln(sum(a,1))-sum(gammaln(a),1); 15 | g = (a-1)'*log(X); 16 | y = bsxfun(@plus,g,c'); 17 | -------------------------------------------------------------------------------- /chapter02/logMvGamma.m: -------------------------------------------------------------------------------- 1 | function y = logMvGamma(x, d) 2 | % Compute logarithm multivariate Gamma function 3 | % which is used in the probability density function of the Wishart and inverse Wishart distributions. 4 | % Gamma_d(x) = pi^(d(d-1)/4) \prod_(j=1)^d Gamma(x+(1-j)/2) 5 | % log(Gamma_d(x)) = d(d-1)/4 log(pi) + \sum_(j=1)^d log(Gamma(x+(1-j)/2)) 6 | % Input: 7 | % x: m x n data matrix 8 | % d: dimension 9 | % Output: 10 | % y: m x n logarithm multivariate Gamma 11 | % Written by Michael Chen (sth4nth@gmail.com). 12 | y = d*(d-1)/4*log(pi)+sum(gammaln(x(:)+(1-(1:d))/2),2); 13 | y = reshape(y,size(x)); -------------------------------------------------------------------------------- /chapter12/pca.m: -------------------------------------------------------------------------------- 1 | function [U, L, mu, mse] = pca(X, m) 2 | % Principal component analysis 3 | % Input: 4 | % X: d x n data matrix 5 | % m: target dimension 6 | % Output: 7 | % U: d x m Projection matrix 8 | % L: m x 1 Eigen values 9 | % mu: d x 1 mean 10 | % mse: mean square error 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | n = size(X,2); 13 | mu = mean(X,2); 14 | Xo = bsxfun(@minus,X,mu); 15 | S = Xo*Xo'/n; % 12.3 16 | [U,L] = eig(S); % 12.5 17 | [L,idx] = sort(diag(L),'descend'); 18 | mse = sum(L)-sum(L(1:m)); 19 | U = U(:,idx(1:m)); 20 | L = L(1:m); 21 | 22 | -------------------------------------------------------------------------------- /common/lattice.m: -------------------------------------------------------------------------------- 1 | function A = lattice( sz ) 2 | % Create an undirected graph corresponding to sz lattice 3 | % Example: 4 | % plot(graph(lattice([2,2,3]))) 5 | % Input: 6 | % sz: 1 x d size of lattice 7 | % Output: 8 | % A: prod(sz) x prod(sz) adjacent matrix of an undirected graph 9 | % Written by Mo Chen (sth4nth@gmail.com) 10 | d = numel(sz); 11 | step = cumprod(sz); 12 | n = step(end); 13 | M = reshape(1:n,sz); 14 | S = arrayfun(@(i) reshape(slice(M,i,1:sz(i)-1),1,[]), 1:d,'UniformOutput',false); 15 | T = arrayfun(@(i) reshape(slice(M,i,2:sz(i)),1,[]), 1:d,'UniformOutput',false); 16 | A = sparse([S{:}],[T{:}],1,n,n); 17 | A = A+A'; -------------------------------------------------------------------------------- /chapter02/logGauss.m: -------------------------------------------------------------------------------- 1 | function y = logGauss(X, mu, sigma) 2 | % Compute log pdf of a Gaussian distribution. 3 | % Input: 4 | % X: d x n data matrix 5 | % mu: d x 1 mean vector of Gaussian 6 | % sigma: d x d covariance matrix of Gaussian 7 | % Output: 8 | % y: 1 x n probability density in logrithm scale y=log p(x) 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | d = size(X,1); 11 | X = X-mu; 12 | [U,p]= chol(sigma); 13 | if p ~= 0 14 | error('ERROR: sigma is not PD.'); 15 | end 16 | Q = U'\X; 17 | q = dot(Q,Q,1); % quadratic term (M distance) 18 | c = d*log(2*pi)+2*sum(log(diag(U))); % normalization constant 19 | y = -(c+q)/2; 20 | -------------------------------------------------------------------------------- /chapter09/mixBernRnd.m: -------------------------------------------------------------------------------- 1 | function [X, z, mu] = mixBernRnd(d, k, n) 2 | % Generate samples from a Bernoulli mixture distribution. 3 | % Input: 4 | % d: dimension of data 5 | % k: number of components 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % z: 1 x n response variable 10 | % mu: d x k parameters of each Bernoulli component 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | 13 | % w = dirichletRnd(1,ones(1,k)/k); 14 | w = ones(1,k)/k; 15 | z = discreteRnd(w,n); 16 | mu = rand(d,k); 17 | X = zeros(d,n); 18 | for i = 1:k 19 | idx = z==i; 20 | X(:,idx) = bsxfun(@le,rand(d,sum(idx)), mu(:,i)); 21 | end 22 | -------------------------------------------------------------------------------- /chapter11/mixGaussSample.m: -------------------------------------------------------------------------------- 1 | function [X, z] = mixGaussSample(Theta, w, n ) 2 | % Genarate samples form a Gaussian mixture model with GaussianWishart prior. 3 | % Input: 4 | % Theta: cell of GaussianWishart priors of components 5 | % w: weight of components 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % z: 1 x n response variable 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | z = discreteRnd(w,n); 12 | d = Theta{1}.dim(); 13 | X = zeros(d,n); 14 | for i = 1:numel(w) 15 | idx = z==i; 16 | [mu,Sigma] = Theta{i}.sample(); % invpd(wishrnd(W0,v0)); 17 | X(:,idx) = gaussRnd(mu,Sigma,sum(idx)); 18 | end 19 | -------------------------------------------------------------------------------- /chapter09/kmeansRnd.m: -------------------------------------------------------------------------------- 1 | function [X, z, mu] = kmeansRnd(d, k, n) 2 | % Generate samples from a Gaussian mixture distribution with common variances (kmeans model). 3 | % Input: 4 | % d: dimension of data 5 | % k: number of components 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % z: 1 x n response variable 10 | % mu: d x k centers of clusters 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | alpha = 1; 13 | beta = nthroot(k,d); % k points in volume x^d : x^d=k 14 | 15 | X = randn(d,n); 16 | w = dirichletRnd(alpha,ones(1,k)/k); 17 | z = discreteRnd(w,n); 18 | E = full(sparse(z,1:n,1,k,n,n)); 19 | mu = randn(d,k)*beta; 20 | X = X+mu*E; -------------------------------------------------------------------------------- /demo/ch05/mlp_demo.m: -------------------------------------------------------------------------------- 1 | clear; close all 2 | %% Regression 3 | n = 200; 4 | x = linspace(0,2*pi,n); 5 | y = sin(x); 6 | 7 | h = [10,6]; % two hidden layers with 10 and 6 neurons 8 | lambda = 1e-2; 9 | [model, L] = mlpReg(x,y,h,lambda); 10 | t = mlpRegPred(model,x); 11 | plot(L); 12 | figure; 13 | hold on 14 | plot(x,y,'.'); 15 | plot(x,t); 16 | hold off 17 | %% Classification 18 | clear; 19 | k = 2; 20 | n = 200; 21 | [X,y] = kmeansRnd(2,k,n); 22 | figure; 23 | plotClass(X,y); 24 | 25 | h = 3; 26 | lambda = 1e-2; 27 | [model, llh] = mlpClass(X,y,h,lambda); 28 | [t,p] = mlpClassPred(model,X); 29 | plot(llh); 30 | figure; 31 | plotClass(X,t); 32 | figure; 33 | -------------------------------------------------------------------------------- /demo/ch12/pca_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch12 2 | 3 | clear; close all; 4 | d = 3; 5 | m = 2; 6 | n = 1000; 7 | 8 | X = ppcaRnd(m,d,n); 9 | plotClass(X); 10 | %% PCA , EM PCA and Constraint EM PCA produce the same result in the sense of reconstruction mseor 11 | % classical PCA 12 | [U,L,mu,mse1] = pca(X,m); 13 | Y = U'*bsxfun(@minus,X,mu); % projection 14 | Z1 = bsxfun(@times,Y,1./sqrt(L)); % whiten 15 | figure; 16 | plotClass(Y); 17 | figure; 18 | plotClass(Z1); 19 | mse1 20 | % EM PCA 21 | [W2,Z2,mu,mse2] = pcaEm(X,m); 22 | figure; 23 | plotClass(Z1); 24 | mse2 25 | % Contrained EM PCA 26 | [W3,Z3,mu,mse3] = pcaEmC(X,m); 27 | figure; 28 | plotClass(Z1); 29 | mse3 30 | -------------------------------------------------------------------------------- /chapter01/jointEntropy.m: -------------------------------------------------------------------------------- 1 | function z = jointEntropy(x, y) 2 | % Compute joint entropy z=H(x,y) of two discrete variables x and y. 3 | % Input: 4 | % x, y: two integer vector of the same length 5 | % Output: 6 | % z: joint entroy z=H(x,y) 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | assert(numel(x) == numel(y)); 9 | n = numel(x); 10 | x = reshape(x,1,n); 11 | y = reshape(y,1,n); 12 | 13 | l = min(min(x),min(y)); 14 | x = x-l+1; 15 | y = y-l+1; 16 | k = max(max(x),max(y)); 17 | 18 | idx = 1:n; 19 | p = nonzeros(sparse(idx,x,1,n,k,n)'*sparse(idx,y,1,n,k,n)/n); %joint distribution of x and y 20 | 21 | z = -dot(p,log2(p)); 22 | z = max(0,z); -------------------------------------------------------------------------------- /chapter08/NaiveBayes/nbGauss.m: -------------------------------------------------------------------------------- 1 | function model = nbGauss(X, t) 2 | % Naive bayes classifier with indepenet Gaussian 3 | % Each dimension of data is assumed from a 1d Gaussian distribution with independent mean and variance. 4 | % Input: 5 | % X: d x n data matrix 6 | % t: 1 x n label (1~k) 7 | % Output: 8 | % model: trained model structure 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | n = size(X,2); 11 | k = max(t); 12 | E = sparse(t,1:n,1,k,n,n); 13 | nk = full(sum(E,2)); 14 | w = nk/n; 15 | R = E'*spdiags(1./nk,0,k,k); 16 | mu = X*R; 17 | var = X.^2*R-mu.^2; 18 | 19 | model.mu = mu; % d x k means 20 | model.var = var; % d x k variances 21 | model.w = w; -------------------------------------------------------------------------------- /chapter13/HMM/hmmRnd.m: -------------------------------------------------------------------------------- 1 | function [x, model] = hmmRnd(d, k, n) 2 | % Generate a data sequence from a hidden Markov model. 3 | % Input: 4 | % d: dimension of data 5 | % k: dimension of latent variable 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % model: model structure 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | A = normalize(rand(k,k),2); 12 | E = normalize(rand(k,d),2); 13 | s = normalize(rand(k,1),1); 14 | 15 | x = zeros(1,n); 16 | z = discreteRnd(s); 17 | x(1) = discreteRnd(E(z,:)); 18 | for i = 2:n 19 | z = discreteRnd(A(z,:)); 20 | x(i) = discreteRnd(E(z,:)); 21 | end 22 | 23 | model.A = A; 24 | model.E = E; 25 | model.s = s; -------------------------------------------------------------------------------- /chapter08/MRF/mrfIsGa.m: -------------------------------------------------------------------------------- 1 | function [A, nodePot, edgePot] = mrfIsGa(im, sigma, J) 2 | % Contruct a latent Ising MRF with Gaussian observation 3 | % Input: 4 | % im: row x col image 5 | % sigma: variance of Gaussian node potential 6 | % J: parameter of Ising edge 7 | % Output: 8 | % A: n x n adjacent matrix 9 | % nodePot: 2 x n node potential 10 | % edgePot: 2 x 2 x m edge potential 11 | % Written by Mo Chen (sth4nth@gmail.com) 12 | A = lattice(size(im)); 13 | [s,t,e] = find(triu(A)); 14 | m = numel(e); 15 | e(:) = 1:m; 16 | A = sparse([s;t],[t;s],[e;e]); 17 | 18 | z = [1;-1]; 19 | x = reshape(im,1,[]); 20 | nodePot = -(x-z).^2/(2*sigma^2); 21 | edgePot = repmat(J*(z*z'),[1, 1, m]); -------------------------------------------------------------------------------- /common/plotgm.m: -------------------------------------------------------------------------------- 1 | function plotgm(X, model) 2 | % Plot 2d Gaussian mixture model. 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | level = 64; 5 | n = 256; 6 | 7 | spread(X); 8 | x_range = xlim; 9 | y_range = ylim; 10 | 11 | x = linspace(x_range(1),x_range(2), n); 12 | y = linspace(y_range(2),y_range(1), n); 13 | 14 | [a,b] = meshgrid(x,y); 15 | z = exp(loggmpdf([a(:)';b(:)'],model)); 16 | 17 | z = z-min(z); 18 | z = floor(z/max(z)*(level-1)); 19 | 20 | figure; 21 | image(reshape(z,n,n)); 22 | colormap(jet(level)); 23 | set(gca, 'XTick', [1 256]); 24 | set(gca, 'XTickLabel', [min(x) max(x)]); 25 | set(gca, 'YTick', [1 256]); 26 | set(gca, 'YTickLabel', [min(y) max(y)]); 27 | axis off 28 | -------------------------------------------------------------------------------- /chapter08/NaiveBayes/nbGaussPred.m: -------------------------------------------------------------------------------- 1 | function y = nbGaussPred(model, X) 2 | % Prediction of naive Bayes classifier with independent Gaussian. 3 | % input: 4 | % model: trained model structure 5 | % X: d x n data matrix 6 | % output: 7 | % y: 1 x n predicted class label 8 | % Written by Mo Chen (sth4nth@gmail.com). 9 | mu = model.mu; 10 | var = model.var; 11 | w = model.w; 12 | assert(all(size(mu)==size(var))); 13 | d = size(mu,1); 14 | 15 | lambda = 1./var; 16 | ml = mu.*lambda; 17 | M = bsxfun(@plus,lambda'*X.^2-2*ml'*X,dot(mu,ml,1)'); % M distance 18 | c = d*log(2*pi)+2*sum(log(var),1)'; % normalization constant 19 | R = -0.5*bsxfun(@plus,M,c); 20 | [~,y] = max(bsxfun(@times,exp(R),w),[],1); 21 | -------------------------------------------------------------------------------- /demo/ch09/kmeans_demo.m: -------------------------------------------------------------------------------- 1 | close all; clear; 2 | d = 2; 3 | k = 3; 4 | n = 5000; 5 | %% Generate data 6 | [X,label] = kmeansRnd(d,k,n); 7 | plotClass(X,label); 8 | %% kmeans with random initialization 9 | y = kmeans(X,k); 10 | figure; 11 | plotClass(X,y); 12 | %% kmeans init with labels 13 | y = kmeans(X,label); 14 | figure; 15 | plotClass(X,y); 16 | %% kmeans init with centers 17 | mu = rand(d,k); 18 | y = kmeans(X,mu); 19 | figure; 20 | plotClass(X,y); 21 | %% kmeans init with kmeans++ seeding 22 | y = kmeans(X,kseeds(X,k)); 23 | figure; 24 | plotClass(X,y); 25 | %% kmeans++ seeding 26 | mu = kseeds(X,k); 27 | [~,y] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels 28 | figure; 29 | plotClass(X,y); 30 | -------------------------------------------------------------------------------- /chapter04/binPlot.m: -------------------------------------------------------------------------------- 1 | function binPlot(model, X, t) 2 | % Plot binary classification result for 2d data 3 | % Input: 4 | % model: trained model structure 5 | % X: 2 x n data matrix 6 | % t: 1 x n label 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | assert(size(X,1) == 2); 9 | w = model.w; 10 | xi = min(X,[],2); 11 | xa = max(X,[],2); 12 | [x1,x2] = meshgrid(linspace(xi(1),xa(1)), linspace(xi(2),xa(2))); 13 | 14 | color = 'brgmcyk'; 15 | m = length(color); 16 | figure(gcf); 17 | axis equal 18 | clf; 19 | hold on; 20 | view(2); 21 | for i = 1:max(t) 22 | idc = t==i; 23 | scatter(X(1,idc),X(2,idc),36,color(mod(i-1,m)+1)); 24 | end 25 | y = w(1)*x1+w(2)*x2+w(3); 26 | contour(x1,x2,y,[-0 0]); 27 | hold off; 28 | -------------------------------------------------------------------------------- /demo/ch10/mixGaussVb_demo.m: -------------------------------------------------------------------------------- 1 | 2 | %% Variational Bayesian for Gaussian Mixture Model 3 | close all; clear; 4 | d = 2; 5 | k = 3; 6 | n = 2000; 7 | [X,z] = mixGaussRnd(d,k,n); 8 | plotClass(X,z); 9 | m = floor(n/2); 10 | X1 = X(:,1:m); 11 | X2 = X(:,(m+1):end); 12 | % VB fitting 13 | [y1, model, L] = mixGaussVb(X1,10); 14 | figure; 15 | plotClass(X1,y1); 16 | figure; 17 | plot(L) 18 | % Model Evidence 19 | prior.alpha = 1; 20 | prior.kappa = 1; 21 | prior.m = mean(X1,2); 22 | prior.v = d+1; 23 | prior.M = eye(d); % M = inv(W) 24 | L0 = mixGaussEvidence(X1, model, prior); 25 | L0-L(end) 26 | % Predict testing data 27 | [y2, R] = mixGaussVbPred(model,X2); 28 | figure; 29 | plotClass(X2,y2); 30 | 31 | -------------------------------------------------------------------------------- /chapter13/LDS/ldsPca.m: -------------------------------------------------------------------------------- 1 | function [A, C, Z] = ldsPca(X, k, m) 2 | % Subspace method for learning linear dynamic system. 3 | % Input: 4 | % X: d x n data matrix 5 | % k: dimension of hidden variable 6 | % m: stacking order for the Hankel matrix 7 | % Output: 8 | % A: k x k transition matrix 9 | % C: k x d emission matrix 10 | % Z: k x n latent variable 11 | % Y: d x n reconstructed data 12 | % reference: Bayesian Reasoning and Machine Learning (BRML) chapter 24.5.3 p.507 13 | % Written by Mo Chen (sth4nth@gmail.com). 14 | [d,n] = size(X); 15 | H = reshape(X(:,hankel(1:m,m:n)),d*m,[]); 16 | [U,S,V] = svd(H,'econ'); 17 | C = U(1:d,1:k); 18 | Z = S(1:k,1:k)*V(:,1:k)'; 19 | A = Z(:,2:end)/Z(:,1:end-1); % estimated transition 20 | % Y = C*Z; % reconstructions -------------------------------------------------------------------------------- /common/loggmpdf.m: -------------------------------------------------------------------------------- 1 | function r = loggmpdf(X, model) 2 | % Compute log pdf of a Gaussian mixture model. 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | mu = model.mu; 5 | Sigma = model.Sigma; 6 | w = model.weight; 7 | 8 | n = size(X,2); 9 | k = size(mu,2); 10 | logRho = zeros(k,n); 11 | 12 | for i = 1:k 13 | logRho(i,:) = loggausspdf(X,mu(:,i),Sigma(:,:,i)); 14 | end 15 | r = logsumexp(bsxfun(@plus,logRho,log(w)'),1); 16 | 17 | 18 | function y = loggausspdf(X, mu, Sigma) 19 | d = size(X,1); 20 | X = bsxfun(@minus,X,mu); 21 | [U,p]= chol(Sigma); 22 | if p ~= 0 23 | error('ERROR: Sigma is not PD.'); 24 | end 25 | Q = U'\X; 26 | q = dot(Q,Q,1); % quadratic term (M distance) 27 | c = d*log(2*pi)+2*sum(log(diag(U))); % normalization constant 28 | y = -(c+q)/2; -------------------------------------------------------------------------------- /common/plotkde.m: -------------------------------------------------------------------------------- 1 | function plotkde(X, sigma2) 2 | % Plot 2d kernel density. 3 | % Written by Mo Chen (sth4nth@gmail.com). 4 | if nargin < 2 5 | sigma2 = 1e-1; 6 | end 7 | level = 64; 8 | n = 256; 9 | 10 | X = standardize(X); 11 | 12 | spread(X); 13 | x_range = xlim; 14 | y_range = ylim; 15 | 16 | x = linspace(x_range(1),x_range(2), n); 17 | y = linspace(y_range(2),y_range(1), n); 18 | 19 | [a,b] = meshgrid(x,y); 20 | 21 | z = exp(logkdepdf([a(:)';b(:)'],X,sigma2)); 22 | 23 | z = z-min(z); 24 | z = floor(z/max(z)*(level-1)); 25 | 26 | figure; 27 | image(reshape(z,n,n)); 28 | colormap(jet(level)); 29 | set(gca, 'XTick', [1 256]); 30 | set(gca, 'XTickLabel', [min(x) max(x)]); 31 | set(gca, 'YTick', [1 256]); 32 | set(gca, 'YTickLabel', [min(y) max(y)]); 33 | axis off 34 | -------------------------------------------------------------------------------- /chapter01/relatEntropy.m: -------------------------------------------------------------------------------- 1 | function z = relatEntropy (x, y) 2 | % Compute relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y)) of two discrete variables x and y. 3 | % Input: 4 | % x, y: two integer vector of the same length 5 | % Output: 6 | % z: relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y)) 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | assert(numel(x) == numel(y)); 9 | n = numel(x); 10 | x = reshape(x,1,n); 11 | y = reshape(y,1,n); 12 | 13 | l = min(min(x),min(y)); 14 | x = x-l+1; 15 | y = y-l+1; 16 | k = max(max(x),max(y)); 17 | 18 | idx = 1:n; 19 | Mx = sparse(idx,x,1,n,k,n); 20 | My = sparse(idx,y,1,n,k,n); 21 | Px = nonzeros(mean(Mx,1)); 22 | Py = nonzeros(mean(My,1)); 23 | 24 | z = -dot(Px,log2(Py)-log2(Px)); 25 | z = max(0,z); -------------------------------------------------------------------------------- /chapter14/mixLinPred.m: -------------------------------------------------------------------------------- 1 | function [y, z, p] = mixLinPred(model, X, t) 2 | % Prediction function for mxiture of linear regression 3 | % input: 4 | % model: trained model structure 5 | % X: d x n data matrix 6 | % t:(optional) 1 x n responding vector 7 | % output: 8 | % y: 1 x n prediction 9 | % z: 1 x n cluster label 10 | % p: 1 x n predict probability for t 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | W = model.W; 13 | alpha = model.alpha; 14 | beta = model.beta; 15 | 16 | X = [X;ones(1,size(X,2))]; % adding the bias term 17 | y = W'*X; 18 | D = bsxfun(@minus,y,t).^2; 19 | logRho = (-0.5)*beta*D; 20 | logRho = bsxfun(@plus,logRho,log(alpha)); 21 | T = logsumexp(logRho,1); 22 | p = exp(T); 23 | logR = bsxfun(@minus,logRho,T); 24 | R = exp(logR); 25 | z = max(R,[],1); 26 | -------------------------------------------------------------------------------- /demo/ch10/rvmRegVb_spSignal_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch07 2 | 3 | %% sparse signal recovery demo 4 | clear; close all; 5 | 6 | d = 512; % signal length 7 | k = 20; % number of spikes 8 | n = 100; % number of measurements 9 | % 10 | % random +/- 1 signal 11 | x = zeros(d,1); 12 | q = randperm(d); 13 | x(q(1:k)) = sign(randn(k,1)); 14 | 15 | % projection matrix 16 | A = unitize(randn(d,n),1); 17 | % noisy observations 18 | sigma = 0.005; 19 | e = sigma*randn(1,n); 20 | y = x'*A + e; 21 | 22 | [model,llh] = rvmRegVb(A,y); 23 | plot(llh); 24 | m = model.w; 25 | 26 | h = max(abs(x))+0.2; 27 | x_range = [1,d]; 28 | y_range = [-h,+h]; 29 | figure; 30 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal'); 31 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal'); 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /chapter04/fda.m: -------------------------------------------------------------------------------- 1 | function U = fda(X, t, q) 2 | % Fisher (linear) discriminant analysis 3 | % Input: 4 | % X: d x n data matrix 5 | % t: 1 x n class label 6 | % d: target dimension 7 | % Output: 8 | % U: projection matrix y=U'*x 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | n = size(X,2); 11 | k = max(t); 12 | 13 | E = sparse(1:n,t,true,n,k,n); % transform label into indicator matrix 14 | nk = full(sum(E)); 15 | 16 | m = mean(X,2); 17 | Xo = bsxfun(@minus,X,m); 18 | St = (Xo*Xo')/n; % 4.43 19 | 20 | mk = bsxfun(@times,X*E,1./nk); 21 | mo = bsxfun(@minus,mk,m); 22 | mo = bsxfun(@times,mo,sqrt(nk/n)); 23 | Sb = mo*mo'; % 4.46 24 | % Sw = St-Sb; % 4.45 25 | 26 | [U,A] = eig(Sb,St,'chol'); 27 | [~,idx] = sort(diag(A),'descend'); 28 | U = U(:,idx(1:q)); 29 | -------------------------------------------------------------------------------- /chapter01/condEntropy.m: -------------------------------------------------------------------------------- 1 | function z = condEntropy (x, y) 2 | % Compute conditional entropy z=H(x|y) of two discrete variables x and y. 3 | % Input: 4 | % x, y: two integer vector of the same length 5 | % Output: 6 | % z: conditional entropy z=H(x|y) 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | assert(numel(x) == numel(y)); 9 | n = numel(x); 10 | x = reshape(x,1,n); 11 | y = reshape(y,1,n); 12 | 13 | l = min(min(x),min(y)); 14 | x = x-l+1; 15 | y = y-l+1; 16 | k = max(max(x),max(y)); 17 | 18 | idx = 1:n; 19 | Mx = sparse(idx,x,1,n,k,n); 20 | My = sparse(idx,y,1,n,k,n); 21 | Pxy = nonzeros(Mx'*My/n); %joint distribution of x and y 22 | Hxy = -dot(Pxy,log2(Pxy)); 23 | 24 | Py = nonzeros(mean(My,1)); 25 | Hy = -dot(Py,log2(Py)); 26 | 27 | % conditional entropy H(x|y) 28 | z = Hxy-Hy; 29 | z = max(0,z); 30 | -------------------------------------------------------------------------------- /chapter06/knReg.m: -------------------------------------------------------------------------------- 1 | function model = knReg(X, t, lambda, kn) 2 | % Gaussian process (kernel) regression 3 | % Input: 4 | % X: d x n data 5 | % t: 1 x n response 6 | % lambda: regularization parameter 7 | % Output: 8 | % model: trained model structure 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | if nargin < 4 11 | kn = @knGauss; 12 | end 13 | if nargin < 3 14 | lambda = 1e-2; 15 | end 16 | K = knCenter(kn,X); 17 | tbar = mean(t); 18 | U = chol(K+lambda*eye(size(X,2))); % 6.62 19 | a = U\(U'\(t(:)-tbar)); % 6.68 20 | 21 | model.kn = kn; 22 | model.a = a; 23 | model.X = X; 24 | model.tbar = tbar; 25 | %% for probability prediction 26 | y = a'*K+tbar; 27 | beta = 1/mean((t-y).^2); % 3.21 28 | alpha = lambda*beta; % lambda=a/b P.153 3.55 29 | model.alpha = alpha; 30 | model.beta = beta; 31 | model.U = U; -------------------------------------------------------------------------------- /demo/ch07/rvmRegEm_spSignal_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch07 2 | 3 | %% sparse signal recovery demo 4 | clear; close all; 5 | 6 | d = 512; % signal length 7 | k = 20; % number of spikes 8 | n = 100; % number of measurements 9 | % 10 | % random +/- 1 signal 11 | x = zeros(d,1); 12 | q = randperm(d); 13 | x(q(1:k)) = sign(randn(k,1)); 14 | 15 | % projection matrix 16 | A = unitize(randn(d,n),1); 17 | % noisy observations 18 | sigma = 0.005; 19 | e = sigma*randn(1,n); 20 | y = x'*A + e; 21 | 22 | [model,llh] = rvmRegEm(A,y); 23 | plot(llh); 24 | 25 | m = zeros(d,1); 26 | m(model.index) = model.w; 27 | 28 | h = max(abs(x))+0.2; 29 | x_range = [1,d]; 30 | y_range = [-h,+h]; 31 | figure; 32 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal'); 33 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal'); 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /demo/ch07/rvmRegSeq_spSignal_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch07 2 | 3 | %% sparse signal recovery demo 4 | clear; close all; 5 | 6 | d = 512; % signal length 7 | k = 20; % number of spikes 8 | n = 100; % number of measurements 9 | % 10 | % random +/- 1 signal 11 | x = zeros(d,1); 12 | q = randperm(d); 13 | x(q(1:k)) = sign(randn(k,1)); 14 | 15 | % projection matrix 16 | A = unitize(randn(d,n),1); 17 | % noisy observations 18 | sigma = 0.005; 19 | e = sigma*randn(1,n); 20 | y = x'*A + e; 21 | 22 | [model,llh] = rvmRegSeq(A,y); 23 | plot(llh); 24 | 25 | m = zeros(d,1); 26 | m(model.index) = model.w; 27 | 28 | h = max(abs(x))+0.2; 29 | x_range = [1,d]; 30 | y_range = [-h,+h]; 31 | figure; 32 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal'); 33 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal'); 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /chapter07/rvmRegPred.m: -------------------------------------------------------------------------------- 1 | function [y, sigma, p] = rvmRegPred(model, X, t) 2 | % Compute RVM regression model reponse y = w'*X+w0 and likelihood 3 | % Input: 4 | % model: trained model structure 5 | % X: d x n testing data 6 | % t (optional): 1 x n testing response 7 | % Output: 8 | % y: 1 x n prediction 9 | % sigma: variance 10 | % p: 1 x n likelihood of t 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | index = model.index; 13 | w = model.w; 14 | w0 = model.w0; 15 | 16 | X = X(index,:); 17 | y = w'*X+w0; 18 | %% probability prediction 19 | if nargout > 1 20 | beta = model.beta; 21 | U = model.U; % 3.54 22 | Xo = bsxfun(@minus,X,model.xbar); 23 | XU = U'\Xo; 24 | sigma = sqrt((1+dot(XU,XU,1))/beta); %3.59 25 | end 26 | 27 | if nargin == 3 && nargout == 3 28 | p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma)); 29 | end 30 | -------------------------------------------------------------------------------- /chapter06/knRegPred.m: -------------------------------------------------------------------------------- 1 | function [y, sigma, p] = knRegPred(model, Xt, t) 2 | % Prediction for Gaussian Process (kernel) regression model 3 | % Input: 4 | % model: trained model structure 5 | % Xt: d x n testing data 6 | % t (optional): 1 x n testing response 7 | % Output: 8 | % y: 1 x n prediction 9 | % sigma: variance 10 | % p: 1 x n likelihood of t 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | kn = model.kn; 13 | a = model.a; 14 | X = model.X; 15 | tbar = model.tbar; 16 | Kt = knCenter(kn,X,X,Xt); 17 | y = a'*Kt+tbar; 18 | %% probability prediction 19 | if nargout > 1 20 | alpha = model.alpha; 21 | beta = model.beta; 22 | U = model.U; 23 | XU = U'\Kt; 24 | sigma = sqrt(1/beta+(knCenter(kn,X,Xt)-dot(XU,XU,1))/alpha); 25 | end 26 | 27 | if nargin == 3 && nargout == 3 28 | p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma)); 29 | end -------------------------------------------------------------------------------- /demo/ch07/rvmRegFp_spSignal_demo.m: -------------------------------------------------------------------------------- 1 | % demos for ch07 2 | 3 | %% sparse signal recovery demo 4 | clear; close all; 5 | 6 | d = 512; % signal length 7 | k = 20; % number of spikes 8 | n = 100; % number of measurements 9 | % 10 | % random +/- 1 signal 11 | x = zeros(d,1); 12 | q = randperm(d); 13 | x(q(1:k)) = sign(randn(k,1)); 14 | 15 | % projection matrix 16 | A = unitize(randn(d,n),1); 17 | % noisy observations 18 | sigma = 0.005; 19 | e = sigma*randn(1,n); 20 | y = x'*A + e; 21 | 22 | [model,llh] = rvmRegFp(A,y); 23 | plot(llh); 24 | 25 | m = zeros(d,1); 26 | m(model.index) = model.w; 27 | 28 | h = max(abs(x))+0.2; 29 | x_range = [1,d]; 30 | y_range = [-h,+h]; 31 | figure; 32 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal'); 33 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal'); 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /chapter01/mutInfo.m: -------------------------------------------------------------------------------- 1 | function z = mutInfo(x, y) 2 | % Compute mutual information I(x,y) of two discrete variables x and y. 3 | % Input: 4 | % x, y: two integer vector of the same length 5 | % Output: 6 | % z: mutual information z=I(x,y) 7 | % Written by Mo Chen (sth4nth@gmail.com). 8 | assert(numel(x) == numel(y)); 9 | n = numel(x); 10 | x = reshape(x,1,n); 11 | y = reshape(y,1,n); 12 | 13 | l = min(min(x),min(y)); 14 | x = x-l+1; 15 | y = y-l+1; 16 | k = max(max(x),max(y)); 17 | 18 | idx = 1:n; 19 | Mx = sparse(idx,x,1,n,k,n); 20 | My = sparse(idx,y,1,n,k,n); 21 | Pxy = nonzeros(Mx'*My/n); %joint distribution of x and y 22 | Hxy = -dot(Pxy,log2(Pxy)); 23 | 24 | Px = nonzeros(mean(Mx,1)); 25 | Py = nonzeros(mean(My,1)); 26 | 27 | % entropy of Py and Px 28 | Hx = -dot(Px,log2(Px)); 29 | Hy = -dot(Py,log2(Py)); 30 | % mutual information 31 | z = Hx+Hy-Hxy; 32 | z = max(0,z); -------------------------------------------------------------------------------- /chapter12/pcaEmC.m: -------------------------------------------------------------------------------- 1 | function [W, Z, mu, mse] = pcaEmC(X, m) 2 | % Perform Constrained EM like algorithm for PCA. 3 | % Input: 4 | % X: d x n data matrix 5 | % m: dimension of target space 6 | % Output: 7 | % W: d x m weight matrix 8 | % Z: m x n projected data matrix 9 | % mu: d x 1 mean vector 10 | % mse: mean square error 11 | % Reference: 12 | % A Constrained EM Algorithm for Principal Component Analysis by Jong-Hoon Ahn & Jong-Hoon Oh 13 | % Written by Mo Chen (sth4nth@gmail.com). 14 | 15 | d = size(X,1); 16 | mu = mean(X,2); 17 | X = bsxfun(@minus,X,mu); 18 | W = rand(d,m); 19 | 20 | tol = 1e-6; 21 | mse = inf; 22 | maxIter = 200; 23 | for iter = 1:maxIter 24 | Z = tril(W'*W)\(W'*X); 25 | W = (X*Z')/triu(Z*Z'); 26 | 27 | last = mse; 28 | E = X-W*Z; 29 | mse = mean(dot(E(:),E(:))); 30 | if abs(last-mse) 1 17 | beta = model.beta; 18 | if isfield(model,'U') 19 | U = model.U; % 3.54 20 | Xo = bsxfun(@minus,X,model.xbar); 21 | XU = U'\Xo; 22 | sigma = sqrt((1+dot(XU,XU,1))/beta); % 3.59 23 | else 24 | sigma = sqrt(1/beta)*ones(1,size(X,2)); 25 | end 26 | end 27 | 28 | if nargin == 3 && nargout == 3 29 | p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma)); 30 | end 31 | 32 | -------------------------------------------------------------------------------- /chapter12/pcaEm.m: -------------------------------------------------------------------------------- 1 | function [W, Z, mu, mse] = pcaEm(X, m) 2 | % Perform EM-like algorithm for PCA (by Sam Roweis). 3 | % Input: 4 | % X: d x n data matrix 5 | % m: dimension of target space 6 | % Output: 7 | % W: d x m weight matrix 8 | % Z: m x n projected data matrix 9 | % mu: d x 1 mean vector 10 | % mse: mean square error 11 | % Reference: 12 | % Pattern Recognition and Machine Learning by Christopher M. Bishop 13 | % EM algorithms for PCA and SPCA by Sam Roweis 14 | % Written by Mo Chen (sth4nth@gmail.com). 15 | d = size(X,1); 16 | mu = mean(X,2); 17 | X = bsxfun(@minus,X,mu); 18 | W = rand(d,m); 19 | 20 | tol = 1e-6; 21 | mse = inf; 22 | maxIter = 200; 23 | for iter = 1:maxIter 24 | Z = (W'*W)\(W'*X); % 12.58 25 | W = (X*Z')/(Z*Z'); % 12.59 26 | 27 | last = mse; 28 | E = X-W*Z; 29 | mse = mean(dot(E(:),E(:))); 30 | if abs(last-mse)0))); -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Mo Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /chapter06/knCenter.m: -------------------------------------------------------------------------------- 1 | function Kc = knCenter(kn, X, X1, X2) 2 | % Centerize the data in the kernel space 3 | % Input: 4 | % kn: kernel function 5 | % X: d x n data matrix of which the center in the kernel space is computed 6 | % X1, X2: d x n1 and d x n2 data matrix. the kernel k(x1,x2) is computed 7 | % where the origin of the kernel space is the center of phi(X) 8 | % Ouput: 9 | % Kc: n1 x n2 kernel matrix between X1 and X2 in kernel space centered by 10 | % center of phi(X) 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | K = kn(X,X); 13 | mK = mean(K); 14 | mmK = mean(mK); 15 | if nargin == 2 % compute the pairwise centerized version of the kernel of X. eq knCenter(kn,X,X,X) 16 | Kc = K+mmK-bsxfun(@plus,mK',mK); % Kc = K-M*K-K*M+M*K*M; where M = ones(n,n)/n; 17 | elseif nargin == 3 % compute the norms (k(x,x)) of X1 w.r.t. the center of X as the origin. eq diag(knCenter(kn,X,X1,X1)) 18 | Kc = kn(X1)+mmK-2*mean(kn(X,X1)); 19 | elseif nargin == 4 % compute the kernel of X1 and X2 w.r.t. the center of X as the origin 20 | Kc = kn(X1,X2)+mmK-bsxfun(@plus,mean(kn(X,X1))',mean(kn(X,X2))); 21 | end 22 | -------------------------------------------------------------------------------- /chapter11/mixDpGbOl.m: -------------------------------------------------------------------------------- 1 | function [label, Theta, w, llh] = mixDpGbOl(X, alpha, theta) 2 | % Online collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 3 | % Input: 4 | % X: d x n data matrix 5 | % alpha: parameter for Dirichlet process prior 6 | % theta: class object for prior of component distribution (such as Gauss) 7 | % Output: 8 | % label: 1 x n cluster label 9 | % Theta: 1 x k structure of trained components 10 | % w: 1 x k component weight vector 11 | % llh: loglikelihood 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | n = size(X,2); 14 | Theta = {}; 15 | nk = []; 16 | label = zeros(1,n); 17 | llh = 0; 18 | for i = randperm(n) 19 | x = X(:,i); 20 | Pk = log(nk)+cellfun(@(t) t.logPredPdf(x), Theta); 21 | P0 = log(alpha)+theta.logPredPdf(x); 22 | p = [Pk,P0]; 23 | llh = llh+sum(p-log(n)); 24 | k = discreteRnd(exp(p-logsumexp(p))); 25 | if k == numel(Theta)+1 26 | Theta{k} = theta.clone().addSample(x); 27 | nk = [nk,1]; 28 | else 29 | Theta{k} = Theta{k}.addSample(x); 30 | nk(k) = nk(k)+1; 31 | end 32 | label(i) = k; 33 | end 34 | w = nk/n; -------------------------------------------------------------------------------- /demo/ch11/gauss_demo.m: -------------------------------------------------------------------------------- 1 | 2 | %% Sequential update for Gaussian 3 | close all; clear; 4 | d = 2; 5 | n = 100; 6 | X = randn(d,n); 7 | x = randn(d,1); 8 | 9 | mu = mean(X,2); 10 | Xo = bsxfun(@minus,X,mu); 11 | Sigma = Xo*Xo'/n; 12 | p1 = logGauss(x,mu,Sigma); 13 | 14 | gauss = Gauss(X(:,3:end)).addSample(X(:,1)).addSample(X(:,2)).addSample(X(:,3)).delSample(X(:,3)); 15 | p2 = gauss.logPdf(x); 16 | maxdiff(p1,p2) 17 | %% Sequential update for Gaussian-Wishart 18 | close all; clear; 19 | d = 2; 20 | n = 100; 21 | X = randn(d,n); 22 | x = randn(d,1); 23 | 24 | kappa0 = 1; 25 | m0 = zeros(d,1); 26 | nu0 = d; 27 | S0 = eye(d); 28 | 29 | xbar = mean(X,2); 30 | kappa = kappa0+n; 31 | nu = nu0+n; 32 | m = (n*xbar+kappa0*m0)/kappa; 33 | Xo = bsxfun(@minus,X,m); 34 | X0 = m0-m; 35 | S = S0+Xo*Xo'+kappa0*(X0*X0'); 36 | 37 | v = (nu-d+1); 38 | r = (1+1/kappa)/v; 39 | p1 = logSt(x,m,r*S,v); 40 | 41 | gw0 = GaussWishart(kappa0,m0,nu0,S0); 42 | gw0 = gw0.addData(X); 43 | p0 = gw0.logPredPdf(x); 44 | 45 | gw = GaussWishart(kappa0,m0,nu0,S0); 46 | for i=1:n 47 | gw = gw.addSample(X(:,i)); 48 | end 49 | p2 = gw.logPredPdf(x); 50 | maxdiff(p1,p2) 51 | % 52 | -------------------------------------------------------------------------------- /chapter04/logitBin.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = logitBin(X, y, lambda) 2 | % Logistic regression for binary classification optimized by Newton-Raphson method. 3 | % Input: 4 | % X: d x n data matrix 5 | % y: 1 x n label (0/1) 6 | % lambda: regularization parameter 7 | % alpha: step size 8 | % Output: 9 | % model: trained model structure 10 | % llh: loglikelihood 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | if nargin < 4 13 | alpha = 1e-1; 14 | end 15 | if nargin < 3 16 | lambda = 1e-4; 17 | end 18 | X = [X; ones(1,size(X,2))]; 19 | [d,n] = size(X); 20 | tol = 1e-4; 21 | epoch = 200; 22 | llh = -inf(1,epoch); 23 | w = rand(d,1); 24 | for t = 2:epoch 25 | a = w'*X; 26 | llh(t) = (dot(a,y)-sum(log1pexp(a))-0.5*lambda*dot(w,w))/n; % 4.90 27 | if abs(llh(t)-llh(t-1)) < tol; break; end 28 | z = sigmoid(a); % 4.87 29 | g = X*(z-y)'+lambda*w; % 4.96 30 | r = z.*(1-z); % 4.98 31 | Xw = bsxfun(@times, X, sqrt(r)); 32 | H = Xw*Xw'+lambda*eye(d); % 4.97 33 | w = w-alpha*(H\g); % 4.92 34 | end 35 | llh = llh(2:t); 36 | model.w = w; 37 | -------------------------------------------------------------------------------- /chapter09/kmeans.m: -------------------------------------------------------------------------------- 1 | function [label, mu, energy] = kmeans(X, m) 2 | % Perform kmeans clustering. 3 | % Input: 4 | % X: d x n data matrix 5 | % m: initialization parameter 6 | % Output: 7 | % label: 1 x n sample labels 8 | % mu: d x k center of clusters 9 | % energy: optimization target value 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | label = init(X, m); 12 | n = numel(label); 13 | idx = 1:n; 14 | last = zeros(1,n); 15 | while any(label ~= last) 16 | [~,~,last(:)] = unique(label); % remove empty clusters 17 | mu = X*normalize(sparse(idx,last,1),1); % compute cluster centers 18 | [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels 19 | end 20 | energy = dot(X(:),X(:),1)+2*sum(val); 21 | 22 | function label = init(X, m) 23 | [d,n] = size(X); 24 | if numel(m) == 1 % random initialization 25 | mu = X(:,randperm(n,m)); 26 | [~,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); 27 | elseif all(size(m) == [1,n]) % init with labels 28 | label = m; 29 | elseif size(m,1) == d % init with seeds (centers) 30 | [~,label] = min(dot(m,m,1)'/2-m'*X,[],1); 31 | end -------------------------------------------------------------------------------- /chapter09/mixGaussRnd.m: -------------------------------------------------------------------------------- 1 | function [X, z, model] = mixGaussRnd(d, k, n) 2 | % Genarate samples form a Gaussian mixture model. 3 | % Input: 4 | % d: dimension of data 5 | % k: number of components 6 | % n: number of data 7 | % Output: 8 | % X: d x n data matrix 9 | % z: 1 x n response variable 10 | % model: model structure 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | alpha0 = 1; % hyperparameter of Dirichlet prior 13 | W0 = eye(d); % hyperparameter of inverse Wishart prior of covariances 14 | v0 = d+1; % hyperparameter of inverse Wishart prior of covariances 15 | mu0 = zeros(d,1); % hyperparameter of Guassian prior of means 16 | beta0 = nthroot(k,d); % hyperparameter of Guassian prior of means % in volume x^d there is k points: x^d=k 17 | 18 | 19 | w = dirichletRnd(alpha0,ones(1,k)/k); 20 | z = discreteRnd(w,n); 21 | 22 | mu = zeros(d,k); 23 | Sigma = zeros(d,d,k); 24 | X = zeros(d,n); 25 | for i = 1:k 26 | idx = z==i; 27 | Sigma(:,:,i) = iwishrnd(W0,v0); % invpd(wishrnd(W0,v0)); 28 | mu(:,i) = gaussRnd(mu0,beta0*Sigma(:,:,i)); 29 | X(:,idx) = gaussRnd(mu(:,i),Sigma(:,:,i),sum(idx)); 30 | end 31 | model.mu = mu; 32 | model.Sigma = Sigma; 33 | model.weight = w; -------------------------------------------------------------------------------- /chapter10/mixGaussVbPred.m: -------------------------------------------------------------------------------- 1 | function [z, R] = mixGaussVbPred(model, X) 2 | % Predict label and responsibility for Gaussian mixture model trained by VB. 3 | % Input: 4 | % X: d x n data matrix 5 | % model: trained model structure outputed by the EM algirthm 6 | % Output: 7 | % label: 1 x n cluster label 8 | % R: k x n responsibility 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | alpha = model.alpha; % Dirichlet 11 | kappa = model.kappa; % Gaussian 12 | m = model.m; % Gasusian 13 | v = model.v; % Whishart 14 | U = model.U; % Whishart 15 | logW = model.logW; 16 | n = size(X,2); 17 | [d,k] = size(m); 18 | 19 | EQ = zeros(n,k); 20 | for i = 1:k 21 | Q = (U(:,:,i)'\bsxfun(@minus,X,m(:,i))); 22 | EQ(:,i) = d/kappa(i)+v(i)*dot(Q,Q,1); % 10.64 23 | end 24 | ElogLambda = sum(psi(0,0.5*bsxfun(@minus,v+1,(1:d)')),1)+d*log(2)+logW; % 10.65 25 | Elogpi = psi(0,alpha)-psi(0,sum(alpha)); % 10.66 26 | logRho = -0.5*bsxfun(@minus,EQ,ElogLambda-d*log(2*pi)); % 10.46 27 | logRho = bsxfun(@plus,logRho,Elogpi); % 10.46 28 | logR = bsxfun(@minus,logRho,logsumexp(logRho,2)); % 10.49 29 | R = exp(logR); 30 | z = zeros(1,n); 31 | [~,z(:)] = max(R,[],2); 32 | [~,~,z(:)] = unique(z); 33 | 34 | -------------------------------------------------------------------------------- /chapter09/mixBernEm.m: -------------------------------------------------------------------------------- 1 | function [label, model, llh] = mixBernEm(X, k) 2 | % Perform EM algorithm for fitting the Bernoulli mixture model. 3 | % Input: 4 | % X: d x n binary (0/1) data matrix 5 | % k: number of cluster 6 | % Output: 7 | % label: 1 x n cluster label 8 | % model: trained model structure 9 | % llh: loglikelihood 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | %% initialization 12 | fprintf('EM for mixture model: running ... \n'); 13 | X = sparse(X); 14 | n = size(X,2); 15 | label = ceil(k*rand(1,n)); % random initialization 16 | R = full(sparse(1:n,label,1)); 17 | tol = 1e-8; 18 | maxiter = 500; 19 | llh = -inf(1,maxiter); 20 | for iter = 2:maxiter 21 | model = maximization(X,R); 22 | [R, llh(iter)] = expectation(X,model); 23 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end; 24 | end 25 | [~,label(:)] = max(R,[],2); 26 | llh = llh(2:iter); 27 | 28 | function [R, llh] = expectation(X, model) 29 | mu = model.mu; 30 | w = model.w; 31 | R = X'*log(mu)+(1-X)'*log(1-mu)+log(w); 32 | T = logsumexp(R,2); 33 | llh = mean(T); % loglikelihood 34 | R = exp(R-T); 35 | 36 | function model = maximization(X, R) 37 | nk = sum(R,1); 38 | w = nk/sum(nk); 39 | mu = (X*R)./nk; 40 | model.mu = mu; 41 | model.w = w; -------------------------------------------------------------------------------- /chapter08/MRF/mrfMf.m: -------------------------------------------------------------------------------- 1 | function [nodeBel, edgeBel, L] = mrfMf(A, nodePot, edgePot, epoch) 2 | % Mean field for MRF 3 | % Assuming egdePot is symmetric 4 | % Input: 5 | % A: n x n adjacent matrix of undirected graph, where value is edge index 6 | % nodePot: k x n node potential 7 | % edgePot: k x k x m edge potential 8 | % Output: 9 | % nodeBel: k x n node belief 10 | % edgeBel: k x k x m edge belief 11 | % Written by Mo Chen (sth4nth@gmail.com) 12 | if nargin < 4 13 | epoch = 10; 14 | end 15 | L = -inf(1,epoch+1); 16 | [nodeBel,lnZ] = softmax(nodePot,1); % initialization 17 | for iter = 1:epoch 18 | for i = 1:size(nodePot,2) 19 | [~,j,e] = find(A(i,:)); % neighbors 20 | [nodeBel(:,i),lnZ(i)] = softmax(nodePot(:,i)+reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1)); 21 | end 22 | % E = dot(nodeBel,nodePot,1); 23 | % H = -dot(nodeBel,log(nodeBel),1); 24 | % L(iter+1) = sum(lnZ+E+H)/2; 25 | L(iter+1) = mrfGibbs(A,nodePot,edgePot,nodeBel); 26 | % if abs(L(iter+1)-L(iter))/abs(L(iter)) < tol; break; end 27 | end 28 | L = L(1,2:iter+1); 29 | 30 | [s,t,e] = find(triu(A)); 31 | edgeBel = zeros(size(edgePot)); 32 | for l = 1:numel(e) 33 | edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))'; 34 | end -------------------------------------------------------------------------------- /chapter13/HMM/hmmSmoother.m: -------------------------------------------------------------------------------- 1 | function [gamma, alpha, beta, c] = hmmSmoother(model, x) 2 | % HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm). 3 | % The alpha and beta returned by this function are the normalized version. 4 | % Input: 5 | % x: 1 x n integer vector which is the sequence of observations 6 | % model: model structure which contains 7 | % model.s: k x 1 start probability vector 8 | % model.A: k x k transition matrix 9 | % model.E: k x d emission matrix 10 | % Output: 11 | % gamma: k x n matrix of posterior gamma(t)=p(z_t,x_{1:T}) 12 | % alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:T}) 13 | % beta: k x n matrix of posterior beta(t)=gamma(t)/alpha(t) 14 | % c: 1 x n normalization constant vector 15 | % Written by Mo Chen (sth4nth@gmail.com). 16 | s = model.s; 17 | A = model.A; 18 | E = model.E; 19 | 20 | n = size(x,2); 21 | X = sparse(x,1:n,1); 22 | M = E*X; 23 | 24 | [K,T] = size(M); 25 | At = A'; 26 | c = zeros(1,T); % normalization constant 27 | alpha = zeros(K,T); 28 | [alpha(:,1),c(1)] = normalize(s.*M(:,1),1); 29 | for t = 2:T 30 | [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1); % 13.59 31 | end 32 | beta = ones(K,T); 33 | for t = T-1:-1:1 34 | beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1); % 13.62 35 | end 36 | gamma = alpha.*beta; % 13.64 37 | 38 | -------------------------------------------------------------------------------- /demo/ch08/mrf_demo.m: -------------------------------------------------------------------------------- 1 | % Done! 2 | clear; close all; 3 | % load letterA.mat; 4 | % X = A; 5 | load letterX.mat 6 | %% Original image 7 | img = double(X); 8 | img = sign(img-mean(img(:))); 9 | 10 | figure; 11 | subplot(2,2,1); 12 | imagesc(img); 13 | title('Original image'); 14 | axis image; 15 | colormap gray; 16 | %% Noisy image 17 | sigma = 1; % noise level 18 | x = img + sigma*randn(size(img)); % noisy signal 19 | subplot(2,2,2); 20 | imagesc(x); 21 | title('Noisy image'); 22 | axis image; 23 | colormap gray; 24 | %% Construct MRF data 25 | epoch = 20; 26 | J = 1; % ising parameter 27 | [A,nodePot,edgePot] = mrfIsGa(x,sigma,J); 28 | %% Mean Field 29 | [nodeBel0,edgeBel0,lnZ0] = mrfMf(A,nodePot,edgePot,epoch); 30 | 31 | L0 = mrfGibbs(A,nodePot,edgePot,nodeBel0); 32 | L1 = mrfBethe(A,nodePot,edgePot,nodeBel0,edgeBel0); 33 | maxdiff(L0,lnZ0(end)) 34 | maxdiff(L0,L1) 35 | 36 | subplot(2,2,3); 37 | imagesc(reshape(nodeBel0(1,:),size(img))); 38 | title('Mean Field'); 39 | axis image; 40 | colormap gray; 41 | %% Belief Propagation 42 | [nodeBel1,edgeBel1,lnZ1] = mrfBp(A,nodePot,edgePot,epoch); 43 | 44 | subplot(2,2,4); 45 | imagesc(reshape(nodeBel1(1,:),size(img))); 46 | title('Belief Propagation'); 47 | axis image; 48 | colormap gray; 49 | %% Energy comparation 50 | figure 51 | epochs = 1:epoch; 52 | plot( epochs,lnZ0,'-', ... 53 | epochs,lnZ1,'-'); 54 | xlabel('epoch'); % add axis labels and plot title 55 | ylabel('energy'); 56 | title('Energy Comparation'); 57 | legend('MF','BP'); -------------------------------------------------------------------------------- /chapter02/logSt.m: -------------------------------------------------------------------------------- 1 | function y = logSt(X, mu, sigma, v) 2 | % Compute log pdf of a Student's t distribution. 3 | % Input: 4 | % X: d x n data matrix 5 | % mu: mean 6 | % sigma: variance 7 | % v: degree of freedom 8 | % Output: 9 | % y: probability density in logrithm scale y=log p(x) 10 | % Written by mo Chen (sth4nth@gmail.com). 11 | [d,k] = size(mu); 12 | 13 | if size(sigma,1)==d && size(sigma,2)==d && k==1 14 | [R,p]= chol(sigma); 15 | if p ~= 0 16 | error('ERROR: sigma is not SPD.'); 17 | end 18 | X = bsxfun(@minus,X,mu); 19 | Q = R'\X; 20 | q = dot(Q,Q,1); % quadratic term (M distance) 21 | o = -log(1+q/v)*((v+d)/2); 22 | c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(v*pi)+2*sum(log(diag(R))))/2; 23 | y = c+o; 24 | elseif size(sigma,1)==d && size(sigma,2)==k 25 | lambda = 1./sigma; 26 | ml = mu.*lambda; 27 | q = bsxfun(@plus,X'.^2*lambda-2*X'*ml,dot(mu,ml,1)); % M distance 28 | o = bsxfun(@times,log(1+bsxfun(@times,q,1./v)),-(v+d)/2); 29 | c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(pi*v)+sum(log(sigma),1))/2; 30 | y = bsxfun(@plus,o,c); 31 | elseif size(sigma,1)==1 && size(sigma,2)==k 32 | X2 = repmat(dot(X,X,1)',1,k); 33 | D = bsxfun(@plus,X2-2*X'*mu,dot(mu,mu,1)); 34 | q = bsxfun(@times,D,1./sigma); % M distance 35 | o = bsxfun(@times,log(1+bsxfun(@times,q,1./v)),-(v+d)/2); 36 | c = gammaln((v+d)/2)-gammaln(v/2)-d*log(pi*v.*sigma)/2; 37 | y = bsxfun(@plus,o,c); 38 | else 39 | error('Parameters are mismatched.'); 40 | end 41 | -------------------------------------------------------------------------------- /demo/ch06/knLin_demo.m: -------------------------------------------------------------------------------- 1 | %% Kernel regression with linear kernel is EQUIVALENT to linear regression 2 | clear; close all; 3 | n = 100; 4 | x = linspace(0,2*pi,n); % test data 5 | t = sin(x)+rand(1,n)/2; 6 | 7 | lambda = 1e-4; 8 | model_kn = knReg(x,t,lambda,@knLin); 9 | model_lin = linReg(x,t,lambda); 10 | 11 | idx = 1:2:n; 12 | xt = x(:,idx); 13 | tt = t(idx); 14 | 15 | [y_kn, sigma_kn,p_kn] = knRegPred(model_kn,xt,tt); 16 | [y_lin, sigma_lin,p_lin] = linRegPred(model_lin,xt,tt); 17 | 18 | maxdiff(y_kn,y_lin) 19 | maxdiff(sigma_kn,sigma_lin) 20 | maxdiff(p_kn,p_lin) 21 | %% Kernel kmeans with linear kernel is EQUIVALENT to kmeans 22 | clear; close all; 23 | d = 2; 24 | k = 3; 25 | n = 500; 26 | [X,y] = kmeansRnd(d,k,n); 27 | init = ceil(k*rand(1,n)); 28 | [y_kn,model_kn,en_kn] = knKmeans(X,init,@knLin); 29 | [y_lin,model_lin,en_lin] = kmeans(X,init); 30 | 31 | idx = 1:2:n; 32 | Xt = X(:,idx); 33 | 34 | [t_kn,ent_kn] = knKmeansPred(model_kn, Xt); 35 | [t_lin,ent_lin] = kmeansPred(model_lin, Xt); 36 | 37 | maxdiff(y_kn,y_lin) 38 | maxdiff(en_kn,en_lin) 39 | 40 | maxdiff(t_kn,t_lin) 41 | maxdiff(ent_kn,ent_lin) 42 | %% Kernel PCA with linear kernel is EQUIVALENT TO PCA 43 | clear; close all; 44 | d = 10; 45 | q = 2; 46 | n = 500; 47 | X = randn(d,n); 48 | 49 | 50 | model_kn = knPca(X,q,@knLin); 51 | idx = 1:2:n; 52 | Xt = X(:,idx); 53 | 54 | Y_kn = knPcaPred(model_kn,Xt); 55 | 56 | [U,L,mu,mse] = pca(X,q); 57 | Y_lin = U'*bsxfun(@minus,Xt,mu); % projection 58 | 59 | 60 | R = Y_lin/Y_kn; % the results are equivalent up to a rotation. 61 | maxdiff(R*R', eye(q)) 62 | -------------------------------------------------------------------------------- /chapter03/linRegFp.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = linRegFp(X, t, alpha, beta) 2 | % Fit empirical Bayesian linear model with Mackay fixed point method (p.168) 3 | % Input: 4 | % X: d x n data 5 | % t: 1 x n response 6 | % alpha: prior parameter 7 | % beta: prior parameter 8 | % Output: 9 | % model: trained model structure 10 | % llh: loglikelihood 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | if nargin < 3 13 | alpha = 0.02; 14 | beta = 0.5; 15 | end 16 | [d,n] = size(X); 17 | 18 | xbar = mean(X,2); 19 | tbar = mean(t,2); 20 | 21 | X = bsxfun(@minus,X,xbar); 22 | t = bsxfun(@minus,t,tbar); 23 | 24 | XX = X*X'; 25 | Xt = X*t'; 26 | 27 | 28 | tol = 1e-4; 29 | maxiter = 200; 30 | llh = -inf(1,maxiter); 31 | for iter = 2:maxiter 32 | A = beta*XX+diag(alpha); % 3.81 3.54 33 | U = chol(A); 34 | 35 | m = beta*(U\(U'\Xt)); % 3.84 36 | m2 = dot(m,m); 37 | e = sum((t-m'*X).^2); 38 | 39 | logdetA = 2*sum(log(diag(U))); 40 | llh(iter) = 0.5*(d*log(alpha)+n*log(beta)-alpha*m2-beta*e-logdetA-n*log(2*pi)); % 3.86 41 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end 42 | 43 | V = inv(U); % A=inv(S) 44 | trS = dot(V(:),V(:)); 45 | gamma = d-alpha*trS; % 3.91 9.64 46 | alpha = gamma/m2; % 3.92 47 | beta = (n-gamma)/e; % 3.95 48 | 49 | end 50 | w0 = tbar-dot(m,xbar); 51 | 52 | llh = llh(2:iter); 53 | model.w0 = w0; 54 | model.w = m; 55 | %% optional for bayesian probabilistic prediction purpose 56 | model.alpha = alpha; 57 | model.beta = beta; 58 | model.xbar = xbar; 59 | model.U = U; -------------------------------------------------------------------------------- /chapter14/mixLogitBin.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = mixLogitBin(X, t, k) 2 | % Mixture of logistic regression model for binary classification optimized by Newton-Raphson method 3 | % Input: 4 | % X: d x n data matrix 5 | % t: 1 x n label (0/1) 6 | % k: number of mixture component 7 | % Output: 8 | % model: trained model structure 9 | % llh: loglikelihood 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | n = size(X,2); 12 | X = [X; ones(1,n)]; 13 | d = size(X,1); 14 | z = ceil(k*rand(1,n)); 15 | R = full(sparse(1:n,z,1,n,k,n)); % n x k 16 | 17 | W = zeros(d,k); 18 | tol = 1e-4; 19 | maxiter = 100; 20 | llh = -inf(1,maxiter); 21 | 22 | t = t(:); 23 | h = ones(n,1); 24 | h(t==0) = -1; 25 | A = X'*W; 26 | for iter = 2:maxiter 27 | % maximization 28 | nk = sum(R,1); 29 | alpha = nk/n; 30 | Y = sigmoid(A); 31 | for j = 1:k 32 | W(:,j) = newtonStep(X, t, Y(:,j), W(:,j), R(:,j)); 33 | end 34 | % expectation 35 | A = X'*W; 36 | logRho = -log1pexp(-bsxfun(@times,A,h)); 37 | logRho = bsxfun(@plus,logRho,log(alpha)); 38 | T = logsumexp(logRho,2); 39 | llh(iter) = sum(T)/n; % loglikelihood 40 | logR = bsxfun(@minus,logRho,T); 41 | R = exp(logR); 42 | 43 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end 44 | end 45 | llh = llh(2:iter); 46 | model.alpha = alpha; % mixing coefficient 47 | model.W = W; % logistic model coefficent 48 | 49 | 50 | function w = newtonStep(X, t, y, w, r) 51 | lambda = 1e-6; 52 | v = y.*(1-y).*r; 53 | H = bsxfun(@times,X,v')*X'+lambda*eye(size(X,1)); 54 | s = (y-t).*r; 55 | g = X*s; 56 | w = w-H\g; 57 | 58 | -------------------------------------------------------------------------------- /chapter09/linRegEm.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = linRegEm(X, t, alpha, beta) 2 | % Fit empirical Bayesian linear regression model with EM (p.448 chapter 9.3.4) 3 | % Input: 4 | % X: d x n data 5 | % t: 1 x n response 6 | % alpha: prior parameter 7 | % beta: prior parameter 8 | % Output: 9 | % model: trained model structure 10 | % llh: loglikelihood 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | if nargin < 3 13 | alpha = 0.02; 14 | beta = 0.5; 15 | end 16 | [d,n] = size(X); 17 | I = eye(d); 18 | xbar = mean(X,2); 19 | tbar = mean(t,2); 20 | 21 | X = bsxfun(@minus,X,xbar); 22 | t = bsxfun(@minus,t,tbar); 23 | 24 | XX = X*X'; 25 | Xt = X*t'; 26 | 27 | tol = 1e-4; 28 | maxiter = 100; 29 | llh = -inf(1,maxiter+1); 30 | for iter = 2:maxiter 31 | A = beta*XX+alpha*eye(d); 32 | U = chol(A); 33 | 34 | m = beta*(U\(U'\Xt)); 35 | m2 = dot(m,m); 36 | e2 = sum((t-m'*X).^2); 37 | 38 | logdetA = 2*sum(log(diag(U))); 39 | llh(iter) = 0.5*(d*log(alpha)+n*log(beta)-alpha*m2-beta*e2-logdetA-n*log(2*pi)); % 3.86 40 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end 41 | 42 | invU = U'\I; 43 | trS = dot(invU(:),invU(:)); % A=inv(S) 44 | alpha = d/(m2+trS); % 9.63 45 | 46 | invUX = U'\X; 47 | trXSX = dot(invUX(:),invUX(:)); 48 | beta = n/(e2+trXSX); % 9.68 is wrong 49 | end 50 | w0 = tbar-dot(m,xbar); 51 | 52 | llh = llh(2:iter); 53 | model.w0 = w0; 54 | model.w = m; 55 | %% optional for bayesian probabilistic inference purpose 56 | model.alpha = alpha; 57 | model.beta = beta; 58 | model.xbar = xbar; 59 | model.U = U; 60 | -------------------------------------------------------------------------------- /chapter14/mixLinReg.m: -------------------------------------------------------------------------------- 1 | function [label, model, llh] = mixLinReg(X, y, k, lambda) 2 | % Mixture of linear regression 3 | % input: 4 | % X: d x n data matrix 5 | % y: 1 x n responding vector 6 | % k: number of mixture component 7 | % lambda: regularization parameter 8 | % output: 9 | % label: 1 x n cluster label 10 | % model: trained model structure 11 | % llh: loglikelihood 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | if nargin < 4 14 | lambda = 1; 15 | end 16 | n = size(X,2); 17 | X = [X;ones(1,n)]; % adding the bias term 18 | d = size(X,1); 19 | label = ceil(k*rand(1,n)); % random initialization 20 | R = full(sparse(label,1:n,1,k,n,n)); 21 | tol = 1e-6; 22 | maxiter = 500; 23 | llh = -inf(1,maxiter); 24 | Lambda = lambda*eye(d); 25 | W = zeros(d,k); 26 | Xy = bsxfun(@times,X,y); 27 | beta = 1; 28 | for iter = 2:maxiter 29 | % maximization 30 | nk = sum(R,2); 31 | alpha = nk/n; 32 | for j = 1:k 33 | Xw = bsxfun(@times,X,sqrt(R(j,:))); 34 | U = chol(Xw*Xw'+Lambda); 35 | W(:,j) = U\(U'\(Xy*R(j,:)')); % 3.15 & 3.28 36 | end 37 | D = bsxfun(@minus,W'*X,y).^2; 38 | % expectation 39 | logRho = (-0.5)*beta*D; 40 | logRho = bsxfun(@plus,logRho,log(alpha)); 41 | T = logsumexp(logRho,1); 42 | logR = bsxfun(@minus,logRho,T); 43 | R = exp(logR); 44 | llh(iter) = sum(T)/n; 45 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end 46 | end 47 | llh = llh(2:iter); 48 | model.alpha = alpha; % mixing coefficient 49 | model.beta = beta; % mixture component precision 50 | model.W = W; % linear model coefficent 51 | [~,label] = max(R,[],1); 52 | model.label = label; 53 | -------------------------------------------------------------------------------- /chapter11/mixDpGb.m: -------------------------------------------------------------------------------- 1 | function [label, Theta, w, llh] = mixDpGb(X, alpha, theta) 2 | % Collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 3 | % Any component model can be used, such as Gaussian. 4 | % Input: 5 | % X: d x n data matrix 6 | % alpha: parameter for Dirichlet process prior 7 | % theta: class object for prior of component distribution (such as Gauss) 8 | % Output: 9 | % label: 1 x n cluster label 10 | % Theta: 1 x k structure of trained components 11 | % w: 1 x k component weight vector 12 | % llh: loglikelihood 13 | % Written by Mo Chen (sth4nth@gmail.com). 14 | n = size(X,2); 15 | [label,Theta,w] = mixDpGbOl(X,alpha,theta); 16 | nk = n*w; 17 | maxIter = 50; 18 | llh = zeros(1,maxIter); 19 | for iter = 1:maxIter 20 | for i = randperm(n) 21 | x = X(:,i); 22 | k = label(i); 23 | Theta{k} = Theta{k}.delSample(x); 24 | nk(k) = nk(k)-1; 25 | if nk(k) == 0 % remove empty cluster 26 | Theta(k) = []; 27 | nk(k) = []; 28 | which = label>k; 29 | label(which) = label(which)-1; 30 | end 31 | Pk = log(nk)+cellfun(@(t) t.logPredPdf(x), Theta); 32 | P0 = log(alpha)+theta.logPredPdf(x); 33 | p = [Pk,P0]; 34 | llh(iter) = llh(iter)+sum(p-log(n)); 35 | k = discreteRnd(exp(p-logsumexp(p))); 36 | if k == numel(Theta)+1 % add extra cluster 37 | Theta{k} = theta.clone().addSample(x); 38 | nk = [nk,1]; 39 | else 40 | Theta{k} = Theta{k}.addSample(x); 41 | nk(k) = nk(k)+1; 42 | end 43 | label(i) = k; 44 | end 45 | end 46 | w = nk/n; 47 | 48 | -------------------------------------------------------------------------------- /demo/ch13/lds_demo.m: -------------------------------------------------------------------------------- 1 | close all; 2 | % Parameter 3 | clear; 4 | d = 2; 5 | k = 3; 6 | n = 100; 7 | 8 | A = [1,0,1; 9 | 0 1,0; 10 | 0,0,1]; 11 | G = eye(k)*1e-3; 12 | 13 | C = [1,0,0; 14 | 0 1,0]; 15 | S = eye(d)*1e-1; 16 | 17 | mu0 = [0;0;0]; 18 | P0 = eye(k); 19 | 20 | model.A = A; 21 | model.G = G; 22 | model.C = C; 23 | model.S = S; 24 | model.mu0 = mu0; 25 | model.P0 = P0; 26 | 27 | %% Generate data 28 | [z,x] = ldsRnd(model,n); 29 | figure; 30 | hold on 31 | plot(x(1,:), x(2,:), 'ro'); 32 | plot(z(1,:), z(2,:), 'b*-'); 33 | legend('observed', 'latent') 34 | title('Generated Data') 35 | axis equal 36 | hold off 37 | %% Kalman filter 38 | [mu, V, llh] = kalmanFilter(model,x); 39 | figure 40 | hold on 41 | plot(x(1,:), x(2,:), 'ro'); 42 | plot(mu(1,:), mu(2,:), 'b*-'); 43 | legend('observed', 'filtered') 44 | title('Kalman filter') 45 | axis equal 46 | hold off 47 | %% Kalman smoother 48 | [nu, U, llh] = kalmanSmoother(model,x); 49 | figure 50 | hold on 51 | plot(x(1,:), x(2,:), 'ro'); 52 | plot(nu(1,:), nu(2,:), 'b*-'); 53 | legend('observed', 'smoothed') 54 | title('Kalman smoother') 55 | axis equal 56 | hold off 57 | %% LDS Subspace 58 | [A,C,nu] = ldsPca(x,k,3*k); 59 | y = C*nu; 60 | t = size(y,2); 61 | figure; 62 | hold on 63 | plot(x(1,1:t), x(2,1:t), 'ro'); 64 | plot(y(1,1:t), y(2,1:t), 'b*-'); 65 | legend('observed', 'projected') 66 | title('LDS subspace learning') 67 | axis equal 68 | hold off 69 | %% LDS EM 70 | [tmodel, llh] = ldsEm(x,k); 71 | nu = kalmanSmoother(tmodel,x); 72 | y = tmodel.C*nu; 73 | figure 74 | hold on 75 | plot(x(1,:), x(2,:), 'ro'); 76 | plot(y(1,:), y(2,:), 'b*-'); 77 | legend('observed', 'learned') 78 | title('LDS EM learning') 79 | axis equal 80 | hold off 81 | figure; 82 | plot(llh); 83 | -------------------------------------------------------------------------------- /chapter12/fa.m: -------------------------------------------------------------------------------- 1 | function [W, mu, psi, llh] = fa(X, m) 2 | % Perform EM algorithm for factor analysis model 3 | % Input: 4 | % X: d x n data matrix 5 | % m: dimension of target space 6 | % Output: 7 | % W: d x m weight matrix 8 | % mu: d x 1 mean vector 9 | % psi: d x 1 variance vector 10 | % llh: loglikelihood 11 | % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | [d,n] = size(X); 14 | mu = mean(X,2); 15 | X = bsxfun(@minus,X,mu); 16 | 17 | tol = 1e-4; 18 | maxiter = 500; 19 | llh = -inf(1,maxiter); 20 | 21 | I = eye(m); 22 | r = dot(X,X,2); 23 | 24 | W = randn(d,m); 25 | lambda = 1./rand(d,1); 26 | for iter = 2:maxiter 27 | T = bsxfun(@times,W,sqrt(lambda)); 28 | M = T'*T+I; % M = W'*inv(Psi)*W+I 29 | U = chol(M); 30 | WInvPsiX = bsxfun(@times,W,lambda)'*X; % WInvPsiX = W'*inv(Psi)*X 31 | 32 | % likelihood 33 | logdetC = 2*sum(log(diag(U)))-sum(log(lambda)); % log(det(C)) 34 | Q = U'\WInvPsiX; 35 | trInvCS = (r'*lambda-dot(Q(:),Q(:)))/n; % trace(inv(C)*S) 36 | llh(iter) = -n*(d*log(2*pi)+logdetC+trInvCS)/2; 37 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end % check likelihood for convergence 38 | 39 | % E step 40 | Ez = M\WInvPsiX; % 12.66 41 | V = inv(U); 42 | Ezz = n*(V*V')+Ez*Ez'; % 12.67 43 | 44 | % M step 45 | U = chol(Ezz); 46 | XEz = X*Ez'; 47 | W = (XEz/U)/U'; % 12.69 48 | lambda = n./(r-dot(W,XEz,2)); % 12.70 49 | end 50 | llh = llh(2:iter); 51 | psi = 1./lambda; -------------------------------------------------------------------------------- /chapter05/mlpReg.m: -------------------------------------------------------------------------------- 1 | function [model, L] = mlpReg(X, y, k, lambda) 2 | % Train a multilayer perceptron neural network for regression with backpropagation 3 | % tanh activation function is used 4 | % Input: 5 | % X: d x n data matrix 6 | % y: 1 x n real value response vector 7 | % k: T x 1 vector to specify number of hidden nodes in each layer 8 | % lambda: regularization parameter 9 | % Ouput: 10 | % model: model structure 11 | % L: (regularized least square) loss 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | if nargin < 4 14 | lambda = 1e-2; 15 | end 16 | eta = 1e-5; 17 | tol = 1e-5; 18 | maxiter = 50000; 19 | L = inf(1,maxiter); 20 | 21 | k = [size(X,1);k(:);size(y,1)]; 22 | T = numel(k)-1; 23 | W = cell(T,1); 24 | b = cell(T,1); 25 | for t = 1:T 26 | W{t} = randn(k(t),k(t+1)); 27 | b{t} = randn(k(t+1),1); 28 | end 29 | R = cell(T,1); 30 | Z = cell(T+1,1); 31 | Z{1} = X; 32 | for iter = 2:maxiter 33 | % forward 34 | for t = 1:T-1 35 | Z{t+1} = tanh(W{t}'*Z{t}+b{t}); % 5.10 5.113 36 | end 37 | Z{T+1} = W{T}'*Z{T}+b{T}; % 5.114 38 | 39 | % loss 40 | E = Z{T+1}-y; 41 | Wn = cellfun(@(x) dot(x(:),x(:)),W); % |W|^2 42 | L(iter) = dot(E(:),E(:))+lambda*sum(Wn); 43 | if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end 44 | 45 | % backward 46 | R{T} = E; 47 | for t = T-1:-1:1 48 | df = 1-Z{t+1}.^2; % h'(a) 49 | R{t} = df.*(W{t+1}*R{t+1}); % 5.66 50 | end 51 | 52 | % gradient descent 53 | for t=1:T 54 | dW = Z{t}*R{t}'+lambda*W{t}; % 5.67 55 | db = sum(R{t},2); 56 | W{t} = W{t}-eta*dW; % 5.43 57 | b{t} = b{t}-eta*db; 58 | end 59 | end 60 | L = L(2:iter); 61 | model.W = W; 62 | model.b = b; 63 | -------------------------------------------------------------------------------- /chapter12/ppcaEm.m: -------------------------------------------------------------------------------- 1 | function [W, mu, beta, llh] = ppcaEm(X, m) 2 | % Perform EM algorithm to maiximize likelihood of probabilistic PCA model. 3 | % Input: 4 | % X: d x n data matrix 5 | % m: dimension of target space 6 | % Output: 7 | % W: d x m weight matrix 8 | % mu: d x 1 mean vector 9 | % beta: precition vector (inverse of variance 10 | % llh: loglikelihood 11 | % Reference: 12 | % Pattern Recognition and Machine Learning by Christopher M. Bishop 13 | % Probabilistic Principal Component Analysis by Michael E. Tipping & Christopher M. Bishop 14 | % Written by Mo Chen (sth4nth@gmail.com). 15 | [d,n] = size(X); 16 | mu = mean(X,2); 17 | X = bsxfun(@minus,X,mu); 18 | 19 | tol = 1e-4; 20 | maxiter = 500; 21 | llh = -inf(1,maxiter); 22 | I = eye(m); 23 | r = dot(X(:),X(:)); % total norm of X 24 | W = randn(d,m); 25 | s = 1/randg; 26 | for iter = 2:maxiter 27 | M = W'*W+s*I; 28 | U = chol(M); 29 | WX = W'*X; 30 | 31 | % likelihood 32 | logdetC = 2*sum(log(diag(U)))+(d-m)*log(s); 33 | T = U'\WX; 34 | trInvCS = (r-dot(T(:),T(:)))/(s*n); 35 | llh(iter) = -n*(d*log(2*pi)+logdetC+trInvCS)/2; % 12.43 12.44 36 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end % check likelihood for convergence 37 | 38 | % E step 39 | Ez = M\WX; % 12.54 40 | V = inv(U); % inv(M) = V*V' 41 | Ezz = n*s*(V*V')+Ez*Ez'; % n*s because we are dealing with all n E[zi*zi'] % 12. 55 42 | 43 | % M step 44 | U = chol(Ezz); 45 | W = ((X*Ez')/U)/U'; % 12.56 46 | WR = W*U'; 47 | s = (r-2*dot(Ez(:),WX(:))+dot(WR(:),WR(:)))/(n*d); % 12.57 48 | end 49 | llh = llh(2:iter); 50 | beta = 1/s; -------------------------------------------------------------------------------- /chapter11/Gauss.m: -------------------------------------------------------------------------------- 1 | % Class for Gaussian distribution used by Dirichlet process 2 | classdef Gauss 3 | properties 4 | n_ 5 | mu_ 6 | U_ 7 | end 8 | 9 | methods 10 | function obj = Gauss(X) 11 | n = size(X,2); 12 | mu = mean(X,2); 13 | U = chol(X*X'); 14 | 15 | obj.n_ = n; 16 | obj.mu_ = mu; 17 | obj.U_ = U; 18 | end 19 | 20 | function obj = clone(obj) 21 | end 22 | 23 | function obj = addSample(obj, x) 24 | n = obj.n_; 25 | mu = obj.mu_; 26 | U = obj.U_; 27 | 28 | n = n+1; 29 | mu = mu+(x-mu)/n; 30 | U = cholupdate(U,x,'+'); 31 | 32 | obj.n_ = n; 33 | obj.mu_ = mu; 34 | obj.U_ = U; 35 | end 36 | 37 | function obj = delSample(obj, x) 38 | n = obj.n_; 39 | mu = obj.mu_; 40 | U = obj.U_; 41 | 42 | n = n-1; 43 | mu = mu-(x-mu)/n; 44 | U = cholupdate(U,x,'-'); 45 | 46 | obj.n_ = n; 47 | obj.mu_ = mu; 48 | obj.U_ = U; 49 | end 50 | 51 | function y = logPdf(obj,X) 52 | n = obj.n_; 53 | mu = obj.mu_; 54 | U = obj.U_; 55 | d = size(X,1); 56 | 57 | U = cholupdate(U/sqrt(n),mu,'-'); % Sigma=X*X'/n-mu*mu' 58 | Q = U'\bsxfun(@minus,X,mu); 59 | q = dot(Q,Q,1); % quadratic term (M distance) 60 | c = d*log(2*pi)+2*sum(log(diag(U))); % normalization constant 61 | y = -0.5*(c+q); 62 | end 63 | end 64 | end -------------------------------------------------------------------------------- /chapter09/rvmRegEm.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = rvmRegEm(X, t, alpha, beta) 2 | % Relevance Vector Machine (ARD sparse prior) for regression 3 | % trained by empirical bayesian (type II ML) using EM 4 | % Input: 5 | % X: d x n data 6 | % t: 1 x n response 7 | % alpha: prior parameter 8 | % beta: prior parameter 9 | % Output: 10 | % model: trained model structure 11 | % llh: loglikelihood 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | if nargin < 3 14 | alpha = 0.02; 15 | beta = 0.5; 16 | end 17 | [d,n] = size(X); 18 | xbar = mean(X,2); 19 | tbar = mean(t,2); 20 | X = bsxfun(@minus,X,xbar); 21 | t = bsxfun(@minus,t,tbar); 22 | XX = X*X'; 23 | Xt = X*t'; 24 | 25 | tol = 1e-3; 26 | maxiter = 500; 27 | llh = -inf(1,maxiter+1); 28 | index = 1:d; 29 | alpha = alpha*ones(d,1); 30 | for iter = 2 : maxiter 31 | nz = 1./alpha > tol ; % nonzeros 32 | index = index(nz); 33 | alpha = alpha(nz); 34 | XX = XX(nz,nz); 35 | Xt = Xt(nz); 36 | X = X(nz,:); 37 | % E-step 38 | U = chol(beta*(XX)+diag(alpha)); % 7.83 39 | m = beta*(U\(U'\(X*t'))); % E[m] % 7.82 40 | m2 = m.^2; 41 | e2 = sum((t-m'*X).^2); 42 | 43 | logdetS = 2*sum(log(diag(U))); 44 | llh(iter) = 0.5*(sum(log(alpha))+n*log(beta)-beta*e2-logdetS-dot(alpha,m2)-n*log(2*pi)); % 3.86 45 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end 46 | % M-step 47 | V = inv(U); 48 | dgS = dot(V,V,2); 49 | alpha = 1./(m2+dgS); % 9.67 50 | UX = U'\X; 51 | trXSX = dot(UX(:),UX(:)); 52 | beta = n/(e2+trXSX); % 9.68 is wrong 53 | end 54 | llh = llh(2:iter); 55 | 56 | model.index = index; 57 | model.w0 = tbar-dot(m,xbar(nz)); 58 | model.w = m; 59 | model.alpha = alpha; 60 | model.beta = beta; 61 | %% optional for bayesian probabilistic prediction purpose 62 | model.xbar = xbar; 63 | model.U = U; -------------------------------------------------------------------------------- /chapter05/mlpClass.m: -------------------------------------------------------------------------------- 1 | function [model, L] = mlpClass(X, y, k, lambda) 2 | % Train a multilayer perceptron neural network for multiclass classification with backpropagation 3 | % logistic activation function is used. 4 | % Input: 5 | % X: d x n data matrix 6 | % y: 1 x n label vector 7 | % k: T x 1 vector to specify number of hidden nodes in each layer 8 | % lambda: regularization parameter 9 | % Ouput: 10 | % model: model structure 11 | % L: (regularized cross entropy) loss 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | if nargin < 4 14 | lambda = 1e-2; 15 | end 16 | eta = 1e-3; 17 | tol = 1e-4; 18 | maxiter = 50000; 19 | L = inf(1,maxiter); 20 | 21 | Y = sparse(y,1:numel(y),1); 22 | k = [size(X,1);k(:);size(Y,1)]; 23 | T = numel(k)-1; 24 | W = cell(T,1); 25 | b = cell(T,1); 26 | for t = 1:T 27 | W{t} = randn(k(t),k(t+1)); 28 | b{t} = randn(k(t+1),1); 29 | end 30 | R = cell(T,1); 31 | Z = cell(T+1,1); 32 | Z{1} = X; 33 | for iter = 2:maxiter 34 | % forward 35 | for t = 1:T-1 36 | Z{t+1} = sigmoid(W{t}'*Z{t}+b{t}); % 5.10 5.113 37 | end 38 | Z{T+1} = softmax(W{T}'*Z{T}+b{T}); 39 | 40 | % loss 41 | E = Z{T+1}; 42 | Wn = cellfun(@(x) dot(x(:),x(:)),W); % |W|^2 43 | L(iter) = -dot(Y(:),log(E(:)))+0.5*lambda*sum(Wn); 44 | if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end 45 | 46 | % backward 47 | R{T} = Z{T+1}-Y; 48 | for t = T-1:-1:1 49 | df = Z{t+1}.*(1-Z{t+1}); % h'(a) 50 | R{t} = df.*(W{t+1}*R{t+1}); % 5.66 51 | end 52 | 53 | % gradient descent 54 | for t=1:T 55 | dW = Z{t}*R{t}'+lambda*W{t}; % 5.67 56 | db = sum(R{t},2); 57 | W{t} = W{t}-eta*dW; % 5.43 58 | b{t} = b{t}-eta*db; 59 | end 60 | end 61 | L = L(2:iter); 62 | model.W = W; 63 | model.b = b; 64 | -------------------------------------------------------------------------------- /chapter07/rvmRegFp.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = rvmRegFp(X, t, alpha, beta) 2 | % Relevance Vector Machine (ARD sparse prior) for regression 3 | % training by empirical bayesian (type II ML) using Mackay fix point update. 4 | % Input: 5 | % X: d x n data 6 | % t: 1 x n response 7 | % alpha: prior parameter 8 | % beta: prior parameter 9 | % Output: 10 | % model: trained model structure 11 | % llh: loglikelihood 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | if nargin < 3 14 | alpha = 0.02; 15 | beta = 0.5; 16 | end 17 | [d,n] = size(X); 18 | xbar = mean(X,2); 19 | tbar = mean(t,2); 20 | X = bsxfun(@minus,X,xbar); 21 | t = bsxfun(@minus,t,tbar); 22 | XX = X*X'; 23 | Xt = X*t'; 24 | 25 | tol = 1e-3; 26 | maxiter = 500; 27 | llh = -inf(1,maxiter); 28 | index = 1:d; 29 | alpha = alpha*ones(d,1); 30 | for iter = 2:maxiter 31 | % remove zeros 32 | nz = 1./alpha > tol; % nonzeros 33 | index = index(nz); 34 | alpha = alpha(nz); 35 | XX = XX(nz,nz); 36 | Xt = Xt(nz); 37 | X = X(nz,:); 38 | 39 | U = chol(beta*XX+diag(alpha)); % 7.83 40 | m = beta*(U\(U'\Xt)); % 7.82 41 | m2 = m.^2; 42 | e = sum((t-m'*X).^2); 43 | 44 | logdetS = 2*sum(log(diag(U))); 45 | llh(iter) = 0.5*(sum(log(alpha))+n*log(beta)-beta*e-logdetS-dot(alpha,m2)-n*log(2*pi)); % 3.86 46 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end 47 | 48 | V = inv(U); 49 | dgSigma = dot(V,V,2); 50 | gamma = 1-alpha.*dgSigma; % 7.89 51 | alpha = gamma./m2; % 7.87 52 | beta = (n-sum(gamma))/e; % 7.88 53 | end 54 | llh = llh(2:iter); 55 | 56 | model.index = index; 57 | model.w0 = tbar-dot(m,xbar(nz)); 58 | model.w = m; 59 | model.alpha = alpha; 60 | model.beta = beta; 61 | %% optional for bayesian probabilistic prediction purpose 62 | model.xbar = xbar(index); 63 | model.U = U; -------------------------------------------------------------------------------- /chapter13/HMM/hmmEm.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = hmmEm(x, init) 2 | % EM algorithm to fit the parameters of HMM model (a.k.a Baum-Welch algorithm) 3 | % Input: 4 | % x: 1 x n integer vector which is the sequence of observations 5 | % init: model or k 6 | % Output:s 7 | % model: trained model structure 8 | % llh: loglikelihood 9 | % Written by Mo Chen (sth4nth@gmail.com). 10 | n = size(x,2); 11 | X = sparse(x,1:n,1); 12 | d = size(X,1); 13 | if isstruct(init) % init with a model 14 | A = init.A; 15 | E = init.E; 16 | s = init.s; 17 | elseif numel(init) == 1 % random init with latent k 18 | k = init; 19 | s = normalize(rand(k,1),1); 20 | A = normalize(rand(k,k),2); 21 | E = normalize(rand(k,d),2); 22 | end 23 | tol = 1e-4; 24 | maxIter = 1000; 25 | llh = -inf(1,maxIter); 26 | for iter = 2:maxIter 27 | M = E*X; 28 | % E-step 29 | [gamma,alpha,beta,c] = hmmSmoother(M,A,s); 30 | llh(iter) = mean(log(c)); 31 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end % check likelihood for convergence 32 | % M-step 33 | s = gamma(:,1); % 13.18 34 | A = normalize(A.*(alpha(:,1:n-1)*(beta(:,2:n).*M(:,2:n)./c(2:n))'),2); % 13.19 13.43 13.65 35 | E = (gamma*X')./sum(gamma,2); % 13.23 36 | end 37 | model.s = s; 38 | model.A = A; 39 | model.E = E; 40 | llh = llh(2:iter); 41 | 42 | function [gamma, alpha, beta, c] = hmmSmoother(M, A, s) 43 | [K,T] = size(M); 44 | At = A'; 45 | c = zeros(1,T); 46 | alpha = zeros(K,T); 47 | [alpha(:,1),c(1)] = normalize(s.*M(:,1),1); 48 | for t = 2:T 49 | [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1); % 13.59 50 | end 51 | beta = ones(K,T); 52 | for t = T-1:-1:1 53 | beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1); % 13.62 54 | end 55 | gamma = alpha.*beta; % 13.64 56 | -------------------------------------------------------------------------------- /chapter08/MRF/mrfBp.m: -------------------------------------------------------------------------------- 1 | function [nodeBel, edgeBel, L] = mrfBp(A, nodePot, edgePot, epoch) 2 | % Undirected graph belief propagation for MRF 3 | % Assuming egdePot is symmetric 4 | % Input: 5 | % A: n x n adjacent matrix of undirected graph, where value is edge index 6 | % nodePot: k x n node potential 7 | % edgePot: k x k x m edge potential 8 | % Output: 9 | % nodeBel: k x n node belief 10 | % edgeBel: k x k x m edge belief 11 | % L: variational lower bound (Bethe energy) 12 | % Written by Mo Chen (sth4nth@gmail.com) 13 | if nargin < 4 14 | epoch = 10; 15 | end 16 | expNodePot = exp(nodePot); 17 | expEdgePot = exp(edgePot); 18 | [k,n] = size(nodePot); 19 | m = size(edgePot,3); 20 | 21 | [s,t,e] = find(triu(A)); 22 | A = sparse([s;t],[t;s],[e;e+m]); % digraph adjacent matrix, where value is message index 23 | mu = ones(k,2*m)/k; % message factor to node 24 | 25 | nodeBel = zeros(k,n); 26 | edgeBel = zeros(k,k,m); 27 | L = -inf(1,epoch+1); 28 | for iter = 1:epoch 29 | for i = 1:n 30 | in = nonzeros(A(:,i)); % incoming message index 31 | nb = expNodePot(:,i).*prod(mu(:,in),2); % product of incoming message 32 | for l = in' 33 | ep = expEdgePot(:,:,ud(l,m)); 34 | mu(:,rd(l,m)) = normalize(ep*(nb./mu(:,l))); 35 | end 36 | nodeBel(:,i) = nb/sum(nb); 37 | end 38 | 39 | for l = 1:m 40 | st = e(l); 41 | nut = nodeBel(:,t(l))./mu(:,st); 42 | nus = nodeBel(:,s(l))./mu(:,st+m); 43 | eb = expEdgePot(:,:,st).*(nus*nut'); 44 | edgeBel(:,:,st) = eb./sum(eb(:)); 45 | end 46 | L(iter+1) = mrfBethe(A,nodePot,edgePot,nodeBel,edgeBel); 47 | end 48 | L = L(1,2:iter+1); 49 | 50 | function i = rd(i, m) 51 | % reverse direction edge index 52 | i = mod(i+m-1,2*m)+1; 53 | 54 | function i = ud(i, m) 55 | % undirected edge index 56 | i = mod(i-1,m)+1; -------------------------------------------------------------------------------- /chapter13/LDS/kalmanFilter.m: -------------------------------------------------------------------------------- 1 | function [mu, V, llh] = kalmanFilter(model, X) 2 | % Kalman filter (forward algorithm for linear dynamic system) 3 | % NOTE: This is the exact implementation of the Kalman filter algorithm in PRML. 4 | % However, this algorithm is not practical. It is numerical unstable. 5 | % Input: 6 | % X: d x n data matrix 7 | % model: model structure 8 | % Output: 9 | % mu: q x n matrix of latent mean mu_t=E[z_t] w.r.t p(z_t|x_{1:t}) 10 | % V: q x q x n latent covariance U_t=cov[z_t] w.r.t p(z_t|x_{1:t}) 11 | % llh: loglikelihood 12 | % Written by Mo Chen (sth4nth@gmail.com). 13 | A = model.A; % transition matrix 14 | G = model.G; % transition covariance 15 | C = model.C; % emission matrix 16 | S = model.S; % emision covariance 17 | mu0 = model.mu0; % prior mean 18 | P = model.P0; % prior covairance 19 | 20 | n = size(X,2); 21 | k = size(mu0,1); 22 | mu = zeros(k,n); 23 | V = zeros(k,k,n); 24 | llh = zeros(1,n); 25 | I = eye(k); 26 | 27 | PC = P*C'; 28 | R = C*PC+S; 29 | K = PC/R; % 13.97 30 | mu(:,1) = mu0+K*(X(:,1)-C*mu0); % 13.94 31 | V(:,:,1) = (I-K*C)*P; % 13.95 32 | llh(1) = logGauss(X(:,1),C*mu0,R); 33 | for i = 2:n 34 | [mu(:,i), V(:,:,i), llh(i)] = ... 35 | forwardUpdate(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S, I); 36 | end 37 | llh = sum(llh); 38 | 39 | function [mu, V, llh] = forwardUpdate(x, mu, V, A, G, C, S, I) 40 | P = A*V*A'+G; % 13.88 41 | PC = P*C'; 42 | R = C*PC+S; 43 | K = PC/R; % 13.92 44 | Amu = A*mu; 45 | CAmu = C*Amu; 46 | mu = Amu+K*(x-CAmu); % 13.89 47 | V = (I-K*C)*P; % 13.90 48 | llh = logGauss(x,CAmu,R); % 13.91 -------------------------------------------------------------------------------- /chapter10/rvmRegVb.m: -------------------------------------------------------------------------------- 1 | function [model, energy] = rvmRegVb(X, t, prior) 2 | % Variational Bayesian inference for RVM regression. 3 | % Input: 4 | % X: d x n data 5 | % t: 1 x n response 6 | % prior: prior parameter 7 | % Output: 8 | % model: trained model structure 9 | % energy: variational lower bound 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | if nargin < 3 12 | a0 = 1e-4; 13 | b0 = 1e-4; 14 | c0 = 1e-4; 15 | d0 = 1e-4; 16 | else 17 | a0 = prior.a; 18 | b0 = prior.b; 19 | c0 = prior.c; 20 | d0 = prior.d; 21 | end 22 | [m,n] = size(X); 23 | idx = (1:m)'; 24 | dg = sub2ind([m,m],idx,idx); 25 | I = eye(m); 26 | xbar = mean(X,2); 27 | tbar = mean(t,2); 28 | 29 | X = bsxfun(@minus,X,xbar); 30 | t = bsxfun(@minus,t,tbar); 31 | 32 | XX = X*X'; 33 | Xt = X*t'; 34 | 35 | maxiter = 100; 36 | energy = -inf(1,maxiter+1); 37 | tol = 1e-8; 38 | 39 | a = a0+1/2; 40 | c = c0+n/2; 41 | Ealpha = 1e-2; 42 | Ebeta = 1e-2; 43 | for iter = 2:maxiter 44 | % q(w) 45 | invS = Ebeta*XX; 46 | invS(dg) = invS(dg)+Ealpha; 47 | U = chol(invS); 48 | Ew = Ebeta*(U\(U'\Xt)); 49 | KLw = -sum(log(diag(U))); 50 | % q(alpha) 51 | w2 = Ew.*Ew; 52 | invU = U'\I; 53 | dgS = dot(invU,invU,2); 54 | b = b0+0.5*(w2+dgS); 55 | Ealpha = a./b; 56 | KLalpha = -sum(a*log(b)); 57 | % q(beta) 58 | e2 = sum((t-Ew'*X).^2); 59 | invUX = U'\X; 60 | trXSX = dot(invUX(:),invUX(:)); 61 | d = d0+0.5*(e2+trXSX); 62 | Ebeta = c/d; 63 | KLbeta = -c*log(d); 64 | % lower bound 65 | energy(iter) = KLalpha+KLbeta+KLw; 66 | if energy(iter)-energy(iter-1) < tol*abs(energy(iter-1)); break; end 67 | end 68 | const = m*(gammaln(a)-gammaln(a0)+a0*log(b0))+gammaln(c)-gammaln(c0)+c0*log(d0)+0.5*(m-n*log(2*pi)); 69 | energy = energy(2:iter)+const; 70 | w0 = tbar-dot(Ew,xbar); 71 | 72 | model.w0 = w0; 73 | model.w = Ew; 74 | model.alpha = Ealpha; 75 | model.beta = Ebeta; 76 | model.a = a; 77 | model.b = b; 78 | model.c = c; 79 | model.d = d; 80 | model.xbar = xbar; 81 | -------------------------------------------------------------------------------- /chapter10/linRegVb.m: -------------------------------------------------------------------------------- 1 | function [model, energy] = linRegVb(X, t, prior) 2 | % Variational Bayesian inference for linear regression. 3 | % Input: 4 | % X: d x n data 5 | % t: 1 x n response 6 | % prior: prior parameter 7 | % Output: 8 | % model: trained model structure 9 | % energy: variational lower bound 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | if nargin < 3 12 | a0 = 1e-4; 13 | b0 = 1e-4; 14 | c0 = 1e-4; 15 | d0 = 1e-4; 16 | else 17 | a0 = prior.a; 18 | b0 = prior.b; 19 | c0 = prior.c; 20 | d0 = prior.d; 21 | end 22 | [m,n] = size(X); 23 | I = eye(m); 24 | xbar = mean(X,2); 25 | tbar = mean(t,2); 26 | 27 | X = bsxfun(@minus,X,xbar); 28 | t = bsxfun(@minus,t,tbar); 29 | 30 | XX = X*X'; 31 | Xt = X*t'; 32 | 33 | maxiter = 100; 34 | energy = -inf(1,maxiter+1); 35 | tol = 1e-8; 36 | 37 | a = a0+m/2; % 10.94 38 | c = c0+n/2; 39 | Ealpha = 1e-4; 40 | Ebeta = 1e-4; 41 | for iter = 2:maxiter 42 | % q(w) 43 | invS = diag(Ealpha)+Ebeta*XX; % 10.101 44 | U = chol(invS); 45 | Ew = Ebeta*(U\(U'\Xt)); % 10.100 46 | KLw = -sum(log(diag(U))); 47 | % q(alpha) 48 | w2 = dot(Ew,Ew); 49 | invU = U'\I; 50 | trS = dot(invU(:),invU(:)); 51 | b = b0+0.5*(w2+trS); % 10.95 52 | Ealpha = a/b; % 10.102 53 | KLalpha = -a*log(b); 54 | % q(beta) 55 | e2 = sum((t-Ew'*X).^2); 56 | invUX = U'\X; 57 | trXSX = dot(invUX(:),invUX(:)); 58 | d = d0+0.5*(e2+trXSX); 59 | Ebeta = c/d; 60 | KLbeta = -c*log(d); 61 | % lower bound 62 | energy(iter) = KLalpha+KLbeta+KLw; 63 | if energy(iter)-energy(iter-1) < tol*abs(energy(iter-1)); break; end 64 | end 65 | const = gammaln(a)-gammaln(a0)+gammaln(c)-gammaln(c0)+a0*log(b0)+c0*log(d0)+0.5*(m-n*log(2*pi)); 66 | energy = energy(2:iter)+const; 67 | w0 = tbar-dot(Ew,xbar); 68 | 69 | model.w0 = w0; 70 | model.w = Ew; 71 | model.alpha = Ealpha; 72 | model.beta = Ebeta; 73 | model.a = a; 74 | model.b = b; 75 | model.c = c; 76 | model.d = d; 77 | model.xbar = xbar; 78 | -------------------------------------------------------------------------------- /chapter12/ppcaVb.m: -------------------------------------------------------------------------------- 1 | function [model, L] = ppcaVb(X, q, prior) 2 | % Perform variatioanl Bayeisan inference for probabilistic PCA model. 3 | % Input: 4 | % X: d x n data matrix 5 | % q: dimension of target space 6 | % Output: 7 | % model: trained model structure 8 | % L: variantional lower bound 9 | % Reference: 10 | % Pattern Recognition and Machine Learning by Christopher M. Bishop 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | [m,n] = size(X); 13 | if nargin < 3 14 | a0 = 1e-4; 15 | b0 = 1e-4; 16 | c0 = 1e-4; 17 | d0 = 1e-4; 18 | else 19 | a0 = prior.a; 20 | b0 = prior.b; 21 | c0 = prior.c; 22 | d0 = prior.d; 23 | end 24 | 25 | if nargin < 2 26 | q = m-1; 27 | end 28 | tol = 1e-6; 29 | maxIter = 500; 30 | L = -inf(1,maxIter); 31 | 32 | mu = mean(X,2); 33 | Xo = bsxfun(@minus, X, mu); 34 | s = dot(Xo(:),Xo(:)); 35 | I = eye(q); 36 | % init parameters 37 | a = a0+m/2; 38 | c = c0+m*n/2; 39 | Ealpha = 1e-4; 40 | Ebeta = 1e-4; 41 | EW = rand(q,m); 42 | EWo = bsxfun(@minus,EW,mean(EW,2)); 43 | EWW = EWo*EWo'/m+EW*EW'; 44 | for iter = 2:maxIter 45 | % q(z) 46 | LZ = I+Ebeta*EWW; 47 | V = inv(chol(LZ)); % inv(LZ) = V*V'; 48 | EZ = LZ\EW*Xo*Ebeta; 49 | EZZ = n*(V*V')+EZ*EZ'; 50 | KLZ = n*sum(log(diag(V))); % KLZ = 0.5*n*log(det(inv(LZ))); 51 | % q(w) 52 | LW = diag(Ealpha)+Ebeta*EZZ; 53 | V = inv(chol(LW)); % inv(LW) = V*V'; 54 | EW = LW\EZ*Xo'*Ebeta; 55 | EWW = m*(V*V')+EW*EW'; 56 | KLW = m*sum(log(diag(V))); % KLW = 0.5*n*log(det(inv(LW))); 57 | % q(alpha) 58 | b = b0+diag(EWW)/2; 59 | Ealpha = a./b; 60 | KLalpha = -sum(a*log(b)); 61 | % q(beta) 62 | WZ = EW'*EZ; 63 | d = d0+(s-2*dot(Xo(:),WZ(:))+dot(EWW(:),EZZ(:)))/2; 64 | Ebeta = c/d; 65 | KLbeta = -c*log(d); 66 | % q(mu) 67 | % Emu = Ebeta/(lambda+n*Ebeta)*sum(X-WZ,2); 68 | 69 | % lower bound 70 | L(iter) = KLalpha+KLbeta+KLW+KLZ; 71 | if L(iter)-L(iter-1) < tol*abs(L(iter-1)); break; end 72 | end 73 | L = L(2:iter); 74 | 75 | model.Z = EZ; 76 | model.W = EW; 77 | model.apha = Ealpha; 78 | model.beta = Ebeta; 79 | model.a = a; 80 | model.b = b; 81 | model.c = c; 82 | model.d = d; 83 | model.mu = mu; -------------------------------------------------------------------------------- /chapter13/LDS/ldsEm.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = ldsEm(X, m) 2 | % EM algorithm for parameter estimation of linear dynamic system. 3 | % NOTE: This is an exact implementation of the algorithm in PRML. 4 | % However, this algorithm is numerical unstable and there is much redundant degree of freedom. 5 | % Input: 6 | % X: d x n data matrix 7 | % m: initilaization parameter, either a integer for dimension of z or 8 | % initi model structure. 9 | % Output: 10 | % model: trained model structure 11 | % llh: loglikelihood 12 | % reference: Bayesian Reasoning and Machine Learning (BRML) 13 | % Written by Mo Chen (sth4nth@gmail.com). 14 | if isstruct(m) % init with a model 15 | model = m; 16 | elseif numel(m) == 1 % random init with latent dimension m 17 | model = init(X,m); 18 | end 19 | tol = 1e-4; 20 | maxIter = 2000; 21 | llh = -inf(1,maxIter); 22 | for iter = 2:maxIter 23 | % E-step 24 | [nu, U, llh(iter),Ezz, Ezy] = kalmanSmoother(model,X); 25 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end % check likelihood for convergence 26 | % M-step 27 | model = maximization(X, nu, U, Ezz, Ezy); 28 | end 29 | llh = llh(2:iter); 30 | 31 | function model = init(X, k) 32 | % d = size(X,1); 33 | % model.mu0 = randn(k,1); 34 | % model.P0 = iwishrnd(eye(k),k); 35 | % model.A = randn(k,k); 36 | % model.G = iwishrnd(eye(k),k); 37 | % model.C = randn(d,k); 38 | % model.S = iwishrnd(eye(d),d); 39 | [A,C,Z] = ldsPca(X,k,3*k); 40 | model.mu0 = Z(:,1); 41 | E = Z(:,1:end-1)-Z(:,2:end); 42 | model.P0 = (dot(E(:),E(:))/(k*size(E,2)))*eye(k); 43 | model.A = A; 44 | E = A*Z(:,1:end-1)-Z(:,2:end); 45 | model.G = E*E'/size(E,2); 46 | model.C = C; 47 | E = C*Z-X(:,1:size(Z,2)); 48 | model.S = E*E'/size(E,2); 49 | 50 | function model = maximization(X ,nu, U, Ezz, Ezy) 51 | n = size(X,2); 52 | 53 | EZZ = sum(Ezz,3); 54 | EZY = sum(Ezy,3); 55 | A = EZY/(EZZ-Ezz(:,:,n)); % 13.113 56 | G = (EZZ-Ezz(:,:,1)-EZY*A')/(n-1); % 13.114, BRML 24.5.12 57 | 58 | Xnu = X*nu'; 59 | C = Xnu/EZZ; % 13.115 60 | S = (X*X'-Xnu*C')/n; % 13.116, BRML 24.5.11 61 | 62 | model.mu0 = nu(:,1); % 13.110 63 | model.P0 = U(:,:,1); % 13.111, 13.107 64 | model.A = A; 65 | model.G = (G+G')/2; 66 | model.C = C; 67 | model.S = (S+S')/2; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Introduction 2 | ------- 3 | This Matlab package implements machine learning algorithms described in the great textbook: 4 | Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)). 5 | 6 | It is written purely in Matlab language. It is self-contained. There is no external dependency. 7 | 8 | Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires Statistics Toolbox (for some simple random number generator) and Image Processing Toolbox (for reading image data). 9 | 10 | Design Goal 11 | ------- 12 | * Succinct: The code is extremely compact. Minimizing code length is a major goal. As a result, the core of the algorithms can be easily spotted. 13 | * Efficient: Many tricks for speeding up Matlab code are applied (e.g. vectorization, matrix factorization, etc.). Usually, functions in this package are orders faster than Matlab builtin ones (e.g. kmeans). 14 | * Robust: Many tricks for numerical stability are applied, such as computing probability in logrithm domain, square root matrix update to enforce matrix symmetry\PD, etc. 15 | * Readable: The code is heavily commented. Corresponding formulas in PRML are annoted. Symbols are in sync with the book. 16 | * Practical: The package is not only readable, but also meant to be easily used and modified to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)). 17 | 18 | Installation 19 | ------- 20 | 1. Download the package to a local folder (e.g. ~/PRMLT/) by running: 21 | ```console 22 | git clone https://github.com/PRML/PRMLT.git 23 | ``` 24 | 2. Run Matlab and navigate to the folder (~/PRMLT/), then run the init.m script. 25 | 26 | 3. Run some demos in ~/PRMLT/demo folder. Enjoy! 27 | 28 | FeedBack 29 | ------- 30 | If you find any bug or have any suggestion, please do file issues. I am graceful for any feedback and will do my best to improve this package. 31 | 32 | License 33 | ------- 34 | Released under MIT license 35 | 36 | Contact 37 | ------- 38 | sth4nth at gmail dot com 39 | -------------------------------------------------------------------------------- /chapter04/logitMn.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = logitMn(X, t, lambda) 2 | % Multinomial regression for multiclass problem (Multinomial likelihood) 3 | % Input: 4 | % X: d x n data matrix 5 | % t: 1 x n label (1~k) 6 | % lambda: regularization parameter 7 | % Output: 8 | % model: trained model structure 9 | % llh: loglikelihood 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | if nargin < 3 12 | lambda = 1e-4; 13 | end 14 | X = [X; ones(1,size(X,2))]; 15 | [W, llh] = newtonRaphson(X, t, lambda); 16 | % [W, llh] = newtonBlock(X, t, lambda); 17 | model.W = W; 18 | 19 | function [W, llh] = newtonRaphson(X, t, lambda) 20 | [d,n] = size(X); 21 | k = max(t); 22 | tol = 1e-4; 23 | maxiter = 100; 24 | llh = -inf(1,maxiter); 25 | dk = d*k; 26 | idx = (1:dk)'; 27 | dg = sub2ind([dk,dk],idx,idx); 28 | T = sparse(t,1:n,1,k,n,n); 29 | W = zeros(d,k); 30 | HT = zeros(d,k,d,k); 31 | for iter = 2:maxiter 32 | A = W'*X; % 4.105 33 | logY = bsxfun(@minus,A,logsumexp(A,1)); % 4.104 34 | llh(iter) = dot(T(:),logY(:))-0.5*lambda*dot(W(:),W(:)); % 4.108 35 | if abs(llh(iter)-llh(iter-1)) < tol; break; end 36 | Y = exp(logY); 37 | for i = 1:k 38 | for j = 1:k 39 | r = Y(i,:).*((i==j)-Y(j,:)); % r has negative value, so cannot use sqrt 40 | HT(:,i,:,j) = bsxfun(@times,X,r)*X'; % 4.110 41 | end 42 | end 43 | G = X*(Y-T)'+lambda*W; % 4.96 44 | H = reshape(HT,dk,dk); 45 | H(dg) = H(dg)+lambda; 46 | W(:) = W(:)-H\G(:); % 4.92 47 | end 48 | llh = llh(2:iter); 49 | 50 | function [W, llh] = newtonBlock(X, t, lambda) 51 | [d,n] = size(X); 52 | k = max(t); 53 | idx = (1:d)'; 54 | dg = sub2ind([d,d],idx,idx); 55 | tol = 1e-4; 56 | maxiter = 100; 57 | llh = -inf(1,maxiter); 58 | T = sparse(t,1:n,1,k,n,n); 59 | W = zeros(d,k); 60 | A = W'*X; 61 | logY = bsxfun(@minus,A,logsumexp(A,1)); 62 | for iter = 2:maxiter 63 | for j = 1:k 64 | Y = exp(logY); 65 | Xw = bsxfun(@times,X,sqrt(Y(j,:).*(1-Y(j,:)))); 66 | H = Xw*Xw'; 67 | H(dg) = H(dg)+lambda; 68 | g = X*(Y(j,:)-T(j,:))'+lambda*W(:,j); 69 | W(:,j) = W(:,j)-H\g; 70 | A(j,:) = W(:,j)'*X; 71 | logY = bsxfun(@minus,A,logsumexp(A,1)); % must be here to renormalize 72 | end 73 | llh(iter) = dot(T(:),logY(:))-0.5*lambda*dot(W(:),W(:)); 74 | if abs(llh(iter)-llh(iter-1)) < tol; break; end 75 | end 76 | llh = llh(2:iter); 77 | -------------------------------------------------------------------------------- /chapter10/mixGaussEvidence.m: -------------------------------------------------------------------------------- 1 | function L = mixGaussEvidence(X, model, prior) 2 | % Variational lower bound of the model evidence (log of marginal likelihood) 3 | % This function implements the method in the book PRML. It is equivalent to the bound inside mixGaussVb function. 4 | % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop (P.474) 5 | % Written by Mo Chen (sth4nth@gmail.com). 6 | alpha0 = prior.alpha; 7 | kappa0 = prior.kappa; 8 | m0 = prior.m; 9 | v0 = prior.v; 10 | M0 = prior.M; 11 | 12 | alpha = model.alpha; % Dirichlet 13 | kappa = model.kappa; % Gaussian 14 | m = model.m; % Gasusian 15 | v = model.v; % Whishart 16 | % M = model.M; % Whishart: inv(W) = V'*V 17 | U = model.U; 18 | R = model.R; 19 | logR = model.logR; 20 | 21 | [d,k] = size(m); 22 | nk = sum(R,1); % 10.51 23 | 24 | Elogpi = psi(0,alpha)-psi(0,sum(alpha)); 25 | Epz = dot(nk,Elogpi); 26 | Eqz = dot(R(:),logR(:)); 27 | logCalpha0 = gammaln(k*alpha0)-k*gammaln(alpha0); 28 | Eppi = logCalpha0+(alpha0-1)*sum(Elogpi); 29 | logCalpha = gammaln(sum(alpha))-sum(gammaln(alpha)); 30 | Eqpi = dot(alpha-1,Elogpi)+logCalpha; 31 | 32 | U0 = chol(M0); 33 | sqrtR = sqrt(R); 34 | xbar = bsxfun(@times,X*R,1./nk); % 10.52 35 | 36 | logW = zeros(1,k); 37 | trSW = zeros(1,k); 38 | trM0W = zeros(1,k); 39 | xbarmWxbarm = zeros(1,k); 40 | mm0Wmm0 = zeros(1,k); 41 | for i = 1:k 42 | Ui = U(:,:,i); 43 | logW(i) = -2*sum(log(diag(Ui))); 44 | 45 | Xs = bsxfun(@times,bsxfun(@minus,X,xbar(:,i)),sqrtR(:,i)'); 46 | V = chol(Xs*Xs'/nk(i)); 47 | Q = V/Ui; 48 | trSW(i) = dot(Q(:),Q(:)); % equivalent to tr(SW)=trace(S/M) 49 | Q = U0/Ui; 50 | trM0W(i) = dot(Q(:),Q(:)); 51 | 52 | q = Ui'\(xbar(:,i)-m(:,i)); 53 | xbarmWxbarm(i) = dot(q,q); 54 | q = Ui'\(m(:,i)-m0); 55 | mm0Wmm0(i) = dot(q,q); 56 | end 57 | ElogLambda = sum(psi(0,bsxfun(@minus,v+1,(1:d)')/2),1)+d*log(2)+logW; % 10.65 58 | Epmu = sum(d*log(kappa0/(2*pi))+ElogLambda-d*kappa0./kappa-kappa0*(v.*mm0Wmm0))/2; 59 | logB0 = v0*sum(log(diag(U0)))-0.5*v0*d*log(2)-logMvGamma(0.5*v0,d); 60 | EpLambda = k*logB0+0.5*(v0-d-1)*sum(ElogLambda)-0.5*dot(v,trM0W); 61 | 62 | Eqmu = 0.5*sum(ElogLambda+d*log(kappa/(2*pi)))-0.5*d*k; 63 | logB = -v.*(logW+d*log(2))/2-logMvGamma(0.5*v,d); 64 | EqLambda = 0.5*sum((v-d-1).*ElogLambda-v*d)+sum(logB); 65 | 66 | EpX = 0.5*dot(nk,ElogLambda-d./kappa-v.*trSW-v.*xbarmWxbarm-d*log(2*pi)); 67 | 68 | L = Epz-Eqz+Eppi-Eqpi+Epmu-Eqmu+EpLambda-EqLambda+EpX; -------------------------------------------------------------------------------- /chapter09/rvmBinEm.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = rvmBinEm(X, t, alpha) 2 | % Relevance Vector Machine (ARD sparse prior) for binary classification. 3 | % trained by empirical bayesian (type II ML) using EM. 4 | % Input: 5 | % X: d x n data matrix 6 | % t: 1 x n label (0/1) 7 | % alpha: prior parameter 8 | % Output: 9 | % model: trained model structure 10 | % llh: loglikelihood 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | if nargin < 3 13 | alpha = 1; 14 | end 15 | n = size(X,2); 16 | X = [X;ones(1,n)]; 17 | d = size(X,1); 18 | alpha = alpha*ones(d,1); 19 | m = zeros(d,1); 20 | 21 | tol = 1e-4; 22 | maxiter = 100; 23 | llh = -inf(1,maxiter); 24 | index = 1:d; 25 | for iter = 2:maxiter 26 | % remove zeros 27 | nz = 1./alpha > tol; % nonzeros 28 | index = index(nz); 29 | alpha = alpha(nz); 30 | X = X(nz,:); 31 | m = m(nz); 32 | 33 | [m,e,U] = logitBin(X,t,alpha,m); % 7.110 ~ 7.113 34 | 35 | m2 = m.^2; 36 | llh(iter) = e(end)+0.5*(sum(log(alpha))-2*sum(log(diag(U)))-dot(alpha,m2)-n*log(2*pi)); % 7.114 & 7.118 37 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end 38 | 39 | V = inv(U); 40 | dgS = dot(V,V,2); 41 | alpha = 1./(m2+dgS); % 9.67 42 | end 43 | llh = llh(2:iter); 44 | 45 | model.index = index; 46 | model.w = m; 47 | model.alpha = alpha; 48 | 49 | function [w, llh, U] = logitBin(X, t, lambda, w) 50 | % Logistic regression 51 | [d,n] = size(X); 52 | tol = 1e-4; 53 | maxiter = 100; 54 | llh = -inf(1,maxiter); 55 | idx = (1:d)'; 56 | dg = sub2ind([d,d],idx,idx); 57 | h = ones(1,n); 58 | h(t==0) = -1; 59 | a = w'*X; 60 | for iter = 2:maxiter 61 | y = sigmoid(a); % 4.87 62 | r = y.*(1-y); % 4.98 63 | Xw = bsxfun(@times, X, sqrt(r)); 64 | H = Xw*Xw'; % 4.97 65 | H(dg) = H(dg)+lambda; 66 | U = chol(H); 67 | g = X*(y-t)'+lambda.*w; % 4.96 68 | p = -U\(U'\g); 69 | wo = w; % 4.92 70 | w = wo+p; 71 | a = w'*X; 72 | llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2); % 4.89 73 | incr = llh(iter)-llh(iter-1); 74 | while incr < 0 % line search 75 | p = p/2; 76 | w = wo+p; 77 | a = w'*X; 78 | llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2); 79 | incr = llh(iter)-llh(iter-1); 80 | end 81 | if incr < tol; break; end 82 | end 83 | llh = llh(2:iter); -------------------------------------------------------------------------------- /chapter07/rvmBinFp.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = rvmBinFp(X, t, alpha) 2 | % Relevance Vector Machine (ARD sparse prior) for binary classification. 3 | % trained by empirical bayesian (type II ML) using Mackay fix point update. 4 | % Input: 5 | % X: d x n data matrix 6 | % t: 1 x n label (0/1) 7 | % alpha: prior parameter 8 | % Output: 9 | % model: trained model structure 10 | % llh: loglikelihood 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | if nargin < 3 13 | alpha = 1; 14 | end 15 | n = size(X,2); 16 | X = [X;ones(1,n)]; 17 | d = size(X,1); 18 | alpha = alpha*ones(d,1); 19 | m = zeros(d,1); 20 | 21 | tol = 1e-4; 22 | maxiter = 100; 23 | llh = -inf(1,maxiter); 24 | index = 1:d; 25 | for iter = 2:maxiter 26 | % remove zeros 27 | nz = 1./alpha > tol; % nonzeros 28 | index = index(nz); 29 | alpha = alpha(nz); 30 | X = X(nz,:); 31 | m = m(nz); 32 | 33 | [m,e,U] = logitBin(X,t,alpha,m); % 7.110 ~ 7.113 34 | 35 | m2 = m.^2; 36 | llh(iter) = e(end)+0.5*(sum(log(alpha))-2*sum(log(diag(U)))-dot(alpha,m2)-n*log(2*pi)); % 7.114 & 7.118 37 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end 38 | 39 | V = inv(U); 40 | dgS = dot(V,V,2); 41 | alpha = (1-alpha.*dgS)./m2; % 7.89 & 7.87 & 7.116 42 | end 43 | llh = llh(2:iter); 44 | 45 | model.index = index; 46 | model.w = m; 47 | model.alpha = alpha; 48 | 49 | 50 | function [w, llh, U] = logitBin(X, t, lambda, w) 51 | % Logistic regression 52 | [d,n] = size(X); 53 | tol = 1e-4; 54 | maxiter = 100; 55 | llh = -inf(1,maxiter); 56 | idx = (1:d)'; 57 | dg = sub2ind([d,d],idx,idx); 58 | h = ones(1,n); 59 | h(t==0) = -1; 60 | a = w'*X; 61 | for iter = 2:maxiter 62 | y = sigmoid(a); % 4.87 63 | r = y.*(1-y); % 4.98 64 | Xw = bsxfun(@times, X, sqrt(r)); 65 | H = Xw*Xw'; % 4.97 66 | H(dg) = H(dg)+lambda; 67 | U = chol(H); 68 | g = X*(y-t)'+lambda.*w; % 4.96 69 | p = -U\(U'\g); 70 | wo = w; % 4.92 71 | w = wo+p; 72 | a = w'*X; 73 | llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2); % 4.89 74 | incr = llh(iter)-llh(iter-1); 75 | while incr < 0 % line search 76 | p = p/2; 77 | w = wo+p; 78 | a = w'*X; 79 | llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2); 80 | incr = llh(iter)-llh(iter-1); 81 | end 82 | if incr < tol; break; end 83 | end 84 | llh = llh(2:iter); 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /chapter09/mixGaussEm.m: -------------------------------------------------------------------------------- 1 | function [label, model, llh] = mixGaussEm(X, init) 2 | % Perform EM algorithm for fitting the Gaussian mixture model. 3 | % Input: 4 | % X: d x n data matrix 5 | % init: k (1 x 1) number of components or label (1 x n, 1<=label(i)<=k) or model structure 6 | % Output: 7 | % label: 1 x n cluster label 8 | % model: trained model structure 9 | % llh: loglikelihood 10 | % Written by Mo Chen (sth4nth@gmail.com). 11 | %% init 12 | fprintf('EM for Gaussian mixture: running ... \n'); 13 | tol = 1e-6; 14 | maxiter = 500; 15 | llh = -inf(1,maxiter); 16 | R = initialization(X,init); 17 | for iter = 2:maxiter 18 | [~,label(1,:)] = max(R,[],2); 19 | R = R(:,unique(label)); % remove empty clusters 20 | model = maximization(X,R); 21 | [R, llh(iter)] = expectation(X,model); 22 | if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end; 23 | end 24 | llh = llh(2:iter); 25 | 26 | function R = initialization(X, init) 27 | n = size(X,2); 28 | if isstruct(init) % init with a model 29 | R = expectation(X,init); 30 | elseif numel(init) == 1 % random init k 31 | k = init; 32 | label = ceil(k*rand(1,n)); 33 | R = full(sparse(1:n,label,1,n,k,n)); 34 | elseif all(size(init)==[1,n]) % init with labels 35 | label = init; 36 | k = max(label); 37 | R = full(sparse(1:n,label,1,n,k,n)); 38 | else 39 | error('ERROR: init is not valid.'); 40 | end 41 | 42 | function [R, llh] = expectation(X, model) 43 | mu = model.mu; 44 | Sigma = model.Sigma; 45 | w = model.w; 46 | 47 | n = size(X,2); 48 | k = size(mu,2); 49 | R = zeros(n,k); 50 | for i = 1:k 51 | R(:,i) = loggausspdf(X,mu(:,i),Sigma(:,:,i)); 52 | end 53 | R = bsxfun(@plus,R,log(w)); 54 | T = logsumexp(R,2); 55 | llh = sum(T)/n; % loglikelihood 56 | R = exp(bsxfun(@minus,R,T)); 57 | 58 | function model = maximization(X, R) 59 | [d,n] = size(X); 60 | k = size(R,2); 61 | nk = sum(R,1); 62 | w = nk/n; 63 | mu = bsxfun(@times, X*R, 1./nk); 64 | 65 | Sigma = zeros(d,d,k); 66 | r = sqrt(R); 67 | for i = 1:k 68 | Xo = bsxfun(@minus,X,mu(:,i)); 69 | Xo = bsxfun(@times,Xo,r(:,i)'); 70 | Sigma(:,:,i) = Xo*Xo'/nk(i)+eye(d)*(1e-6); 71 | end 72 | 73 | model.mu = mu; 74 | model.Sigma = Sigma; 75 | model.w = w; 76 | 77 | function y = loggausspdf(X, mu, Sigma) 78 | d = size(X,1); 79 | X = bsxfun(@minus,X,mu); 80 | [U,p]= chol(Sigma); 81 | if p ~= 0 82 | error('ERROR: Sigma is not PD.'); 83 | end 84 | Q = U'\X; 85 | q = dot(Q,Q,1); % quadratic term (M distance) 86 | c = d*log(2*pi)+2*sum(log(diag(U))); % normalization constant 87 | y = -(c+q)/2; -------------------------------------------------------------------------------- /chapter13/LDS/kalmanSmoother.m: -------------------------------------------------------------------------------- 1 | function [nu, U, llh, Ezz, Ezy] = kalmanSmoother(model, X) 2 | % Kalman smoother (forward-backward algorithm for linear dynamic system) 3 | % NOTE: This is the exact implementation of the Kalman smoother algorithm in PRML. 4 | % However, this algorithm is not practical. It is numerical unstable. 5 | % Input: 6 | % X: d x n data matrix 7 | % model: model structure 8 | % Output: 9 | % nu: q x n matrix of latent mean mu_t=E[z_t] w.r.t p(z_t|x_{1:T}) 10 | % U: q x q x n latent covariance U_t=cov[z_t] w.r.t p(z_t|x_{1:T}) 11 | % Ezz: q x q matrix E[z_tz_t^T] 12 | % Ezy: q x q matrix E[z_tz_{t-1}^T] 13 | % llh: loglikelihood 14 | % Written by Mo Chen (sth4nth@gmail.com). 15 | A = model.A; % transition matrix 16 | G = model.G; % transition covariance 17 | C = model.C; % emission matrix 18 | S = model.S; % emision covariance 19 | mu0 = model.mu0; % prior mean 20 | P0 = model.P0; % prior covairance 21 | 22 | n = size(X,2); 23 | q = size(mu0,1); 24 | mu = zeros(q,n); 25 | V = zeros(q,q,n); 26 | P = zeros(q,q,n); % C_{t+1|t} 27 | Amu = zeros(q,n); % u_{t+1|t} 28 | llh = zeros(1,n); 29 | 30 | % forward 31 | PC = P0*C'; 32 | R = C*PC+S; 33 | K = PC/R; 34 | mu(:,1) = mu0+K*(X(:,1)-C*mu0); 35 | V(:,:,1) = (eye(q)-K*C)*P0; 36 | P(:,:,1) = P0; % useless, just make a point 37 | Amu(:,1) = mu0; % useless, just make a point 38 | llh(1) = logGauss(X(:,1),C*mu0,R); 39 | for i = 2:n 40 | [mu(:,i), V(:,:,i), Amu(:,i), P(:,:,i), llh(i)] = ... 41 | forwardUpdate(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S); 42 | end 43 | llh = sum(llh); 44 | % backward 45 | nu = zeros(q,n); 46 | U = zeros(q,q,n); 47 | Ezz = zeros(q,q,n); 48 | Ezy = zeros(q,q,n-1); 49 | 50 | nu(:,n) = mu(:,n); 51 | U(:,:,n) = V(:,:,n); 52 | Ezz(:,:,n) = U(:,:,n)+nu(:,n)*nu(:,n)'; 53 | for i = n-1:-1:1 54 | [nu(:,i), U(:,:,i), Ezz(:,:,i), Ezy(:,:,i)] = ... 55 | backwardUpdate(nu(:,i+1), U(:,:,i+1), mu(:,i), V(:,:,i), Amu(:,i+1), P(:,:,i+1), A); 56 | end 57 | 58 | function [mu1, V1, Amu, P, llh] = forwardUpdate(x, mu0, V0, A, G, C, S) 59 | k = numel(mu0); 60 | P = A*V0*A'+G; % 13.88 61 | PC = P*C'; 62 | R = C*PC+S; 63 | K = PC/R; % 13.92 64 | Amu = A*mu0; 65 | CAmu = C*Amu; 66 | mu1 = Amu+K*(x-CAmu); % 13.89 67 | V1 = (eye(k)-K*C)*P; % 13.90 68 | llh = logGauss(x,CAmu,R); % 13.91 69 | 70 | 71 | function [nu0, U0, E00, E10] = backwardUpdate(nu1, U1, mu, V, Amu, P, A) 72 | J = V*A'/P; % 13.102 73 | nu0 = mu+J*(nu1-Amu); % 13.100 74 | U0 = V+J*(U1-P)*J'; % 13.101 75 | E00 = U0+nu0*nu0'; % 13.107 76 | E10 = U1*J'+nu1*nu0'; % 13.106 77 | -------------------------------------------------------------------------------- /chapter11/GaussWishart.m: -------------------------------------------------------------------------------- 1 | % Class for Gaussian-Wishart distribution used by Dirichlet process 2 | 3 | classdef GaussWishart 4 | properties 5 | kappa_ 6 | m_ 7 | nu_ 8 | U_ 9 | end 10 | 11 | methods 12 | function obj = GaussWishart(kappa,m,nu,S) 13 | U = chol(S+kappa*(m*m')); 14 | obj.kappa_ = kappa; 15 | obj.m_ = m; 16 | obj.nu_ = nu; 17 | obj.U_ = U; 18 | end 19 | 20 | function obj = clone(obj) 21 | end 22 | 23 | function d = dim(obj) 24 | d = numel(obj.m_); 25 | end 26 | 27 | function obj = addData(obj, X) 28 | kappa0 = obj.kappa_; 29 | m0 = obj.m_; 30 | nu0 = obj.nu_; 31 | U0 = obj.U_; 32 | 33 | n = size(X,2); 34 | kappa = kappa0+n; 35 | m = (kappa0*m0+sum(X,2))/kappa; 36 | nu = nu0+n; 37 | U = chol(U0'*U0+X*X'); 38 | 39 | obj.kappa_ = kappa; 40 | obj.m_ = m; 41 | obj.nu_ = nu; 42 | obj.U_ = U; 43 | end 44 | 45 | function obj = addSample(obj, x) 46 | kappa = obj.kappa_; 47 | m = obj.m_; 48 | nu = obj.nu_; 49 | U = obj.U_; 50 | 51 | kappa = kappa+1; 52 | m = m+(x-m)/kappa; 53 | nu = nu+1; 54 | U = cholupdate(U,x,'+'); 55 | 56 | obj.kappa_ = kappa; 57 | obj.m_ = m; 58 | obj.nu_ = nu; 59 | obj.U_ = U; 60 | end 61 | 62 | function obj = delSample(obj, x) 63 | kappa = obj.kappa_; 64 | m = obj.m_; 65 | nu = obj.nu_; 66 | U = obj.U_; 67 | 68 | kappa = kappa-1; 69 | m = m-(x-m)/kappa; 70 | nu = nu-1; 71 | U = cholupdate(U,x,'-'); 72 | 73 | obj.kappa_ = kappa; 74 | obj.m_ = m; 75 | obj.nu_ = nu; 76 | obj.U_ = U; 77 | end 78 | 79 | function y = logPredPdf(obj,X) 80 | kappa = obj.kappa_; 81 | m = obj.m_; 82 | nu = obj.nu_; 83 | U = obj.U_; 84 | 85 | d = size(X,1); 86 | v = (nu-d+1); 87 | U = sqrt((1+1/kappa)/v)*cholupdate(U,sqrt(kappa)*m,'-'); 88 | 89 | X = bsxfun(@minus,X,m); 90 | Q = U'\X; 91 | q = dot(Q,Q,1); % quadratic term (M distance) 92 | o = -log(1+q/v)*((v+d)/2); 93 | c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(v*pi)+2*sum(log(diag(U))))/2; 94 | y = c+o; 95 | end 96 | 97 | function [mu, Sigma] = sample(obj) 98 | % Sample a Gaussian distribution from GaussianWishart prior 99 | kappa = obj.kappa_; 100 | m = obj.m_; 101 | nu = obj.nu_; 102 | U = obj.U_; 103 | 104 | Sigma = iwishrnd(U'*U,nu); 105 | mu = gaussRnd(m,Sigma/kappa); 106 | end 107 | end 108 | end 109 | -------------------------------------------------------------------------------- /chapter10/mixGaussVb.m: -------------------------------------------------------------------------------- 1 | function [label, model, L] = mixGaussVb(X, m, prior) 2 | % Variational Bayesian inference for Gaussian mixture. 3 | % Input: 4 | % X: d x n data matrix 5 | % m: k (1 x 1) or label (1 x n, 1<=label(i)<=k) or model structure 6 | % Output: 7 | % label: 1 x n cluster label 8 | % model: trained model structure 9 | % L: variational lower bound 10 | % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop (P.474) 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | fprintf('Variational Bayesian Gaussian mixture: running ... \n'); 13 | [d,n] = size(X); 14 | if nargin < 3 15 | prior.alpha = 1; 16 | prior.kappa = 1; 17 | prior.m = mean(X,2); 18 | prior.v = d+1; 19 | prior.M = eye(d); % M = inv(W) 20 | end 21 | prior.logW = -2*sum(log(diag(chol(prior.M)))); 22 | 23 | tol = 1e-8; 24 | maxiter = 2000; 25 | L = -inf(1,maxiter); 26 | model = init(X,m,prior); 27 | for iter = 2:maxiter 28 | model = expect(X,model); 29 | model = maximize(X,model,prior); 30 | L(iter) = bound(X,model,prior); 31 | if abs(L(iter)-L(iter-1)) < tol*abs(L(iter)); break; end 32 | end 33 | L = L(2:iter); 34 | label = zeros(1,n); 35 | [~,label(:)] = max(model.R,[],2); 36 | [~,~,label(:)] = unique(label); 37 | 38 | function model = init(X, m, prior) 39 | n = size(X,2); 40 | if isstruct(m) % init with a model 41 | model = m; 42 | elseif numel(m) == 1 % random init k 43 | k = m; 44 | label = ceil(k*rand(1,n)); 45 | model.R = full(sparse(1:n,label,1,n,k,n)); 46 | elseif all(size(m)==[1,n]) % init with labels 47 | label = m; 48 | k = max(label); 49 | model.R = full(sparse(1:n,label,1,n,k,n)); 50 | else 51 | error('ERROR: init is not valid.'); 52 | end 53 | model = maximize(X,model,prior); 54 | 55 | % Done 56 | function model = maximize(X, model, prior) 57 | alpha0 = prior.alpha; 58 | kappa0 = prior.kappa; 59 | m0 = prior.m; 60 | v0 = prior.v; 61 | M0 = prior.M; 62 | R = model.R; 63 | 64 | nk = sum(R,1); % 10.51 65 | alpha = alpha0+nk; % 10.58 66 | kappa = kappa0+nk; % 10.60 67 | v = v0+nk; % 10.63 68 | m = bsxfun(@plus,kappa0*m0,X*R); 69 | m = bsxfun(@times,m,1./kappa); % 10.61 70 | 71 | [d,k] = size(m); 72 | U = zeros(d,d,k); 73 | logW = zeros(1,k); 74 | r = sqrt(R'); 75 | for i = 1:k 76 | Xm = bsxfun(@minus,X,m(:,i)); 77 | Xm = bsxfun(@times,Xm,r(i,:)); 78 | m0m = m0-m(:,i); 79 | M = M0+Xm*Xm'+kappa0*(m0m*m0m'); % equivalent to 10.62 80 | U(:,:,i) = chol(M); 81 | logW(i) = -2*sum(log(diag(U(:,:,i)))); 82 | end 83 | 84 | model.alpha = alpha; 85 | model.kappa = kappa; 86 | model.m = m; 87 | model.v = v; 88 | model.U = U; 89 | model.logW = logW; 90 | 91 | % Done 92 | function model = expect(X, model) 93 | alpha = model.alpha; % Dirichlet 94 | kappa = model.kappa; % Gaussian 95 | m = model.m; % Gasusian 96 | v = model.v; % Whishart 97 | U = model.U; % Whishart 98 | logW = model.logW; 99 | n = size(X,2); 100 | [d,k] = size(m); 101 | 102 | EQ = zeros(n,k); 103 | for i = 1:k 104 | Q = (U(:,:,i)'\bsxfun(@minus,X,m(:,i))); 105 | EQ(:,i) = d/kappa(i)+v(i)*dot(Q,Q,1); % 10.64 106 | end 107 | ElogLambda = sum(psi(0,0.5*bsxfun(@minus,v+1,(1:d)')),1)+d*log(2)+logW; % 10.65 108 | Elogpi = psi(0,alpha)-psi(0,sum(alpha)); % 10.66 109 | logRho = -0.5*bsxfun(@minus,EQ,ElogLambda-d*log(2*pi)); % 10.46 110 | logRho = bsxfun(@plus,logRho,Elogpi); % 10.46 111 | logR = bsxfun(@minus,logRho,logsumexp(logRho,2)); % 10.49 112 | R = exp(logR); 113 | 114 | model.logR = logR; 115 | model.R = R; 116 | 117 | % Done 118 | function L = bound(X, model, prior) 119 | alpha0 = prior.alpha; 120 | kappa0 = prior.kappa; 121 | v0 = prior.v; 122 | logW0 = prior.logW; 123 | alpha = model.alpha; 124 | kappa = model.kappa; 125 | v = model.v; 126 | logW = model.logW; 127 | R = model.R; 128 | logR = model.logR; 129 | [d,n] = size(X); 130 | k = size(R,2); 131 | 132 | Epz = 0; 133 | Eqz = dot(R(:),logR(:)); 134 | logCalpha0 = gammaln(k*alpha0)-k*gammaln(alpha0); 135 | Eppi = logCalpha0; 136 | logCalpha = gammaln(sum(alpha))-sum(gammaln(alpha)); 137 | Eqpi = logCalpha; 138 | Epmu = 0.5*d*k*log(kappa0); 139 | Eqmu = 0.5*d*sum(log(kappa)); 140 | logB0 = -0.5*v0*(logW0+d*log(2))-logMvGamma(0.5*v0,d); 141 | EpLambda = k*logB0; 142 | logB = -0.5*v.*(logW+d*log(2))-logMvGamma(0.5*v,d); 143 | EqLambda = sum(logB); 144 | EpX = -0.5*d*n*log(2*pi); 145 | L = Epz-Eqz+Eppi-Eqpi+Epmu-Eqmu+EpLambda-EqLambda+EpX; -------------------------------------------------------------------------------- /chapter07/rvmRegSeq.m: -------------------------------------------------------------------------------- 1 | function [model, llh] = rvmRegSeq(X, t) 2 | % Sparse Bayesian Regression (RVM) using sequential algorithm 3 | % Input: 4 | % X: d x n data 5 | % t: 1 x n response 6 | % Output: 7 | % model: trained model structure 8 | % llh: loglikelihood 9 | % reference: 10 | % Tipping and Faul. Fast marginal likelihood maximisation for sparse Bayesian models. AISTATS 2003. 11 | % Written by Mo Chen (sth4nth@gmail.com). 12 | maxiter = 1000; 13 | llh = -inf(1,maxiter); 14 | tol = 1e-4; 15 | 16 | [d,n] = size(X); 17 | xbar = mean(X,2); 18 | tbar = mean(t,2); 19 | X = bsxfun(@minus,X,xbar); 20 | t = bsxfun(@minus,t,tbar); 21 | 22 | beta = 1/mean(t.^2); % beta = 1/sigma^2 23 | alpha = inf(d,1); 24 | S = beta*dot(X,X,2); % eq.(22) 25 | Q = beta*(X*t'); % eq.(22) 26 | Sigma = zeros(0,0); 27 | mu = zeros(0,1); 28 | index = zeros(0,1); 29 | Phi = zeros(0,n); 30 | iAct = zeros(d,3); 31 | for iter = 2:maxiter 32 | s = S; q = Q; % p.353 Execrcies 7.17 33 | s(index) = alpha(index).*S(index)./(alpha(index)-S(index)); % 7.104 34 | q(index) = alpha(index).*Q(index)./(alpha(index)-S(index)); % 7.105 35 | 36 | theta = q.^2-s; 37 | iNew = theta>0; 38 | 39 | iUse = false(d,1); 40 | iUse(index) = true; 41 | 42 | iUpd = (iNew & iUse); % update 43 | iAdd = (iNew ~= iUpd); % add 44 | iDel = (iUse ~= iUpd); % del 45 | 46 | dllh = -inf(d,1); % delta likelihood (likelihood improvement of each step, eventually approches 0.) 47 | if any(iUpd) 48 | alpha_ = s(iUpd).^2./theta(iUpd); % eq.(20) 49 | delta = 1./alpha_-1./alpha(iUpd); 50 | dllh(iUpd) = Q(iUpd).^2.*delta./(S(iUpd).*delta+1)-log1p(S(iUpd).*delta); % eq.(32) 51 | end 52 | if any(iAdd) 53 | dllh(iAdd) = (Q(iAdd).^2-S(iAdd))./S(iAdd)+log(S(iAdd)./(Q(iAdd).^2)); % eq.(27) 54 | end 55 | if any(iDel) 56 | dllh(iDel) = Q(iDel).^2./(S(iDel)-alpha(iDel))-log1p(-S(iDel)./alpha(iDel)); % eq.(37) 57 | end 58 | 59 | [llh(iter),j] = max(dllh); 60 | if llh(iter) < tol; break; end 61 | 62 | iAct(:,1) = iUpd; 63 | iAct(:,2) = iAdd; 64 | iAct(:,3) = iDel; 65 | 66 | % update parameters 67 | switch find(iAct(j,:)) 68 | case 1 % update: 69 | idx = (index==j); 70 | alpha_ = s(j)^2/theta(j); 71 | 72 | Sigma_j = Sigma(:,idx); 73 | Sigma_jj = Sigma(idx,idx); 74 | mu_j = mu(idx); 75 | 76 | kappa = 1/(Sigma_jj+1/(alpha_-alpha(j))); 77 | Sigma = Sigma-kappa*(Sigma_j*Sigma_j'); % eq.(33) 78 | mu = mu-kappa*mu_j*Sigma_j; % eq.(34) 79 | 80 | v = beta*X*(Phi'*Sigma_j); 81 | S = S+kappa*v.^2; % eq.(35) 82 | Q = Q+kappa*mu_j*v; % eq.(36) 83 | alpha(j) = alpha_; 84 | case 2 % Add 85 | alpha_ = s(j)^2/theta(j); 86 | Sigma_jj = 1/(alpha_+S(j)); 87 | mu_j = Sigma_jj*Q(j); 88 | phi_j = X(j,:); 89 | 90 | v = beta*Sigma*(Phi*phi_j'); 91 | off = -Sigma_jj*v; % eq.(28) has error? 92 | Sigma = [Sigma+Sigma_jj*(v*v'), off; off', Sigma_jj]; % eq.(28) 93 | mu = [mu-mu_j*v; mu_j]; % eq.(29) 94 | 95 | e_j = phi_j-v'*Phi; 96 | v = beta*X*e_j'; 97 | S = S-Sigma_jj*v.^2; % eq.(30) 98 | Q = Q-mu_j*v; % eq.(31) 99 | 100 | index = [index;j]; %#ok 101 | alpha(j) = alpha_; 102 | case 3 % del 103 | idx = (index==j); 104 | Sigma_j = Sigma(:,idx); 105 | Sigma_jj = Sigma(idx,idx); 106 | mu_j = mu(idx); 107 | 108 | Sigma = Sigma-(Sigma_j*Sigma_j')/Sigma_jj; % eq.(38) 109 | mu = mu-mu_j*Sigma_j/Sigma_jj; % eq.(39) 110 | 111 | v = beta*X*(Phi'*Sigma_j); 112 | S = S+v.^2/Sigma_jj; % eq.(40) 113 | Q = Q+mu_j*v/Sigma_jj; % eq.(41) 114 | 115 | mu(idx) = []; 116 | Sigma(:,idx) = []; 117 | Sigma(idx,:) = []; 118 | index(idx) = []; 119 | alpha(j) = inf; 120 | end 121 | Phi = X(index,:); 122 | % beta = ; 123 | end 124 | llh = cumsum(llh(2:iter)); 125 | w0 = tbar-dot(mu,xbar(index)); 126 | 127 | model.index = index; 128 | model.w0 = w0; 129 | model.w = mu; 130 | model.alpha = alpha(index); 131 | model.beta = beta; -------------------------------------------------------------------------------- /Contents.m: -------------------------------------------------------------------------------- 1 | % CHAPTER01 2 | % condEntropy - Compute conditional entropy z=H(x|y) of two discrete variables x and y. 3 | % entropy - Compute entropy z=H(x) of a discrete variable x. 4 | % jointEntropy - Compute joint entropy z=H(x,y) of two discrete variables x and y. 5 | % mutInfo - Compute mutual information I(x,y) of two discrete variables x and y. 6 | % nmi - Compute normalized mutual information I(x,y)/sqrt(H(x)*H(y)) of two discrete variables x and y. 7 | % nvi - Compute normalized variation information z=(1-I(x,y)/H(x,y)) of two discrete variables x and y. 8 | % relatEntropy - Compute relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y)) of two discrete variables x and y. 9 | % CHAPTER02 10 | % logDirichlet - Compute log pdf of a Dirichlet distribution. 11 | % logGauss - Compute log pdf of a Gaussian distribution. 12 | % logKde - Compute log pdf of kernel density estimator. 13 | % logMn - Compute log pdf of a multinomial distribution. 14 | % logMvGamma - Compute logarithm multivariate Gamma function 15 | % logSt - Compute log pdf of a Student's t distribution. 16 | % logVmf - Compute log pdf of a von Mises-Fisher distribution. 17 | % logWishart - Compute log pdf of a Wishart distribution. 18 | % CHAPTER03 19 | % linReg - Fit linear regression model y=w'x+w0 20 | % linRegFp - Fit empirical Bayesian linear model with Mackay fixed point method (p.168) 21 | % linRegPred - Compute linear regression model reponse y = w'*X+w0 and likelihood 22 | % linRnd - Generate data from a linear model p(t|w,x)=G(w'x+w0,sigma), sigma=sqrt(1/beta) 23 | % CHAPTER04 24 | % binPlot - Plot binary classification result for 2d data 25 | % fda - Fisher (linear) discriminant analysis 26 | % logitBin - Logistic regression for binary classification optimized by Newton-Raphson method. 27 | % logitBinPred - Prediction of binary logistic regression model 28 | % logitMn - Multinomial regression for multiclass problem (Multinomial likelihood) 29 | % logitMnPred - Prediction of multiclass (multinomial) logistic regression model 30 | % sigmoid - Sigmod function 31 | % softmax - Softmax function 32 | % CHAPTER05 33 | % mlpClass - Train a multilayer perceptron neural network for classification with backpropagation 34 | % mlpClassPred - Multilayer perceptron classification prediction 35 | % mlpReg - Train a multilayer perceptron neural network for regression with backpropagation 36 | % mlpRegPred - Multilayer perceptron regression prediction 37 | % CHAPTER06 38 | % kn2sd - Transform a kernel matrix (or inner product matrix) to a squared distance matrix 39 | % knCenter - Centerize the data in the kernel space 40 | % knGauss - Gaussian (RBF) kernel K = exp(-|x-y|/(2s)); 41 | % knKmeans - Perform kernel kmeans clustering. 42 | % knKmeansPred - Prediction for kernel kmeans clusterng 43 | % knLin - Linear kernel (inner product) 44 | % knPca - Kernel PCA 45 | % knPcaPred - Prediction for kernel PCA 46 | % knPoly - Polynomial kernel k(x,y)=(x'y+c)^o 47 | % knReg - Gaussian process (kernel) regression 48 | % knRegPred - Prediction for Gaussian Process (kernel) regression model 49 | % sd2kn - Transform a squared distance matrix to a kernel matrix. 50 | % CHAPTER07 51 | % rvmBinFp - Relevance Vector Machine (ARD sparse prior) for binary classification. 52 | % rvmBinPred - Prodict the label for binary logistic regression model 53 | % rvmRegFp - Relevance Vector Machine (ARD sparse prior) for regression 54 | % rvmRegPred - Compute RVM regression model reponse y = w'*X+w0 and likelihood 55 | % rvmRegSeq - Sparse Bayesian Regression (RVM) using sequential algorithm 56 | % CHAPTER08 57 | % MRF 58 | % mrfBethe - Compute Bethe energy 59 | % mrfBp - Undirected graph belief propagation for MRF 60 | % mrfGibbs - Compute Gibbs energy 61 | % mrfIsGa - Contruct a latent Ising MRF with Gaussian observation 62 | % mrfMf - Mean field for MRF 63 | % NaiveBayes 64 | % nbBern - Naive bayes classifier with indepenet Bernoulli. 65 | % nbBernPred - Prediction of naive Bayes classifier with independent Bernoulli. 66 | % nbGauss - Naive bayes classifier with indepenet Gaussian 67 | % nbGaussPred - Prediction of naive Bayes classifier with independent Gaussian. 68 | % CHAPTER09 69 | % kmeans - Perform kmeans clustering. 70 | % kmeansPred - Prediction for kmeans clusterng 71 | % kmeansRnd - Generate samples from a Gaussian mixture distribution with common variances (kmeans model). 72 | % kmedoids - Perform k-medoids clustering. 73 | % kseeds - Perform kmeans++ seeding 74 | % linRegEm - Fit empirical Bayesian linear regression model with EM (p.448 chapter 9.3.4) 75 | % mixBernEm - Perform EM algorithm for fitting the Bernoulli mixture model. 76 | % mixBernRnd - Generate samples from a Bernoulli mixture distribution. 77 | % mixGaussEm - Perform EM algorithm for fitting the Gaussian mixture model. 78 | % mixGaussPred - Predict label and responsibility for Gaussian mixture model. 79 | % mixGaussRnd - Genarate samples form a Gaussian mixture model. 80 | % rvmBinEm - Relevance Vector Machine (ARD sparse prior) for binary classification. 81 | % rvmRegEm - Relevance Vector Machine (ARD sparse prior) for regression 82 | % CHAPTER10 83 | % linRegVb - Variational Bayesian inference for linear regression. 84 | % mixGaussEvidence - Variational lower bound of the model evidence (log of marginal likelihood) 85 | % mixGaussVb - Variational Bayesian inference for Gaussian mixture. 86 | % mixGaussVbPred - Predict label and responsibility for Gaussian mixture model trained by VB. 87 | % rvmRegVb - Variational Bayesian inference for RVM regression. 88 | % CHAPTER11 89 | % dirichletRnd - Generate samples from a Dirichlet distribution. 90 | % discreteRnd - Generate samples from a discrete distribution (multinomial). 91 | % Gauss - Class for Gaussian distribution used by Dirichlet process 92 | % gaussRnd - Generate samples from a Gaussian distribution. 93 | % GaussWishart - Class for Gaussian-Wishart distribution used by Dirichlet process 94 | % mixDpGb - Collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 95 | % mixDpGbOl - Online collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 96 | % mixGaussGb - Collapsed Gibbs sampling for Dirichlet process (infinite) Gaussian mixture model (a.k.a. DPGM). 97 | % mixGaussSample - Genarate samples form a Gaussian mixture model with GaussianWishart prior. 98 | % CHAPTER12 99 | % fa - Perform EM algorithm for factor analysis model 100 | % pca - Principal component analysis 101 | % pcaEm - Perform EM-like algorithm for PCA (by Sam Roweis). 102 | % pcaEmC - Perform Constrained EM like algorithm for PCA. 103 | % ppcaEm - Perform EM algorithm to maiximize likelihood of probabilistic PCA model. 104 | % ppcaRnd - Generate data from probabilistic PCA model 105 | % ppcaVb - Perform variatioanl Bayeisan inference for probabilistic PCA model. 106 | % CHAPTER13 107 | % HMM 108 | % hmmEm - EM algorithm to fit the parameters of HMM model (a.k.a Baum-Welch algorithm) 109 | % hmmFilter - HMM forward filtering algorithm. 110 | % hmmRnd - Generate a data sequence from a hidden Markov model. 111 | % hmmSmoother - HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm). 112 | % hmmViterbi - Viterbi algorithm (calculated in log scale to improve numerical stability). 113 | % LDS 114 | % kalmanFilter - Kalman filter (forward algorithm for linear dynamic system) 115 | % kalmanSmoother - Kalman smoother (forward-backward algorithm for linear dynamic system) 116 | % ldsEm - EM algorithm for parameter estimation of linear dynamic system. 117 | % ldsPca - Subspace method for learning linear dynamic system. 118 | % ldsRnd - Generate a data sequence from linear dynamic system. 119 | % CHAPTER14 120 | % adaboostBin - Adaboost for binary classification (weak learner: kmeans) 121 | % adaboostBinPred - Prediction of binary Adaboost 122 | % mixLinPred - Prediction function for mxiture of linear regression 123 | % mixLinReg - Mixture of linear regression 124 | % mixLinRnd - Generate data from mixture of linear model 125 | % mixLogitBin - Mixture of logistic regression model for binary classification optimized by Newton-Raphson method 126 | % mixLogitBinPred - Prediction function for mixture of logistic regression 127 | --------------------------------------------------------------------------------