├── init.m
├── .gitignore
├── demo
    ├── ch08
    │   ├── letterX.mat
    │   ├── nbGauss_demo.m
    │   ├── nbBern_demo.m
    │   └── mrf_demo.m
    ├── ch09
    │   ├── linRegEm_demo.m
    │   ├── mixBernEm_demo.m
    │   ├── kmedoids_demo.m
    │   ├── kmeans_mixGaussEm_demo.m
    │   ├── mixGaussEm_demo.m
    │   └── kmeans_demo.m
    ├── ch14
    │   ├── adaboostBin_demo.m
    │   ├── mixLogitBin_demo.m
    │   └── mixLinReg_demo.m
    ├── ch04
    │   ├── logitMn_demo.m
    │   └── logitBin_demo.m
    ├── ch12
    │   ├── fa_demo.m
    │   ├── ppcaVb_demo.m
    │   ├── ppcaEm_demo.m
    │   └── pca_demo.m
    ├── ch06
    │   ├── knCenter_demo.m
    │   ├── knReg_demo.m
    │   ├── knKmeans_demo.m
    │   └── knLin_demo.m
    ├── ch07
    │   ├── rvmBinEm_demo.m
    │   ├── rvmBinFp_demo.m
    │   ├── rvmRegEm_demo.m
    │   ├── rvmRegSeq_demo.m
    │   ├── rvmRegFp_demo.m
    │   ├── rvmRegEm_spSignal_demo.m
    │   ├── rvmRegSeq_spSignal_demo.m
    │   └── rvmRegFp_spSignal_demo.m
    ├── ch03
    │   ├── linReg_demo.m
    │   ├── linRegEm_demo.m
    │   └── linRegFp_demo.m
    ├── ch11
    │   ├── mixGaussGb_demo.m
    │   └── gauss_demo.m
    ├── ch13
    │   ├── hmm_demo.m
    │   └── lds_demo.m
    ├── ch10
    │   ├── rvmRegVb_demo.m
    │   ├── mixGaussVb_demo.m
    │   └── rvmRegVb_spSignal_demo.m
    ├── ch05
    │   └── mlp_demo.m
    └── ch01
    │   └── info_demo.m
├── chapter04
    ├── sigmoid.m
    ├── softmax.m
    ├── logitBinPred.m
    ├── logitMnPred.m
    ├── binPlot.m
    ├── fda.m
    ├── logitBin.m
    └── logitMn.m
├── common
    ├── randp.m
    ├── maxdiff.m
    ├── ud.m
    ├── logdet.m
    ├── log1mexp.m
    ├── standardize.m
    ├── log1pexp.m
    ├── solvpd.m
    ├── isequalf.m
    ├── sqdist.m
    ├── invpd.m
    ├── lognormexp.m
    ├── normalize.m
    ├── unitize.m
    ├── logsumexp.m
    ├── gson.m
    ├── symeig.m
    ├── plotCurveBar.m
    ├── besseliLn.m
    ├── sub.m
    ├── mgson.m
    ├── slice.m
    ├── lattice.m
    ├── plotgm.m
    ├── loggmpdf.m
    ├── plotkde.m
    ├── ld.m
    └── plotClass.m
├── chapter06
    ├── kn2sd.m
    ├── knLin.m
    ├── sd2kn.m
    ├── knPca.m
    ├── knPcaPred.m
    ├── knGauss.m
    ├── knPoly.m
    ├── knKmeansPred.m
    ├── knReg.m
    ├── knRegPred.m
    ├── knKmeans.m
    └── knCenter.m
├── chapter02
    ├── logMn.m
    ├── logKde.m
    ├── logVmf.m
    ├── logWishart.m
    ├── logDirichlet.m
    ├── logMvGamma.m
    ├── logGauss.m
    └── logSt.m
├── chapter01
    ├── entropy.m
    ├── jointEntropy.m
    ├── relatEntropy.m
    ├── condEntropy.m
    ├── mutInfo.m
    ├── nvi.m
    └── nmi.m
├── chapter11
    ├── dirichletRnd.m
    ├── discreteRnd.m
    ├── gaussRnd.m
    ├── mixGaussSample.m
    ├── mixGaussGb.m
    ├── mixDpGbOl.m
    ├── mixDpGb.m
    ├── Gauss.m
    └── GaussWishart.m
├── chapter08
    ├── MRF
    │   ├── mrfGibbs.m
    │   ├── mrfBethe.m
    │   ├── mrfIsGa.m
    │   ├── mrfMf.m
    │   └── mrfBp.m
    └── NaiveBayes
    │   ├── nbBernPred.m
    │   ├── nbBern.m
    │   ├── nbGauss.m
    │   └── nbGaussPred.m
├── chapter09
    ├── kseeds.m
    ├── kmeansPred.m
    ├── mixBernRnd.m
    ├── kmeansRnd.m
    ├── kmedoids.m
    ├── mixGaussPred.m
    ├── kmeans.m
    ├── mixGaussRnd.m
    ├── mixBernEm.m
    ├── linRegEm.m
    ├── rvmRegEm.m
    ├── rvmBinEm.m
    └── mixGaussEm.m
├── chapter05
    ├── mlpRegPred.m
    ├── mlpClassPred.m
    ├── mlpReg.m
    └── mlpClass.m
├── chapter07
    ├── rvmBinPred.m
    ├── rvmRegPred.m
    ├── rvmRegFp.m
    ├── rvmBinFp.m
    └── rvmRegSeq.m
├── chapter14
    ├── mixLogitBinPred.m
    ├── mixLinRnd.m
    ├── adaboostBinPred.m
    ├── mixLinPred.m
    ├── adaboostBin.m
    ├── mixLogitBin.m
    └── mixLinReg.m
├── chapter12
    ├── ppcaRnd.m
    ├── pca.m
    ├── pcaEmC.m
    ├── pcaEm.m
    ├── fa.m
    ├── ppcaEm.m
    └── ppcaVb.m
├── chapter03
    ├── linRnd.m
    ├── linRegPred.m
    ├── linReg.m
    └── linRegFp.m
├── chapter13
    ├── HMM
    │   ├── hmmRnd.m
    │   ├── hmmViterbi.m
    │   ├── hmmFilter.m
    │   ├── hmmSmoother.m
    │   └── hmmEm.m
    └── LDS
    │   ├── ldsPca.m
    │   ├── ldsRnd.m
    │   ├── kalmanFilter.m
    │   ├── ldsEm.m
    │   └── kalmanSmoother.m
├── LICENSE
├── chapter10
    ├── mixGaussVbPred.m
    ├── rvmRegVb.m
    ├── linRegVb.m
    ├── mixGaussEvidence.m
    └── mixGaussVb.m
├── README.md
└── Contents.m


/init.m:
--------------------------------------------------------------------------------
1 | addpath(genpath(pwd));


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | reference/*
2 | *.m~
3 | *.asv


--------------------------------------------------------------------------------
/demo/ch08/letterX.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PRML/PRMLT/HEAD/demo/ch08/letterX.mat


--------------------------------------------------------------------------------
/chapter04/sigmoid.m:
--------------------------------------------------------------------------------
1 | function y = sigmoid(x)
2 | % Sigmod function
3 | % Written by Mo Chen (sth4nth@gmail.com).
4 | y = exp(-log1pexp(-x));


--------------------------------------------------------------------------------
/common/randp.m:
--------------------------------------------------------------------------------
1 | function i = randp(p)
2 | % Sample a integer in [1:k] with given probability p
3 | i = find(rand<cumsum(normalize(p)),1);
4 | 


--------------------------------------------------------------------------------
/common/maxdiff.m:
--------------------------------------------------------------------------------
1 | function z = maxdiff(x, y)
2 | % Written by Mo Chen (sth4nth@gmail.com).
3 | assert(all(size(x)==size(y)));
4 | z = max(abs(x(:)-y(:)));
5 | 
6 | 


--------------------------------------------------------------------------------
/demo/ch09/linRegEm_demo.m:
--------------------------------------------------------------------------------
1 | %% Empirical Bayesian linear regression via EM
2 | close all; clear;
3 | d = 5;
4 | n = 200;
5 | [x,t] = linRnd(d,n);
6 | [model,llh] = linRegEm(x,t);
7 | plot(llh);
8 | 


--------------------------------------------------------------------------------
/common/ud.m:
--------------------------------------------------------------------------------
1 | function [U, D] = ud(X)
2 | % UD factorization U'*D*U=X'*X;
3 | % Written by Mo Chen (sth4nth@gmail.com).
4 | [~,R] = qr(X,0);
5 | d = diag(R);
6 | D = d.^2;
7 | U = bsxfun(@times,R,1./d);


--------------------------------------------------------------------------------
/demo/ch09/mixBernEm_demo.m:
--------------------------------------------------------------------------------
1 | %% Bernoulli Mixture via EM
2 | close all; clear;
3 | d = 2;
4 | k = 3;
5 | n = 5000;
6 | [X,z,mu] = mixBernRnd(d,k,n);
7 | [label,model,llh] = mixBernEm(X,k);
8 | plot(llh);
9 | 


--------------------------------------------------------------------------------
/demo/ch14/adaboostBin_demo.m:
--------------------------------------------------------------------------------
 1 | 
 2 | %% adaboost
 3 | d = 2;
 4 | k = 2;
 5 | n = 500;
 6 | [X,t] = kmeansRnd(d,k,n);
 7 | model = adaboostBin(X,t-1);
 8 | y = adaboostBinPred(model,X);
 9 | plotClass(X,y+1)
10 | 
11 | 


--------------------------------------------------------------------------------
/demo/ch04/logitMn_demo.m:
--------------------------------------------------------------------------------
1 | %% Logistic logistic regression for multiclass classification
2 | clear
3 | k = 3;
4 | n = 1000;
5 | [X,t] = kmeansRnd(2,k,n);
6 | [model, llh] = logitMn(X,t);
7 | y = logitMnPred(model,X);
8 | plotClass(X,y)
9 | 


--------------------------------------------------------------------------------
/demo/ch12/fa_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch12
 2 | 
 3 | clear; close all;
 4 | d = 3;
 5 | m = 2;
 6 | n = 1000;
 7 | 
 8 | X = ppcaRnd(m,d,n);
 9 | plotClass(X);
10 | 
11 | %% Factor analysis
12 | [W, mu, psi, llh] = fa(X, m);
13 | plot(llh);
14 | 


--------------------------------------------------------------------------------
/demo/ch14/mixLogitBin_demo.m:
--------------------------------------------------------------------------------
 1 | %% Mixture of logistic regression
 2 | d = 2;
 3 | c = 4;
 4 | k = 4;
 5 | n = 500;
 6 | [X,t] = kmeansRnd(d,c,n);
 7 | 
 8 | model = mixLogitBin(X,t-1,k);
 9 | y = mixLogitBinPred(model,X);
10 | plotClass(X,y+1)


--------------------------------------------------------------------------------
/common/logdet.m:
--------------------------------------------------------------------------------
1 | function y = logdet(A)
2 | % Compute log(det(A)) where A is positive definite.
3 | % Written by Michael Chen (sth4nth@gmail.com).
4 | [U,p] = chol(A);
5 | if p > 0
6 |     y = -inf;
7 | else
8 |     y = 2*sum(log(diag(U)));
9 | end


--------------------------------------------------------------------------------
/demo/ch12/ppcaVb_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch12
 2 | clear; close all;
 3 | d = 3;
 4 | m = 2;
 5 | n = 1000;
 6 | 
 7 | X = ppcaRnd(m,d,n);
 8 | plotClass(X);
 9 | %% Variational Bayesian probabilistic PCA
10 | [model, L] = ppcaVb(X);
11 | plot(L);
12 | 


--------------------------------------------------------------------------------
/demo/ch06/knCenter_demo.m:
--------------------------------------------------------------------------------
1 | %% demo for knCenter
2 | clear; close all;
3 | kn = @knGauss;
4 | X=rand(2,100);
5 | X1=rand(2,10);
6 | X2=rand(2,5);
7 | 
8 | maxdiff(knCenter(kn,X,X1),diag(knCenter(kn,X,X1,X1))')
9 | maxdiff(knCenter(kn,X),knCenter(kn,X,X,X))


--------------------------------------------------------------------------------
/demo/ch12/ppcaEm_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch12
 2 | 
 3 | clear; close all;
 4 | d = 3;
 5 | m = 2;
 6 | n = 1000;
 7 | 
 8 | X = ppcaRnd(m,d,n);
 9 | plotClass(X);
10 | 
11 | %% EM probabilistic PCA
12 | [W,mu,beta,llh] = ppcaEm(X,m);
13 | plot(llh)
14 | 


--------------------------------------------------------------------------------
/common/log1mexp.m:
--------------------------------------------------------------------------------
1 | function y = log1mexp(x)
2 | % Accurately compute y = log(1-exp(x))
3 | % reference: Accurately Computing log(1-exp(-|a|)) Martin Machler
4 | y = x;
5 | i = x < -log(2);
6 | y(i) = log1p(-exp(x(i)));
7 | y(~i) = log(-expm1(x(~i)));
8 | 


--------------------------------------------------------------------------------
/demo/ch09/kmedoids_demo.m:
--------------------------------------------------------------------------------
 1 | close all; clear;
 2 | d = 2;
 3 | k = 3;
 4 | n = 5000;
 5 | [X,label] = kmeansRnd(d,k,n);
 6 | init = ceil(k*rand(1,n));
 7 | [y, idx, v] = kmedoids(X,init);
 8 | plotClass(X,label);
 9 | figure;
10 | plotClass(X,y);
11 | 
12 | 


--------------------------------------------------------------------------------
/common/standardize.m:
--------------------------------------------------------------------------------
1 | function [Y, s] = standardize(X)
2 | % Unitize the vectors to be unit length
3 | %   By default dim = 1 (columns).
4 | % Written by Mo Chen (sth4nth@gmail.com).
5 | X = bsxfun(@minux,X,mean(X,2));
6 | s = sqrt(mean(sum(X.^2,1)));
7 | Y = X/s;


--------------------------------------------------------------------------------
/common/log1pexp.m:
--------------------------------------------------------------------------------
1 | function y = log1pexp(x)
2 | % Accurately compute y = log(1+exp(x))
3 | % reference: Accurately Computing log(1-exp(-|a|)) Martin Machler
4 | y = x;
5 | i = x > 18;
6 | j = i & (x <= 33.3);
7 | y(~i) = log1p(exp(x(~i)));
8 | y(j) = x(j)+exp(-x(j));
9 | 


--------------------------------------------------------------------------------
/demo/ch07/rvmBinEm_demo.m:
--------------------------------------------------------------------------------
 1 | %% RVM for classification
 2 | clear; close all
 3 | k = 2;
 4 | d = 2;
 5 | n = 1000;
 6 | [X,t] = kmeansRnd(d,k,n);
 7 | 
 8 | [model, llh] = rvmBinEm(X,t-1);
 9 | plot(llh);
10 | y = rvmBinPred(model,X)+1;
11 | figure;
12 | plotClass(X,y);
13 | 


--------------------------------------------------------------------------------
/demo/ch07/rvmBinFp_demo.m:
--------------------------------------------------------------------------------
 1 | %% RVM for classification
 2 | clear; close all
 3 | k = 2;
 4 | d = 2;
 5 | n = 1000;
 6 | [X,t] = kmeansRnd(d,k,n);
 7 | 
 8 | [model, llh] = rvmBinFp(X,t-1);
 9 | plot(llh);
10 | y = rvmBinPred(model,X)+1;
11 | figure;
12 | plotClass(X,y);
13 | 


--------------------------------------------------------------------------------
/demo/ch08/nbGauss_demo.m:
--------------------------------------------------------------------------------
 1 | d = 2;
 2 | k = 3;
 3 | n = 1000;
 4 | [X, t] = kmeansRnd(d,k,n);
 5 | plotClass(X,t);
 6 | 
 7 | m = floor(n/2);
 8 | X1 = X(:,1:m);
 9 | X2 = X(:,(m+1):end);
10 | t1 = t(1:m);
11 | model = nbGauss(X1,t1);
12 | y2 = nbGaussPred(model,X2);
13 | plotClass(X2,y2);


--------------------------------------------------------------------------------
/demo/ch03/linReg_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch03
 2 | clear; close all;
 3 | d = 1;
 4 | n = 200;
 5 | [x,t] = linRnd(d,n);
 6 | %% Linear regression
 7 | model = linReg(x,t);
 8 | [y,sigma] = linRegPred(model,x,t);
 9 | plotCurveBar( x, y, sigma );
10 | hold on;
11 | plot(x,t,'o');
12 | hold off;


--------------------------------------------------------------------------------
/common/solvpd.m:
--------------------------------------------------------------------------------
1 | function V = solvpd(A,B)
2 | % Compute A\B where A is a positive definite matrix
3 | %   A: a positive difinie matrix
4 | % Written by Mo Chen (sth4nth@gmail.com).
5 | [U,p] = chol(A);
6 | if p > 0
7 |     error('ERROR: the matrix is not positive definite.');
8 | end
9 | V = U\(U'\B);


--------------------------------------------------------------------------------
/common/isequalf.m:
--------------------------------------------------------------------------------
 1 | function z = isequalf(x, y, tol)
 2 | % Determine whether two float number x and y are equal up to precision tol
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | if nargin < 3
 5 |     tol = 1e-8;
 6 | end
 7 | assert(all(size(x)==size(y)));
 8 | z = max(abs(x(:)-y(:)))<tol;
 9 | 
10 | 


--------------------------------------------------------------------------------
/chapter04/softmax.m:
--------------------------------------------------------------------------------
 1 | function [Y,s] = softmax(X, dim)
 2 | % Softmax function
 3 | %   By default dim = 1 (columns).
 4 | % Written by Mo Chen (sth4nth@gmail.com).
 5 | if nargin == 1
 6 |     dim = find(size(X)~=1,1);
 7 |     if isempty(dim), dim = 1; end
 8 | end
 9 | s = logsumexp(X,dim);
10 | Y = exp(X-s);
11 | 


--------------------------------------------------------------------------------
/common/sqdist.m:
--------------------------------------------------------------------------------
1 | function D = sqdist(X1, X2)
2 | % Pairwise square Euclidean distance between two sample sets
3 | % Input:
4 | %   X1, X2: dxn1 dxn2 sample matrices
5 | % Output:
6 | %   D: n1 x n2 square Euclidean distance matrix
7 | % Written by Mo Chen (sth4nth@gmail.com).
8 | D = dot(X1,X1,1)'+dot(X2,X2,1)-2*(X1'*X2);
9 | 


--------------------------------------------------------------------------------
/demo/ch04/logitBin_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch04
 2 | 
 3 | %% Logistic logistic regression for binary classification
 4 | close all;
 5 | clear; 
 6 | d = 2;
 7 | k = 2;
 8 | n = 1000;
 9 | [X,t] = kmeansRnd(d,k,n);
10 | [model, llh] = logitBin(X,t-1);
11 | plot(llh);
12 | y = logitBinPred(model,X)+1;
13 | figure
14 | binPlot(model,X,y)


--------------------------------------------------------------------------------
/chapter06/kn2sd.m:
--------------------------------------------------------------------------------
 1 | function D = kn2sd(K)
 2 | % Transform a kernel matrix (or inner product matrix) to a squared distance matrix
 3 | % Input:
 4 | %   K: n x n kernel matrix
 5 | % Ouput:
 6 | %   D: n x n squared distance matrix
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | d = diag(K);
 9 | D = -2*K+bsxfun(@plus,d,d');
10 | 


--------------------------------------------------------------------------------
/common/invpd.m:
--------------------------------------------------------------------------------
 1 | function W = invpd(M)
 2 | % Compute A\B where A is a positive definite matrix
 3 | % Input:
 4 | %   M: a positive difinie matrix
 5 | % Written by Michael Chen (sth4nth@gmail.com).
 6 | [U,p] = chol(M);
 7 | if p > 0
 8 |     error('ERROR: the matrix is not positive definite.');
 9 | end
10 | V = inv(U);
11 | W = V*V';


--------------------------------------------------------------------------------
/demo/ch03/linRegEm_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch03
 2 | clear; close all;
 3 | d = 1;
 4 | n = 200;
 5 | [x,t] = linRnd(d,n);
 6 | %% Empirical Bayesian linear regression via EM
 7 | [model,llh] = linRegEm(x,t);
 8 | plot(llh);
 9 | [y,sigma] = linRegPred(model,x,t);
10 | figure
11 | plotCurveBar(x,y,sigma);
12 | hold on;
13 | plot(x,t,'o');
14 | hold off;


--------------------------------------------------------------------------------
/demo/ch14/mixLinReg_demo.m:
--------------------------------------------------------------------------------
 1 | %% Mixture of linear regression
 2 | close all; clear
 3 | d = 1;
 4 | k = 2;
 5 | n = 500;
 6 | [X,y] = mixLinRnd(d,k,n);
 7 | plot(X,y,'.');
 8 | [label,model,llh] = mixLinReg(X, y, k);
 9 | plotClass([X;y],label);
10 | figure
11 | plot(llh);
12 | [y_,z,p] = mixLinPred(model,X,y);
13 | figure;
14 | plotClass([X;y],label);


--------------------------------------------------------------------------------
/demo/ch06/knReg_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch06
 2 | 
 3 | 
 4 | %% Kernel regression with gaussian kernel
 5 | clear; close all;
 6 | n = 100;
 7 | x = linspace(0,2*pi,n);   % test data
 8 | t = sin(x)+rand(1,n)/2;
 9 | model = knReg(x,t,1e-4,@knGauss);
10 | [y,s] = knRegPred(model,x);
11 | plotCurveBar(x,y,s);
12 | hold on;
13 | plot(x,t,'o');
14 | hold off;


--------------------------------------------------------------------------------
/chapter02/logMn.m:
--------------------------------------------------------------------------------
 1 | function z = logMn(x, p)
 2 | % Compute log pdf of a multinomial distribution.
 3 | % Input:
 4 | %   x: d x 1 integer vector 
 5 | %   p: d x 1 probability
 6 | % Output:
 7 | %   z: probability density in logrithm scale z=log p(x)
 8 | % Written by Mo Chen (sth4nth@gmail.com).    
 9 | z = gammaln(sum(x)+1)-sum(gammaln(x+1))+dot(x,log(p));
10 | 


--------------------------------------------------------------------------------
/demo/ch08/nbBern_demo.m:
--------------------------------------------------------------------------------
 1 | %% Naive Bayes with independent Bernoulli
 2 | close all; clear;
 3 | d = 10;
 4 | k = 2;
 5 | n = 2000;
 6 | [X,t,mu] = mixBernRnd(d,k,n);
 7 | m = floor(n/2);
 8 | X1 = X(:,1:m);
 9 | X2 = X(:,(m+1):end);
10 | t1 = t(1:m);
11 | t2 = t((m+1):end);
12 | model = nbBern(X1,t1);
13 | y2 = nbBernPred(model,X2);
14 | err = sum(t2~=y2)/numel(t2);


--------------------------------------------------------------------------------
/chapter01/entropy.m:
--------------------------------------------------------------------------------
 1 | function z = entropy(x)
 2 | % Compute entropy z=H(x) of a discrete variable x.
 3 | % Input:
 4 | %   x: a integer vectors  
 5 | % Output:
 6 | %   z: entropy z=H(x)
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | n = numel(x);
 9 | [~,~,x] = unique(x);
10 | Px = accumarray(x, 1)/n;
11 | Hx = -dot(Px,log2(Px));
12 | z = max(0,Hx);


--------------------------------------------------------------------------------
/common/lognormexp.m:
--------------------------------------------------------------------------------
 1 | function [Y,s] = lognormexp(X, dim)
 2 | % Compute log(normalize(exp(x),dim)) while avoiding numerical underflow.
 3 | %   By default dim = 1 (columns).
 4 | % Written by Mo Chen (sth4nth@gmail.com).
 5 | if nargin == 1
 6 |     dim = find(size(X)~=1,1);
 7 |     if isempty(dim), dim = 1; end
 8 | end
 9 | s = logsumexp(X,dim);
10 | Y = X-s;
11 | 


--------------------------------------------------------------------------------
/demo/ch09/kmeans_mixGaussEm_demo.m:
--------------------------------------------------------------------------------
 1 | 
 2 | %% Gauss mixture initialized by kmeans
 3 | close all; clear;
 4 | d = 2;
 5 | k = 3;
 6 | n = 500;
 7 | [X,label] = mixGaussRnd(d,k,n);
 8 | init = kmeans(X,k);
 9 | [z,model,llh] = mixGaussEm(X,init);
10 | plotClass(X,label);
11 | figure;
12 | plotClass(X,init);
13 | figure;
14 | plotClass(X,z);
15 | figure;
16 | plot(llh);
17 | 
18 | 


--------------------------------------------------------------------------------
/demo/ch11/mixGaussGb_demo.m:
--------------------------------------------------------------------------------
 1 | %% Collapse Gibbs sampling for Dirichelt process gaussian mixture model
 2 | close all; clear;
 3 | d = 2;
 4 | k = 3;
 5 | n = 500;
 6 | [X,z] = mixGaussRnd(d,k,n);
 7 | plotClass(X,z);
 8 | 
 9 | [z,Theta,w,llh] = mixGaussGb(X);
10 | figure
11 | plotClass(X,z);
12 | 
13 | [X,z] = mixGaussSample(Theta,w,n);
14 | figure
15 | plotClass(X,z);
16 | 
17 | 


--------------------------------------------------------------------------------
/chapter06/knLin.m:
--------------------------------------------------------------------------------
 1 | function K = knLin(X, Y)
 2 | % Linear kernel (inner product)
 3 | % Input:
 4 | %   X: d x nx data matrix
 5 | %   Y: d x ny data matrix
 6 | % Ouput:
 7 | %   K: nx x ny kernel matrix
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | if nargin < 2 || isempty(Y)  
10 |     K = dot(X,X,1);            % norm in kernel space
11 | else
12 |     K = X'*Y;
13 | end
14 | 


--------------------------------------------------------------------------------
/common/normalize.m:
--------------------------------------------------------------------------------
 1 | function [Y, s] = normalize(X, dim)
 2 | % Normalize the vectors to be summing to one
 3 | %   By default dim = 1 (columns).
 4 | % Written by Michael Chen (sth4nth@gmail.com).
 5 | if nargin == 1
 6 |     % Determine which dimension sum will use
 7 |     dim = find(size(X)~=1,1);
 8 |     if isempty(dim), dim = 1; end
 9 | end
10 | s = sum(X,dim);
11 | Y = X./s;


--------------------------------------------------------------------------------
/common/unitize.m:
--------------------------------------------------------------------------------
 1 | function [Y, s] = unitize(X, dim)
 2 | % Unitize the vectors to be unit length
 3 | %   By default dim = 1 (columns).
 4 | % Written by Mo Chen (sth4nth@gmail.com).
 5 | if nargin == 1
 6 |     % Determine which dimension sum will use
 7 |     dim = find(size(X)~=1,1);
 8 |     if isempty(dim), dim = 1; end
 9 | end
10 | s = sqrt(dot(X,X,dim));
11 | Y = bsxfun(@times,X,1./s);


--------------------------------------------------------------------------------
/demo/ch03/linRegFp_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch03
 2 | clear; close all;
 3 | d = 1;
 4 | n = 200;
 5 | [x,t] = linRnd(d,n);
 6 | %%  Empirical Bayesian linear regression via Mackay fix point iteration method
 7 | [model,llh] = linRegFp(x,t);
 8 | plot(llh);
 9 | [y,sigma] = linRegPred(model,x,t);
10 | figure
11 | plotCurveBar(x,y,sigma);
12 | hold on;
13 | plot(x,t,'o');
14 | hold off;
15 | %%
16 | 
17 | 


--------------------------------------------------------------------------------
/chapter11/dirichletRnd.m:
--------------------------------------------------------------------------------
 1 | function x = dirichletRnd(a, m)
 2 | % Generate samples from a Dirichlet distribution.
 3 | % Input:
 4 | %   a: k dimensional vector
 5 | %   m: k dimensional mean vector
 6 | % Outpet:
 7 | %   x: generated sample x~Dir(a,m)
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | if nargin == 2
10 |     a = a*m;
11 | end
12 | x = gamrnd(a,1);
13 | x = x/sum(x);
14 | 


--------------------------------------------------------------------------------
/chapter08/MRF/mrfGibbs.m:
--------------------------------------------------------------------------------
 1 | function lnZ = mrfGibbs(A, nodePot, edgePot, nodeBel)
 2 | % Compute Gibbs energy
 3 | [s,t,e] = find(triu(A));
 4 | edgeBel = zeros(size(edgePot));
 5 | for l = 1:numel(e)
 6 |     edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
 7 | end
 8 | Ex = dot(nodeBel(:),nodePot(:));
 9 | Exy = dot(edgeBel(:),edgePot(:));
10 | Hx = -dot(nodeBel(:),log(nodeBel(:)));
11 | lnZ = Ex+Exy+Hx;


--------------------------------------------------------------------------------
/chapter09/kseeds.m:
--------------------------------------------------------------------------------
 1 | function mu = kseeds(X, k)
 2 | % Perform kmeans++ seeding
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   k: number of seeds
 6 | % Output:
 7 | %   mu: d x k seeds
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | n = size(X,2);
10 | D = inf(1,n);
11 | mu = X(:,ceil(n*rand));
12 | for i = 2:k
13 |     D = min(D,sum((X-mu(:,i-1)).^2,1));
14 |     mu(:,i) = X(:,randp(D));
15 | end
16 | 


--------------------------------------------------------------------------------
/demo/ch07/rvmRegEm_demo.m:
--------------------------------------------------------------------------------
 1 | %% regression
 2 | d = 100;
 3 | beta = 1e-1;
 4 | X = rand(1,d);
 5 | w = randn;
 6 | b = randn;
 7 | t = w'*X+b+beta*randn(1,d);
 8 | x = linspace(min(X),max(X),d);   % test data
 9 | 
10 | %% RVM regression by EM
11 | [model,llh] = rvmRegEm(X,t);
12 | plot(llh);
13 | [y, sigma] = linRegPred(model,x,t);
14 | figure
15 | plotCurveBar(x,y,sigma);
16 | hold on;
17 | plot(X,t,'o');
18 | hold off


--------------------------------------------------------------------------------
/common/logsumexp.m:
--------------------------------------------------------------------------------
 1 | function s = logsumexp(X, dim)
 2 | % Compute log(sum(exp(X),dim)) while avoiding numerical underflow.
 3 | %   By default dim = 1 (columns).
 4 | % Written by Mo Chen (sth4nth@gmail.com).
 5 | if nargin == 1
 6 |     dim = find(size(X)~=1,1);
 7 |     if isempty(dim), dim = 1; end
 8 | end
 9 | a = max(X,[],dim);
10 | s = a+log(sum(exp(X-a),dim));   % TODO: use log1p
11 | i = isinf(a);
12 | s(i) = a(i);


--------------------------------------------------------------------------------
/chapter06/sd2kn.m:
--------------------------------------------------------------------------------
 1 | function K = sd2kn(D)
 2 | % Transform a squared distance matrix to a kernel matrix. 
 3 | % The data are assumed to be centered, i.e., H=eye(n)-ones(n)/n; K=-(H*D*H)/2.
 4 | % Input:
 5 | %   D: n x n squared distance matrix
 6 | % Ouput:
 7 | %   K: n x n kernel matrix
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | D = bsxfun(@minus,D,mean(D,1));
10 | D = bsxfun(@minus,D,mean(D,2));
11 | K = (D+D')/(-4);


--------------------------------------------------------------------------------
/demo/ch07/rvmRegSeq_demo.m:
--------------------------------------------------------------------------------
 1 | %% regression
 2 | d = 100;
 3 | beta = 1e-1;
 4 | X = rand(1,d);
 5 | w = randn;
 6 | b = randn;
 7 | t = w'*X+b+beta*randn(1,d);
 8 | x = linspace(min(X),max(X),d);   % test data
 9 | %% RVM regression by sequential update
10 | [model,llh] = rvmRegSeq(X,t);
11 | plot(llh);
12 | [y, sigma] = linRegPred(model,x,t);
13 | figure
14 | plotCurveBar(x,y,sigma);
15 | hold on;
16 | plot(X,t,'o');
17 | hold off


--------------------------------------------------------------------------------
/chapter08/NaiveBayes/nbBernPred.m:
--------------------------------------------------------------------------------
 1 | function y = nbBernPred(model, X)
 2 | % Prediction of naive Bayes classifier with independent Bernoulli.
 3 | % input:
 4 | %   model: trained model structure
 5 | %   X: d x n data matrix
 6 | % output:
 7 | %   y: 1 x n predicted class label
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | mu = model.mu;
10 | w = model.w;
11 | [~,y] = max(log(mu)'*X+log(1-mu)'*(1-X)+log(w(:)),[],1);
12 | 
13 | 


--------------------------------------------------------------------------------
/chapter09/kmeansPred.m:
--------------------------------------------------------------------------------
 1 | function [label, energy] = kmeansPred(mu, X)
 2 | % Prediction for kmeans clusterng
 3 | % Input:
 4 | %   model: dx k cluster center matrix
 5 | %   X: d x n testing data
 6 | % Output:
 7 | %   label: 1 x n cluster label
 8 | %   energy: optimization target value
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | [val,label] = min(dot(X,X,1)+dot(mu,mu,1)'-2*mu'*X,[],1); % assign labels
11 | energy = sum(val);


--------------------------------------------------------------------------------
/chapter11/discreteRnd.m:
--------------------------------------------------------------------------------
 1 | function x = discreteRnd(p, n)
 2 | % Generate samples from a discrete distribution (multinomial).
 3 | % Input:
 4 | %   p: k dimensional probability vector
 5 | %   n: number of samples
 6 | % Ouput:
 7 | %   x: k x n generated samples x~Mul(p)
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | if nargin == 1
10 |     n = 1;
11 | end
12 | [~,~,x] = histcounts(rand(1,n),[0;cumsum(p(:))]);
13 | 


--------------------------------------------------------------------------------
/common/gson.m:
--------------------------------------------------------------------------------
 1 | function [Q, R] = gson(X)
 2 | % Gram-Schmidt orthonormalization which produces the same result as [Q,R]=qr(X,0)
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | [d,n] = size(X);
 5 | m = min(d,n);
 6 | R = zeros(m,n);
 7 | Q = zeros(d,0);
 8 | for i = 1:m
 9 |     R(1:i-1,i) = Q'*X(:,i);
10 |     v = X(:,i)-Q*R(1:i-1,i);
11 |     R(i,i) = norm(v);
12 |     Q(:,i) = v/R(i,i);
13 | end
14 | R(:,m+1:n) = Q'*X(:,m+1:n);


--------------------------------------------------------------------------------
/demo/ch13/hmm_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for HMM in ch13
 2 | d = 3; k = 2; n = 10000;
 3 | [x,model] = hmmRnd(d,k,n);
 4 | %% Viterbi algorithm
 5 | [z, llh] = hmmViterbi(model, x);
 6 | %% HMM filter (forward algorithm)
 7 | [alpha, llh] = hmmFilter(model, x);
 8 | %% HMM smoother (forward backward)
 9 | [gamma,alpha,beta,c] = hmmSmoother(model, x);
10 | %% Baum-Welch algorithm
11 | [model, llh] = hmmEm(x,k);
12 | plot(llh)
13 | 


--------------------------------------------------------------------------------
/demo/ch10/rvmRegVb_demo.m:
--------------------------------------------------------------------------------
 1 | clear; close all;
 2 | 
 3 | d = 100;
 4 | beta = 1e-1;
 5 | X = rand(1,d);
 6 | w = randn;
 7 | b = randn;
 8 | t = w'*X+b+beta*randn(1,d);
 9 | x = linspace(min(X),max(X),d);   % test data
10 | 
11 | [model,llh] = linRegVb(X,t);
12 | % [model,llh] = rvmRegVb(X,t);
13 | plot(llh);
14 | [y, sigma] = linRegPred(model,x,t);
15 | figure
16 | plotCurveBar(x,y,sigma);
17 | hold on;
18 | plot(X,t,'o');
19 | hold off


--------------------------------------------------------------------------------
/chapter04/logitBinPred.m:
--------------------------------------------------------------------------------
 1 | function [y, p] = logitBinPred(model, X)
 2 | % Prediction of binary logistic regression model
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: d x n testing data
 6 | % Output:
 7 | %   y: 1 x n predict label (0/1)
 8 | %   p: 1 x n predict probability [0,1]
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | X = [X;ones(1,size(X,2))];
11 | w = model.w;
12 | p = sigmoid(w'*X);
13 | y = round(p);
14 | 
15 | 


--------------------------------------------------------------------------------
/demo/ch07/rvmRegFp_demo.m:
--------------------------------------------------------------------------------
 1 | %% regression
 2 | d = 100;
 3 | beta = 1e-1;
 4 | X = rand(1,d);
 5 | w = randn;
 6 | b = randn;
 7 | t = w'*X+b+beta*randn(1,d);
 8 | x = linspace(min(X),max(X),d);   % test data
 9 | 
10 | 
11 | %% RVM regression by Mackay fix point update
12 | [model,llh] = rvmRegFp(X,t);
13 | plot(llh);
14 | [y, sigma] = linRegPred(model,x,t);
15 | figure
16 | plotCurveBar(x,y,sigma);
17 | hold on;
18 | plot(X,t,'o');
19 | hold off


--------------------------------------------------------------------------------
/demo/ch09/mixGaussEm_demo.m:
--------------------------------------------------------------------------------
 1 | %% Gausssian Mixture via EM
 2 | close all; clear;
 3 | d = 2;
 4 | k = 3;
 5 | n = 1000;
 6 | [X,label] = mixGaussRnd(d,k,n);
 7 | plotClass(X,label);
 8 | 
 9 | m = floor(n/2);
10 | X1 = X(:,1:m);
11 | X2 = X(:,(m+1):end);
12 | % train
13 | [z1,model,llh] = mixGaussEm(X1,k);
14 | figure;
15 | plot(llh);
16 | figure;
17 | plotClass(X1,z1);
18 | % predict
19 | z2 = mixGaussPred(model,X2);
20 | figure;
21 | plotClass(X2,z2);


--------------------------------------------------------------------------------
/chapter02/logKde.m:
--------------------------------------------------------------------------------
 1 | function z = logKde (X, Y, sigma)
 2 | % Compute log pdf of kernel density estimator.
 3 | % Input:
 4 | %   X: d x n data matrix to be evaluate
 5 | %   Y: d x k data matrix served as database
 6 | % Output:
 7 | %   z: probability density in logrithm scale z=log p(x|y)
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | D = dot(X,X,1)+dot(Y,Y,1)'-2*(Y'*X);
10 | z = logsumexp(D/(-2*sigma^2),1)-0.5*log(2*pi)-log(sigma*size(Y,2),1);
11 | 


--------------------------------------------------------------------------------
/chapter08/NaiveBayes/nbBern.m:
--------------------------------------------------------------------------------
 1 | function model = nbBern(X, t)
 2 | % Naive bayes classifier with indepenet Bernoulli.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   t: 1 x n label (1~k)
 6 | % Output:
 7 | %   model: trained model structure
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | n = size(X,2);
10 | E = sparse(1:n,t,1);
11 | nk = sum(E,1);
12 | w = full(nk)/n;
13 | mu = X*(E./nk);  
14 | 
15 | model.mu = mu;      % d x k means 
16 | model.w = w;


--------------------------------------------------------------------------------
/chapter05/mlpRegPred.m:
--------------------------------------------------------------------------------
 1 | function Y = mlpRegPred(model, X)
 2 | % Multilayer perceptron regression prediction
 3 | % tanh activation function is used.
 4 | % Input:
 5 | %   model: model structure
 6 | %   X: d x n data matrix
 7 | % Ouput:
 8 | %   Y: p x n response matrix
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | W = model.W;
11 | b = model.b;
12 | T = length(W);
13 | Y = X;
14 | for t = 1:T-1
15 |     Y = tanh(W{t}'*Y+b{t});
16 | end
17 | Y = W{T}'*Y+b{T};


--------------------------------------------------------------------------------
/chapter04/logitMnPred.m:
--------------------------------------------------------------------------------
 1 | function [y, P] = logitMnPred(model, X)
 2 | % Prediction of multiclass (multinomial) logistic regression model
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: d x n testing data
 6 | % Output:
 7 | %   y: 1 x n predict label (1~k)
 8 | %   P: k x n predict probability for each class
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | W = model.W;
11 | X = [X; ones(1,size(X,2))];
12 | P = softmax(W'*X);
13 | [~, y] = max(P,[],1);


--------------------------------------------------------------------------------
/common/symeig.m:
--------------------------------------------------------------------------------
 1 | function [V,A,flag] = symeig(S,d,m)
 2 | % Compute eigenvalues and eigenvectors of symmetric matrix
 3 | %   m == 's' smallest (default)
 4 | %   m == 'l' largest
 5 | % Written by Mo Chen (sth4nth@gmail.com).
 6 | if nargin == 2
 7 |     m = 's';
 8 | end
 9 | opt.disp = 0;
10 | opt.issym = 1;
11 | opt.isreal = 1;
12 | if any(m == 'ls')
13 |     [V,A,flag] = eigs(S,d,[m,'a'],opt);
14 | else
15 |     error('The third parameter must be l or s.');
16 | end
17 | 


--------------------------------------------------------------------------------
/common/plotCurveBar.m:
--------------------------------------------------------------------------------
 1 | function plotCurveBar( x, y, sigma )
 2 | % Plot 1d curve and variance
 3 | % Input:
 4 | %   x: 1 x n 
 5 | %   y: 1 x n 
 6 | %   sigma: 1 x n or scaler 
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | color = [255,228,225]/255; %pink
 9 | [x,idx] = sort(x);
10 | y = y(idx);
11 | sigma = sigma(idx);
12 | 
13 | fill([x,fliplr(x)],[y+sigma,fliplr(y-sigma)],color);
14 | hold on;
15 | plot(x,y,'r-');
16 | hold off
17 | axis([x(1),x(end),-inf,inf])
18 | 
19 | 


--------------------------------------------------------------------------------
/chapter08/MRF/mrfBethe.m:
--------------------------------------------------------------------------------
 1 | function lnZ = mrfBethe(A, nodePot, edgePot, nodeBel, edgeBel)
 2 | % Compute Bethe energy
 3 | [s,t,e] = find(triu(A));
 4 | edgeCor = zeros(size(edgePot));
 5 | for l = 1:numel(e)
 6 |     edgeCor(:,:,e(l)) = edgeBel(:,:,e(l))./(nodeBel(:,s(l))*nodeBel(:,t(l))');
 7 | end
 8 | Ex = dot(nodeBel(:),nodePot(:));
 9 | Exy = dot(edgeBel(:),edgePot(:));
10 | Hx = -dot(nodeBel(:),log(nodeBel(:)));
11 | Ixy = dot(edgeBel(:),log(edgeCor(:)));
12 | lnZ = Ex+Exy+Hx-Ixy;


--------------------------------------------------------------------------------
/chapter02/logVmf.m:
--------------------------------------------------------------------------------
 1 | function y = logVmf(X, mu, kappa)
 2 | % Compute log pdf of a von Mises-Fisher distribution.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   mu: d x k mean
 6 | %   kappa: 1 x k variance
 7 | % Output:
 8 | %   y: k x n probability density in logrithm scale y=log p(x)
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | d = size(X,1);
11 | c = (d/2-1)*log(kappa)-(d/2)*log(2*pi)-logbesseli(d/2-1,kappa);
12 | q = bsxfun(@times,mu,kappa)'*X;
13 | y = bsxfun(@plus,q,c');
14 | 


--------------------------------------------------------------------------------
/chapter02/logWishart.m:
--------------------------------------------------------------------------------
 1 | function y = logWishart(Sigma, W, v)
 2 | % Compute log pdf of a Wishart distribution.
 3 | % Input:
 4 | %   Sigma: d x d covariance matrix
 5 | %   W: d x d covariance parameter
 6 | %   v: degree of freedom
 7 | % Output:
 8 | %   y: probability density in logrithm scale y=log p(Sigma)
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | d = length(Sigma);
11 | B = -0.5*v*logdet(W)-0.5*v*d*log(2)-logmvgamma(0.5*v,d);
12 | y = B+0.5*(v-d-1)*logdet(Sigma)-0.5*trace(W\Sigma);


--------------------------------------------------------------------------------
/chapter07/rvmBinPred.m:
--------------------------------------------------------------------------------
 1 | function [y, p] = rvmBinPred(model, X)
 2 | % Prodict the label for binary logistic regression model
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: d x n testing data
 6 | % Output:
 7 | %   y: 1 x n predict label (0/1)
 8 | %   p: 1 x n predict probability [0,1]
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | index = model.index;
11 | X = [X;ones(1,size(X,2))];
12 | X = X(index,:);
13 | w = model.w;
14 | p = sigmoid(w'*X); 
15 | y = round(p);
16 | 


--------------------------------------------------------------------------------
/chapter14/mixLogitBinPred.m:
--------------------------------------------------------------------------------
 1 | function t = mixLogitBinPred(model, X)
 2 | % Prediction function for mixture of logistic regression
 3 | % input:
 4 | %   model: trained model structure
 5 | %   X: d x n data matrix
 6 | % output:
 7 | %   t: 1 x n cluster label
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | alpha = model.alpha; % mixing coefficient
10 | W = model.W ;  % logistic model coefficentalpha
11 | n = size(X,2);
12 | X = [X; ones(1,n)];
13 | t = round(alpha*sigmoid(W'*X));
14 | 
15 | 


--------------------------------------------------------------------------------
/common/besseliLn.m:
--------------------------------------------------------------------------------
 1 | function y = besseliLn(nu,x)
 2 | % Compute logarithm of besseli function (modified Bessel function of first kind).
 3 | % Written by Mo Chen (mochen80@gmail.com).
 4 | % TODO: improve precision using the method in 
 5 | % Clustering on the Unit Hypersphere using von Mises-Fisher Distributions.  A. Banerjee, I. S. Dhillon, J. Ghosh, and S. Sra
 6 | [v,ierr] = besseli(nu,x);
 7 | if any(ierr ~= 0) || any(v == Inf)
 8 |     error('ERROR: logbesseli');
 9 | end
10 | y = log(v);
11 | 


--------------------------------------------------------------------------------
/chapter06/knPca.m:
--------------------------------------------------------------------------------
 1 | function model = knPca(X, q, kn)
 2 | % Kernel PCA
 3 | % Input:
 4 | %   X: d x n data matrix 
 5 | %   q: target dimension
 6 | %   kn: kernel function
 7 | % Ouput:
 8 | %   model: trained model structure
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | if nargin < 3
11 |     kn = @knGauss;
12 | end
13 | K = knCenter(kn,X);
14 | [V,L] = eig(K);
15 | [L,idx] = sort(diag(L),'descend');
16 | V = V(:,idx(1:q));
17 | L = L(1:q);
18 | 
19 | model.kn = kn;
20 | model.V = V;
21 | model.L = L;
22 | model.X = X;


--------------------------------------------------------------------------------
/common/sub.m:
--------------------------------------------------------------------------------
 1 | function B = sub(A, varargin)
 2 | % sub(A,i,j,k) = A(i;j;k)
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | assert(ndims(A)==numel(varargin));
 5 | sz = cellfun(@numel,varargin);
 6 | IDX = cell(1,ndims(A));
 7 | for i = 1:ndims(A)
 8 |     idx = varargin{i};
 9 |     shape = ones(1,ndims(A));
10 |     shape(i) = sz(i);
11 |     idx = reshape(idx,shape);
12 |     shape = sz;
13 |     shape(i) = 1;
14 |     idx = repmat(idx,shape);
15 |     IDX{i} = idx(:);
16 | end
17 | B = reshape(A(sub2ind(size(A),IDX{:})),sz);


--------------------------------------------------------------------------------
/chapter12/ppcaRnd.m:
--------------------------------------------------------------------------------
 1 | function [X, model] = ppcaRnd(m, d, n)
 2 | % Generate data from probabilistic PCA model
 3 | % Input:
 4 | %   m: dimension of latent space
 5 | %   d: dimension of data
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   model: model structure
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | beta = randg;
12 | Z = randn(m,n);
13 | W = randn(d,m); 
14 | mu = randn(d,1);
15 | X = bsxfun(@times,W*Z,mu)+randn(d,n)/sqrt(beta);
16 | 
17 | model.W = W;
18 | model.mu = mu;
19 | model.beta = beta;


--------------------------------------------------------------------------------
/chapter06/knPcaPred.m:
--------------------------------------------------------------------------------
 1 | function Y = knPcaPred(model, Xt, opt)
 2 | % Prediction for kernel PCA
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: d x n testing data
 6 | %   t (optional): 1 x n testing response
 7 | % Output:
 8 | %   Y: prejection result of Xt
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | kn = model.kn;
11 | V = model.V;
12 | L = model.L;
13 | X = model.X;
14 | Y = bsxfun(@times,V'*knCenter(kn,X,X,Xt),1./sqrt(L));
15 | if nargin == 3 && opt.whiten
16 |     Y = bsxfun(@times,Y,1./sqrt(L));
17 | end
18 | 
19 | 


--------------------------------------------------------------------------------
/chapter05/mlpClassPred.m:
--------------------------------------------------------------------------------
 1 | function [y, P] = mlpClassPred(model, X)
 2 | % Multilayer perceptron classification prediction
 3 | % logistic activation function is used.
 4 | % Input:
 5 | %   model: model structure
 6 | %   X: d x n data matrix
 7 | % Ouput:
 8 | %   y: 1 x n label vector
 9 | %   P: k x n probability matrix
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | W = model.W;
12 | b = model.b;
13 | T = length(W);
14 | Z = X;
15 | for t = 1:T-1
16 |     Z = sigmoid(W{t}'*Z+b{t});
17 | end
18 | P = softmax(W{T}'*Z+b{T});
19 | [~,y] = max(P,[],1);  


--------------------------------------------------------------------------------
/common/mgson.m:
--------------------------------------------------------------------------------
 1 | function [Q, R] = mgson(X)
 2 | % Modified Gram-Schmidt orthonormalization (numerical stable version of Gram-Schmidt algorithm) 
 3 | % which produces the same result as [Q,R]=qr(X,0)
 4 | % Written by Mo Chen (sth4nth@gmail.com).
 5 | [d,n] = size(X);
 6 | m = min(d,n);
 7 | R = zeros(m,n);
 8 | Q = zeros(d,m);
 9 | for i = 1:m
10 |     v = X(:,i);
11 |     for j = 1:i-1
12 |         R(j,i) = Q(:,j)'*v;
13 |         v = v-R(j,i)*Q(:,j);
14 |     end
15 |     R(i,i) = norm(v);
16 |     Q(:,i) = v/R(i,i);
17 | end
18 | R(:,m+1:n) = Q'*X(:,m+1:n);


--------------------------------------------------------------------------------
/demo/ch06/knKmeans_demo.m:
--------------------------------------------------------------------------------
 1 | %% Kernel kmeans with linear kernel is equivalent to kmeans
 2 | close all; clear;
 3 | d = 2;
 4 | k = 3;
 5 | n = 200;
 6 | [X, y] = kmeansRnd(d,k,n);
 7 | init = ceil(k*rand(1,n));
 8 | 
 9 | label = knKmeans(X,init,@knLin);
10 | 
11 | label0 = kmeans(X,init);
12 | maxdiff(label,label0)
13 | plotClass(X,label);
14 | %% Kernel kmeans with Gaussian Kernel for nonlinear data
15 | x1 = linspace(0,pi,n/2);
16 | x2 = sin(x1);
17 | X = [x1,x1+pi/2;
18 |     x2,-x2];
19 | 
20 | label = knKmeans(X,2,@knGauss);
21 | figure;
22 | plotClass(X,label);


--------------------------------------------------------------------------------
/chapter06/knGauss.m:
--------------------------------------------------------------------------------
 1 | function K = knGauss(X, Y, s)
 2 | % Gaussian (RBF) kernel K = exp(-|x-y|/(2s));
 3 | % Input:
 4 | %   X: d x nx data matrix
 5 | %   Y: d x ny data matrix
 6 | %   s: sigma of gaussian
 7 | % Ouput:
 8 | %   K: nx x ny kernel matrix
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | if nargin < 3
11 |     s = 0.4;
12 | end
13 | 
14 | if nargin < 2 || isempty(Y)  
15 |     K = ones(1,size(X,2));            % norm in kernel space
16 | else
17 |     D = bsxfun(@plus,dot(X,X,1)',dot(Y,Y,1))-2*(X'*Y);
18 |     K = exp(D/(-2*s^2));
19 | end
20 | 
21 | 


--------------------------------------------------------------------------------
/chapter06/knPoly.m:
--------------------------------------------------------------------------------
 1 | function K = knPoly(X, Y, o, c)
 2 | % Polynomial kernel k(x,y)=(x'y+c)^o
 3 | % Input:
 4 | %   X: d x nx data matrix
 5 | %   Y: d x ny data matrix
 6 | %   o: order of polynomial
 7 | %   c: constant
 8 | % Ouput:
 9 | %   K: nx x ny kernel matrix
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | if nargin < 4
12 |     c = 0;
13 | end
14 | 
15 | if nargin < 3
16 |     o = 3;
17 | end
18 | 
19 | if nargin < 2 || isempty(Y)  
20 |     K = (dot(X,X,1)+c).^o;            % norm in kernel space
21 | else
22 |     K = (X'*Y+c).^o;
23 | end
24 | 
25 | 


--------------------------------------------------------------------------------
/chapter11/gaussRnd.m:
--------------------------------------------------------------------------------
 1 | function x = gaussRnd(mu, Sigma, n)
 2 | % Generate samples from a Gaussian distribution.
 3 | % Input:
 4 | %   mu: d x 1 mean vector
 5 | %   Sigma: d x d covariance matrix
 6 | %   n: number of samples
 7 | % Outpet:
 8 | %   x: d x n generated sample x~Gauss(mu,Sigma)
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | if nargin == 2
11 |     n = 1;
12 | end
13 | [V,err] = chol(Sigma);
14 | if err ~= 0
15 |     error('ERROR: sigma must be a symmetric positive definite matrix.');
16 | end
17 | x = V'*randn(size(V,1),n)+repmat(mu,1,n);


--------------------------------------------------------------------------------
/chapter14/mixLinRnd.m:
--------------------------------------------------------------------------------
 1 | function [X, y, W ] = mixLinRnd(d, k, n)
 2 | % Generate data from mixture of linear model
 3 | % Input:
 4 | %   d: dimension of data
 5 | %   k: number of components
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   y: 1 x n response variable
10 | %   W: d+1 x k weight matrix 
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | W = randn(d+1,k);
13 | [X, z] = kmeansRnd(d, k, n);
14 | y = zeros(1,n);
15 | for j = 1:k
16 |     idx = (z == j);
17 |     y(idx) = W(1:(end-1),j)'*X(:,idx)+W(end,j);
18 | end
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/chapter14/adaboostBinPred.m:
--------------------------------------------------------------------------------
 1 | function t = adaboostBinPred(model, X)
 2 | % Prediction of binary Adaboost
 3 | % input:
 4 | %   model: trained model structure
 5 | %   X: d x n data matrix
 6 | % output:
 7 | %   t: 1 x n prediction 
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | Alpha = model.alpha;
10 | Theta = model.theta;
11 | M = size(Alpha,2);
12 | t = zeros(1,size(X,2));
13 | for m = 1:M
14 |     c = Theta(:,:,m);
15 |     [~,y] = min(sqdist(c,X),[],1);
16 |     y(y==1) = -1;
17 |     y(y==2) = 1;
18 |     t = t+Alpha(m)*y;
19 | end
20 | t = sign(t);
21 | t(t==-1) = 0;


--------------------------------------------------------------------------------
/common/slice.m:
--------------------------------------------------------------------------------
 1 | function B = slice(A, dim, index)
 2 | % slice(A,2,index) = A(:,index,:)
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | sz = size(A);
 5 | sz(dim) = numel(index);
 6 | IDX = cell(1,ndims(A));
 7 | for i = 1:ndims(A)
 8 |     if i == dim
 9 |         idx = index;
10 |     else
11 |         idx = 1:sz(i);
12 |     end
13 |     shape = ones(1,ndims(A));
14 |     shape(i) = sz(i);
15 |     idx = reshape(idx,shape);
16 |     shape = sz;
17 |     shape(i) = 1;
18 |     idx = repmat(idx,shape);
19 |     IDX{i} = idx(:);
20 | end
21 | B = reshape(A(sub2ind(size(A),IDX{:})),sz);


--------------------------------------------------------------------------------
/chapter03/linRnd.m:
--------------------------------------------------------------------------------
 1 | function [X, t] = linRnd(d, n)
 2 | % Generate data from a linear model p(t|w,x)=G(w'x+w0,sigma), sigma=sqrt(1/beta) 
 3 | % where w and w0 are generated from Gauss(0,1), beta is generated from
 4 | % Gamma(1,1), X is generated form [0,1].
 5 | % Input:
 6 | %   d: dimension of data
 7 | %   n: number of data
 8 | % Output:
 9 | %   X: d x n data matrix
10 | %   t: 1 x n response variable
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | beta = randg;   % need statistcs toolbox
13 | X = rand(d,n);
14 | w = randn(d,1);
15 | w0 = randn(1,1);
16 | t = w'*X+w0+randn(1,n)/sqrt(beta);


--------------------------------------------------------------------------------
/chapter06/knKmeansPred.m:
--------------------------------------------------------------------------------
 1 | function [label, energy] = knKmeansPred(model, Xt)
 2 | % Prediction for kernel kmeans clusterng
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   Xt: d x n testing data
 6 | % Ouput:
 7 | %   label: 1 x n predict label
 8 | %   engery: optimization target value
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | X = model.X;
11 | t = model.label;
12 | kn = model.kn;
13 | 
14 | n = size(X,2);
15 | k = max(t);
16 | E = sparse(t,1:n,1,k,n,n);
17 | E = E./sum(E,2);
18 | Z = E*kn(X,Xt)-dot(E*kn(X,X),E,2)/2;
19 | [val, label] = max(Z,[],1);
20 | energy = sum(kn(Xt))-2*sum(val);
21 | 


--------------------------------------------------------------------------------
/chapter02/logDirichlet.m:
--------------------------------------------------------------------------------
 1 | function y = logDirichlet(X, a)
 2 | % Compute log pdf of a Dirichlet distribution.
 3 | % Input:
 4 | %   X: d x n data matrix, each column sums to one (sum(X,1)==ones(1,n) && X>=0)
 5 | %   a: d x k parameter of Dirichlet
 6 | %   y: k x n probability density
 7 | % Output:
 8 | %   y: k x n probability density in logrithm scale y=log p(x)
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | X = bsxfun(@times,X,1./sum(X,1));
11 | if size(a,1) == 1
12 |     a = repmat(a,size(X,1),1);
13 | end
14 | c = gammaln(sum(a,1))-sum(gammaln(a),1);
15 | g = (a-1)'*log(X);
16 | y = bsxfun(@plus,g,c');
17 | 


--------------------------------------------------------------------------------
/chapter02/logMvGamma.m:
--------------------------------------------------------------------------------
 1 | function y = logMvGamma(x, d)
 2 | % Compute logarithm multivariate Gamma function 
 3 | % which is used in the probability density function of the Wishart and inverse Wishart distributions.
 4 | % Gamma_d(x) = pi^(d(d-1)/4) \prod_(j=1)^d Gamma(x+(1-j)/2)
 5 | % log(Gamma_d(x)) = d(d-1)/4 log(pi) + \sum_(j=1)^d log(Gamma(x+(1-j)/2))
 6 | % Input:
 7 | %   x: m x n data matrix
 8 | %   d: dimension
 9 | % Output:
10 | %   y: m x n logarithm multivariate Gamma
11 | % Written by Michael Chen (sth4nth@gmail.com).
12 | y = d*(d-1)/4*log(pi)+sum(gammaln(x(:)+(1-(1:d))/2),2);
13 | y = reshape(y,size(x));


--------------------------------------------------------------------------------
/chapter12/pca.m:
--------------------------------------------------------------------------------
 1 | function [U, L, mu, mse] = pca(X, m)
 2 | % Principal component analysis
 3 | % Input:
 4 | %   X: d x n data matrix 
 5 | %   m: target dimension
 6 | % Output:
 7 | %   U: d x m Projection matrix
 8 | %   L: m x 1 Eigen values
 9 | %   mu: d x 1 mean
10 | %   mse: mean square error
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | n = size(X,2);
13 | mu = mean(X,2);
14 | Xo = bsxfun(@minus,X,mu);
15 | S = Xo*Xo'/n;                   % 12.3
16 | [U,L] = eig(S);                         % 12.5
17 | [L,idx] = sort(diag(L),'descend');      
18 | mse = sum(L)-sum(L(1:m));
19 | U = U(:,idx(1:m));
20 | L = L(1:m);
21 | 
22 | 


--------------------------------------------------------------------------------
/common/lattice.m:
--------------------------------------------------------------------------------
 1 | function A = lattice( sz )
 2 | % Create an undirected graph corresponding to sz lattice
 3 | % Example:
 4 | %   plot(graph(lattice([2,2,3])))
 5 | % Input:
 6 | %   sz: 1 x d size of lattice
 7 | % Output:
 8 | %   A: prod(sz) x prod(sz) adjacent matrix of an undirected graph
 9 | % Written by Mo Chen (sth4nth@gmail.com)
10 | d = numel(sz);
11 | step = cumprod(sz);
12 | n = step(end);
13 | M = reshape(1:n,sz);
14 | S = arrayfun(@(i) reshape(slice(M,i,1:sz(i)-1),1,[]), 1:d,'UniformOutput',false);
15 | T = arrayfun(@(i) reshape(slice(M,i,2:sz(i)),1,[]), 1:d,'UniformOutput',false);
16 | A = sparse([S{:}],[T{:}],1,n,n);
17 | A = A+A';


--------------------------------------------------------------------------------
/chapter02/logGauss.m:
--------------------------------------------------------------------------------
 1 | function y = logGauss(X, mu, sigma)
 2 | % Compute log pdf of a Gaussian distribution.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   mu: d x 1 mean vector of Gaussian
 6 | %   sigma: d x d covariance matrix of Gaussian
 7 | % Output:
 8 | %   y: 1 x n probability density in logrithm scale y=log p(x)
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | d = size(X,1);
11 | X = X-mu;
12 | [U,p]= chol(sigma);
13 | if p ~= 0
14 |     error('ERROR: sigma is not PD.');
15 | end
16 | Q = U'\X;
17 | q = dot(Q,Q,1);  % quadratic term (M distance)
18 | c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
19 | y = -(c+q)/2;
20 | 


--------------------------------------------------------------------------------
/chapter09/mixBernRnd.m:
--------------------------------------------------------------------------------
 1 | function [X, z, mu] = mixBernRnd(d, k, n)
 2 | % Generate samples from a Bernoulli mixture distribution.
 3 | % Input:
 4 | %   d: dimension of data
 5 | %   k: number of components
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   z: 1 x n response variable
10 | %   mu: d x k parameters of each Bernoulli component
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | 
13 | % w = dirichletRnd(1,ones(1,k)/k);
14 | w = ones(1,k)/k;
15 | z = discreteRnd(w,n);
16 | mu = rand(d,k);
17 | X = zeros(d,n);
18 | for i = 1:k
19 |     idx = z==i;
20 |     X(:,idx) = bsxfun(@le,rand(d,sum(idx)), mu(:,i));
21 | end
22 | 


--------------------------------------------------------------------------------
/chapter11/mixGaussSample.m:
--------------------------------------------------------------------------------
 1 | function [X, z] = mixGaussSample(Theta, w, n )
 2 | % Genarate samples form a Gaussian mixture model with GaussianWishart prior.
 3 | % Input:
 4 | %   Theta: cell of GaussianWishart priors of components
 5 | %   w: weight of components
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   z: 1 x n response variable
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | z = discreteRnd(w,n);
12 | d = Theta{1}.dim();
13 | X = zeros(d,n);
14 | for i = 1:numel(w)
15 |     idx = z==i;
16 |     [mu,Sigma] = Theta{i}.sample(); % invpd(wishrnd(W0,v0));
17 |     X(:,idx) = gaussRnd(mu,Sigma,sum(idx));
18 | end
19 | 


--------------------------------------------------------------------------------
/chapter09/kmeansRnd.m:
--------------------------------------------------------------------------------
 1 | function [X, z, mu] = kmeansRnd(d, k, n)
 2 | % Generate samples from a Gaussian mixture distribution with common variances (kmeans model).
 3 | % Input:
 4 | %   d: dimension of data
 5 | %   k: number of components
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   z: 1 x n response variable
10 | %   mu: d x k centers of clusters
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | alpha = 1;
13 | beta = nthroot(k,d); % k points in volume x^d : x^d=k
14 | 
15 | X = randn(d,n);
16 | w = dirichletRnd(alpha,ones(1,k)/k);
17 | z = discreteRnd(w,n);
18 | E = full(sparse(z,1:n,1,k,n,n));
19 | mu = randn(d,k)*beta;
20 | X = X+mu*E;


--------------------------------------------------------------------------------
/demo/ch05/mlp_demo.m:
--------------------------------------------------------------------------------
 1 | clear; close all
 2 | %% Regression
 3 | n = 200;
 4 | x = linspace(0,2*pi,n);
 5 | y = sin(x);
 6 | 
 7 | h = [10,6];            % two hidden layers with 10 and 6 neurons
 8 | lambda = 1e-2;
 9 | [model, L] = mlpReg(x,y,h,lambda);
10 | t = mlpRegPred(model,x);
11 | plot(L);
12 | figure;
13 | hold on
14 | plot(x,y,'.');
15 | plot(x,t);
16 | hold off
17 | %% Classification
18 | clear;
19 | k = 2;
20 | n = 200;
21 | [X,y] = kmeansRnd(2,k,n);
22 | figure;
23 | plotClass(X,y);
24 | 
25 | h = 3;
26 | lambda = 1e-2;
27 | [model, llh] = mlpClass(X,y,h,lambda);
28 | [t,p] = mlpClassPred(model,X);
29 | plot(llh);
30 | figure;
31 | plotClass(X,t);
32 | figure;
33 | 


--------------------------------------------------------------------------------
/demo/ch12/pca_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch12
 2 | 
 3 | clear; close all;
 4 | d = 3;
 5 | m = 2;
 6 | n = 1000;
 7 | 
 8 | X = ppcaRnd(m,d,n);
 9 | plotClass(X);
10 | %% PCA , EM PCA and Constraint EM PCA produce the same result in the sense of reconstruction mseor
11 | % classical PCA
12 | [U,L,mu,mse1] = pca(X,m);
13 | Y = U'*bsxfun(@minus,X,mu);   % projection
14 | Z1 = bsxfun(@times,Y,1./sqrt(L));  % whiten
15 | figure;
16 | plotClass(Y);
17 | figure;
18 | plotClass(Z1);
19 | mse1
20 | % EM PCA
21 | [W2,Z2,mu,mse2] = pcaEm(X,m);
22 | figure;
23 | plotClass(Z1);
24 | mse2
25 | % Contrained EM PCA
26 | [W3,Z3,mu,mse3] = pcaEmC(X,m);
27 | figure;
28 | plotClass(Z1);
29 | mse3
30 | 


--------------------------------------------------------------------------------
/chapter01/jointEntropy.m:
--------------------------------------------------------------------------------
 1 | function z = jointEntropy(x, y)
 2 | % Compute joint entropy z=H(x,y) of two discrete variables x and y.
 3 | % Input:
 4 | %   x, y: two integer vector of the same length 
 5 | % Output:
 6 | %   z: joint entroy z=H(x,y)
 7 | % Written by Mo Chen (sth4nth@gmail.com).    
 8 | assert(numel(x) == numel(y));
 9 | n = numel(x);
10 | x = reshape(x,1,n);
11 | y = reshape(y,1,n);
12 | 
13 | l = min(min(x),min(y));
14 | x = x-l+1;
15 | y = y-l+1;
16 | k = max(max(x),max(y));
17 | 
18 | idx = 1:n;
19 | p = nonzeros(sparse(idx,x,1,n,k,n)'*sparse(idx,y,1,n,k,n)/n); %joint distribution of x and y
20 | 
21 | z = -dot(p,log2(p));
22 | z = max(0,z);


--------------------------------------------------------------------------------
/chapter08/NaiveBayes/nbGauss.m:
--------------------------------------------------------------------------------
 1 | function model = nbGauss(X, t)
 2 | % Naive bayes classifier with indepenet Gaussian
 3 | % Each dimension of data is assumed from a 1d Gaussian distribution with independent mean and variance.
 4 | % Input:
 5 | %   X: d x n data matrix
 6 | %   t: 1 x n label (1~k)
 7 | % Output:
 8 | %   model: trained model structure
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | n = size(X,2);
11 | k = max(t);
12 | E = sparse(t,1:n,1,k,n,n);
13 | nk = full(sum(E,2));
14 | w = nk/n;
15 | R = E'*spdiags(1./nk,0,k,k);
16 | mu = X*R;  
17 | var = X.^2*R-mu.^2;
18 | 
19 | model.mu = mu;      % d x k means 
20 | model.var = var;  % d x k variances
21 | model.w = w;


--------------------------------------------------------------------------------
/chapter13/HMM/hmmRnd.m:
--------------------------------------------------------------------------------
 1 | function [x, model] = hmmRnd(d, k, n)
 2 | % Generate a data sequence from a hidden Markov model.
 3 | % Input:
 4 | %   d: dimension of data
 5 | %   k: dimension of latent variable
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   model: model structure
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | A = normalize(rand(k,k),2);
12 | E = normalize(rand(k,d),2);
13 | s = normalize(rand(k,1),1);
14 | 
15 | x = zeros(1,n);
16 | z = discreteRnd(s);
17 | x(1) = discreteRnd(E(z,:));
18 | for i = 2:n
19 |     z = discreteRnd(A(z,:));
20 |     x(i) = discreteRnd(E(z,:));
21 | end
22 | 
23 | model.A = A;
24 | model.E = E;
25 | model.s = s;


--------------------------------------------------------------------------------
/chapter08/MRF/mrfIsGa.m:
--------------------------------------------------------------------------------
 1 | function [A, nodePot, edgePot] = mrfIsGa(im, sigma, J)
 2 | % Contruct a latent Ising MRF with Gaussian observation
 3 | % Input:
 4 | %   im: row x col image
 5 | %   sigma: variance of Gaussian node potential
 6 | %   J: parameter of Ising edge
 7 | % Output:
 8 | %   A: n x n adjacent matrix
 9 | %   nodePot: 2 x n node potential
10 | %   edgePot: 2 x 2 x m edge potential
11 | % Written by Mo Chen (sth4nth@gmail.com)
12 | A = lattice(size(im));
13 | [s,t,e] = find(triu(A));
14 | m = numel(e);
15 | e(:) = 1:m;
16 | A = sparse([s;t],[t;s],[e;e]);
17 | 
18 | z = [1;-1];
19 | x = reshape(im,1,[]);
20 | nodePot = -(x-z).^2/(2*sigma^2);
21 | edgePot = repmat(J*(z*z'),[1, 1, m]);


--------------------------------------------------------------------------------
/common/plotgm.m:
--------------------------------------------------------------------------------
 1 | function plotgm(X, model)
 2 | % Plot 2d Gaussian mixture model.
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | level = 64;
 5 | n = 256;
 6 | 
 7 | spread(X);
 8 | x_range = xlim;
 9 | y_range = ylim;
10 | 
11 | x = linspace(x_range(1),x_range(2), n);
12 | y = linspace(y_range(2),y_range(1), n);
13 | 
14 | [a,b] = meshgrid(x,y);
15 | z = exp(loggmpdf([a(:)';b(:)'],model));
16 | 
17 | z = z-min(z);
18 | z = floor(z/max(z)*(level-1));
19 | 
20 | figure;
21 | image(reshape(z,n,n));
22 | colormap(jet(level));
23 | set(gca, 'XTick', [1 256]);
24 | set(gca, 'XTickLabel', [min(x) max(x)]);
25 | set(gca, 'YTick', [1 256]);
26 | set(gca, 'YTickLabel', [min(y) max(y)]);
27 | axis off
28 | 


--------------------------------------------------------------------------------
/chapter08/NaiveBayes/nbGaussPred.m:
--------------------------------------------------------------------------------
 1 | function y = nbGaussPred(model, X)
 2 | % Prediction of naive Bayes classifier with independent Gaussian.
 3 | % input:
 4 | %   model: trained model structure
 5 | %   X: d x n data matrix
 6 | % output:
 7 | %   y: 1 x n predicted class label
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | mu = model.mu;
10 | var = model.var;
11 | w = model.w;
12 | assert(all(size(mu)==size(var)));
13 | d = size(mu,1);
14 | 
15 | lambda = 1./var;
16 | ml = mu.*lambda;
17 | M = bsxfun(@plus,lambda'*X.^2-2*ml'*X,dot(mu,ml,1)'); % M distance
18 | c = d*log(2*pi)+2*sum(log(var),1)'; % normalization constant
19 | R = -0.5*bsxfun(@plus,M,c);
20 | [~,y] = max(bsxfun(@times,exp(R),w),[],1);
21 | 


--------------------------------------------------------------------------------
/demo/ch09/kmeans_demo.m:
--------------------------------------------------------------------------------
 1 | close all; clear;
 2 | d = 2;
 3 | k = 3;
 4 | n = 5000;
 5 | %% Generate data
 6 | [X,label] = kmeansRnd(d,k,n);
 7 | plotClass(X,label);
 8 | %% kmeans with random initialization 
 9 | y = kmeans(X,k);
10 | figure;
11 | plotClass(X,y);
12 | %% kmeans init with labels
13 | y = kmeans(X,label);
14 | figure;
15 | plotClass(X,y);
16 | %% kmeans init with centers 
17 | mu = rand(d,k);
18 | y = kmeans(X,mu);
19 | figure;
20 | plotClass(X,y);
21 | %% kmeans init with kmeans++ seeding 
22 | y = kmeans(X,kseeds(X,k));
23 | figure;
24 | plotClass(X,y);
25 | %% kmeans++ seeding 
26 | mu = kseeds(X,k);
27 | [~,y] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
28 | figure;
29 | plotClass(X,y);
30 | 


--------------------------------------------------------------------------------
/chapter04/binPlot.m:
--------------------------------------------------------------------------------
 1 | function binPlot(model, X, t)
 2 | % Plot binary classification result for 2d data
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: 2 x n data matrix
 6 | %   t: 1 x n label
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | assert(size(X,1) == 2);
 9 | w = model.w;
10 | xi = min(X,[],2);
11 | xa = max(X,[],2);
12 | [x1,x2] = meshgrid(linspace(xi(1),xa(1)), linspace(xi(2),xa(2)));
13 | 
14 | color = 'brgmcyk';
15 | m = length(color);
16 | figure(gcf);
17 | axis equal
18 | clf;
19 | hold on;
20 | view(2);
21 | for i = 1:max(t)
22 |     idc = t==i;
23 |     scatter(X(1,idc),X(2,idc),36,color(mod(i-1,m)+1));
24 | end
25 | y = w(1)*x1+w(2)*x2+w(3);
26 | contour(x1,x2,y,[-0 0]);
27 | hold off;
28 | 


--------------------------------------------------------------------------------
/demo/ch10/mixGaussVb_demo.m:
--------------------------------------------------------------------------------
 1 | 
 2 | %% Variational Bayesian for Gaussian Mixture Model
 3 | close all; clear;
 4 | d = 2;
 5 | k = 3;
 6 | n = 2000;
 7 | [X,z] = mixGaussRnd(d,k,n);
 8 | plotClass(X,z);
 9 | m = floor(n/2);
10 | X1 = X(:,1:m);
11 | X2 = X(:,(m+1):end);
12 | % VB fitting
13 | [y1, model, L] = mixGaussVb(X1,10);
14 | figure;
15 | plotClass(X1,y1);
16 | figure;
17 | plot(L)
18 | % Model Evidence
19 | prior.alpha = 1;
20 | prior.kappa = 1;
21 | prior.m = mean(X1,2);
22 | prior.v = d+1;
23 | prior.M = eye(d);   % M = inv(W)
24 | L0 = mixGaussEvidence(X1, model, prior);
25 | L0-L(end)
26 | % Predict testing data
27 | [y2, R] = mixGaussVbPred(model,X2);
28 | figure;
29 | plotClass(X2,y2);
30 | 
31 | 


--------------------------------------------------------------------------------
/chapter13/LDS/ldsPca.m:
--------------------------------------------------------------------------------
 1 | function [A, C, Z] = ldsPca(X, k, m)
 2 | % Subspace method for learning linear dynamic system.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   k: dimension of hidden variable
 6 | %   m: stacking order for the Hankel matrix
 7 | % Output:
 8 | %   A: k x k transition matrix
 9 | %   C: k x d emission matrix
10 | %   Z: k x n latent variable
11 | %   Y: d x n reconstructed data
12 | % reference: Bayesian Reasoning and Machine Learning (BRML) chapter 24.5.3 p.507
13 | % Written by Mo Chen (sth4nth@gmail.com).
14 | [d,n] = size(X);
15 | H = reshape(X(:,hankel(1:m,m:n)),d*m,[]);
16 | [U,S,V] = svd(H,'econ');
17 | C = U(1:d,1:k);
18 | Z = S(1:k,1:k)*V(:,1:k)';
19 | A = Z(:,2:end)/Z(:,1:end-1); % estimated transition
20 | % Y = C*Z; % reconstructions


--------------------------------------------------------------------------------
/common/loggmpdf.m:
--------------------------------------------------------------------------------
 1 | function r = loggmpdf(X, model)
 2 | % Compute log pdf of a Gaussian mixture model.
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | mu = model.mu;
 5 | Sigma = model.Sigma;
 6 | w = model.weight;
 7 | 
 8 | n = size(X,2);
 9 | k = size(mu,2);
10 | logRho = zeros(k,n);
11 | 
12 | for i = 1:k
13 |     logRho(i,:) = loggausspdf(X,mu(:,i),Sigma(:,:,i));
14 | end
15 | r = logsumexp(bsxfun(@plus,logRho,log(w)'),1);
16 | 
17 | 
18 | function y = loggausspdf(X, mu, Sigma)
19 | d = size(X,1);
20 | X = bsxfun(@minus,X,mu);
21 | [U,p]= chol(Sigma);
22 | if p ~= 0
23 |     error('ERROR: Sigma is not PD.');
24 | end
25 | Q = U'\X;
26 | q = dot(Q,Q,1);  % quadratic term (M distance)
27 | c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
28 | y = -(c+q)/2;


--------------------------------------------------------------------------------
/common/plotkde.m:
--------------------------------------------------------------------------------
 1 | function plotkde(X, sigma2)
 2 | % Plot 2d kernel density.
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | if nargin < 2
 5 |     sigma2 = 1e-1;
 6 | end
 7 | level = 64;
 8 | n = 256;
 9 | 
10 | X = standardize(X);
11 | 
12 | spread(X);
13 | x_range = xlim;
14 | y_range = ylim;
15 | 
16 | x = linspace(x_range(1),x_range(2), n);
17 | y = linspace(y_range(2),y_range(1), n);
18 | 
19 | [a,b] = meshgrid(x,y);
20 | 
21 | z = exp(logkdepdf([a(:)';b(:)'],X,sigma2));
22 | 
23 | z = z-min(z);
24 | z = floor(z/max(z)*(level-1));
25 | 
26 | figure;
27 | image(reshape(z,n,n));
28 | colormap(jet(level));
29 | set(gca, 'XTick', [1 256]);
30 | set(gca, 'XTickLabel', [min(x) max(x)]);
31 | set(gca, 'YTick', [1 256]);
32 | set(gca, 'YTickLabel', [min(y) max(y)]);
33 | axis off
34 | 


--------------------------------------------------------------------------------
/chapter01/relatEntropy.m:
--------------------------------------------------------------------------------
 1 | function z = relatEntropy (x, y)
 2 | % Compute relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y)) of two discrete variables x and y.
 3 | % Input:
 4 | %   x, y: two integer vector of the same length 
 5 | % Output:
 6 | %   z: relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y))
 7 | % Written by Mo Chen (sth4nth@gmail.com).    
 8 | assert(numel(x) == numel(y));
 9 | n = numel(x);
10 | x = reshape(x,1,n);
11 | y = reshape(y,1,n);
12 | 
13 | l = min(min(x),min(y));
14 | x = x-l+1;
15 | y = y-l+1;
16 | k = max(max(x),max(y));
17 | 
18 | idx = 1:n;
19 | Mx = sparse(idx,x,1,n,k,n);
20 | My = sparse(idx,y,1,n,k,n);
21 | Px = nonzeros(mean(Mx,1));
22 | Py = nonzeros(mean(My,1));
23 | 
24 | z = -dot(Px,log2(Py)-log2(Px));
25 | z = max(0,z);


--------------------------------------------------------------------------------
/chapter14/mixLinPred.m:
--------------------------------------------------------------------------------
 1 | function [y, z, p] = mixLinPred(model, X, t)
 2 | % Prediction function for mxiture of linear regression
 3 | % input:
 4 | %   model: trained model structure
 5 | %   X: d x n data matrix
 6 | %   t:(optional) 1 x n responding vector
 7 | % output:
 8 | %   y: 1 x n prediction 
 9 | %   z: 1 x n cluster label
10 | %   p: 1 x n predict probability for t
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | W = model.W;
13 | alpha = model.alpha;
14 | beta = model.beta;
15 | 
16 | X = [X;ones(1,size(X,2))]; % adding the bias term
17 | y = W'*X;
18 | D = bsxfun(@minus,y,t).^2;
19 | logRho = (-0.5)*beta*D;
20 | logRho = bsxfun(@plus,logRho,log(alpha));
21 | T = logsumexp(logRho,1);
22 | p = exp(T);
23 | logR = bsxfun(@minus,logRho,T);
24 | R = exp(logR);
25 | z = max(R,[],1);
26 | 


--------------------------------------------------------------------------------
/demo/ch10/rvmRegVb_spSignal_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch07
 2 | 
 3 | %% sparse signal recovery demo
 4 | clear; close all; 
 5 | 
 6 | d = 512; % signal length
 7 | k = 20;  % number of spikes
 8 | n = 100; % number of measurements
 9 | %
10 | % random +/- 1 signal
11 | x = zeros(d,1);
12 | q = randperm(d);
13 | x(q(1:k)) = sign(randn(k,1)); 
14 | 
15 | % projection matrix
16 | A = unitize(randn(d,n),1);
17 | % noisy observations
18 | sigma = 0.005;
19 | e = sigma*randn(1,n);
20 | y = x'*A + e;
21 | 
22 | [model,llh] = rvmRegVb(A,y);
23 | plot(llh);
24 | m = model.w;
25 | 
26 | h = max(abs(x))+0.2;
27 | x_range = [1,d];
28 | y_range = [-h,+h];
29 | figure;
30 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal');
31 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal');
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/chapter04/fda.m:
--------------------------------------------------------------------------------
 1 | function U = fda(X, t, q)
 2 | % Fisher (linear) discriminant analysis
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   t: 1 x n class label
 6 | %   d: target dimension
 7 | % Output:
 8 | %   U: projection matrix y=U'*x
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | n = size(X,2);
11 | k = max(t);
12 | 
13 | E = sparse(1:n,t,true,n,k,n);  % transform label into indicator matrix
14 | nk = full(sum(E));
15 | 
16 | m = mean(X,2);
17 | Xo = bsxfun(@minus,X,m);
18 | St = (Xo*Xo')/n;                   % 4.43
19 | 
20 | mk = bsxfun(@times,X*E,1./nk);
21 | mo = bsxfun(@minus,mk,m);
22 | mo = bsxfun(@times,mo,sqrt(nk/n));
23 | Sb = mo*mo';                       % 4.46
24 | % Sw = St-Sb;                        % 4.45
25 | 
26 | [U,A] = eig(Sb,St,'chol');        
27 | [~,idx] = sort(diag(A),'descend');
28 | U = U(:,idx(1:q));
29 | 


--------------------------------------------------------------------------------
/chapter01/condEntropy.m:
--------------------------------------------------------------------------------
 1 | function z = condEntropy (x, y)
 2 | % Compute conditional entropy z=H(x|y) of two discrete variables x and y.
 3 | % Input:
 4 | %   x, y: two integer vector of the same length 
 5 | % Output:
 6 | %   z: conditional entropy z=H(x|y)
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | assert(numel(x) == numel(y));
 9 | n = numel(x);
10 | x = reshape(x,1,n);
11 | y = reshape(y,1,n);
12 | 
13 | l = min(min(x),min(y));
14 | x = x-l+1;
15 | y = y-l+1;
16 | k = max(max(x),max(y));
17 | 
18 | idx = 1:n;
19 | Mx = sparse(idx,x,1,n,k,n);
20 | My = sparse(idx,y,1,n,k,n);
21 | Pxy = nonzeros(Mx'*My/n); %joint distribution of x and y
22 | Hxy = -dot(Pxy,log2(Pxy));
23 | 
24 | Py = nonzeros(mean(My,1));
25 | Hy = -dot(Py,log2(Py));
26 | 
27 | % conditional entropy H(x|y)
28 | z = Hxy-Hy;
29 | z = max(0,z);
30 | 


--------------------------------------------------------------------------------
/chapter06/knReg.m:
--------------------------------------------------------------------------------
 1 | function model = knReg(X, t, lambda, kn)
 2 | % Gaussian process (kernel) regression
 3 | % Input:
 4 | %   X: d x n data
 5 | %   t: 1 x n response
 6 | %   lambda: regularization parameter
 7 | % Output:
 8 | %   model: trained model structure
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | if nargin < 4
11 |     kn = @knGauss;
12 | end
13 | if nargin < 3
14 |     lambda = 1e-2;
15 | end
16 | K = knCenter(kn,X);
17 | tbar = mean(t);
18 | U = chol(K+lambda*eye(size(X,2)));    % 6.62
19 | a = U\(U'\(t(:)-tbar));               % 6.68
20 | 
21 | model.kn = kn;
22 | model.a = a;
23 | model.X = X;
24 | model.tbar = tbar;
25 | %% for probability prediction
26 | y = a'*K+tbar;
27 | beta = 1/mean((t-y).^2);              % 3.21
28 | alpha = lambda*beta;           % lambda=a/b P.153 3.55
29 | model.alpha = alpha;
30 | model.beta = beta;
31 | model.U = U;


--------------------------------------------------------------------------------
/demo/ch07/rvmRegEm_spSignal_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch07
 2 | 
 3 | %% sparse signal recovery demo
 4 | clear; close all; 
 5 | 
 6 | d = 512; % signal length
 7 | k = 20;  % number of spikes
 8 | n = 100; % number of measurements
 9 | %
10 | % random +/- 1 signal
11 | x = zeros(d,1);
12 | q = randperm(d);
13 | x(q(1:k)) = sign(randn(k,1)); 
14 | 
15 | % projection matrix
16 | A = unitize(randn(d,n),1);
17 | % noisy observations
18 | sigma = 0.005;
19 | e = sigma*randn(1,n);
20 | y = x'*A + e;
21 | 
22 | [model,llh] = rvmRegEm(A,y);
23 | plot(llh);
24 | 
25 | m = zeros(d,1);
26 | m(model.index) = model.w;
27 | 
28 | h = max(abs(x))+0.2;
29 | x_range = [1,d];
30 | y_range = [-h,+h];
31 | figure;
32 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal');
33 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal');
34 |  
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/demo/ch07/rvmRegSeq_spSignal_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch07
 2 | 
 3 | %% sparse signal recovery demo
 4 | clear; close all; 
 5 | 
 6 | d = 512; % signal length
 7 | k = 20;  % number of spikes
 8 | n = 100; % number of measurements
 9 | %
10 | % random +/- 1 signal
11 | x = zeros(d,1);
12 | q = randperm(d);
13 | x(q(1:k)) = sign(randn(k,1)); 
14 | 
15 | % projection matrix
16 | A = unitize(randn(d,n),1);
17 | % noisy observations
18 | sigma = 0.005;
19 | e = sigma*randn(1,n);
20 | y = x'*A + e;
21 | 
22 | [model,llh] = rvmRegSeq(A,y);
23 | plot(llh);
24 | 
25 | m = zeros(d,1);
26 | m(model.index) = model.w;
27 | 
28 | h = max(abs(x))+0.2;
29 | x_range = [1,d];
30 | y_range = [-h,+h];
31 | figure;
32 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal');
33 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal');
34 |  
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/chapter07/rvmRegPred.m:
--------------------------------------------------------------------------------
 1 | function [y, sigma, p] = rvmRegPred(model, X, t)
 2 | % Compute RVM regression model reponse y = w'*X+w0 and likelihood 
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: d x n testing data
 6 | %   t (optional): 1 x n testing response
 7 | % Output:
 8 | %   y: 1 x n prediction
 9 | %   sigma: variance
10 | %   p: 1 x n likelihood of t
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | index = model.index;
13 | w = model.w;
14 | w0 = model.w0;
15 | 
16 | X = X(index,:);
17 | y = w'*X+w0;
18 | %% probability prediction
19 | if nargout > 1
20 |     beta = model.beta;
21 |     U = model.U;        % 3.54
22 |     Xo = bsxfun(@minus,X,model.xbar);
23 |     XU = U'\Xo;
24 |     sigma = sqrt((1+dot(XU,XU,1))/beta);   %3.59
25 | end
26 | 
27 | if nargin == 3 && nargout == 3
28 |     p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
29 | end
30 | 


--------------------------------------------------------------------------------
/chapter06/knRegPred.m:
--------------------------------------------------------------------------------
 1 | function [y, sigma, p] = knRegPred(model, Xt, t)
 2 | % Prediction for Gaussian Process (kernel) regression model
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   Xt: d x n testing data
 6 | %   t (optional): 1 x n testing response
 7 | % Output:
 8 | %   y: 1 x n prediction
 9 | %   sigma: variance
10 | %   p: 1 x n likelihood of t
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | kn = model.kn;
13 | a = model.a;
14 | X = model.X;
15 | tbar = model.tbar;
16 | Kt = knCenter(kn,X,X,Xt);
17 | y = a'*Kt+tbar;
18 | %% probability prediction 
19 | if nargout > 1
20 |     alpha = model.alpha;
21 |     beta = model.beta;
22 |     U = model.U;
23 |     XU = U'\Kt;
24 |     sigma = sqrt(1/beta+(knCenter(kn,X,Xt)-dot(XU,XU,1))/alpha); 
25 | end
26 | 
27 | if nargin == 3 && nargout == 3
28 |     p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
29 | end


--------------------------------------------------------------------------------
/demo/ch07/rvmRegFp_spSignal_demo.m:
--------------------------------------------------------------------------------
 1 | % demos for ch07
 2 | 
 3 | %% sparse signal recovery demo
 4 | clear; close all; 
 5 | 
 6 | d = 512; % signal length
 7 | k = 20;  % number of spikes
 8 | n = 100; % number of measurements
 9 | %
10 | % random +/- 1 signal
11 | x = zeros(d,1);
12 | q = randperm(d);
13 | x(q(1:k)) = sign(randn(k,1)); 
14 | 
15 | % projection matrix
16 | A = unitize(randn(d,n),1);
17 | % noisy observations
18 | sigma = 0.005;
19 | e = sigma*randn(1,n);
20 | y = x'*A + e;
21 | 
22 | [model,llh] = rvmRegFp(A,y);
23 | plot(llh);
24 | 
25 | m = zeros(d,1);
26 | m(model.index) = model.w;
27 | 
28 | h = max(abs(x))+0.2;
29 | x_range = [1,d];
30 | y_range = [-h,+h];
31 | figure;
32 | subplot(2,1,1);plot(x); axis([x_range,y_range]); title('Original Signal');
33 | subplot(2,1,2);plot(m); axis([x_range,y_range]); title('Recovery Signal');
34 |  
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/chapter01/mutInfo.m:
--------------------------------------------------------------------------------
 1 | function z = mutInfo(x, y)
 2 | % Compute mutual information I(x,y) of two discrete variables x and y.
 3 | % Input:
 4 | %   x, y: two integer vector of the same length 
 5 | % Output:
 6 | %   z: mutual information z=I(x,y)
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | assert(numel(x) == numel(y));
 9 | n = numel(x);
10 | x = reshape(x,1,n);
11 | y = reshape(y,1,n);
12 | 
13 | l = min(min(x),min(y));
14 | x = x-l+1;
15 | y = y-l+1;
16 | k = max(max(x),max(y));
17 | 
18 | idx = 1:n;
19 | Mx = sparse(idx,x,1,n,k,n);
20 | My = sparse(idx,y,1,n,k,n);
21 | Pxy = nonzeros(Mx'*My/n); %joint distribution of x and y
22 | Hxy = -dot(Pxy,log2(Pxy));
23 | 
24 | Px = nonzeros(mean(Mx,1));
25 | Py = nonzeros(mean(My,1));
26 | 
27 | % entropy of Py and Px
28 | Hx = -dot(Px,log2(Px));
29 | Hy = -dot(Py,log2(Py));
30 | % mutual information
31 | z = Hx+Hy-Hxy;
32 | z = max(0,z);


--------------------------------------------------------------------------------
/chapter12/pcaEmC.m:
--------------------------------------------------------------------------------
 1 | function [W, Z, mu, mse] = pcaEmC(X, m)
 2 | % Perform Constrained EM like algorithm for PCA.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   m: dimension of target space
 6 | % Output:
 7 | %   W: d x m weight matrix
 8 | %   Z: m x n projected data matrix
 9 | %   mu: d x 1 mean vector
10 | %   mse: mean square error
11 | % Reference: 
12 | %   A Constrained EM Algorithm for Principal Component Analysis by Jong-Hoon Ahn & Jong-Hoon Oh
13 | % Written by Mo Chen (sth4nth@gmail.com).
14 | 
15 | d = size(X,1);
16 | mu = mean(X,2);
17 | X = bsxfun(@minus,X,mu);
18 | W = rand(d,m); 
19 | 
20 | tol = 1e-6;
21 | mse = inf;
22 | maxIter = 200;
23 | for iter = 1:maxIter
24 |     Z = tril(W'*W)\(W'*X);
25 |     W = (X*Z')/triu(Z*Z');
26 | 
27 |     last = mse;
28 |     E = X-W*Z;
29 |     mse = mean(dot(E(:),E(:)));
30 |     if abs(last-mse)<mse*tol; break; end;
31 | end
32 | fprintf('Converged in %d steps.\n',iter);


--------------------------------------------------------------------------------
/chapter14/adaboostBin.m:
--------------------------------------------------------------------------------
 1 | function model = adaboostBin(X, t)
 2 | % Adaboost for binary classification (weak learner: kmeans)
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   t: 1 x n label (0/1)
 6 | % Output:
 7 | %   model: trained model structure
 8 | % Written by Mo Chen (sth4nth@gmail.com).
 9 | t = t+1;
10 | k = 2;
11 | [d,n] = size(X);
12 | w = ones(1,n)/n;
13 | M = 100;
14 | Alpha = zeros(1,M);
15 | Theta = zeros(d,k,M);
16 | T = sparse(1:n,t,1,n,k,n);  % transform label into indicator matrix
17 | for m = 1:M
18 |     % weak learner
19 |     E = spdiags(w',0,n,n)*T;
20 |     E = E*spdiags(1./sum(E,1)',0,k,k);
21 |     c = X*E;
22 |     [~,y] = min(sqdist(c,X),[],1);
23 |     Theta(:,:,m) = c;
24 |     % adaboost
25 |     I = y~=t;
26 |     e = dot(w,I);
27 |     alpha = log((1-e)/e);
28 |     w = w.*exp(alpha*I);
29 |     w = w/sum(w);
30 |     Alpha(m) = alpha;
31 | end
32 | model.alpha = Alpha;
33 | model.theta = Theta;


--------------------------------------------------------------------------------
/chapter13/HMM/hmmViterbi.m:
--------------------------------------------------------------------------------
 1 | function [z, llh] = hmmViterbi(model, x)
 2 | % Viterbi algorithm (calculated in log scale to improve numerical stability).
 3 | % Input:
 4 | %   x: 1 x n integer vector which is the sequence of observations
 5 | %   model: model structure which contains
 6 | %       model.s: k x 1 start probability vector
 7 | %       model.A: k x k transition matrix
 8 | %       model.E: k x d emission matrix
 9 | % Output:
10 | %   z: 1 x n latent state
11 | %   llh:  loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | n = size(x,2);
14 | X = sparse(x,1:n,1);
15 | s = log(model.s);
16 | A = log(model.A);
17 | M = log(model.E*X);
18 | 
19 | k = numel(s);
20 | Z = zeros(k,n);
21 | Z(:,1) = 1:k;
22 | v = s(:)+M(:,1);
23 | for t = 2:n
24 |     [v,idx] = max(bsxfun(@plus,A,v),[],1);    % 13.68
25 |     v = v(:)+M(:,t);
26 |     Z = Z(idx,:);
27 |     Z(:,t) = 1:k;
28 | end
29 | [llh,idx] = max(v);
30 | z = Z(idx,:);
31 | 
32 | 


--------------------------------------------------------------------------------
/chapter01/nvi.m:
--------------------------------------------------------------------------------
 1 | function z = nvi(x, y)
 2 | % Compute normalized variation information z=(1-I(x,y)/H(x,y)) of two discrete variables x and y.
 3 | % Input:
 4 | %   x, y: two integer vector of the same length 
 5 | % Output:
 6 | %   z: normalized variation information z=(1-I(x,y)/H(x,y))
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | assert(numel(x) == numel(y));
 9 | n = numel(x);
10 | x = reshape(x,1,n);
11 | y = reshape(y,1,n);
12 | 
13 | l = min(min(x),min(y));
14 | x = x-l+1;
15 | y = y-l+1;
16 | k = max(max(x),max(y));
17 | 
18 | idx = 1:n;
19 | Mx = sparse(idx,x,1,n,k,n);
20 | My = sparse(idx,y,1,n,k,n);
21 | Pxy = nonzeros(Mx'*My/n); %joint distribution of x and y
22 | Hxy = -dot(Pxy,log2(Pxy));
23 | 
24 | Px = nonzeros(mean(Mx,1));
25 | Py = nonzeros(mean(My,1));
26 | 
27 | % entropy of Py and Px
28 | Hx = -dot(Px,log2(Px));
29 | Hy = -dot(Py,log2(Py));
30 | 
31 | % nvi
32 | z = 2-(Hx+Hy)/Hxy;
33 | z = max(0,z);


--------------------------------------------------------------------------------
/chapter03/linRegPred.m:
--------------------------------------------------------------------------------
 1 | function [y, sigma, p] = linRegPred(model, X, t)
 2 | % Compute linear regression model reponse y = w'*X+w0 and likelihood
 3 | % Input:
 4 | %   model: trained model structure
 5 | %   X: d x n testing data
 6 | %   t (optional): 1 x n testing response
 7 | % Output:
 8 | %   y: 1 x n prediction
 9 | %   sigma: variance
10 | %   p: 1 x n likelihood of t
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | w = model.w;
13 | w0 = model.w0;
14 | y = w'*X+w0;
15 | %% probability prediction
16 | if nargout > 1
17 |     beta = model.beta;
18 |     if isfield(model,'U')
19 |         U = model.U;        % 3.54
20 |         Xo = bsxfun(@minus,X,model.xbar);
21 |         XU = U'\Xo;
22 |         sigma = sqrt((1+dot(XU,XU,1))/beta);   % 3.59
23 |     else
24 |         sigma = sqrt(1/beta)*ones(1,size(X,2));
25 |     end
26 | end
27 | 
28 | if nargin == 3 && nargout == 3
29 |     p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
30 | end
31 | 
32 | 


--------------------------------------------------------------------------------
/chapter12/pcaEm.m:
--------------------------------------------------------------------------------
 1 | function [W, Z, mu, mse] = pcaEm(X, m)
 2 | % Perform EM-like algorithm for PCA (by Sam Roweis).
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   m: dimension of target space
 6 | % Output:
 7 | %   W: d x m weight matrix
 8 | %   Z: m x n projected data matrix
 9 | %   mu: d x 1 mean vector
10 | %   mse: mean square error
11 | % Reference: 
12 | %   Pattern Recognition and Machine Learning by Christopher M. Bishop 
13 | %   EM algorithms for PCA and SPCA by Sam Roweis 
14 | % Written by Mo Chen (sth4nth@gmail.com).
15 | d = size(X,1);
16 | mu = mean(X,2);
17 | X = bsxfun(@minus,X,mu);
18 | W = rand(d,m); 
19 | 
20 | tol = 1e-6;
21 | mse = inf;
22 | maxIter = 200;
23 | for iter = 1:maxIter
24 |     Z = (W'*W)\(W'*X);             % 12.58
25 |     W = (X*Z')/(Z*Z');              % 12.59
26 | 
27 |     last = mse;
28 |     E = X-W*Z;
29 |     mse = mean(dot(E(:),E(:)));
30 |     if abs(last-mse)<mse*tol; break; end;
31 | end
32 | fprintf('Converged in %d steps.\n',iter);
33 | 
34 | 


--------------------------------------------------------------------------------
/demo/ch01/info_demo.m:
--------------------------------------------------------------------------------
 1 | 
 2 | % demos for ch01
 3 | clear;
 4 | k = 10;  % variable range
 5 | n = 100;  % number of variables
 6 | 
 7 | x = ceil(k*rand(1,n));
 8 | y = ceil(k*rand(1,n));
 9 | 
10 | %% Entropy H(x), H(y)
11 | Hx = entropy(x);
12 | Hy = entropy(y);
13 | %% Joint entropy H(x,y)
14 | Hxy = jointEntropy(x,y);
15 | %% Conditional entropy H(x|y)
16 | Hx_y = condEntropy(x,y);
17 | %% Mutual information I(x,y)
18 | Ixy = mutInfo(x,y);
19 | %% Relative entropy (KL divergence) KL(p(x)|p(y))
20 | Dxy = relatEntropy(x,y);
21 | %% Normalized mutual information I_n(x,y)
22 | nIxy = nmi(x,y);
23 | %% Nomalized variation information I_v(x,y)
24 | vIxy = nvi(x,y);
25 | %% H(x|y) = H(x,y)-H(y)
26 | isequalf(Hx_y,Hxy-Hy)
27 | %% I(x,y) = H(x)-H(x|y)
28 | isequalf(Ixy,Hx-Hx_y)
29 | %% I(x,y) = H(x)+H(y)-H(x,y)
30 | isequalf(Ixy,Hx+Hy-Hxy)
31 | %% I_n(x,y) = I(x,y)/sqrt(H(x)*H(y))
32 | isequalf(nIxy,Ixy/sqrt(Hx*Hy))
33 | %% I_v(x,y) = (1-I(x,y)/H(x,y))
34 | isequalf(vIxy,1-Ixy/Hxy)
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/chapter03/linReg.m:
--------------------------------------------------------------------------------
 1 | function model = linReg(X, t, lambda)
 2 | % Fit linear regression model y=w'x+w0  
 3 | % Input:
 4 | %   X: d x n data
 5 | %   t: 1 x n response
 6 | %   lambda: regularization parameter
 7 | % Output:
 8 | %   model: trained model structure
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | if nargin < 3
11 |     lambda = 0;
12 | end
13 | d = size(X,1);
14 | idx = (1:d)';
15 | dg = sub2ind([d,d],idx,idx);
16 | 
17 | xbar = mean(X,2);
18 | tbar = mean(t,2);
19 | X = bsxfun(@minus,X,xbar);
20 | t = bsxfun(@minus,t,tbar);
21 | 
22 | XX = X*X';
23 | XX(dg) = XX(dg)+lambda;     % 3.54 XX=inv(S)/beta
24 | % w = XX\(X*t');
25 | U = chol(XX);
26 | w = U\(U'\(X*t'));  % 3.15 & 3.28
27 | w0 = tbar-dot(w,xbar);  % 3.19
28 | 
29 | model.w = w;
30 | model.w0 = w0;
31 | model.xbar = xbar;
32 | %% for probability prediction
33 | beta = 1/mean((t-w'*X).^2); % 3.21
34 | % alpha = lambda*beta;           % lambda=a/b P.153 3.55
35 | % model.alpha = alpha;
36 | model.beta = beta;
37 | model.U = U;
38 | 


--------------------------------------------------------------------------------
/chapter13/LDS/ldsRnd.m:
--------------------------------------------------------------------------------
 1 | function [Z, X] = ldsRnd(model, n)
 2 | % Generate a data sequence from linear dynamic system.
 3 | % Input:
 4 | %   d: dimension of data
 5 | %   k: dimension of latent variable
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   model: model structure
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | mu0 = model.mu0;
12 | P0 = model.P0;
13 | A = model.A;
14 | G = model.G;
15 | C = model.C;
16 | S = model.S;
17 | 
18 | k = size(G,1);
19 | d = size(S,1);
20 | 
21 | X = zeros(d,n);
22 | Z = zeros(k,n);
23 | Z(:,1) = gaussRnd(mu0,P0);                  % 13.80
24 | X(:,1) = gaussRnd(C*Z(:,1),S);
25 | for i = 2:n
26 |     Z(:,i) = gaussRnd(A*Z(:,i-1),G);        % 13.75, 13.78
27 |     X(:,i) = gaussRnd(C*Z(:,i),S);          % 13.76, 13.79
28 | end
29 | model.mu0 = mu0; % prior mean
30 | model.P0 = P0;  % prior covairance
31 | model.A = A; % transition matrix 
32 | model.G = G; % transition covariance
33 | model.C = C; % emission matrix
34 | model.S = S;  % emision covariance
35 | 


--------------------------------------------------------------------------------
/chapter11/mixGaussGb.m:
--------------------------------------------------------------------------------
 1 | function [label, Theta, w, llh] = mixGaussGb(X, opt)
 2 | % Collapsed Gibbs sampling for Dirichlet process (infinite) Gaussian mixture model (a.k.a. DPGM). 
 3 | % This is a wrapper function which calls underlying Dirichlet process mixture model.
 4 | % Input: 
 5 | %   X: d x n data matrix
 6 | %   opt(optional): prior parameters
 7 | % Output:
 8 | %   label: 1 x n cluster label
 9 | %   Theta: 1 x k structure of trained Gaussian components
10 | %   w: 1 x k component weight vector
11 | %   llh: loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | [d,n] = size(X);
14 | mu = mean(X,2);
15 | Xo = bsxfun(@minus,X,mu);
16 | s = sum(Xo(:).^2)/(d*n);
17 | if nargin == 1
18 |     kappa0 = 1;
19 |     m0 = mean(X,2);
20 |     nu0 = d;
21 |     S0 = s*eye(d);
22 |     alpha0 = 1;
23 | else
24 |     kappa0 = opt.kappa;
25 |     m0 = opt.m;
26 |     nu0 = opt.nu;
27 |     S0 = opt.S;
28 |     alpha0 = opt.alpha;
29 | end
30 | prior = GaussWishart(kappa0,m0,nu0,S0);
31 | [label, Theta, w, llh] = mixDpGb(X,alpha0,prior);


--------------------------------------------------------------------------------
/chapter06/knKmeans.m:
--------------------------------------------------------------------------------
 1 | function [label, model, energy] = knKmeans(X, init, kn)
 2 | % Perform kernel kmeans clustering.
 3 | % Input:
 4 | %   K: n x n data matrix
 5 | %   init: either number of clusters (k) or initial label (1xn)
 6 | % Output:
 7 | %   label: 1 x n sample labels
 8 | %   model: trained model structure
 9 | %   energy: optimization target value
10 | % Reference: Kernel Methods for Pattern Analysis
11 | % by John Shawe-Taylor, Nello Cristianini
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | K = kn(X,X);
14 | n = size(K,2);
15 | if numel(init)==1
16 |     k = init;
17 |     label = ceil(k*rand(1,n));
18 | elseif numel(init)==n
19 |     label = init;
20 | end
21 | last = zeros(1,n);
22 | while any(label ~= last)
23 |     [~,~,last(:)] = unique(label);   % remove empty clusters
24 |     E = sparse(last,1:n,1);
25 |     E = E./sum(E,2);
26 |     T = E*K;
27 |     [val, label] = max(T-dot(T,E,2)/2,[],1);
28 | end
29 | energy = trace(K)-2*sum(val); 
30 | model.kn = kn;
31 | model.label = label;
32 | model.X = X;
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/chapter09/kmedoids.m:
--------------------------------------------------------------------------------
 1 | function [label, index, energy] = kmedoids(X, init)
 2 | % Perform k-medoids clustering.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   init: k number of clusters or label (1 x n vector)
 6 | % Output:
 7 | %   label: 1 x n sample labels
 8 | %   index: index of medoids
 9 | %   energy: optimization target value
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | [d,n] = size(X);
12 | if numel(init)==1
13 |     k = init;
14 |     label = ceil(k*rand(1,n));
15 | elseif numel(init)==n
16 |     label = init;
17 | end
18 | X = X-mean(X,2);             % reduce chance of numerical problems
19 | v = dot(X,X,1);
20 | D = v+v'-2*(X'*X);            % Euclidean distance matrix
21 | D(sub2ind([n,n],1:n,1:n)) = 0;              % reduce chance of numerical problems
22 | last = zeros(1,n);
23 | while any(label ~= last)
24 |     [~,~,last(:)] = unique(label);   % remove empty clusters
25 |     [~, index] = min(D*sparse(1:n,last,1),[],1);  % find k medoids
26 |     [val, label] = min(D(index,:),[],1);                % assign labels
27 | end
28 | energy = sum(val);
29 | 


--------------------------------------------------------------------------------
/common/ld.m:
--------------------------------------------------------------------------------
 1 | % function [L, D] = ld(X)
 2 | % % LD factorization produces LDL'=X*X' which is the same as [L,D] = ldl(X*X');
 3 | % % the underlying algorithm is Gram-Schmidt orthogonalization
 4 | % [d,n] = size(X);
 5 | % m = min(d,n);
 6 | % L = eye(d,m);
 7 | % Q = zeros(m,n);
 8 | % D = zeros(m,1);
 9 | % for i = 1:m
10 | %     L(i,1:i-1) = X(i,:)*bsxfun(@times,Q(1:i-1,:),1./D(1:i-1))';
11 | %     Q(i,:) = X(i,:)-L(i,1:i-1)*Q(1:i-1,:);
12 | %     D(i) = dot(Q(i,:),Q(i,:));
13 | % end
14 | % L(m+1:d,:) = X(m+1:d,:)*bsxfun(@times,Q,1./D)';
15 | 
16 | function [L, D] = ld(X)
17 | % LD factorization produces LDL'=X*X' which is the same as [L,D] = ldl(X*X');
18 | % the underlying algorithm is modified Gram-Schmidt orthogonalization
19 | [d,n] = size(X);
20 | m = min(d,n);
21 | L = eye(d,m);
22 | Q = zeros(m,n);
23 | D = zeros(m,1);
24 | for i = 1:m
25 |     v = X(i,:);
26 |     for j = 1:i-1
27 |         L(i,j) = v*Q(j,:)'/D(j);
28 |         v = v-L(i,j)*Q(j,:);
29 |     end
30 |     Q(i,:) = v;
31 |     D(i) = dot(Q(i,:),Q(i,:));
32 | end
33 | L(m+1:d,:) = X(m+1:d,:)*bsxfun(@times,Q,1./D)';


--------------------------------------------------------------------------------
/chapter09/mixGaussPred.m:
--------------------------------------------------------------------------------
 1 | function [label, R] = mixGaussPred(model, X)
 2 | % Predict label and responsibility for Gaussian mixture model.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   model: trained model structure outputed by the EM algirthm
 6 | % Output:
 7 | %   label: 1 x n cluster label
 8 | %   R: k x n responsibility
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | mu = model.mu;
11 | Sigma = model.Sigma;
12 | w = model.w;
13 | 
14 | n = size(X,2);
15 | k = size(mu,2);
16 | logRho = zeros(n,k);
17 | 
18 | for i = 1:k
19 |     logRho(:,i) = loggausspdf(X,mu(:,i),Sigma(:,:,i));
20 | end
21 | logRho = bsxfun(@plus,logRho,log(w));
22 | T = logsumexp(logRho,2);
23 | logR = bsxfun(@minus,logRho,T);
24 | R = exp(logR);
25 | [~,label(1,:)] = max(R,[],2);
26 | 
27 | function y = loggausspdf(X, mu, Sigma)
28 | d = size(X,1);
29 | X = bsxfun(@minus,X,mu);
30 | [U,p]= chol(Sigma);
31 | if p ~= 0
32 |     error('ERROR: Sigma is not PD.');
33 | end
34 | Q = U'\X;
35 | q = dot(Q,Q,1);  % quadratic term (M distance)
36 | c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
37 | y = -(c+q)/2;


--------------------------------------------------------------------------------
/common/plotClass.m:
--------------------------------------------------------------------------------
 1 | function plotClass(X, label)
 2 | % Plot 2d/3d samples of different classes with different colors.
 3 | % Written by Mo Chen (sth4nth@gmail.com).
 4 | [d,n] = size(X);
 5 | if nargin == 1
 6 |     label = ones(n,1);
 7 | end
 8 | assert(n == length(label));
 9 | 
10 | color = 'brgmcyk';
11 | m = length(color);
12 | c = max(label);
13 | 
14 | figure(gcf);
15 | clf;
16 | hold on;
17 | switch d
18 |     case 2
19 |         view(2);
20 |         for i = 1:c
21 |             idc = label==i;
22 | %             plot(X(1,label==i),X(2,label==i),['.' color(i)],'MarkerSize',15);
23 |             scatter(X(1,idc),X(2,idc),36,color(mod(i-1,m)+1));
24 |         end
25 |     case 3
26 |         view(3);
27 |         for i = 1:c
28 |             idc = label==i;
29 | %             plot3(X(1,idc),X(2,idci),X(3,idc),['.' idc],'MarkerSize',15);
30 |             scatter3(X(1,idc),X(2,idc),X(3,idc),36,color(mod(i-1,m)+1));
31 |         end
32 |     otherwise
33 |         error('ERROR: only support data of 2D or 3D.');
34 | end
35 | axis equal
36 | grid on
37 | hold off


--------------------------------------------------------------------------------
/chapter01/nmi.m:
--------------------------------------------------------------------------------
 1 | function z = nmi(x, y)
 2 | % Compute normalized mutual information I(x,y)/sqrt(H(x)*H(y)) of two discrete variables x and y.
 3 | % Input:
 4 | %   x, y: two integer vector of the same length 
 5 | % Ouput:
 6 | %   z: normalized mutual information z=I(x,y)/sqrt(H(x)*H(y))
 7 | % Written by Mo Chen (sth4nth@gmail.com).
 8 | assert(numel(x) == numel(y));
 9 | n = numel(x);
10 | x = reshape(x,1,n);
11 | y = reshape(y,1,n);
12 | 
13 | l = min(min(x),min(y));
14 | x = x-l+1;
15 | y = y-l+1;
16 | k = max(max(x),max(y));
17 | 
18 | idx = 1:n;
19 | Mx = sparse(idx,x,1,n,k,n);
20 | My = sparse(idx,y,1,n,k,n);
21 | Pxy = nonzeros(Mx'*My/n); %joint distribution of x and y
22 | Hxy = -dot(Pxy,log2(Pxy));
23 | 
24 | 
25 | % hacking, to elimative the 0log0 issue
26 | Px = nonzeros(mean(Mx,1));
27 | Py = nonzeros(mean(My,1));
28 | 
29 | % entropy of Py and Px
30 | Hx = -dot(Px,log2(Px));
31 | Hy = -dot(Py,log2(Py));
32 | 
33 | % mutual information
34 | MI = Hx + Hy - Hxy;
35 | 
36 | % normalized mutual information
37 | z = sqrt((MI/Hx)*(MI/Hy));
38 | z = max(0,z);
39 | 
40 | 


--------------------------------------------------------------------------------
/chapter13/HMM/hmmFilter.m:
--------------------------------------------------------------------------------
 1 | function [alpha, llh] = hmmFilter(model, x)
 2 | % HMM forward filtering algorithm. 
 3 | % The alpha returned by this function is the normalized version (posterior): alpha(t)=p(z_t|x_{1:t})
 4 | % Unnormalized version (joint distribution): alpha(t)=p(z_t,x_{1:t}) is numerical unstable.
 5 | % Input:
 6 | %   x: 1 x n integer vector which is the sequence of observations
 7 | %   model: model structure which contains
 8 | %       model.s: k x 1 start probability vector
 9 | %       model.A: k x k transition matrix
10 | %       model.E: k x d emission matrix
11 | % Output:
12 | %   alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:t})
13 | %   llh: loglikelihood
14 | % Written by Mo Chen (sth4nth@gmail.com).
15 | s = model.s;
16 | A = model.A;
17 | E = model.E;
18 | 
19 | n = size(x,2);
20 | X = sparse(x,1:n,1);
21 | M = E*X;
22 | 
23 | [K,T] = size(M);
24 | At = A';
25 | llh = zeros(1,T);
26 | alpha = zeros(K,T);
27 | [alpha(:,1),llh(1)] = normalize(s.*M(:,1),1);
28 | for t = 2:T
29 |     [alpha(:,t),llh(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);    % 13.59
30 | end
31 | llh = sum(log(llh(llh>0)));


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Mo Chen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/chapter06/knCenter.m:
--------------------------------------------------------------------------------
 1 | function Kc = knCenter(kn, X, X1, X2)
 2 | % Centerize the data in the kernel space
 3 | % Input:
 4 | %   kn: kernel function
 5 | %   X: d x n data matrix of which the center in the kernel space is computed
 6 | %   X1, X2: d x n1 and d x n2 data matrix. the kernel k(x1,x2) is computed
 7 | %       where the origin of the kernel space is the center of phi(X)
 8 | % Ouput:
 9 | %   Kc: n1 x n2 kernel matrix between X1 and X2 in kernel space centered by
10 | %       center of phi(X)
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | K = kn(X,X);
13 | mK = mean(K);
14 | mmK = mean(mK);
15 | if nargin == 2     % compute the pairwise centerized version of the kernel of X. eq knCenter(kn,X,X,X)
16 |     Kc = K+mmK-bsxfun(@plus,mK',mK);        % Kc = K-M*K-K*M+M*K*M; where M = ones(n,n)/n; 
17 | elseif nargin == 3  % compute the norms (k(x,x)) of X1 w.r.t. the center of X as the origin. eq diag(knCenter(kn,X,X1,X1))
18 |     Kc = kn(X1)+mmK-2*mean(kn(X,X1));
19 | elseif nargin == 4  % compute the kernel of X1 and X2 w.r.t. the center of X as the origin
20 |     Kc = kn(X1,X2)+mmK-bsxfun(@plus,mean(kn(X,X1))',mean(kn(X,X2)));
21 | end
22 | 


--------------------------------------------------------------------------------
/chapter11/mixDpGbOl.m:
--------------------------------------------------------------------------------
 1 | function [label, Theta, w, llh] = mixDpGbOl(X, alpha, theta)
 2 | % Online collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 
 3 | % Input: 
 4 | %   X: d x n data matrix
 5 | %   alpha: parameter for Dirichlet process prior
 6 | %   theta: class object for prior of component distribution (such as Gauss)
 7 | % Output:
 8 | %   label: 1 x n cluster label
 9 | %   Theta: 1 x k structure of trained components
10 | %   w: 1 x k component weight vector
11 | %   llh: loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | n = size(X,2);
14 | Theta = {};
15 | nk = [];
16 | label = zeros(1,n);
17 | llh = 0;
18 | for i = randperm(n)
19 |     x = X(:,i);
20 |     Pk = log(nk)+cellfun(@(t) t.logPredPdf(x), Theta);
21 |     P0 = log(alpha)+theta.logPredPdf(x);
22 |     p = [Pk,P0];
23 |     llh = llh+sum(p-log(n));
24 |     k = discreteRnd(exp(p-logsumexp(p)));
25 |     if k == numel(Theta)+1
26 |         Theta{k} = theta.clone().addSample(x);
27 |         nk = [nk,1];
28 |     else
29 |         Theta{k} = Theta{k}.addSample(x);
30 |         nk(k) = nk(k)+1;
31 |     end
32 |     label(i) = k;
33 | end
34 | w = nk/n;


--------------------------------------------------------------------------------
/demo/ch11/gauss_demo.m:
--------------------------------------------------------------------------------
 1 | 
 2 | %% Sequential update for Gaussian 
 3 | close all; clear;
 4 | d = 2;
 5 | n = 100;
 6 | X = randn(d,n);
 7 | x = randn(d,1);
 8 | 
 9 | mu = mean(X,2);
10 | Xo = bsxfun(@minus,X,mu);
11 | Sigma = Xo*Xo'/n;
12 | p1 = logGauss(x,mu,Sigma);
13 | 
14 | gauss = Gauss(X(:,3:end)).addSample(X(:,1)).addSample(X(:,2)).addSample(X(:,3)).delSample(X(:,3));
15 | p2 = gauss.logPdf(x);
16 | maxdiff(p1,p2)
17 | %% Sequential update for Gaussian-Wishart
18 | close all; clear;
19 | d = 2;
20 | n = 100;
21 | X = randn(d,n);
22 | x = randn(d,1);
23 | 
24 | kappa0 = 1;
25 | m0 = zeros(d,1);
26 | nu0 = d;
27 | S0 = eye(d);
28 | 
29 | xbar = mean(X,2);
30 | kappa = kappa0+n;
31 | nu = nu0+n;
32 | m = (n*xbar+kappa0*m0)/kappa;
33 | Xo = bsxfun(@minus,X,m);
34 | X0 = m0-m;
35 | S = S0+Xo*Xo'+kappa0*(X0*X0');
36 | 
37 | v = (nu-d+1);
38 | r = (1+1/kappa)/v;
39 | p1 = logSt(x,m,r*S,v);
40 | 
41 | gw0 = GaussWishart(kappa0,m0,nu0,S0);
42 | gw0 = gw0.addData(X);
43 | p0 = gw0.logPredPdf(x);
44 | 
45 | gw = GaussWishart(kappa0,m0,nu0,S0);
46 | for i=1:n
47 |     gw = gw.addSample(X(:,i));
48 | end
49 | p2 = gw.logPredPdf(x);
50 | maxdiff(p1,p2)
51 | % 
52 | 


--------------------------------------------------------------------------------
/chapter04/logitBin.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = logitBin(X, y, lambda)
 2 | % Logistic regression for binary classification optimized by Newton-Raphson method.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   y: 1 x n label (0/1)
 6 | %   lambda: regularization parameter
 7 | %   alpha: step size
 8 | % Output:
 9 | %   model: trained model structure
10 | %   llh: loglikelihood
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | if nargin < 4
13 |     alpha = 1e-1;
14 | end
15 | if nargin < 3
16 |     lambda = 1e-4;
17 | end
18 | X = [X; ones(1,size(X,2))];
19 | [d,n] = size(X);
20 | tol = 1e-4;
21 | epoch = 200;
22 | llh = -inf(1,epoch);
23 | w = rand(d,1);
24 | for t = 2:epoch
25 |     a = w'*X;
26 |     llh(t) = (dot(a,y)-sum(log1pexp(a))-0.5*lambda*dot(w,w))/n; % 4.90
27 |     if abs(llh(t)-llh(t-1)) < tol; break; end
28 |     z = sigmoid(a);                     % 4.87
29 |     g = X*(z-y)'+lambda*w;              % 4.96
30 |     r = z.*(1-z);                       % 4.98
31 |     Xw = bsxfun(@times, X, sqrt(r));
32 |     H = Xw*Xw'+lambda*eye(d);           % 4.97
33 |     w = w-alpha*(H\g);                  % 4.92
34 | end
35 | llh = llh(2:t);
36 | model.w = w;
37 | 


--------------------------------------------------------------------------------
/chapter09/kmeans.m:
--------------------------------------------------------------------------------
 1 | function [label, mu, energy] = kmeans(X, m)
 2 | % Perform kmeans clustering.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   m: initialization parameter
 6 | % Output:
 7 | %   label: 1 x n sample labels
 8 | %   mu: d x k center of clusters
 9 | %   energy: optimization target value
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | label = init(X, m);
12 | n = numel(label);
13 | idx = 1:n;
14 | last = zeros(1,n);
15 | while any(label ~= last)
16 |     [~,~,last(:)] = unique(label);                  % remove empty clusters
17 |     mu = X*normalize(sparse(idx,last,1),1);         % compute cluster centers 
18 |     [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1);  % assign sample labels
19 | end
20 | energy = dot(X(:),X(:),1)+2*sum(val);
21 | 
22 | function label = init(X, m)
23 | [d,n] = size(X);
24 | if numel(m) == 1                           % random initialization
25 |     mu = X(:,randperm(n,m));
26 |     [~,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); 
27 | elseif all(size(m) == [1,n])               % init with labels
28 |     label = m;
29 | elseif size(m,1) == d                      % init with seeds (centers)
30 |     [~,label] = min(dot(m,m,1)'/2-m'*X,[],1); 
31 | end


--------------------------------------------------------------------------------
/chapter09/mixGaussRnd.m:
--------------------------------------------------------------------------------
 1 | function [X, z, model] = mixGaussRnd(d, k, n)
 2 | % Genarate samples form a Gaussian mixture model.
 3 | % Input:
 4 | %   d: dimension of data
 5 | %   k: number of components
 6 | %   n: number of data
 7 | % Output:
 8 | %   X: d x n data matrix
 9 | %   z: 1 x n response variable
10 | %   model: model structure
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | alpha0 = 1;  % hyperparameter of Dirichlet prior
13 | W0 = eye(d);  % hyperparameter of inverse Wishart prior of covariances
14 | v0 = d+1;  % hyperparameter of inverse Wishart prior of covariances
15 | mu0 = zeros(d,1);  % hyperparameter of Guassian prior of means
16 | beta0 = nthroot(k,d); % hyperparameter of Guassian prior of means % in volume x^d there is k points: x^d=k
17 | 
18 | 
19 | w = dirichletRnd(alpha0,ones(1,k)/k);
20 | z = discreteRnd(w,n);
21 | 
22 | mu = zeros(d,k);
23 | Sigma = zeros(d,d,k);
24 | X = zeros(d,n);
25 | for i = 1:k
26 |     idx = z==i;
27 |     Sigma(:,:,i) = iwishrnd(W0,v0); % invpd(wishrnd(W0,v0));
28 |     mu(:,i) = gaussRnd(mu0,beta0*Sigma(:,:,i));
29 |     X(:,idx) = gaussRnd(mu(:,i),Sigma(:,:,i),sum(idx));
30 | end
31 | model.mu = mu;
32 | model.Sigma = Sigma;
33 | model.weight = w;


--------------------------------------------------------------------------------
/chapter10/mixGaussVbPred.m:
--------------------------------------------------------------------------------
 1 | function [z, R] = mixGaussVbPred(model, X)
 2 | % Predict label and responsibility for Gaussian mixture model trained by VB.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   model: trained model structure outputed by the EM algirthm
 6 | % Output:
 7 | %   label: 1 x n cluster label
 8 | %   R: k x n responsibility
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | alpha = model.alpha; % Dirichlet
11 | kappa = model.kappa;   % Gaussian
12 | m = model.m;         % Gasusian
13 | v = model.v;         % Whishart
14 | U = model.U;         % Whishart 
15 | logW = model.logW;
16 | n = size(X,2);
17 | [d,k] = size(m);
18 | 
19 | EQ = zeros(n,k);
20 | for i = 1:k
21 |     Q = (U(:,:,i)'\bsxfun(@minus,X,m(:,i)));
22 |     EQ(:,i) = d/kappa(i)+v(i)*dot(Q,Q,1);    % 10.64
23 | end
24 | ElogLambda = sum(psi(0,0.5*bsxfun(@minus,v+1,(1:d)')),1)+d*log(2)+logW; % 10.65
25 | Elogpi = psi(0,alpha)-psi(0,sum(alpha)); % 10.66
26 | logRho = -0.5*bsxfun(@minus,EQ,ElogLambda-d*log(2*pi)); % 10.46
27 | logRho = bsxfun(@plus,logRho,Elogpi);   % 10.46
28 | logR = bsxfun(@minus,logRho,logsumexp(logRho,2)); % 10.49
29 | R = exp(logR);
30 | z = zeros(1,n);
31 | [~,z(:)] = max(R,[],2);
32 | [~,~,z(:)] = unique(z);
33 | 
34 | 


--------------------------------------------------------------------------------
/chapter09/mixBernEm.m:
--------------------------------------------------------------------------------
 1 | function [label, model, llh] = mixBernEm(X, k)
 2 | % Perform EM algorithm for fitting the Bernoulli mixture model.
 3 | % Input: 
 4 | %   X: d x n binary (0/1) data matrix 
 5 | %   k: number of cluster
 6 | % Output:
 7 | %   label: 1 x n cluster label
 8 | %   model: trained model structure
 9 | %   llh: loglikelihood
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | %% initialization
12 | fprintf('EM for mixture model: running ... \n');
13 | X = sparse(X);
14 | n = size(X,2);
15 | label = ceil(k*rand(1,n));  % random initialization
16 | R = full(sparse(1:n,label,1));
17 | tol = 1e-8;
18 | maxiter = 500;
19 | llh = -inf(1,maxiter);
20 | for iter = 2:maxiter
21 |     model = maximization(X,R);
22 |     [R, llh(iter)] = expectation(X,model);
23 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end;
24 | end
25 | [~,label(:)] = max(R,[],2);
26 | llh = llh(2:iter);
27 | 
28 | function [R, llh] = expectation(X, model)
29 | mu = model.mu;
30 | w = model.w;
31 | R = X'*log(mu)+(1-X)'*log(1-mu)+log(w);
32 | T = logsumexp(R,2);
33 | llh = mean(T); % loglikelihood
34 | R = exp(R-T);
35 | 
36 | function model = maximization(X, R)
37 | nk = sum(R,1);
38 | w = nk/sum(nk);
39 | mu = (X*R)./nk;
40 | model.mu = mu;
41 | model.w = w;


--------------------------------------------------------------------------------
/chapter08/MRF/mrfMf.m:
--------------------------------------------------------------------------------
 1 | function [nodeBel, edgeBel, L] = mrfMf(A, nodePot, edgePot, epoch)
 2 | % Mean field for MRF
 3 | % Assuming egdePot is symmetric
 4 | % Input: 
 5 | %   A: n x n adjacent matrix of undirected graph, where value is edge index
 6 | %   nodePot: k x n node potential
 7 | %   edgePot: k x k x m edge potential
 8 | % Output:
 9 | %   nodeBel: k x n node belief
10 | %   edgeBel: k x k x m edge belief
11 | % Written by Mo Chen (sth4nth@gmail.com)
12 | if nargin < 4
13 |     epoch = 10;
14 | end
15 | L = -inf(1,epoch+1);
16 | [nodeBel,lnZ] = softmax(nodePot,1);    % initialization    
17 | for iter = 1:epoch
18 |     for i = 1:size(nodePot,2)
19 |         [~,j,e] = find(A(i,:));             % neighbors
20 |         [nodeBel(:,i),lnZ(i)] = softmax(nodePot(:,i)+reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1));
21 |     end
22 | %     E = dot(nodeBel,nodePot,1);
23 | %     H = -dot(nodeBel,log(nodeBel),1);
24 | %     L(iter+1) = sum(lnZ+E+H)/2;
25 |     L(iter+1) = mrfGibbs(A,nodePot,edgePot,nodeBel);
26 | %     if abs(L(iter+1)-L(iter))/abs(L(iter)) < tol; break; end
27 | end
28 | L = L(1,2:iter+1);
29 | 
30 | [s,t,e] = find(triu(A));
31 | edgeBel = zeros(size(edgePot));
32 | for l = 1:numel(e)
33 |     edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
34 | end


--------------------------------------------------------------------------------
/chapter13/HMM/hmmSmoother.m:
--------------------------------------------------------------------------------
 1 | function [gamma, alpha, beta, c] = hmmSmoother(model, x)
 2 | % HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm).
 3 | % The alpha and beta returned by this function are the normalized version.
 4 | % Input:
 5 | %   x: 1 x n integer vector which is the sequence of observations
 6 | %   model: model structure which contains
 7 | %       model.s: k x 1 start probability vector
 8 | %       model.A: k x k transition matrix
 9 | %       model.E: k x d emission matrix
10 | % Output:
11 | %   gamma: k x n matrix of posterior gamma(t)=p(z_t,x_{1:T})
12 | %   alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:T})
13 | %   beta: k x n matrix of posterior beta(t)=gamma(t)/alpha(t)
14 | %   c: 1 x n normalization constant vector
15 | % Written by Mo Chen (sth4nth@gmail.com).
16 | s = model.s;
17 | A = model.A;
18 | E = model.E;
19 | 
20 | n = size(x,2);
21 | X = sparse(x,1:n,1);
22 | M = E*X;
23 | 
24 | [K,T] = size(M);
25 | At = A';
26 | c = zeros(1,T); % normalization constant
27 | alpha = zeros(K,T);
28 | [alpha(:,1),c(1)] = normalize(s.*M(:,1),1);
29 | for t = 2:T
30 |     [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);  % 13.59
31 | end
32 | beta = ones(K,T);
33 | for t = T-1:-1:1
34 |     beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1);   % 13.62
35 | end
36 | gamma = alpha.*beta;                  % 13.64
37 | 
38 | 


--------------------------------------------------------------------------------
/demo/ch08/mrf_demo.m:
--------------------------------------------------------------------------------
 1 | % Done!
 2 | clear; close all;
 3 | % load letterA.mat;
 4 | % X = A;
 5 | load letterX.mat
 6 | %% Original image
 7 | img = double(X);
 8 | img = sign(img-mean(img(:)));
 9 | 
10 | figure;
11 | subplot(2,2,1);
12 | imagesc(img);
13 | title('Original image');
14 | axis image;
15 | colormap gray;
16 | %% Noisy image
17 | sigma = 1; % noise level
18 | x = img + sigma*randn(size(img)); % noisy signal
19 | subplot(2,2,2);
20 | imagesc(x);
21 | title('Noisy image');
22 | axis image;
23 | colormap gray;
24 | %% Construct MRF data
25 | epoch = 20;
26 | J = 1;   % ising parameter
27 | [A,nodePot,edgePot] = mrfIsGa(x,sigma,J);
28 | %% Mean Field
29 | [nodeBel0,edgeBel0,lnZ0] = mrfMf(A,nodePot,edgePot,epoch);
30 | 
31 | L0 = mrfGibbs(A,nodePot,edgePot,nodeBel0);
32 | L1 = mrfBethe(A,nodePot,edgePot,nodeBel0,edgeBel0);
33 | maxdiff(L0,lnZ0(end))
34 | maxdiff(L0,L1)
35 | 
36 | subplot(2,2,3);
37 | imagesc(reshape(nodeBel0(1,:),size(img)));
38 | title('Mean Field');
39 | axis image;
40 | colormap gray;
41 | %% Belief Propagation
42 | [nodeBel1,edgeBel1,lnZ1] = mrfBp(A,nodePot,edgePot,epoch);
43 | 
44 | subplot(2,2,4);
45 | imagesc(reshape(nodeBel1(1,:),size(img)));
46 | title('Belief Propagation');
47 | axis image;
48 | colormap gray;
49 | %% Energy comparation
50 | figure
51 | epochs = 1:epoch;
52 | plot( epochs,lnZ0,'-', ...
53 |       epochs,lnZ1,'-');
54 | xlabel('epoch');       %  add axis labels and plot title
55 | ylabel('energy');
56 | title('Energy Comparation');
57 | legend('MF','BP');


--------------------------------------------------------------------------------
/chapter02/logSt.m:
--------------------------------------------------------------------------------
 1 | function y = logSt(X, mu, sigma, v)
 2 | % Compute log pdf of a Student's t distribution.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   mu: mean
 6 | %   sigma: variance
 7 | %   v: degree of freedom
 8 | % Output:
 9 | %   y: probability density in logrithm scale y=log p(x)
10 | % Written by mo Chen (sth4nth@gmail.com).
11 | [d,k] = size(mu);
12 | 
13 | if size(sigma,1)==d && size(sigma,2)==d && k==1
14 |     [R,p]= chol(sigma);
15 |     if p ~= 0
16 |         error('ERROR: sigma is not SPD.');
17 |     end
18 |     X = bsxfun(@minus,X,mu);
19 |     Q = R'\X;
20 |     q = dot(Q,Q,1);  % quadratic term (M distance)
21 |     o = -log(1+q/v)*((v+d)/2);
22 |     c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(v*pi)+2*sum(log(diag(R))))/2;
23 |     y = c+o;
24 | elseif size(sigma,1)==d && size(sigma,2)==k
25 |     lambda = 1./sigma;
26 |     ml = mu.*lambda;
27 |     q = bsxfun(@plus,X'.^2*lambda-2*X'*ml,dot(mu,ml,1)); % M distance
28 |     o = bsxfun(@times,log(1+bsxfun(@times,q,1./v)),-(v+d)/2);
29 |     c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(pi*v)+sum(log(sigma),1))/2;
30 |     y = bsxfun(@plus,o,c);
31 | elseif size(sigma,1)==1 && size(sigma,2)==k
32 |     X2 = repmat(dot(X,X,1)',1,k);
33 |     D = bsxfun(@plus,X2-2*X'*mu,dot(mu,mu,1));
34 |     q = bsxfun(@times,D,1./sigma);  % M distance
35 |     o = bsxfun(@times,log(1+bsxfun(@times,q,1./v)),-(v+d)/2);
36 |     c = gammaln((v+d)/2)-gammaln(v/2)-d*log(pi*v.*sigma)/2;
37 |     y = bsxfun(@plus,o,c);
38 | else
39 |     error('Parameters are mismatched.');
40 | end
41 | 


--------------------------------------------------------------------------------
/demo/ch06/knLin_demo.m:
--------------------------------------------------------------------------------
 1 | %% Kernel regression with linear kernel is EQUIVALENT to linear regression
 2 | clear; close all;
 3 | n = 100;
 4 | x = linspace(0,2*pi,n);   % test data
 5 | t = sin(x)+rand(1,n)/2;
 6 | 
 7 | lambda = 1e-4;
 8 | model_kn = knReg(x,t,lambda,@knLin);
 9 | model_lin = linReg(x,t,lambda);
10 | 
11 | idx = 1:2:n;
12 | xt = x(:,idx);
13 | tt = t(idx);
14 | 
15 | [y_kn, sigma_kn,p_kn] = knRegPred(model_kn,xt,tt);
16 | [y_lin, sigma_lin,p_lin] = linRegPred(model_lin,xt,tt);
17 | 
18 | maxdiff(y_kn,y_lin)
19 | maxdiff(sigma_kn,sigma_lin)
20 | maxdiff(p_kn,p_lin)
21 | %% Kernel kmeans with linear kernel is EQUIVALENT to kmeans
22 | clear; close all;
23 | d = 2;
24 | k = 3;
25 | n = 500;
26 | [X,y] = kmeansRnd(d,k,n);
27 | init = ceil(k*rand(1,n));
28 | [y_kn,model_kn,en_kn] = knKmeans(X,init,@knLin);
29 | [y_lin,model_lin,en_lin] = kmeans(X,init);
30 | 
31 | idx = 1:2:n;
32 | Xt = X(:,idx);
33 | 
34 | [t_kn,ent_kn] = knKmeansPred(model_kn, Xt);
35 | [t_lin,ent_lin] = kmeansPred(model_lin, Xt);
36 | 
37 | maxdiff(y_kn,y_lin)
38 | maxdiff(en_kn,en_lin)
39 | 
40 | maxdiff(t_kn,t_lin)
41 | maxdiff(ent_kn,ent_lin)
42 | %% Kernel PCA with linear kernel is EQUIVALENT TO PCA
43 | clear; close all;
44 | d = 10;
45 | q = 2;
46 | n = 500;
47 | X = randn(d,n);
48 | 
49 | 
50 | model_kn = knPca(X,q,@knLin);
51 | idx = 1:2:n;
52 | Xt = X(:,idx);
53 | 
54 | Y_kn = knPcaPred(model_kn,Xt);
55 | 
56 | [U,L,mu,mse] = pca(X,q);
57 | Y_lin = U'*bsxfun(@minus,Xt,mu);   % projection
58 | 
59 | 
60 | R = Y_lin/Y_kn;    % the results are equivalent up to a rotation.
61 | maxdiff(R*R', eye(q))
62 | 


--------------------------------------------------------------------------------
/chapter03/linRegFp.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = linRegFp(X, t, alpha, beta)
 2 | % Fit empirical Bayesian linear model with Mackay fixed point method (p.168)
 3 | % Input:
 4 | %   X: d x n data
 5 | %   t: 1 x n response
 6 | %   alpha: prior parameter
 7 | %   beta: prior parameter
 8 | % Output:
 9 | %   model: trained model structure
10 | %   llh: loglikelihood
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | if nargin < 3
13 |     alpha = 0.02;
14 |     beta = 0.5;
15 | end
16 | [d,n] = size(X);
17 | 
18 | xbar = mean(X,2);
19 | tbar = mean(t,2);
20 | 
21 | X = bsxfun(@minus,X,xbar);
22 | t = bsxfun(@minus,t,tbar);
23 | 
24 | XX = X*X';
25 | Xt = X*t';
26 | 
27 | 
28 | tol = 1e-4;
29 | maxiter = 200;
30 | llh = -inf(1,maxiter);
31 | for iter = 2:maxiter
32 |     A = beta*XX+diag(alpha);  % 3.81 3.54
33 |     U = chol(A);
34 | 
35 |     m = beta*(U\(U'\Xt));  % 3.84
36 |     m2 = dot(m,m);
37 |     e = sum((t-m'*X).^2);   
38 |     
39 |     logdetA = 2*sum(log(diag(U)));    
40 |     llh(iter) = 0.5*(d*log(alpha)+n*log(beta)-alpha*m2-beta*e-logdetA-n*log(2*pi)); % 3.86
41 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
42 | 
43 |     V = inv(U);               % A=inv(S)
44 |     trS = dot(V(:),V(:));  
45 |     gamma = d-alpha*trS;  % 3.91 9.64
46 |     alpha = gamma/m2;    % 3.92
47 |     beta = (n-gamma)/e;   % 3.95
48 | 
49 | end
50 | w0 = tbar-dot(m,xbar);
51 | 
52 | llh = llh(2:iter);
53 | model.w0 = w0;
54 | model.w = m;
55 | %% optional for bayesian probabilistic prediction purpose
56 | model.alpha = alpha;
57 | model.beta = beta;
58 | model.xbar = xbar;
59 | model.U = U;


--------------------------------------------------------------------------------
/chapter14/mixLogitBin.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = mixLogitBin(X, t, k)
 2 | % Mixture of logistic regression model for binary classification optimized by Newton-Raphson method
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   t: 1 x n label (0/1)
 6 | %   k: number of mixture component
 7 | % Output:
 8 | %   model: trained model structure
 9 | %   llh: loglikelihood
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | n = size(X,2);
12 | X = [X; ones(1,n)];
13 | d = size(X,1);
14 | z = ceil(k*rand(1,n));
15 | R = full(sparse(1:n,z,1,n,k,n)); %  n x k
16 | 
17 | W = zeros(d,k);
18 | tol = 1e-4;
19 | maxiter = 100;
20 | llh = -inf(1,maxiter);
21 | 
22 | t = t(:);
23 | h = ones(n,1);
24 | h(t==0) = -1;
25 | A = X'*W;
26 | for iter = 2:maxiter
27 |     % maximization
28 |     nk = sum(R,1);
29 |     alpha = nk/n;
30 |     Y = sigmoid(A);
31 |     for j = 1:k
32 |         W(:,j) = newtonStep(X, t, Y(:,j), W(:,j), R(:,j));
33 |     end
34 |     % expectation
35 |     A = X'*W;
36 |     logRho = -log1pexp(-bsxfun(@times,A,h));
37 |     logRho = bsxfun(@plus,logRho,log(alpha));
38 |     T = logsumexp(logRho,2);
39 |     llh(iter) = sum(T)/n; % loglikelihood
40 |     logR = bsxfun(@minus,logRho,T);
41 |     R = exp(logR);
42 |     
43 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end
44 | end
45 | llh = llh(2:iter);
46 | model.alpha = alpha; % mixing coefficient
47 | model.W = W;  % logistic model coefficent
48 | 
49 | 
50 | function w = newtonStep(X, t, y, w, r)
51 | lambda = 1e-6;
52 | v = y.*(1-y).*r;
53 | H = bsxfun(@times,X,v')*X'+lambda*eye(size(X,1));
54 | s = (y-t).*r;
55 | g = X*s;
56 | w = w-H\g;
57 | 
58 | 


--------------------------------------------------------------------------------
/chapter09/linRegEm.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = linRegEm(X, t, alpha, beta)
 2 | % Fit empirical Bayesian linear regression model with EM (p.448 chapter 9.3.4)
 3 | % Input:
 4 | %   X: d x n data
 5 | %   t: 1 x n response
 6 | %   alpha: prior parameter
 7 | %   beta: prior parameter
 8 | % Output:
 9 | %   model: trained model structure
10 | %   llh: loglikelihood
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | if nargin < 3
13 |     alpha = 0.02;
14 |     beta = 0.5;
15 | end
16 | [d,n] = size(X);
17 | I = eye(d);
18 | xbar = mean(X,2);
19 | tbar = mean(t,2);
20 | 
21 | X = bsxfun(@minus,X,xbar);
22 | t = bsxfun(@minus,t,tbar);
23 | 
24 | XX = X*X';
25 | Xt = X*t';
26 | 
27 | tol = 1e-4;
28 | maxiter = 100;
29 | llh = -inf(1,maxiter+1);
30 | for iter = 2:maxiter
31 |     A = beta*XX+alpha*eye(d);
32 |     U = chol(A);
33 |     
34 |     m = beta*(U\(U'\Xt));
35 |     m2 = dot(m,m);
36 |     e2 = sum((t-m'*X).^2);
37 |     
38 |     logdetA = 2*sum(log(diag(U)));    
39 |     llh(iter) = 0.5*(d*log(alpha)+n*log(beta)-alpha*m2-beta*e2-logdetA-n*log(2*pi));  % 3.86
40 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
41 |     
42 |     invU = U'\I;
43 |     trS = dot(invU(:),invU(:));    % A=inv(S)
44 |     alpha = d/(m2+trS);   % 9.63
45 |     
46 |     invUX = U'\X;
47 |     trXSX = dot(invUX(:),invUX(:));
48 |     beta = n/(e2+trXSX);  % 9.68 is wrong
49 | end
50 | w0 = tbar-dot(m,xbar);
51 | 
52 | llh = llh(2:iter);
53 | model.w0 = w0;
54 | model.w = m;
55 | %% optional for bayesian probabilistic inference purpose
56 | model.alpha = alpha;
57 | model.beta = beta;
58 | model.xbar = xbar;
59 | model.U = U;
60 | 


--------------------------------------------------------------------------------
/chapter14/mixLinReg.m:
--------------------------------------------------------------------------------
 1 | function [label, model, llh] = mixLinReg(X, y, k, lambda)
 2 | % Mixture of linear regression
 3 | % input:
 4 | %   X: d x n data matrix
 5 | %   y: 1 x n responding vector
 6 | %   k: number of mixture component
 7 | %   lambda: regularization parameter
 8 | % output:
 9 | %   label: 1 x n cluster label
10 | %   model: trained model structure
11 | %   llh: loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | if nargin < 4
14 |     lambda = 1;
15 | end
16 | n = size(X,2);
17 | X = [X;ones(1,n)]; % adding the bias term
18 | d = size(X,1);
19 | label = ceil(k*rand(1,n));  % random initialization
20 | R = full(sparse(label,1:n,1,k,n,n));
21 | tol = 1e-6;
22 | maxiter = 500;
23 | llh = -inf(1,maxiter);
24 | Lambda = lambda*eye(d);
25 | W = zeros(d,k);
26 | Xy = bsxfun(@times,X,y);
27 | beta = 1;
28 | for iter = 2:maxiter
29 |     % maximization
30 |     nk = sum(R,2);
31 |     alpha = nk/n;
32 |     for j = 1:k
33 |         Xw = bsxfun(@times,X,sqrt(R(j,:)));
34 |         U = chol(Xw*Xw'+Lambda);
35 |         W(:,j) = U\(U'\(Xy*R(j,:)'));  % 3.15 & 3.28
36 |     end
37 |     D = bsxfun(@minus,W'*X,y).^2;
38 |     % expectation
39 |     logRho = (-0.5)*beta*D;
40 |     logRho = bsxfun(@plus,logRho,log(alpha));
41 |     T = logsumexp(logRho,1);
42 |     logR = bsxfun(@minus,logRho,T);
43 |     R = exp(logR);
44 |     llh(iter) = sum(T)/n;
45 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end
46 | end
47 | llh = llh(2:iter);
48 | model.alpha = alpha; % mixing coefficient
49 | model.beta = beta; % mixture component precision
50 | model.W = W;  % linear model coefficent
51 | [~,label] = max(R,[],1);
52 | model.label = label;
53 | 


--------------------------------------------------------------------------------
/chapter11/mixDpGb.m:
--------------------------------------------------------------------------------
 1 | function [label, Theta, w, llh] = mixDpGb(X, alpha, theta)
 2 | % Collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 
 3 | % Any component model can be used, such as Gaussian.
 4 | % Input: 
 5 | %   X: d x n data matrix
 6 | %   alpha: parameter for Dirichlet process prior
 7 | %   theta: class object for prior of component distribution (such as Gauss)
 8 | % Output:
 9 | %   label: 1 x n cluster label
10 | %   Theta: 1 x k structure of trained components
11 | %   w: 1 x k component weight vector
12 | %   llh: loglikelihood
13 | % Written by Mo Chen (sth4nth@gmail.com).
14 | n = size(X,2);
15 | [label,Theta,w] = mixDpGbOl(X,alpha,theta);
16 | nk = n*w;
17 | maxIter = 50;
18 | llh = zeros(1,maxIter);
19 | for iter = 1:maxIter
20 |     for i = randperm(n)
21 |         x = X(:,i);
22 |         k = label(i);
23 |         Theta{k} = Theta{k}.delSample(x);
24 |         nk(k) = nk(k)-1;
25 |         if nk(k) == 0           % remove empty cluster
26 |             Theta(k) = [];
27 |             nk(k) = [];
28 |             which = label>k;
29 |             label(which) = label(which)-1;
30 |         end
31 |         Pk = log(nk)+cellfun(@(t) t.logPredPdf(x), Theta);
32 |         P0 = log(alpha)+theta.logPredPdf(x);
33 |         p = [Pk,P0];
34 |         llh(iter) = llh(iter)+sum(p-log(n));
35 |         k = discreteRnd(exp(p-logsumexp(p)));
36 |         if k == numel(Theta)+1                 % add extra cluster
37 |             Theta{k} = theta.clone().addSample(x);
38 |             nk = [nk,1];
39 |         else
40 |             Theta{k} = Theta{k}.addSample(x);
41 |             nk(k) = nk(k)+1;
42 |         end
43 |         label(i) = k;
44 |     end
45 | end
46 | w = nk/n;
47 | 
48 | 


--------------------------------------------------------------------------------
/demo/ch13/lds_demo.m:
--------------------------------------------------------------------------------
 1 | close all;
 2 | % Parameter
 3 | clear; 
 4 | d = 2;
 5 | k = 3;
 6 | n = 100;
 7 | 
 8 | A = [1,0,1; 
 9 |      0 1,0;
10 |      0,0,1];
11 | G = eye(k)*1e-3;
12 |  
13 | C = [1,0,0;
14 |      0 1,0];
15 | S = eye(d)*1e-1;
16 | 
17 | mu0 = [0;0;0];
18 | P0 = eye(k);
19 | 
20 | model.A = A;
21 | model.G = G;
22 | model.C = C;
23 | model.S = S;
24 | model.mu0 = mu0;
25 | model.P0 = P0;
26 | 
27 | %% Generate data
28 | [z,x] = ldsRnd(model,n);
29 | figure;
30 | hold on
31 | plot(x(1,:), x(2,:), 'ro');
32 | plot(z(1,:), z(2,:), 'b*-');
33 | legend('observed', 'latent')
34 | title('Generated Data')
35 | axis equal
36 | hold off
37 | %% Kalman filter
38 | [mu, V, llh] = kalmanFilter(model,x);
39 | figure
40 | hold on
41 | plot(x(1,:), x(2,:), 'ro');
42 | plot(mu(1,:), mu(2,:), 'b*-');
43 | legend('observed', 'filtered')
44 | title('Kalman filter')
45 | axis equal
46 | hold off
47 | %% Kalman smoother
48 | [nu, U, llh] = kalmanSmoother(model,x);
49 | figure
50 | hold on
51 | plot(x(1,:), x(2,:), 'ro');
52 | plot(nu(1,:), nu(2,:), 'b*-');
53 | legend('observed', 'smoothed')
54 | title('Kalman smoother')
55 | axis equal
56 | hold off
57 | %% LDS Subspace
58 | [A,C,nu] = ldsPca(x,k,3*k);
59 | y = C*nu;
60 | t = size(y,2);
61 | figure;
62 | hold on
63 | plot(x(1,1:t), x(2,1:t), 'ro');
64 | plot(y(1,1:t), y(2,1:t), 'b*-');
65 | legend('observed', 'projected')
66 | title('LDS subspace learning')
67 | axis equal
68 | hold off
69 | %% LDS EM
70 | [tmodel, llh] = ldsEm(x,k);
71 | nu = kalmanSmoother(tmodel,x);
72 | y = tmodel.C*nu;
73 | figure
74 | hold on
75 | plot(x(1,:), x(2,:), 'ro');
76 | plot(y(1,:), y(2,:), 'b*-');
77 | legend('observed', 'learned')
78 | title('LDS EM learning')
79 | axis equal
80 | hold off
81 | figure;
82 | plot(llh);
83 | 


--------------------------------------------------------------------------------
/chapter12/fa.m:
--------------------------------------------------------------------------------
 1 | function [W, mu, psi, llh] = fa(X, m)
 2 | % Perform EM algorithm for factor analysis model
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   m: dimension of target space
 6 | % Output:
 7 | %   W: d x m weight matrix
 8 | %   mu: d x 1 mean vector
 9 | %   psi: d x 1 variance vector
10 | %   llh: loglikelihood
11 | % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop 
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | [d,n] = size(X);
14 | mu = mean(X,2);
15 | X = bsxfun(@minus,X,mu);
16 | 
17 | tol = 1e-4;
18 | maxiter = 500;
19 | llh = -inf(1,maxiter);
20 | 
21 | I = eye(m);
22 | r = dot(X,X,2);
23 | 
24 | W = randn(d,m); 
25 | lambda = 1./rand(d,1);
26 | for iter = 2:maxiter
27 |     T = bsxfun(@times,W,sqrt(lambda));
28 |     M = T'*T+I;                     % M = W'*inv(Psi)*W+I
29 |     U = chol(M);
30 |     WInvPsiX = bsxfun(@times,W,lambda)'*X;       % WInvPsiX = W'*inv(Psi)*X
31 |     
32 |     % likelihood
33 |     logdetC = 2*sum(log(diag(U)))-sum(log(lambda));              % log(det(C))
34 |     Q = U'\WInvPsiX;
35 |     trInvCS = (r'*lambda-dot(Q(:),Q(:)))/n;  % trace(inv(C)*S)
36 |     llh(iter) = -n*(d*log(2*pi)+logdetC+trInvCS)/2;
37 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
38 |     
39 |     % E step
40 |     Ez = M\WInvPsiX;                                         % 12.66
41 |     V = inv(U);
42 |     Ezz = n*(V*V')+Ez*Ez';                                        % 12.67
43 |     
44 |     % M step    
45 |     U = chol(Ezz);  
46 |     XEz = X*Ez';
47 |     W = (XEz/U)/U';                                         % 12.69
48 |     lambda = n./(r-dot(W,XEz,2));                           % 12.70
49 | end
50 | llh = llh(2:iter);
51 | psi = 1./lambda;


--------------------------------------------------------------------------------
/chapter05/mlpReg.m:
--------------------------------------------------------------------------------
 1 | function [model, L] = mlpReg(X, y, k, lambda)
 2 | % Train a multilayer perceptron neural network for regression with backpropagation
 3 | % tanh activation function is used
 4 | % Input:
 5 | %   X: d x n data matrix
 6 | %   y: 1 x n real value response vector
 7 | %   k: T x 1 vector to specify number of hidden nodes in each layer
 8 | %   lambda: regularization parameter
 9 | % Ouput:
10 | %   model: model structure
11 | %   L: (regularized least square) loss
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | if nargin < 4
14 |     lambda = 1e-2;
15 | end
16 | eta = 1e-5;
17 | tol = 1e-5;
18 | maxiter = 50000;
19 | L = inf(1,maxiter);
20 | 
21 | k = [size(X,1);k(:);size(y,1)];
22 | T = numel(k)-1;
23 | W = cell(T,1);
24 | b = cell(T,1);
25 | for t = 1:T
26 |     W{t} = randn(k(t),k(t+1));
27 |     b{t} = randn(k(t+1),1);
28 | end
29 | R = cell(T,1);
30 | Z = cell(T+1,1);
31 | Z{1} = X;
32 | for iter = 2:maxiter
33 | %     forward
34 |     for t = 1:T-1
35 |         Z{t+1} = tanh(W{t}'*Z{t}+b{t});             % 5.10 5.113
36 |     end
37 |     Z{T+1} = W{T}'*Z{T}+b{T};                       % 5.114
38 | 
39 | %     loss
40 |     E = Z{T+1}-y;     
41 |     Wn = cellfun(@(x) dot(x(:),x(:)),W);            % |W|^2
42 |     L(iter) = dot(E(:),E(:))+lambda*sum(Wn);
43 |     if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end
44 |     
45 | %     backward
46 |     R{T} = E;                
47 |     for t = T-1:-1:1
48 |         df = 1-Z{t+1}.^2;    % h'(a)
49 |         R{t} = df.*(W{t+1}*R{t+1});    % 5.66
50 |     end
51 |     
52 | %     gradient descent
53 |     for t=1:T
54 |         dW = Z{t}*R{t}'+lambda*W{t};    % 5.67
55 |         db = sum(R{t},2);
56 |         W{t} = W{t}-eta*dW;             % 5.43
57 |         b{t} = b{t}-eta*db;
58 |     end
59 | end
60 | L = L(2:iter);
61 | model.W = W;
62 | model.b = b;
63 | 


--------------------------------------------------------------------------------
/chapter12/ppcaEm.m:
--------------------------------------------------------------------------------
 1 | function [W, mu, beta, llh] = ppcaEm(X, m)
 2 | % Perform EM algorithm to maiximize likelihood of probabilistic PCA model.
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   m: dimension of target space
 6 | % Output:
 7 | %   W: d x m weight matrix
 8 | %   mu: d x 1 mean vector
 9 | %   beta: precition vector (inverse of variance
10 | %   llh: loglikelihood
11 | % Reference: 
12 | %   Pattern Recognition and Machine Learning by Christopher M. Bishop 
13 | %   Probabilistic Principal Component Analysis by Michael E. Tipping & Christopher M. Bishop
14 | % Written by Mo Chen (sth4nth@gmail.com).
15 | [d,n] = size(X);
16 | mu = mean(X,2);
17 | X = bsxfun(@minus,X,mu);
18 | 
19 | tol = 1e-4;
20 | maxiter = 500;
21 | llh = -inf(1,maxiter);
22 | I = eye(m);
23 | r = dot(X(:),X(:)); % total norm of X
24 | W = randn(d,m); 
25 | s = 1/randg;
26 | for iter = 2:maxiter
27 |     M = W'*W+s*I;
28 |     U = chol(M);
29 |     WX = W'*X;
30 |     
31 |     % likelihood
32 |     logdetC = 2*sum(log(diag(U)))+(d-m)*log(s);
33 |     T = U'\WX;
34 |     trInvCS = (r-dot(T(:),T(:)))/(s*n);
35 |     llh(iter) = -n*(d*log(2*pi)+logdetC+trInvCS)/2;                     % 12.43 12.44
36 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
37 |     
38 |     % E step
39 |     Ez = M\WX;                                     % 12.54
40 |     V = inv(U);                                % inv(M) = V*V'
41 |     Ezz = n*s*(V*V')+Ez*Ez'; % n*s because we are dealing with all n E[zi*zi']    % 12. 55
42 |     
43 |     % M step
44 |     U = chol(Ezz);                                           
45 |     W = ((X*Ez')/U)/U';                                 % 12.56
46 |     WR = W*U';
47 |     s = (r-2*dot(Ez(:),WX(:))+dot(WR(:),WR(:)))/(n*d);         % 12.57
48 | end
49 | llh = llh(2:iter);
50 | beta = 1/s;


--------------------------------------------------------------------------------
/chapter11/Gauss.m:
--------------------------------------------------------------------------------
 1 | % Class for Gaussian distribution used by Dirichlet process
 2 | classdef Gauss
 3 |      properties
 4 |          n_
 5 |          mu_
 6 |          U_
 7 |      end
 8 |      
 9 |      methods
10 |          function obj = Gauss(X)
11 |              n = size(X,2);
12 |              mu = mean(X,2);
13 |              U = chol(X*X');
14 |              
15 |              obj.n_ = n;
16 |              obj.mu_ = mu;
17 |              obj.U_ = U;
18 |          end
19 |          
20 |          function obj = clone(obj)
21 |          end
22 |         
23 |          function obj = addSample(obj, x)
24 |              n = obj.n_;
25 |              mu = obj.mu_;
26 |              U = obj.U_;
27 |              
28 |              n = n+1;
29 |              mu = mu+(x-mu)/n;
30 |              U = cholupdate(U,x,'+');
31 |              
32 |              obj.n_ = n;
33 |              obj.mu_ = mu;
34 |              obj.U_ = U;
35 |          end
36 |          
37 |          function obj = delSample(obj, x)
38 |              n = obj.n_;
39 |              mu = obj.mu_;
40 |              U = obj.U_;
41 | 
42 |              n = n-1;
43 |              mu = mu-(x-mu)/n;
44 |              U = cholupdate(U,x,'-');
45 |              
46 |              obj.n_ = n;
47 |              obj.mu_ = mu;
48 |              obj.U_ = U;
49 |          end
50 |          
51 |          function y = logPdf(obj,X)
52 |              n = obj.n_;
53 |              mu = obj.mu_;
54 |              U = obj.U_;
55 |              d = size(X,1);
56 |              
57 |              U = cholupdate(U/sqrt(n),mu,'-');       % Sigma=X*X'/n-mu*mu'
58 |              Q = U'\bsxfun(@minus,X,mu);
59 |              q = dot(Q,Q,1);  % quadratic term (M distance)
60 |              c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
61 |              y = -0.5*(c+q);
62 |          end
63 |      end
64 | end


--------------------------------------------------------------------------------
/chapter09/rvmRegEm.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = rvmRegEm(X, t, alpha, beta)
 2 | % Relevance Vector Machine (ARD sparse prior) for regression
 3 | % trained by empirical bayesian (type II ML) using EM
 4 | % Input:
 5 | %   X: d x n data
 6 | %   t: 1 x n response
 7 | %   alpha: prior parameter
 8 | %   beta: prior parameter
 9 | % Output:
10 | %   model: trained model structure
11 | %   llh: loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | if nargin < 3
14 |     alpha = 0.02;
15 |     beta = 0.5;
16 | end
17 | [d,n] = size(X);
18 | xbar = mean(X,2);
19 | tbar = mean(t,2);
20 | X = bsxfun(@minus,X,xbar);
21 | t = bsxfun(@minus,t,tbar);
22 | XX = X*X';
23 | Xt = X*t';
24 | 
25 | tol = 1e-3;
26 | maxiter = 500;
27 | llh = -inf(1,maxiter+1);
28 | index = 1:d;
29 | alpha = alpha*ones(d,1);
30 | for iter = 2 : maxiter
31 |     nz = 1./alpha > tol ;   % nonzeros
32 |     index = index(nz);
33 |     alpha = alpha(nz);
34 |     XX = XX(nz,nz);
35 |     Xt = Xt(nz);
36 |     X = X(nz,:);
37 |     % E-step
38 |     U = chol(beta*(XX)+diag(alpha));        % 7.83
39 |     m = beta*(U\(U'\(X*t')));   % E[m]     % 7.82
40 |     m2 = m.^2;       
41 |     e2 = sum((t-m'*X).^2);
42 | 
43 |     logdetS = 2*sum(log(diag(U)));    
44 |     llh(iter) = 0.5*(sum(log(alpha))+n*log(beta)-beta*e2-logdetS-dot(alpha,m2)-n*log(2*pi));  % 3.86
45 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
46 |     % M-step
47 |     V = inv(U);
48 |     dgS = dot(V,V,2);
49 |     alpha = 1./(m2+dgS);    % 9.67
50 |     UX = U'\X;
51 |     trXSX = dot(UX(:),UX(:));
52 |     beta = n/(e2+trXSX);    % 9.68 is wrong
53 | end
54 | llh = llh(2:iter);
55 | 
56 | model.index = index;
57 | model.w0 = tbar-dot(m,xbar(nz));
58 | model.w = m;
59 | model.alpha = alpha;
60 | model.beta = beta;
61 | %% optional for bayesian probabilistic prediction purpose
62 | model.xbar = xbar;
63 | model.U = U;


--------------------------------------------------------------------------------
/chapter05/mlpClass.m:
--------------------------------------------------------------------------------
 1 | function [model, L] = mlpClass(X, y, k, lambda)
 2 | % Train a multilayer perceptron neural network for multiclass classification with backpropagation
 3 | % logistic activation function is used.
 4 | % Input:
 5 | %   X: d x n data matrix
 6 | %   y: 1 x n label vector
 7 | %   k: T x 1 vector to specify number of hidden nodes in each layer
 8 | %   lambda: regularization parameter
 9 | % Ouput:
10 | %   model: model structure
11 | %   L: (regularized cross entropy) loss
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | if nargin < 4
14 |     lambda = 1e-2;
15 | end
16 | eta = 1e-3;
17 | tol = 1e-4;
18 | maxiter = 50000;
19 | L = inf(1,maxiter);
20 | 
21 | Y = sparse(y,1:numel(y),1);
22 | k = [size(X,1);k(:);size(Y,1)];
23 | T = numel(k)-1;
24 | W = cell(T,1);
25 | b = cell(T,1);
26 | for t = 1:T
27 |     W{t} = randn(k(t),k(t+1));
28 |     b{t} = randn(k(t+1),1);
29 | end
30 | R = cell(T,1);
31 | Z = cell(T+1,1);
32 | Z{1} = X;
33 | for iter = 2:maxiter
34 | %     forward
35 |     for t = 1:T-1
36 |         Z{t+1} = sigmoid(W{t}'*Z{t}+b{t});         % 5.10 5.113
37 |     end
38 |     Z{T+1} = softmax(W{T}'*Z{T}+b{T});   
39 |     
40 | %     loss
41 |     E = Z{T+1};
42 |     Wn = cellfun(@(x) dot(x(:),x(:)),W);            % |W|^2
43 |     L(iter) = -dot(Y(:),log(E(:)))+0.5*lambda*sum(Wn);
44 |     if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end
45 | 
46 | %     backward
47 |     R{T} = Z{T+1}-Y;                
48 |     for t = T-1:-1:1
49 |         df = Z{t+1}.*(1-Z{t+1});    % h'(a)
50 |         R{t} = df.*(W{t+1}*R{t+1});     % 5.66
51 |     end
52 |     
53 | %     gradient descent
54 |     for t=1:T
55 |         dW = Z{t}*R{t}'+lambda*W{t};      % 5.67
56 |         db = sum(R{t},2);
57 |         W{t} = W{t}-eta*dW;               % 5.43
58 |         b{t} = b{t}-eta*db;
59 |     end
60 | end
61 | L = L(2:iter);
62 | model.W = W;
63 | model.b = b;
64 | 


--------------------------------------------------------------------------------
/chapter07/rvmRegFp.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = rvmRegFp(X, t, alpha, beta)
 2 | % Relevance Vector Machine (ARD sparse prior) for regression
 3 | % training by empirical bayesian (type II ML) using Mackay fix point update.
 4 | % Input:
 5 | %   X: d x n data
 6 | %   t: 1 x n response
 7 | %   alpha: prior parameter
 8 | %   beta: prior parameter
 9 | % Output:
10 | %   model: trained model structure
11 | %   llh: loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | if nargin < 3
14 |     alpha = 0.02;
15 |     beta = 0.5;
16 | end
17 | [d,n] = size(X);
18 | xbar = mean(X,2);
19 | tbar = mean(t,2);
20 | X = bsxfun(@minus,X,xbar);
21 | t = bsxfun(@minus,t,tbar);
22 | XX = X*X';
23 | Xt = X*t';
24 | 
25 | tol = 1e-3;
26 | maxiter = 500;
27 | llh = -inf(1,maxiter);
28 | index = 1:d;
29 | alpha = alpha*ones(d,1);
30 | for iter = 2:maxiter
31 |     % remove zeros
32 |     nz = 1./alpha > tol;    % nonzeros
33 |     index = index(nz);
34 |     alpha = alpha(nz);
35 |     XX = XX(nz,nz);
36 |     Xt = Xt(nz);
37 |     X = X(nz,:);
38 |     
39 |     U = chol(beta*XX+diag(alpha));      % 7.83
40 |     m = beta*(U\(U'\Xt));               % 7.82    
41 |     m2 = m.^2;
42 |     e = sum((t-m'*X).^2);
43 |     
44 |     logdetS = 2*sum(log(diag(U)));    
45 |     llh(iter) = 0.5*(sum(log(alpha))+n*log(beta)-beta*e-logdetS-dot(alpha,m2)-n*log(2*pi)); % 3.86
46 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
47 | 
48 |     V = inv(U);    
49 |     dgSigma = dot(V,V,2);
50 |     gamma = 1-alpha.*dgSigma;   % 7.89
51 |     alpha = gamma./m2;           % 7.87
52 |     beta = (n-sum(gamma))/e;    % 7.88
53 | end
54 | llh = llh(2:iter);
55 | 
56 | model.index = index;
57 | model.w0 = tbar-dot(m,xbar(nz));
58 | model.w = m;
59 | model.alpha = alpha;
60 | model.beta = beta;
61 | %% optional for bayesian probabilistic prediction purpose
62 | model.xbar = xbar(index);
63 | model.U = U;


--------------------------------------------------------------------------------
/chapter13/HMM/hmmEm.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = hmmEm(x, init)
 2 | % EM algorithm to fit the parameters of HMM model (a.k.a Baum-Welch algorithm)
 3 | % Input:
 4 | %   x: 1 x n integer vector which is the sequence of observations
 5 | %   init: model or k
 6 | % Output:s
 7 | %   model: trained model structure
 8 | %   llh: loglikelihood
 9 | % Written by Mo Chen (sth4nth@gmail.com).
10 | n = size(x,2);
11 | X = sparse(x,1:n,1);
12 | d = size(X,1);
13 | if isstruct(init)   % init with a model
14 |     A = init.A;
15 |     E = init.E;
16 |     s = init.s;
17 | elseif numel(init) == 1  % random init with latent k
18 |     k = init;
19 |     s = normalize(rand(k,1),1);  
20 |     A = normalize(rand(k,k),2);
21 |     E = normalize(rand(k,d),2);
22 | end
23 | tol = 1e-4;
24 | maxIter = 1000;
25 | llh = -inf(1,maxIter);
26 | for iter = 2:maxIter
27 |     M = E*X;
28 | %     E-step
29 |     [gamma,alpha,beta,c] = hmmSmoother(M,A,s);
30 |     llh(iter) = mean(log(c));
31 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
32 | %     M-step 
33 |     s = gamma(:,1);                                                                             % 13.18
34 |     A = normalize(A.*(alpha(:,1:n-1)*(beta(:,2:n).*M(:,2:n)./c(2:n))'),2);      % 13.19 13.43 13.65
35 |     E = (gamma*X')./sum(gamma,2);                            % 13.23
36 | end
37 | model.s = s;
38 | model.A = A;
39 | model.E = E;
40 | llh = llh(2:iter);
41 | 
42 | function [gamma, alpha, beta, c] = hmmSmoother(M, A, s)
43 | [K,T] = size(M);
44 | At = A';
45 | c = zeros(1,T);
46 | alpha = zeros(K,T);
47 | [alpha(:,1),c(1)] = normalize(s.*M(:,1),1);
48 | for t = 2:T
49 |     [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);  % 13.59
50 | end
51 | beta = ones(K,T);
52 | for t = T-1:-1:1
53 |     beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1);   % 13.62
54 | end
55 | gamma = alpha.*beta;                  % 13.64
56 | 


--------------------------------------------------------------------------------
/chapter08/MRF/mrfBp.m:
--------------------------------------------------------------------------------
 1 | function [nodeBel, edgeBel, L] = mrfBp(A, nodePot, edgePot, epoch)
 2 | % Undirected graph belief propagation for MRF
 3 | % Assuming egdePot is symmetric
 4 | % Input: 
 5 | %   A: n x n adjacent matrix of undirected graph, where value is edge index
 6 | %   nodePot: k x n node potential
 7 | %   edgePot: k x k x m edge potential
 8 | % Output:
 9 | %   nodeBel: k x n node belief
10 | %   edgeBel: k x k x m edge belief
11 | %   L: variational lower bound (Bethe energy)
12 | % Written by Mo Chen (sth4nth@gmail.com)
13 | if nargin < 4
14 |     epoch = 10;
15 | end
16 | expNodePot = exp(nodePot);  
17 | expEdgePot = exp(edgePot);
18 | [k,n] = size(nodePot);
19 | m = size(edgePot,3);
20 | 
21 | [s,t,e] = find(triu(A));
22 | A = sparse([s;t],[t;s],[e;e+m]);       % digraph adjacent matrix, where value is message index
23 | mu = ones(k,2*m)/k;                     % message factor to node
24 | 
25 | nodeBel = zeros(k,n);
26 | edgeBel = zeros(k,k,m);
27 | L = -inf(1,epoch+1);
28 | for iter = 1:epoch
29 |     for i = 1:n
30 |         in = nonzeros(A(:,i));                      % incoming message index
31 |         nb = expNodePot(:,i).*prod(mu(:,in),2);                       % product of incoming message
32 |         for l = in'
33 |             ep = expEdgePot(:,:,ud(l,m));
34 |             mu(:,rd(l,m)) = normalize(ep*(nb./mu(:,l)));
35 |         end
36 |         nodeBel(:,i) = nb/sum(nb);
37 |     end
38 |     
39 |     for l = 1:m
40 |         st = e(l);
41 |         nut = nodeBel(:,t(l))./mu(:,st);
42 |         nus = nodeBel(:,s(l))./mu(:,st+m);
43 |         eb = expEdgePot(:,:,st).*(nus*nut');
44 |         edgeBel(:,:,st) = eb./sum(eb(:));
45 |     end
46 |     L(iter+1) = mrfBethe(A,nodePot,edgePot,nodeBel,edgeBel);
47 | end
48 | L = L(1,2:iter+1);
49 | 
50 | function i = rd(i, m)
51 | % reverse direction edge index
52 | i = mod(i+m-1,2*m)+1;
53 | 
54 | function i = ud(i, m)
55 | % undirected edge index
56 | i = mod(i-1,m)+1;


--------------------------------------------------------------------------------
/chapter13/LDS/kalmanFilter.m:
--------------------------------------------------------------------------------
 1 | function [mu, V, llh] = kalmanFilter(model, X)
 2 | % Kalman filter (forward algorithm for linear dynamic system)
 3 | % NOTE: This is the exact implementation of the Kalman filter algorithm in PRML.
 4 | % However, this algorithm is not practical. It is numerical unstable. 
 5 | % Input:
 6 | %   X: d x n data matrix
 7 | %   model: model structure
 8 | % Output:
 9 | %   mu: q x n matrix of latent mean mu_t=E[z_t] w.r.t p(z_t|x_{1:t})
10 | %   V: q x q x n latent covariance U_t=cov[z_t] w.r.t p(z_t|x_{1:t})
11 | %   llh: loglikelihood
12 | % Written by Mo Chen (sth4nth@gmail.com).
13 | A = model.A; % transition matrix 
14 | G = model.G; % transition covariance
15 | C = model.C; % emission matrix
16 | S = model.S;  % emision covariance
17 | mu0 = model.mu0; % prior mean
18 | P = model.P0;  % prior covairance
19 | 
20 | n = size(X,2);
21 | k = size(mu0,1);
22 | mu = zeros(k,n);
23 | V = zeros(k,k,n);
24 | llh = zeros(1,n);
25 | I = eye(k);
26 | 
27 | PC = P*C';
28 | R = C*PC+S;
29 | K = PC/R;                                        % 13.97
30 | mu(:,1) = mu0+K*(X(:,1)-C*mu0);                     % 13.94
31 | V(:,:,1) = (I-K*C)*P;                               % 13.95
32 | llh(1) = logGauss(X(:,1),C*mu0,R);
33 | for i = 2:n
34 |     [mu(:,i), V(:,:,i), llh(i)] = ...
35 |         forwardUpdate(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S, I);
36 | end
37 | llh = sum(llh);
38 | 
39 | function [mu, V, llh] = forwardUpdate(x, mu, V, A, G, C, S, I)
40 | P = A*V*A'+G;                                               % 13.88
41 | PC = P*C';                                                      
42 | R = C*PC+S;
43 | K = PC/R;                                                   % 13.92
44 | Amu = A*mu;
45 | CAmu = C*Amu;                                               
46 | mu = Amu+K*(x-CAmu);                                        % 13.89
47 | V = (I-K*C)*P;                                              % 13.90
48 | llh = logGauss(x,CAmu,R);                                   % 13.91


--------------------------------------------------------------------------------
/chapter10/rvmRegVb.m:
--------------------------------------------------------------------------------
 1 | function [model, energy] = rvmRegVb(X, t, prior)
 2 | % Variational Bayesian inference for RVM regression.
 3 | % Input:
 4 | %   X: d x n data
 5 | %   t: 1 x n response
 6 | %   prior: prior parameter
 7 | % Output:
 8 | %   model: trained model structure
 9 | %   energy: variational lower bound
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | if nargin < 3
12 |     a0 = 1e-4;
13 |     b0 = 1e-4;
14 |     c0 = 1e-4;
15 |     d0 = 1e-4;
16 | else
17 |     a0 = prior.a;
18 |     b0 = prior.b;
19 |     c0 = prior.c;
20 |     d0 = prior.d;
21 | end
22 | [m,n] = size(X);
23 | idx = (1:m)';
24 | dg = sub2ind([m,m],idx,idx);
25 | I = eye(m);
26 | xbar = mean(X,2);
27 | tbar = mean(t,2);
28 | 
29 | X = bsxfun(@minus,X,xbar);
30 | t = bsxfun(@minus,t,tbar);
31 | 
32 | XX = X*X';
33 | Xt = X*t';
34 | 
35 | maxiter = 100;
36 | energy = -inf(1,maxiter+1);
37 | tol = 1e-8;
38 | 
39 | a = a0+1/2;
40 | c = c0+n/2;
41 | Ealpha = 1e-2;
42 | Ebeta = 1e-2;
43 | for iter = 2:maxiter
44 | %     q(w)
45 |     invS = Ebeta*XX;
46 |     invS(dg) = invS(dg)+Ealpha;
47 |     U = chol(invS);
48 |     Ew = Ebeta*(U\(U'\Xt));
49 |     KLw = -sum(log(diag(U)));        
50 | %     q(alpha)
51 |     w2 = Ew.*Ew;
52 |     invU = U'\I;
53 |     dgS = dot(invU,invU,2);
54 |     b = b0+0.5*(w2+dgS);
55 |     Ealpha = a./b;
56 |     KLalpha = -sum(a*log(b));
57 | %     q(beta)
58 |     e2 = sum((t-Ew'*X).^2);    
59 |     invUX = U'\X;
60 |     trXSX = dot(invUX(:),invUX(:));
61 |     d = d0+0.5*(e2+trXSX);
62 |     Ebeta = c/d; 
63 |     KLbeta = -c*log(d);
64 | %     lower bound
65 |     energy(iter) = KLalpha+KLbeta+KLw;
66 |     if energy(iter)-energy(iter-1) < tol*abs(energy(iter-1)); break; end
67 | end
68 | const = m*(gammaln(a)-gammaln(a0)+a0*log(b0))+gammaln(c)-gammaln(c0)+c0*log(d0)+0.5*(m-n*log(2*pi));
69 | energy = energy(2:iter)+const;
70 | w0 = tbar-dot(Ew,xbar);
71 | 
72 | model.w0 = w0;
73 | model.w = Ew;
74 | model.alpha = Ealpha;
75 | model.beta = Ebeta;
76 | model.a = a;
77 | model.b = b;
78 | model.c = c;
79 | model.d = d;
80 | model.xbar = xbar;
81 | 


--------------------------------------------------------------------------------
/chapter10/linRegVb.m:
--------------------------------------------------------------------------------
 1 | function [model, energy] = linRegVb(X, t, prior)
 2 | % Variational Bayesian inference for linear regression.
 3 | % Input:
 4 | %   X: d x n data
 5 | %   t: 1 x n response
 6 | %   prior: prior parameter
 7 | % Output:
 8 | %   model: trained model structure
 9 | %   energy: variational lower bound
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | if nargin < 3
12 |     a0 = 1e-4;
13 |     b0 = 1e-4;
14 |     c0 = 1e-4;
15 |     d0 = 1e-4;
16 | else
17 |     a0 = prior.a;
18 |     b0 = prior.b;
19 |     c0 = prior.c;
20 |     d0 = prior.d;
21 | end
22 | [m,n] = size(X);
23 | I = eye(m);
24 | xbar = mean(X,2);
25 | tbar = mean(t,2);
26 | 
27 | X = bsxfun(@minus,X,xbar);
28 | t = bsxfun(@minus,t,tbar);
29 | 
30 | XX = X*X';
31 | Xt = X*t';
32 | 
33 | maxiter = 100;
34 | energy = -inf(1,maxiter+1);
35 | tol = 1e-8;
36 | 
37 | a = a0+m/2;              % 10.94
38 | c = c0+n/2;
39 | Ealpha = 1e-4;
40 | Ebeta = 1e-4;
41 | for iter = 2:maxiter
42 | %     q(w)
43 |     invS = diag(Ealpha)+Ebeta*XX;            % 10.101
44 |     U = chol(invS);
45 |     Ew = Ebeta*(U\(U'\Xt));                  % 10.100
46 |     KLw = -sum(log(diag(U)));        
47 | %     q(alpha)
48 |     w2 = dot(Ew,Ew);
49 |     invU = U'\I;   
50 |     trS = dot(invU(:),invU(:));
51 |     b = b0+0.5*(w2+trS);                      % 10.95
52 |     Ealpha = a/b;                              % 10.102
53 |     KLalpha = -a*log(b);
54 | %     q(beta)
55 |     e2 = sum((t-Ew'*X).^2);    
56 |     invUX = U'\X;
57 |     trXSX = dot(invUX(:),invUX(:));
58 |     d = d0+0.5*(e2+trXSX);
59 |     Ebeta = c/d; 
60 |     KLbeta = -c*log(d);
61 | %     lower bound
62 |     energy(iter) = KLalpha+KLbeta+KLw;
63 |     if energy(iter)-energy(iter-1) < tol*abs(energy(iter-1)); break; end
64 | end
65 | const = gammaln(a)-gammaln(a0)+gammaln(c)-gammaln(c0)+a0*log(b0)+c0*log(d0)+0.5*(m-n*log(2*pi));
66 | energy = energy(2:iter)+const;
67 | w0 = tbar-dot(Ew,xbar);
68 | 
69 | model.w0 = w0;
70 | model.w = Ew;
71 | model.alpha = Ealpha;
72 | model.beta = Ebeta;
73 | model.a = a;
74 | model.b = b;
75 | model.c = c;
76 | model.d = d;
77 | model.xbar = xbar;
78 | 


--------------------------------------------------------------------------------
/chapter12/ppcaVb.m:
--------------------------------------------------------------------------------
 1 | function [model, L] = ppcaVb(X, q, prior)
 2 | % Perform variatioanl Bayeisan inference for probabilistic PCA model. 
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   q: dimension of target space
 6 | % Output:
 7 | %   model: trained model structure
 8 | %   L: variantional lower bound
 9 | % Reference: 
10 | %   Pattern Recognition and Machine Learning by Christopher M. Bishop 
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | [m,n] = size(X);
13 | if nargin < 3
14 |     a0 = 1e-4;
15 |     b0 = 1e-4;
16 |     c0 = 1e-4;
17 |     d0 = 1e-4;
18 | else
19 |     a0 = prior.a;
20 |     b0 = prior.b;
21 |     c0 = prior.c;
22 |     d0 = prior.d;
23 | end
24 | 
25 | if nargin < 2
26 |     q = m-1;
27 | end
28 | tol = 1e-6;
29 | maxIter = 500;
30 | L = -inf(1,maxIter);
31 | 
32 | mu = mean(X,2);
33 | Xo = bsxfun(@minus, X, mu);
34 | s = dot(Xo(:),Xo(:));
35 | I = eye(q);
36 | % init parameters
37 | a = a0+m/2;
38 | c = c0+m*n/2;
39 | Ealpha = 1e-4;
40 | Ebeta = 1e-4;
41 | EW = rand(q,m); 
42 | EWo = bsxfun(@minus,EW,mean(EW,2));
43 | EWW = EWo*EWo'/m+EW*EW';
44 | for iter = 2:maxIter  
45 | %     q(z)
46 |     LZ = I+Ebeta*EWW;
47 |     V = inv(chol(LZ));                   % inv(LZ) = V*V';
48 |     EZ = LZ\EW*Xo*Ebeta;
49 |     EZZ = n*(V*V')+EZ*EZ';
50 |     KLZ = n*sum(log(diag(V)));           % KLZ = 0.5*n*log(det(inv(LZ)));
51 | %     q(w)
52 |     LW = diag(Ealpha)+Ebeta*EZZ;
53 |     V = inv(chol(LW));                   % inv(LW) = V*V';  
54 |     EW = LW\EZ*Xo'*Ebeta;
55 |     EWW = m*(V*V')+EW*EW';
56 |     KLW = m*sum(log(diag(V)));           % KLW = 0.5*n*log(det(inv(LW)));
57 | %     q(alpha)
58 |     b = b0+diag(EWW)/2;
59 |     Ealpha = a./b;
60 |     KLalpha = -sum(a*log(b));
61 | %     q(beta)
62 |     WZ = EW'*EZ;
63 |     d = d0+(s-2*dot(Xo(:),WZ(:))+dot(EWW(:),EZZ(:)))/2;
64 |     Ebeta = c/d;
65 |     KLbeta = -c*log(d);
66 | %     q(mu)
67 | %     Emu = Ebeta/(lambda+n*Ebeta)*sum(X-WZ,2);
68 | 
69 | %     lower bound
70 |     L(iter) = KLalpha+KLbeta+KLW+KLZ;
71 |     if L(iter)-L(iter-1) < tol*abs(L(iter-1)); break; end  
72 | end
73 | L = L(2:iter);
74 | 
75 | model.Z = EZ;
76 | model.W = EW;
77 | model.apha = Ealpha;
78 | model.beta = Ebeta;
79 | model.a = a;
80 | model.b = b;
81 | model.c = c;
82 | model.d = d;
83 | model.mu = mu;


--------------------------------------------------------------------------------
/chapter13/LDS/ldsEm.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = ldsEm(X, m)
 2 | % EM algorithm for parameter estimation of linear dynamic system.
 3 | % NOTE: This is an exact implementation of the algorithm in PRML.
 4 | % However, this algorithm is numerical unstable and there is much redundant degree of freedom. 
 5 | % Input:
 6 | %   X: d x n data matrix
 7 | %   m: initilaization parameter, either a integer for dimension of z or
 8 | %   initi model structure.
 9 | % Output:
10 | %   model: trained model structure
11 | %   llh: loglikelihood
12 | % reference: Bayesian Reasoning and Machine Learning (BRML)
13 | % Written by Mo Chen (sth4nth@gmail.com).
14 | if isstruct(m)   % init with a model
15 |     model = m;
16 | elseif numel(m) == 1  % random init with latent dimension m
17 |     model = init(X,m);
18 | end
19 | tol = 1e-4;
20 | maxIter = 2000;
21 | llh = -inf(1,maxIter);
22 | for iter = 2:maxIter
23 | %     E-step
24 |     [nu, U, llh(iter),Ezz, Ezy] = kalmanSmoother(model,X);
25 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
26 | %     M-step 
27 |     model = maximization(X, nu, U, Ezz, Ezy);
28 | end
29 | llh = llh(2:iter);
30 | 
31 | function model = init(X, k)
32 | % d = size(X,1);
33 | % model.mu0 = randn(k,1);
34 | % model.P0 = iwishrnd(eye(k),k);
35 | % model.A = randn(k,k);
36 | % model.G = iwishrnd(eye(k),k);
37 | % model.C = randn(d,k);
38 | % model.S = iwishrnd(eye(d),d);
39 | [A,C,Z] = ldsPca(X,k,3*k);
40 | model.mu0 = Z(:,1);
41 | E = Z(:,1:end-1)-Z(:,2:end);
42 | model.P0 = (dot(E(:),E(:))/(k*size(E,2)))*eye(k);
43 | model.A = A;
44 | E = A*Z(:,1:end-1)-Z(:,2:end);
45 | model.G = E*E'/size(E,2);
46 | model.C = C;
47 | E = C*Z-X(:,1:size(Z,2));
48 | model.S = E*E'/size(E,2);
49 | 
50 | function model = maximization(X ,nu, U, Ezz, Ezy)
51 | n = size(X,2);
52 | 
53 | EZZ = sum(Ezz,3);
54 | EZY = sum(Ezy,3);
55 | A = EZY/(EZZ-Ezz(:,:,n));                         % 13.113
56 | G = (EZZ-Ezz(:,:,1)-EZY*A')/(n-1);                % 13.114, BRML 24.5.12
57 | 
58 | Xnu = X*nu';
59 | C = Xnu/EZZ;                                      % 13.115
60 | S = (X*X'-Xnu*C')/n;                              % 13.116, BRML 24.5.11
61 | 
62 | model.mu0 = nu(:,1);                              % 13.110
63 | model.P0 = U(:,:,1);                              % 13.111, 13.107 
64 | model.A = A;
65 | model.G = (G+G')/2;
66 | model.C = C;
67 | model.S = (S+S')/2;


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Introduction
 2 | -------
 3 | This Matlab package implements machine learning algorithms described in the great textbook:
 4 | Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 5 | 
 6 | It is written purely in Matlab language. It is self-contained. There is no external dependency.
 7 | 
 8 | Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires Statistics Toolbox (for some simple random number generator) and Image Processing Toolbox (for reading image data).
 9 | 
10 | Design Goal
11 | -------
12 | * Succinct: The code is extremely compact. Minimizing code length is a major goal. As a result, the core of the algorithms can be easily spotted.
13 | * Efficient: Many tricks for speeding up Matlab code are applied (e.g. vectorization, matrix factorization, etc.). Usually, functions in this package are orders faster than Matlab builtin ones (e.g. kmeans).
14 | * Robust: Many tricks for numerical stability are applied, such as computing probability in logrithm domain, square root matrix update to enforce matrix symmetry\PD, etc.
15 | * Readable: The code is heavily commented. Corresponding formulas in PRML are annoted. Symbols are in sync with the book.
16 | * Practical: The package is not only readable, but also meant to be easily used and modified to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
17 | 
18 | Installation
19 | -------
20 | 1. Download the package to a local folder (e.g. ~/PRMLT/) by running: 
21 | ```console
22 | git clone https://github.com/PRML/PRMLT.git
23 | ```
24 | 2. Run Matlab and navigate to the folder (~/PRMLT/), then run the init.m script.
25 | 
26 | 3. Run some demos in ~/PRMLT/demo folder. Enjoy!
27 | 
28 | FeedBack
29 | -------
30 | If you find any bug or have any suggestion, please do file issues. I am graceful for any feedback and will do my best to improve this package.
31 | 
32 | License
33 | -------
34 | Released under MIT license
35 | 
36 | Contact
37 | -------
38 | sth4nth at gmail dot com
39 | 


--------------------------------------------------------------------------------
/chapter04/logitMn.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = logitMn(X, t, lambda)
 2 | % Multinomial regression for multiclass problem (Multinomial likelihood)
 3 | % Input:
 4 | %   X: d x n data matrix
 5 | %   t: 1 x n label (1~k)
 6 | %   lambda: regularization parameter
 7 | % Output:
 8 | %   model: trained model structure
 9 | %   llh: loglikelihood
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | if nargin < 3
12 |     lambda = 1e-4;
13 | end
14 | X = [X; ones(1,size(X,2))];
15 | [W, llh] = newtonRaphson(X, t, lambda);
16 | % [W, llh] = newtonBlock(X, t, lambda);
17 | model.W = W;
18 | 
19 | function [W, llh] = newtonRaphson(X, t, lambda)
20 | [d,n] = size(X);
21 | k = max(t);
22 | tol = 1e-4;
23 | maxiter = 100;
24 | llh = -inf(1,maxiter);
25 | dk = d*k;
26 | idx = (1:dk)';
27 | dg = sub2ind([dk,dk],idx,idx);
28 | T = sparse(t,1:n,1,k,n,n);
29 | W = zeros(d,k);
30 | HT = zeros(d,k,d,k);
31 | for iter = 2:maxiter
32 |     A = W'*X;                                        % 4.105
33 |     logY = bsxfun(@minus,A,logsumexp(A,1));            % 4.104
34 |     llh(iter) = dot(T(:),logY(:))-0.5*lambda*dot(W(:),W(:));  % 4.108
35 |     if abs(llh(iter)-llh(iter-1)) < tol; break; end
36 |     Y = exp(logY);
37 |     for i = 1:k
38 |          for j = 1:k
39 |             r = Y(i,:).*((i==j)-Y(j,:));  % r has negative value, so cannot use sqrt
40 |             HT(:,i,:,j) = bsxfun(@times,X,r)*X';     % 4.110
41 |         end
42 |     end
43 |     G = X*(Y-T)'+lambda*W;      % 4.96
44 |     H = reshape(HT,dk,dk);
45 |     H(dg) = H(dg)+lambda;
46 |     W(:) = W(:)-H\G(:);    % 4.92
47 | end
48 | llh = llh(2:iter);
49 | 
50 | function [W, llh] = newtonBlock(X, t, lambda)
51 | [d,n] = size(X);
52 | k = max(t);
53 | idx = (1:d)';
54 | dg = sub2ind([d,d],idx,idx);
55 | tol = 1e-4;
56 | maxiter = 100;
57 | llh = -inf(1,maxiter);
58 | T = sparse(t,1:n,1,k,n,n);
59 | W = zeros(d,k);
60 | A = W'*X;
61 | logY = bsxfun(@minus,A,logsumexp(A,1));
62 | for iter = 2:maxiter
63 |     for j = 1:k
64 |         Y = exp(logY);
65 |         Xw =  bsxfun(@times,X,sqrt(Y(j,:).*(1-Y(j,:))));
66 |         H = Xw*Xw';
67 |         H(dg) = H(dg)+lambda;
68 |         g = X*(Y(j,:)-T(j,:))'+lambda*W(:,j);
69 |         W(:,j) = W(:,j)-H\g;
70 |         A(j,:) = W(:,j)'*X;
71 |         logY = bsxfun(@minus,A,logsumexp(A,1));  % must be here to renormalize
72 |     end
73 |     llh(iter) = dot(T(:),logY(:))-0.5*lambda*dot(W(:),W(:));
74 |     if abs(llh(iter)-llh(iter-1)) < tol; break; end
75 | end
76 | llh = llh(2:iter);
77 | 


--------------------------------------------------------------------------------
/chapter10/mixGaussEvidence.m:
--------------------------------------------------------------------------------
 1 | function L = mixGaussEvidence(X, model, prior)
 2 | % Variational lower bound of the model evidence (log of marginal likelihood)
 3 | % This function implements the method in the book PRML. It is equivalent to the bound inside mixGaussVb function.
 4 | % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop (P.474)
 5 | % Written by Mo Chen (sth4nth@gmail.com).
 6 | alpha0 = prior.alpha;
 7 | kappa0 = prior.kappa;
 8 | m0 = prior.m;
 9 | v0 = prior.v;
10 | M0 = prior.M;
11 | 
12 | alpha = model.alpha; % Dirichlet
13 | kappa = model.kappa;   % Gaussian
14 | m = model.m;         % Gasusian
15 | v = model.v;         % Whishart
16 | % M = model.M;         % Whishart: inv(W) = V'*V
17 | U = model.U;
18 | R = model.R;
19 | logR = model.logR;
20 | 
21 | [d,k] = size(m);
22 | nk = sum(R,1); % 10.51
23 | 
24 | Elogpi = psi(0,alpha)-psi(0,sum(alpha));
25 | Epz = dot(nk,Elogpi);
26 | Eqz = dot(R(:),logR(:));
27 | logCalpha0 = gammaln(k*alpha0)-k*gammaln(alpha0);
28 | Eppi = logCalpha0+(alpha0-1)*sum(Elogpi);
29 | logCalpha = gammaln(sum(alpha))-sum(gammaln(alpha));
30 | Eqpi = dot(alpha-1,Elogpi)+logCalpha;
31 | 
32 | U0 = chol(M0);
33 | sqrtR = sqrt(R);
34 | xbar = bsxfun(@times,X*R,1./nk); % 10.52
35 | 
36 | logW = zeros(1,k);
37 | trSW = zeros(1,k);
38 | trM0W = zeros(1,k);
39 | xbarmWxbarm = zeros(1,k);
40 | mm0Wmm0 = zeros(1,k);
41 | for i = 1:k
42 |     Ui = U(:,:,i);
43 |     logW(i) = -2*sum(log(diag(Ui)));      
44 |     
45 |     Xs = bsxfun(@times,bsxfun(@minus,X,xbar(:,i)),sqrtR(:,i)');
46 |     V = chol(Xs*Xs'/nk(i));
47 |     Q = V/Ui;
48 |     trSW(i) = dot(Q(:),Q(:));  % equivalent to tr(SW)=trace(S/M)
49 |     Q = U0/Ui;
50 |     trM0W(i) = dot(Q(:),Q(:));
51 | 
52 |     q = Ui'\(xbar(:,i)-m(:,i));
53 |     xbarmWxbarm(i) = dot(q,q);
54 |     q = Ui'\(m(:,i)-m0);
55 |     mm0Wmm0(i) = dot(q,q);
56 | end
57 | ElogLambda = sum(psi(0,bsxfun(@minus,v+1,(1:d)')/2),1)+d*log(2)+logW; % 10.65
58 | Epmu = sum(d*log(kappa0/(2*pi))+ElogLambda-d*kappa0./kappa-kappa0*(v.*mm0Wmm0))/2;
59 | logB0 = v0*sum(log(diag(U0)))-0.5*v0*d*log(2)-logMvGamma(0.5*v0,d);
60 | EpLambda = k*logB0+0.5*(v0-d-1)*sum(ElogLambda)-0.5*dot(v,trM0W);
61 | 
62 | Eqmu = 0.5*sum(ElogLambda+d*log(kappa/(2*pi)))-0.5*d*k;
63 | logB =  -v.*(logW+d*log(2))/2-logMvGamma(0.5*v,d);
64 | EqLambda = 0.5*sum((v-d-1).*ElogLambda-v*d)+sum(logB);
65 | 
66 | EpX = 0.5*dot(nk,ElogLambda-d./kappa-v.*trSW-v.*xbarmWxbarm-d*log(2*pi));
67 | 
68 | L = Epz-Eqz+Eppi-Eqpi+Epmu-Eqmu+EpLambda-EqLambda+EpX;


--------------------------------------------------------------------------------
/chapter09/rvmBinEm.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = rvmBinEm(X, t, alpha)
 2 | % Relevance Vector Machine (ARD sparse prior) for binary classification.
 3 | % trained by empirical bayesian (type II ML) using EM.
 4 | % Input:
 5 | %   X: d x n data matrix
 6 | %   t: 1 x n label (0/1)
 7 | %   alpha: prior parameter
 8 | % Output:
 9 | %   model: trained model structure
10 | %   llh: loglikelihood
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | if nargin < 3
13 |     alpha = 1;
14 | end
15 | n = size(X,2);
16 | X = [X;ones(1,n)];
17 | d = size(X,1);
18 | alpha = alpha*ones(d,1);
19 | m = zeros(d,1);
20 | 
21 | tol = 1e-4;
22 | maxiter = 100;
23 | llh = -inf(1,maxiter);
24 | index = 1:d;
25 | for iter = 2:maxiter
26 |     % remove zeros
27 |     nz = 1./alpha > tol;    % nonzeros
28 |     index = index(nz);
29 |     alpha = alpha(nz);
30 |     X = X(nz,:);
31 |     m = m(nz); 
32 |     
33 |     [m,e,U] = logitBin(X,t,alpha,m);            % 7.110 ~ 7.113
34 |     
35 |     m2 = m.^2;
36 |     llh(iter) = e(end)+0.5*(sum(log(alpha))-2*sum(log(diag(U)))-dot(alpha,m2)-n*log(2*pi)); % 7.114  & 7.118
37 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
38 |     
39 |     V = inv(U);
40 |     dgS = dot(V,V,2);
41 |     alpha = 1./(m2+dgS);    % 9.67
42 | end
43 | llh = llh(2:iter);
44 | 
45 | model.index = index;
46 | model.w = m;                  
47 | model.alpha = alpha;
48 | 
49 | function [w, llh, U] = logitBin(X, t, lambda, w)
50 | % Logistic regression
51 | [d,n] = size(X);
52 | tol = 1e-4;
53 | maxiter = 100;
54 | llh = -inf(1,maxiter);
55 | idx = (1:d)';
56 | dg = sub2ind([d,d],idx,idx);
57 | h = ones(1,n);
58 | h(t==0) = -1;
59 | a = w'*X;
60 | for iter = 2:maxiter
61 |     y = sigmoid(a);                     % 4.87
62 |     r = y.*(1-y);                       % 4.98
63 |     Xw = bsxfun(@times, X, sqrt(r));
64 |     H = Xw*Xw';                         % 4.97
65 |     H(dg) = H(dg)+lambda;
66 |     U = chol(H);
67 |     g = X*(y-t)'+lambda.*w;             % 4.96
68 |     p = -U\(U'\g);
69 |     wo = w;                             % 4.92
70 |     w = wo+p;   
71 |     a = w'*X;   
72 |     llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2);  % 4.89
73 |     incr = llh(iter)-llh(iter-1);
74 |     while incr < 0      % line search
75 |         p = p/2;
76 |         w = wo+p;
77 |         a = w'*X;   
78 |         llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2);
79 |         incr = llh(iter)-llh(iter-1);
80 |     end
81 |     if incr < tol; break; end
82 | end
83 | llh = llh(2:iter);


--------------------------------------------------------------------------------
/chapter07/rvmBinFp.m:
--------------------------------------------------------------------------------
 1 | function [model, llh] = rvmBinFp(X, t, alpha)
 2 | % Relevance Vector Machine (ARD sparse prior) for binary classification.
 3 | % trained by empirical bayesian (type II ML) using Mackay fix point update.
 4 | % Input:
 5 | %   X: d x n data matrix
 6 | %   t: 1 x n label (0/1)
 7 | %   alpha: prior parameter
 8 | % Output:
 9 | %   model: trained model structure
10 | %   llh: loglikelihood
11 | % Written by Mo Chen (sth4nth@gmail.com).
12 | if nargin < 3
13 |     alpha = 1;
14 | end
15 | n = size(X,2);
16 | X = [X;ones(1,n)];
17 | d = size(X,1);
18 | alpha = alpha*ones(d,1);
19 | m = zeros(d,1);
20 | 
21 | tol = 1e-4;
22 | maxiter = 100;
23 | llh = -inf(1,maxiter);
24 | index = 1:d;
25 | for iter = 2:maxiter
26 |     % remove zeros
27 |     nz = 1./alpha > tol;    % nonzeros
28 |     index = index(nz);
29 |     alpha = alpha(nz);
30 |     X = X(nz,:);
31 |     m = m(nz); 
32 |     
33 |     [m,e,U] = logitBin(X,t,alpha,m);            % 7.110 ~ 7.113
34 |     
35 |     m2 = m.^2;
36 |     llh(iter) = e(end)+0.5*(sum(log(alpha))-2*sum(log(diag(U)))-dot(alpha,m2)-n*log(2*pi)); % 7.114  & 7.118
37 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
38 |     
39 |     V = inv(U);
40 |     dgS = dot(V,V,2);
41 |     alpha = (1-alpha.*dgS)./m2;       % 7.89 & 7.87 & 7.116
42 | end
43 | llh = llh(2:iter);
44 | 
45 | model.index = index;
46 | model.w = m;                  
47 | model.alpha = alpha;
48 | 
49 | 
50 | function [w, llh, U] = logitBin(X, t, lambda, w)
51 | % Logistic regression
52 | [d,n] = size(X);
53 | tol = 1e-4;
54 | maxiter = 100;
55 | llh = -inf(1,maxiter);
56 | idx = (1:d)';
57 | dg = sub2ind([d,d],idx,idx);
58 | h = ones(1,n);
59 | h(t==0) = -1;
60 | a = w'*X;
61 | for iter = 2:maxiter
62 |     y = sigmoid(a);                     % 4.87
63 |     r = y.*(1-y);                       % 4.98
64 |     Xw = bsxfun(@times, X, sqrt(r));
65 |     H = Xw*Xw';                         % 4.97
66 |     H(dg) = H(dg)+lambda;
67 |     U = chol(H);
68 |     g = X*(y-t)'+lambda.*w;             % 4.96
69 |     p = -U\(U'\g);
70 |     wo = w;                             % 4.92
71 |     w = wo+p;   
72 |     a = w'*X;   
73 |     llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2);  % 4.89
74 |     incr = llh(iter)-llh(iter-1);
75 |     while incr < 0      % line search
76 |         p = p/2;
77 |         w = wo+p;
78 |         a = w'*X;   
79 |         llh(iter) = -sum(log1pexp(-h.*a))-0.5*sum(lambda.*w.^2);
80 |         incr = llh(iter)-llh(iter-1);
81 |     end
82 |     if incr < tol; break; end
83 | end
84 | llh = llh(2:iter);
85 | 
86 | 
87 | 
88 | 


--------------------------------------------------------------------------------
/chapter09/mixGaussEm.m:
--------------------------------------------------------------------------------
 1 | function [label, model, llh] = mixGaussEm(X, init)
 2 | % Perform EM algorithm for fitting the Gaussian mixture model.
 3 | % Input: 
 4 | %   X: d x n data matrix
 5 | %   init: k (1 x 1) number of components or label (1 x n, 1<=label(i)<=k) or model structure
 6 | % Output:
 7 | %   label: 1 x n cluster label
 8 | %   model: trained model structure
 9 | %   llh: loglikelihood
10 | % Written by Mo Chen (sth4nth@gmail.com).
11 | %% init
12 | fprintf('EM for Gaussian mixture: running ... \n');
13 | tol = 1e-6;
14 | maxiter = 500;
15 | llh = -inf(1,maxiter);
16 | R = initialization(X,init);
17 | for iter = 2:maxiter
18 |     [~,label(1,:)] = max(R,[],2);
19 |     R = R(:,unique(label));   % remove empty clusters
20 |     model = maximization(X,R);
21 |     [R, llh(iter)] = expectation(X,model);
22 |     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end;
23 | end
24 | llh = llh(2:iter);
25 | 
26 | function R = initialization(X, init)
27 | n = size(X,2);
28 | if isstruct(init)  % init with a model
29 |     R  = expectation(X,init);
30 | elseif numel(init) == 1  % random init k
31 |     k = init;
32 |     label = ceil(k*rand(1,n));
33 |     R = full(sparse(1:n,label,1,n,k,n));
34 | elseif all(size(init)==[1,n])  % init with labels
35 |     label = init;
36 |     k = max(label);
37 |     R = full(sparse(1:n,label,1,n,k,n));
38 | else
39 |     error('ERROR: init is not valid.');
40 | end
41 | 
42 | function [R, llh] = expectation(X, model)
43 | mu = model.mu;
44 | Sigma = model.Sigma;
45 | w = model.w;
46 | 
47 | n = size(X,2);
48 | k = size(mu,2);
49 | R = zeros(n,k);
50 | for i = 1:k
51 |     R(:,i) = loggausspdf(X,mu(:,i),Sigma(:,:,i));
52 | end
53 | R = bsxfun(@plus,R,log(w));
54 | T = logsumexp(R,2);
55 | llh = sum(T)/n; % loglikelihood
56 | R = exp(bsxfun(@minus,R,T));
57 | 
58 | function model = maximization(X, R)
59 | [d,n] = size(X);
60 | k = size(R,2);
61 | nk = sum(R,1);
62 | w = nk/n;
63 | mu = bsxfun(@times, X*R, 1./nk);
64 | 
65 | Sigma = zeros(d,d,k);
66 | r = sqrt(R);
67 | for i = 1:k
68 |     Xo = bsxfun(@minus,X,mu(:,i));
69 |     Xo = bsxfun(@times,Xo,r(:,i)');
70 |     Sigma(:,:,i) = Xo*Xo'/nk(i)+eye(d)*(1e-6);
71 | end
72 | 
73 | model.mu = mu;
74 | model.Sigma = Sigma;
75 | model.w = w;
76 | 
77 | function y = loggausspdf(X, mu, Sigma)
78 | d = size(X,1);
79 | X = bsxfun(@minus,X,mu);
80 | [U,p]= chol(Sigma);
81 | if p ~= 0
82 |     error('ERROR: Sigma is not PD.');
83 | end
84 | Q = U'\X;
85 | q = dot(Q,Q,1);  % quadratic term (M distance)
86 | c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
87 | y = -(c+q)/2;


--------------------------------------------------------------------------------
/chapter13/LDS/kalmanSmoother.m:
--------------------------------------------------------------------------------
 1 | function [nu, U, llh, Ezz, Ezy] = kalmanSmoother(model, X)
 2 | % Kalman smoother (forward-backward algorithm for linear dynamic system)
 3 | % NOTE: This is the exact implementation of the Kalman smoother algorithm in PRML.
 4 | % However, this algorithm is not practical. It is numerical unstable. 
 5 | % Input:
 6 | %   X: d x n data matrix
 7 | %   model: model structure
 8 | % Output:
 9 | %   nu: q x n matrix of latent mean mu_t=E[z_t] w.r.t p(z_t|x_{1:T})
10 | %   U: q x q x n latent covariance U_t=cov[z_t] w.r.t p(z_t|x_{1:T})
11 | %   Ezz: q x q matrix E[z_tz_t^T]
12 | %   Ezy: q x q matrix E[z_tz_{t-1}^T]
13 | %   llh: loglikelihood
14 | % Written by Mo Chen (sth4nth@gmail.com).
15 | A = model.A; % transition matrix 
16 | G = model.G; % transition covariance
17 | C = model.C; % emission matrix
18 | S = model.S;  % emision covariance
19 | mu0 = model.mu0; % prior mean
20 | P0 = model.P0;  % prior covairance
21 | 
22 | n = size(X,2);
23 | q = size(mu0,1);
24 | mu = zeros(q,n);
25 | V = zeros(q,q,n);
26 | P = zeros(q,q,n); % C_{t+1|t}
27 | Amu = zeros(q,n); % u_{t+1|t}
28 | llh = zeros(1,n);
29 | 
30 | % forward
31 | PC = P0*C';
32 | R = C*PC+S;
33 | K = PC/R;
34 | mu(:,1) = mu0+K*(X(:,1)-C*mu0);
35 | V(:,:,1) = (eye(q)-K*C)*P0;
36 | P(:,:,1) = P0;  % useless, just make a point
37 | Amu(:,1) = mu0; % useless, just make a point
38 | llh(1) = logGauss(X(:,1),C*mu0,R);
39 | for i = 2:n    
40 |     [mu(:,i), V(:,:,i), Amu(:,i), P(:,:,i), llh(i)] = ...
41 |         forwardUpdate(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S);
42 | end
43 | llh = sum(llh);
44 | % backward
45 | nu = zeros(q,n);
46 | U = zeros(q,q,n);
47 | Ezz = zeros(q,q,n);
48 | Ezy = zeros(q,q,n-1);
49 | 
50 | nu(:,n) = mu(:,n);
51 | U(:,:,n) = V(:,:,n);
52 | Ezz(:,:,n) = U(:,:,n)+nu(:,n)*nu(:,n)';
53 | for i = n-1:-1:1  
54 |     [nu(:,i), U(:,:,i), Ezz(:,:,i), Ezy(:,:,i)] = ...
55 |         backwardUpdate(nu(:,i+1), U(:,:,i+1), mu(:,i), V(:,:,i), Amu(:,i+1), P(:,:,i+1), A);
56 | end
57 | 
58 | function [mu1, V1, Amu, P, llh] = forwardUpdate(x, mu0, V0, A, G, C, S)
59 | k = numel(mu0);
60 | P = A*V0*A'+G;                                               % 13.88
61 | PC = P*C';
62 | R = C*PC+S;
63 | K = PC/R;                                                    % 13.92
64 | Amu = A*mu0;
65 | CAmu = C*Amu;
66 | mu1 = Amu+K*(x-CAmu);                                        % 13.89
67 | V1 = (eye(k)-K*C)*P;                                         % 13.90
68 | llh = logGauss(x,CAmu,R);                                    % 13.91
69 | 
70 | 
71 | function [nu0, U0, E00, E10] = backwardUpdate(nu1, U1, mu, V, Amu, P, A)
72 | J = V*A'/P;                                                  % 13.102
73 | nu0 = mu+J*(nu1-Amu);                                        % 13.100
74 | U0 = V+J*(U1-P)*J';                                          % 13.101
75 | E00 = U0+nu0*nu0';                                           % 13.107
76 | E10 = U1*J'+nu1*nu0';                                        % 13.106 
77 | 


--------------------------------------------------------------------------------
/chapter11/GaussWishart.m:
--------------------------------------------------------------------------------
  1 | % Class for Gaussian-Wishart distribution used by Dirichlet process
  2 | 
  3 | classdef GaussWishart
  4 |         properties
  5 |          kappa_
  6 |          m_
  7 |          nu_
  8 |          U_
  9 |      end
 10 |      
 11 |      methods
 12 |          function obj = GaussWishart(kappa,m,nu,S)
 13 |              U = chol(S+kappa*(m*m'));
 14 |              obj.kappa_ = kappa;
 15 |              obj.m_ = m;
 16 |              obj.nu_ = nu;
 17 |              obj.U_ = U;
 18 |          end
 19 |          
 20 |          function obj = clone(obj)
 21 |          end
 22 |          
 23 |          function d = dim(obj)
 24 |              d = numel(obj.m_);
 25 |          end
 26 |          
 27 |          function obj = addData(obj, X)
 28 |              kappa0 = obj.kappa_;
 29 |              m0 = obj.m_;
 30 |              nu0 = obj.nu_;
 31 |              U0 = obj.U_;
 32 |              
 33 |              n = size(X,2);
 34 |              kappa = kappa0+n;
 35 |              m = (kappa0*m0+sum(X,2))/kappa;
 36 |              nu = nu0+n;
 37 |              U = chol(U0'*U0+X*X');
 38 | 
 39 |              obj.kappa_ = kappa;
 40 |              obj.m_ = m;
 41 |              obj.nu_ = nu;
 42 |              obj.U_ = U;
 43 |          end
 44 |         
 45 |          function obj = addSample(obj, x)
 46 |              kappa = obj.kappa_;
 47 |              m = obj.m_;
 48 |              nu = obj.nu_;
 49 |              U = obj.U_;
 50 |              
 51 |              kappa = kappa+1;
 52 |              m = m+(x-m)/kappa;
 53 |              nu = nu+1;
 54 |              U = cholupdate(U,x,'+');
 55 |              
 56 |              obj.kappa_ = kappa;
 57 |              obj.m_ = m;
 58 |              obj.nu_ = nu;
 59 |              obj.U_ = U;
 60 |          end
 61 |          
 62 |          function obj = delSample(obj, x)
 63 |              kappa = obj.kappa_;
 64 |              m = obj.m_;
 65 |              nu = obj.nu_;
 66 |              U = obj.U_;
 67 | 
 68 |              kappa = kappa-1;
 69 |              m = m-(x-m)/kappa;
 70 |              nu = nu-1;
 71 |              U = cholupdate(U,x,'-');
 72 |              
 73 |              obj.kappa_ = kappa;
 74 |              obj.m_ = m;
 75 |              obj.nu_ = nu;
 76 |              obj.U_ = U;
 77 |          end
 78 |          
 79 |          function y = logPredPdf(obj,X)
 80 |              kappa = obj.kappa_;
 81 |              m = obj.m_;
 82 |              nu = obj.nu_;
 83 |              U = obj.U_;
 84 |              
 85 |              d = size(X,1);
 86 |              v = (nu-d+1);
 87 |              U = sqrt((1+1/kappa)/v)*cholupdate(U,sqrt(kappa)*m,'-');
 88 |              
 89 |              X = bsxfun(@minus,X,m);
 90 |              Q = U'\X;
 91 |              q = dot(Q,Q,1);  % quadratic term (M distance)
 92 |              o = -log(1+q/v)*((v+d)/2);
 93 |              c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(v*pi)+2*sum(log(diag(U))))/2;
 94 |              y = c+o;
 95 |          end
 96 |          
 97 |          function [mu, Sigma] = sample(obj)
 98 | %              Sample a Gaussian distribution from GaussianWishart prior
 99 |              kappa = obj.kappa_;
100 |              m = obj.m_;
101 |              nu = obj.nu_;
102 |              U = obj.U_;
103 |              
104 |              Sigma = iwishrnd(U'*U,nu);
105 |              mu = gaussRnd(m,Sigma/kappa);
106 |          end
107 |      end
108 | end
109 | 


--------------------------------------------------------------------------------
/chapter10/mixGaussVb.m:
--------------------------------------------------------------------------------
  1 | function [label, model, L] = mixGaussVb(X, m, prior)
  2 | % Variational Bayesian inference for Gaussian mixture.
  3 | % Input: 
  4 | %   X: d x n data matrix
  5 | %   m: k (1 x 1) or label (1 x n, 1<=label(i)<=k) or model structure
  6 | % Output:
  7 | %   label: 1 x n cluster label
  8 | %   model: trained model structure
  9 | %   L: variational lower bound
 10 | % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop (P.474)
 11 | % Written by Mo Chen (sth4nth@gmail.com).
 12 | fprintf('Variational Bayesian Gaussian mixture: running ... \n');
 13 | [d,n] = size(X);
 14 | if nargin < 3
 15 |     prior.alpha = 1;
 16 |     prior.kappa = 1;
 17 |     prior.m = mean(X,2);
 18 |     prior.v = d+1;
 19 |     prior.M = eye(d);   % M = inv(W)
 20 | end
 21 | prior.logW = -2*sum(log(diag(chol(prior.M))));
 22 | 
 23 | tol = 1e-8;
 24 | maxiter = 2000;
 25 | L = -inf(1,maxiter);
 26 | model = init(X,m,prior);
 27 | for iter = 2:maxiter
 28 |     model = expect(X,model);
 29 |     model = maximize(X,model,prior);
 30 |     L(iter) = bound(X,model,prior);
 31 |     if abs(L(iter)-L(iter-1)) < tol*abs(L(iter)); break; end
 32 | end
 33 | L = L(2:iter);
 34 | label = zeros(1,n);
 35 | [~,label(:)] = max(model.R,[],2);
 36 | [~,~,label(:)] = unique(label);
 37 | 
 38 | function model = init(X, m, prior)
 39 | n = size(X,2);
 40 | if isstruct(m)  % init with a model
 41 |     model = m;
 42 | elseif numel(m) == 1  % random init k
 43 |     k = m;
 44 |     label = ceil(k*rand(1,n));
 45 |     model.R = full(sparse(1:n,label,1,n,k,n));
 46 | elseif all(size(m)==[1,n])  % init with labels
 47 |     label = m;
 48 |     k = max(label);
 49 |     model.R = full(sparse(1:n,label,1,n,k,n));
 50 | else
 51 |     error('ERROR: init is not valid.');
 52 | end
 53 | model = maximize(X,model,prior);
 54 | 
 55 | % Done
 56 | function model = maximize(X, model, prior)
 57 | alpha0 = prior.alpha;
 58 | kappa0 = prior.kappa;
 59 | m0 = prior.m;
 60 | v0 = prior.v;
 61 | M0 = prior.M;
 62 | R = model.R;
 63 | 
 64 | nk = sum(R,1); % 10.51
 65 | alpha = alpha0+nk; % 10.58
 66 | kappa = kappa0+nk; % 10.60
 67 | v = v0+nk; % 10.63
 68 | m = bsxfun(@plus,kappa0*m0,X*R);
 69 | m = bsxfun(@times,m,1./kappa); % 10.61
 70 | 
 71 | [d,k] = size(m);
 72 | U = zeros(d,d,k); 
 73 | logW = zeros(1,k);
 74 | r = sqrt(R');
 75 | for i = 1:k
 76 |     Xm = bsxfun(@minus,X,m(:,i));
 77 |     Xm = bsxfun(@times,Xm,r(i,:));
 78 |     m0m = m0-m(:,i);
 79 |     M = M0+Xm*Xm'+kappa0*(m0m*m0m');     % equivalent to 10.62
 80 |     U(:,:,i) = chol(M);
 81 |     logW(i) = -2*sum(log(diag(U(:,:,i))));      
 82 | end
 83 | 
 84 | model.alpha = alpha;
 85 | model.kappa = kappa;
 86 | model.m = m;
 87 | model.v = v;
 88 | model.U = U;
 89 | model.logW = logW;
 90 | 
 91 | % Done
 92 | function model = expect(X, model)
 93 | alpha = model.alpha; % Dirichlet
 94 | kappa = model.kappa;   % Gaussian
 95 | m = model.m;         % Gasusian
 96 | v = model.v;         % Whishart
 97 | U = model.U;         % Whishart 
 98 | logW = model.logW;
 99 | n = size(X,2);
100 | [d,k] = size(m);
101 | 
102 | EQ = zeros(n,k);
103 | for i = 1:k
104 |     Q = (U(:,:,i)'\bsxfun(@minus,X,m(:,i)));
105 |     EQ(:,i) = d/kappa(i)+v(i)*dot(Q,Q,1);    % 10.64
106 | end
107 | ElogLambda = sum(psi(0,0.5*bsxfun(@minus,v+1,(1:d)')),1)+d*log(2)+logW; % 10.65
108 | Elogpi = psi(0,alpha)-psi(0,sum(alpha)); % 10.66
109 | logRho = -0.5*bsxfun(@minus,EQ,ElogLambda-d*log(2*pi)); % 10.46
110 | logRho = bsxfun(@plus,logRho,Elogpi);   % 10.46
111 | logR = bsxfun(@minus,logRho,logsumexp(logRho,2)); % 10.49
112 | R = exp(logR);
113 | 
114 | model.logR = logR;
115 | model.R = R;
116 | 
117 | % Done
118 | function L = bound(X, model, prior)
119 | alpha0 = prior.alpha;
120 | kappa0 = prior.kappa;
121 | v0 = prior.v;
122 | logW0 = prior.logW;
123 | alpha = model.alpha; 
124 | kappa = model.kappa; 
125 | v = model.v;         
126 | logW = model.logW;
127 | R = model.R;
128 | logR = model.logR;
129 | [d,n] = size(X);
130 | k = size(R,2);
131 | 
132 | Epz = 0;
133 | Eqz = dot(R(:),logR(:));
134 | logCalpha0 = gammaln(k*alpha0)-k*gammaln(alpha0);
135 | Eppi = logCalpha0;
136 | logCalpha = gammaln(sum(alpha))-sum(gammaln(alpha));
137 | Eqpi = logCalpha;
138 | Epmu = 0.5*d*k*log(kappa0);
139 | Eqmu = 0.5*d*sum(log(kappa));
140 | logB0 = -0.5*v0*(logW0+d*log(2))-logMvGamma(0.5*v0,d);
141 | EpLambda = k*logB0;
142 | logB =  -0.5*v.*(logW+d*log(2))-logMvGamma(0.5*v,d);
143 | EqLambda = sum(logB);
144 | EpX = -0.5*d*n*log(2*pi);
145 | L = Epz-Eqz+Eppi-Eqpi+Epmu-Eqmu+EpLambda-EqLambda+EpX;


--------------------------------------------------------------------------------
/chapter07/rvmRegSeq.m:
--------------------------------------------------------------------------------
  1 | function [model, llh] = rvmRegSeq(X, t)
  2 | % Sparse Bayesian Regression (RVM) using sequential algorithm
  3 | % Input:
  4 | %   X: d x n data
  5 | %   t: 1 x n response
  6 | % Output:
  7 | %   model: trained model structure
  8 | %   llh: loglikelihood 
  9 | % reference:
 10 | % Tipping and Faul. Fast marginal likelihood maximisation for sparse Bayesian models. AISTATS 2003.
 11 | % Written by Mo Chen (sth4nth@gmail.com).
 12 | maxiter = 1000;
 13 | llh = -inf(1,maxiter);
 14 | tol = 1e-4;
 15 | 
 16 | [d,n] = size(X);
 17 | xbar = mean(X,2);
 18 | tbar = mean(t,2);
 19 | X = bsxfun(@minus,X,xbar);
 20 | t = bsxfun(@minus,t,tbar);
 21 | 
 22 | beta = 1/mean(t.^2);   % beta = 1/sigma^2
 23 | alpha = inf(d,1);
 24 | S = beta*dot(X,X,2);     % eq.(22)
 25 | Q = beta*(X*t');         % eq.(22)
 26 | Sigma = zeros(0,0);  
 27 | mu = zeros(0,1);  
 28 | index = zeros(0,1);
 29 | Phi = zeros(0,n);
 30 | iAct = zeros(d,3);   
 31 | for iter = 2:maxiter
 32 |     s = S; q = Q; % p.353 Execrcies 7.17
 33 |     s(index) = alpha(index).*S(index)./(alpha(index)-S(index)); % 7.104
 34 |     q(index) = alpha(index).*Q(index)./(alpha(index)-S(index)); % 7.105    
 35 | 
 36 |     theta = q.^2-s;
 37 |     iNew = theta>0;
 38 | 
 39 |     iUse = false(d,1);
 40 |     iUse(index) = true;
 41 |     
 42 |     iUpd = (iNew & iUse); % update
 43 |     iAdd = (iNew ~= iUpd); % add
 44 |     iDel = (iUse ~= iUpd); % del
 45 |     
 46 |     dllh = -inf(d,1);  % delta likelihood (likelihood improvement of each step, eventually approches 0.)
 47 |     if any(iUpd)
 48 |         alpha_ = s(iUpd).^2./theta(iUpd);      % eq.(20)
 49 |         delta = 1./alpha_-1./alpha(iUpd);
 50 |         dllh(iUpd) = Q(iUpd).^2.*delta./(S(iUpd).*delta+1)-log1p(S(iUpd).*delta);  % eq.(32)
 51 |     end
 52 |     if any(iAdd)
 53 |         dllh(iAdd) = (Q(iAdd).^2-S(iAdd))./S(iAdd)+log(S(iAdd)./(Q(iAdd).^2));    % eq.(27)
 54 |     end
 55 |     if any(iDel)
 56 |         dllh(iDel) = Q(iDel).^2./(S(iDel)-alpha(iDel))-log1p(-S(iDel)./alpha(iDel));  % eq.(37)
 57 |     end
 58 | 
 59 |     [llh(iter),j] = max(dllh);
 60 |     if llh(iter) < tol; break; end
 61 | 
 62 |     iAct(:,1) = iUpd;
 63 |     iAct(:,2) = iAdd;
 64 |     iAct(:,3) = iDel;
 65 |     
 66 |     % update parameters
 67 |     switch find(iAct(j,:))
 68 |         case 1 % update: 
 69 |             idx = (index==j);
 70 |             alpha_ = s(j)^2/theta(j);
 71 | 
 72 |             Sigma_j = Sigma(:,idx);
 73 |             Sigma_jj = Sigma(idx,idx);
 74 |             mu_j = mu(idx);
 75 |             
 76 |             kappa = 1/(Sigma_jj+1/(alpha_-alpha(j)));
 77 |             Sigma = Sigma-kappa*(Sigma_j*Sigma_j');                    % eq.(33)
 78 |             mu = mu-kappa*mu_j*Sigma_j;                                  % eq.(34)
 79 | 
 80 |             v = beta*X*(Phi'*Sigma_j);
 81 |             S = S+kappa*v.^2;                   % eq.(35)
 82 |             Q = Q+kappa*mu_j*v;                 % eq.(36)
 83 |             alpha(j) = alpha_;
 84 |         case 2 % Add
 85 |             alpha_ = s(j)^2/theta(j);
 86 |             Sigma_jj = 1/(alpha_+S(j));
 87 |             mu_j = Sigma_jj*Q(j);
 88 |             phi_j = X(j,:);             
 89 | 
 90 |             v = beta*Sigma*(Phi*phi_j');  
 91 |             off = -Sigma_jj*v;                         % eq.(28) has error?
 92 |             Sigma = [Sigma+Sigma_jj*(v*v'), off; off', Sigma_jj];   % eq.(28)
 93 |             mu = [mu-mu_j*v; mu_j];                                % eq.(29)
 94 |             
 95 |             e_j = phi_j-v'*Phi;
 96 |             v = beta*X*e_j';
 97 |             S = S-Sigma_jj*v.^2;                      % eq.(30)
 98 |             Q = Q-mu_j*v;                              % eq.(31)
 99 |             
100 |             index = [index;j]; %#ok<AGROW>
101 |             alpha(j) = alpha_;
102 |         case 3 % del
103 |             idx = (index==j);
104 |             Sigma_j = Sigma(:,idx);
105 |             Sigma_jj = Sigma(idx,idx);
106 |             mu_j = mu(idx);
107 |             
108 |             Sigma = Sigma-(Sigma_j*Sigma_j')/Sigma_jj;                    % eq.(38)
109 |             mu = mu-mu_j*Sigma_j/Sigma_jj;                                  % eq.(39)
110 | 
111 |             v = beta*X*(Phi'*Sigma_j);
112 |             S = S+v.^2/Sigma_jj;                   % eq.(40)
113 |             Q = Q+mu_j*v/Sigma_jj;                  % eq.(41)
114 |             
115 |             mu(idx) = [];
116 |             Sigma(:,idx) = [];
117 |             Sigma(idx,:) = [];
118 |             index(idx) = [];
119 |             alpha(j) = inf;
120 |     end
121 |     Phi = X(index,:); 
122 | %     beta = ;
123 | end
124 | llh = cumsum(llh(2:iter));
125 | w0 = tbar-dot(mu,xbar(index));
126 | 
127 | model.index = index;
128 | model.w0 = w0;
129 | model.w = mu;
130 | model.alpha = alpha(index);
131 | model.beta = beta;


--------------------------------------------------------------------------------
/Contents.m:
--------------------------------------------------------------------------------
  1 | % CHAPTER01
  2 | %   condEntropy      - Compute conditional entropy z=H(x|y) of two discrete variables x and y.
  3 | %   entropy          - Compute entropy z=H(x) of a discrete variable x.
  4 | %   jointEntropy     - Compute joint entropy z=H(x,y) of two discrete variables x and y.
  5 | %   mutInfo          - Compute mutual information I(x,y) of two discrete variables x and y.
  6 | %   nmi              - Compute normalized mutual information I(x,y)/sqrt(H(x)*H(y)) of two discrete variables x and y.
  7 | %   nvi              - Compute normalized variation information z=(1-I(x,y)/H(x,y)) of two discrete variables x and y.
  8 | %   relatEntropy     - Compute relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y)) of two discrete variables x and y.
  9 | % CHAPTER02    
 10 | %   logDirichlet     - Compute log pdf of a Dirichlet distribution.
 11 | %   logGauss         - Compute log pdf of a Gaussian distribution.
 12 | %   logKde           - Compute log pdf of kernel density estimator.
 13 | %   logMn            - Compute log pdf of a multinomial distribution.
 14 | %   logMvGamma       - Compute logarithm multivariate Gamma function 
 15 | %   logSt            - Compute log pdf of a Student's t distribution.
 16 | %   logVmf           - Compute log pdf of a von Mises-Fisher distribution.
 17 | %   logWishart       - Compute log pdf of a Wishart distribution.
 18 | % CHAPTER03    
 19 | %   linReg           - Fit linear regression model y=w'x+w0  
 20 | %   linRegFp         - Fit empirical Bayesian linear model with Mackay fixed point method (p.168)
 21 | %   linRegPred       - Compute linear regression model reponse y = w'*X+w0 and likelihood
 22 | %   linRnd           - Generate data from a linear model p(t|w,x)=G(w'x+w0,sigma), sigma=sqrt(1/beta) 
 23 | % CHAPTER04    
 24 | %   binPlot          - Plot binary classification result for 2d data
 25 | %   fda              - Fisher (linear) discriminant analysis
 26 | %   logitBin         - Logistic regression for binary classification optimized by Newton-Raphson method.
 27 | %   logitBinPred     - Prediction of binary logistic regression model
 28 | %   logitMn          - Multinomial regression for multiclass problem (Multinomial likelihood)
 29 | %   logitMnPred      - Prediction of multiclass (multinomial) logistic regression model
 30 | %   sigmoid          - Sigmod function
 31 | %   softmax          - Softmax function
 32 | % CHAPTER05
 33 | %   mlpClass         - Train a multilayer perceptron neural network for classification with backpropagation
 34 | %   mlpClassPred     - Multilayer perceptron classification prediction
 35 | %   mlpReg           - Train a multilayer perceptron neural network for regression with backpropagation
 36 | %   mlpRegPred       - Multilayer perceptron regression prediction
 37 | % CHAPTER06    
 38 | %   kn2sd            - Transform a kernel matrix (or inner product matrix) to a squared distance matrix
 39 | %   knCenter         - Centerize the data in the kernel space
 40 | %   knGauss          - Gaussian (RBF) kernel K = exp(-|x-y|/(2s));
 41 | %   knKmeans         - Perform kernel kmeans clustering.
 42 | %   knKmeansPred     - Prediction for kernel kmeans clusterng
 43 | %   knLin            - Linear kernel (inner product)
 44 | %   knPca            - Kernel PCA
 45 | %   knPcaPred        - Prediction for kernel PCA
 46 | %   knPoly           - Polynomial kernel k(x,y)=(x'y+c)^o
 47 | %   knReg            - Gaussian process (kernel) regression
 48 | %   knRegPred        - Prediction for Gaussian Process (kernel) regression model
 49 | %   sd2kn            - Transform a squared distance matrix to a kernel matrix. 
 50 | % CHAPTER07    
 51 | %   rvmBinFp         - Relevance Vector Machine (ARD sparse prior) for binary classification.
 52 | %   rvmBinPred       - Prodict the label for binary logistic regression model
 53 | %   rvmRegFp         - Relevance Vector Machine (ARD sparse prior) for regression
 54 | %   rvmRegPred       - Compute RVM regression model reponse y = w'*X+w0 and likelihood 
 55 | %   rvmRegSeq        - Sparse Bayesian Regression (RVM) using sequential algorithm
 56 | % CHAPTER08    
 57 | %  MRF    
 58 | %   mrfBethe         - Compute Bethe energy
 59 | %   mrfBp            - Undirected graph belief propagation for MRF
 60 | %   mrfGibbs         - Compute Gibbs energy
 61 | %   mrfIsGa          - Contruct a latent Ising MRF with Gaussian observation
 62 | %   mrfMf            - Mean field for MRF
 63 | %  NaiveBayes    
 64 | %   nbBern           - Naive bayes classifier with indepenet Bernoulli.
 65 | %   nbBernPred       - Prediction of naive Bayes classifier with independent Bernoulli.
 66 | %   nbGauss          - Naive bayes classifier with indepenet Gaussian
 67 | %   nbGaussPred      - Prediction of naive Bayes classifier with independent Gaussian.
 68 | % CHAPTER09    
 69 | %   kmeans           - Perform kmeans clustering.
 70 | %   kmeansPred       - Prediction for kmeans clusterng
 71 | %   kmeansRnd        - Generate samples from a Gaussian mixture distribution with common variances (kmeans model).
 72 | %   kmedoids         - Perform k-medoids clustering.
 73 | %   kseeds           - Perform kmeans++ seeding
 74 | %   linRegEm         - Fit empirical Bayesian linear regression model with EM (p.448 chapter 9.3.4)
 75 | %   mixBernEm        - Perform EM algorithm for fitting the Bernoulli mixture model.
 76 | %   mixBernRnd       - Generate samples from a Bernoulli mixture distribution.
 77 | %   mixGaussEm       - Perform EM algorithm for fitting the Gaussian mixture model.
 78 | %   mixGaussPred     - Predict label and responsibility for Gaussian mixture model.
 79 | %   mixGaussRnd      - Genarate samples form a Gaussian mixture model.
 80 | %   rvmBinEm         - Relevance Vector Machine (ARD sparse prior) for binary classification.
 81 | %   rvmRegEm         - Relevance Vector Machine (ARD sparse prior) for regression
 82 | % CHAPTER10
 83 | %   linRegVb         - Variational Bayesian inference for linear regression.
 84 | %   mixGaussEvidence - Variational lower bound of the model evidence (log of marginal likelihood)
 85 | %   mixGaussVb       - Variational Bayesian inference for Gaussian mixture.
 86 | %   mixGaussVbPred   - Predict label and responsibility for Gaussian mixture model trained by VB.
 87 | %   rvmRegVb         - Variational Bayesian inference for RVM regression.
 88 | % CHAPTER11
 89 | %   dirichletRnd     - Generate samples from a Dirichlet distribution.
 90 | %   discreteRnd      - Generate samples from a discrete distribution (multinomial).
 91 | %   Gauss            - Class for Gaussian distribution used by Dirichlet process
 92 | %   gaussRnd         - Generate samples from a Gaussian distribution.
 93 | %   GaussWishart     - Class for Gaussian-Wishart distribution used by Dirichlet process
 94 | %   mixDpGb          - Collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 
 95 | %   mixDpGbOl        - Online collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 
 96 | %   mixGaussGb       - Collapsed Gibbs sampling for Dirichlet process (infinite) Gaussian mixture model (a.k.a. DPGM). 
 97 | %   mixGaussSample   - Genarate samples form a Gaussian mixture model with GaussianWishart prior.
 98 | % CHAPTER12 
 99 | %   fa               - Perform EM algorithm for factor analysis model
100 | %   pca              - Principal component analysis
101 | %   pcaEm            - Perform EM-like algorithm for PCA (by Sam Roweis).
102 | %   pcaEmC           - Perform Constrained EM like algorithm for PCA.
103 | %   ppcaEm           - Perform EM algorithm to maiximize likelihood of probabilistic PCA model.
104 | %   ppcaRnd          - Generate data from probabilistic PCA model
105 | %   ppcaVb           - Perform variatioanl Bayeisan inference for probabilistic PCA model. 
106 | % CHAPTER13 
107 | %  HMM 
108 | %   hmmEm            - EM algorithm to fit the parameters of HMM model (a.k.a Baum-Welch algorithm)
109 | %   hmmFilter        - HMM forward filtering algorithm. 
110 | %   hmmRnd           - Generate a data sequence from a hidden Markov model.
111 | %   hmmSmoother      - HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm).
112 | %   hmmViterbi       - Viterbi algorithm (calculated in log scale to improve numerical stability).
113 | %  LDS 
114 | %   kalmanFilter     - Kalman filter (forward algorithm for linear dynamic system)
115 | %   kalmanSmoother   - Kalman smoother (forward-backward algorithm for linear dynamic system)
116 | %   ldsEm            - EM algorithm for parameter estimation of linear dynamic system.
117 | %   ldsPca           - Subspace method for learning linear dynamic system.
118 | %   ldsRnd           - Generate a data sequence from linear dynamic system.
119 | % CHAPTER14 
120 | %   adaboostBin      - Adaboost for binary classification (weak learner: kmeans)
121 | %   adaboostBinPred  - Prediction of binary Adaboost
122 | %   mixLinPred       - Prediction function for mxiture of linear regression
123 | %   mixLinReg        - Mixture of linear regression
124 | %   mixLinRnd        - Generate data from mixture of linear model
125 | %   mixLogitBin      - Mixture of logistic regression model for binary classification optimized by Newton-Raphson method
126 | %   mixLogitBinPred  - Prediction function for mixture of logistic regression
127 | 


--------------------------------------------------------------------------------