├── LICENSE
├── README.md
├── kernel_regression.m
├── run_code_example.m
├── test_error_unbalanced.m
├── train_alternating.m
├── train_alternating_epsilon.m
└── train_kernel.m

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2008 Andreas Argyriou

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
### Multi-task feature learning

This is a method for learning multiple tasks simultaneously, assuming that they share a set of common latent features. It is based on regularizing the spectrum of the matrix whose columns are the task parameter vectors; regularization with the trace norm is a special case of this framework. Among the diverse applications of multi-task learning, one example is the personalized recommendation of products to consumers.

The methodology is presented in detail in the papers
[Multi-Task Feature Learning](http://ttic.uchicago.edu/~argyriou/papers/nips06_online.pdf),
[Convex Multi-Task Feature Learning](http://ttic.uchicago.edu/~argyriou/papers/mtl_feat.pdf)
and
[A Spectral Regularization Framework for Multi-Task Structure Learning](http://ttic.uchicago.edu/~argyriou/papers/spectral_mtl.pdf).

Note that it is possible to use the method with a nonlinear kernel instead of explicit features: it suffices to run the code on a feature representation of the Gram matrix obtained by a Gram-Schmidt or Cholesky decomposition (see [Convex Multi-Task Feature Learning](http://ttic.uchicago.edu/~argyriou/papers/mtl_feat.pdf), sec. 5).
--------------------------------------------------------------------------------

/kernel_regression.m:
--------------------------------------------------------------------------------
function [a, cost, err, reg] = kernel_regression(K, y, gamma)

% Kernel ridge (square-loss) regression for a single task.
% Returns the dual coefficients a = (K + gamma*I)^{-1} y together with the
% total regularized cost and its error and regularization parts.

n = size(K,1);
M = K + gamma * eye(n);
a = M \ y;                  % solve (K + gamma*I) a = y; backslash is more stable than inv()
cost = gamma * (y' * a);    % gamma * y' (K + gamma*I)^{-1} y
err = gamma^2 * (a' * a);   % gamma^2 * y' (K + gamma*I)^{-2} y
reg = cost - err;
--------------------------------------------------------------------------------
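A minimal usage sketch for kernel_regression.m on synthetic single-task data (all sizes and values below are illustrative, not part of the repository); the Gram matrix and the primal recovery mirror what train_kernel.m does for each task:

x = randn(5, 20);                              % 5 features, 20 training points
y = x' * randn(5, 1) + 0.1 * randn(20, 1);     % noisy linear labels (column vector)
K = x' * x;                                    % linear Gram matrix, as formed in train_kernel.m
[a, cost, err, reg] = kernel_regression(K, y, 0.1);
w = x * a;                                     % primal weight vector, as in W(:,t) = x*a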
/run_code_example.m:
--------------------------------------------------------------------------------
function [bestcv,bestgamma,cverrs,testerrs,theW,theD] = ...
    run_code_example(gammas,trainx,trainy,testx,testy,task_indexes,task_indexes_test,cv_size,Dini,iterations,...
    method_str, epsilon_init, fname)

% Example script of running the train_alternating_epsilon() code
% Uses regularizer trace( W' D^{-1} W ) and square loss (see file
% kernel_regression.m)

% INPUTS:
% gammas: a vector of the gammas to select from using cross-validation
%         (this example passes it straight to the training code, so give a scalar)
% task_indexes: starting indexes for each task in data
% cv_size: number of cross-validations performed (not used in this example)
% Dini: the initial matrix D
% epsilon_init : perturbation epsilon (use 0 for no perturbation)
% method_str : see below
% See train_alternating.m for the rest

% OUTPUTS:
% bestcv: the best cross-validation performance among the gammas tried
% bestgamma: the best gamma found using cross-validation
% testerrs: mean test error across tasks (see test_error_unbalanced.m)
% theW: a matrix for which each column is a w_t
% theD: matrix D estimated

% Cross-validation is not actually performed in this example, so the
% corresponding outputs are returned empty to keep all outputs defined.
bestcv = []; bestgamma = []; cverrs = [];

feat = 1; independent = 2; diagonal = 3;

if (strcmp(method_str,'feat'))
    method = feat;
elseif (strcmp(method_str,'ind'))
    method = independent;
elseif (strcmp(method_str,'diag'))
    method = diagonal;
else
    error('Wrong method');
end
% (see file train_alternating.m)

% Define method for computing f(D)
function v = vec_inv(d)
    v = zeros(length(d),1);
    ind = find(d > eps);
    v(ind) = 1 ./ d(ind);
end

[theW,theD,costs,mineps] = train_alternating_epsilon(trainx, trainy, task_indexes, gammas, Dini, iterations, ...
    method, 'kernel_regression', @vec_inv, @(b)(b/sum(b)), epsilon_init);

testerrs = mean(test_error_unbalanced(theW,testx,testy,task_indexes_test));

save(sprintf('results_%s_%s_lin.mat',fname,method_str),'gammas','Dini','method_str',...
    'testerrs','theW','theD','costs','mineps');

end
--------------------------------------------------------------------------------

/test_error_unbalanced.m:
--------------------------------------------------------------------------------
function [testerrs] = test_error_unbalanced(W,testx,testy,task_indexes)

% Mean squared test error per task; tasks may have different sample sizes.

T = length(task_indexes);
testerrs = zeros(T,1);
task_indexes(T+1) = length(testy)+1;   % sentinel marking the end of the last task

for t = 1:T
    t_testx = testx(:, task_indexes(t):task_indexes(t+1)-1 );
    t_testy = testy(task_indexes(t):task_indexes(t+1)-1)';
    prediction = W(:,t)' * t_testx;
    testerrs(t) = (t_testy - prediction) * (t_testy - prediction)' / (task_indexes(t+1)-task_indexes(t));
end
--------------------------------------------------------------------------------
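A sketch of how task_indexes delimits unbalanced tasks (the three task sizes below are hypothetical):

% Three tasks with 10, 25 and 15 test points; testx columns are concatenated task by task.
W = randn(5, 3);                     % one weight vector per task, dim = 5 features
testx = randn(5, 50);
testy = randn(50, 1);
task_indexes = [1, 11, 36];          % starting column of each task
% Internally the function appends length(testy)+1 = 51 as a sentinel, so task t
% occupies columns task_indexes(t) : task_indexes(t+1)-1.
testerrs = test_error_unbalanced(W, testx, testy, task_indexes);   % 3x1 per-task MSE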
/train_alternating.m:
--------------------------------------------------------------------------------
function [W,D,costfunc] = train_alternating(trainx,trainy,task_indexes,gamma,Dini,iterations,...
    method,kernel_method,f_method,Dmin_method)

% Main algorithm for Multi-task Feature Learning (with a linear kernel)
% See [Argyriou, Evgeniou, Pontil, NIPS 2006, ML journal 2007]
%
% task_indexes : starting indexes of each task's samples in the data
%                (task sample sizes may be unbalanced)
% gamma : regularization parameter
% method : feat = orthonormal feature learning, i.e. using trace(W' f(D) W)
%          independent = learning with no coupling across tasks (i.e.
%                        using ||W||_2 regularization)
%          diagonal = variable (feature) selection (i.e. D is diagonal)
% kernel_method : method for kernel learning (e.g. SVM, least-squares
%                 regression etc.)
% f_method : evaluates f(D) (acts on the singular values of D)
% Dmin_method : method for minimizing over D, of the form
%               min_d { sum_i f(d_i) b_i^2 }
%               (b_i are the singular values of W,
%               or in the case of variable selection the L2 norms of the rows of W)

num_data = size(trainx,2);
dim = size(trainx,1);
T = length(task_indexes);

feat = 1; independent = 2; diagonal = 3;

if (max(max(abs(Dini-Dini'))) > eps)
    error('D should be symmetric');
end
if (min(eig(Dini)) < -eps)
    error('D should be positive semidefinite');
end
if (abs(trace(Dini)-1) > 100*eps)
    error('D should have trace 1');
end
D = Dini;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                            %
%                     Feature Learning                       %
%                                                            %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (method == feat)
    costfunc = [];

    % Compute f(D)^{-1/2} for the next step
    [U,S,dummy] = svd(D);   % svd seems more robust than eig
    fS = feval(f_method,diag(S));
    temp = sqrt(fS);
    tempi = find(temp > eps);
    temp(tempi) = 1./temp(tempi);
    fD_isqrt = U * diag(temp) * U';

    for iter = 1:iterations
        % Use a variable transform to solve the regularization problem for
        % fixed D
        new_trainx = fD_isqrt * trainx;
        [W,costf,err,reg] = train_kernel(new_trainx,trainy,task_indexes,gamma,kernel_method);
        W = fD_isqrt * W;

        costfunc = [costfunc; iter, costf, err, reg];

        % Update D
        [U,S,V] = svd(W);
        if (dim > T)
            S = [S, zeros(dim,dim-T)];   % pad so that diag(S) has length dim
        end
        Smin = feval(Dmin_method, diag(S));
        D = U * diag(Smin) * U';

        % Compute f(D)^{-1/2} for the next step
        fS = feval(f_method,Smin);
        temp = sqrt(fS);
        tempi = find(temp > eps);
        temp(tempi) = 1./temp(tempi);
        fD_isqrt = U * diag(temp) * U';
    end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                          %
%                Independent Regularizations               %
%                                                          %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (method == independent)
    [W,costfunc,err,reg] = train_kernel(trainx,trainy,task_indexes,gamma,kernel_method);
    D = [];
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                          %
%                    Variable selection                    %
%                                                          %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (method == diagonal)
    if (norm(D-diag(diag(D))) > eps)
        error('D should be diagonal');
    end
    costfunc = [];

    % Compute f(D)^{-1/2} for the next step
    fS = feval(f_method,diag(D));
    temp = sqrt(fS);
    tempi = find(temp > eps);
    temp(tempi) = 1./temp(tempi);
    fD_isqrt = diag(temp);

    for iter = 1:iterations
        new_trainx = fD_isqrt * trainx;
        [W,costf,err,reg] = train_kernel(new_trainx,trainy,task_indexes,gamma,kernel_method);
        W = fD_isqrt * W;

        costfunc = [costfunc; iter, costf, err, reg];

        % Update D
        Smin = feval(Dmin_method, sqrt(sum(W.^2,2)));   % L2 norms of the rows of W
        D = diag(Smin);

        % Compute f(D)^{-1/2} for the next step
        fS = feval(f_method,Smin);
        temp = sqrt(fS);
        tempi = find(temp > eps);
        temp(tempi) = 1./temp(tempi);
        fD_isqrt = diag(temp);
    end
end
--------------------------------------------------------------------------------
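In the trace-norm special case used by run_code_example.m (f(d) = 1/d elementwise and Dmin_method = @(b) b/sum(b)), the D update above has a closed form: D = U diag(s/sum(s)) U', where s holds the singular values of W. A standalone sketch of that single update (illustrative only; train_alternating.m above is the actual implementation):

[U, S, V] = svd(W);              % W is dim x T, columns are the task vectors w_t
s = zeros(size(W,1), 1);
s(1:min(size(W))) = diag(S);     % singular values, zero-padded to length dim
d = s / sum(s);                  % minimizes sum_i s_i^2 / d_i subject to sum(d) = 1, d >= 0
D = U * diag(d) * U';            % updated shared-feature matrix, trace(D) = 1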
/train_alternating_epsilon.m:
--------------------------------------------------------------------------------
function [W,D,costfunc,mineps] = train_alternating_epsilon(trainx,trainy,task_indexes,gamma,Dini,iterations,...
    method,kernel_method,f_method,Dmin_method,epsilon_init)

% Wrapper around train_alternating() that smooths the D update with a
% perturbation epsilon, decreasing epsilon by factors of 10 and keeping the
% solution with the lowest (perturbed) cost. With epsilon > 0, costfunc is a
% cell array holding one cost history per epsilon value tried.

if (epsilon_init < eps)
    % Run without epsilon
    [W,D,costfunc] = train_alternating(trainx,trainy,task_indexes,gamma,Dini,iterations,...
        method,kernel_method,f_method,Dmin_method);
    mineps = 0;
    return;
end

mincost = inf;
epsilon = epsilon_init;

i = 1;
while (epsilon > eps)
    Dmin_e_method = @(b) (feval(Dmin_method, sqrt(b.^2+epsilon)));
    [We,De,costfunc_e] = train_alternating(trainx,trainy,task_indexes,gamma,Dini,iterations,...
        method,kernel_method,f_method,Dmin_e_method);
    s = svd(De);
    % Add the perturbation term gamma*epsilon*sum(f(s)) to the cost and
    % regularization columns of the cost history
    costfunc_e(:,[2,4]) = costfunc_e(:,[2,4]) + gamma * epsilon * sum(feval(f_method,s));

    curcost = costfunc_e(size(costfunc_e,1),2);   % final cost for this epsilon
    if (curcost < mincost)
        mincost = curcost;
        mineps = epsilon;
        W = We;
        D = De;
    end

    costfunc{i} = costfunc_e;
    i = i+1;
    epsilon = epsilon / 10;
end
--------------------------------------------------------------------------------

/train_kernel.m:
--------------------------------------------------------------------------------
function [W,costfunc,err,reg] = train_kernel(trainx,trainy,task_indexes,gamma,kernel_method)

% Solves the regularization problem independently for each task and collects
% the per-task weight vectors as the columns of W.

num_data = size(trainx,2);
dim = size(trainx,1);
T = length(task_indexes);
task_indexes(T+1) = num_data+1;   % sentinel marking the end of the last task

W = zeros(dim,T);
costfunc = 0;
err = 0;
reg = 0;

for t = 1:T
    % Get the data for this task
    x = trainx(: , task_indexes(t):task_indexes(t+1)-1);
    y = trainy(task_indexes(t):task_indexes(t+1)-1);
    K = x'*x;       % linear Gram matrix for this task
    [a, costfunct, errt, regt] = feval(kernel_method,K,y,gamma);
    W(:,t) = x*a;   % recover the primal weight vector from the dual coefficients

    costfunc = costfunc + costfunct;
    err = err + errt;
    reg = reg + regt;
end
--------------------------------------------------------------------------------
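A minimal end-to-end sketch wiring the pieces together. All sizes and values are hypothetical, and the anonymous function f below is an assumption that plays the same role as vec_inv in run_code_example.m:

T = 3; dim = 10; n = 30;                 % hypothetical sizes
trainx = randn(dim, T*n);                % columns concatenated task by task
trainy = randn(T*n, 1);
task_indexes = 1 : n : T*n;              % starting column of each task
Dini = eye(dim) / dim;                   % symmetric, PSD, trace 1, as required
f = @(d) (d > eps) ./ max(d, eps);       % elementwise pseudo-inverse, like vec_inv
[W, D, costs, mineps] = train_alternating_epsilon(trainx, trainy, task_indexes, ...
    0.1, Dini, 50, 1, 'kernel_regression', f, @(b) b/sum(b), 1e-3);   % method 1 = 'feat'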