├── LICENSE
├── README.md
├── kernel_regression.m
├── run_code_example.m
├── test_error_unbalanced.m
├── train_alternating.m
├── train_alternating_epsilon.m
└── train_kernel.m

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2008 Andreas Argyriou

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
### Multi-task feature learning

This is a method for learning multiple tasks simultaneously, assuming that they share a set of common latent features. It is based on regularizing the spectrum of the matrix whose columns are the task parameter vectors; regularization with the trace norm is a special case of this framework. Among the diverse applications of multi-task learning, one example is the personalized recommendation of products to consumers.

The methodology is presented in detail in the papers
[Multi-Task Feature Learning](http://ttic.uchicago.edu/~argyriou/papers/nips06_online.pdf),
[Convex Multi-Task Feature Learning](http://ttic.uchicago.edu/~argyriou/papers/mtl_feat.pdf)
and
[A Spectral Regularization Framework for Multi-Task Structure Learning](http://ttic.uchicago.edu/~argyriou/papers/spectral_mtl.pdf).

Note that it is possible to use the method with a nonlinear kernel instead of explicit features: it suffices to run the code on a feature representation of the Gram matrix obtained by a Gram-Schmidt or Cholesky decomposition (see [Convex Multi-Task Feature Learning](http://ttic.uchicago.edu/~argyriou/papers/mtl_feat.pdf), sec. 5).
--------------------------------------------------------------------------------

/kernel_regression.m:
--------------------------------------------------------------------------------
function [a, cost, err, reg] = kernel_regression(K, y, gamma)

% Kernel ridge (square-loss) regression for a single task.
% Returns the dual coefficients a = (K + gamma*I)^{-1} y together with the
% total regularized cost and its error and regularization parts.

n = size(K,1);
M = K + gamma * eye(n);
a = M \ y;                  % solve (K + gamma*I) a = y; backslash is more stable than inv()
cost = gamma * (y' * a);    % gamma * y' (K + gamma*I)^{-1} y
err = gamma^2 * (a' * a);   % gamma^2 * y' (K + gamma*I)^{-2} y
reg = cost - err;
--------------------------------------------------------------------------------
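A minimal usage sketch for kernel_regression.m on synthetic single-task data (all sizes and values below are illustrative, not part of the repository); the Gram matrix and the primal recovery mirror what train_kernel.m does for each task:

x = randn(5, 20);                              % 5 features, 20 training points
y = x' * randn(5, 1) + 0.1 * randn(20, 1);     % noisy linear labels (column vector)
K = x' * x;                                    % linear Gram matrix, as formed in train_kernel.m
[a, cost, err, reg] = kernel_regression(K, y, 0.1);
w = x * a;                                     % primal weight vector, as in W(:,t) = x*a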
/run_code_example.m:
--------------------------------------------------------------------------------
function [bestcv,bestgamma,cverrs,testerrs,theW,theD] = ...
    run_code_example(gammas,trainx,trainy,testx,testy,task_indexes,task_indexes_test,cv_size,Dini,iterations,...
    method_str, epsilon_init, fname)

% Example script of running the train_alternating_epsilon() code
% Uses regularizer trace( W' D^{-1} W ) and square loss (see file
% kernel_regression.m)

% INPUTS:
% gammas: a vector of the gammas to select from using cross-validation
%         (this example passes it straight to the training code, so give a scalar)
% task_indexes: starting indexes for each task in data
% cv_size: number of cross-validations performed (not used in this example)
% Dini: the initial matrix D
% epsilon_init : perturbation epsilon (use 0 for no perturbation)
% method_str : see below
% See train_alternating.m for the rest

% OUTPUTS:
% bestcv: the best cross-validation performance among the gammas tried
% bestgamma: the best gamma found using cross-validation
% testerrs: mean test error across tasks (see test_error_unbalanced.m)
% theW: a matrix for which each column is a w_t
% theD: matrix D estimated

% Cross-validation is not actually performed in this example, so the
% corresponding outputs are returned empty to keep all outputs defined.
bestcv = []; bestgamma = []; cverrs = [];

feat = 1; independent = 2; diagonal = 3;

if (strcmp(method_str,'feat'))
    method = feat;
elseif (strcmp(method_str,'ind'))
    method = independent;
elseif (strcmp(method_str,'diag'))
    method = diagonal;
else
    error('Wrong method');
end
% (see file train_alternating.m)

% Define method for computing f(D)
function v = vec_inv(d)
    v = zeros(length(d),1);
    ind = find(d > eps);
    v(ind) = 1 ./ d(ind);
end

[theW,theD,costs,mineps] = train_alternating_epsilon(trainx, trainy, task_indexes, gammas, Dini, iterations, ...
    method, 'kernel_regression', @vec_inv, @(b)(b/sum(b)), epsilon_init);

testerrs = mean(test_error_unbalanced(theW,testx,testy,task_indexes_test));

save(sprintf('results_%s_%s_lin.mat',fname,method_str),'gammas','Dini','method_str',...
    'testerrs','theW','theD','costs','mineps');

end
--------------------------------------------------------------------------------

/test_error_unbalanced.m:
--------------------------------------------------------------------------------
function [testerrs] = test_error_unbalanced(W,testx,testy,task_indexes)

% Mean squared test error per task; tasks may have different sample sizes.

T = length(task_indexes);
testerrs = zeros(T,1);
task_indexes(T+1) = length(testy)+1;   % sentinel marking the end of the last task

for t = 1:T
    t_testx = testx(:, task_indexes(t):task_indexes(t+1)-1 );
    t_testy = testy(task_indexes(t):task_indexes(t+1)-1)';
    prediction = W(:,t)' * t_testx;
    testerrs(t) = (t_testy - prediction) * (t_testy - prediction)' / (task_indexes(t+1)-task_indexes(t));
end
--------------------------------------------------------------------------------
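A sketch of how task_indexes delimits unbalanced tasks (the three task sizes below are hypothetical):

% Three tasks with 10, 25 and 15 test points; testx columns are concatenated task by task.
W = randn(5, 3);                     % one weight vector per task, dim = 5 features
testx = randn(5, 50);
testy = randn(50, 1);
task_indexes = [1, 11, 36];          % starting column of each task
% Internally the function appends length(testy)+1 = 51 as a sentinel, so task t
% occupies columns task_indexes(t) : task_indexes(t+1)-1.
testerrs = test_error_unbalanced(W, testx, testy, task_indexes);   % 3x1 per-task MSE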
/train_alternating.m:
--------------------------------------------------------------------------------
function [W,D,costfunc] = train_alternating(trainx,trainy,task_indexes,gamma,Dini,iterations,...
    method,kernel_method,f_method,Dmin_method)

% Main algorithm for Multi-task Feature Learning (with a linear kernel)
% See [Argyriou, Evgeniou, Pontil, NIPS 2006, ML journal 2007]
%
% task_indexes : starting indexes of each task's samples in the data
%                (task sample sizes may be unbalanced)
% gamma : regularization parameter
% method : feat = orthonormal feature learning, i.e. using trace(W' f(D) W)
%          independent = learning with no coupling across tasks (i.e.
%                        using ||W||_2 regularization)
%          diagonal = variable (feature) selection (i.e. D is diagonal)
% kernel_method : method for kernel learning (e.g. SVM, least-squares
%                 regression etc.)
% f_method : evaluates f(D) (acts on the singular values of D)
% Dmin_method : method for minimizing over D, of the form
%               min_d { sum_i f(d_i) b_i^2 }
%               (b_i are the singular values of W,
%               or in the case of variable selection the L2 norms of the rows of W)

num_data = size(trainx,2);
dim = size(trainx,1);
T = length(task_indexes);

feat = 1; independent = 2; diagonal = 3;

if (max(max(abs(Dini-Dini'))) > eps)
    error('D should be symmetric');
end
if (min(eig(Dini)) < -eps)
    error('D should be positive semidefinite');
end
if (abs(trace(Dini)-1) > 100*eps)
    error('D should have trace 1');
end
D = Dini;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                            %
%                     Feature Learning                       %
%                                                            %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (method == feat)
    costfunc = [];

    % Compute f(D)^{-1/2} for the next step
    [U,S,dummy] = svd(D);   % svd seems more robust than eig
    fS = feval(f_method,diag(S));
    temp = sqrt(fS);
    tempi = find(temp > eps);
    temp(tempi) = 1./temp(tempi);
    fD_isqrt = U * diag(temp) * U';

    for iter = 1:iterations
        % Use a variable transform to solve the regularization problem for
        % fixed D
        new_trainx = fD_isqrt * trainx;
        [W,costf,err,reg] = train_kernel(new_trainx,trainy,task_indexes,gamma,kernel_method);
        W = fD_isqrt * W;

        costfunc = [costfunc; iter, costf, err, reg];

        % Update D
        [U,S,V] = svd(W);
        if (dim > T)
            S = [S, zeros(dim,dim-T)];   % pad so that diag(S) has length dim
        end
        Smin = feval(Dmin_method, diag(S));
        D = U * diag(Smin) * U';

        % Compute f(D)^{-1/2} for the next step
        fS = feval(f_method,Smin);
        temp = sqrt(fS);
        tempi = find(temp > eps);
        temp(tempi) = 1./temp(tempi);
        fD_isqrt = U * diag(temp) * U';
    end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                          %
%                Independent Regularizations               %
%                                                          %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (method == independent)
    [W,costfunc,err,reg] = train_kernel(trainx,trainy,task_indexes,gamma,kernel_method);
    D = [];
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                          %
%                    Variable selection                    %
%                                                          %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (method == diagonal)
    if (norm(D-diag(diag(D))) > eps)
        error('D should be diagonal');
    end
    costfunc = [];

    % Compute f(D)^{-1/2} for the next step
    fS = feval(f_method,diag(D));
    temp = sqrt(fS);
    tempi = find(temp > eps);
    temp(tempi) = 1./temp(tempi);
    fD_isqrt = diag(temp);

    for iter = 1:iterations
        new_trainx = fD_isqrt * trainx;
        [W,costf,err,reg] = train_kernel(new_trainx,trainy,task_indexes,gamma,kernel_method);
        W = fD_isqrt * W;

        costfunc = [costfunc; iter, costf, err, reg];

        % Update D
        Smin = feval(Dmin_method, sqrt(sum(W.^2,2)));   % L2 norms of the rows of W
        D = diag(Smin);

        % Compute f(D)^{-1/2} for the next step
        fS = feval(f_method,Smin);
        temp = sqrt(fS);
        tempi = find(temp > eps);
        temp(tempi) = 1./temp(tempi);
        fD_isqrt = diag(temp);
    end
end
--------------------------------------------------------------------------------
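In the trace-norm special case used by run_code_example.m (f(d) = 1/d elementwise and Dmin_method = @(b) b/sum(b)), the D update above has a closed form: D = U diag(s/sum(s)) U', where s holds the singular values of W. A standalone sketch of that single update (illustrative only; train_alternating.m above is the actual implementation):

[U, S, V] = svd(W);              % W is dim x T, columns are the task vectors w_t
s = zeros(size(W,1), 1);
s(1:min(size(W))) = diag(S);     % singular values, zero-padded to length dim
d = s / sum(s);                  % minimizes sum_i s_i^2 / d_i subject to sum(d) = 1, d >= 0
D = U * diag(d) * U';            % updated shared-feature matrix, trace(D) = 1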
/train_alternating_epsilon.m:
--------------------------------------------------------------------------------
function [W,D,costfunc,mineps] = train_alternating_epsilon(trainx,trainy,task_indexes,gamma,Dini,iterations,...
    method,kernel_method,f_method,Dmin_method,epsilon_init)

% Wrapper around train_alternating() that smooths the D update with a
% perturbation epsilon, decreasing epsilon by factors of 10 and keeping the
% solution with the lowest (perturbed) cost. With epsilon > 0, costfunc is a
% cell array holding one cost history per epsilon value tried.

if (epsilon_init < eps)
    % Run without epsilon
    [W,D,costfunc] = train_alternating(trainx,trainy,task_indexes,gamma,Dini,iterations,...
        method,kernel_method,f_method,Dmin_method);
    mineps = 0;
    return;
end

mincost = inf;
epsilon = epsilon_init;

i = 1;
while (epsilon > eps)
    Dmin_e_method = @(b) (feval(Dmin_method, sqrt(b.^2+epsilon)));
    [We,De,costfunc_e] = train_alternating(trainx,trainy,task_indexes,gamma,Dini,iterations,...
        method,kernel_method,f_method,Dmin_e_method);
    s = svd(De);
    % Add the perturbation term gamma*epsilon*sum(f(s)) to the cost and
    % regularization columns of the cost history
    costfunc_e(:,[2,4]) = costfunc_e(:,[2,4]) + gamma * epsilon * sum(feval(f_method,s));

    curcost = costfunc_e(size(costfunc_e,1),2);   % final cost for this epsilon
    if (curcost < mincost)
        mincost = curcost;
        mineps = epsilon;
        W = We;
        D = De;
    end

    costfunc{i} = costfunc_e;
    i = i+1;
    epsilon = epsilon / 10;
end
--------------------------------------------------------------------------------

/train_kernel.m:
--------------------------------------------------------------------------------
function [W,costfunc,err,reg] = train_kernel(trainx,trainy,task_indexes,gamma,kernel_method)

% Solves the regularization problem independently for each task and collects
% the per-task weight vectors as the columns of W.

num_data = size(trainx,2);
dim = size(trainx,1);
T = length(task_indexes);
task_indexes(T+1) = num_data+1;   % sentinel marking the end of the last task

W = zeros(dim,T);
costfunc = 0;
err = 0;
reg = 0;

for t = 1:T
    % Get the data for this task
    x = trainx(: , task_indexes(t):task_indexes(t+1)-1);
    y = trainy(task_indexes(t):task_indexes(t+1)-1);
    K = x'*x;       % linear Gram matrix for this task
    [a, costfunct, errt, regt] = feval(kernel_method,K,y,gamma);
    W(:,t) = x*a;   % recover the primal weight vector from the dual coefficients

    costfunc = costfunc + costfunct;
    err = err + errt;
    reg = reg + regt;
end
--------------------------------------------------------------------------------
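A minimal end-to-end sketch wiring the pieces together. All sizes and values are hypothetical, and the anonymous function f below is an assumption that plays the same role as vec_inv in run_code_example.m:

T = 3; dim = 10; n = 30;                 % hypothetical sizes
trainx = randn(dim, T*n);                % columns concatenated task by task
trainy = randn(T*n, 1);
task_indexes = 1 : n : T*n;              % starting column of each task
Dini = eye(dim) / dim;                   % symmetric, PSD, trace 1, as required
f = @(d) (d > eps) ./ max(d, eps);       % elementwise pseudo-inverse, like vec_inv
[W, D, costs, mineps] = train_alternating_epsilon(trainx, trainy, task_indexes, ...
    0.1, Dini, 50, 1, 'kernel_regression', f, @(b) b/sum(b), 1e-3);   % method 1 = 'feat'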