├── +core ├── Graph.m ├── Instance.m └── Instances.m ├── +random_generators ├── RandomGraphGenerator.m └── RandomInstancesGenerator.m ├── +utils ├── CostFunction.m ├── EdgeWeighter.m └── ParameterLearner.m ├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── architecture.png ├── cost.png ├── log_cost.png ├── main.m └── plotcost.m

/+core/Graph.m:
--------------------------------------------------------------------------------
classdef Graph < handle
    %Holds information about the graph topology and edge features.
    %Only existing edges (nonzero entries of G) have features.

    properties
        num_nodes = 0;    % number of nodes in the graph
        num_features = 0; % number of features attached to each edge
        G = [[]];         % unweighted directed adjacency matrix, [n x n] matrix of 0s and 1s
        features = {};    % cell array with one [n x n] matrix per feature type; entry (u,v) is that feature's value for edge u->v
        isSparse = true;  % true when G and the feature matrices are stored as sparse matrices
    end

    methods
        function g = Graph(num_nodes,num_features,G,features,isSparse)
            %Populates and returns a Graph object with the supplied
            %properties. When called with any other number of arguments
            %the property defaults are kept.
            if nargin == 5
                g.num_nodes = num_nodes;
                g.num_features = num_features;
                g.G = G;
                g.features = features;
                g.isSparse = isSparse;
            end
        end

        function adjMat = getWeightedAdjMatrix(g,weigther,w)
            %Combines the features of each edge into a single double value
            %(weight or strength) and returns an [n x n] matrix of doubles.
            %   weigther - object with a calcWeights(dot_product) method
            %   w        - parameter vector, w(k) multiplies feature k
            %The weight is computed from sum_k features{k}*w(k) and then
            %masked with G so that only existing edges keep a weight.
            if g.isSparse
                dot_product = sparse(g.num_nodes,g.num_nodes);
            else
                dot_product = zeros(g.num_nodes);
            end
            for k=1:g.num_features
                dot_product = dot_product + g.features{k}*w(k);
            end
            adjMat = g.G.*weigther.calcWeights(dot_product);
        end

        function setSparse(g,isSparse)
            %If true forces the graph to use the sparse representation,
            %and the dense representation otherwise. Cost computation can
            %differ significantly depending on the representation. If your
            %graph is really sparse (<5% of the edges exist) the sparse
            %representation is best.
            if g.isSparse == isSparse
                return;
            end
            if isSparse
                convert = @sparse;
            else
                convert = @full;
            end
            g.G = convert(g.G);
            for k=1:g.num_features
                g.features{k} = convert(g.features{k});
            end
            %remember the new representation; without this the flag goes
            %stale and getWeightedAdjMatrix picks the wrong accumulator type
            g.isSparse = isSparse;
        end
    end
end

--------------------------------------------------------------------------------
/+core/Instance.m:
--------------------------------------------------------------------------------
classdef Instance < handle
    %Instance holds information on positive and negative links for a source
    %node in a graph.

    properties
        source_node_index = 1; % source node (restart node of the random walk)
        positive_links = [];   % positive nodes (links)
        negative_links = [];   % negative nodes (links)
        graph@core.Graph;      % a graph object that contains information on links and features
    end

    properties (Constant, Access = private)
        MAX_ITERATIONS = 100; % upper bound on power-iteration steps
        TOLERANCE = 1e-12;    % squared-difference threshold used as the convergence test
    end

    methods
        function this = Instance(source_node_index,positive_links,negative_links,graph)
            %Creates an instance; with no arguments the property defaults
            %are kept (needed for array preallocation).
            if nargin > 0
                this.source_node_index = source_node_index;
                this.positive_links = positive_links;
                this.negative_links = negative_links;
                this.graph = graph;
            end
        end

        function cost = calcCost(this,weighter,alpha,costf,w)
            %Calculates the cost for the given parameters w as defined by
            %Leskovec. The weighter and cost functions need to be specified
            %as well as the restart probability (alpha).
            %For every positive link the cost function is applied to
            %pagerank(negative) - pagerank(positive) and the results summed.
            pagerank = this.calcPagerank(weighter,alpha,w);
            negative_scores = pagerank(this.negative_links);
            cost = 0;
            for di=1:length(this.positive_links)
                deltas = negative_scores-pagerank(this.positive_links(di));
                cost = cost+sum(costf.calcCost(deltas));
            end
        end

        function [cost,gradient] = calcCostAndGradient(this,weighter,alpha,costf,w)
            %Calculates the cost and the gradient for the given parameters
            %w as defined by Leskovec. The weighter and cost functions need
            %to be specified as well as the restart probability (alpha).
            %   cost     - scalar, summed over all (negative,positive) pairs
            %   gradient - [1 x num_features] row vector
            n = this.graph.num_nodes;
            nf = this.graph.num_features;
            max_iter = this.MAX_ITERATIONS;
            tol = this.TOLERANCE;

            %weighted adjacency matrix and, for every stored edge (i,j),
            %the sum of its row
            adjMat = this.graph.getWeightedAdjMatrix(weighter,w);
            sumRowsAdjMat = full(sum(adjMat,2));
            [i,j,v_adjMat] = find(adjMat);
            v_sum_fuv_w = sumRowsAdjMat(i);
            v_sum_fuv_w_squared = v_sum_fuv_w.^2;

            %transition probability matrix with respect to the source node
            Q = this.calcTransitionProbabilityMatrixForSourceSparse(n,i,j,v_adjMat,this.source_node_index,alpha,v_sum_fuv_w);
            Qt = Q';

            %pagerank by power iteration; every iterate is kept because the
            %derivative recursion below reuses them
            p = zeros(n,max_iter);
            p(:,1) = 1.0/n;
            %default to the final iterate so a non-converged run still
            %yields a valid column index (previously this indexed p(:,0))
            last_iter = max_iter;
            converged = false;
            for iter=2:max_iter
                p(:,iter) = Qt*p(:,iter-1);
                if sum((p(:,iter)-p(:,iter-1)).^2) < tol
                    last_iter = iter;
                    converged = true;
                    break;
                end
            end
            if ~converged
                warning('core:Instance:pagerankNotConverged', ...
                    'p did not converge in %d iterations',max_iter);
            end
            pagerank = p(:,last_iter);

            %derivative of the pagerank with respect to every parameter
            d_p = cell(nf,1);
            for k=1:nf
                dQ = this.TcalcdQM(weighter,w,alpha,k,adjMat,v_sum_fuv_w,v_sum_fuv_w_squared);
                dQt = dQ';
                d_p_t_1 = zeros(n,1);
                d_p_t = d_p_t_1;
                converged = false;
                for iter=1:max_iter
                    d_p_t = Qt*d_p_t_1+dQt*p(:,min(iter,last_iter));
                    if sum((d_p_t_1-d_p_t).^2) < tol
                        converged = true;
                        break;
                    end
                    d_p_t_1 = d_p_t;
                end
                if ~converged
                    warning('core:Instance:derivativeNotConverged', ...
                        'dp did not converge in %d iterations (feature %d)',max_iter,k);
                end
                d_p{k} = d_p_t';
            end

            %cost and gradient over all (negative,positive) pairs. Forcing
            %column/row orientation makes the pairing independent of how
            %the link vectors were supplied.
            neg = this.negative_links(:);
            pos = this.positive_links(:)';
            l = repmat(neg,1,numel(pos)); % [numNeg x numPos], negative node of each pair
            d = repmat(pos,numel(neg),1); % [numNeg x numPos], positive node of each pair
            [costs,gradients] = costf.calcCostAndGradient(pagerank(l)-pagerank(d));
            cost = sum(sum(costs));
            gradient = zeros(1,nf);
            for k=1:nf
                gradient(k) = sum(sum(gradients.*(d_p{k}(l)-d_p{k}(d))));
            end
        end

        function pagerank = calcPagerank(this,weighter,alpha,w)
            %Calculates the pagerank for this instance and the specified
            %parameters, weighter function and restart probability.
            %Returns an [n x 1] vector that is renormalized to sum to 1
            %after every iteration.
            n = this.graph.num_nodes;

            %weighted adjacency matrix and per-edge row sums
            adjMat = this.graph.getWeightedAdjMatrix(weighter,w);
            sumRowsAdjMat = full(sum(adjMat,2));
            [i,j,v_adjMat] = find(adjMat);
            v_sum_fuv_w = sumRowsAdjMat(i);

            %transition probability matrix with respect to the source node
            Q = this.calcTransitionProbabilityMatrixForSourceSparse(n,i,j,v_adjMat,this.source_node_index,alpha,v_sum_fuv_w);
            Qt = Q';

            %power iteration starting from the uniform distribution
            pagerank = repmat(1.0/n,n,1);
            converged = false;
            for iter=1:this.MAX_ITERATIONS
                previous_pagerank = pagerank;
                pagerank = Qt*previous_pagerank;
                pagerank = pagerank./sum(pagerank);
                if sum((pagerank-previous_pagerank).^2) < this.TOLERANCE
                    converged = true;
                    break;
                end
            end
            if ~converged
                warning('core:Instance:pagerankNotConverged', ...
                    'pagerank did not converge in %d iterations for w = %s', ...
                    this.MAX_ITERATIONS,mat2str(w));
            end
        end
    end

    methods (Access = private)

        function dQ = TcalcdQM(sDL,weighter,w,alpha,k,adjMat,v_sum_fuv_w,v_sum_fuv_w_squared) %#ok<INUSL>
            %Derivative of the transition matrix with respect to w(k),
            %returned as a sparse [n x n] matrix with the sparsity pattern
            %of adjMat.
            %The edge strength f_uv is a function of the full dot product
            %sum_j psi_j*w(j), so its derivative with respect to w(k) is
            %   exponential (type 1): psi_k .* f_uv
            %   logistic (otherwise): psi_k .* f_uv .* (1-f_uv)
            %adjMat already holds f_uv on existing edges and 0 elsewhere.
            %w is kept in the signature for the existing call sites.
            n = sDL.graph.num_nodes;
            [i,j,v_adjMat] = find(adjMat);
            psi_k = sDL.graph.features{k};
            if weighter.type == 1
                dfuv_dwk = psi_k.*adjMat;
            else
                dfuv_dwk = psi_k.*adjMat.*(1-adjMat);
            end
            %read the derivative at exactly the (i,j) positions of adjMat,
            %so a zero derivative on an existing edge cannot shift the values
            v_dfuv_dwk = full(dfuv_dwk(sub2ind([n n],i,j)));
            %row sums of the derivative, expanded per stored edge
            sum_dfuv_dwk = full(sum(dfuv_dwk,2));
            v_sum_dfuv_dwk = sum_dfuv_dwk(i);
            %quotient rule applied to f_uv / sum_v f_uv, scaled by (1-alpha)
            res = (1-alpha).*(v_dfuv_dwk.*v_sum_fuv_w-v_adjMat.*v_sum_dfuv_dwk)./v_sum_fuv_w_squared;
            dQ = sparse(i,j,res,n,n);
        end

        function dQ = TcalcdQ(sDL,weighter,w,alpha,k,adjMat,sumRowsAdjMatRepmat)
            %Deprecated element-by-element version of TcalcdQM, kept for
            %reference only.
            %NOTE(review): the edge derivative here is
            %weighter.calcGradient(w(k),psi_k), which uses feature k alone;
            %that matches the full edge strength only when there is a
            %single feature.
            dQ = sparse(sDL.graph.num_nodes,sDL.graph.num_nodes);
            sum_dufv_dw = zeros(sDL.graph.num_nodes,1);
            for j=1:sDL.graph.num_nodes
                for u=1:sDL.graph.num_nodes
                    if sDL.graph.G(j,u) == 1
                        sum_dufv_dw(j) = sum_dufv_dw(j)+weighter.calcGradient(w(k),sDL.graph.features{k}(j,u));
                    end
                end
            end
            for j=1:sDL.graph.num_nodes
                for u=1:sDL.graph.num_nodes
                    if sDL.graph.G(j,u) == 1
                        dQ(j,u) = (1-alpha)*(weighter.calcGradient(w(k),sDL.graph.features{k}(j,u))*sumRowsAdjMatRepmat(j)-adjMat(j,u)*sum_dufv_dw(j))/(sumRowsAdjMatRepmat(j)^2);
                    end
                end
            end
        end

        function TransProbMatrix = calcTransitionProbabilityMatrixForSourceSparse(this,n,i,j,v_adjMat,source_node,alpha,v_row_sums) %#ok<INUSL>
            %Builds the sparse [n x n] transition matrix
            %   Q = (1-alpha)*rownormalized(adjMat) + alpha*1(v=s)
            %from the (i,j,value) triplets of the weighted adjacency
            %matrix. v_row_sums(e) is the row sum belonging to triplet e.
            normalized = v_adjMat./v_row_sums;
            TransProbMatrix = sparse(i,j,(1-alpha)*normalized,n,n);
            TransProbMatrix(:,source_node) = TransProbMatrix(:,source_node)+alpha;
            %rows whose total is exactly alpha have no outgoing edges; send
            %all of their probability mass to the source node
            TransProbMatrix(sum(TransProbMatrix,2)==alpha,source_node) = 1;
        end

        function TransProbMatrix = calcTransitionProbabilityMatrixForSource(this,adjMat,source_node,alpha,row_sums) %#ok<INUSL>
            %Dense counterpart of the sparse builder:
            %Q = normalize adjMat to make it row stochastic
            %Q = (1-alpha)Q+alpha*1(v=s)
            %1(v=s) is a matrix with zeros except for the s column that contains ones
            %NOTE(review): rows with a zero row sum are divided by zero
            %here; unlike the sparse builder there is no special handling.
            adjMat = bsxfun(@rdivide,adjMat,row_sums);
            TransProbMatrix = (1-alpha)*adjMat;
            TransProbMatrix(:,source_node) = TransProbMatrix(:,source_node)+alpha;
        end
    end

end

--------------------------------------------------------------------------------
/+core/Instances.m:
--------------------------------------------------------------------------------
classdef Instances < handle
    %A set of Instance objects with their respective graphs.
    %An Instances object can be a training or a test dataset for
    %link prediction.

    properties
        instances; % an array of core.Instance objects
    end

    properties (SetAccess = private)
        n = 0;     % number of instances captured at construction time
    end

    methods
        function iset = Instances(instances)
            %Initializes the dataset with an array of instances. With no
            %arguments an empty dataset is created.
            if nargin > 0
                iset.instances = instances;
                iset.n = length(instances);
            end
        end

        function num_instances = getNumberOfInstances(this)
            %returns the number of instances in this dataset
            num_instances = this.n;
        end

        function [cost,gradient] = calcCostAndGradient(this,weighter,alpha,costf,w)
            %Calculates the regularized cost and gradient for this dataset
            %using a given cost function. Calls calcCostAndGradient on each
            %instance and adds the L2 term sum(w.^2) / 2*w.
            %   gradient - [1 x length(w)] row vector
            costs = zeros(this.n,1);
            gradients = zeros(this.n,length(w));
            for i=1:this.n
                [cost_i,gradient_i] = this.instances(i).calcCostAndGradient(weighter,alpha,costf,w);
                costs(i) = cost_i;
                gradients(i,:) = gradient_i;
            end
            cost = sum(w.^2)+sum(costs);
            %sum along dimension 1 explicitly: with a single instance
            %sum(gradients) would add across features and return a scalar
            gradient = 2*w(:)'+sum(gradients,1);
        end

        function cost = calcCost(this,weighter,alpha,costf,w)
            %Calculates the regularized cost for this dataset using a given
            %cost function. Calls calcCost on each instance.
            costs = zeros(this.n,1);
            for i=1:this.n
                costs(i) = this.instances(i).calcCost(weighter,alpha,costf,w);
            end
            cost = sum(w.^2)+sum(costs);
        end

    end

end

--------------------------------------------------------------------------------
/+random_generators/RandomGraphGenerator.m:
--------------------------------------------------------------------------------
classdef RandomGraphGenerator < handle
    % Generates a random graph built with the hybrid process.
    % With probability p_preferential a link is chosen using preferential
    % attachment and with probability 1-p_preferential a link is chosen
    % uniformly at random. Also generates num_features random features for
    % each edge.

    properties
        num_nodes = 100;     % number of nodes in each graph
        num_features = 2;    % number of features in each graph
        start_nodes = 10;    % starting number of nodes; also the number of links every new node creates
        p_preferential = .8; % probability for preferential attachment
    end

    methods
        function g = generate(this)
            %Generates a random undirected graph (stored as a symmetric
            %adjacency matrix), starting with start_nodes fully connected
            %nodes and growing up to num_nodes nodes. Every new node links
            %to start_nodes distinct existing nodes.
            %Also generates num_features random features for each edge,
            %drawn with randn.
            %Returns a core.Graph in sparse representation.
            n = this.num_nodes;
            m = this.start_nodes;
            if m > n
                error('random_generators:RandomGraphGenerator:tooManyStartNodes', ...
                    'start_nodes (%d) must not exceed num_nodes (%d)',m,n);
            end

            %fully connected seed without self loops
            G = zeros(n,n);
            G(1:m,1:m) = ones(m)-eye(m);
            degrees = zeros(1,n);
            degrees(1:m) = m-1;
            total_sum_degrees = (m-1)*m;

            for k=m+1:n
                perm = randperm(k-1); % uniform order over the existing nodes
                perm_i = 1;
                for j=1:m
                    if rand() < this.p_preferential
                        %preferential attachment: roulette wheel over the
                        %degrees, repeated until an unlinked node is drawn
                        l = 0;
                        while l == 0
                            p = rand();
                            %fallback when rounding leaves p >= 0 after the scan
                            candidate = k-1;
                            for i=1:k-1
                                p = p-degrees(i)/total_sum_degrees;
                                if p < 0
                                    candidate = i;
                                    break
                                end
                            end
                            if G(candidate,k) ~= 1
                                l = candidate;
                            end
                        end
                    else
                        %uniform choice: next node of the permutation that
                        %is not linked to k yet. There are k-1 >= m
                        %candidates and fewer than m links so far, so an
                        %unlinked node always remains.
                        while G(perm(perm_i),k) == 1
                            perm_i = perm_i+1;
                        end
                        l = perm(perm_i);
                        perm_i = perm_i+1;
                    end
                    degrees(l) = degrees(l)+1;
                    total_sum_degrees = total_sum_degrees+1;
                    G(l,k) = 1;
                    G(k,l) = 1;
                end
                %the new node's own degree is only counted once it is fully linked
                degrees(k) = m;
                total_sum_degrees = total_sum_degrees+m;
            end

            G = sparse(G);
            %one sparse feature matrix per feature, nonzero only on edges
            psi = cell(1,this.num_features);
            for k=1:this.num_features
                psi{k} = sparse(randn(n).*G);
            end
            g = core.Graph(n,this.num_features,G,psi,true);
        end
    end

end
--------------------------------------------------------------------------------
/+random_generators/RandomInstancesGenerator.m:
--------------------------------------------------------------------------------
classdef RandomInstancesGenerator < handle
    %Used to generate a random instances object for testing. Users can
    %specify how many instance objects to generate and the generator for
    %the graph objects.

    properties
        K = 10;             % top K links are chosen as positive, the rest as negative
        alpha = .3;         % restart probability
        num_graphs = 3;     % number of graphs to generate
        num_instances = 50; % number of instances
        graph_generator;    % a generator used to generate graphs (created in the constructor)
        weighter;           % weighter function (created in the constructor)
        w = [1,-1];         % feature parameters; the generator's num_features is set to length(w)
    end

    methods
        function this = RandomInstancesGenerator()
            %Returns a generator object with default parameters.
            %The handle-object defaults are created here rather than in
            %the properties block: a default written there is evaluated
            %once per class, so every generator would share (and mutate)
            %the same RandomGraphGenerator and EdgeWeighter.
            this.graph_generator = random_generators.RandomGraphGenerator();
            this.weighter = utils.EdgeWeighter(1);
        end

        function [dataset,weighter,alpha,w] = generate(this)
            %Generates the instances object as specified by the properties.
            %   dataset  - core.Instances holding num_instances instances
            %   weighter - the weighter used to create the labels
            %   alpha    - the restart probability used
            %   w        - the true feature parameters used
            %Instances are spread over the graphs as evenly as possible;
            %within a graph, instance s uses node s as its source.
            this.graph_generator.num_features = length(this.w);
            graphs(1,this.num_graphs) = core.Graph();
            for i=1:this.num_graphs
                graphs(i) = this.graph_generator.generate();
            end

            %whole instances per graph; the first `remainder` graphs get one more
            per_graph = floor(this.num_instances/this.num_graphs);
            remainder = mod(this.num_instances,this.num_graphs);

            instances(1,this.num_instances) = core.Instance();
            k = 1;
            for i=1:this.num_graphs
                num_start_nodes = per_graph;
                if i <= remainder
                    num_start_nodes = num_start_nodes+1;
                end
                n = graphs(i).num_nodes;
                if num_start_nodes > n
                    error('random_generators:RandomInstancesGenerator:tooManyInstances', ...
                        'Graph %d has %d nodes but %d source nodes were requested',i,n,num_start_nodes);
                end
                if this.K >= n
                    error('random_generators:RandomInstancesGenerator:invalidK', ...
                        'K (%d) must be smaller than the number of nodes (%d)',this.K,n);
                end
                for s=1:num_start_nodes
                    instance = core.Instance();
                    instance.graph = graphs(i);
                    instance.source_node_index = s;
                    pagerank = instance.calcPagerank(this.weighter,this.alpha,this.w);
                    %ascending sort: the last K indices have the highest pagerank
                    [~,Idxs] = sort(pagerank);
                    instance.positive_links = Idxs(n-this.K+1:n);
                    instance.negative_links = Idxs(1:n-this.K);
                    instances(k) = instance;
                    k = k+1;
                end
            end
            %group all instances with their respective graphs together in
            %an instances object
            dataset = core.Instances(instances);
            weighter = this.weighter;
            alpha = this.alpha;
            w = this.w;
        end
    end

end

--------------------------------------------------------------------------------
/+utils/CostFunction.m:
--------------------------------------------------------------------------------
classdef CostFunction < handle
    %Calculates the cost and gradient for a given delta value.
    %The only implemented function (type 1) is the sigmoid
    %1/(1+exp(-x/b)).

    properties
        b = .00001; % b parameter present in every type of cost function, refer to Leskovec
        type = 1;   % 1 WMW loss function
    end

    methods
        function costf = CostFunction(b,type)
            %Creates a cost function. Both arguments are optional and
            %default to the property defaults.
            if nargin >= 1
                costf.b = b;
            end
            if nargin >= 2
                costf.type = type;
            end
        end

        function cost = calcCost(cf,x)
            %Element-wise cost of x.
            cf.assertSupportedType();
            cost = 1.0./(1.0+exp(-x./cf.b));
        end

        function gradient = calcGradient(cf,x)
            %Element-wise derivative of the cost with respect to x.
            cost = cf.calcCost(x);
            gradient = cost .* (1-cost) ./ cf.b;
        end

        function [cost,gradient] = calcCostAndGradient(cf,x)
            %Element-wise cost and derivative, computed in one pass.
            cost = cf.calcCost(x);
            gradient = cost .* (1-cost) ./ cf.b;
        end

    end

    methods (Access = private)
        function assertSupportedType(cf)
            %Fails with a clear message instead of leaving the outputs
            %unassigned when an unknown type is configured.
            if cf.type ~= 1
                error('utils:CostFunction:unsupportedType', ...
                    'Unsupported cost function type: %d',cf.type);
            end
        end
    end

end

--------------------------------------------------------------------------------
/+utils/EdgeWeighter.m:
--------------------------------------------------------------------------------
classdef EdgeWeighter < handle
    %Turns the dot product of edge features and parameters into an edge
    %strength.

    properties
        type = 1; % 1 is exponential edge strength, anything else is logistic edge strength
    end

    methods
        function weighter = EdgeWeighter(type)
            %Creates a weighter; type is optional and defaults to 1.
            if nargin > 0
                weighter.type = type;
            end
        end

        function weights = calcWeights(weighter,dot_product)
            %Element-wise edge strength for a matrix of dot products.
            if weighter.type == 1
                %spfun applies exp to the nonzero entries only, so zero
                %entries stay zero
                weights = spfun(@exp,dot_product);
            else
                %element-wise division; `1/(...)` is matrix right division
                %and fails or gives wrong results for matrix input
                weights = 1./(1+exp(-dot_product));
            end
        end

        function gradient = calcGradient(weighter,w,psi)
            %Element-wise derivative with respect to w of the strength of
            %the single term psi*w.
            if weighter.type == 1
                gradient = psi.*spfun(@exp,psi*w);
            else
                gradient = psi.*exp(-psi*w)./(1+exp(-psi*w)).^2;
            end
        end
    end

end

--------------------------------------------------------------------------------
/+utils/ParameterLearner.m:
--------------------------------------------------------------------------------
classdef ParameterLearner
    %A utility class that learns the best parameters for a specified
    %instances object with the learn method. Groups together information
    %about the weighter and cost functions and the restart probability.
    %Can also define the maximal time limit for learning the parameters.

    properties
        weighter@utils.EdgeWeighter; % weighter function
        costf@utils.CostFunction;    % cost function
        alpha = .3;                  % restart probability
        time_limit = 30;             % time limit for optimization in seconds
        print_progress=true;         % whether or not to display progress while learning the parameters
    end

    methods
        function this = ParameterLearner(weighter,alpha,costf,time_limit,print_progress)
            %Creates a new parameter learner. Every argument is optional;
            %omitted trailing arguments keep their default values.
            if nargin >= 1
                this.weighter = weighter;
            else
                this.weighter = utils.EdgeWeighter();
            end
            if nargin >= 2
                this.alpha = alpha;
            end
            if nargin >= 3
                this.costf = costf;
            else
                this.costf = utils.CostFunction();
            end
            if nargin >= 4
                this.time_limit = time_limit;
            end
            if nargin >= 5
                this.print_progress = print_progress;
            end
        end

        function w = learn(this,instances)
            %Learns the parameters using the simulated annealing method.
            %The parameters are bounded to [-3,3].
            %   instances - core.Instances dataset; the number of features
            %               is read from its first instance's graph
            %   w         - [1 x num_features] learned parameter vector
            nf = instances.instances(1).graph.num_features;
            w0 = repmat(.000001,1,nf);
            objective = @(x) instances.calcCost(this.weighter,this.alpha,this.costf,x);
            if this.print_progress
                display_mode = 'iter';
            else
                display_mode = 'off';
            end
            options = saoptimset('Display',display_mode,'ReannealInterval',10,...
                'ObjectiveLimit',nf+1,'TimeLimit',this.time_limit,...
                'TemperatureFcn',@temperatureboltz,'TolFun',1e-10);
            lower_bound = -repmat(3.0,1,nf);
            upper_bound = repmat(3.0,1,nf);
            w = simulannealbnd(objective,w0,lower_bound,upper_bound,options);
        end
    end

end

-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # 
Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Andrej 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Link prediction with supervised random walks 2 | ================================== 3 | 4 | A Matlab implementation of the supervised random walks algorithm for link prediction proposed by Backstrom and Leskovec. 5 | 6 | For a detailed explanation of the algorithm we refer users to their [2011 paper](http://arxiv.org/pdf/1011.4071.pdf). 7 | 8 | ---------------------------------- 9 | User manual 10 | ---------------------------------- 11 | 12 | You can test the code with artificial data to check its performance and computational efficiency. To do this you need to generate a random dataset with some prespecified parameters. Next you can use a parameter learner and see how well it will guess the parameters you specified in the previous step. 13 | 14 | ```matlab 15 | %generate a random dataset, if you want to change the number of instances, 16 | %features or the weighter function, simply change the respective properties 17 | %of the generator before invoking the generate function 18 | 19 | generator = random_generators.RandomInstancesGenerator(); 20 | [dataset,weighter,alpha,true_w] = generator.generate(); 21 | 22 | %use a WMW cost function; a time limit of 30 seconds and progress printing 23 | %for learning the parameters 24 | learner = utils.ParameterLearner(weighter,alpha,utils.CostFunction(),30,true); 25 | learned_w = learner.learn(dataset); 26 | 27 | disp(['The true parameters are: ',num2str(true_w)]) 28 | disp(['The learned parameters are: ',num2str(learned_w)]) 29 | ``` 30 | 31 | The above code is pretty self-explanatory, but I feel it is important to discuss the system abstractions here. I designed the system guided by the Weka organization, so anyone who is familiar with it will feel very comfortable using this framework. 
32 | I'll just give a brief definition of the major abstractions. 33 | 34 | - A *Graph* is represented by an unweighted adjacency matrix that determines which nodes are connected to each other. Additionally, each edge has an array of values attached to it which we call *features*. 35 | 36 | - An *Instance* object defines the positive and negative links for a given node. You can think of these as the classes/labels for the node in a multilabel classification task, which ultimately is what we are trying to learn to predict. For predicting the positive and negative links the algorithm uses information about the graph topology and the features for each link. The effect that each feature has on the prediction depends heavily on the feature parameters and slightly on the *weighter function*. 37 | 38 | - In order to learn anything useful, the algorithm needs many instance objects which are grouped in an *Instances* object for easier manipulation. 39 | 40 | - The *ParameterLearner* tries to find those parameters that give the predictions that match the positive/negative links specified. Each instance is treated independently although some of them may share the same graph. The learner uses the *cost function* to evaluate its predictions. 
41 | 42 | 43 | The system architecture and package structure are summarized in the following diagram 44 | 45 | ![alt tag](https://raw.githubusercontent.com/gajduk/link-prediction-with-supervised-random-walks/master/architecture.png) 46 | 47 | 48 | You can confirm that the problem is smooth by plotting the cost function for a small number of features using the following code 49 | 50 | ```matlab 51 | %define the bounds and the granularity 52 | granularity = 10; 53 | bound1 = 2.5; 54 | bound2 = 2.5; 55 | costf = utils.CostFunction(); 56 | 57 | %create the meshgrid and initialize the cost 58 | x = -bound1:bound1/granularity*2:bound1; 59 | y = -bound2:bound2/granularity*2:bound2; 60 | [X,Y] = meshgrid(x,y); 61 | n = length(x); 62 | Z = zeros(n,n); 63 | 64 | %generate the random dataset 65 | generator = random_generators.RandomInstancesGenerator(); 66 | [dataset,weighter,alpha,true_w] = generator.generate(); 67 | 68 | %calculate the cost for each point 69 | parfor i=1:n 70 | for k=1:n 71 | cost = dataset.calcCost(weighter,alpha,costf,[X(i,k)+0.000001,Y(i,k)+0.000001]); 72 | Z(i,k) = cost; 73 | end 74 | end 75 | 76 | %plot the results 77 | figure;surf(X,Y,Z);xlabel('w1');ylabel('w2');title('cost'); 78 | figure;surf(X,Y,log(Z));xlabel('w1');ylabel('w2');title('log cost'); 79 | ``` 80 | 81 | By increasing the granularity you can get the following images 82 | 83 | ![alt tag](https://raw.githubusercontent.com/gajduk/link-prediction-with-supervised-random-walks/master/cost.png) 84 | 85 | ![alt tag](https://raw.githubusercontent.com/gajduk/link-prediction-with-supervised-random-walks/master/log_cost.png) 86 | 87 | 88 | ------------------- 89 | 90 | I implemented this mostly to hone my Matlab skills. 91 | However, I also did [another implementation](https://github.com/gajduk/TwitterLinkPrediction), this time in Java, which I am currently using to study the social network landscape in Macedonia. 
-------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gajduk/link-prediction-with-supervised-random-walks/d22a109c64d2129759b1d3144ddf27e584e073ac/architecture.png -------------------------------------------------------------------------------- /cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gajduk/link-prediction-with-supervised-random-walks/d22a109c64d2129759b1d3144ddf27e584e073ac/cost.png -------------------------------------------------------------------------------- /log_cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gajduk/link-prediction-with-supervised-random-walks/d22a109c64d2129759b1d3144ddf27e584e073ac/log_cost.png
--------------------------------------------------------------------------------
/main.m:
--------------------------------------------------------------------------------
% Demo script: builds a random dataset from known parameters and then lets
% a parameter learner try to recover them.

% Build the dataset. To change the number of instances, the features or
% the weighter function, set the corresponding generator properties before
% calling generate.
generator = random_generators.RandomInstancesGenerator();
[dataset,weighter,alpha,true_w] = generator.generate();

% Learn the parameters with a WMW cost function, a 30 second time limit
% and progress printing enabled.
learner = utils.ParameterLearner(weighter,alpha,utils.CostFunction(),30,true);
learned_w = learner.learn(dataset);

% Show the true and the learned parameters.
disp(['The true parameters are: ',num2str(true_w)])
disp(['The learned parameters are: ',num2str(learned_w)])

--------------------------------------------------------------------------------
/plotcost.m:
--------------------------------------------------------------------------------
% Plots the cost surface of a random instances object over a 2-D grid of
% parameter values. Might take a while; the grid is evaluated in a parfor
% loop, so opening a parallel pool before running this script is a good
% idea.

% Grid settings: half-width of each axis and number of steps per half-axis.
granularity = 10;
bound1 = 2.5;
bound2 = 2.5;
costf = utils.CostFunction();

% Parameter grid and the matrix that receives the cost at every grid point.
x = -bound1:bound1/granularity*2:bound1;
y = -bound2:bound2/granularity*2:bound2;
[X,Y] = meshgrid(x,y);
n = length(x);
Z = zeros(n,n);

% Random dataset whose cost is plotted.
generator = random_generators.RandomInstancesGenerator();
[dataset,weighter,alpha,true_w] = generator.generate();

% Evaluate the cost at every grid point; both coordinates are shifted by
% 0.000001 before being passed to calcCost.
parfor i=1:n
    for k=1:n
        Z(i,k) = dataset.calcCost(weighter,alpha,costf,[X(i,k)+0.000001,Y(i,k)+0.000001]);
    end
end

% Cost surface and its logarithm.
figure;surf(X,Y,Z);xlabel('w1');ylabel('w2');title('cost');
figure;surf(X,Y,log(Z));xlabel('w1');ylabel('w2');title('log cost');
--------------------------------------------------------------------------------