├── README.md
├── .gitattributes
├── .gitignore
├── LICENSE
├── node2vec.py
└── connectome_embed_nature.py

/README.md:
--------------------------------------------------------------------------------
Connectome embeddings: A deep learning framework for
mapping higher-order relations between brain structure and function
Author: Gideon Rosenthal

Original node2vec implementation -
node2vec: Scalable Feature Learning for Networks
Aditya Grover and Jure Leskovec
Knowledge Discovery and Data Mining (KDD), 2016
https://github.com/aditya-grover/node2vec

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

# OSX
# =========================

.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 gidonro

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/node2vec.py:
--------------------------------------------------------------------------------
import itertools
import random
from datetime import datetime

import networkx as nx
import numpy as np
from joblib import Parallel, delayed


class nodesWalk:
    """Container for per-worker walk state (currently unused; see
    simulate_walks_parallel, which passes the Graph object directly)."""
    def __init__(self, walk_length, nodes, graph_current):
        self.walk_length = walk_length
        self.nodes = nodes
        self.graph = graph_current


def do_walks(graph, walk_length, nodes, i, node_num_walks=100):
    """Run node_num_walks walk iterations over all nodes in a single worker."""
    print(i)
    current_walks = []
    for walk_iter in range(node_num_walks):
        if walk_iter % 5000 == 0:
            print(walk_iter)
        # Reseed per iteration so parallel workers do not share RNG state.
        random.seed(datetime.now().timestamp())
        random.shuffle(nodes)
        for node in nodes:
            current_walks.append(graph.node2vec_walk(walk_length=walk_length, start_node=node))
    return current_walks


class Graph(object):
    def __init__(self, nx_G, is_directed, p, q):
        self.G = nx_G
        self.is_directed = is_directed
        self.p = p
        self.q = q

    def node2vec_walk(self, walk_length, start_node):
        '''
        Simulate a random walk starting from the start node.
        '''
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = sorted(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    # First step: sample from the precomputed node distribution.
                    walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    # Later steps: sample from the (prev, cur) edge distribution,
                    # which encodes the p/q second-order bias.
                    prev = walk[-2]
                    next_node = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
                                                    alias_edges[(prev, cur)][1])]
                    walk.append(next_node)
            else:
                break

        return walk

    def simulate_walks(self, num_walks, walk_length):
        '''
        Repeatedly simulate random walks from each node.
        '''
        G = self.G
        walks = []
        nodes = list(G.nodes())
        print('Walk iteration:')
        for walk_iter in range(num_walks):
            if walk_iter % 100 == 0:
                print(walk_iter + 1, '/', num_walks)
            random.shuffle(nodes)
            for node in nodes:
                walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))

        return walks

    def simulate_walks_parallel(self, num_walks, walk_length, workers=6):
        '''
        Repeatedly simulate random walks from each node, split across workers.
        '''
        G = self.G
        nodes = list(G.nodes())
        print('Walk iteration:')

        # Each worker performs num_walks // workers iterations over all nodes.
        walks = Parallel(n_jobs=workers, max_nbytes='1M')(
            delayed(do_walks)(self, walk_length, nodes, i, num_walks // workers)
            for i in range(workers))

        walks = list(itertools.chain.from_iterable(walks))

        return walks

    def get_alias_edge(self, src, dst):
        '''
        Get the alias edge setup lists for a given edge.
        '''
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for dst_nbr in sorted(G.neighbors(dst)):
            if dst_nbr == src:
                # Returning to the previous node is scaled by 1/p.
                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
            elif G.has_edge(dst_nbr, src):
                # Neighbors shared with the previous node keep their weight.
                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
            else:
                # Moving further from the previous node is scaled by 1/q.
                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]

        return alias_setup(normalized_probs)

    def preprocess_transition_probs(self):
        '''
        Preprocessing of transition probabilities for guiding the random walks.
        '''
        G = self.G
        is_directed = self.is_directed

        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = alias_setup(normalized_probs)

        alias_edges = {}

        if is_directed:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
        else:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
                alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges

        return
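# Illustrative sketch (an addition, not part of the original module): the
# tables built by alias_setup() below let alias_draw() sample from a discrete
# distribution in O(1) per draw. The empirical frequencies returned by this
# hypothetical helper should approximate `probs`.
def _example_alias_sampling(probs=(0.5, 0.3, 0.2), n_draws=100000):
    J, q = alias_setup(list(probs))
    counts = np.zeros(len(probs))
    for _ in range(n_draws):
        counts[alias_draw(J, q)] += 1
    return counts / n_draws  # e.g. approximately [0.5, 0.3, 0.2]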
def alias_setup(probs):
    '''
    Compute utility lists for non-uniform sampling from discrete distributions.
    Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
    for details.
    '''
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=int)

    # Partition outcomes into those below and above the uniform threshold 1/K.
    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K * prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    # Pair each small outcome with a large one that donates probability mass.
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()

        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)

    return J, q


def alias_draw(J, q):
    '''
    Draw a sample from a non-uniform discrete distribution using alias sampling.
    '''
    K = len(J)

    # Pick a column uniformly, then either keep it or take its alias.
    kk = int(np.floor(np.random.rand() * K))
    if np.random.rand() < q[kk]:
        return kk
    else:
        return J[kk]
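# Minimal usage sketch (an addition for illustration): build a toy weighted
# graph, precompute the alias tables, and draw a few biased walks. Assumes
# networkx >= 2; the graph and the p/q values are made up.
if __name__ == '__main__':
    demo_nx = nx.Graph()
    demo_nx.add_weighted_edges_from([(0, 1, 1.0), (1, 2, 2.0), (2, 0, 0.5), (2, 3, 1.0)])
    demo = Graph(demo_nx, is_directed=False, p=0.25, q=4.0)  # p < 1 favors backtracking
    demo.preprocess_transition_probs()
    demo_walks = demo.simulate_walks(num_walks=5, walk_length=10)
    print(len(demo_walks), 'walks; first walk:', demo_walks[0])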
--------------------------------------------------------------------------------
/connectome_embed_nature.py:
--------------------------------------------------------------------------------
'''
Reference implementation of node2vec.

Original node2vec functions and implementation
Author: Aditya Grover

For more details, refer to the paper:
node2vec: Scalable Feature Learning for Networks
Aditya Grover and Jure Leskovec
Knowledge Discovery and Data Mining (KDD), 2016

Modifications for:
Connectome embeddings: A deep learning framework for
mapping higher-order relations between brain structure and function
Author: Gideon Rosenthal
'''

import pickle

import networkx as nx
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import Normalizer

import node2vec


def create_embedding(dir_name, input_edge_list, output_embedding, current_dti, current_name,
                     permutation_no=500, lesion_node=0, dimensions=30, walk_length=20,
                     num_walks=800, window_size=3, iter=1, workers=10, p=0.1, q=1.6, sg=0,
                     weighted=True, directed=False):
    '''
    Args:
        Connectome embedding related attributes:
        dir_name: directory name
        input_edge_list: name of the input edge list
        output_embedding: name of the output embedding
        current_dti: matrix of the current DTI data to embed
        current_name: name of the analysis
        permutation_no: how many permutations are needed
        lesion_node: node to lesion, if any (0 = no lesion)

        node2vec related attributes:
        dimensions: dimensionality of the embeddings
        walk_length: length of walk per source
        num_walks: number of walks per source
        window_size: context size for optimization
        iter: number of epochs in SGD
        workers: number of parallel workers
        p: return hyperparameter
        q: in-out hyperparameter
        sg: skip-gram = 1, CBOW = 0
        weighted: boolean specifying (un)weighted graph
        directed: boolean specifying (un)directed graph

    Returns:
        word2Vecmodelsorted: normalized word2vec embeddings, one set per
        permutation, shape (permutation_no, number_of_nodes, dimensions)
    '''
    # Near-zero weight used to keep lesioned edges in the graph without letting
    # walks traverse them with any meaningful probability.
    zero = 1.11324633283e-16
    # Write the connectivity matrix as an edge list in the format digestible by
    # node2vec. Note that node 0 cannot be lesioned, since 0 doubles as 'no lesion'.
    if lesion_node > 0:
        with open(input_edge_list, 'w') as edge_list:
            for r in range(current_dti.shape[0]):
                for c in range(current_dti.shape[0]):
                    if current_dti[r, c] != 0 and r != lesion_node and c != lesion_node:
                        edge_list.write('%s %s %s \n' % (r, c, current_dti[r, c]))
                    if r == lesion_node or c == lesion_node:
                        edge_list.write('%s %s %s \n' % (r, c, zero))
    else:
        with open(input_edge_list, 'w') as edge_list:
            for r in range(current_dti.shape[0]):
                for c in range(current_dti.shape[0]):
                    if current_dti[r, c] != 0:
                        edge_list.write('%s %s %s \n' % (r, c, current_dti[r, c]))

    # num_walks is multiplied by permutation_no so that all permutations share a
    # single (expensive) walk-generation call.
    walks_agg = node2vec_agg_walks(input_edge_list, walk_length=walk_length,
                                   num_walks=num_walks * permutation_no,
                                   workers=workers, p=p, q=q, weighted=weighted, directed=directed)
    with open(dir_name + current_name + '_walks_lesion_' + str(lesion_node), 'wb') as f:
        pickle.dump(walks_agg, f)
    word2Vecmodelsorted = node2veclearn_agg(walks_agg, output_embedding, num_walks=num_walks,
                                            permutation_no=permutation_no,
                                            number_of_nodes=current_dti.shape[0],
                                            dimensions=dimensions, window_size=window_size,
                                            iter=iter, workers=workers, sg=sg)

    return word2Vecmodelsorted
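# Usage sketch for create_embedding() (an addition; the paths and the toy
# connectivity matrix are hypothetical, and the settings are scaled down so the
# demo runs quickly):
def _example_create_embedding(tmp_dir='/tmp/'):
    rng = np.random.RandomState(0)
    toy_dti = rng.rand(10, 10)       # stand-in for a structural connectivity matrix
    np.fill_diagonal(toy_dti, 0)     # no self-connections
    # Returns an array of shape (2, 10, 4): permutations x nodes x dimensions.
    return create_embedding(dir_name=tmp_dir,
                            input_edge_list=tmp_dir + 'toy_edge_list.txt',
                            output_embedding=tmp_dir + 'toy',
                            current_dti=toy_dti,
                            current_name='toy',
                            permutation_no=2, dimensions=4, walk_length=5,
                            num_walks=10, window_size=2, iter=1, workers=2,
                            p=0.5, q=2.0)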
def read_graph(input_edge_list, weighted, directed):
    '''
    Reads the input network into networkx.
    '''
    if weighted:
        G = nx.read_edgelist(input_edge_list, nodetype=int, data=(('weight', float),),
                             create_using=nx.DiGraph())
    else:
        G = nx.read_edgelist(input_edge_list, nodetype=int, create_using=nx.DiGraph())
        for edge in G.edges():
            G[edge[0]][edge[1]]['weight'] = 1

    if not directed:
        G = G.to_undirected()

    return G


def learn_embeddings(walks, dimensions, window_size, workers, iter, output_embedding, sg=0):
    '''
    Learn embeddings by training a word2vec model on the walks.
    '''
    # word2vec expects string tokens, so stringify the node ids.
    walks = [[str(node) for node in walk] for walk in walks]
    # gensim >= 4 API (vector_size/epochs); older releases used size/iter.
    model = Word2Vec(walks, vector_size=dimensions, window=window_size, min_count=0, sg=sg,
                     workers=workers, epochs=iter)
    model.save(output_embedding + '.embeddings')

    return model


def normalize_embeddings(word2Vecmodel):
    '''
    Return the embeddings ordered by integer node id, with L2-normalized rows.
    '''
    vectors = word2Vecmodel.wv  # gensim >= 4; older releases exposed .syn0 directly
    word2Vecmodelsorted = np.zeros([len(vectors), vectors.vector_size])
    for i in range(len(vectors)):
        word2Vecmodelsorted[i] = vectors[str(i)]
    return Normalizer(copy=False).fit_transform(word2Vecmodelsorted)
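# Sketch (an addition): learn_embeddings() trains word2vec on stringified walks,
# and normalize_embeddings() then returns an (n_nodes, dimensions) array with
# rows ordered by node id. The toy walks and output path are hypothetical.
def _example_learn_and_normalize(tmp_prefix='/tmp/toy'):
    toy_walks = [[0, 1, 2, 1, 0], [2, 1, 0, 1, 2]] * 50   # toy corpus over 3 nodes
    model = learn_embeddings(toy_walks, dimensions=4, window_size=2, workers=1,
                             iter=1, output_embedding=tmp_prefix)
    return normalize_embeddings(model)   # shape (3, 4), unit-norm rows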
def node2veclearn(input_edge_list, output_embedding, dimensions=128, walk_length=10, num_walks=10,
                  window_size=10, iter=1, workers=8, p=1, q=1, weighted=True, directed=True, sg=0):
    """Pipeline for representational learning for all nodes in a graph.

    Keyword arguments:
    input_edge_list -- input graph path
    output_embedding -- embeddings path
    dimensions -- number of dimensions (default=128)
    walk_length -- length of walk per source (default=10)
    num_walks -- number of walks per source (default=10)
    window_size -- context size for optimization (default=10)
    iter -- number of epochs in SGD (default=1)
    workers -- number of parallel workers (default=8)
    p -- return hyperparameter (default=1)
    q -- in-out hyperparameter (default=1)
    weighted -- boolean specifying (un)weighted graph (default=True)
    directed -- boolean specifying (un)directed graph (default=True)
    sg -- skip-gram = 1, CBOW = 0 (default=0)

    Example:
        working_dir = '/home/lab_users/Downloads/NKI_Rockland/hagmann/'
        input_edge_list = working_dir + 'hagmann_dti_no_ENT_only_positive.txt'
        output_embedding = working_dir + 'hagmann_dti.embeddings'
        node2veclearn(input_edge_list, output_embedding, dimensions=30, walk_length=50,
                      num_walks=400, window_size=3, iter=1, workers=40, p=0.2, q=2.0,
                      weighted=True, directed=True)
    """
    nx_G = read_graph(input_edge_list, weighted, directed)
    G = node2vec.Graph(nx_G, directed, p, q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(num_walks, walk_length)
    model = learn_embeddings(walks, dimensions, window_size, workers, iter, output_embedding, sg)
    return model


def node2vec_agg_walks(input_edge_list, walk_length=10, num_walks=10, workers=8, p=1, q=1,
                       weighted=True, directed=True):
    """Generate the aggregated random walks for all nodes in a graph.

    Keyword arguments:
    input_edge_list -- input graph path
    walk_length -- length of walk per source (default=10)
    num_walks -- number of walks per source (default=10)
    workers -- number of parallel workers (default=8)
    p -- return hyperparameter (default=1)
    q -- in-out hyperparameter (default=1)
    weighted -- boolean specifying (un)weighted graph (default=True)
    directed -- boolean specifying (un)directed graph (default=True)

    Example:
        working_dir = '/home/lab_users/Downloads/NKI_Rockland/hagmann/'
        input_edge_list = working_dir + 'hagmann_dti_no_ENT_only_positive.txt'
        walks = node2vec_agg_walks(input_edge_list, walk_length=50, num_walks=400,
                                   workers=40, p=0.2, q=2.0, weighted=True, directed=True)
    """
    nx_G = read_graph(input_edge_list, weighted, directed)
    G = node2vec.Graph(nx_G, directed, p, q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks_parallel(num_walks, walk_length, workers)

    return walks


def node2veclearn_agg(walks, output_embedding, num_walks=10, permutation_no=10, number_of_nodes=83,
                      dimensions=128, window_size=10, iter=1, workers=8, sg=0):
    """Learn one embedding per permutation from the aggregated walks.

    Keyword arguments:
    walks -- aggregated walks, as returned by node2vec_agg_walks
    output_embedding -- embeddings path
    num_walks -- number of walks per source per permutation (default=10)
    permutation_no -- number of permutations (default=10)
    number_of_nodes -- number of nodes in the graph (default=83)
    dimensions -- number of dimensions (default=128)
    window_size -- context size for optimization (default=10)
    iter -- number of epochs in SGD (default=1)
    workers -- number of parallel workers (default=8)
    sg -- skip-gram = 1, CBOW = 0 (default=0)

    Example:
        working_dir = '/home/lab_users/Downloads/NKI_Rockland/hagmann/'
        output_embedding = working_dir + 'hagmann_dti.embeddings'
        embeddings = node2veclearn_agg(walks, output_embedding, num_walks=400,
                                       permutation_no=500, number_of_nodes=83,
                                       dimensions=30, window_size=3, iter=1, workers=40)
    """
    word2vec_permutations = np.zeros([permutation_no, number_of_nodes, dimensions])
    count = 0
    # The flat walk list holds permutation_no contiguous blocks of
    # num_walks * number_of_nodes walks; train one word2vec model per block.
    for permute in range(0, permutation_no * num_walks * number_of_nodes, num_walks * number_of_nodes):
        model = learn_embeddings(walks[permute:permute + num_walks * number_of_nodes], dimensions,
                                 window_size, workers, iter, output_embedding, sg)
        word2Vecmodelsorted = normalize_embeddings(model)
        word2vec_permutations[count, ...] = word2Vecmodelsorted
        count += 1
    return word2vec_permutations
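# Sketch of the two-stage pipeline above (an addition for illustration; the
# paths are hypothetical and the sizes scaled down). node2vec_agg_walks() is
# asked for num_walks * permutation_no walks per node, and node2veclearn_agg()
# then cuts the flat list into permutation-sized blocks:
def _example_agg_pipeline(input_edge_list='/tmp/toy_edge_list.txt',
                          output_embedding='/tmp/toy'):
    walks = node2vec_agg_walks(input_edge_list, walk_length=5, num_walks=10 * 2,
                               workers=2, p=0.5, q=2.0, weighted=True, directed=False)
    return node2veclearn_agg(walks, output_embedding, num_walks=10, permutation_no=2,
                             number_of_nodes=10, dimensions=4, window_size=2,
                             iter=1, workers=2)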
def node2veclearn_update(input_edge_list, org_embedding, new_embedding, dimensions=128,
                         walk_length=10, num_walks=10, window_size=10, iter=1, workers=8,
                         p=1, q=1, weighted=True, directed=True):
    """Pipeline for updating an existing embedding with new walks.

    Keyword arguments:
    input_edge_list -- input graph path
    org_embedding -- original embedding path
    new_embedding -- new embedding path
    dimensions -- number of dimensions (default=128)
    walk_length -- length of walk per source (default=10)
    num_walks -- number of walks per source (default=10)
    window_size -- context size for optimization (default=10)
    iter -- number of epochs in SGD (default=1)
    workers -- number of parallel workers (default=8)
    p -- return hyperparameter (default=1)
    q -- in-out hyperparameter (default=1)
    weighted -- boolean specifying (un)weighted graph (default=True)
    directed -- boolean specifying (un)directed graph (default=True)

    Example:
        working_dir = '/home/lab_users/Downloads/NKI_Rockland/hagmann/'
        input_edge_list = working_dir + 'hagmann_dti_no_ENT_only_positive.txt'
        org_embedding = working_dir + 'hagmann_dti.embeddings'
        new_embedding = working_dir + 'hagmann_dti_updated'
        node2veclearn_update(input_edge_list, org_embedding, new_embedding, walk_length=50,
                             num_walks=400, p=0.2, q=2.0, weighted=True, directed=True)
    """
    nx_G = read_graph(input_edge_list, weighted, directed)
    G = node2vec.Graph(nx_G, directed, p, q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(num_walks, walk_length)

    # Stringify the node ids so the walks match the saved model's vocabulary.
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec.load(org_embedding)
    # gensim >= 4 requires the corpus size and epoch count to be explicit here.
    model.train(walks, total_examples=len(walks), epochs=model.epochs)
    model.save(new_embedding + '.embeddings')

    return model
--------------------------------------------------------------------------------