├── graph ├── test_cat.tsv ├── .DS_Store └── test.tsv ├── overview.png ├── emb ├── .DS_Store ├── test_emb.txt └── test_emb_ind.txt ├── src ├── .DS_Store ├── latent_summary.pkl ├── util.py ├── main_inductive.py └── main.py └── README.md /graph/test_cat.tsv: -------------------------------------------------------------------------------- 1 | 0 0 4 2 | 1 5 6 3 | 2 7 8 4 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/overview.png -------------------------------------------------------------------------------- /emb/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/emb/.DS_Store -------------------------------------------------------------------------------- /graph/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/graph/.DS_Store -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/src/.DS_Store -------------------------------------------------------------------------------- /src/latent_summary.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/src/latent_summary.pkl -------------------------------------------------------------------------------- /graph/test.tsv: -------------------------------------------------------------------------------- 1 | 0 2 1 2 | 1 0 1 3 | 1 2 5 4 | 1 5 1 5 | 2 8 5 6 | 3 0 5 7 | 3 4 1 8 | 3 6 1 9 | 4 0 1 10 | 4 3 1 11 | 4 7 5 12 | 7 0 1 13 | -------------------------------------------------------------------------------- /src/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np, networkx as nx 2 | import scipy.sparse 3 | 4 | class RepMethod(): 5 | def __init__(self, bucket_max_value = None, method="hetero", num_buckets = None, use_other_features = False, operators = None, 6 | use_total = 0): 7 | self.method = method 8 | self.bucket_max_value = bucket_max_value 9 | self.num_buckets = num_buckets 10 | self.use_other_features = use_other_features 11 | self.operators = operators 12 | self.use_total = use_total 13 | 14 | 15 | class Graph(): 16 | def __init__(self, adj_matrix = None, num_nodes = None, max_id = None, directed = False, neighbor_list = None, 17 | num_buckets = None, base_features = None, cat_dict = None, id_cat_dict = None, unique_cat = None, check_eq = True): 18 | # self.nx_graph = nx_graph 19 | self.adj_matrix = adj_matrix 20 | self.num_nodes = num_nodes 21 | self.max_id = max_id 22 | self.base_features = base_features 23 | self.unique_cat = unique_cat 24 | self.directed = directed 25 | self.num_buckets = num_buckets 26 | 27 | self. neighbor_list = neighbor_list 28 | self.cat_dict = cat_dict 29 | self.id_cat_dict = id_cat_dict 30 | self.check_eq = check_eq 31 | 32 | 33 | def get_delimiter(input_file_path): 34 | delimiter = " " 35 | if ".csv" in input_file_path: 36 | delimiter = "," 37 | elif ".tsv" in input_file_path: 38 | delimiter = "\t" 39 | else: 40 | sys.exit('Format not supported.') 41 | 42 | return delimiter 43 | 44 | 45 | def write_embedding(rep, output_file_path): 46 | N, K = rep.shape 47 | 48 | fOut = open(output_file_path, 'w') 49 | fOut.write(str(N) + ' ' + str(K) + '\n') 50 | 51 | for i in range(N): 52 | cur_line = ' '.join([str(np.round(ii, 6)) for ii in rep[i,:]]) 53 | fOut.write(str(i) + ' ' + cur_line + '\n') 54 | 55 | fOut.close() 56 | 57 | return 58 | 59 | 60 | def read_embedding(input_file_path): 61 | 62 | fIn = open(input_file_path, 'r') 63 | N, K = fIn.readline().split(' ') 64 | 65 | rep = np.zeros((int(N), int(K))) 66 | 67 | for line in fIn.readlines(): 68 | parts = line.strip().split(' ') 69 | rep[int(parts[0]),:] = [float(ele) for ele in parts[1:]] 70 | 71 | 72 | return rep 73 | 74 | -------------------------------------------------------------------------------- /emb/test_emb.txt: -------------------------------------------------------------------------------- 1 | 9 26 2 | 0 1.850588 0.830303 -0.191896 -1.045115 0.505562 -0.200553 -0.050742 0.13008 3.053755 1.433569 -0.041825 -1.78883 -0.094202 0.089586 -0.459508 -0.309663 -0.11008 4.866181 2.22298 0.121998 0.05738 2.989983 -0.308342 -0.771511 -0.265161 0.491672 3 | 1 1.207391 -1.041898 0.126836 -0.047614 0.20108 0.967246 0.191076 0.189968 1.986099 -1.576634 -0.158664 0.161196 1.013833 1.562144 0.347487 -0.345401 0.01183 3.145423 -1.64266 -1.225175 -3.435987 -0.479035 1.606629 0.03074 -0.480261 0.1073 4 | 2 1.188927 0.931284 0.089476 -0.004928 -1.120979 0.403132 0.194414 -0.181278 1.91365 1.80887 0.097809 0.765877 1.351447 -0.421045 0.770864 0.39684 0.192125 2.953462 3.391384 0.03733 -1.149727 -1.662207 -1.414858 1.096449 0.322216 -0.792043 5 | 3 1.541783 -0.948771 -0.094118 0.014943 -0.073187 -0.172418 -0.618337 -0.337874 2.5019 -1.497652 -0.284534 0.18921 0.107144 -0.477646 -0.974225 0.808616 0.120295 3.988445 -2.647114 -0.410472 0.669848 -0.570027 -1.046523 -1.542457 1.035089 -0.694214 6 | 4 1.54687 -0.447575 0.671183 0.533284 -0.30057 -0.728973 0.295225 0.238886 2.489298 -0.986318 1.031641 0.670041 -0.66237 -1.136445 0.493327 -0.607405 -0.193471 3.963646 -2.033719 1.867637 1.288111 -1.023158 -0.773682 1.35929 -0.758336 0.823649 7 | 5 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.697913 0.189865 -2.03451 0.350973 -0.466462 0.054177 0.397503 0.197009 -0.853154 1.073465 0.53663 -3.199775 1.000627 -0.488686 0.578496 0.847164 1.107696 1.141338 8 | 6 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.675869 0.110825 -2.04555 0.352309 -0.615303 -0.199929 0.062071 -0.397289 0.804648 1.1152 0.10538 -2.937997 1.942738 -0.203464 0.380247 0.061913 -1.357843 -0.927939 9 | 7 0.74425 0.536652 0.348272 0.632578 0.905468 0.09093 0.388578 -0.32813 1.21809 0.428394 0.820024 0.099353 -1.686427 0.981121 0.622814 0.55406 0.21437 1.977655 0.234332 1.678586 1.164126 0.527168 2.891226 0.903616 0.674615 -0.804256 10 | 8 0.424346 0.920717 0.189408 0.81788 0.15575 0.278821 -0.648462 0.242351 0.565392 1.353081 0.359149 1.64749 -0.239997 0.629258 -1.11555 -0.323173 -0.148373 0.848442 2.115553 0.914508 0.711768 -2.470263 1.300451 -1.864505 -0.332716 0.605318 11 | -------------------------------------------------------------------------------- /emb/test_emb_ind.txt: -------------------------------------------------------------------------------- 1 | 9 26 2 | 0 1.850588 0.830303 -0.191896 -1.045115 0.505562 -0.200553 -0.050742 0.13008 3.053755 1.433569 -0.041825 -1.78883 -0.094202 0.089586 -0.459508 -0.309663 -0.11008 4.866181 2.22298 0.121998 0.05738 2.989983 -0.308342 -0.771511 -0.265161 0.491672 3 | 1 1.207391 -1.041898 0.126836 -0.047614 0.20108 0.967246 0.191076 0.189968 1.986099 -1.576634 -0.158664 0.161196 1.013833 1.562144 0.347487 -0.345401 0.01183 3.145423 -1.64266 -1.225175 -3.435987 -0.479035 1.606629 0.03074 -0.480261 0.1073 4 | 2 1.188927 0.931284 0.089476 -0.004928 -1.120979 0.403132 0.194414 -0.181278 1.91365 1.80887 0.097809 0.765877 1.351447 -0.421045 0.770864 0.39684 0.192125 2.953462 3.391384 0.03733 -1.149727 -1.662207 -1.414858 1.096449 0.322216 -0.792043 5 | 3 1.541783 -0.948771 -0.094118 0.014943 -0.073187 -0.172418 -0.618337 -0.337874 2.5019 -1.497652 -0.284534 0.18921 0.107144 -0.477646 -0.974225 0.808616 0.120295 3.988445 -2.647114 -0.410472 0.669848 -0.570027 -1.046523 -1.542457 1.035089 -0.694214 6 | 4 1.54687 -0.447575 0.671183 0.533284 -0.30057 -0.728973 0.295225 0.238886 2.489298 -0.986318 1.031641 0.670041 -0.66237 -1.136445 0.493327 -0.607405 -0.193471 3.963646 -2.033719 1.867637 1.288111 -1.023158 -0.773682 1.35929 -0.758336 0.823649 7 | 5 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.697913 0.189865 -2.03451 0.350973 -0.466462 0.054177 0.397503 0.197009 -0.853154 1.073465 0.53663 -3.199775 1.000627 -0.488686 0.578496 0.847164 1.107696 1.141338 8 | 6 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.675869 0.110825 -2.04555 0.352309 -0.615303 -0.199929 0.062071 -0.397289 0.804648 1.1152 0.10538 -2.937997 1.942738 -0.203464 0.380247 0.061913 -1.357843 -0.927939 9 | 7 0.74425 0.536652 0.348272 0.632578 0.905468 0.09093 0.388578 -0.32813 1.21809 0.428394 0.820024 0.099353 -1.686427 0.981121 0.622814 0.55406 0.21437 1.977655 0.234332 1.678586 1.164126 0.527168 2.891226 0.903616 0.674615 -0.804256 10 | 8 0.424346 0.920717 0.189408 0.81788 0.15575 0.278821 -0.648462 0.242351 0.565392 1.353081 0.359149 1.64749 -0.239997 0.629258 -1.11555 -0.323173 -0.148373 0.848442 2.115553 0.914508 0.711768 -2.470263 1.300451 -1.864505 -0.332716 0.605318 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultiLENS 2 | 3 | 4 | **Paper**: Di Jin, Ryan A. Rossi, Eunyee Koh, Sungchul Kim, Anup Rao, Danai Koutra. Latent Network Summarization: Bridging Network Embedding and Summarization. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2019. 5 | 6 | *Link*: https://gemslab.github.io/papers/jin-2019-latent.pdf 7 | 8 |

9 | Overview of MultiLENS 10 |

11 | 12 | 13 | **Citation (bibtex)**: 14 | ``` 15 | @inproceedings{DBLP:conf/kdd/JinRKKRK19, 16 | author = {Di Jin and 17 | Ryan A. Rossi and 18 | Eunyee Koh and 19 | Sungchul Kim and 20 | Anup Rao and 21 | Danai Koutra}, 22 | title = {Latent Network Summarization: Bridging Network Embedding and Summarization}, 23 | booktitle = {Proceedings of the 25th {ACM} {SIGKDD} International Conference on 24 | Knowledge Discovery {\&} Data Mining, {KDD} 2019, London, UK, 25 | August 4-8, 2019}, 26 | year = {2019}, 27 | } 28 | ``` 29 | 30 | # Code 31 | 32 | ## Inputs: 33 | 34 | MultiLENS takes two files as input, the graph file and the category file. 35 | 36 | ### Input graph file 37 | The input graph file can be either static or temporal edge list in the following format separated by tab: 38 | ``` 39 | (optional) 40 | ``` 41 | MultiLENS will automatically determine if the input graph is static or temporal. The edge list is assumed to be re-ordered consecutively from 0, i.e., the minimum node ID is 0, and the maximum node ID is <#node - 1>. A toy static graph is under "/graph/" directory. 42 | 43 | ### Input category file 44 | The category file is a mapping between the node ID and its type (e.g., IP, cookie, web agent) with the following format separated by tab: 45 | ``` 46 | 47 | ``` 48 | if the node IDs are grouped by the type, where `````` and `````` are the starting and ending node ids in type `````` 49 | For example, 50 | ``` 51 | 0 0 279629 52 | 1 279630 283182 53 | ``` 54 | means node 0, 1, ... 279629 are in type 0, node 279630, 279631, ... 283182 are in type 1. 55 | 56 | But if the node IDs are not grouped by the types, this implementation also supports the following format separated by tab: 57 | ``` 58 | 59 | ``` 60 | which is just the 1-1 mapping. The code accepts either format. 61 | 62 | ## Usage 63 | 64 | The complete command to run MultiLENS is as follows. 65 | 66 | ``` 67 | python main.py --input --cat --output --dim 68 | --L <#level> --base --operators 69 | ``` 70 | 71 | - input, the input graph file stated under the "Graph Input" section above. Default value: '../graph/test.tsv' 72 | - cat, the input category file stated under the "Graph Input" section above. Default value: '../graph/test_cat.tsv' 73 | - output, the ouput file path of the embeddings. Default value: '../emb/test_emb.txt' 74 | - dim, the dimension of the embeddings. Default value: 128 75 | - L, the maximum subgraph order. Default value: 2 76 | - base, the base constant of logarithm binning. Default value: 4 77 | - operators, a list of relational operators to use. Use the command such as ```--operators 'mean','sum'``` to specify which operators to use. Default value: ['mean', 'var', 'sum', 'max', 'min', 'L1', 'L2'] 78 | 79 | ## Output 80 | In addition to embedding file indicated in the path ```output```, MultiLENS also outputs "latent_summary.pkl", which is the latent graph summary file that can be used for inductive learning tasks. 81 | 82 | ## Inductive learning task 83 | This repo also provides the python script to perform inductive learnings, i.e., deriving node embeddings from the latent summary on the fly. The commands to run it is as follows: 84 | 85 | ``` 86 | python main_inducitve.py --input --cat --summary --output 87 | --dim --L <#level> --base --operators 88 | ``` 89 | 90 | In addition to the identical arguments shown above, MultiLENS takes ```summary``` as the input: 91 | 92 | - summary, the input latent graph summary file derived on the (same/different) graph. Default value: './latent_summary.pkl' 93 | - output. Default value: './emb/test_emb_ind.txt' 94 | 95 | One may also set the variable "check_difference" in "main_inducitve.py" to compute the sum of node-wise distances (Frobenius norm) to measure graph difference. 96 | 97 | 98 | # Question & troubleshooting 99 | 100 | If you encounter any problems running the code, pls feel free to contact Di Jin (dijin@umich.edu) 101 | 102 | 103 | -------------------------------------------------------------------------------- /src/main_inductive.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | from pathlib import Path 4 | import numpy as np, scipy as sp, networkx as nx 5 | import math, time, os, sys, random 6 | from collections import deque 7 | import pickle 8 | import argparse 9 | 10 | import scipy.sparse as sps 11 | from scipy.sparse import coo_matrix 12 | from scipy.sparse.linalg import svds, eigs 13 | import sparsesvd 14 | 15 | from sklearn.decomposition import NMF, DictionaryLearning 16 | from sklearn.manifold import TSNE 17 | 18 | from collections import defaultdict 19 | 20 | from util import * 21 | 22 | def get_combined_feature_sequence(graph, rep_method, current_node, input_dense_matrix = None, feature_wid_ind = None): 23 | ''' 24 | Get the combined degree/other feature sequence for a given node 25 | ''' 26 | N, cur_P = input_dense_matrix.shape 27 | 28 | id_cat_dict = graph.id_cat_dict 29 | combined_feature_vector = [] 30 | cur_neighbors = graph.neighbor_list[current_node][:] 31 | cur_neighbors.append(current_node) 32 | 33 | for cat in graph.cat_dict.keys(): 34 | 35 | features = [] 36 | for i in range(cur_P): 37 | features.append([0.0] * feature_wid_ind[i]) 38 | 39 | for neighbor in cur_neighbors: 40 | 41 | if id_cat_dict[neighbor] != cat: 42 | continue 43 | 44 | try: 45 | for i in range(cur_P): 46 | node_feature = input_dense_matrix[neighbor, i] 47 | 48 | if (rep_method.num_buckets is not None) and (node_feature != 0): 49 | bucket_index = int(math.log(node_feature, rep_method.num_buckets)) 50 | else: 51 | bucket_index = int(node_feature) 52 | 53 | features[i][min(bucket_index, len(features[i]) - 1)] += 1#(rep_method.alpha ** layer) * weight 54 | except Exception as e: 55 | print "Exception:", e 56 | print("Node %d has %s value %d and will not contribute to feature distribution" % (khop_neighbor, feature, node_feature)) 57 | cur_feature_vector = features[0] 58 | 59 | for feature_vector in features[1:]: 60 | cur_feature_vector += feature_vector 61 | 62 | combined_feature_vector += cur_feature_vector 63 | 64 | return combined_feature_vector 65 | 66 | def get_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None): 67 | 68 | feature_wid_sum, feature_wid_ind = get_feature_n_buckets(input_dense_matrix, num_buckets, rep_method.bucket_max_value) 69 | feature_matrix = np.zeros([graph.num_nodes, feature_wid_sum * len(graph.unique_cat)]) 70 | 71 | for n in nodes_to_embed: 72 | if n % 50000 == 0: 73 | print "[Generate combined feature vetor] node: " + str(n) 74 | combined_feature_sequence = get_combined_feature_sequence(graph, rep_method, n, input_dense_matrix = input_dense_matrix, feature_wid_ind = feature_wid_ind) 75 | feature_matrix[n,:] = combined_feature_sequence 76 | 77 | return feature_matrix 78 | 79 | 80 | def get_seq_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None): 81 | 82 | if input_dense_matrix is None: 83 | sys.exit('get_seq_features: no input matrix.') 84 | 85 | if nodes_to_embed is None: 86 | nodes_to_embed = range(graph.num_nodes) 87 | num_nodes = graph.num_nodes 88 | else: 89 | num_nodes = len(nodes_to_embed) 90 | 91 | feature_matrix = get_features(graph, rep_method, input_dense_matrix, nodes_to_embed) 92 | 93 | if graph.directed: 94 | print "[Starting to obtain features from in-components]" 95 | 96 | neighbor_list_r = construct_neighbor_list(graph.adj_matrix.transpose(), nodes_to_embed) 97 | 98 | indegree_graph = Graph(graph.adj_matrix.transpose(), max_id = graph.max_id, num_nodes = graph.num_nodes, 99 | directed = graph.directed, base_features = graph.base_features, neighbor_list = neighbor_list_r, 100 | cat_dict = graph.cat_dict, id_cat_dict = graph.id_cat_dict, unique_cat = graph.unique_cat, check_eq = graph.check_eq) 101 | base_feature_matrix_in = get_features(indegree_graph, rep_method, input_dense_matrix, nodes_to_embed = nodes_to_embed) 102 | 103 | feature_matrix = np.hstack((feature_matrix, base_feature_matrix_in)) 104 | 105 | return feature_matrix 106 | 107 | 108 | def construct_cat(input_gt_path, delimiter): 109 | ''' 110 | # Input: per line, 1) cat-id_init, id_end or 2) cat-id 111 | ''' 112 | result = defaultdict(set) 113 | id_cat_dict = dict() 114 | 115 | fIn = open(input_gt_path, 'r') 116 | lines = fIn.readlines() 117 | for line in lines: 118 | 119 | parts = line.strip('\r\n').split(delimiter) 120 | if len(parts) == 3: 121 | cat = parts[0] 122 | node_id_start = parts[1] 123 | node_id_end = parts[2] 124 | 125 | for i in range( int(node_id_start), int(node_id_end)+1 ): 126 | result[ int(cat) ].add( i ) 127 | id_cat_dict[i] = int(cat) 128 | 129 | elif len(parts) == 2: 130 | cat = parts[0] 131 | node_id = parts[1] 132 | 133 | result[int(cat)].add( int(node_id) ) 134 | id_cat_dict[int(node_id)] = int(cat) 135 | 136 | else: 137 | sys.exit('Cat file format not supported') 138 | 139 | fIn.close() 140 | return result, result.keys(), id_cat_dict 141 | 142 | 143 | def search_feature_layer(graph, rep_method, base_feature_matrix = None): 144 | 145 | n,p = base_feature_matrix.shape 146 | result = np.zeros([n, p*rep_method.use_total]) 147 | ops = rep_method.operators 148 | 149 | for u in range(n): 150 | if u % 50000 == 0: 151 | print '[Current_node_id] ' + str(u) 152 | 153 | neighbors = graph.neighbor_list[u] 154 | 155 | for fid in range(p): 156 | 157 | mean_v = 0.0; sum_v = 0.0; var_v = 0.0; max_v = 0.0; min_v = 0.0; sum_sq_diff = 0.0; prod_v = 1.0; L1_v = 0.0; L2_v = 0.0 158 | 159 | for v in neighbors: 160 | 161 | L1_v += abs(base_feature_matrix[u][fid] - base_feature_matrix[v][fid]) # L1 162 | diff = base_feature_matrix[u][fid] - base_feature_matrix[v][fid] 163 | L2_v += diff*diff # L2 164 | sum_sq_diff += base_feature_matrix[v][fid] * base_feature_matrix[v][fid] # var 165 | sum_v += base_feature_matrix[v][fid] # used in sum and mean 166 | if max_v < base_feature_matrix[v][fid]: # max 167 | max_v = base_feature_matrix[v][fid] 168 | if min_v > base_feature_matrix[v][fid]: # min 169 | min_v = base_feature_matrix[v][fid] 170 | 171 | deg = len(neighbors) 172 | if deg == 0: 173 | mean_v = 0 174 | var_v = 0 175 | else: 176 | mean_v = sum_v / float(deg) 177 | var_v = (sum_sq_diff / float(deg)) - (mean_v * mean_v) 178 | 179 | temp_vec = [0.0] * rep_method.use_total 180 | 181 | for idx, op in enumerate(ops): 182 | if op == 'mean': 183 | temp_vec[idx] = mean_v 184 | elif op == 'var': 185 | temp_vec[idx] = var_v 186 | elif op == 'sum': 187 | temp_vec[idx] = sum_v 188 | elif op == 'max': 189 | temp_vec[idx] = max_v 190 | elif op == 'min': 191 | temp_vec[idx] = min_v 192 | elif op == 'L1': 193 | temp_vec[idx] = L1_v 194 | elif op == 'L2': 195 | temp_vec[idx] = L2_v 196 | else: 197 | sys.exit('[Unsupported operation]') 198 | 199 | result[u, fid*rep_method.use_total:(fid+1)*rep_method.use_total] = temp_vec 200 | 201 | return result 202 | 203 | 204 | def construct_neighbor_list(adj_matrix, nodes_to_embed): 205 | result = {} 206 | 207 | for i in nodes_to_embed: 208 | result[i] = list(adj_matrix.getrow(i).nonzero()[1]) 209 | 210 | return result 211 | 212 | 213 | def get_init_features(graph, base_features, nodes_to_embed): 214 | ''' 215 | # set fb: sum as default. 216 | ''' 217 | init_feature_matrix = np.zeros((len(nodes_to_embed), len(base_features))) 218 | adj = graph.adj_matrix 219 | 220 | if "row_col" in base_features: 221 | init_feature_matrix[:,base_features.index("row_col")] = (adj.sum(axis=0).transpose() + adj.sum(axis=1)).ravel() 222 | 223 | if "col" in base_features: 224 | init_feature_matrix[:,base_features.index("col")] = adj.sum(axis=0).transpose().ravel() 225 | 226 | if "row" in base_features: 227 | init_feature_matrix[:,base_features.index("row")] = adj.sum(axis=1).ravel() 228 | 229 | print '[Initial_feature_all finished]' 230 | return init_feature_matrix 231 | 232 | 233 | def get_feature_n_buckets(feature_matrix, num_buckets, bucket_max_value): 234 | 235 | result_sum = 0 236 | result_ind = [] 237 | N, cur_P = feature_matrix.shape 238 | 239 | if num_buckets is not None: 240 | for i in range(cur_P): 241 | temp = max(bucket_max_value, int(math.log(max(max(feature_matrix[:,i]), 1), num_buckets) + 1)) 242 | n_buckets = temp 243 | # print max(feature_matrix[:,i]) 244 | result_sum += n_buckets 245 | result_ind.append(n_buckets) 246 | else: 247 | for i in range(cur_P): 248 | temp = max(bucket_max_value, int( max(feature_matrix[:,i]) ) + 1) 249 | n_buckets = temp 250 | result_sum += n_buckets 251 | result_ind.append(n_buckets) 252 | 253 | return result_sum, result_ind 254 | 255 | 256 | 257 | def parse_args(): 258 | ''' 259 | Parses the arguments. 260 | ''' 261 | parser = argparse.ArgumentParser(description="Multi-Lens: Bridging Network Embedding and Summarization.") 262 | 263 | parser.add_argument('--input', nargs='?', default='../graph/test.tsv', help='Input graph file path') 264 | 265 | parser.add_argument('--cat', nargs='?', default='../graph/test_cat.tsv', help='Input node category file path') 266 | 267 | parser.add_argument('--summary', nargs='?', default='./latent_summary.pkl', help='Summary file path') 268 | 269 | parser.add_argument('--output', nargs='?', default='../emb/test_emb_ind.txt', help='Embedding file path') 270 | 271 | parser.add_argument('--test', nargs='?', default='../emb/test_emb.txt', help='Embedding file (old) path. This file is just used for testing.') 272 | 273 | parser.add_argument('--dim', type=int, default=128, help='Embedding dimension') 274 | 275 | parser.add_argument('--L', type=int, default=2, help='Subgraph level') 276 | 277 | parser.add_argument('--base', type=int, default=4, help='Base constant of logarithm histograms') 278 | 279 | parser.add_argument('--operators', default=['mean', 'var', 'sum', 'max', 'min', 'L1', 'L2'], nargs="+", help='Relational operators to use.') 280 | 281 | return parser.parse_args() 282 | 283 | 284 | def dist_cal(row1, row2): 285 | dist = np.linalg.norm(row1-row2) 286 | 287 | return dist 288 | 289 | if __name__ == '__main__': 290 | 291 | # assume the graph is directed, weighted 292 | directed = True 293 | 294 | args = parse_args() 295 | 296 | ###################################################### 297 | # Base features to use 298 | ###################################################### 299 | 300 | emb_write = True 301 | check_difference = False 302 | 303 | base_features = ['row', 'col', 'row_col'] 304 | 305 | ########################################### 306 | # graph_2_file_path 307 | ########################################### 308 | input_graph_file_path = args.input 309 | input_gt_path = args.cat 310 | input_summary_path = args.summary 311 | input_emb_file_path = args.test 312 | 313 | output_file_path = args.output 314 | 315 | dim = args.dim 316 | L = args.L 317 | num_buckets = args.base 318 | op = args.operators 319 | print '----------------------------------' 320 | print '[Input graph (new) file] ' + input_graph_file_path 321 | print '[Input category file] ' + input_gt_path 322 | print '[Input summary (existing) file]' + input_summary_path 323 | print '[Output embedding file] ' + output_file_path 324 | print '[Embedding dimension] ' + str(dim) 325 | print '[Number of levels] ' + str(L) 326 | print '[Base of logarithm binning] ' + str(num_buckets) 327 | print '[Relational operators] ' + str(op) 328 | print '----------------------------------' 329 | 330 | 331 | pkl_file = open(input_summary_path, 'rb') 332 | g_summs = pickle.load(pkl_file) 333 | pkl_file.close() 334 | 335 | 336 | delimiter = get_delimiter(input_graph_file_path) 337 | 338 | 339 | raw = np.genfromtxt(input_graph_file_path, dtype=int) 340 | COL = raw.shape[1] 341 | 342 | if COL < 2: 343 | sys.exit('[Input format error.]') 344 | elif COL == 2: 345 | print '[unweighted graph detected.]' 346 | rows = raw[:,0] 347 | cols = raw[:,1] 348 | weis = np.ones(len(rows)) 349 | 350 | elif COL == 3: 351 | print '[weighted graph detected.]' 352 | rows = raw[:,0] 353 | cols = raw[:,1] 354 | weis = raw[:,2] 355 | 356 | 357 | check_eq = True 358 | max_id = int(max(max(rows), max(cols))) 359 | num_nodes = max_id + 1 360 | print '[max_node_id] ' + str(max_id) 361 | print '[num_nodes] ' + str(num_nodes) 362 | 363 | nodes_to_embed = range(int(max_id)+1) 364 | 365 | if max(rows) != max(cols): 366 | rows = np.append(rows,max(max(rows), max(cols))) 367 | cols = np.append(cols,max(max(rows), max(cols))) 368 | weis = np.append(weis, 0) 369 | check_eq = False 370 | 371 | 372 | adj_matrix = sps.lil_matrix( sps.csc_matrix((weis, (rows, cols))) ) 373 | 374 | CAT_DICT, unique_cat, ID_CAT_DICT = construct_cat(input_gt_path, delimiter) 375 | 376 | g_sums = [] 377 | 378 | neighbor_list = construct_neighbor_list(adj_matrix, nodes_to_embed) 379 | 380 | graph = Graph(adj_matrix = adj_matrix, max_id = max_id, num_nodes = num_nodes, base_features = base_features, neighbor_list = neighbor_list, 381 | directed = directed, cat_dict = CAT_DICT, id_cat_dict = ID_CAT_DICT, unique_cat = unique_cat, check_eq = check_eq) 382 | 383 | init_feature_matrix = get_init_features(graph, base_features, nodes_to_embed) 384 | 385 | rep_method = RepMethod(method = "hetero", bucket_max_value = 30, num_buckets = num_buckets, operators = op, use_total = len(op)) 386 | 387 | 388 | ###################################################### 389 | # Step 1: get node embeddings from the summary 390 | # The number of layers we want to explore - 391 | # layer 0 is the base feature matrix 392 | # layer 1+: are the layers of higher order 393 | ############################################ 394 | init_feature_matrix_seq = get_seq_features(graph, rep_method, input_dense_matrix = init_feature_matrix, nodes_to_embed = nodes_to_embed) 395 | init_gs = g_summs[0] 396 | print 'init_gs shape: ' + str(init_gs.shape) 397 | U = np.dot( init_feature_matrix_seq, np.linalg.pinv(init_gs) ) 398 | 399 | 400 | feature_matrix = init_feature_matrix 401 | 402 | for i in range(L): 403 | print '[Current layer] ' + str(i) 404 | 405 | feature_matrix_new = search_feature_layer(graph, rep_method, base_feature_matrix = feature_matrix) 406 | 407 | feature_matrix_new_seq = get_seq_features(graph, rep_method, input_dense_matrix = feature_matrix_new, nodes_to_embed = nodes_to_embed) 408 | 409 | cur_gs = g_summs[i+1] 410 | print '[Summary shape] ' + str(cur_gs.shape) 411 | cur_U = np.dot( feature_matrix_new_seq, np.linalg.pinv(cur_gs) ) 412 | U = np.concatenate((U, cur_U), axis=1) 413 | 414 | feature_matrix = feature_matrix_new 415 | 416 | 417 | ###################################################### 418 | # Step 2: evaluate the difference (optional) 419 | ###################################################### 420 | 421 | 422 | if check_difference: 423 | orig = read_embedding(input_emb_file_path) 424 | modi = U 425 | 426 | rows = orig[:,0] 427 | cols = orig[:,1] 428 | 429 | max_id_1 = orig.shape[0] 430 | max_id_2 = U.shape[0] 431 | max_id = min(max_id_1, max_id_2) 432 | 433 | ID_DIST_DICT = {} 434 | dist_total = 0 435 | 436 | for i in range(max_id): 437 | if i % 10000 == 0: 438 | print '[Current_node_id] ' + str(i) 439 | cur_row_orig = orig[i,:] 440 | cur_row_modi = modi[i,:] 441 | dist = dist_cal(cur_row_orig, cur_row_modi) 442 | 443 | ID_DIST_DICT[i] = dist 444 | dist_total += dist 445 | 446 | print '[total dist] ' + str(dist_total) 447 | 448 | 449 | if emb_write: 450 | write_embedding(U, output_file_path) 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | from pathlib import Path 4 | import numpy as np, scipy as sp, networkx as nx 5 | import math, time, os, sys, random 6 | from collections import deque 7 | import pickle 8 | import argparse 9 | 10 | import scipy.sparse as sps 11 | from scipy.sparse import coo_matrix 12 | from scipy.sparse.linalg import svds, eigs 13 | import sparsesvd 14 | 15 | from collections import defaultdict 16 | 17 | from sklearn.decomposition import NMF, DictionaryLearning 18 | from sklearn.manifold import TSNE 19 | 20 | from util import * 21 | 22 | 23 | def get_combined_feature_sequence(graph, rep_method, current_node, input_dense_matrix = None, feature_wid_ind = None): 24 | ''' 25 | Get the combined degree/other feature sequence for a given node 26 | ''' 27 | N, cur_P = input_dense_matrix.shape 28 | 29 | id_cat_dict = graph.id_cat_dict 30 | combined_feature_vector = [] 31 | cur_neighbors = graph.neighbor_list[current_node][:] 32 | cur_neighbors.append(current_node) 33 | 34 | for cat in graph.cat_dict.keys(): 35 | 36 | features = [] 37 | for i in range(cur_P): 38 | features.append([0.0] * feature_wid_ind[i]) 39 | 40 | for neighbor in cur_neighbors: 41 | if id_cat_dict[neighbor] != cat: 42 | continue 43 | try: 44 | # print cur_P 45 | for i in range(cur_P): 46 | node_feature = input_dense_matrix[neighbor, i] 47 | 48 | if (rep_method.num_buckets is not None) and (node_feature != 0): 49 | bucket_index = int(math.log(node_feature, rep_method.num_buckets)) 50 | else: 51 | bucket_index = int(node_feature) 52 | 53 | bucket_index = max(bucket_index, 0) 54 | features[i][min(bucket_index, len(features[i]) - 1)] += 1 55 | 56 | except Exception as e: 57 | print "Exception:", e 58 | cur_feature_vector = features[0] 59 | 60 | for feature_vector in features[1:]: 61 | cur_feature_vector += feature_vector 62 | 63 | combined_feature_vector += cur_feature_vector 64 | 65 | return combined_feature_vector 66 | 67 | 68 | def get_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None): 69 | 70 | feature_wid_sum, feature_wid_ind = get_feature_n_buckets(input_dense_matrix, num_buckets, rep_method.bucket_max_value) 71 | feature_matrix = np.zeros([graph.num_nodes, feature_wid_sum * len(graph.unique_cat)]) 72 | 73 | for n in nodes_to_embed: 74 | if n % 50000 == 0: 75 | print "[Generate combined feature vetor] node: " + str(n) 76 | combined_feature_sequence = get_combined_feature_sequence(graph, rep_method, n, input_dense_matrix = input_dense_matrix, feature_wid_ind = feature_wid_ind) 77 | feature_matrix[n,:] = combined_feature_sequence 78 | 79 | return feature_matrix 80 | 81 | 82 | def get_seq_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None): 83 | 84 | if input_dense_matrix is None: 85 | sys.exit('get_seq_features: no input matrix.') 86 | 87 | if nodes_to_embed is None: 88 | nodes_to_embed = range(graph.num_nodes) 89 | num_nodes = graph.num_nodes 90 | else: 91 | num_nodes = len(nodes_to_embed) 92 | 93 | feature_matrix = get_features(graph, rep_method, input_dense_matrix, nodes_to_embed) 94 | 95 | if graph.directed: 96 | print "[Starting to obtain features from in-components]" 97 | 98 | neighbor_list_r = construct_neighbor_list(graph.adj_matrix.transpose(), nodes_to_embed) 99 | 100 | indegree_graph = Graph(graph.adj_matrix.transpose(), max_id = graph.max_id, num_nodes = graph.num_nodes, 101 | directed = graph.directed, base_features = graph.base_features, neighbor_list = neighbor_list_r, 102 | cat_dict = graph.cat_dict, id_cat_dict = graph.id_cat_dict, unique_cat = graph.unique_cat, check_eq = graph.check_eq) 103 | base_feature_matrix_in = get_features(indegree_graph, rep_method, input_dense_matrix, nodes_to_embed = nodes_to_embed) 104 | 105 | feature_matrix = np.hstack((feature_matrix, base_feature_matrix_in)) 106 | 107 | return feature_matrix 108 | 109 | 110 | def construct_cat(input_gt_path, delimiter): 111 | ''' 112 | # Input: per line, 1) cat-id_init, id_end or 2) cat-id 113 | ''' 114 | result = defaultdict(set) 115 | id_cat_dict = dict() 116 | 117 | fIn = open(input_gt_path, 'r') 118 | lines = fIn.readlines() 119 | for line in lines: 120 | 121 | parts = line.strip('\r\n').split(delimiter) 122 | if len(parts) == 3: 123 | cat = parts[0] 124 | node_id_start = parts[1] 125 | node_id_end = parts[2] 126 | 127 | for i in range( int(node_id_start), int(node_id_end)+1 ): 128 | result[ int(cat) ].add( i ) 129 | id_cat_dict[i] = int(cat) 130 | 131 | elif len(parts) == 2: 132 | cat = parts[0] 133 | node_id = parts[1] 134 | 135 | result[int(cat)].add( int(node_id) ) 136 | id_cat_dict[int(node_id)] = int(cat) 137 | 138 | else: 139 | sys.exit('Cat file format not supported') 140 | 141 | fIn.close() 142 | return result, result.keys(), id_cat_dict 143 | 144 | 145 | def search_feature_layer(graph, rep_method, base_feature_matrix = None): 146 | 147 | n,p = base_feature_matrix.shape 148 | result = np.zeros([n, p*rep_method.use_total]) 149 | ops = rep_method.operators 150 | 151 | for u in range(n): 152 | if u % 50000 == 0: 153 | print '[Current_node_id] ' + str(u) 154 | 155 | neighbors = graph.neighbor_list[u] 156 | 157 | for fid in range(p): 158 | 159 | mean_v = 0.0; sum_v = 0.0; var_v = 0.0; max_v = 0.0; min_v = 0.0; sum_sq_diff = 0.0; prod_v = 1.0; L1_v = 0.0; L2_v = 0.0 160 | 161 | for v in neighbors: 162 | 163 | L1_v += abs(base_feature_matrix[u][fid] - base_feature_matrix[v][fid]) # L1 164 | diff = base_feature_matrix[u][fid] - base_feature_matrix[v][fid] 165 | L2_v += diff*diff # L2 166 | sum_sq_diff += base_feature_matrix[v][fid] * base_feature_matrix[v][fid] # var 167 | sum_v += base_feature_matrix[v][fid] # used in sum and mean 168 | if max_v < base_feature_matrix[v][fid]: # max 169 | max_v = base_feature_matrix[v][fid] 170 | if min_v > base_feature_matrix[v][fid]: # min 171 | min_v = base_feature_matrix[v][fid] 172 | 173 | deg = len(neighbors) 174 | if deg == 0: 175 | mean_v = 0 176 | var_v = 0 177 | else: 178 | mean_v = sum_v / float(deg) 179 | var_v = (sum_sq_diff / float(deg)) - (mean_v * mean_v) #- 2.0*mean_v/float(deg)*sum_v 180 | 181 | temp_vec = [0.0] * rep_method.use_total 182 | 183 | for idx, op in enumerate(ops): 184 | if op == 'mean': 185 | temp_vec[idx] = mean_v 186 | elif op == 'var': 187 | temp_vec[idx] = var_v 188 | elif op == 'sum': 189 | temp_vec[idx] = sum_v 190 | elif op == 'max': 191 | temp_vec[idx] = max_v 192 | elif op == 'min': 193 | temp_vec[idx] = min_v 194 | elif op == 'L1': 195 | temp_vec[idx] = L1_v 196 | elif op == 'L2': 197 | temp_vec[idx] = L2_v 198 | else: 199 | sys.exit('[Unsupported operation]') 200 | 201 | result[u, fid*rep_method.use_total:(fid+1)*rep_method.use_total] = temp_vec 202 | 203 | return result 204 | 205 | 206 | def feature_layer_evaluation_embedding(graph, rep_method, feature_matrix = None, k = 17): 207 | 208 | temp = scipy.sparse.csc_matrix(feature_matrix) 209 | U,s,V = sparsesvd.sparsesvd(temp, k) 210 | 211 | S = np.diag(s) 212 | emb = np.dot(U.T, (S ** 0.5)) 213 | g_sum = np.dot((S**0.5), V) 214 | 215 | return emb, g_sum 216 | 217 | 218 | def construct_neighbor_list(adj_matrix, nodes_to_embed): 219 | result = {} 220 | 221 | for i in nodes_to_embed: 222 | result[i] = list(adj_matrix.getrow(i).nonzero()[1]) 223 | 224 | return result 225 | 226 | 227 | 228 | def get_init_features(graph, base_features, nodes_to_embed): 229 | ''' 230 | # set fb: sum as default. 231 | ''' 232 | init_feature_matrix = np.zeros((len(nodes_to_embed), len(base_features))) 233 | adj = graph.adj_matrix 234 | 235 | if "row_col" in base_features: 236 | init_feature_matrix[:,base_features.index("row_col")] = (adj.sum(axis=0).transpose() + adj.sum(axis=1)).ravel() 237 | 238 | if "col" in base_features: 239 | init_feature_matrix[:,base_features.index("col")] = adj.sum(axis=0).transpose().ravel() 240 | 241 | if "row" in base_features: 242 | init_feature_matrix[:,base_features.index("row")] = adj.sum(axis=1).ravel() 243 | 244 | print '[Initial_feature_all finished]' 245 | return init_feature_matrix 246 | 247 | def get_feature_n_buckets(feature_matrix, num_buckets, bucket_max_value): 248 | 249 | result_sum = 0 250 | result_ind = [] 251 | N, cur_P = feature_matrix.shape 252 | 253 | if num_buckets is not None: 254 | for i in range(cur_P): 255 | temp = max(bucket_max_value, int(math.log(max(max(feature_matrix[:,i]), 1), num_buckets) + 1)) 256 | n_buckets = temp 257 | # print max(feature_matrix[:,i]) 258 | result_sum += n_buckets 259 | result_ind.append(n_buckets) 260 | else: 261 | for i in range(cur_P): 262 | temp = max(bucket_max_value, int( max(feature_matrix[:,i]) ) + 1) 263 | n_buckets = temp 264 | result_sum += n_buckets 265 | result_ind.append(n_buckets) 266 | 267 | return result_sum, result_ind 268 | 269 | 270 | def parse_args(): 271 | ''' 272 | Parses the arguments. 273 | ''' 274 | parser = argparse.ArgumentParser(description="Multi-Lens: Bridging Network Embedding and Summarization.") 275 | 276 | parser.add_argument('--input', nargs='?', default='../graph/test.tsv', help='Input graph file path') 277 | 278 | parser.add_argument('--cat', nargs='?', default='../graph/test_cat.tsv', help='Input node category file path') 279 | 280 | parser.add_argument('--output', nargs='?', default='../emb/test_emb.txt', help='Embedding file path') 281 | 282 | parser.add_argument('--dim', type=int, default=128, help='Embedding dimension') 283 | 284 | parser.add_argument('--L', type=int, default=2, help='Subgraph level') 285 | 286 | parser.add_argument('--base', type=int, default=4, help='Base constant of logarithm histograms') 287 | 288 | parser.add_argument('--operators', default=['mean', 'var', 'sum', 'max', 'min', 'L1', 'L2'], nargs="+", help='Relational operators to use.') 289 | 290 | return parser.parse_args() 291 | 292 | 293 | 294 | def get_Kis(init_feature_matrix_seq, K, L): 295 | 296 | result = [] 297 | rank_init = np.linalg.matrix_rank(init_feature_matrix_seq) 298 | 299 | if L == 0: 300 | result.append( min(rank_init, K) ) 301 | else: 302 | l_0 = min(rank_init, K/(L+1)) 303 | result.append(l_0) 304 | for i in range(L-1): 305 | result.append( K/(L+1) ) 306 | 307 | result.append(K - sum(result)) 308 | 309 | return result 310 | 311 | 312 | 313 | if __name__ == '__main__': 314 | 315 | # assume the graph is directed, weighted 316 | directed = True 317 | 318 | args = parse_args() 319 | 320 | ###################################################### 321 | # Base features to use. 322 | ###################################################### 323 | 324 | emb_write = True 325 | 326 | base_features = ['row', 'col', 'row_col'] 327 | 328 | ###################################################### 329 | # Parameters to setup 330 | ###################################################### 331 | input_file_path = args.input 332 | input_gt_path = args.cat 333 | output_file_path = args.output 334 | 335 | dim = args.dim 336 | L = args.L 337 | num_buckets = args.base 338 | op = args.operators 339 | print '----------------------------------' 340 | print '[Input graph file] ' + input_file_path 341 | print '[Input category file] ' + input_gt_path 342 | print '[Output embedding file] ' + output_file_path 343 | print '[Embedding dimension] ' + str(dim) 344 | print '[Number of levels] ' + str(L) 345 | print '[Base of logarithm binning] ' + str(num_buckets) 346 | print '[Relational operators] ' + str(op) 347 | print '----------------------------------' 348 | 349 | ###################################################### 350 | # Preprocess 351 | ###################################################### 352 | 353 | delimiter = get_delimiter(input_file_path) 354 | 355 | 356 | raw = np.genfromtxt(input_file_path, dtype=int) 357 | COL = raw.shape[1] 358 | 359 | if COL < 2: 360 | sys.exit('[Input format error.]') 361 | elif COL == 2: 362 | print '[unweighted graph detected.]' 363 | rows = raw[:,0] 364 | cols = raw[:,1] 365 | weis = np.ones(len(rows)) 366 | 367 | elif COL == 3: 368 | print '[weighted graph detected.]' 369 | rows = raw[:,0] 370 | cols = raw[:,1] 371 | weis = raw[:,2] 372 | 373 | 374 | check_eq = True 375 | max_id = int(max(max(rows), max(cols))) 376 | num_nodes = max_id + 1 377 | print '[max_node_id] ' + str(max_id) 378 | print '[num_nodes] ' + str(num_nodes) 379 | 380 | nodes_to_embed = range(int(max_id)+1) #[1,2]# 381 | 382 | if max(rows) != max(cols): 383 | rows = np.append(rows,max(max(rows), max(cols))) 384 | cols = np.append(cols,max(max(rows), max(cols))) 385 | weis = np.append(weis, 0) 386 | check_eq = False 387 | 388 | 389 | adj_matrix = sps.lil_matrix( sps.csc_matrix((weis, (rows, cols)))) 390 | print '[shape of adj_matrix] ' + str(adj_matrix.shape) 391 | 392 | CAT_DICT, unique_cat, ID_CAT_DICT = construct_cat(input_gt_path, delimiter) 393 | 394 | 395 | ###################################################### 396 | # Multi-Lens starts. 397 | ###################################################### 398 | 399 | g_sums = [] 400 | 401 | neighbor_list = construct_neighbor_list(adj_matrix, nodes_to_embed) 402 | 403 | graph = Graph(adj_matrix = adj_matrix, max_id = max_id, num_nodes = num_nodes, base_features = base_features, 404 | neighbor_list = neighbor_list, directed = directed, cat_dict = CAT_DICT, id_cat_dict = ID_CAT_DICT, unique_cat = unique_cat, check_eq = check_eq) 405 | 406 | rep_method = RepMethod(method = "hetero", bucket_max_value = 30, num_buckets = num_buckets, operators = op, use_total = len(op)) 407 | 408 | ######################################## 409 | # Step 1: get base features 410 | ######################################## 411 | init_feature_matrix = get_init_features(graph, base_features, nodes_to_embed) 412 | init_feature_matrix_seq = get_seq_features(graph, rep_method, input_dense_matrix = init_feature_matrix, nodes_to_embed = nodes_to_embed) 413 | 414 | Kis = get_Kis(init_feature_matrix_seq, dim, L) 415 | 416 | feature_matrix_emb, g_sum = feature_layer_evaluation_embedding(graph, rep_method, feature_matrix = init_feature_matrix_seq, k = Kis[0]) 417 | 418 | g_sums.append(g_sum) 419 | 420 | 421 | ######################################## 422 | # Step 2: feature proliferation. 423 | # layer 0 is the base feature matrix 424 | # layer 1+: are the layers of higher order 425 | ######################################## 426 | 427 | rep = feature_matrix_emb 428 | feature_matrix = init_feature_matrix 429 | 430 | 431 | for i in range(L): 432 | print '[Current layer] ' + str(i) 433 | print '[feature_matrix shape] ' + str(feature_matrix.shape) 434 | 435 | feature_matrix_new = search_feature_layer(graph, rep_method, base_feature_matrix = feature_matrix) 436 | feature_matrix_new_seq = get_seq_features(graph, rep_method, input_dense_matrix = feature_matrix_new, nodes_to_embed = nodes_to_embed) 437 | feature_matrix_new_emb, g_new_sum = feature_layer_evaluation_embedding(graph, rep_method, feature_matrix = feature_matrix_new_seq, k = Kis[i+1]) 438 | 439 | feature_matrix = feature_matrix_new 440 | rep_new = feature_matrix_new_emb 441 | rep = np.concatenate((rep, rep_new), axis=1) 442 | 443 | g_sums.append(g_new_sum) 444 | 445 | ###################################################### 446 | # Write output 447 | ###################################################### 448 | 449 | print '[Multi-Lens ends. Summary sizes:]' 450 | for ele in g_sums: 451 | print ele.shape 452 | 453 | fOut = open('latent_summary.pkl', 'wb') 454 | pickle.dump(g_sums, fOut, -1) 455 | fOut.close() 456 | 457 | if emb_write: 458 | write_embedding(rep, output_file_path) 459 | 460 | 461 | 462 | --------------------------------------------------------------------------------