├── graph
    ├── test_cat.tsv
    ├── .DS_Store
    └── test.tsv
├── overview.png
├── emb
    ├── .DS_Store
    ├── test_emb.txt
    └── test_emb_ind.txt
├── src
    ├── .DS_Store
    ├── latent_summary.pkl
    ├── util.py
    ├── main_inductive.py
    └── main.py
└── README.md


/graph/test_cat.tsv:
--------------------------------------------------------------------------------
1 | 0	0	4
2 | 1	5	6
3 | 2	7	8
4 | 


--------------------------------------------------------------------------------
/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/overview.png


--------------------------------------------------------------------------------
/emb/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/emb/.DS_Store


--------------------------------------------------------------------------------
/graph/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/graph/.DS_Store


--------------------------------------------------------------------------------
/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/src/.DS_Store


--------------------------------------------------------------------------------
/src/latent_summary.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GemsLab/MultiLENS/HEAD/src/latent_summary.pkl


--------------------------------------------------------------------------------
/graph/test.tsv:
--------------------------------------------------------------------------------
 1 | 0	2	1
 2 | 1	0	1
 3 | 1	2	5
 4 | 1	5	1
 5 | 2	8	5
 6 | 3	0	5
 7 | 3	4	1
 8 | 3	6	1
 9 | 4	0	1
10 | 4	3	1
11 | 4	7	5
12 | 7	0	1
13 | 


--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
 1 | import numpy as np, networkx as nx
 2 | import scipy.sparse
 3 | 
 4 | class RepMethod():
 5 | 	def __init__(self, bucket_max_value = None, method="hetero", num_buckets = None, use_other_features = False, operators = None,
 6 | 		use_total = 0):
 7 | 		self.method = method
 8 | 		self.bucket_max_value = bucket_max_value
 9 | 		self.num_buckets = num_buckets
10 | 		self.use_other_features = use_other_features
11 | 		self.operators = operators
12 | 		self.use_total = use_total
13 | 
14 | 
class Graph():
    '''
    Plain record bundling a graph's adjacency matrix with its metadata.

    Every constructor argument is stored unchanged as an attribute of the
    same name; nothing is computed or validated here.
    '''

    def __init__(self, adj_matrix = None, num_nodes = None, max_id = None, directed = False, neighbor_list = None,
            num_buckets = None, base_features = None, cat_dict = None, id_cat_dict = None, unique_cat = None, check_eq = True):
        # Structure of the graph.
        self.adj_matrix, self.num_nodes, self.max_id = adj_matrix, num_nodes, max_id
        # Feature / category configuration.
        self.base_features, self.unique_cat = base_features, unique_cat
        self.directed, self.num_buckets = directed, num_buckets

        # Per-node lookups.
        self.neighbor_list = neighbor_list
        self.cat_dict, self.id_cat_dict = cat_dict, id_cat_dict
        self.check_eq = check_eq
31 | 		
32 | 
def get_delimiter(input_file_path):
    '''
    Return the column delimiter implied by the file path.

    Returns "," when the path contains ".csv" and a tab when it contains
    ".tsv" (".csv" is checked first). For any other path the process exits
    via sys.exit('Format not supported.').
    '''
    # Imported locally because this module has no top-level `import sys`;
    # without it the unsupported-format branch raised NameError instead of
    # exiting with the intended message.
    import sys

    if ".csv" in input_file_path:
        return ","
    if ".tsv" in input_file_path:
        return "\t"

    sys.exit('Format not supported.')
43 | 
44 | 
def write_embedding(rep, output_file_path):
    '''
    Write an embedding matrix to a text file.

    The first line is "<N> <K>" (the shape of `rep`). Each following line is
    the row index followed by that row's K values, rounded to 6 decimals and
    separated by single spaces.

    rep: 2-D array-like exposing `.shape` and `rep[i, :]` indexing.
    output_file_path: destination path; an existing file is overwritten.
    '''
    N, K = rep.shape

    # `with` guarantees the handle is closed even if a write fails midway.
    with open(output_file_path, 'w') as fOut:
        fOut.write(str(N) + ' ' + str(K) + '\n')

        for i in range(N):
            cur_line = ' '.join([str(np.round(ii, 6)) for ii in rep[i,:]])
            fOut.write(str(i) + ' ' + cur_line + '\n')

    return
58 | 
59 | 
def read_embedding(input_file_path):
    '''
    Read an embedding file in the layout produced by write_embedding.

    The first line holds "<N> <K>"; every other non-empty line holds a row
    index followed by that row's values. Rows that never appear in the file
    stay all-zero.

    Returns an (N, K) numpy array of floats.
    '''
    # `with` closes the handle; the original left it open.
    with open(input_file_path, 'r') as fIn:
        N, K = fIn.readline().split()

        rep = np.zeros((int(N), int(K)))

        for line in fIn:
            parts = line.split()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # crashing on int('').
            if not parts:
                continue
            rep[int(parts[0]),:] = [float(ele) for ele in parts[1:]]

    return rep
73 | 
74 | 


--------------------------------------------------------------------------------
/emb/test_emb.txt:
--------------------------------------------------------------------------------
 1 | 9 26
 2 | 0 1.850588 0.830303 -0.191896 -1.045115 0.505562 -0.200553 -0.050742 0.13008 3.053755 1.433569 -0.041825 -1.78883 -0.094202 0.089586 -0.459508 -0.309663 -0.11008 4.866181 2.22298 0.121998 0.05738 2.989983 -0.308342 -0.771511 -0.265161 0.491672
 3 | 1 1.207391 -1.041898 0.126836 -0.047614 0.20108 0.967246 0.191076 0.189968 1.986099 -1.576634 -0.158664 0.161196 1.013833 1.562144 0.347487 -0.345401 0.01183 3.145423 -1.64266 -1.225175 -3.435987 -0.479035 1.606629 0.03074 -0.480261 0.1073
 4 | 2 1.188927 0.931284 0.089476 -0.004928 -1.120979 0.403132 0.194414 -0.181278 1.91365 1.80887 0.097809 0.765877 1.351447 -0.421045 0.770864 0.39684 0.192125 2.953462 3.391384 0.03733 -1.149727 -1.662207 -1.414858 1.096449 0.322216 -0.792043
 5 | 3 1.541783 -0.948771 -0.094118 0.014943 -0.073187 -0.172418 -0.618337 -0.337874 2.5019 -1.497652 -0.284534 0.18921 0.107144 -0.477646 -0.974225 0.808616 0.120295 3.988445 -2.647114 -0.410472 0.669848 -0.570027 -1.046523 -1.542457 1.035089 -0.694214
 6 | 4 1.54687 -0.447575 0.671183 0.533284 -0.30057 -0.728973 0.295225 0.238886 2.489298 -0.986318 1.031641 0.670041 -0.66237 -1.136445 0.493327 -0.607405 -0.193471 3.963646 -2.033719 1.867637 1.288111 -1.023158 -0.773682 1.35929 -0.758336 0.823649
 7 | 5 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.697913 0.189865 -2.03451 0.350973 -0.466462 0.054177 0.397503 0.197009 -0.853154 1.073465 0.53663 -3.199775 1.000627 -0.488686 0.578496 0.847164 1.107696 1.141338
 8 | 6 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.675869 0.110825 -2.04555 0.352309 -0.615303 -0.199929 0.062071 -0.397289 0.804648 1.1152 0.10538 -2.937997 1.942738 -0.203464 0.380247 0.061913 -1.357843 -0.927939
 9 | 7 0.74425 0.536652 0.348272 0.632578 0.905468 0.09093 0.388578 -0.32813 1.21809 0.428394 0.820024 0.099353 -1.686427 0.981121 0.622814 0.55406 0.21437 1.977655 0.234332 1.678586 1.164126 0.527168 2.891226 0.903616 0.674615 -0.804256
10 | 8 0.424346 0.920717 0.189408 0.81788 0.15575 0.278821 -0.648462 0.242351 0.565392 1.353081 0.359149 1.64749 -0.239997 0.629258 -1.11555 -0.323173 -0.148373 0.848442 2.115553 0.914508 0.711768 -2.470263 1.300451 -1.864505 -0.332716 0.605318
11 | 


--------------------------------------------------------------------------------
/emb/test_emb_ind.txt:
--------------------------------------------------------------------------------
 1 | 9 26
 2 | 0 1.850588 0.830303 -0.191896 -1.045115 0.505562 -0.200553 -0.050742 0.13008 3.053755 1.433569 -0.041825 -1.78883 -0.094202 0.089586 -0.459508 -0.309663 -0.11008 4.866181 2.22298 0.121998 0.05738 2.989983 -0.308342 -0.771511 -0.265161 0.491672
 3 | 1 1.207391 -1.041898 0.126836 -0.047614 0.20108 0.967246 0.191076 0.189968 1.986099 -1.576634 -0.158664 0.161196 1.013833 1.562144 0.347487 -0.345401 0.01183 3.145423 -1.64266 -1.225175 -3.435987 -0.479035 1.606629 0.03074 -0.480261 0.1073
 4 | 2 1.188927 0.931284 0.089476 -0.004928 -1.120979 0.403132 0.194414 -0.181278 1.91365 1.80887 0.097809 0.765877 1.351447 -0.421045 0.770864 0.39684 0.192125 2.953462 3.391384 0.03733 -1.149727 -1.662207 -1.414858 1.096449 0.322216 -0.792043
 5 | 3 1.541783 -0.948771 -0.094118 0.014943 -0.073187 -0.172418 -0.618337 -0.337874 2.5019 -1.497652 -0.284534 0.18921 0.107144 -0.477646 -0.974225 0.808616 0.120295 3.988445 -2.647114 -0.410472 0.669848 -0.570027 -1.046523 -1.542457 1.035089 -0.694214
 6 | 4 1.54687 -0.447575 0.671183 0.533284 -0.30057 -0.728973 0.295225 0.238886 2.489298 -0.986318 1.031641 0.670041 -0.66237 -1.136445 0.493327 -0.607405 -0.193471 3.963646 -2.033719 1.867637 1.288111 -1.023158 -0.773682 1.35929 -0.758336 0.823649
 7 | 5 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.697913 0.189865 -2.03451 0.350973 -0.466462 0.054177 0.397503 0.197009 -0.853154 1.073465 0.53663 -3.199775 1.000627 -0.488686 0.578496 0.847164 1.107696 1.141338
 8 | 6 0.451265 -0.023006 -1.259903 0.367417 -0.008656 -0.075931 0.127082 0.042334 0.675869 0.110825 -2.04555 0.352309 -0.615303 -0.199929 0.062071 -0.397289 0.804648 1.1152 0.10538 -2.937997 1.942738 -0.203464 0.380247 0.061913 -1.357843 -0.927939
 9 | 7 0.74425 0.536652 0.348272 0.632578 0.905468 0.09093 0.388578 -0.32813 1.21809 0.428394 0.820024 0.099353 -1.686427 0.981121 0.622814 0.55406 0.21437 1.977655 0.234332 1.678586 1.164126 0.527168 2.891226 0.903616 0.674615 -0.804256
10 | 8 0.424346 0.920717 0.189408 0.81788 0.15575 0.278821 -0.648462 0.242351 0.565392 1.353081 0.359149 1.64749 -0.239997 0.629258 -1.11555 -0.323173 -0.148373 0.848442 2.115553 0.914508 0.711768 -2.470263 1.300451 -1.864505 -0.332716 0.605318
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MultiLENS
  2 | 
  3 | 
  4 | **Paper**: Di Jin, Ryan A. Rossi, Eunyee Koh, Sungchul Kim, Anup Rao, Danai Koutra. Latent Network Summarization: Bridging Network Embedding and Summarization. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2019.
  5 | 
  6 | *Link*: https://gemslab.github.io/papers/jin-2019-latent.pdf 
  7 | 
  8 | <p align="center">
  9 | <img src="https://raw.githubusercontent.com/GemsLab/MultiLENS/master/overview.png" width="550"  alt="Overview of MultiLENS">
 10 | </p>
 11 | 
 12 | 
 13 | **Citation (bibtex)**:
 14 | ```
 15 | @inproceedings{DBLP:conf/kdd/JinRKKRK19,
 16 |   author    = {Di Jin and
 17 |                Ryan A. Rossi and
 18 |                Eunyee Koh and
 19 |                Sungchul Kim and 
 20 |                Anup Rao and
 21 |                Danai Koutra},
 22 |   title     = {Latent Network Summarization: Bridging Network Embedding and Summarization},
 23 |   booktitle = {Proceedings of the 25th {ACM} {SIGKDD} International Conference on
 24 |                Knowledge Discovery {\&} Data Mining, {KDD} 2019, London, UK,
 25 |                August 4-8, 2019},
 26 |   year      = {2019},
 27 |   }
 28 | ```
 29 | 
 30 | # Code
 31 | 
 32 | ## Inputs:
 33 | 
 34 | MultiLENS takes two files as input, the graph file and the category file.
 35 | 
 36 | ### Input graph file
 37 | The input graph file can be either static or temporal edge list in the following format separated by tab:
 38 | ```
 39 | <src> <dst> <weight> <timestamp> (optional)
 40 | ```
 41 | MultiLENS will automatically determine if the input graph is static or temporal. The edge list is assumed to be re-ordered consecutively from 0, i.e., the minimum node ID is 0, and the maximum node ID is <#node - 1>. A toy static graph is under "/graph/" directory.
 42 | 
 43 | ### Input category file
 44 | The category file is a mapping between the node ID and its type (e.g., IP, cookie, web agent) with the following format separated by tab:
 45 | ```
 46 | <category> <id_initial> <id_ending>
 47 | ```
 48 | This format applies when the node IDs are grouped by type, where ```<id_initial>``` and ```<id_ending>``` are the first and last node IDs of type ```<category>```.
 49 | For example,
 50 | ```
 51 | 0 0 279629
 52 | 1 279630  283182
 53 | ```
 54 | means node 0, 1, ... 279629 are in type 0, node 279630, 279631, ... 283182 are in type 1.
 55 | 
 56 | But if the node IDs are not grouped by the types, this implementation also supports the following format separated by tab:
 57 | ```
 58 | <category> <node_id>
 59 | ```
 60 | which is just the 1-1 mapping. The code accepts either format.
 61 | 
 62 | ## Usage
 63 | 
 64 | The complete command to run MultiLENS is as follows.
 65 | 
 66 | ```
 67 | python main.py --input <graph_file_path> --cat <category_file_path> --output <embedding_file_path> --dim <embedding_dimension> 
 68 |   --L <#level> --base <constant of logarithm binning> --operators <relational_operators_to_use>
 69 | ```
 70 | 
 71 | - input, the input graph file described in the "Input graph file" section above. Default value: '../graph/test.tsv'
 72 | - cat, the input category file described in the "Input category file" section above. Default value: '../graph/test_cat.tsv'
 73 | - output, the output file path of the embeddings. Default value: '../emb/test_emb.txt'
 74 | - dim, the dimension of the embeddings. Default value: 128
 75 | - L, the maximum subgraph order. Default value: 2
 76 | - base, the base constant of logarithm binning. Default value: 4
 77 | - operators, a list of relational operators to use. Use the command such as ```--operators 'mean','sum'``` to specify which operators to use. Default value: ['mean', 'var', 'sum', 'max', 'min', 'L1', 'L2']
 78 | 
 79 | ## Output
 80 | In addition to embedding file indicated in the path ```output```, MultiLENS also outputs "latent_summary.pkl", which is the latent graph summary file that can be used for inductive learning tasks.
 81 | 
 82 | ## Inductive learning task
 83 | This repo also provides a Python script to perform inductive learning, i.e., deriving node embeddings from the latent summary on the fly. The command to run it is as follows:
 84 | 
 85 | ```
 86 | python main_inductive.py --input <graph_file_path> --cat <category_file_path> --summary <latent_summary_file> --output <embedding_file_path> 
 87 |   --dim <embedding_dimension> --L <#level> --base <constant of logarithm binning> --operators <relational_operators_to_use>
 88 | ```
 89 | 
 90 | In addition to the identical arguments shown above, MultiLENS takes ```summary``` as the input:
 91 | 
 92 | - summary, the input latent graph summary file derived on the (same/different) graph. Default value: './latent_summary.pkl'
 93 | - output, the output file path of the inductively derived embeddings. Default value: '../emb/test_emb_ind.txt'
 94 | 
 95 | One may also set the variable "check_difference" in "main_inductive.py" to True to compute the sum of node-wise distances (Frobenius norm) to measure graph difference.
 96 | 
 97 | 
 98 | # Question & troubleshooting
 99 | 
100 | If you encounter any problems running the code, please feel free to contact Di Jin (dijin@umich.edu).
101 | 
102 | 
103 | 


--------------------------------------------------------------------------------
/src/main_inductive.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import datetime
  3 | from pathlib import Path
  4 | import numpy as np, scipy as sp, networkx as nx
  5 | import math, time, os, sys, random
  6 | from collections import deque
  7 | import pickle
  8 | import argparse
  9 | 
 10 | import scipy.sparse as sps
 11 | from scipy.sparse import coo_matrix
 12 | from scipy.sparse.linalg import svds, eigs
 13 | import sparsesvd
 14 | 
 15 | from sklearn.decomposition import NMF, DictionaryLearning
 16 | from sklearn.manifold import TSNE
 17 | 
 18 | from collections import defaultdict
 19 | 
 20 | from util import *
 21 | 
 22 | def get_combined_feature_sequence(graph, rep_method, current_node, input_dense_matrix = None, feature_wid_ind = None):
 23 | 	'''
 24 | 	Get the combined degree/other feature sequence for a given node
 25 | 	'''
 26 | 	N, cur_P = input_dense_matrix.shape
 27 | 
 28 | 	id_cat_dict = graph.id_cat_dict
 29 | 	combined_feature_vector = []
 30 | 	cur_neighbors = graph.neighbor_list[current_node][:]
 31 | 	cur_neighbors.append(current_node)
 32 | 	
 33 | 	for cat in graph.cat_dict.keys():
 34 | 
 35 | 		features = []
 36 | 		for i in range(cur_P):
 37 | 			features.append([0.0] * feature_wid_ind[i])
 38 | 
 39 | 		for neighbor in cur_neighbors:
 40 | 
 41 | 			if id_cat_dict[neighbor] != cat:
 42 | 				continue			
 43 | 
 44 | 			try:
 45 | 				for i in range(cur_P):
 46 | 					node_feature = input_dense_matrix[neighbor, i]
 47 | 
 48 | 					if (rep_method.num_buckets is not None) and (node_feature != 0):
 49 | 						bucket_index = int(math.log(node_feature, rep_method.num_buckets))
 50 | 					else:
 51 | 						bucket_index = int(node_feature)
 52 | 
 53 | 					features[i][min(bucket_index, len(features[i]) - 1)] += 1#(rep_method.alpha ** layer) * weight
 54 | 			except Exception as e:
 55 | 				print "Exception:", e
 56 | 				print("Node %d has %s value %d and will not contribute to feature distribution" % (khop_neighbor, feature, node_feature))
 57 | 		cur_feature_vector = features[0]
 58 | 		
 59 | 		for feature_vector in features[1:]:
 60 | 			cur_feature_vector += feature_vector
 61 | 
 62 | 		combined_feature_vector += cur_feature_vector
 63 | 	
 64 | 	return combined_feature_vector
 65 | 
 66 | def get_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None):
 67 | 
 68 | 	feature_wid_sum, feature_wid_ind = get_feature_n_buckets(input_dense_matrix, num_buckets, rep_method.bucket_max_value)
 69 | 	feature_matrix = np.zeros([graph.num_nodes, feature_wid_sum * len(graph.unique_cat)])
 70 | 
 71 | 	for n in nodes_to_embed:
 72 | 		if n % 50000 == 0:
 73 | 			print "[Generate combined feature vetor] node: " + str(n)
 74 | 		combined_feature_sequence = get_combined_feature_sequence(graph, rep_method, n, input_dense_matrix = input_dense_matrix, feature_wid_ind = feature_wid_ind)
 75 | 		feature_matrix[n,:] = combined_feature_sequence
 76 | 
 77 | 	return feature_matrix
 78 | 
 79 | 
 80 | def get_seq_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None):
 81 | 	
 82 | 	if input_dense_matrix is None:
 83 | 		sys.exit('get_seq_features: no input matrix.')
 84 | 
 85 | 	if nodes_to_embed is None:
 86 | 		nodes_to_embed = range(graph.num_nodes)
 87 | 		num_nodes = graph.num_nodes
 88 | 	else:
 89 | 		num_nodes = len(nodes_to_embed)
 90 | 	
 91 | 	feature_matrix = get_features(graph, rep_method, input_dense_matrix, nodes_to_embed)
 92 | 
 93 | 	if graph.directed:
 94 | 		print "[Starting to obtain features from in-components]"
 95 | 
 96 | 		neighbor_list_r = construct_neighbor_list(graph.adj_matrix.transpose(), nodes_to_embed)
 97 | 
 98 | 		indegree_graph = Graph(graph.adj_matrix.transpose(),  max_id = graph.max_id, num_nodes = graph.num_nodes, 
 99 | 			directed = graph.directed, base_features = graph.base_features, neighbor_list = neighbor_list_r,
100 | 			cat_dict = graph.cat_dict, id_cat_dict = graph.id_cat_dict, unique_cat = graph.unique_cat, check_eq = graph.check_eq)
101 | 		base_feature_matrix_in = get_features(indegree_graph, rep_method, input_dense_matrix, nodes_to_embed = nodes_to_embed)
102 | 
103 | 		feature_matrix = np.hstack((feature_matrix, base_feature_matrix_in))
104 | 
105 | 	return feature_matrix
106 | 
107 | 
def construct_cat(input_gt_path, delimiter):
    '''
    Read the node-category file.

    Each non-empty line is either
      1) <category> <id_initial> <id_ending>  (inclusive id range), or
      2) <category> <node_id>                 (single node),
    with fields separated by `delimiter`. Blank lines are skipped. Any other
    number of fields exits the process with 'Cat file format not supported'.

    Returns (cat_to_ids, categories, id_to_cat):
      cat_to_ids: defaultdict mapping int category -> set of int node ids
      categories: list of the categories seen
      id_to_cat:  dict mapping int node id -> int category
    '''
    result = defaultdict(set)
    id_cat_dict = dict()

    # `with` closes the file on every path, including the sys.exit below.
    with open(input_gt_path, 'r') as fIn:
        for line in fIn:
            line = line.strip('\r\n')
            # A blank line (e.g. at the end of the file) carries no mapping;
            # skip it rather than abort the whole run.
            if not line.strip():
                continue

            parts = line.split(delimiter)
            if len(parts) == 3:
                node_id_start = int(parts[1])
                node_id_end = int(parts[2])
                cat = int(parts[0])

                for i in range(node_id_start, node_id_end + 1):
                    result[cat].add(i)
                    id_cat_dict[i] = cat

            elif len(parts) == 2:
                cat = int(parts[0])
                node_id = int(parts[1])

                result[cat].add(node_id)
                id_cat_dict[node_id] = cat

            else:
                sys.exit('Cat file format not supported')

    # Snapshot the keys so the result is a plain list on Python 2 and 3.
    return result, list(result.keys()), id_cat_dict
141 | 
142 | 
def search_feature_layer(graph, rep_method, base_feature_matrix = None):
    '''
    Apply the relational operators to every feature over each node's
    neighbors.

    For node u and feature fid, the operators listed in rep_method.operators
    ('mean', 'var', 'sum', 'max', 'min', 'L1', 'L2') are evaluated over
    graph.neighbor_list[u]. Results are written to columns
    [fid * use_total, (fid + 1) * use_total) of row u, in operator order.
    An unknown operator name exits the process.

    graph: object exposing neighbor_list (node id -> iterable of neighbors).
    rep_method: object exposing operators and use_total.
    base_feature_matrix: (n, p) array indexed as [node][feature].

    Returns an (n, p * rep_method.use_total) array.
    '''
    n,p = base_feature_matrix.shape
    use_total = rep_method.use_total
    result = np.zeros([n, p*use_total])
    ops = rep_method.operators

    for u in range(n):
        if u % 50000 == 0:
            print('[Current_node_id] ' + str(u))

        neighbors = graph.neighbor_list[u]
        deg = len(neighbors)

        for fid in range(p):

            # NOTE(review): max and min both start from 0.0, so 'min' never
            # exceeds 0 and 'max' never drops below 0. Kept as-is because
            # previously generated summaries were computed this way; confirm
            # before changing.
            sum_v = 0.0; max_v = 0.0; min_v = 0.0; sum_sq_diff = 0.0; L1_v = 0.0; L2_v = 0.0
            own_value = base_feature_matrix[u][fid]

            for v in neighbors:
                neighbor_value = base_feature_matrix[v][fid]
                diff = own_value - neighbor_value

                L1_v += abs(diff)	# L1: sum of absolute differences
                L2_v += diff*diff	# L2: sum of squared differences (no sqrt)
                sum_sq_diff += neighbor_value * neighbor_value	# used for var
                sum_v += neighbor_value	# used in sum and mean
                if max_v < neighbor_value:
                    max_v = neighbor_value
                if min_v > neighbor_value:
                    min_v = neighbor_value

            if deg == 0:
                mean_v = 0
                var_v = 0
            else:
                mean_v = sum_v / float(deg)
                # E[x^2] - E[x]^2 over the neighbors
                var_v = (sum_sq_diff / float(deg)) - (mean_v * mean_v)

            values_by_op = {'mean': mean_v, 'var': var_v, 'sum': sum_v, 'max': max_v,
                'min': min_v, 'L1': L1_v, 'L2': L2_v}

            temp_vec = [0.0] * use_total
            for idx, op in enumerate(ops):
                if op not in values_by_op:
                    sys.exit('[Unsupported operation]')
                temp_vec[idx] = values_by_op[op]

            result[u, fid*use_total:(fid+1)*use_total] = temp_vec

    return result
202 | 
203 | 
def construct_neighbor_list(adj_matrix, nodes_to_embed):
    '''
    Map each requested node to the list of column indices that hold a
    nonzero entry in that node's row of adj_matrix.
    '''
    return {
        node: list(adj_matrix.getrow(node).nonzero()[1])
        for node in nodes_to_embed
    }
211 | 
212 | 
def get_init_features(graph, base_features, nodes_to_embed):
    '''
    Build the initial (base) feature matrix from adjacency-matrix sums.

    Supported names in base_features, each filling the column at its own
    position in the list:
      "row":     sum of each row of graph.adj_matrix
      "col":     sum of each column
      "row_col": row sum plus column sum
    Names outside this set leave their column at zero.

    Returns an array of shape (len(nodes_to_embed), len(base_features)).
    '''
    init_feature_matrix = np.zeros((len(nodes_to_embed), len(base_features)))
    adj = graph.adj_matrix

    # The sums may come back as 2-D np.matrix objects; flatten them to plain
    # 1-D arrays so the column assignments below never depend on matrix
    # broadcasting rules.
    col_sums = np.asarray(adj.sum(axis=0)).ravel()
    row_sums = np.asarray(adj.sum(axis=1)).ravel()

    if "row_col" in base_features:
        init_feature_matrix[:,base_features.index("row_col")] = col_sums + row_sums

    if "col" in base_features:
        init_feature_matrix[:,base_features.index("col")] = col_sums

    if "row" in base_features:
        init_feature_matrix[:,base_features.index("row")] = row_sums

    print('[Initial_feature_all finished]')
    return init_feature_matrix
231 | 
232 | 
def get_feature_n_buckets(feature_matrix, num_buckets, bucket_max_value):
    '''
    Decide how many histogram buckets each feature column gets.

    With logarithmic binning (num_buckets is the log base) a column needs
    int(log_base(max(column_max, 1)) + 1) buckets; with linear binning
    (num_buckets is None) it needs int(column_max) + 1. The count is then
    raised to at least bucket_max_value; when bucket_max_value is None the
    computed count is used unchanged.

    Returns (total_buckets, buckets_per_column).
    '''
    cur_P = feature_matrix.shape[1]

    result_sum = 0
    result_ind = []

    for i in range(cur_P):
        column_max = feature_matrix[:,i].max()

        if num_buckets is not None:
            needed = int(math.log(max(column_max, 1), num_buckets) + 1)
        else:
            needed = int(column_max) + 1

        # bucket_max_value acts as a lower bound on the bucket count.
        # Comparing None with an int raises TypeError on Python 3, so the
        # None case is handled explicitly.
        if bucket_max_value is None:
            n_buckets = needed
        else:
            n_buckets = max(bucket_max_value, needed)

        result_sum += n_buckets
        result_ind.append(n_buckets)

    return result_sum, result_ind
254 | 
255 | 
256 | 
def parse_args():
    '''
    Build the command-line parser and return the parsed arguments.
    '''
    parser = argparse.ArgumentParser(description="Multi-Lens: Bridging Network Embedding and Summarization.")

    # Optional path arguments: (flag, default, help text).
    path_options = [
        ('--input', '../graph/test.tsv', 'Input graph file path'),
        ('--cat', '../graph/test_cat.tsv', 'Input node category file path'),
        ('--summary', './latent_summary.pkl', 'Summary file path'),
        ('--output', '../emb/test_emb_ind.txt', 'Embedding file path'),
        ('--test', '../emb/test_emb.txt', 'Embedding file (old) path. This file is just used for testing.'),
    ]
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, nargs='?', default=default_path, help=help_text)

    # Integer arguments: (flag, default, help text).
    int_options = [
        ('--dim', 128, 'Embedding dimension'),
        ('--L', 2, 'Subgraph level'),
        ('--base', 4, 'Base constant of logarithm histograms'),
    ]
    for flag, default_value, help_text in int_options:
        parser.add_argument(flag, type=int, default=default_value, help=help_text)

    # One or more operator names, separated by spaces on the command line.
    default_operators = ['mean', 'var', 'sum', 'max', 'min', 'L1', 'L2']
    parser.add_argument('--operators', default=default_operators, nargs="+", help='Relational operators to use.')

    return parser.parse_args()
282 | 
283 | 
def dist_cal(row1, row2):
    '''
    Return the norm (np.linalg.norm with default settings) of row1 - row2.
    '''
    return np.linalg.norm(row1 - row2)
288 | 
if __name__ == '__main__':
    # Script entry point: derive node embeddings for the input graph from an
    # existing latent summary and optionally write / compare them.
    # NOTE: the variables below stay at module level on purpose; other
    # functions in this file may read them as globals.

    # assume the graph is directed, weighted
    directed = True

    args = parse_args()

    ######################################################
    # Run switches and base features to use
    ######################################################

    emb_write = True            # write the derived embeddings to args.output
    check_difference = False    # compare against the embeddings in args.test

    base_features = ['row', 'col', 'row_col']

    ###########################################
    # graph_2_file_path
    ###########################################
    input_graph_file_path = args.input
    input_gt_path = args.cat
    input_summary_path = args.summary
    input_emb_file_path = args.test

    output_file_path = args.output

    dim = args.dim
    L = args.L
    num_buckets = args.base
    op = args.operators
    print('----------------------------------')
    print('[Input graph (new) file] ' + input_graph_file_path)
    print('[Input category file] ' + input_gt_path)
    print('[Input summary (existing) file]' + input_summary_path)
    print('[Output embedding file] ' + output_file_path)
    print('[Embedding dimension] ' + str(dim))
    print('[Number of levels] ' + str(L))
    print('[Base of logarithm binning] ' + str(num_buckets))
    print('[Relational operators] ' + str(op))
    print('----------------------------------')

    # SECURITY NOTE: pickle.load can execute arbitrary code; only load
    # summary files that come from a trusted source.
    with open(input_summary_path, 'rb') as pkl_file:
        g_summs = pickle.load(pkl_file)

    delimiter = get_delimiter(input_graph_file_path)

    # A file with a single edge is parsed as a 1-D array; force 2-D so the
    # column count can always be read from shape[1].
    raw = np.atleast_2d(np.genfromtxt(input_graph_file_path, dtype=int))
    COL = raw.shape[1]

    if COL < 2:
        sys.exit('[Input format error.]')
    elif COL == 2:
        print('[unweighted graph detected.]')
        rows = raw[:,0]
        cols = raw[:,1]
        weis = np.ones(len(rows))

    elif COL == 3:
        print('[weighted graph detected.]')
        rows = raw[:,0]
        cols = raw[:,1]
        weis = raw[:,2]

    else:
        # Previously fell through and failed later with NameError because
        # rows/cols/weis were never assigned.
        sys.exit('[Input format error: this script only supports 2- or 3-column edge lists.]')

    check_eq = True
    max_id = int(max(rows.max(), cols.max()))
    num_nodes = max_id + 1
    print('[max_node_id] ' + str(max_id))
    print('[num_nodes] ' + str(num_nodes))

    nodes_to_embed = range(int(max_id)+1)

    if rows.max() != cols.max():
        # Add a zero-weight (max_id, max_id) entry so the sparse matrix is
        # built with num_nodes rows and num_nodes columns.
        rows = np.append(rows, max_id)
        cols = np.append(cols, max_id)
        weis = np.append(weis, 0)
        check_eq = False

    adj_matrix = sps.lil_matrix( sps.csc_matrix((weis, (rows, cols))) )

    CAT_DICT, unique_cat, ID_CAT_DICT = construct_cat(input_gt_path, delimiter)

    neighbor_list = construct_neighbor_list(adj_matrix, nodes_to_embed)

    graph = Graph(adj_matrix = adj_matrix, max_id = max_id, num_nodes = num_nodes, base_features = base_features, neighbor_list = neighbor_list,
        directed = directed, cat_dict = CAT_DICT, id_cat_dict = ID_CAT_DICT, unique_cat = unique_cat, check_eq = check_eq)

    init_feature_matrix = get_init_features(graph, base_features, nodes_to_embed)

    rep_method = RepMethod(method = "hetero", bucket_max_value = 30, num_buckets = num_buckets, operators = op, use_total = len(op))

    ######################################################
    # Step 1: get node embeddings from the summary
    # The number of layers we want to explore -
    # layer 0 is the base feature matrix
    # layer 1+: are the layers of higher order
    ############################################
    init_feature_matrix_seq = get_seq_features(graph, rep_method, input_dense_matrix = init_feature_matrix, nodes_to_embed = nodes_to_embed)
    init_gs = g_summs[0]
    print('init_gs shape: ' + str(init_gs.shape))
    # Embedding for a layer = histogram features times pinv(summary).
    U = np.dot( init_feature_matrix_seq, np.linalg.pinv(init_gs) )

    feature_matrix = init_feature_matrix

    for i in range(L):
        print('[Current layer] ' + str(i))

        feature_matrix_new = search_feature_layer(graph, rep_method, base_feature_matrix = feature_matrix)

        feature_matrix_new_seq = get_seq_features(graph, rep_method, input_dense_matrix = feature_matrix_new, nodes_to_embed = nodes_to_embed)

        cur_gs = g_summs[i+1]
        print('[Summary shape] ' + str(cur_gs.shape))
        cur_U = np.dot( feature_matrix_new_seq, np.linalg.pinv(cur_gs) )
        U = np.concatenate((U, cur_U), axis=1)

        feature_matrix = feature_matrix_new

    ######################################################
    # Step 2: evaluate the difference (optional)
    ######################################################

    if check_difference:
        orig = read_embedding(input_emb_file_path)
        modi = U

        # Only nodes present in both embeddings are compared.
        num_compared = min(orig.shape[0], U.shape[0])

        ID_DIST_DICT = {}
        dist_total = 0

        for i in range(num_compared):
            if i % 10000 == 0:
                print('[Current_node_id] ' + str(i))
            dist = dist_cal(orig[i,:], modi[i,:])

            ID_DIST_DICT[i] = dist
            dist_total += dist

        print('[total dist] ' + str(dist_total))

    if emb_write:
        write_embedding(U, output_file_path)
451 | 
452 | 	
453 | 
454 | 	
455 | 
456 | 
457 | 
458 | 


--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import datetime
  3 | from pathlib import Path
  4 | import numpy as np, scipy as sp, networkx as nx
  5 | import math, time, os, sys, random
  6 | from collections import deque
  7 | import pickle
  8 | import argparse
  9 | 
 10 | import scipy.sparse as sps
 11 | from scipy.sparse import coo_matrix
 12 | from scipy.sparse.linalg import svds, eigs
 13 | import sparsesvd
 14 | 
 15 | from collections import defaultdict
 16 | 
 17 | from sklearn.decomposition import NMF, DictionaryLearning
 18 | from sklearn.manifold import TSNE
 19 | 
 20 | from util import *
 21 | 
 22 | 
 23 | def get_combined_feature_sequence(graph, rep_method, current_node, input_dense_matrix = None, feature_wid_ind = None):
 24 | 	'''
 25 | 	Get the combined degree/other feature sequence for a given node
 26 | 	'''
 27 | 	N, cur_P = input_dense_matrix.shape
 28 | 
 29 | 	id_cat_dict = graph.id_cat_dict
 30 | 	combined_feature_vector = []
 31 | 	cur_neighbors = graph.neighbor_list[current_node][:]
 32 | 	cur_neighbors.append(current_node)
 33 | 	
 34 | 	for cat in graph.cat_dict.keys():
 35 | 
 36 | 		features = []
 37 | 		for i in range(cur_P):
 38 | 			features.append([0.0] * feature_wid_ind[i])
 39 | 
 40 | 		for neighbor in cur_neighbors:
 41 | 			if id_cat_dict[neighbor] != cat:
 42 | 				continue			
 43 | 			try:
 44 | 				# print cur_P
 45 | 				for i in range(cur_P):
 46 | 					node_feature = input_dense_matrix[neighbor, i]
 47 | 
 48 | 					if (rep_method.num_buckets is not None) and (node_feature != 0):
 49 | 						bucket_index = int(math.log(node_feature, rep_method.num_buckets))
 50 | 					else:
 51 | 						bucket_index = int(node_feature)
 52 | 
 53 | 					bucket_index = max(bucket_index, 0)
 54 | 					features[i][min(bucket_index, len(features[i]) - 1)] += 1
 55 | 
 56 | 			except Exception as e:
 57 | 				print "Exception:", e
 58 | 		cur_feature_vector = features[0]
 59 | 		
 60 | 		for feature_vector in features[1:]:
 61 | 			cur_feature_vector += feature_vector
 62 | 
 63 | 		combined_feature_vector += cur_feature_vector
 64 | 	
 65 | 	return combined_feature_vector
 66 | 
 67 | 
 68 | def get_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None):
 69 | 
 70 | 	feature_wid_sum, feature_wid_ind = get_feature_n_buckets(input_dense_matrix, num_buckets, rep_method.bucket_max_value)
 71 | 	feature_matrix = np.zeros([graph.num_nodes, feature_wid_sum * len(graph.unique_cat)])
 72 | 
 73 | 	for n in nodes_to_embed:
 74 | 		if n % 50000 == 0:
 75 | 			print "[Generate combined feature vetor] node: " + str(n)
 76 | 		combined_feature_sequence = get_combined_feature_sequence(graph, rep_method, n, input_dense_matrix = input_dense_matrix, feature_wid_ind = feature_wid_ind)
 77 | 		feature_matrix[n,:] = combined_feature_sequence
 78 | 
 79 | 	return feature_matrix
 80 | 
 81 | 
 82 | def get_seq_features(graph, rep_method, input_dense_matrix = None, nodes_to_embed = None):
 83 | 	
 84 | 	if input_dense_matrix is None:
 85 | 		sys.exit('get_seq_features: no input matrix.')
 86 | 
 87 | 	if nodes_to_embed is None:
 88 | 		nodes_to_embed = range(graph.num_nodes)
 89 | 		num_nodes = graph.num_nodes
 90 | 	else:
 91 | 		num_nodes = len(nodes_to_embed)
 92 | 	
 93 | 	feature_matrix = get_features(graph, rep_method, input_dense_matrix, nodes_to_embed)
 94 | 
 95 | 	if graph.directed:
 96 | 		print "[Starting to obtain features from in-components]"
 97 | 
 98 | 		neighbor_list_r = construct_neighbor_list(graph.adj_matrix.transpose(), nodes_to_embed)
 99 | 
100 | 		indegree_graph = Graph(graph.adj_matrix.transpose(),  max_id = graph.max_id, num_nodes = graph.num_nodes, 
101 | 			directed = graph.directed, base_features = graph.base_features, neighbor_list = neighbor_list_r,
102 | 			cat_dict = graph.cat_dict, id_cat_dict = graph.id_cat_dict, unique_cat = graph.unique_cat, check_eq = graph.check_eq)
103 | 		base_feature_matrix_in = get_features(indegree_graph, rep_method, input_dense_matrix, nodes_to_embed = nodes_to_embed)
104 | 
105 | 		feature_matrix = np.hstack((feature_matrix, base_feature_matrix_in))
106 | 
107 | 	return feature_matrix
108 | 
109 | 
def construct_cat(input_gt_path, delimiter):
	'''
	Read the node category file.

	# Input: per line, 1) cat-id_init, id_end or 2) cat-id
	#   Form 1 assigns every node id in the inclusive range [id_init, id_end]
	#   to the category; form 2 assigns a single node id. Blank lines are skipped.
	# Output: (category -> set of node ids, the categories, node id -> category)
	# Exits the program on a line with any other number of fields.
	'''
	result = defaultdict(set)
	id_cat_dict = dict()

	# "with" closes the file on every exit path, including sys.exit below.
	with open(input_gt_path, 'r') as fIn:
		for line in fIn:
			# A blank (or whitespace-only) line carries no record.
			if not line.strip():
				continue

			parts = line.strip('\r\n').split(delimiter)

			if len(parts) == 3:
				cat = int(parts[0])
				for node_id in range(int(parts[1]), int(parts[2]) + 1):
					result[cat].add(node_id)
					id_cat_dict[node_id] = cat

			elif len(parts) == 2:
				cat = int(parts[0])
				node_id = int(parts[1])
				result[cat].add(node_id)
				id_cat_dict[node_id] = cat

			else:
				sys.exit('Cat file format not supported')

	return result, result.keys(), id_cat_dict
143 | 
144 | 
def search_feature_layer(graph, rep_method, base_feature_matrix = None):
	'''
	Apply the relational operators to every feature over each node's neighbors.

	For node u and feature column fid the following statistics of the
	neighbors' values are available:
	#   mean, var, sum, max, min: over the neighbors' values
	#   L1: sum of |value(u) - value(v)| over neighbors v
	#   L2: sum of (value(u) - value(v))^2 over neighbors v (no square root)
	A node without neighbors gets 0 for every statistic.

	# Input:
	#   graph: object exposing neighbor_list (node id -> list of neighbor ids)
	#   rep_method: object exposing operators (list of names) and use_total
	#               (number of output slots per feature)
	#   base_feature_matrix: 2-D array indexed as [node, feature]
	# Output: array of shape (n, p * use_total); the slots of feature fid are
	#         columns fid*use_total ... (fid+1)*use_total - 1, in operator order.
	# Exits the program on an operator name that is not listed above.

	# NOTE(review): max/min are taken from the actual neighbor values. Seeding
	# them with 0.0 made "min" of all-positive values (and "max" of all-negative
	# values) come out as 0. Any other code that recomputes these features for
	# use with a stored summary should use the same definition -- verify.
	'''
	n, p = base_feature_matrix.shape
	use_total = rep_method.use_total
	ops = rep_method.operators
	result = np.zeros([n, p * use_total])

	for u in range(n):
		if u % 50000 == 0:
			print('[Current_node_id] ' + str(u))

		neighbors = graph.neighbor_list[u]
		deg = len(neighbors)

		for fid in range(p):

			own_value = base_feature_matrix[u][fid]
			neighbor_values = [base_feature_matrix[v][fid] for v in neighbors]

			sum_v = 0.0; sum_sq = 0.0; L1_v = 0.0; L2_v = 0.0

			for value in neighbor_values:
				diff = own_value - value
				L1_v += abs(diff)
				L2_v += diff * diff
				sum_sq += value * value	# used in var
				sum_v += value	# used in sum and mean

			if deg == 0:
				mean_v = 0.0; var_v = 0.0; max_v = 0.0; min_v = 0.0
			else:
				mean_v = sum_v / float(deg)
				# E[x^2] - E[x]^2
				var_v = (sum_sq / float(deg)) - (mean_v * mean_v)
				max_v = max(neighbor_values)
				min_v = min(neighbor_values)

			stats = {
				'mean': mean_v,
				'var': var_v,
				'sum': sum_v,
				'max': max_v,
				'min': min_v,
				'L1': L1_v,
				'L2': L2_v,
			}

			temp_vec = [0.0] * use_total

			for idx, op in enumerate(ops):
				if op not in stats:
					sys.exit('[Unsupported operation]')
				temp_vec[idx] = stats[op]

			result[u, fid*use_total:(fid+1)*use_total] = temp_vec

	return result
204 | 
205 | 
def feature_layer_evaluation_embedding(graph, rep_method, feature_matrix = None, k = 17):
	'''
	Factorize the feature matrix with a truncated sparse SVD.

	# Input:
	#   graph, rep_method: not used here; kept so existing calls keep working
	#   feature_matrix: 2-D feature matrix to factorize
	#   k: number of singular values requested from sparsesvd
	# Output: (emb, g_sum) with emb = U^T * S^0.5 and g_sum = S^0.5 * V, where
	#         U, s, V are the three values returned by sparsesvd.sparsesvd and
	#         S = diag(s).
	'''
	# Use this file's own "sps" alias: its import statements bind "sp" and
	# "sps" but never the bare name "scipy" (which would only be available if
	# the star import from util happened to provide it).
	sparse_features = sps.csc_matrix(feature_matrix)
	U, s, V = sparsesvd.sparsesvd(sparse_features, k)

	# Element-wise square root of the diagonal matrix of singular values.
	sqrt_S = np.diag(s) ** 0.5
	emb = np.dot(U.T, sqrt_S)
	g_sum = np.dot(sqrt_S, V)

	return emb, g_sum
216 | 
217 | 
def construct_neighbor_list(adj_matrix, nodes_to_embed):
	'''
	Map each node in nodes_to_embed to the list of column indices that hold a
	nonzero entry in that node's row of adj_matrix.
	'''
	return {
		node: list(adj_matrix.getrow(node).nonzero()[1])
		for node in nodes_to_embed
	}
225 | 
226 | 
227 | 
def get_init_features(graph, base_features, nodes_to_embed):
	'''
	# set fb: sum as default.

	Build the initial feature matrix from sums over the adjacency matrix.

	# Input:
	#   graph: object exposing adj_matrix
	#   base_features: list of feature names; the position of a name is its
	#                  column in the output. Recognized names:
	#                    "row"     - sum of each row of adj_matrix
	#                    "col"     - sum of each column of adj_matrix
	#                    "row_col" - row sum plus column sum
	#                  Columns of unrecognized names stay zero.
	#   nodes_to_embed: only its length is used (number of output rows)
	# Output: array of shape (len(nodes_to_embed), len(base_features))
	'''
	adj = graph.adj_matrix

	# Sum once and flatten to plain 1-D arrays (np.asarray strips the
	# np.matrix wrapper that sparse sums come in), so that each column
	# assignment below is a straightforward 1-D copy.
	row_sums = np.asarray(adj.sum(axis=1)).ravel()
	col_sums = np.asarray(adj.sum(axis=0)).ravel()

	available = {
		"row": row_sums,
		"col": col_sums,
		"row_col": row_sums + col_sums,
	}

	init_feature_matrix = np.zeros((len(nodes_to_embed), len(base_features)))

	for name, values in available.items():
		if name in base_features:
			init_feature_matrix[:, base_features.index(name)] = values

	print('[Initial_feature_all finished]')
	return init_feature_matrix
246 | 
def get_feature_n_buckets(feature_matrix, num_buckets, bucket_max_value):
	'''
	Decide how many histogram bins each feature column gets.

	# Input:
	#   feature_matrix: 2-D array indexed as [node, feature]
	#   num_buckets: logarithm base for log binning, or None for linear binning
	#   bucket_max_value: lower bound on the number of bins of every column
	# Output: (total number of bins, list with the number of bins per column)
	'''
	_, n_columns = feature_matrix.shape
	widths = []

	for col in range(n_columns):
		largest = max(feature_matrix[:, col])

		if num_buckets is None:
			# Linear binning: one bin per integer value up to the column maximum.
			needed = int(largest) + 1
		else:
			# Log binning: values below 1 are treated as 1 so the logarithm is >= 0.
			needed = int(math.log(max(largest, 1), num_buckets) + 1)

		widths.append(max(bucket_max_value, needed))

	return sum(widths), widths
268 | 
269 | 
def parse_args():
	'''
	Build the command-line parser and parse the process arguments.

	Returns the argparse namespace with the attributes
	input, cat, output, dim, L, base and operators.
	'''
	parser = argparse.ArgumentParser(description="Multi-Lens: Bridging Network Embedding and Summarization.")

	# File paths; each may also be given without a value.
	path_options = (
		('--input', '../graph/test.tsv', 'Input graph file path'),
		('--cat', '../graph/test_cat.tsv', 'Input node category file path'),
		('--output', '../emb/test_emb.txt', 'Embedding file path'),
	)
	for flag, default_path, description in path_options:
		parser.add_argument(flag, nargs='?', default=default_path, help=description)

	# Integer hyper-parameters.
	int_options = (
		('--dim', 128, 'Embedding dimension'),
		('--L', 2, 'Subgraph level'),
		('--base', 4, 'Base constant of logarithm histograms'),
	)
	for flag, default_value, description in int_options:
		parser.add_argument(flag, type=int, default=default_value, help=description)

	all_operators = ['mean', 'var', 'sum', 'max', 'min', 'L1', 'L2']
	parser.add_argument('--operators', default=all_operators, nargs="+", help='Relational operators to use.')

	return parser.parse_args()
291 | 
292 | 
293 | 
def get_Kis(init_feature_matrix_seq, K, L):
	'''
	Split the embedding dimension K over the L + 1 layers.

	# Input:
	#   init_feature_matrix_seq: feature matrix of layer 0; its rank caps the
	#                            share of layer 0
	#   K: total embedding dimension
	#   L: number of layers above layer 0
	# Output: list of L + 1 integers summing to K when L >= 1 (the last layer
	#         absorbs the remainder); [min(rank, K)] when L == 0.
	'''
	rank_init = int(np.linalg.matrix_rank(init_feature_matrix_seq))

	if L == 0:
		return [min(rank_init, K)]

	# Floor division keeps every share an integer regardless of whether "/"
	# means integer or true division in the running interpreter.
	per_layer = K // (L + 1)

	result = [min(rank_init, per_layer)]
	result.extend([per_layer] * (L - 1))
	result.append(K - sum(result))

	return result
310 | 	
311 | 
312 | 
if __name__ == '__main__':

	# Script entry point: read the edge list and the category file, compute the
	# Multi-Lens embedding layer by layer, then write the latent summaries
	# (pickle) and the embedding file.

	# assume the graph is directed, weighted
	directed = True

	args = parse_args()

	######################################################
	# Base features to use.
	######################################################

	emb_write = True

	base_features = ['row', 'col', 'row_col']

	######################################################
	# Parameters to setup
	######################################################
	input_file_path = args.input
	input_gt_path = args.cat
	output_file_path = args.output

	dim = args.dim
	L = args.L
	num_buckets = args.base
	op = args.operators
	print('----------------------------------')
	print('[Input graph file] ' + input_file_path)
	print('[Input category file] ' + input_gt_path)
	print('[Output embedding file] ' + output_file_path)
	print('[Embedding dimension] ' + str(dim))
	print('[Number of levels] ' + str(L))
	print('[Base of logarithm binning] ' + str(num_buckets))
	print('[Relational operators] ' + str(op))
	print('----------------------------------')

	######################################################
	# Preprocess
	######################################################

	delimiter = get_delimiter(input_file_path)

	raw = np.genfromtxt(input_file_path, dtype=int)

	# Anything that is not a 2-D table (e.g. an empty or single-line file,
	# which genfromtxt does not return as 2-D) cannot be split into columns.
	if raw.ndim != 2:
		sys.exit('[Input format error.]')

	COL = raw.shape[1]

	if COL < 2:
		sys.exit('[Input format error.]')
	elif COL == 2:
		print('[unweighted graph detected.]')
		rows = raw[:,0]
		cols = raw[:,1]
		weis = np.ones(len(rows))

	elif COL == 3:
		print('[weighted graph detected.]')
		rows = raw[:,0]
		cols = raw[:,1]
		weis = raw[:,2]

	else:
		# More than three columns: rows/cols/weis would otherwise stay undefined.
		sys.exit('[Input format error.]')


	check_eq = True
	max_id = int(max(max(rows), max(cols)))
	num_nodes = max_id + 1
	print('[max_node_id] ' + str(max_id))
	print('[num_nodes] ' + str(num_nodes))

	nodes_to_embed = range(int(max_id)+1) #[1,2]#

	# Add a zero-weight entry at (max_id, max_id) so that the sparse matrix
	# built below comes out square.
	if max(rows) != max(cols):
		rows = np.append(rows, max_id)
		cols = np.append(cols, max_id)
		weis = np.append(weis, 0)
		check_eq = False


	adj_matrix = sps.lil_matrix( sps.csc_matrix((weis, (rows, cols))))
	print('[shape of adj_matrix] ' + str(adj_matrix.shape))

	CAT_DICT, unique_cat, ID_CAT_DICT = construct_cat(input_gt_path, delimiter)


	######################################################
	# Multi-Lens starts.
	######################################################

	g_sums = []

	neighbor_list = construct_neighbor_list(adj_matrix, nodes_to_embed)

	graph = Graph(adj_matrix = adj_matrix, max_id = max_id, num_nodes = num_nodes, base_features = base_features,
		neighbor_list = neighbor_list, directed = directed, cat_dict = CAT_DICT, id_cat_dict = ID_CAT_DICT, unique_cat = unique_cat, check_eq = check_eq)

	rep_method = RepMethod(method = "hetero", bucket_max_value = 30, num_buckets = num_buckets, operators = op, use_total = len(op))

	########################################
	# Step 1: get base features
	########################################
	init_feature_matrix = get_init_features(graph, base_features, nodes_to_embed)
	init_feature_matrix_seq = get_seq_features(graph, rep_method, input_dense_matrix = init_feature_matrix, nodes_to_embed = nodes_to_embed)

	Kis = get_Kis(init_feature_matrix_seq, dim, L)

	feature_matrix_emb, g_sum = feature_layer_evaluation_embedding(graph, rep_method, feature_matrix = init_feature_matrix_seq, k = Kis[0])

	g_sums.append(g_sum)


	########################################
	# Step 2: feature proliferation.
	# layer 0 is the base feature matrix
	# layer 1+: are the layers of higher order
	########################################

	rep = feature_matrix_emb
	feature_matrix = init_feature_matrix


	for i in range(L):
		print('[Current layer] ' + str(i))
		print('[feature_matrix shape] ' + str(feature_matrix.shape))

		feature_matrix_new = search_feature_layer(graph, rep_method, base_feature_matrix = feature_matrix)
		feature_matrix_new_seq = get_seq_features(graph, rep_method, input_dense_matrix = feature_matrix_new, nodes_to_embed = nodes_to_embed)
		feature_matrix_new_emb, g_new_sum = feature_layer_evaluation_embedding(graph, rep_method, feature_matrix = feature_matrix_new_seq, k = Kis[i+1])

		# The raw operator output (not its histogram) feeds the next layer.
		feature_matrix = feature_matrix_new
		rep_new = feature_matrix_new_emb
		rep = np.concatenate((rep, rep_new), axis=1)

		g_sums.append(g_new_sum)

	######################################################
	# Write output
	######################################################

	print('[Multi-Lens ends. Summary sizes:]')
	for ele in g_sums:
		print(ele.shape)

	# "with" closes the file even if pickling fails.
	with open('latent_summary.pkl', 'wb') as fOut:
		pickle.dump(g_sums, fOut, -1)

	if emb_write:
		write_embedding(rep, output_file_path)
459 | 	
460 | 
461 | 
462 | 


--------------------------------------------------------------------------------