├── README.md
├── SNE_runner.py
├── evaluation.py
├── LoadData.py
└── SNE.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Attributed-Social-Network-Embedding [tensorflow 0.12]

Tensorflow implementation of the Attributed Social Network Embedding framework (ASNE).

The command-line options are documented in the code (see the `parse_args` function in `SNE_runner.py`).

## Example to run the code [tensorflow 0.12 version]
```
python SNE_runner.py --data_path path --id_dim 20 --attr_dim 20 --n_neg_samples 10 --epoch 20
```
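## Input data format

`--data_path` should point to a directory containing three files. The layouts below are inferred from `LoadData.py`, and the node ids and attribute tokens are purely illustrative; the file name `doublelink.edgelist` suggests each undirected edge is listed in both directions:

```
# doublelink.edgelist: one training link per line, "node_a node_b" (integer node names)
0 1
1 0

# attr_info.txt: one node per line, "node_id attr_token attr_token ..."
0 male basketball
1 female

# test_pairs.txt: one test pair per line, "node_a node_b label" (1 = link, 0 = non-link)
0 1 1
0 2 0
```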
## Cite
If you use the code, please cite the following paper:
```
@article{liao2017attributed,
  title={Attributed Social Network Embedding},
  author={Liao, Lizi and He, Xiangnan and Zhang, Hanwang and Chua, Tat-Seng},
  journal={arXiv preprint arXiv:1705.04969},
  year={2017}
}
```

## Contact
liaolizi.llz@gmail.com

--------------------------------------------------------------------------------
/SNE_runner.py:
--------------------------------------------------------------------------------

import random
import argparse
import numpy as np
import LoadData as data
from SNE import SNE

# Set random seeds
SEED = 2016
random.seed(SEED)
np.random.seed(SEED)


def parse_args():
    parser = argparse.ArgumentParser(description="Run SNE.")
    parser.add_argument('--data_path', nargs='?', default='../UNC/',
                        help='Input data path')
    parser.add_argument('--id_dim', type=int, default=20,
                        help='Dimension for id_part.')
    parser.add_argument('--epoch', type=int, default=20,
                        help='Number of epochs.')
    parser.add_argument('--n_neg_samples', type=int, default=10,
                        help='Number of negative samples.')
    parser.add_argument('--attr_dim', type=int, default=20,
                        help='Dimension for attr_part.')
    return parser.parse_args()


def run_SNE(data, id_dim, attr_dim, n_neg_samples, epoch):
    # pass the parsed options through; previously n_neg_samples and epoch
    # were parsed but never handed to the model
    model = SNE(data, id_embedding_size=id_dim, attr_embedding_size=attr_dim,
                n_neg_samples=n_neg_samples, epoch=epoch)
    model.train()


if __name__ == '__main__':
    args = parse_args()
    print("data_path: ", args.data_path)
    path = args.data_path
    Data = data.LoadData(path, SEED)
    print("Total training links: ", len(Data.links))
    print("Total epoch: ", args.epoch)
    print('id_dim :', args.id_dim)
    print('attr_dim :', args.attr_dim)

    run_SNE(Data, args.id_dim, args.attr_dim, args.n_neg_samples, args.epoch)

--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------

import numpy as np
import math
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score


def calculate_distance(embeddings, measure):  # embeddings: N * emb_size
    if measure == 'euclidean_distances':
        # negate so that larger values mean "more similar", as for similarities
        return -1.0 * euclidean_distances(embeddings, embeddings)
    if measure == 'cosine_similarity':
        return cosine_similarity(embeddings, embeddings)
    raise ValueError('Unknown distance measure: %s' % measure)


def norm(a):
    s = 0.0
    for i in range(len(a)):
        s = s + a[i] * a[i]
    return math.sqrt(s)


# Named cosine_sim so it does not shadow sklearn's pairwise cosine_similarity,
# which calculate_distance above relies on.
def cosine_sim(a, b):
    s = 0.0
    for i in range(len(a)):
        s = s + a[i] * b[i]
    return s / (norm(a) * norm(b))


def evaluate_ROC(X_test, Embeddings):
    y_true = [X_test[i][2] for i in range(len(X_test))]
    y_predict = [cosine_sim(Embeddings[X_test[i][0], :], Embeddings[X_test[i][1], :])
                 for i in range(len(X_test))]
    roc = roc_auc_score(y_true, y_predict)
    if roc < 0.5:
        roc = 1 - roc
    return roc


def evaluate_MAP(node_neighbors_map, Embeddings, distance_measure):
    '''
    Given the embeddings of nodes and their neighbor sets, return the MAP value.
    :param node_neighbors_map: {node_id: set of neighbor ids}
    :param Embeddings: nodes_number * (id_dim + attr_dim); row order follows nodes['node_id']
    :param distance_measure: 'euclidean_distances' or 'cosine_similarity'
    :return: MAP value
    '''
    MAP = .0
    Y_true = np.zeros((len(node_neighbors_map), len(node_neighbors_map)))
    for node in node_neighbors_map:
        # prepare the y_true
        for neighbor in node_neighbors_map[node]:
            Y_true[node][neighbor] = 1

    print(distance_measure)
    Y_predict = calculate_distance(Embeddings, distance_measure)
    for node in node_neighbors_map:
        MAP += average_precision_score(Y_true[node, :], Y_predict[node, :])

    return MAP / len(node_neighbors_map)
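

# A minimal, hypothetical sanity check (not part of the original pipeline):
# exercises evaluate_ROC and evaluate_MAP on small random embeddings. The node
# ids, test pairs, and neighbor sets below are made up for illustration only.
if __name__ == '__main__':
    rng = np.random.RandomState(2016)
    emb = rng.rand(5, 4)  # 5 nodes with 4-dimensional embeddings
    test_pairs = [[0, 1, 1], [0, 2, 0], [1, 3, 1], [2, 4, 0]]  # [node_a, node_b, label]
    print("ROC:", evaluate_ROC(test_pairs, emb))
    neighbors = {0: {1, 2}, 1: {0}, 2: {0}, 3: {4}, 4: {3}}
    print("MAP:", evaluate_MAP(neighbors, emb, 'cosine_similarity'))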
--------------------------------------------------------------------------------
/LoadData.py:
--------------------------------------------------------------------------------

import numpy as np
import random


class LoadData(object):
    '''Given the path of the data, return the data format needed by ASNE.
    :param path
    return:
        X: a dictionary; ['data_id_list'] -- len(links) ids of the source nodes in links;
           ['data_attr_list'] -- attribute vectors of the corresponding source nodes;
           ['data_label_list'] -- len(links) neighbor ids of the corresponding nodes

        nodes: a dictionary; ['node_id'] -- len(nodes) node ids, one by one;
           ['node_attr'] -- attribute vectors of the corresponding nodes
    '''

    # Three files are needed in the path
    def __init__(self, path, random_seed):
        np.random.seed(random_seed)
        random.seed(random_seed)
        self.path = path
        self.linkfile = path + "doublelink.edgelist"
        self.testlinkfile = path + "test_pairs.txt"
        self.attrfile = path + "attr_info.txt"
        # self.vocabfile = path + "vocab.txt"
        self.node_map = {}  # {node_name: id}, maps each original node name to an internal id
        self.nodes = {}
        self.X = {}
        self.X_test = []  # a list of 3-element lists, read from the test link file (test_pairs.txt)
        self.node_neighbors_map = {}  # {node_id: neighbors_set}, each node id maps to its neighbor set
        self.construct_nodes()
        self.construct_X()
        self.construct_node_neighbors_map()
        self.read_test_link()

    def readvocab(self):
        f = open(self.attrfile)
        self.vocab = {}
        line = f.readline()
        i = 0
        while line:
            items = line.strip().split(' ')
            for item in items[1:]:
                if item not in self.vocab:
                    self.vocab[item] = i
                    i = i + 1
            line = f.readline()
        f.close()
        self.attr_M = i
        print("attr_M:", self.attr_M)

    def construct_nodes(self):
        '''construct the node dictionary from the attribute file'''
        self.readvocab()
        f = open(self.attrfile)
        i = 0
        self.nodes['node_id'] = []
        self.nodes['node_attr'] = []
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            self.node_map[int(line[0])] = i  # map the original node name to an internal id
            self.nodes['node_id'].append(i)  # only put the internal id in nodes, not the original name
            vs = np.zeros(self.attr_M)
            for attr in line[1:]:
                if len(attr) > 0:
                    vs[self.vocab[attr]] = 1
            self.nodes['node_attr'].append(vs)
            i = i + 1
            line = f.readline()
        f.close()
        self.id_N = i
        print("id_N:", self.id_N)

    def read_link(self):  # read the link file into a list of links
        f = open(self.linkfile)
        self.links = []
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            link = [int(line[0]), int(line[1])]
            self.links.append(link)
            line = f.readline()
        f.close()

    def construct_X(self):
        self.read_link()
        self.X['data_id_list'] = np.ndarray(shape=(len(self.links)), dtype=np.int32)
        self.X['data_attr_list'] = np.ndarray(shape=(len(self.links), self.attr_M), dtype=np.float32)
        self.X['data_label_list'] = np.ndarray(shape=(len(self.links), 1), dtype=np.int32)

        for i in range(len(self.links)):
            self.X['data_id_list'][i] = self.node_map[self.links[i][0]]
            self.X['data_attr_list'][i] = self.nodes['node_attr'][self.node_map[self.links[i][0]]]  # attribute vector of the source node
            self.X['data_label_list'][i, 0] = self.node_map[self.links[i][1]]  # one neighbor of the node

    def construct_node_neighbors_map(self):
        for link in self.links:
            if self.node_map[link[0]] not in self.node_neighbors_map:
                self.node_neighbors_map[self.node_map[link[0]]] = set([self.node_map[link[1]]])
            else:
                self.node_neighbors_map[self.node_map[link[0]]].add(self.node_map[link[1]])

    def read_test_link(self):
        f = open(self.testlinkfile)
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            self.X_test.append([self.node_map[int(line[0])], self.node_map[int(line[1])], int(line[2])])
            line = f.readline()
        f.close()
        print("test link number:", len(self.X_test))
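

# Hypothetical usage sketch: assumes the three input files described in the
# class docstring exist under '../UNC/' (the default path in SNE_runner.py).
if __name__ == '__main__':
    data = LoadData('../UNC/', random_seed=2016)
    print("nodes:", data.id_N, "attr_M:", data.attr_M, "links:", len(data.links))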
--------------------------------------------------------------------------------
/SNE.py:
--------------------------------------------------------------------------------

'''
Tensorflow implementation of the Social Network Embedding framework (SNE)

@author: Lizi Liao (liaolizi.llz@gmail.com)
'''

import math
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
import evaluation


class SNE(BaseEstimator, TransformerMixin):
    def __init__(self, data, id_embedding_size, attr_embedding_size,
                 batch_size=128, alpha=1.0, n_neg_samples=10,
                 epoch=20, random_seed=2016):
        # bind params to class
        self.batch_size = batch_size
        self.node_N = data.id_N
        self.attr_M = data.attr_M
        self.X_train = data.X
        self.X_test = data.X_test
        self.nodes = data.nodes
        self.id_embedding_size = id_embedding_size
        self.attr_embedding_size = attr_embedding_size
        self.alpha = alpha
        self.n_neg_samples = n_neg_samples
        self.epoch = epoch
        self.random_seed = random_seed
        # init all variables in a tensorflow graph
        self._init_graph()

    def _init_graph(self):
        '''
        Init a tensorflow Graph containing: input data, variables, model, loss, optimizer
        '''
        self.graph = tf.Graph()
        with self.graph.as_default():  # , tf.device('/cpu:0'):
            # Set graph level random seed
            tf.set_random_seed(self.random_seed)
            # Input data.
            self.train_data_id = tf.placeholder(tf.int32, shape=[None])  # batch_size * 1
            self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M])  # batch_size * attr_M
            self.train_labels = tf.placeholder(tf.int32, shape=[None, 1])  # batch_size * 1

            # Variables.
            network_weights = self._initialize_weights()
            self.weights = network_weights

            # Model.
            # Look up embeddings for node_id.
            self.id_embed = tf.nn.embedding_lookup(self.weights['in_embeddings'], self.train_data_id)  # batch_size * id_dim
            self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings'])  # batch_size * attr_dim
            # tf.concat uses the (axis, values) argument order of tensorflow 0.12
            self.embed_layer = tf.concat(1, [self.id_embed, self.alpha * self.attr_embed])  # batch_size * (id_dim + attr_dim)

            ## hidden-layer components could be added here!

            # Compute the loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(self.weights['out_embeddings'], self.weights['biases'],
                                                                  self.embed_layer, self.train_labels,
                                                                  self.n_neg_samples, self.node_N))
            # Optimizer.
            self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)

            # init
            init = tf.initialize_all_variables()
            self.sess = tf.Session()
            self.sess.run(init)

    def _initialize_weights(self):
        all_weights = dict()
        all_weights['in_embeddings'] = tf.Variable(tf.random_uniform([self.node_N, self.id_embedding_size], -1.0, 1.0))  # id_N * id_dim
        all_weights['attr_embeddings'] = tf.Variable(tf.random_uniform([self.attr_M, self.attr_embedding_size], -1.0, 1.0))  # attr_M * attr_dim
        all_weights['out_embeddings'] = tf.Variable(tf.truncated_normal([self.node_N, self.id_embedding_size + self.attr_embedding_size],
                                                                        stddev=1.0 / math.sqrt(self.id_embedding_size + self.attr_embedding_size)))
        all_weights['biases'] = tf.Variable(tf.zeros([self.node_N]))
        return all_weights

    def partial_fit(self, X):  # fit a batch
        feed_dict = {self.train_data_id: X['batch_data_id'], self.train_data_attr: X['batch_data_attr'],
                     self.train_labels: X['batch_data_label']}
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def get_random_block_from_data(self, data, batch_size):
        # Note: train() slices the three data arrays with a shared start index
        # instead of calling this helper, so that ids, attributes and labels stay aligned.
        start_index = np.random.randint(0, len(data) - batch_size)
        return data[start_index:(start_index + batch_size)]
    def train(self):  # fit a dataset

        print('Using in + out embedding')

        for epoch in range(self.epoch):
            total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)
            # print('total_batch in 1 epoch: ', total_batch)
            # Loop over all batches
            for i in range(total_batch):
                # generate a batch of data with a shared random start index
                batch_xs = {}
                start_index = np.random.randint(0, len(self.X_train['data_id_list']) - self.batch_size)
                batch_xs['batch_data_id'] = self.X_train['data_id_list'][start_index:(start_index + self.batch_size)]
                batch_xs['batch_data_attr'] = self.X_train['data_attr_list'][start_index:(start_index + self.batch_size)]
                batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]

                # Fit training using batch data
                cost = self.partial_fit(batch_xs)

            # Display logs per epoch
            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
            Embeddings = Embeddings_out + Embeddings_in  # element-wise sum of the two embedding tables

            # link prediction test
            roc = evaluation.evaluate_ROC(self.X_test, Embeddings)
            print("Epoch:", '%04d' % (epoch + 1), "roc=", "{:.9f}".format(roc))

    def getEmbedding(self, emb_type, nodes):
        if emb_type == 'embed_layer':
            feed_dict = {self.train_data_id: nodes['node_id'], self.train_data_attr: nodes['node_attr']}
            return self.sess.run(self.embed_layer, feed_dict=feed_dict)
        if emb_type == 'out_embedding':
            return self.sess.run(self.weights['out_embeddings'])  # nodes_number * (id_dim + attr_dim)
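

# Hypothetical smoke test (not part of the original code): builds a tiny
# synthetic object exposing the attributes SNE reads from LoadData
# (id_N, attr_M, X, X_test, nodes) and trains for two epochs. All sizes and
# values below are made up for illustration.
if __name__ == '__main__':
    class _ToyData(object):
        pass

    rng = np.random.RandomState(2016)
    toy = _ToyData()
    toy.id_N = 50    # number of nodes
    toy.attr_M = 10  # attribute vocabulary size
    toy.nodes = {
        'node_id': list(range(toy.id_N)),
        'node_attr': [rng.randint(0, 2, toy.attr_M).astype(np.float32)
                      for _ in range(toy.id_N)],
    }
    n_links = 500
    ids = rng.randint(0, toy.id_N, n_links)
    toy.X = {
        'data_id_list': ids.astype(np.int32),
        'data_attr_list': np.array([toy.nodes['node_attr'][i] for i in ids], dtype=np.float32),
        'data_label_list': rng.randint(0, toy.id_N, (n_links, 1)).astype(np.int32),
    }
    # test pairs [node_a, node_b, label], with both label classes present
    toy.X_test = [[int(rng.randint(toy.id_N)), int(rng.randint(toy.id_N)), i % 2]
                  for i in range(20)]

    model = SNE(toy, id_embedding_size=8, attr_embedding_size=8,
                batch_size=64, epoch=2)
    model.train()

--------------------------------------------------------------------------------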