├── README.md
├── SNE_runner.py
├── evaluation.py
├── LoadData.py
└── SNE.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Attributed-Social-Network-Embedding [tensorflow 0.12]

Tensorflow implementation of the Attributed Social Network Embedding framework (ASNE).

The command-line options are documented in the code (see the `parse_args` function in `SNE_runner.py`).

## Example to run the code [tensorflow 0.12 version]
```
python SNE_runner.py --data_path path --id_dim 20 --attr_dim 20 --n_neg_samples 10 --epoch 20
```
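## Input data format

`--data_path` should point to a directory containing three files. The layouts below are inferred from `LoadData.py`, and the node ids and attribute tokens are purely illustrative; the file name `doublelink.edgelist` suggests each undirected edge is listed in both directions:

```
# doublelink.edgelist: one training link per line, "node_a node_b" (integer node names)
0 1
1 0

# attr_info.txt: one node per line, "node_id attr_token attr_token ..."
0 male basketball
1 female

# test_pairs.txt: one test pair per line, "node_a node_b label" (1 = link, 0 = non-link)
0 1 1
0 2 0
```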
## Cite
If you use the code, please cite the following paper:
```
@article{liao2017attributed,
  title={Attributed Social Network Embedding},
  author={Liao, Lizi and He, Xiangnan and Zhang, Hanwang and Chua, Tat-Seng},
  journal={arXiv preprint arXiv:1705.04969},
  year={2017}
}
```

## Contact
liaolizi.llz@gmail.com

--------------------------------------------------------------------------------
/SNE_runner.py:
--------------------------------------------------------------------------------

import random
import argparse
import numpy as np
import LoadData as data
from SNE import SNE

# Set random seeds
SEED = 2016
random.seed(SEED)
np.random.seed(SEED)


def parse_args():
    parser = argparse.ArgumentParser(description="Run SNE.")
    parser.add_argument('--data_path', nargs='?', default='../UNC/',
                        help='Input data path')
    parser.add_argument('--id_dim', type=int, default=20,
                        help='Dimension for id_part.')
    parser.add_argument('--epoch', type=int, default=20,
                        help='Number of epochs.')
    parser.add_argument('--n_neg_samples', type=int, default=10,
                        help='Number of negative samples.')
    parser.add_argument('--attr_dim', type=int, default=20,
                        help='Dimension for attr_part.')
    return parser.parse_args()


def run_SNE(data, id_dim, attr_dim, n_neg_samples, epoch):
    # pass the parsed options through; previously n_neg_samples and epoch
    # were parsed but never handed to the model
    model = SNE(data, id_embedding_size=id_dim, attr_embedding_size=attr_dim,
                n_neg_samples=n_neg_samples, epoch=epoch)
    model.train()


if __name__ == '__main__':
    args = parse_args()
    print("data_path: ", args.data_path)
    path = args.data_path
    Data = data.LoadData(path, SEED)
    print("Total training links: ", len(Data.links))
    print("Total epoch: ", args.epoch)
    print('id_dim :', args.id_dim)
    print('attr_dim :', args.attr_dim)

    run_SNE(Data, args.id_dim, args.attr_dim, args.n_neg_samples, args.epoch)

--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------

import numpy as np
import math
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score


def calculate_distance(embeddings, measure):  # embeddings: N * emb_size
    if measure == 'euclidean_distances':
        # negate so that larger values mean "more similar", as for similarities
        return -1.0 * euclidean_distances(embeddings, embeddings)
    if measure == 'cosine_similarity':
        return cosine_similarity(embeddings, embeddings)
    raise ValueError('Unknown distance measure: %s' % measure)


def norm(a):
    s = 0.0
    for i in range(len(a)):
        s = s + a[i] * a[i]
    return math.sqrt(s)


# Named cosine_sim so it does not shadow sklearn's pairwise cosine_similarity,
# which calculate_distance above relies on.
def cosine_sim(a, b):
    s = 0.0
    for i in range(len(a)):
        s = s + a[i] * b[i]
    return s / (norm(a) * norm(b))


def evaluate_ROC(X_test, Embeddings):
    y_true = [X_test[i][2] for i in range(len(X_test))]
    y_predict = [cosine_sim(Embeddings[X_test[i][0], :], Embeddings[X_test[i][1], :])
                 for i in range(len(X_test))]
    roc = roc_auc_score(y_true, y_predict)
    if roc < 0.5:
        roc = 1 - roc
    return roc


def evaluate_MAP(node_neighbors_map, Embeddings, distance_measure):
    '''
    Given the embeddings of nodes and their neighbor sets, return the MAP value.
    :param node_neighbors_map: {node_id: set of neighbor ids}
    :param Embeddings: nodes_number * (id_dim + attr_dim); row order follows nodes['node_id']
    :param distance_measure: 'euclidean_distances' or 'cosine_similarity'
    :return: MAP value
    '''
    MAP = .0
    Y_true = np.zeros((len(node_neighbors_map), len(node_neighbors_map)))
    for node in node_neighbors_map:
        # prepare the y_true
        for neighbor in node_neighbors_map[node]:
            Y_true[node][neighbor] = 1

    print(distance_measure)
    Y_predict = calculate_distance(Embeddings, distance_measure)
    for node in node_neighbors_map:
        MAP += average_precision_score(Y_true[node, :], Y_predict[node, :])

    return MAP / len(node_neighbors_map)
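

# A minimal, hypothetical sanity check (not part of the original pipeline):
# exercises evaluate_ROC and evaluate_MAP on small random embeddings. The node
# ids, test pairs, and neighbor sets below are made up for illustration only.
if __name__ == '__main__':
    rng = np.random.RandomState(2016)
    emb = rng.rand(5, 4)  # 5 nodes with 4-dimensional embeddings
    test_pairs = [[0, 1, 1], [0, 2, 0], [1, 3, 1], [2, 4, 0]]  # [node_a, node_b, label]
    print("ROC:", evaluate_ROC(test_pairs, emb))
    neighbors = {0: {1, 2}, 1: {0}, 2: {0}, 3: {4}, 4: {3}}
    print("MAP:", evaluate_MAP(neighbors, emb, 'cosine_similarity'))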
--------------------------------------------------------------------------------
/LoadData.py:
--------------------------------------------------------------------------------

import numpy as np
import random


class LoadData(object):
    '''Given the path of the data, return the data format needed by ASNE.
    :param path
    return:
        X: a dictionary; ['data_id_list'] -- len(links) ids of the source nodes in links;
           ['data_attr_list'] -- attribute vectors of the corresponding source nodes;
           ['data_label_list'] -- len(links) neighbor ids of the corresponding nodes

        nodes: a dictionary; ['node_id'] -- len(nodes) node ids, one by one;
           ['node_attr'] -- attribute vectors of the corresponding nodes
    '''

    # Three files are needed in the path
    def __init__(self, path, random_seed):
        np.random.seed(random_seed)
        random.seed(random_seed)
        self.path = path
        self.linkfile = path + "doublelink.edgelist"
        self.testlinkfile = path + "test_pairs.txt"
        self.attrfile = path + "attr_info.txt"
        # self.vocabfile = path + "vocab.txt"
        self.node_map = {}  # {node_name: id}, maps each original node name to an internal id
        self.nodes = {}
        self.X = {}
        self.X_test = []  # a list of 3-element lists, read from the test link file (test_pairs.txt)
        self.node_neighbors_map = {}  # {node_id: neighbors_set}, each node id maps to its neighbor set
        self.construct_nodes()
        self.construct_X()
        self.construct_node_neighbors_map()
        self.read_test_link()

    def readvocab(self):
        f = open(self.attrfile)
        self.vocab = {}
        line = f.readline()
        i = 0
        while line:
            items = line.strip().split(' ')
            for item in items[1:]:
                if item not in self.vocab:
                    self.vocab[item] = i
                    i = i + 1
            line = f.readline()
        f.close()
        self.attr_M = i
        print("attr_M:", self.attr_M)

    def construct_nodes(self):
        '''construct the node dictionary from the attribute file'''
        self.readvocab()
        f = open(self.attrfile)
        i = 0
        self.nodes['node_id'] = []
        self.nodes['node_attr'] = []
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            self.node_map[int(line[0])] = i  # map the original node name to an internal id
            self.nodes['node_id'].append(i)  # only put the internal id in nodes, not the original name
            vs = np.zeros(self.attr_M)
            for attr in line[1:]:
                if len(attr) > 0:
                    vs[self.vocab[attr]] = 1
            self.nodes['node_attr'].append(vs)
            i = i + 1
            line = f.readline()
        f.close()
        self.id_N = i
        print("id_N:", self.id_N)

    def read_link(self):  # read the link file into a list of links
        f = open(self.linkfile)
        self.links = []
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            link = [int(line[0]), int(line[1])]
            self.links.append(link)
            line = f.readline()
        f.close()

    def construct_X(self):
        self.read_link()
        self.X['data_id_list'] = np.ndarray(shape=(len(self.links)), dtype=np.int32)
        self.X['data_attr_list'] = np.ndarray(shape=(len(self.links), self.attr_M), dtype=np.float32)
        self.X['data_label_list'] = np.ndarray(shape=(len(self.links), 1), dtype=np.int32)

        for i in range(len(self.links)):
            self.X['data_id_list'][i] = self.node_map[self.links[i][0]]
            self.X['data_attr_list'][i] = self.nodes['node_attr'][self.node_map[self.links[i][0]]]  # attribute vector of the source node
            self.X['data_label_list'][i, 0] = self.node_map[self.links[i][1]]  # one neighbor of the node

    def construct_node_neighbors_map(self):
        for link in self.links:
            if self.node_map[link[0]] not in self.node_neighbors_map:
                self.node_neighbors_map[self.node_map[link[0]]] = set([self.node_map[link[1]]])
            else:
                self.node_neighbors_map[self.node_map[link[0]]].add(self.node_map[link[1]])

    def read_test_link(self):
        f = open(self.testlinkfile)
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            self.X_test.append([self.node_map[int(line[0])], self.node_map[int(line[1])], int(line[2])])
            line = f.readline()
        f.close()
        print("test link number:", len(self.X_test))
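

# Hypothetical usage sketch: assumes the three input files described in the
# class docstring exist under '../UNC/' (the default path in SNE_runner.py).
if __name__ == '__main__':
    data = LoadData('../UNC/', random_seed=2016)
    print("nodes:", data.id_N, "attr_M:", data.attr_M, "links:", len(data.links))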
--------------------------------------------------------------------------------
/SNE.py:
--------------------------------------------------------------------------------

'''
Tensorflow implementation of the Social Network Embedding framework (SNE)

@author: Lizi Liao (liaolizi.llz@gmail.com)
'''

import math
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin
import evaluation


class SNE(BaseEstimator, TransformerMixin):
    def __init__(self, data, id_embedding_size, attr_embedding_size,
                 batch_size=128, alpha=1.0, n_neg_samples=10,
                 epoch=20, random_seed=2016):
        # bind params to class
        self.batch_size = batch_size
        self.node_N = data.id_N
        self.attr_M = data.attr_M
        self.X_train = data.X
        self.X_test = data.X_test
        self.nodes = data.nodes
        self.id_embedding_size = id_embedding_size
        self.attr_embedding_size = attr_embedding_size
        self.alpha = alpha
        self.n_neg_samples = n_neg_samples
        self.epoch = epoch
        self.random_seed = random_seed
        # init all variables in a tensorflow graph
        self._init_graph()

    def _init_graph(self):
        '''
        Init a tensorflow Graph containing: input data, variables, model, loss, optimizer
        '''
        self.graph = tf.Graph()
        with self.graph.as_default():  # , tf.device('/cpu:0'):
            # Set graph level random seed
            tf.set_random_seed(self.random_seed)
            # Input data.
            self.train_data_id = tf.placeholder(tf.int32, shape=[None])  # batch_size * 1
            self.train_data_attr = tf.placeholder(tf.float32, shape=[None, self.attr_M])  # batch_size * attr_M
            self.train_labels = tf.placeholder(tf.int32, shape=[None, 1])  # batch_size * 1

            # Variables.
            network_weights = self._initialize_weights()
            self.weights = network_weights

            # Model.
            # Look up embeddings for node_id.
            self.id_embed = tf.nn.embedding_lookup(self.weights['in_embeddings'], self.train_data_id)  # batch_size * id_dim
            self.attr_embed = tf.matmul(self.train_data_attr, self.weights['attr_embeddings'])  # batch_size * attr_dim
            # tf.concat uses the (axis, values) argument order of tensorflow 0.12
            self.embed_layer = tf.concat(1, [self.id_embed, self.alpha * self.attr_embed])  # batch_size * (id_dim + attr_dim)

            ## hidden-layer components could be added here!

            # Compute the loss, using a sample of the negative labels each time.
            self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(self.weights['out_embeddings'], self.weights['biases'],
                                                                  self.embed_layer, self.train_labels,
                                                                  self.n_neg_samples, self.node_N))
            # Optimizer.
            self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)

            # init
            init = tf.initialize_all_variables()
            self.sess = tf.Session()
            self.sess.run(init)

    def _initialize_weights(self):
        all_weights = dict()
        all_weights['in_embeddings'] = tf.Variable(tf.random_uniform([self.node_N, self.id_embedding_size], -1.0, 1.0))  # id_N * id_dim
        all_weights['attr_embeddings'] = tf.Variable(tf.random_uniform([self.attr_M, self.attr_embedding_size], -1.0, 1.0))  # attr_M * attr_dim
        all_weights['out_embeddings'] = tf.Variable(tf.truncated_normal([self.node_N, self.id_embedding_size + self.attr_embedding_size],
                                                                        stddev=1.0 / math.sqrt(self.id_embedding_size + self.attr_embedding_size)))
        all_weights['biases'] = tf.Variable(tf.zeros([self.node_N]))
        return all_weights

    def partial_fit(self, X):  # fit a batch
        feed_dict = {self.train_data_id: X['batch_data_id'], self.train_data_attr: X['batch_data_attr'],
                     self.train_labels: X['batch_data_label']}
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def get_random_block_from_data(self, data, batch_size):
        # Note: train() slices the three data arrays with a shared start index
        # instead of calling this helper, so that ids, attributes and labels stay aligned.
        start_index = np.random.randint(0, len(data) - batch_size)
        return data[start_index:(start_index + batch_size)]
    def train(self):  # fit a dataset

        print('Using in + out embedding')

        for epoch in range(self.epoch):
            total_batch = int(len(self.X_train['data_id_list']) / self.batch_size)
            # print('total_batch in 1 epoch: ', total_batch)
            # Loop over all batches
            for i in range(total_batch):
                # generate a batch of data with a shared random start index
                batch_xs = {}
                start_index = np.random.randint(0, len(self.X_train['data_id_list']) - self.batch_size)
                batch_xs['batch_data_id'] = self.X_train['data_id_list'][start_index:(start_index + self.batch_size)]
                batch_xs['batch_data_attr'] = self.X_train['data_attr_list'][start_index:(start_index + self.batch_size)]
                batch_xs['batch_data_label'] = self.X_train['data_label_list'][start_index:(start_index + self.batch_size)]

                # Fit training using batch data
                cost = self.partial_fit(batch_xs)

            # Display logs per epoch
            Embeddings_out = self.getEmbedding('out_embedding', self.nodes)
            Embeddings_in = self.getEmbedding('embed_layer', self.nodes)
            Embeddings = Embeddings_out + Embeddings_in  # element-wise sum of the two embedding tables

            # link prediction test
            roc = evaluation.evaluate_ROC(self.X_test, Embeddings)
            print("Epoch:", '%04d' % (epoch + 1), "roc=", "{:.9f}".format(roc))

    def getEmbedding(self, emb_type, nodes):
        if emb_type == 'embed_layer':
            feed_dict = {self.train_data_id: nodes['node_id'], self.train_data_attr: nodes['node_attr']}
            return self.sess.run(self.embed_layer, feed_dict=feed_dict)
        if emb_type == 'out_embedding':
            return self.sess.run(self.weights['out_embeddings'])  # nodes_number * (id_dim + attr_dim)
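

# Hypothetical smoke test (not part of the original code): builds a tiny
# synthetic object exposing the attributes SNE reads from LoadData
# (id_N, attr_M, X, X_test, nodes) and trains for two epochs. All sizes and
# values below are made up for illustration.
if __name__ == '__main__':
    class _ToyData(object):
        pass

    rng = np.random.RandomState(2016)
    toy = _ToyData()
    toy.id_N = 50    # number of nodes
    toy.attr_M = 10  # attribute vocabulary size
    toy.nodes = {
        'node_id': list(range(toy.id_N)),
        'node_attr': [rng.randint(0, 2, toy.attr_M).astype(np.float32)
                      for _ in range(toy.id_N)],
    }
    n_links = 500
    ids = rng.randint(0, toy.id_N, n_links)
    toy.X = {
        'data_id_list': ids.astype(np.int32),
        'data_attr_list': np.array([toy.nodes['node_attr'][i] for i in ids], dtype=np.float32),
        'data_label_list': rng.randint(0, toy.id_N, (n_links, 1)).astype(np.int32),
    }
    # test pairs [node_a, node_b, label], with both label classes present
    toy.X_test = [[int(rng.randint(toy.id_N)), int(rng.randint(toy.id_N)), i % 2]
                  for i in range(20)]

    model = SNE(toy, id_embedding_size=8, attr_embedding_size=8,
                batch_size=64, epoch=2)
    model.train()

--------------------------------------------------------------------------------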