├── README
├── sup.png
├── unsup.png
├── __pycache__
│   ├── utils.cpython-36.pyc
│   ├── config.cpython-36.pyc
│   ├── aggregator.cpython-36.pyc
│   └── classify.cpython-36.pyc
├── utils.py
├── README.md
├── cora
│   ├── README
│   └── cora.cites
├── aggregator.py
├── config.py
└── main.py

/README:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/sup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/sup.png
--------------------------------------------------------------------------------
/unsup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/unsup.png
--------------------------------------------------------------------------------
/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/aggregator.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/__pycache__/aggregator.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/classify.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/__pycache__/classify.cpython-36.pyc
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from collections import defaultdict
import numpy as np

def load_data(cfg):
    # load node features and class labels from cora.content
    num_nodes = cfg.num_nodes
    num_feats = cfg.num_features
    feat_data = np.zeros((num_nodes, num_feats))
    labels = np.empty((num_nodes,), dtype=np.int64)
    node_map = {}   # paper id (string) -> row index
    label_map = {}  # class name -> integer label
    with open(cfg.path + 'cora.content') as fp:
        for i, line in enumerate(fp):
            info = line.strip().split()
            feat_data[i, :] = [float(x) for x in info[1:-1]]
            node_map[info[0]] = i
            if info[-1] not in label_map:
                label_map[info[-1]] = len(label_map)
            labels[i] = label_map[info[-1]]

    # build an undirected adjacency list from cora.cites
    adj_lists = defaultdict(set)
    with open(cfg.path + 'cora.cites') as fp:
        for line in fp:
            info = line.strip().split()
            paper1 = node_map[info[0]]
            paper2 = node_map[info[1]]
            adj_lists[paper1].add(paper2)
            adj_lists[paper2].add(paper1)
    return feat_data, labels, adj_lists, node_map
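A quick sanity check of what load_data produces (a sketch, assuming cfg.path in config.py points at the cora folder; config.py runs load_data at import time, as shown at its bottom):
```
from config import cfg  # importing config already ran load_data

# cfg.features: (2708, 1433) float array of bag-of-words features
# cfg.labels:   (2708,) int64 array of class ids in [0, 7)
# cfg.adj_lists: dict mapping row index -> set of neighbour row indices
print(cfg.features.shape, cfg.labels.shape, len(cfg.adj_lists))
```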
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Graphsage
Tensorflow implementation of ['Inductive Representation Learning on Large Graphs'](http://papers.nips.cc/paper/6703-inductive-representation-learning-on-large-graphs)

## Introduction
A tensorflow re-implementation of graphsage that is easier to follow than the [original implementation](https://github.com/williamleif/GraphSAGE).
This code includes both supervised and unsupervised versions, and three types of aggregators ('mean', 'pooling' and 'lstm').

## Requirements
python 3.6, tensorflow 1.12.0

## Usage
To see and modify the parameters of graphsage, see config.py (an example follows below).
To run the code, use:
```
python main.py
```
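For example, to point the loader at your local copy of the data and switch training modes, edit the defaults in config.py (a sketch; these keys are the ones defined in `config.__init__`, and the values shown are illustrative):
```
# inside config.__init__ in config.py
self._configs['path'] = './cora/'        # path to the cora folder on your machine
self._configs['aggregator'] = 'pooling'  # one of 'mean', 'pooling', 'lstm'
self._configs['supervised'] = True       # False trains the unsupervised model
```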

## Results
Below is the accuracy of the supervised and unsupervised graphsage with the 'mean' aggregator.

The supervised graphsage accuracy is 0.871

![supervised accuracy=0.871](sup.png)

The unsupervised graphsage accuracy is 0.790

![unsupervised accuracy=0.790](unsup.png)
--------------------------------------------------------------------------------
/cora/README:
--------------------------------------------------------------------------------
This directory contains a selection of the Cora dataset (www.research.whizbang.com/data).

The Cora dataset consists of Machine Learning papers. These papers are classified into one of the following seven classes:
	Case_Based
	Genetic_Algorithms
	Neural_Networks
	Probabilistic_Methods
	Reinforcement_Learning
	Rule_Learning
	Theory

The papers were selected in a way such that in the final corpus every paper cites or is cited by at least one other paper. There are 2708 papers in the whole corpus.

After stemming and removing stopwords we were left with a vocabulary of size 1433 unique words. All words with document frequency less than 10 were removed.


THE DIRECTORY CONTAINS TWO FILES:

The .content file contains descriptions of the papers in the following format:

	<paper_id> <word_attributes>+ <class_label>

The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper.

The .cites file contains the citation graph of the corpus. Each line describes a link in the following format:

	<ID of cited paper> <ID of citing paper>

Each line contains two paper IDs. The first entry is the ID of the paper being cited and the second ID stands for the paper which contains the citation. The direction of the link is from right to left. If a line is represented by "paper1 paper2" then the link is "paper2->paper1".
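To make the link direction concrete, here is a minimal sketch (assuming the file sits at cora/cora.cites) that loads the citations as a directed networkx graph; note that utils.load_data deliberately symmetrises these edges into an undirected adjacency list:
```
import networkx as nx

G = nx.DiGraph()
with open('cora/cora.cites') as f:
    for line in f:
        cited, citing = line.strip().split()
        G.add_edge(citing, cited)  # "paper1 paper2" encodes the link paper2 -> paper1
print(G.number_of_nodes(), G.number_of_edges())  # 2708 nodes on the full corpus
```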
--------------------------------------------------------------------------------
/aggregator.py:
--------------------------------------------------------------------------------
#-*-coding:utf-8-*-
import tensorflow as tf
import numpy as np
from config import cfg
import random
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
# tf.enable_eager_execution()

def mean_aggregator(node_features, neigh_features, out_dims, scope_name):
    # mean aggregator: h_v^k = σ(W · MEAN({h_v^{k-1}} ∪ {h_u^{k-1}, ∀u ∈ N(v)}))
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        if cfg.concat:
            node_embed = tf.expand_dims(node_features, 1)
            to_feats = tf.concat([neigh_features, node_embed], 1)
        else:
            to_feats = neigh_features
        # shared dense layer on every neighbour, then mean over the neighbour axis
        combined = tf.reduce_mean(tf.layers.dense(to_feats, units=out_dims, activation=cfg.act), axis=1)
    return combined

def pooling_aggregator(node_features, neigh_features, out_dims, scope_name):
    # pooling aggregator: max({σ(W_pool · h_u^k + b), ∀u ∈ N(v)})
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        if cfg.concat:
            node_embed = tf.expand_dims(node_features, 1)
            to_feats = tf.concat([neigh_features, node_embed], 1)
        else:
            to_feats = neigh_features
        # shared dense layer on every neighbour, then elementwise max over the neighbour axis
        combined = tf.reduce_max(tf.layers.dense(to_feats, units=out_dims, activation=cfg.act), axis=1)
    return combined

def lstm_aggregator(node_features, neigh_features, out_dims, scope_name):
    # lstm aggregator: run an LSTM over the (randomly permuted) neighbour sequence
    # and take its final hidden state as the aggregated representation
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        if cfg.concat:
            node_embed = tf.expand_dims(node_features, 1)
            to_feats = tf.concat([neigh_features, node_embed], 1)
        else:
            to_feats = neigh_features
        lstm = tf.contrib.rnn.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(out_dims) for _ in range(1)], state_is_tuple=True)
        init_state = lstm.zero_state(tf.shape(to_feats)[0], dtype=tf.float32)
        outputs, state = tf.nn.dynamic_rnn(lstm, inputs=to_feats, initial_state=init_state, time_major=False)
        combined = state[-1][1]  # hidden state h of the last (only) LSTM layer
    return combined
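As a shape sanity check on the mean aggregator above, here is a pure-NumPy sketch of its contract (toy sizes; the real dense layer also learns a bias, omitted here):
```
import numpy as np

batch, neigh, feat, out = 4, 12, 1433, 128     # 12 ~ sample_num + self-loop + concat copy
to_feats = np.random.rand(batch, neigh, feat)  # node + sampled neighbour features
W = np.random.rand(feat, out)
hidden = np.maximum(to_feats @ W, 0.0)         # shared dense layer with ReLU (cfg.act)
combined = hidden.mean(axis=1)                 # mean over the neighbour axis
print(combined.shape)                          # (4, 128)
```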
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
#-*-coding:utf-8-*-
import numpy as np
import tensorflow as tf
import os
from utils import load_data

class config():
    def __init__(self):
        self._configs = {}

        self._configs['path'] = '/home/songyu/yu/graphsage-simple/cora/'
        self._configs['dims'] = 128
        self._configs['lr'] = 0.01
        self._configs['epochs'] = 10
        self._configs['num_nodes'] = 2708
        self._configs['num_features'] = 1433
        self._configs['num_classes'] = 7
        self._configs['sample_num'] = 10
        self._configs['clf_ratio'] = 0.5
        self._configs['batchsize'] = 10000
        self._configs['depth'] = 2 # 1 or 2
        self._configs['neg_num'] = 20 # number of negative samples for unsupervised training
        self._configs['act'] = tf.nn.relu
        self._configs['features'] = None
        self._configs['adj_lists'] = None
        self._configs['labels'] = None
        self._configs['node_map'] = None
        self._configs['gcn'] = True # whether to add self-loops as in GCN
        self._configs['concat'] = True # whether to concatenate each node with its neighbors
        self._configs['supervised'] = False # whether to use supervised training
        self._configs['aggregator'] = 'mean' # type of aggregator: mean, pooling or lstm

    @property
    def path(self):
        return self._configs['path']

    @property
    def dims(self):
        return self._configs['dims']

    @property
    def lr(self):
        return self._configs['lr']

    @property
    def epochs(self):
        return self._configs['epochs']

    @property
    def num_nodes(self):
        return self._configs['num_nodes']

    @property
    def num_features(self):
        return self._configs['num_features']

    @property
    def num_classes(self):
        return self._configs['num_classes']

    @property
    def features(self):
        return self._configs['features']

    @property
    def adj_lists(self):
        return self._configs['adj_lists']

    @property
    def labels(self):
        return self._configs['labels']

    @property
    def node_map(self):
        return self._configs['node_map']

    @property
    def sample_num(self):
        return self._configs['sample_num']

    @property
    def clf_ratio(self):
        return self._configs['clf_ratio']

    @property
    def batchsize(self):
        return self._configs['batchsize']

    @property
    def depth(self):
        return self._configs['depth']

    @property
    def neg_num(self):
        return self._configs['neg_num']

    @property
    def act(self):
        return self._configs['act']

    @property
    def gcn(self):
        return self._configs['gcn']

    @property
    def concat(self):
        return self._configs['concat']

    @property
    def supervised(self):
        return self._configs['supervised']

    @property
    def aggregator(self):
        return self._configs['aggregator']

    def update_config(self, key, value):
        if key in self._configs.keys():
            self._configs[key] = value
        else:
            raise RuntimeError('Update_Config_Error')

# build the global config and attach the loaded cora data to it at import time
cfg = config()
feat_data, labels, adj_lists, node_map = load_data(cfg)
cfg.update_config('features', feat_data)
cfg.update_config('labels', labels)
cfg.update_config('adj_lists', adj_lists)
cfg.update_config('node_map', node_map)
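Because cfg is constructed and the cora data loaded the moment config is imported, update_config works at runtime for model hyper-parameters but not for 'path' (edit that in config.py itself). A sketch:
```
from config import cfg  # importing this has already run load_data

cfg.update_config('aggregator', 'lstm')  # switch aggregator before building the model
cfg.update_config('depth', 1)            # aggregate only 1-hop neighbourhoods
```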
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#-*-coding:utf-8-*-
import tensorflow as tf
import numpy as np
from sklearn.metrics import f1_score
from collections import defaultdict
import time
import random
from config import cfg
from aggregator import *
import networkx as nx
import itertools as it
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
# tf.enable_eager_execution()

class graphsage():
    def __init__(self):
        self.cfg = cfg
        self.features = tf.Variable(self.cfg.features, dtype=tf.float32, trainable=False)
        if self.cfg.aggregator == 'mean':
            self.aggregator = mean_aggregator
        elif self.cfg.aggregator == 'pooling':
            self.aggregator = pooling_aggregator
        elif self.cfg.aggregator == 'lstm':
            self.aggregator = lstm_aggregator
        else:
            raise Exception("Invalid aggregator!")
        self.placeholders = self.build_placeholders()

    def build_placeholders(self):
        placeholders = {}
        if self.cfg.gcn:
            neigh_size = self.cfg.sample_num + 1
        else:
            neigh_size = self.cfg.sample_num
        placeholders['batchnodes'] = tf.placeholder(shape=(None), dtype=tf.int32)
        placeholders['samp_neighs_1st'] = tf.placeholder(shape=(None, neigh_size), dtype=tf.int32)
        if self.cfg.depth == 2:
            placeholders['samp_neighs_2nd'] = tf.placeholder(shape=(None, neigh_size, neigh_size), dtype=tf.int32)
        if self.cfg.supervised:
            placeholders['labels'] = tf.placeholder(shape=(None), dtype=tf.int32)
        else:
            placeholders['input_1'] = tf.placeholder(shape=(None), dtype=tf.int32)
            placeholders['input_2'] = tf.placeholder(shape=(None), dtype=tf.int32)
            placeholders['input_3'] = tf.placeholder(shape=(None), dtype=tf.int32)
        return placeholders

    def construct_feed_dict_sup(self, nodes=None, samp_neighs_1st=None, samp_neighs_2nd=None, labels=None):
        feed_dict = {}
        feed_dict.update({self.placeholders['batchnodes']: nodes})
        feed_dict.update({self.placeholders['samp_neighs_1st']: samp_neighs_1st})
        feed_dict.update({self.placeholders['labels']: labels})
        if self.cfg.depth == 2:
            feed_dict.update({self.placeholders['samp_neighs_2nd']: samp_neighs_2nd})
        return feed_dict

    def construct_feed_dict_unsup(self, nodes=None, samp_neighs_1st=None, samp_neighs_2nd=None, input_1=None, input_2=None, input_3=None):
        ### Note: here labels are used for evaluation rather than training ###
        feed_dict = {}
        feed_dict.update({self.placeholders['batchnodes']: nodes})
        feed_dict.update({self.placeholders['samp_neighs_1st']: samp_neighs_1st})
        feed_dict.update({self.placeholders['input_1']: input_1})
        feed_dict.update({self.placeholders['input_2']: input_2})
        feed_dict.update({self.placeholders['input_3']: input_3})
        if self.cfg.depth == 2:
            feed_dict.update({self.placeholders['samp_neighs_2nd']: samp_neighs_2nd})
        return feed_dict

    def sample_neighs(self, nodes):
        # sample a fixed-size neighbourhood per node (with replacement when the
        # node has fewer than sample_num neighbours)
        _sample = np.random.choice
        neighs = [list(self.cfg.adj_lists[int(node)]) for node in nodes]
        samp_neighs = [list(_sample(neigh, self.cfg.sample_num, replace=False)) if len(neigh) >= self.cfg.sample_num
                       else list(_sample(neigh, self.cfg.sample_num, replace=True)) for neigh in neighs]
        if self.cfg.gcn:
            samp_neighs = [samp_neigh + [nodes[i]] for i, samp_neigh in enumerate(samp_neighs)]
        if self.cfg.aggregator == 'lstm':
            # for lstm we need to shuffle the node order
            samp_neighs = [list(np.random.permutation(x)) for x in samp_neighs]
        return samp_neighs

    def forward(self):
        ### Here we set the aggregation depth as 2 ###
        if self.cfg.depth == 2:
            agg_2nd = tf.map_fn(fn=lambda x: self.aggregator(tf.nn.embedding_lookup(self.features, x[0]), tf.nn.embedding_lookup(self.features, x[1]), self.cfg.dims, 'agg_2nd'),
                                elems=(self.placeholders['samp_neighs_1st'], self.placeholders['samp_neighs_2nd']), dtype=tf.float32)
            # the node's own 1-hop aggregation shares weights with agg_2nd via AUTO_REUSE
            node_features = self.aggregator(tf.nn.embedding_lookup(self.features, self.placeholders['batchnodes']), tf.nn.embedding_lookup(self.features, self.placeholders['samp_neighs_1st']), self.cfg.dims, 'agg_2nd')
            agg_1st = self.aggregator(node_features, agg_2nd, self.cfg.dims, 'agg_1st')
        else:
            agg_1st = self.aggregator(tf.nn.embedding_lookup(self.features, self.placeholders['batchnodes']), tf.nn.embedding_lookup(self.features, self.placeholders['samp_neighs_1st']),
                                      self.cfg.dims, 'agg_1st')
        return agg_1st

    def sess(self):
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.allow_growth = True
        sess = tf.InteractiveSession(config=gpu_config)
        init = tf.global_variables_initializer()
        sess.run(init)
        return sess

    def supervised(self, inputs, labels):
        preds = tf.layers.dense(inputs, units=self.cfg.num_classes, activation=None)
        labels = tf.one_hot(labels, depth=self.cfg.num_classes)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=preds)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1)), tf.float32))
        return loss, accuracy
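    # random_walk below generates the positive co-occurrence pairs (u, v) for the
    # unsupervised objective of the GraphSAGE paper,
    #   J(z_u) = -log(sigmoid(z_u . z_v)) - neg_num * E_vn[log(sigmoid(-z_u . z_vn))],
    # where v is visited on a short walk from u; sample() below presumably draws
    # the neg_num negatives vn according to the distribution p.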
    def random_walk(self, num_walks=50, walk_length=4):
        # rebuild the citation graph and collect (start, visited) pairs from
        # num_walks short random walks per node
        G = nx.Graph()
        node_map = self.cfg.node_map
        with open(cfg.path + 'cora.cites', 'r') as f:
            for line in f:
                ls = line.strip().split()
                G.add_edge(node_map[ls[0]], node_map[ls[1]])
        nodes = list(G.nodes())
        degrees = [G.degree(x) for x in nodes]
        walk_pairs = []
        for n in nodes:
            if G.degree(n) == 0:
                continue
            for j in range(num_walks):
                current_n = n
                for k in range(walk_length + 1):
                    neigs = list(G.neighbors(current_n))
                    if len(neigs) > 0:
                        next_n = random.choice(neigs)
                    else:
                        break
                    if current_n != n:
                        walk_pairs.append((n, current_n))
                    current_n = next_n
        random.shuffle(walk_pairs)
        return walk_pairs, nodes, degrees

    def sample(self, pos_nodes, nodes, p):
        sample_nodes = []
        while len(sample_nodes)