├── README
├── sup.png
├── unsup.png
├── utils.py
├── README.md
├── cora
│   ├── README
│   └── cora.cites
├── aggregator.py
├── config.py
└── main.py
/README:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/sup.png
--------------------------------------------------------------------------------
/unsup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yusonghust/graphsage_tf/HEAD/unsup.png
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import numpy as np
3 | def load_data(cfg):
4 | num_nodes = cfg.num_nodes
5 | num_feats = cfg.num_features
6 | feat_data = np.zeros((num_nodes, num_feats))
7 | labels = np.empty((num_nodes), dtype=np.int64)
8 | node_map = {}
9 | label_map = {}
10 | with open(cfg.path + 'cora.content') as fp:
11 | for i,line in enumerate(fp):
12 | info = line.strip().split()
13 |             # columns: [paper_id, 1433 binary word features..., class_label]
14 | feat_data[i,:] = [float(x) for x in info[1:-1]]
15 | node_map[info[0]] = i
16 |             if info[-1] not in label_map:
17 | label_map[info[-1]] = len(label_map)
18 | labels[i] = label_map[info[-1]]
19 |
20 | adj_lists = defaultdict(set)
21 | with open(cfg.path + 'cora.cites') as fp:
22 | for i,line in enumerate(fp):
23 | info = line.strip().split()
24 | paper1 = node_map[info[0]]
25 | paper2 = node_map[info[1]]
26 | adj_lists[paper1].add(paper2)
27 | adj_lists[paper2].add(paper1)
28 | return feat_data, labels, adj_lists, node_map
--------------------------------------------------------------------------------
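For reference, a minimal sketch of what `load_data` produces on Cora (shapes follow the defaults in config.py, which calls `load_data` at import time and stores the results on `cfg`):

```python
# Sketch only: assumes cora.content / cora.cites exist under cfg.path.
from config import cfg

print(cfg.features.shape)     # (2708, 1433) binary bag-of-words features
print(cfg.labels.shape)       # (2708,) integer class labels in [0, 7)
print(len(cfg.adj_lists[0]))  # neighbor count of node 0 (undirected adjacency)
```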
/README.md:
--------------------------------------------------------------------------------
1 | # Graphsage
2 | TensorFlow implementation of ['Inductive Representation Learning on Large Graphs'](http://papers.nips.cc/paper/6703-inductive-representation-learning-on-large-graphs)
3 |
4 | ## Introduction
5 | A TensorFlow re-implementation of GraphSAGE that is simpler to follow than the [original implementation](https://github.com/williamleif/GraphSAGE).
6 | The code includes supervised and unsupervised versions, and three types of aggregators ('mean', 'pooling' and 'lstm').
7 |
8 | ## Requirements
9 | python 3.6, tensorflow 1.12.0
10 |
11 | ## Usage
12 | All GraphSAGE hyperparameters can be viewed and modified in config.py.
13 | To run the code, use:
14 | ```
15 | python main.py
16 | ```
17 |
18 | ## Results
19 | Below are the accuracies of supervised and unsupervised GraphSAGE with the 'mean' aggregator.
20 |
21 | The supervised GraphSAGE reaches an accuracy of 0.871:
22 |
23 | 
24 |
25 |
26 | The unsupervised GraphSAGE reaches an accuracy of 0.790:
27 |
28 | 
29 |
--------------------------------------------------------------------------------
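The README defers all hyperparameters to config.py; below is a hedged sketch of adjusting them programmatically with the `update_config` method defined there (key names are the ones that appear in config.py):

```python
# Sketch only: tweak settings before building the model in main.py.
from config import cfg

cfg.update_config('supervised', True)       # switch to supervised training
cfg.update_config('aggregator', 'pooling')  # 'mean', 'pooling' or 'lstm'
cfg.update_config('epochs', 20)
```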
/cora/README:
--------------------------------------------------------------------------------
1 | This directory contains a selection of the Cora dataset (www.research.whizbang.com/data).
2 |
3 | The Cora dataset consists of Machine Learning papers. These papers are classified into one of the following seven classes:
4 | Case_Based
5 | Genetic_Algorithms
6 | Neural_Networks
7 | Probabilistic_Methods
8 | Reinforcement_Learning
9 | Rule_Learning
10 | Theory
11 |
12 | The papers were selected in a way such that in the final corpus every paper cites or is cited by at least one other paper. There are 2708 papers in the whole corpus.
13 |
14 | After stemming and removing stopwords we were left with a vocabulary of size 1433 unique words. All words with document frequency less than 10 were removed.
15 |
16 |
17 | THE DIRECTORY CONTAINS TWO FILES:
18 |
19 | The .content file contains descriptions of the papers in the following format:
20 |
21 | <paper_id> <word_attributes>+ <class_label>
22 |
23 | The first entry in each line contains the unique string ID of the paper followed by binary values indicating whether each word in the vocabulary is present (indicated by 1) or absent (indicated by 0) in the paper. Finally, the last entry in the line contains the class label of the paper.
24 |
25 | The .cites file contains the citation graph of the corpus. Each line describes a link in the following format:
26 |
27 | <ID of cited paper> <ID of citing paper>
28 |
29 | Each line contains two paper IDs. The first entry is the ID of the paper being cited and the second is the ID of the paper containing the citation. The direction of the link is from right to left: if a line reads "paper1 paper2", the link is "paper2->paper1".
--------------------------------------------------------------------------------
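The format description above maps directly onto a couple of lines of parsing code; a minimal sketch (paths assume this `cora/` directory):

```python
# Sketch: read the first line of each file in the format described above.
with open('cora/cora.content') as fp:
    parts = fp.readline().split()
    paper_id, word_attrs, label = parts[0], parts[1:-1], parts[-1]

with open('cora/cora.cites') as fp:
    cited, citing = fp.readline().split()  # link direction: citing -> cited
```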
/aggregator.py:
--------------------------------------------------------------------------------
1 | #-*-coding:utf-8-*-
2 | import tensorflow as tf
3 | import numpy as np
4 | from config import cfg
5 | import random
6 | import os
7 | os.environ["CUDA_VISIBLE_DEVICES"]='1'
8 | # tf.enable_eager_execution()
9 | def mean_aggregator(node_features,neigh_features,out_dims,scope_name):
10 |     # mean aggregator: h_v^k = σ(W · MEAN({h_v^{k-1}} ∪ {h_u^{k-1}, ∀u ∈ N(v)}))
11 | with tf.variable_scope(scope_name,reuse=tf.AUTO_REUSE) as scope:
12 | if cfg.concat:
13 | node_embed = tf.expand_dims(node_features,1)
14 | to_feats = tf.concat([neigh_features,node_embed],1)
15 | else:
16 | to_feats = neigh_features
17 | combined = tf.reduce_mean(tf.layers.dense(to_feats,units=out_dims,activation=cfg.act),axis=1)
18 | return combined
19 |
20 | def pooling_aggregator(node_features,neigh_features,out_dims,scope_name):
21 |     # pooling aggregator: max({σ(W_pool · h_u^k + b), ∀u_i ∈ N(v)})
22 | with tf.variable_scope(scope_name,reuse=tf.AUTO_REUSE) as scope:
23 | if cfg.concat:
24 | node_embed = tf.expand_dims(node_features,1)
25 | to_feats = tf.concat([neigh_features,node_embed],1)
26 | else:
27 | to_feats = neigh_features
28 | combined = tf.reduce_max(tf.layers.dense(to_feats,units=out_dims,activation=cfg.act),axis=1)
29 | return combined
30 |
31 | def lstm_aggregator(node_features,neigh_features,out_dims,scope_name):
32 |     # LSTM aggregator: run an LSTM over the (randomly permuted) neighbor sequence
33 |     # and use the final hidden state as the aggregated representation.
34 |     with tf.variable_scope(scope_name,reuse=tf.AUTO_REUSE) as scope:
35 |         if cfg.concat:
36 |             node_embed = tf.expand_dims(node_features,1)
37 |             to_feats = tf.concat([neigh_features,node_embed],1)
38 |         else:
39 |             to_feats = neigh_features
40 |         lstm = tf.contrib.rnn.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(out_dims) for _ in range(1)], state_is_tuple=True)
41 |         init_state = lstm.zero_state(tf.shape(to_feats)[0], dtype=tf.float32)
42 |         outputs, state = tf.nn.dynamic_rnn(lstm, inputs=to_feats, initial_state=init_state, time_major=False)
43 |         combined = state[-1][1]  # the hidden state h of the last layer's LSTMStateTuple
44 |         return combined
--------------------------------------------------------------------------------
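All three aggregators share the same interface: given the target nodes' features and their sampled neighbors' features, they return one vector per node. A hedged shape sketch under TF 1.x graph mode (note that importing `aggregator` also triggers the data loading in config.py):

```python
# Sketch: node_features (batch, in_dim), neigh_features (batch, k, in_dim) -> (batch, out_dims)
import tensorflow as tf
from aggregator import mean_aggregator

nodes  = tf.placeholder(tf.float32, (None, 1433))      # target node features
neighs = tf.placeholder(tf.float32, (None, 10, 1433))  # 10 sampled neighbors each
out = mean_aggregator(nodes, neighs, 128, 'demo')
print(out.shape)  # (?, 128)
```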
/config.py:
--------------------------------------------------------------------------------
1 | #-*-coding:utf-8-*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import os
5 | from utils import load_data
6 |
7 | class config():
8 | def __init__(self):
9 | self._configs = {}
10 |
11 |         self._configs['path'] = './cora/'  # relative path to the cora dataset directory
12 | self._configs['dims'] = 128
13 | self._configs['lr'] = 0.01
14 | self._configs['epochs'] = 10
15 | self._configs['num_nodes'] = 2708
16 | self._configs['num_features'] = 1433
17 | self._configs['num_classes'] = 7
18 | self._configs['sample_num'] = 10
19 | self._configs['clf_ratio'] = 0.5
20 | self._configs['batchsize'] = 10000
21 | self._configs['depth'] = 2 # 1 or 2
22 | self._configs['neg_num'] = 20 # negative sampling number for unsupervised training
23 | self._configs['act'] = tf.nn.relu
24 | self._configs['features'] = None
25 | self._configs['adj_lists'] = None
26 | self._configs['labels'] = None
27 | self._configs['node_map'] = None
28 | self._configs['gcn'] = True # whether add self-loop as gcn
29 | self._configs['concat'] = True # whether concat nodes with its neighbors
30 | self._configs['supervised'] = False # whether supervised training
31 | self._configs['aggregator'] = 'mean' # type of aggregators: mean,pooling,lstm
32 |
33 | @property
34 | def path(self):
35 | return self._configs['path']
36 |
37 | @property
38 | def dims(self):
39 | return self._configs['dims']
40 |
41 | @property
42 | def lr(self):
43 | return self._configs['lr']
44 |
45 | @property
46 | def epochs(self):
47 | return self._configs['epochs']
48 |
49 | @property
50 | def num_nodes(self):
51 | return self._configs['num_nodes']
52 |
53 | @property
54 | def num_features(self):
55 | return self._configs['num_features']
56 |
57 | @property
58 | def num_classes(self):
59 | return self._configs['num_classes']
60 |
61 | @property
62 | def features(self):
63 | return self._configs['features']
64 |
65 | @property
66 | def adj_lists(self):
67 | return self._configs['adj_lists']
68 |
69 | @property
70 | def labels(self):
71 | return self._configs['labels']
72 |
73 | @property
74 | def node_map(self):
75 | return self._configs['node_map']
76 |
77 | @property
78 | def sample_num(self):
79 | return self._configs['sample_num']
80 |
81 | @property
82 | def clf_ratio(self):
83 | return self._configs['clf_ratio']
84 |
85 | @property
86 | def batchsize(self):
87 | return self._configs['batchsize']
88 |
89 | @property
90 | def depth(self):
91 | return self._configs['depth']
92 |
93 | @property
94 | def neg_num(self):
95 | return self._configs['neg_num']
96 |
97 | @property
98 | def act(self):
99 | return self._configs['act']
100 |
101 | @property
102 | def gcn(self):
103 | return self._configs['gcn']
104 |
105 | @property
106 | def concat(self):
107 | return self._configs['concat']
108 |
109 | @property
110 | def supervised(self):
111 | return self._configs['supervised']
112 |
113 | @property
114 | def aggregator(self):
115 | return self._configs['aggregator']
116 |
117 |     def update_config(self,key,value):
118 |         if key in self._configs:
119 |             self._configs[key] = value
120 |         else:
121 |             raise KeyError(f'Unknown config key: {key}')
122 |
123 | cfg = config()
124 | feat_data, labels, adj_lists, node_map = load_data(cfg)
125 | cfg.update_config('features',feat_data)
126 | cfg.update_config('labels',labels)
127 | cfg.update_config('adj_lists',adj_lists)
128 | cfg.update_config('node_map',node_map)
--------------------------------------------------------------------------------
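A note on the design: all settings live in one private dict exposed through read-only properties, so the only mutation path is `update_config`, which rejects unknown keys. A minimal sketch:

```python
from config import cfg

cfg.update_config('depth', 1)       # ok: existing key
try:
    cfg.update_config('dpeth', 1)   # typo in the key name
except KeyError as e:
    print(e)                        # 'Unknown config key: dpeth'
```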
/main.py:
--------------------------------------------------------------------------------
1 | #-*-coding:utf-8-*-
2 | import tensorflow as tf
3 | import numpy as np
4 | from sklearn.metrics import f1_score
5 | from collections import defaultdict
6 | import time
7 | import random
8 | from config import cfg
9 | from aggregator import *
10 | import networkx as nx
11 | import itertools as it
12 | from sklearn import preprocessing
13 | from sklearn.linear_model import LogisticRegression
14 | import os
15 | os.environ["CUDA_VISIBLE_DEVICES"]='1'
16 | # tf.enable_eager_execution()
17 |
18 | class graphsage():
19 | def __init__(self):
20 | self.cfg = cfg
21 | self.features = tf.Variable(self.cfg.features,dtype=tf.float32,trainable=False)
22 | if self.cfg.aggregator == 'mean':
23 | self.aggregator = mean_aggregator
24 | elif self.cfg.aggregator == 'pooling':
25 |             self.aggregator = pooling_aggregator
26 | elif self.cfg.aggregator == 'lstm':
27 | self.aggregator = lstm_aggregator
28 | else:
29 |             raise ValueError('Invalid aggregator: %s' % self.cfg.aggregator)
30 | self.placeholders = self.build_placeholders()
31 |
32 | def build_placeholders(self):
33 | placeholders = {}
34 | if self.cfg.gcn:
35 | neigh_size = self.cfg.sample_num + 1
36 | else:
37 | neigh_size = self.cfg.sample_num
38 | placeholders['batchnodes'] = tf.placeholder(shape=(None),dtype=tf.int32)
39 | placeholders['samp_neighs_1st'] = tf.placeholder(shape=(None,neigh_size),dtype=tf.int32)
40 | if self.cfg.depth==2:
41 | placeholders['samp_neighs_2nd'] = tf.placeholder(shape=(None,neigh_size,neigh_size),dtype=tf.int32)
42 | if self.cfg.supervised:
43 | placeholders['labels'] = tf.placeholder(shape=(None),dtype=tf.int32)
44 | else:
45 | placeholders['input_1'] = tf.placeholder(shape=(None),dtype=tf.int32)
46 | placeholders['input_2'] = tf.placeholder(shape=(None),dtype=tf.int32)
47 | placeholders['input_3'] = tf.placeholder(shape=(None),dtype=tf.int32)
48 | return placeholders
49 |
50 | def construct_feed_dict_sup(self,nodes=None,samp_neighs_1st=None,samp_neighs_2nd=None,labels=None):
51 | feed_dict = {}
52 | feed_dict.update({self.placeholders['batchnodes']:nodes})
53 | feed_dict.update({self.placeholders['samp_neighs_1st']:samp_neighs_1st})
54 | feed_dict.update({self.placeholders['labels']:labels})
55 | if self.cfg.depth==2:
56 | feed_dict.update({self.placeholders['samp_neighs_2nd']:samp_neighs_2nd})
57 | return feed_dict
58 |
59 | def construct_feed_dict_unsup(self,nodes=None,samp_neighs_1st=None,samp_neighs_2nd=None,input_1=None,input_2=None,input_3=None):
60 |         ### note: in the unsupervised setting, labels are only used for evaluation, not for training ###
61 | feed_dict = {}
62 | feed_dict.update({self.placeholders['batchnodes']:nodes})
63 | feed_dict.update({self.placeholders['samp_neighs_1st']:samp_neighs_1st})
64 | feed_dict.update({self.placeholders['input_1']:input_1})
65 | feed_dict.update({self.placeholders['input_2']:input_2})
66 | feed_dict.update({self.placeholders['input_3']:input_3})
67 | if self.cfg.depth==2:
68 | feed_dict.update({self.placeholders['samp_neighs_2nd']:samp_neighs_2nd})
69 | return feed_dict
70 |
71 | def sample_neighs(self,nodes):
72 | _sample = np.random.choice
73 | neighs = [list(self.cfg.adj_lists[int(node)]) for node in nodes]
74 |         samp_neighs = [list(_sample(neigh,self.cfg.sample_num,replace=False)) if len(neigh)>=self.cfg.sample_num else list(_sample(neigh,self.cfg.sample_num,replace=True)) for neigh in neighs]
75 | if self.cfg.gcn:
76 |             samp_neighs = [samp_neigh + [nodes[i]] for i, samp_neigh in enumerate(samp_neighs)]
77 | if self.cfg.aggregator=='lstm':
78 | # for lstm we need to shuffle the node order
79 | samp_neighs = [list(np.random.permutation(x)) for x in samp_neighs]
80 | return samp_neighs
81 |
82 | def forward(self):
83 |         ### two-hop aggregation when cfg.depth == 2, otherwise one-hop ###
84 | if self.cfg.depth==2:
85 | agg_2nd = tf.map_fn(fn = lambda x:self.aggregator(tf.nn.embedding_lookup(self.features,x[0]),tf.nn.embedding_lookup(self.features,x[1]),self.cfg.dims,'agg_2nd'),
86 | elems=(self.placeholders['samp_neighs_1st'],self.placeholders['samp_neighs_2nd']),dtype=tf.float32)
87 | node_features = self.aggregator(tf.nn.embedding_lookup(self.features,self.placeholders['batchnodes']),tf.nn.embedding_lookup(self.features,self.placeholders['samp_neighs_1st']),self.cfg.dims,'agg_2nd')
88 | agg_1st = self.aggregator(node_features,agg_2nd,self.cfg.dims,'agg_1st')
89 | else:
90 | agg_1st = self.aggregator(tf.nn.embedding_lookup(self.features,self.placeholders['batchnodes']),tf.nn.embedding_lookup(self.features,self.placeholders['samp_neighs_1st']),
91 | self.cfg.dims,'agg_1st')
92 | return agg_1st
93 |
94 | def sess(self):
95 | gpu_config = tf.ConfigProto()
96 | gpu_config.gpu_options.allow_growth = True
97 | sess = tf.InteractiveSession(config=gpu_config)
98 | init = tf.global_variables_initializer()
99 | sess.run(init)
100 | return sess
101 |
102 | def supervised(self,inputs,labels):
103 | preds = tf.layers.dense(inputs,units=self.cfg.num_classes,activation=None)
104 | labels = tf.one_hot(labels,depth=self.cfg.num_classes)
105 | loss = tf.losses.softmax_cross_entropy(onehot_labels=labels,logits=preds)
106 |         accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(preds,1),tf.argmax(labels,1)),tf.float32))
107 |         return loss,accuracy
108 |
109 | def random_walk(self,num_walks=50,walk_length=4):
110 | G = nx.Graph()
111 | node_map = self.cfg.node_map
112 | with open(cfg.path + 'cora.cites','r') as f:
113 | for line in f:
114 | ls = line.strip().split()
115 | G.add_edge(node_map[ls[0]],node_map[ls[1]])
116 |         # (file is closed automatically by the with-block)
117 | nodes = list(G.nodes())
118 | degrees = [G.degree(x) for x in nodes]
119 | walk_pairs = []
120 | for n in nodes:
121 | if G.degree(n) == 0:
122 | continue
123 | for j in range(num_walks):
124 | current_n = n
125 | for k in range(walk_length+1):
126 | neigs = list(G.neighbors(current_n))
127 | if len(neigs)>0:
128 | next_n = random.choice(neigs)
129 | else:
130 | break
131 | if current_n != n:
132 | walk_pairs.append((n,current_n))
133 | current_n = next_n
134 | random.shuffle(walk_pairs)
135 | return walk_pairs,nodes,degrees
136 |
137 | def sample(self,pos_nodes,nodes,p):
138 | sample_nodes = []
139 | while len(sample_nodes)