├── .gitignore
├── 3d_network_plot.py
├── Centrality.py
├── Link_Prediction.py
├── GenerateOtherDatasets.py
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
--------------------------------------------------------------------------------
/3d_network_plot.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[7]:
5 |
6 | import igraph as ig
7 | import json
8 | import urllib2
9 |
10 | data = []
11 | req = urllib2.Request("https://raw.githubusercontent.com/plotly/datasets/master/miserables.json")
12 | opener = urllib2.build_opener()
13 | f = opener.open(req)
14 | data = json.loads(f.read())
15 |
16 |
17 | # In[16]:
18 |
19 | L=len(data['links'])
20 | Edges=[(data['links'][k]['source'], data['links'][k]['target']) for k in range(L)]
21 |
22 | Gp=ig.Graph(Edges, directed=False)
23 |
24 |
25 | # In[19]:
26 |
27 | print(Edges[0])
28 |
29 |
30 | # In[3]:
31 |
32 | labels=[]
33 | group=[]
34 |
35 |
36 | for node in data['nodes']:
37 | labels.append(node['name'])
38 | group.append(node['group'])
39 |
40 |
41 | # In[25]:
42 |
43 | from igraph import *
44 | G=Graph()
45 | def addVertex(g,name_str):
46 |     # Insert a vertex named name_str only if it is absent. The KeyError
47 |     # branch covers the very first insert, when the graph has no 'name'
48 |     # vertex attribute yet.
49 |     try:
50 |         if(name_str not in g.vs['name']):
51 |             print('Inserted node ',name_str)
52 |             g.add_vertex(name=name_str)
53 |         else:
54 |             print('Node ',name_str,' already present')
55 |             print(g.vs.find(name_str).index)
56 |     except KeyError:
57 |         g.add_vertex(name=name_str)
58 |     return g
56 |
57 |
58 |
59 | def write_tuple_to_file(f,t):
60 | string=str(t[0])+' '+str(t[1])+'\n'
61 | f.write(string)
62 |
63 | def retrieve_edge_name_tuple(g,t):
64 | a=(g.vs[t[0]]['name'],g.vs[t[1]]['name'])
65 | return a
66 |
67 |
68 | def load_dataset(g):
69 |     # Read each listed ego-network edge file and grow the graph in place.
70 |     fileNums=[0]
71 |     for eachNum in fileNums:
72 |         fileName="Datasets/facebook/edges/"+str(eachNum)+".edges"
73 |         print('fileName=',fileName)
74 |         f=open(fileName)
75 |         for line in f:
76 |             c=line.split()
77 |             g=addVertex(g,c[0])
78 |             g=addVertex(g,c[1])
79 |             print('Adding ',c[0],'-->',c[1])
80 |             g.add_edge(c[0],c[1])
81 |         f.close()
82 |         g.simplify()
83 |     return
85 |
86 | load_dataset(G)
87 |
88 |
89 | N=len(G.vs)
90 | layt=G.layout('kk', dim=3)
91 |
92 | labels=[]
93 | print(type(labels))
94 | for eachNde in G.vs:
95 | labels.append(eachNde['name'])
96 |
97 | Edges=list()
98 | print(type(Edges))
99 | for eachTuple in G.es:
100 | Edges.append(eachTuple.tuple)
101 |
102 | Xn=[layt[k][0] for k in range(N)]# x-coordinates of nodes
103 | Yn=[layt[k][1] for k in range(N)]# y-coordinates
104 | Zn=[layt[k][2] for k in range(N)]# z-coordinates
105 | Xe=[]
106 | Ye=[]
107 | Ze=[]
108 |
109 | for e in Edges:
110 | Xe+=[layt[e[0]][0],layt[e[1]][0], None]# x-coordinates of edge ends
111 | Ye+=[layt[e[0]][1],layt[e[1]][1], None]
112 | Ze+=[layt[e[0]][2],layt[e[1]][2], None]
113 |
114 | import plotly
115 | plotly.tools.set_credentials_file(username='prerna_237', api_key='DXXXKP8XPO3FBUWsH4NY')
116 |
117 |
118 | # In[63]:
119 | 
120 | # 'l' supplies one colour value per vertex for the plot below. Node degree
121 | # is assumed here as the colouring value.
122 | l=G.degree()
123 | 
124 | print(len(l))
126 |
127 |
128 | # In[71]:
129 |
130 | import plotly.plotly as py
131 | from plotly.graph_objs import *
132 |
133 |
134 | trace1=Scatter3d(x=Xe,
135 | y=Ye,
136 | z=Ze,
137 | mode='lines',
138 | line=Line(color='rgb(125,125,125)', width=1),
139 | hoverinfo='none'
140 | )
141 |
142 | trace2=Scatter3d(x=Xn,
143 | y=Yn,
144 | z=Zn,
145 | mode='markers',
146 | name='actors',
147 | marker=Marker(symbol='dot',
148 | color=l,
149 | size=6,colorbar=ColorBar(
150 | title='Colorbar'
151 | ),
152 | colorscale='Viridis',
153 | line=Line(color='rgb(158,18,130)', width=0.5)
154 | ),
155 | text=labels,
156 | hoverinfo='text'
157 | )
158 |
159 | axis=dict(showbackground=False,
160 | showline=False,
161 | zeroline=False,
162 | showgrid=False,
163 | showticklabels=False,
164 | title=''
165 | )
166 |
167 | layout = Layout(
168 | title="3D Visualization of the Facebook nodes",
169 | width=1000,
170 | height=1000,
171 | showlegend=False,
172 | scene=Scene(
173 | xaxis=XAxis(axis),
174 | yaxis=YAxis(axis),
175 | zaxis=ZAxis(axis),
176 | ),
177 | margin=Margin(
178 | t=100
179 | ),
180 | hovermode='closest',
181 | annotations=Annotations([
182 | Annotation(
183 | showarrow=False,
184 | # text="Data source: [1] miserables.json",
185 | xref='paper',
186 | yref='paper',
187 | x=0,
188 | y=0.1,
189 | xanchor='left',
190 | yanchor='bottom',
191 | font=Font(
192 | size=14
193 | )
194 | )
195 | ]), )
196 |
197 | data=Data([trace1, trace2])
198 | fig=Figure(data=data, layout=layout)
199 |
200 | py.iplot(fig)
201 |
202 |
--------------------------------------------------------------------------------
/Centrality.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[ ]:
5 |
6 | # Add code to visualize the centrality of the graph. Basically this section is to get an idea about the structure of the graph
7 |
8 |
9 | # In[2]:
10 |
11 | from igraph import *
12 | g=Graph()
13 |
14 |
15 | # In[3]:
16 |
17 |
18 | def addVertex(g,name_str):
19 | try:
20 | if(name_str not in g.vs['name']):
21 | print('Inserted node ',name_str)
22 | g.add_vertex(name=name_str)
23 | else:
24 | print ('Node ',name_str,' already present')
25 | print(g.vs.find(name_str).index)
26 | except KeyError:
27 | g.add_vertex(name=name_str)
28 | return g
29 |
30 |
31 |
32 | def write_tuple_to_file(f,t):
33 | string=str(t[0])+' '+str(t[1])+'\n'
34 | f.write(string)
35 |
36 | def retrieve_edge_name_tuple(g,t):
37 | a=(g.vs[t[0]]['name'],g.vs[t[1]]['name'])
38 | return a
39 |
40 |
41 | # In[4]:
42 |
43 |
44 | def load_dataset(g):
45 |     # Read each listed ego-network edge file and grow the graph in place.
46 |     fileNums=[0]
47 |     for eachNum in fileNums:
48 |         fileName="Datasets/facebook/edges/"+str(eachNum)+".edges"
49 |         print('fileName=',fileName)
50 |         f=open(fileName)
51 |         for line in f:
52 |             c=line.split()
53 |             g=addVertex(g,c[0])
54 |             g=addVertex(g,c[1])
55 |             print('Adding ',c[0],'-->',c[1])
56 |             g.add_edge(c[0],c[1])
57 |         f.close()
58 |         g.simplify()
59 |     return
61 |
62 |
63 | # In[5]:
64 |
65 | load_dataset(g)
66 | print(len(g.vs))
67 |
68 |
69 | # In[7]:
70 |
71 | print(len(g.vs))
72 |
73 |
74 | # In[20]:
75 |
76 | def calculate_eigen(g):
77 |     eigen=g.evcent(directed=False)
78 |     # Rank vertices by score; sorting keeps vertex indices valid, unlike
79 |     # repeatedly removing the maximum from the list, which shifts them.
80 |     ranked=sorted(range(len(eigen)),key=lambda v:eigen[v],reverse=True)
81 |     for i,v in enumerate(ranked[:5],start=1):
82 |         print(i,'==node',g.vs[v]['name'],' with score of ',eigen[v])
83 |     return eigen
84 |
85 |
86 | # In[21]:
87 |
88 | def calculate_closeness(g):
89 |     close=g.closeness(g.vs)
90 |     # Same sorted ranking as calculate_eigen, to keep indices valid.
91 |     ranked=sorted(range(len(close)),key=lambda v:close[v],reverse=True)
92 |     for i,v in enumerate(ranked[:5],start=1):
93 |         print(i,'==node',g.vs[v]['name'],' with score of ',close[v])
94 |     return close
96 |
97 |
98 | # In[22]:
99 |
100 | def calculate_between(g):
101 |     between=g.betweenness(g.vs)
102 |     # Same sorted ranking as calculate_eigen, to keep indices valid.
103 |     ranked=sorted(range(len(between)),key=lambda v:between[v],reverse=True)
104 |     for i,v in enumerate(ranked[:5],start=1):
105 |         print(i,'==node',g.vs[v]['name'],' with score of ',between[v])
106 |     return between
108 |
109 |
110 | # In[23]:
111 |
112 | print('Eigen Vector')
113 | eigen=calculate_eigen(g)
114 | 
115 | print('Closeness')
116 | close=calculate_closeness(g)
117 | print('Betweenness')
118 | between=calculate_between(g)
122 |
123 |
124 | # In[24]:
125 |
126 | print(close)
127 |
128 |
129 | # In[ ]:
130 |
131 |
132 |
133 |
134 | # In[25]:
135 |
136 | from igraph import *
137 | G=Graph()
138 |
139 | load_dataset(G)
140 |
141 |
142 | N=len(G.vs)
143 | layt=G.layout('kk', dim=3)
144 |
145 | labels=[]
146 | print(type(labels))
147 | for eachNde in G.vs:
148 | labels.append(eachNde['name'])
149 |
150 | Edges=list()
151 | print(type(Edges))
152 | for eachTuple in G.es:
153 | Edges.append(eachTuple.tuple)
154 |
155 | Xn=[layt[k][0] for k in range(N)]# x-coordinates of nodes
156 | Yn=[layt[k][1] for k in range(N)]# y-coordinates
157 | Zn=[layt[k][2] for k in range(N)]# z-coordinates
158 | Xe=[]
159 | Ye=[]
160 | Ze=[]
161 |
162 | for e in Edges:
163 | Xe+=[layt[e[0]][0],layt[e[1]][0], None]# x-coordinates of edge ends
164 | Ye+=[layt[e[0]][1],layt[e[1]][1], None]
165 | Ze+=[layt[e[0]][2],layt[e[1]][2], None]
166 |
167 | import plotly
168 | plotly.tools.set_credentials_file(username='prerna_237', api_key='DXXXKP8XPO3FBUWsH4NY')
169 |
170 |
171 | # In[26]:
172 |
173 | import plotly.plotly as py
174 | from plotly.graph_objs import *
175 |
176 |
177 | trace1=Scatter3d(x=Xe,
178 | y=Ye,
179 | z=Ze,
180 | mode='lines',
181 | line=Line(color='rgb(125,125,125)', width=1),
182 | hoverinfo='none'
183 | )
184 |
185 | trace2=Scatter3d(x=Xn,
186 | y=Yn,
187 | z=Zn,
188 | mode='markers',
189 | name='actors',
190 | marker=Marker(symbol='dot',
191 | color=eigen,
192 | size=6,colorbar=ColorBar(
193 | title='Colorbar'
194 | ),
195 | colorscale='Viridis',
196 | line=Line(color='rgb(158,18,130)', width=0.5)
197 | ),
198 | text=labels,
199 | hoverinfo='text'
200 | )
201 |
202 | axis=dict(showbackground=False,
203 | showline=False,
204 | zeroline=False,
205 | showgrid=False,
206 | showticklabels=False,
207 | title=''
208 | )
209 |
210 | layout = Layout(
211 | title="3D Visualization of the Facebook nodes",
212 | width=1000,
213 | height=1000,
214 | showlegend=False,
215 | scene=Scene(
216 | xaxis=XAxis(axis),
217 | yaxis=YAxis(axis),
218 | zaxis=ZAxis(axis),
219 | ),
220 | margin=Margin(
221 | t=100
222 | ),
223 | hovermode='closest',
224 | annotations=Annotations([
225 | Annotation(
226 | showarrow=False,
227 | # text="Data source: [1] miserables.json",
228 | xref='paper',
229 | yref='paper',
230 | x=0,
231 | y=0.1,
232 | xanchor='left',
233 | yanchor='bottom',
234 | font=Font(
235 | size=14
236 | )
237 | )
238 | ]), )
239 |
240 | data=Data([trace1, trace2])
241 | fig=Figure(data=data, layout=layout)
242 |
243 | py.iplot(fig)
244 |
245 |
--------------------------------------------------------------------------------
/Link_Prediction.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[2]:
5 |
6 | #Reading input feature values using numpy
7 | import numpy as np
8 | from igraph import *
9 | num_of_feat=347
11 |
12 |
13 | # In[3]:
14 |
15 | def load_dataset(fileName,g):
16 | fileNums=[0]
17 | for i,eachNum in enumerate(fileNums):
18 | print(eachNum)
19 | print('fileName=',fileName)
20 | f=open(fileName)
21 | line=f.readline()
22 | while(line!=''):
23 | c=(line.split())
24 | g=addVertex(g,c[0])
25 | g=addVertex(g,c[1])
26 | print('Adding ',c[0],'-->',c[1])
27 | g.add_edge(c[0],c[1])
28 | line=f.readline()
29 | g.simplify()
30 | return
31 |
49 |
50 | def load_and_shape_input(file_name):
51 | a=np.loadtxt(fname=file_name)
52 | slice_D =[a[i][1:] for i in range(0,num_of_feat)]
53 | c=np.asarray(slice_D)
54 | return c
55 |
56 | def load_shape_input(file_name_array):
57 | features=dict()
58 | for eachname in file_name_array:
59 | file_name='Datasets/facebook/'+str(eachname)+'.feat'
60 | a=np.loadtxt(file_name)
61 | for eachFeat in a:
62 | features[eachFeat[0]]=np.asarray(eachFeat[1:])
63 | return features
64 |
65 |
66 | def addVertex(g,name_str):
67 | try:
68 | if(name_str not in g.vs['name']):
69 | print('Inserted node ',name_str)
70 | g.add_vertex(name=name_str)
71 | else:
72 | print ('Node ',name_str,' already present')
73 | print(g.vs.find(name_str).index)
74 | except KeyError:
75 | g.add_vertex(name=name_str)
76 | return g
77 |
78 |
79 |
80 | def write_tuple_to_file(f,t):
81 | string=str(t[0])+' '+str(t[1])+'\n'
82 | f.write(string)
83 |
84 | def retrieve_edge_name_tuple(g,t):
85 | a=(g.vs[t[0]]['name'],g.vs[t[1]]['name'])
86 | return a
87 |
88 |
89 |
90 |
91 | # In[4]:
92 |
93 | # Load feature vectors for ego network 0
94 | li=[0]
95 | node_feat=load_shape_input(li)
96 |
97 |
98 | # In[5]:
99 |
100 | g=Graph()
101 | load_dataset('Datasets/Self_Datasets/sample_train.edges',g)
102 |
103 |
104 | not_g=Graph()
105 | load_dataset('Datasets/Self_Datasets/negative_train.edges',not_g)
106 |
107 |
108 | # In[6]:
109 |
110 | print(type(node_feat))
111 | for eachVal in node_feat.values():
112 |     print(len(eachVal))
113 |     print(type(eachVal))
114 |
115 |
116 | # In[7]:
117 |
118 | # print(node_feat[np.float64(0)])
119 |
120 |
121 | # In[8]:
122 |
123 | # print('positive edges',len(g.es))
124 | # print('negative edges',len(not_g.es))
125 | # t=retrieve_edge_name_tuple(g,(0,1))
126 | # node_feat[np.float64(t[0])]
127 |
128 |
129 | # In[9]:
130 |
131 | def make_class_arrays(g):
132 |     # For every edge (u, v) in g, build one training example as the
133 |     # element-wise sum of the two endpoint feature vectors.
134 |     output_list=list()
135 |     for eachTuple in g.es:
136 |         tuple_name=retrieve_edge_name_tuple(g,eachTuple.tuple)
137 |         print('eachTuple=',tuple_name)
138 |         output=np.add(node_feat[np.float64(tuple_name[0])],node_feat[np.float64(tuple_name[1])])
139 |         output_list.append(output)
140 |     return np.asarray(output_list)
140 |
141 |
142 | # In[10]:
143 |
144 | valid_g=Graph()
145 | load_dataset('Datasets/Self_Datasets/sample_valid.edges',valid_g)
146 | # node_feat=load_and_shape_input("Datasets/facebook/0.feat")
147 |
148 |
149 | valid_not_g=Graph()
150 | load_dataset('Datasets/Self_Datasets/negative_valid.edges',valid_not_g)
151 |
152 |
153 | # In[11]:
154 |
155 | # print(len(node_feat[np.float64(345)]))
156 |
157 |
158 | # In[12]:
159 |
160 | x_positive=make_class_arrays(g)
161 | x_negative=make_class_arrays(not_g)
162 |
163 |
164 | # In[13]:
165 |
166 | print(x_positive.shape)
167 | print(x_negative.shape)
168 |
169 |
170 | # In[14]:
171 |
172 | valid_x_positive=make_class_arrays(valid_g)
173 | valid_x_negative=make_class_arrays(valid_not_g)
174 |
175 |
176 | # In[15]:
177 |
178 | print(valid_x_positive.shape)
179 | print(valid_x_negative.shape)
180 |
181 |
182 | # In[ ]:
183 |
184 |
185 |
186 |
187 | # In[16]:
188 |
189 | y_positive=np.full(shape=(x_positive.shape[0],1),fill_value=1.0)
190 | y_negative=np.full(shape=(x_negative.shape[0],1),fill_value=0.0)
191 |
192 |
193 | # In[17]:
194 |
195 | print(y_positive.shape)
196 | print(y_negative.shape)
197 |
198 |
199 | # In[18]:
200 |
201 | valid_y_positive=np.full(shape=(valid_x_positive.shape[0],1),fill_value=1.0)
202 | valid_y_negative=np.full(shape=(valid_x_negative.shape[0],1),fill_value=0.0)
203 |
204 |
205 | # In[19]:
206 |
207 | print(valid_x_positive.shape)
208 | print(valid_x_negative.shape)
209 | print(valid_y_positive.shape)
210 | print(valid_y_negative.shape)
211 |
212 |
213 | # In[20]:
214 |
215 | print(valid_y_positive.shape)
216 |
217 |
218 | # In[21]:
219 |
220 | train_X=np.append(x_positive,x_negative,axis=0)
221 | train_Y=np.append(y_positive,y_negative,axis=0)
222 |
223 | valid_X=np.append(valid_x_positive,valid_x_negative,axis=0)
224 | valid_Y=np.append(valid_y_positive,valid_y_negative,axis=0)
225 |
226 |
227 | # In[22]:
228 |
229 | print(type(x_positive))
230 | print(valid_X.shape)
231 | print(type(x_negative))
232 | print(valid_Y.shape)
233 | print(type(y_positive))
234 | print(y_positive.shape)
235 | print(train_X.shape)
236 | print(1592+1748)
237 |
238 |
239 | # In[23]:
240 |
241 | from sklearn import linear_model
242 | reg = linear_model.Ridge(alpha=0.5)
243 |
244 |
245 | # In[97]:
246 |
247 | # clf.fit(digits.data[:-1], digits.target[:-1])
248 | reg.fit(X=train_X[:-1],y=train_Y[:-1])
249 |
250 |
251 | # In[98]:
252 |
253 | reg.predict(train_X[-1:])
254 |
255 |
256 | # In[91]:
257 |
258 | len(reg.predict(valid_X))
259 |
260 |
261 | # In[100]:
262 |
263 | np.mean((reg.predict(valid_X)-valid_Y)**2)   # validation mean squared error
264 |
265 |
266 | # In[24]:
267 |
268 | from sklearn.metrics import log_loss
269 | # Ridge outputs are unbounded, so clip them into (0, 1) before log_loss.
270 | log_loss(valid_Y,np.clip(reg.predict(valid_X),1e-6,1-1e-6))
271 |
272 |
273 | # In[29]:
274 |
275 | from sklearn import svm
276 | clf_svm = svm.SVC()
277 | clf_svm.fit(X=train_X[:-1],y=train_Y[:-1].ravel())  # ravel: sklearn expects a 1-D label array
278 |
279 |
280 | # In[31]:
281 |
282 | from sklearn.metrics import log_loss
283 | log_loss(valid_Y,clf_svm.predict(valid_X))
284 |
285 |
286 | # In[ ]:
287 |
288 | from sklearn.neighbors import NearestNeighbors
289 |
290 |
--------------------------------------------------------------------------------
/GenerateOtherDatasets.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[9]:
5 |
6 | # Load train dataset
7 | from igraph import *
8 | def load_dataset(g):
9 |     # Read each listed ego-network edge file and grow the graph in place.
10 |     fileNums=[0]
11 |     for eachNum in fileNums:
12 |         fileName="Datasets/facebook/edges/"+str(eachNum)+".edges"
13 |         print('fileName=',fileName)
14 |         f=open(fileName)   # open read-only
15 |         for line in f:
16 |             c=line.split()
17 |             g=addVertex(g,c[0])
18 |             g=addVertex(g,c[1])
19 |             print('Adding ',c[0],'-->',c[1])
20 |             g.add_edge(c[0],c[1])
21 |         f.close()
22 |         g.simplify()
23 |     return
26 |
27 | def addVertex(g,name_str):
28 | try:
29 | if(name_str not in g.vs['name']):
30 | print('Inserted node ',name_str)
31 | g.add_vertex(name=name_str)
32 | else:
33 | print ('Node ',name_str,' already present')
34 | print(g.vs.find(name_str).index)
35 | except KeyError:
36 | g.add_vertex(name=name_str)
37 | return g
38 |
39 |
40 |
41 | def write_tuple_to_file(f,t):
42 | string=str(t[0])+' '+str(t[1])+'\n'
43 | f.write(string)
44 |
45 | def retrieve_edge_name_tuple(g,t):
46 | a=(g.vs[t[0]]['name'],g.vs[t[1]]['name'])
47 | return a
48 |
49 |
50 |
51 | # In[10]:
52 |
53 | g=Graph()
54 | # load_dataset(g)
55 |
56 |
57 | # In[ ]:
58 |
59 |
60 |
61 |
70 |
71 |
72 | # In[11]:
73 |
74 | import random
75 |
76 | def generate_datasets(g,train_filename,valid_filename,test_filename):
77 |     # Split the edge set into train/test/valid samples (50%/25%/15% of the
78 |     # edges) by repeatedly deleting random edges and writing them out.
79 |     load_dataset(g)
80 |     f=open(train_filename,'a+')
81 |     global train_num
82 |     train_num=int(len(g.es)*0.5)
83 |     print('train length=',train_num)
84 |     global test_num
85 |     test_num=int(len(g.es)*0.25)
86 |     global valid_num
87 |     valid_num=int(len(g.es)*0.15)
88 |     print('valid num=',valid_num)
87 | for i in range(train_num):
88 | edgeSet=g.es;
89 | r=random.randint(0,len(edgeSet)-1);
90 | t=edgeSet[r].tuple
91 | g.delete_edges(t);
92 | print('len of es=',len(edgeSet))
93 | write_tuple_to_file(f,retrieve_edge_name_tuple(g,t))
94 | f.close()
95 | f=open(test_filename,'a+');
96 | for i in range(test_num):
97 | edgeSet=g.es;
98 | r=random.randint(0,len(edgeSet)-1);
99 | print('r=',r)
100 | t=edgeSet[r].tuple
101 | g.delete_edges(t);
102 | print('len of es=',len(edgeSet))
103 | write_tuple_to_file(f,retrieve_edge_name_tuple(g,t))
104 | f.close()
105 | f=open(valid_filename,'a+');
106 | for i in range(valid_num):
107 | edgeSet=g.es;
108 | if(len(g.es)==0):
109 | break
110 | else:
111 | print('len of es=',len(edgeSet))
112 | r=random.randint(0,len(edgeSet)-1);
113 | print('r=',r)
114 | t=edgeSet[r].tuple
115 | g.delete_edges(t);
116 | write_tuple_to_file(f,retrieve_edge_name_tuple(g,t))
117 | if(len(g.es)==0):
118 | f.close()
119 | break
120 | print ('I am done')
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 | # In[12]:
129 |
130 | generate_datasets(g,'Datasets/Self_Datasets/sample_train.edges','Datasets/Self_Datasets/sample_valid.edges','Datasets/Self_Datasets/sample_test.edges')
131 |
132 |
133 | # In[13]:
134 |
135 | # train length=1426 valid=427
136 | print(train_num)
137 |
138 |
139 | # In[15]:
140 |
141 | #Generate negative examples with class label 0.0
142 | mat=g.get_adjacency()
143 |
144 | pool_of_empty=list()
145 | count=0
146 | for i,entireNode in enumerate(mat):
147 | for j,eachVal in enumerate(entireNode):
148 | if(eachVal==0 and i!=j):
149 | count+=1;
150 | pool_of_empty.append((i,j))
151 | print('count=',count)
152 |
153 |
154 | # In[20]:
155 |
156 | # print(pool_of_empty)
157 | # Filter with a comprehension: removing items from a list while iterating
158 | # over it skips elements.
159 | pool_of_empty=[each for each in pool_of_empty if each[0]!=0]
160 |
161 |
162 | # In[21]:
163 |
164 | import random
165 | def generate_negative_examples(pool,trainfilename,trainnum,validfilename,validnum,testfilename,testnum):
166 | f=open(trainfilename,'a+')
167 | for i in range(0,trainnum):
168 | r=random.randint(0,len(pool)-1);
169 | t=pool[r];
170 | pool.remove(t);
171 | f.write(str(t[0])+' '+str(t[1])+'\n');
172 | f.close()
173 | f=open(validfilename,'a+')
174 | for i in range(0,validnum):
175 | r=random.randint(0,len(pool)-1);
176 | t=pool[r];
177 | pool.remove(t);
178 | f.write(str(t[0])+' '+str(t[1])+'\n');
179 | f.close()
180 | f=open(testfilename,'a+')
181 | for i in range(0,testnum):
182 | r=random.randint(0,len(pool)-1);
183 | t=pool[r];
184 | pool.remove(t);
185 | f.write(str(t[0])+' '+str(t[1])+'\n');
186 | f.close()
187 |
188 |
189 |
190 |
191 | # In[22]:
192 |
193 | generate_negative_examples(pool_of_empty,'Datasets/Self_Datasets/negative_train.edges',train_num,'Datasets/Self_Datasets/negative_valid.edges',valid_num,'Datasets/Self_Datasets/negative_test.edges',test_num)
194 |
195 |
196 | # In[28]:
197 |
198 | #NOT NEEDED
199 |
200 | #code to generate the Negative edge graph
201 | from igraph import *
202 |
203 | nodes=set()
204 | fileNums=[0]
205 | for i,eachNum in enumerate(fileNums):
206 | print(eachNum)
207 | fileName="Datasets/facebook/edges/"+str(eachNum)+".edges"
208 | print('fileName=',fileName)
209 | f=open(fileName)
210 | nodes.add(eachNum)
211 | line=f.readline()
212 | while(line!=''):
213 | c=line.split()
214 | nodes.add(c[0])
215 | print('added ',c[0])
216 | nodes.add(c[1])
217 | print('added ',c[1])
218 | line=f.readline()
219 | print('Length=',len(nodes))
220 | print(nodes)
221 |
222 |
223 |
239 | # In[49]:
240 |
241 | # The scratch cells below use 'x'; it is assumed here to refer to the
242 | # graph g loaded above.
243 | x=g
244 | print(x.vs[2]['name'])
242 |
243 |
244 | # In[82]:
245 |
246 | print(len(pool_of_empty))
247 | print(type(pool_of_empty))
248 |
249 |
250 | # In[58]:
251 |
252 | print(len(x.vs))
253 | print(334*334)
254 |
255 |
256 | # In[32]:
257 |
258 | print(x.es[0].tuple)
259 |
260 |
261 | # In[44]:
262 |
263 | print(x.vs.find('236').index)
264 |
265 |
266 | # In[91]:
267 |
268 | try:
269 | print(x.get_eid(x.vs.find('0'),x.vs.find('83'),directed=False))
270 | except InternalError:
271 |     print("Edge doesn't exist")
272 |
273 |
274 | # In[30]:
275 |
276 | print(type(x.vs))
277 | q=set(x.vs['name'])
278 | print(len(q))
279 | print(len(nodes))
280 | print(q.pop())
281 | print(nodes.pop())
282 |
283 |
284 | # In[36]:
285 |
286 | print(x.get_eid(x.vs.find('236'),x.vs.find('236'),directed=False))
287 |
288 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Social Networks: Link Prediction
2 |
3 | ### Capstone Project: Data and Knowledge Engineering
4 |
5 | #### Video link: [Social Network Analysis: Link Prediction](https://youtu.be/XRMhgxW-C_M)
6 |
7 | ---
8 |
9 | ### ✨Contributors: Group 1✨
10 |
11 | - Pranjal Mathur (1410110296)
12 | - Prerna (1410110306)
13 | - Saketh Vallakatla (1410110352)
14 |
15 | ------
16 |
17 | ## Problem Statement
18 |
19 | > Given a set of nodes (users) in a social network graph, the aim is to find the influential (important) users and to predict the likelihood of a future association (edge) between two nodes, knowing that there is no association between them in the current state of the graph.
20 |
21 |
22 | ## Motivation
23 | The edges described in the problem statement could be of any form: friendship, collaboration, following or mutual interests. Here, we specifically study and build our model over Facebook's social network, with the following areas of motivation:
24 |
25 | * General application of friends recommendation to a particular user.
26 | * Predicting hidden links in a social network group formed by terrorists along with identification of their leaders/ key influencers.
27 | * Targeted marketing of products: Marketing through highly influential individuals and also identifying plausible customers.
28 | * Suggesting promising interactions or collaborations that have not yet been identified within an organization.
29 | * In Bioinformatics, link prediction can be used to find interactions between proteins.
30 |
31 | The following model can be extended or modified to cater to the needs of various other social networks like Twitter, Google+, Foursquare, etc.
32 | ## Knowledge Engineering Process
33 |
34 | Discussed below are the four major Knowledge Engineering Tasks:
35 |
36 | ### Acquisition & Learning
37 | ##### Data:
38 | * Acquired from http://snap.stanford.edu/data/egonets-Facebook.html
39 |
40 | * This dataset consists of 'circles' (or 'friends lists') from Facebook.
41 |
42 | * This anonymized dataset includes node features (profiles), circles, and ego networks.
43 |
44 | * The edges are **undirected** .
45 |
46 | * 10 ego-networks, consisting of 193 circles and 4,039 users.
47 |
48 | * Features of the various nodes are described in the following format: `[Type]:[Subtype]:attributeName`
49 |
50 | * The following figure gives an example of the attributes and of how the feature array is formed.
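For example, one ego network's edge list can be read into `python-igraph` as follows (a minimal sketch using `Graph.TupleList` instead of the scripts' `addVertex` helper; the path assumes the SNAP files are unpacked under `Datasets/facebook/edges/`):

```python
from igraph import Graph

edges = []
with open('Datasets/facebook/edges/0.edges') as f:
    for line in f:                 # each line: 'nodeA nodeB'
        u, v = line.split()
        edges.append((u, v))

# Build an undirected graph from the named endpoints.
g = Graph.TupleList(edges, directed=False)
g.simplify()                       # drop duplicate edges and self-loops
print(g.vcount(), 'nodes,', g.ecount(), 'edges')
```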
51 |
52 | ##### Domain Knowledge:
53 |
54 | The following insights are useful while building our model for link prediction:
55 |
56 | * The idea is to assign a connection weight `score(x, y)` to each pair of nodes `⟨x, y⟩`, based on the input graph.
57 |
58 | * The approaches adopted so far can be classified into:
59 |   * *Methods based on node neighborhoods*: A number of approaches are based on the idea that two nodes x and y are more likely to form a link in the future if their neighbor sets Γ(x) and Γ(y) have a large overlap (see the sketch after this list). Examples:
60 | * Common neighbors
61 | * Jaccard’s coefficient
62 | * Preferential attachment
63 | * *Methods based on the ensemble of all paths*: A number of methods refine the notion of
64 | shortest-path distance by implicitly considering the ensemble of all paths between two nodes.
65 |
66 | * Since we had multiple features associated with each node in an ego network, we performed our experiment based on the **similarity of features between the two nodes**.
67 |
68 | * Machine learning models such as a Support Vector Machine can classify node pairs into two classes: (1) connection, (2) no connection. Neural networks and regression techniques can be used for the same task.
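A minimal sketch of the neighborhood-based scores above, using `python-igraph` (the built-in Zachary karate-club graph stands in for an ego network; it is not part of this project's data):

```python
from igraph import Graph

def neighborhood_scores(g, x, y):
    # Score a candidate pair (x, y) from the overlap of their neighbor sets.
    nx = set(g.neighbors(x))
    ny = set(g.neighbors(y))
    common = nx & ny
    union = nx | ny
    return {
        'common_neighbors': len(common),
        'jaccard': float(len(common)) / len(union) if union else 0.0,
        'preferential_attachment': len(nx) * len(ny),
    }

g = Graph.Famous('Zachary')
print(neighborhood_scores(g, 0, 33))
```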
69 |
70 |
71 |
72 |
73 | ##### Task:
74 |
75 | > Given an unweighted, undirected graph `G = ⟨V,E⟩` representing the topological structure of a social network, in which each edge `e = ⟨u,v⟩ ∈ E` represents an interaction between u and v that took place at a particular time `t(e)`, the two tasks can be described as:
76 | >
77 | > * **To find the highly influencing/ central node set N.**
78 | >
79 | >
80 | > * **For time T greater than t(e), we need to predict and output a list of edges not present at t(e).**
81 |
82 | ### Representation:
83 | ##### Data:
84 |
85 | * In order to represent the complex data structure of a graph with various features attached to each node, `python-igraph` is used.
86 | * A Python `dict` keyed by node ID stores the corresponding feature array of each node (see the sketch below).
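A minimal sketch of this representation, mirroring `load_shape_input` in `Link_Prediction.py`: each row of an ego network's `.feat` file is `nodeID f1 f2 ...`, and the feature vectors are keyed by node ID:

```python
import numpy as np

features = dict()
for row in np.loadtxt('Datasets/facebook/0.feat'):
    features[row[0]] = np.asarray(row[1:])   # node ID -> feature vector

print(len(features), 'nodes loaded')
```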
87 |
88 | ### Development and Explanation:
89 | ##### Approach:
90 |
91 | * ***Measures for Centrality*** : As part of our analysis, we used the following four centrality measures (a sketch follows this list):
92 | * `Degree of nodes` :
93 |     - Core idea: Find the nodes with the highest number of immediate neighbors (degree).
94 | - Input: Graph and a node
95 | - Output: Degree of nodes.
96 |   * `Closeness Centrality` :
97 |     * Core idea: A central node is one that is close, on average, to the other nodes.
98 |     * Input: Graph and a node
99 |     * Output: a value in [0, 1] after standardization (1 being highly central)
100 |   * `Betweenness Centrality` :
101 |     * Core Idea: A central actor is one that acts as a bridge, broker or gatekeeper.
102 |     * Input: Graph and a node
103 |     * Output: a value in [0, 1] after normalization (1 being highly central)
104 |   * `Eigenvector centrality` :
105 |     * Core Idea: A central actor is connected to other central actors.
106 |     * Input: Graph
107 |     * Output: a value in [0, 1]
108 | * ***Link Prediction***: Based on our survey [1], usability criteria and experiments, we used the following machine learning approach (a classification sketch also follows this list):
109 | * `Support Vector Machine` classification algorithm:
110 | * Core Idea: Segregating the two classes with a hyper-plane.
111 | * Here, two classes are: Linked and unlinked
112 | * Input:
113 | * Graph Dataset (separately for each ego network), with labels attached
114 | * Features (~230) dictionary
115 | * Output: Predicted association between two nodes `[x,y]` :
116 | * 0 if no association
117 | * 1 otherwise.
118 | * We divide our dataset in the ratio of **2:1:1** for **Train:Validation:Test**
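A minimal sketch of the four centrality measures with `python-igraph`, again on the built-in Zachary toy graph rather than this project's data:

```python
from igraph import Graph

g = Graph.Famous('Zachary')                  # stand-in for a Facebook ego network
measures = {
    'degree':      g.degree(),               # immediate-neighbor counts
    'closeness':   g.closeness(),            # values in [0, 1]
    'betweenness': g.betweenness(),          # raw counts; normalize for [0, 1]
    'eigenvector': g.evcent(directed=False), # scaled so the maximum is 1
}

# Report the most central vertex under each measure.
for name, scores in measures.items():
    v = max(range(len(scores)), key=lambda i: scores[i])
    print(name, '-> vertex', v, 'score', scores[v])
```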
119 |
120 |
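And a minimal sketch of the classification step on synthetic stand-in data (in the project, each row of `X` is the summed feature vector of a candidate node pair and `y` its 0/1 link label; the names here are illustrative):

```python
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, f1_score

rng = np.random.RandomState(0)
X = rng.rand(200, 230)                      # ~230 features per candidate pair
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)   # toy linked/unlinked labels
X_train, X_val = X[:150], X[150:]
y_train, y_val = y[:150], y[150:]

clf = svm.SVC()                             # hyper-plane between the two classes
clf.fit(X_train, y_train)
pred = clf.predict(X_val)                   # 1 = association predicted, 0 = none
print('accuracy :', accuracy_score(y_val, pred))
print('precision:', precision_score(y_val, pred))
print('f1       :', f1_score(y_val, pred))
```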
121 |
122 | ##### Python Libraries used:
123 |
124 | * `Plotly` : Graphing library for making interactive, publication-quality graphs online.
125 | * `IGraph` : A collection of network analysis tools with an emphasis on efficiency, portability and ease of use. igraph is open source and free.
126 | * `Numpy` : Adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on them. (dependency of Scikit-Learn)
127 | * `Scipy` : An open-source Python library used for scientific and technical computing. (dependency of Scikit-Learn)
128 | * `Scikit-Learn` : Simple and efficient tools for data mining and data analysis. Used for dimensionality reduction and for implementing the machine learning algorithms.
129 |
130 | ### Validation: Performance Evaluation:
131 | ##### Evaluation Interpretation:
132 |
133 | | Criteria        | Formula                                         | Score | Interpretation                                          |
134 | | --------------- | ----------------------------------------------- | ----- | ------------------------------------------------------- |
135 | | Accuracy        | (TP + TN) / (TP + TN + FP + FN)                 | 70%   | The results predicted were correct 70% of the time.     |
136 | | Precision Score | TP / (TP + FP)                                  | 68%   | The links (1) predicted were correct 68% of the time.   |
137 | | F1 Score        | 2 · (Precision · Recall) / (Precision + Recall) | 65%   | Combines the precision and the recall of the test into a single score. |
138 |
139 |
140 | ## Replicating the results
141 |
142 | In order to run the code provided in `SocialNetworkAnalysis.zip`:
143 |
144 | * Unzip the file
145 |
146 | * Setting up the environment for execution:
147 |
148 |   * Python 2.7 (the scripts use Python 2-only modules such as `urllib2`, so Python 3 will not run them unmodified).
149 | * Install the python libraries as described in the previous section.
150 |
151 | * Setting up the dataset:
152 |
153 | ```
154 | python2.7 GenerateOtherDatasets.py
155 | ```
156 |
157 | * To get the centrality measures and visualize the network:
158 |
159 | ```
160 | python2.7 Centrality.py
161 | ```
162 |
163 | * For Link Prediction and Evaluation:
164 |
165 | ```
166 | python2.7 Link_Prediction.py
167 | ```
168 |
169 |
170 |
171 | ------
172 |
173 | [1] "Link prediction in multiplex online social networks" by Mahdi Jalili et al.
174 |
--------------------------------------------------------------------------------