├── .gitignore
├── 3d_network_plot.py
├── Centrality.py
├── Link_Prediction.py
├── GenerateOtherDatasets.py
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vscode/

--------------------------------------------------------------------------------
/3d_network_plot.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[7]:

import igraph as ig
import json
import urllib2  # Python 2 only; on Python 3 use urllib.request instead

# Download the Les Miserables co-appearance network (used as a format example).
data = []
req = urllib2.Request("https://raw.githubusercontent.com/plotly/datasets/master/miserables.json")
opener = urllib2.build_opener()
f = opener.open(req)
data = json.loads(f.read())


# In[16]:

L = len(data['links'])
Edges = [(data['links'][k]['source'], data['links'][k]['target']) for k in range(L)]

Gp = ig.Graph(Edges, directed=False)


# In[19]:

print(Edges[0])


# In[3]:

labels = []
group = []

for node in data['nodes']:
    labels.append(node['name'])
    group.append(node['group'])


# In[25]:

from igraph import *

G = Graph()

def addVertex(g, name_str):
    # Add a vertex by name if it is not already present.
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    # Map an (index, index) edge tuple to the vertices' names.
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


def load_dataset(fileName, g):
    # NOTE: the fileName argument is ignored; the path is rebuilt from fileNums.
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
        print('fileName=', fileName)
        f = open(fileName)
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()  # drop duplicate edges and self-loops
    return

load_dataset('abd', G)


N = len(G.vs)
layt = G.layout('kk', dim=3)  # 3D Kamada-Kawai layout

labels = []
print(type(labels))
for eachNde in G.vs:
    labels.append(eachNde['name'])

Edges = list()
print(type(Edges))
for eachTuple in G.es:
    Edges.append(eachTuple.tuple)

Xn = [layt[k][0] for k in range(N)]  # x-coordinates of nodes
Yn = [layt[k][1] for k in range(N)]  # y-coordinates
Zn = [layt[k][2] for k in range(N)]  # z-coordinates
Xe = []
Ye = []
Ze = []

for e in Edges:
    Xe += [layt[e[0]][0], layt[e[1]][0], None]  # x-coordinates of edge ends
    Ye += [layt[e[0]][1], layt[e[1]][1], None]
    Ze += [layt[e[0]][2], layt[e[1]][2], None]

import plotly
plotly.tools.set_credentials_file(username='prerna_237', api_key='DXXXKP8XPO3FBUWsH4NY')


# In[63]:

# The original notebook left `l` undefined; colour the nodes by degree so the
# marker colour scale in the plot below has something meaningful to show.
l = G.degree()


# In[65]:

print(len(l))


# In[71]:

import plotly.plotly as py
from plotly.graph_objs import *


trace1 = Scatter3d(x=Xe,
                   y=Ye,
                   z=Ze,
                   mode='lines',
                   line=Line(color='rgb(125,125,125)', width=1),
                   hoverinfo='none'
                   )

trace2 = Scatter3d(x=Xn,
                   y=Yn,
                   z=Zn,
                   mode='markers',
                   name='actors',
                   marker=Marker(symbol='dot',
                                 color=l,
                                 size=6,
                                 colorbar=ColorBar(
                                     title='Colorbar'
                                 ),
                                 colorscale='Viridis',
                                 line=Line(color='rgb(158,18,130)', width=0.5)
                                 ),
                   text=labels,
                   hoverinfo='text'
                   )

axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title=''
            )

layout = Layout(
    title="3D Visualization of the Facebook nodes",
    width=1000,
    height=1000,
    showlegend=False,
    scene=Scene(
        xaxis=XAxis(axis),
        yaxis=YAxis(axis),
        zaxis=ZAxis(axis),
    ),
    margin=Margin(
        t=100
    ),
    hovermode='closest',
    annotations=Annotations([
        Annotation(
            showarrow=False,
            # text="Data source: [1] miserables.json",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=Font(
                size=14
            )
        )
    ]), )

data = Data([trace1, trace2])
fig = Figure(data=data, layout=layout)

py.iplot(fig)

--------------------------------------------------------------------------------
/Centrality.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[ ]:

# Visualize the centrality of the graph. This section gives an idea of the
# structure of the graph.


# In[2]:

from igraph import *

g = Graph()


# In[3]:

def addVertex(g, name_str):
    # Add a vertex by name if it is not already present.
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


# In[4]:

def load_dataset(fileName, g):
    # NOTE: the fileName argument is ignored; the path is rebuilt from fileNums.
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
        print('fileName=', fileName)
        f = open(fileName)
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return


# In[5]:

load_dataset('abd', g)
print(len(g.vs))


# In[7]:

print(len(g.vs))


# In[20]:

def calculate_eigen(g):
    # Print the five nodes with the highest eigenvector centrality.
    eigen = g.evcent(directed=False)
    # Rank vertex indices by score instead of removing items from the list:
    # the original remove()-based loop shifted the remaining indices, so the
    # names printed for ranks 2-5 could belong to the wrong vertices.
    ranked = sorted(range(len(eigen)), key=lambda v: eigen[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with score of ', eigen[v])
    return eigen


# In[21]:

def calculate_closeness(g):
    # Print the five nodes with the highest closeness centrality.
    close = g.closeness(g.vs)
    ranked = sorted(range(len(close)), key=lambda v: close[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with score of ', close[v])
    return close


# In[22]:

def calculate_between(g):
    # Print the five nodes with the highest betweenness centrality.
    between = g.betweenness(g.vs)
    ranked = sorted(range(len(between)), key=lambda v: between[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with score of ', between[v])
    return between
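

# In[ ]:

# Sketch (not in the original script): the README lists degree centrality as
# a fourth measure, which igraph exposes directly. A minimal version in the
# same style as the functions above; it is not called elsewhere in this file.
def calculate_degree(g):
    deg = g.degree()
    ranked = sorted(range(len(deg)), key=lambda v: deg[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with degree ', deg[v])
    return deg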
# In[23]:

print('Eigen Vector')
global eigen
eigen = calculate_eigen(g)

global close
global between
print('Closeness')
close = calculate_closeness(g)
print('Betweenness')
between = calculate_between(g)


# In[24]:

print(close)


# In[25]:

from igraph import *

G = Graph()

load_dataset('abd', G)


N = len(G.vs)
layt = G.layout('kk', dim=3)  # 3D Kamada-Kawai layout

labels = []
print(type(labels))
for eachNde in G.vs:
    labels.append(eachNde['name'])

Edges = list()
print(type(Edges))
for eachTuple in G.es:
    Edges.append(eachTuple.tuple)

Xn = [layt[k][0] for k in range(N)]  # x-coordinates of nodes
Yn = [layt[k][1] for k in range(N)]  # y-coordinates
Zn = [layt[k][2] for k in range(N)]  # z-coordinates
Xe = []
Ye = []
Ze = []

for e in Edges:
    Xe += [layt[e[0]][0], layt[e[1]][0], None]  # x-coordinates of edge ends
    Ye += [layt[e[0]][1], layt[e[1]][1], None]
    Ze += [layt[e[0]][2], layt[e[1]][2], None]

import plotly
plotly.tools.set_credentials_file(username='prerna_237', api_key='DXXXKP8XPO3FBUWsH4NY')


# In[26]:

import plotly.plotly as py
from plotly.graph_objs import *


trace1 = Scatter3d(x=Xe,
                   y=Ye,
                   z=Ze,
                   mode='lines',
                   line=Line(color='rgb(125,125,125)', width=1),
                   hoverinfo='none'
                   )

trace2 = Scatter3d(x=Xn,
                   y=Yn,
                   z=Zn,
                   mode='markers',
                   name='actors',
                   marker=Marker(symbol='dot',
                                 color=eigen,  # colour by eigenvector centrality
                                 size=6,
                                 colorbar=ColorBar(
                                     title='Colorbar'
                                 ),
                                 colorscale='Viridis',
                                 line=Line(color='rgb(158,18,130)', width=0.5)
                                 ),
                   text=labels,
                   hoverinfo='text'
                   )

axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title=''
            )

layout = Layout(
    title="3D Visualization of the Facebook nodes",
    width=1000,
    height=1000,
    showlegend=False,
    scene=Scene(
        xaxis=XAxis(axis),
        yaxis=YAxis(axis),
        zaxis=ZAxis(axis),
    ),
    margin=Margin(
        t=100
    ),
    hovermode='closest',
    annotations=Annotations([
        Annotation(
            showarrow=False,
            # text="Data source: [1] miserables.json",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=Font(
                size=14
            )
        )
    ]), )

data = Data([trace1, trace2])
fig = Figure(data=data, layout=layout)

py.iplot(fig)

--------------------------------------------------------------------------------
/Link_Prediction.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[2]:

# Read input feature values using numpy.
import numpy as np
from igraph import *

global num_of_feat
num_of_feat = 347
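

# In[ ]:

# Note (a sketch, not part of the original notebook): the README also lists
# neighborhood-based heuristics as classical link-prediction baselines. With
# an undirected igraph Graph `g` and vertex ids `u`, `v`, they would be:
#
#     common = len(set(g.neighbors(u)) & set(g.neighbors(v)))   # common neighbors
#     jaccard = g.similarity_jaccard(pairs=[(u, v)])[0]         # Jaccard's coefficient
#     pref = g.degree(u) * g.degree(v)                          # preferential attachment
#
# Ranking candidate pairs by such a score is an alternative to the trained
# classifier built below.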
# In[3]:

def load_dataset(fileName, g):
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        print('fileName=', fileName)
        f = open(fileName)
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return

# NOTE: load_neg_dataset duplicates load_dataset (nodeID is never used);
# it is kept as-is, but load_dataset can be called instead.
def load_neg_dataset(fileName, g):
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        print('fileName=', fileName)
        f = open(fileName)
        nodeID = eachNum
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return

def load_and_shape_input(file_name):
    # Load a .feat file and drop the leading node-id column.
    a = np.loadtxt(fname=file_name)
    slice_D = [a[i][1:] for i in range(0, num_of_feat)]
    c = np.asarray(slice_D)
    return c

def load_shape_input(file_name_array):
    # Build a dictionary mapping node id -> feature vector.
    features = dict()
    for eachname in file_name_array:
        file_name = 'Datasets/facebook/' + str(eachname) + '.feat'
        a = np.loadtxt(file_name)
        for eachFeat in a:
            features[eachFeat[0]] = np.asarray(eachFeat[1:])
    return features


def addVertex(g, name_str):
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


# In[4]:

# Load feature vectors for ego network 0.
li = {0}
node_feat = load_shape_input(li)


# In[5]:

g = Graph()
load_dataset('Datasets/Self_Datasets/sample_train.edges', g)


not_g = Graph()
load_dataset('Datasets/Self_Datasets/negative_train.edges', not_g)


# In[6]:

print(type(node_feat))
for eachKey in node_feat.values():
    print(len(eachKey))
    print(type(eachKey))


# In[7]:

# print(node_feat[np.float64(0)])


# In[8]:

# print('positive edges', len(g.es))
# print('negative edges', len(not_g.es))
# t = retrieve_edge_name_tuple(g, (0, 1))
# node_feat[np.float64(t[0])]


# In[9]:

def make_class_arrays(g, datalabel):
    # One training example per edge: the element-wise sum of the two endpoint
    # feature vectors. (datalabel is unused; class labels are attached later.)
    output_list = list()
    edgeSet = g.es
    for eachTuple in edgeSet:
        tuple_name = retrieve_edge_name_tuple(g, eachTuple.tuple)
        print('eachTuple=', tuple_name)
        output = np.add(node_feat[np.float64(tuple_name[0])], node_feat[np.float64(tuple_name[1])])
        output_list.append(output)
    return np.asarray(output_list)


# In[10]:

valid_g = Graph()
load_dataset('Datasets/Self_Datasets/sample_valid.edges', valid_g)
# node_feat = load_and_shape_input("Datasets/facebook/0.feat")


valid_not_g = Graph()
load_dataset('Datasets/Self_Datasets/negative_valid.edges', valid_not_g)


# In[11]:

# print(len(node_feat[np.float64(345)]))


# In[12]:

x_positive = make_class_arrays(g, 1)
x_negative = make_class_arrays(not_g, 1)


# In[13]:

print(x_positive.shape)
print(x_negative.shape)


# In[14]:

valid_x_positive = make_class_arrays(valid_g, 1)
valid_x_negative = make_class_arrays(valid_not_g, 1)


# In[15]:

print(valid_x_positive.shape)
print(valid_x_negative.shape)


# In[16]:

y_positive = np.full(shape=(x_positive.shape[0], 1), fill_value=1.0)
y_negative = np.full(shape=(x_negative.shape[0], 1), fill_value=0.0)


# In[17]:

print(y_positive.shape)
print(y_negative.shape)


# In[18]:

valid_y_positive = np.full(shape=(valid_x_positive.shape[0], 1), fill_value=1.0)
valid_y_negative = np.full(shape=(valid_x_negative.shape[0], 1), fill_value=0.0)


# In[19]:

print(valid_x_positive.shape)
print(valid_x_negative.shape)
print(valid_y_positive.shape)
print(valid_y_negative.shape)


# In[20]:

print(valid_y_positive.shape)


# In[21]:

train_X = np.append(x_positive, x_negative, axis=0)
train_Y = np.append(y_positive, y_negative, axis=0)

valid_X = np.append(valid_x_positive, valid_x_negative, axis=0)
valid_Y = np.append(valid_y_positive, valid_y_negative, axis=0)


# In[22]:

print(type(x_positive))
print(valid_X.shape)
print(type(x_negative))
print(valid_Y.shape)
print(type(y_positive))
print(y_positive.shape)
print(train_X.shape)
print(1592 + 1748)


# In[23]:

from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)


# In[97]:

# clf.fit(digits.data[:-1], digits.target[:-1])
reg.fit(X=train_X[:-1], y=train_Y[:-1])


# In[98]:

reg.predict(train_X[-1:])


# In[91]:

len(reg.predict(valid_X))


# In[100]:

np.mean((reg.predict(valid_X) - valid_Y) ** 2)


# In[24]:

from sklearn.metrics import log_loss
# Ridge outputs are not probabilities; clip them into [0, 1] before
# computing the log loss.
log_loss(valid_Y, np.clip(reg.predict(valid_X), 0.0, 1.0))


# In[29]:

from sklearn import svm
clf_svm = svm.SVC()
clf_svm.fit(X=train_X[:-1], y=train_Y[:-1].ravel())  # SVC expects a 1-d label array


# In[31]:

from sklearn.metrics import log_loss
# predict() returns hard 0/1 labels; decision_function() would give a
# smoother estimate, but the hard-label loss is kept as in the original.
log_loss(valid_Y, clf_svm.predict(valid_X))


# In[ ]:

from sklearn.neighbors import NearestNeighbors

--------------------------------------------------------------------------------
/GenerateOtherDatasets.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[9]:

# Load the training dataset.
from igraph import *

def load_dataset(g):
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
        print('fileName=', fileName)
        # Open read-only; the original 'a+' append mode is unnecessary for
        # reading and positions reads at end-of-file on some platforms.
        f = open(fileName)
        nodeID = eachNum
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return

def addVertex(g, name_str):
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


# In[10]:

g = Graph()
# load_dataset(g)


# In[3]:

# d = open("Datasets/Self_Datasets/some.txt", 'a+')
# d.write('Hello')
# d.close()
# d = open("Datasets/Self_Datasets/some.txt", 'a+')
# d.write("sucker")
# d.close()


# In[11]:

import random

def generate_datasets(g, num, train_filename, valid_filename, test_filename):
    # Split the edge set by repeatedly deleting a random edge and writing it
    # to the train/test/validation files: 50% train, 25% test, 15% validation.
    # (The num argument is unused; 'a+' appends, so re-running accumulates edges.)
    load_dataset(g)
    f = open(train_filename, 'a+')
    global train_num
    train_num = int(len(g.es) * 0.5)
    print('train length=', train_num)
    global test_num
    test_num = int(len(g.es) * 0.25)
    global valid_num
    valid_num = int(len(g.es) * 0.15)
    print('valid num=', valid_num)
    for i in range(train_num):
        edgeSet = g.es
        r = random.randint(0, len(edgeSet) - 1)
        t = edgeSet[r].tuple
        g.delete_edges(t)
        print('len of es=', len(edgeSet))
        write_tuple_to_file(f, retrieve_edge_name_tuple(g, t))
    f.close()
    f = open(test_filename, 'a+')
    for i in range(test_num):
        edgeSet = g.es
        r = random.randint(0, len(edgeSet) - 1)
        print('r=', r)
        t = edgeSet[r].tuple
        g.delete_edges(t)
        print('len of es=', len(edgeSet))
        write_tuple_to_file(f, retrieve_edge_name_tuple(g, t))
    f.close()
    f = open(valid_filename, 'a+')
    for i in range(valid_num):
        edgeSet = g.es
        if len(edgeSet) == 0:
            break
        print('len of es=', len(edgeSet))
        r = random.randint(0, len(edgeSet) - 1)
        print('r=', r)
        t = edgeSet[r].tuple
        g.delete_edges(t)
        write_tuple_to_file(f, retrieve_edge_name_tuple(g, t))
    # Close unconditionally; the original only closed this file when the
    # graph ran out of edges.
    f.close()
    print('I am done')


# In[12]:

generate_datasets(g, len(g.es) / 10, 'Datasets/Self_Datasets/sample_train.edges', 'Datasets/Self_Datasets/sample_valid.edges', 'Datasets/Self_Datasets/sample_test.edges')


# In[13]:

# train length=1426 valid=427
print(train_num)


# In[15]:

# Generate negative examples (class label 0.0): collect every unconnected
# ordered pair of distinct vertices from the adjacency matrix.
mat = g.get_adjacency()

pool_of_empty = list()
count = 0
for i, entireNode in enumerate(mat):
    for j, eachVal in enumerate(entireNode):
        if eachVal == 0 and i != j:
            count += 1
            pool_of_empty.append((i, j))
print('count=', count)


# In[20]:

# Drop pairs starting at vertex 0. The original called remove() while
# iterating over the same list, which skips elements; rebuild it instead.
pool_of_empty = [each for each in pool_of_empty if each[0] != 0]


# In[21]:

import random

def generate_negative_examples(pool, trainfilename, trainnum, validfilename, validnum, testfilename, testnum):
    f = open(trainfilename, 'a+')
    for i in range(0, trainnum):
        r = random.randint(0, len(pool) - 1)
        t = pool.pop(r)  # pop by index instead of remove-by-value
        f.write(str(t[0]) + ' ' + str(t[1]) + '\n')
    f.close()
    f = open(validfilename, 'a+')
    for i in range(0, validnum):
        r = random.randint(0, len(pool) - 1)
        t = pool.pop(r)
        f.write(str(t[0]) + ' ' + str(t[1]) + '\n')
    f.close()
    f = open(testfilename, 'a+')
    for i in range(0, testnum):
        r = random.randint(0, len(pool) - 1)
        t = pool.pop(r)
        f.write(str(t[0]) + ' ' + str(t[1]) + '\n')
    f.close()


# In[22]:

generate_negative_examples(pool_of_empty, 'Datasets/Self_Datasets/negative_train.edges', train_num, 'Datasets/Self_Datasets/negative_valid.edges', valid_num, 'Datasets/Self_Datasets/negative_test.edges', test_num)


# In[28]:

# NOT NEEDED

# Code to generate the negative-edge graph.
from igraph import *

nodes = set()
fileNums = [0]
for i, eachNum in enumerate(fileNums):
    print(eachNum)
    fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
    print('fileName=', fileName)
    f = open(fileName)
    nodes.add(eachNum)
    line = f.readline()
    while line != '':
        c = line.split()
        nodes.add(c[0])
        print('added ', c[0])
        nodes.add(c[1])
        print('added ', c[1])
        line = f.readline()
print('Length=', len(nodes))
print(nodes)


# The scratch cells below referenced a graph `x` that was only ever built in
# an interactive session and is undefined in this script; they are commented
# out so the file runs end to end.

# In[49]:

# print(x.vs[2]['name'])


# In[82]:

print(len(pool_of_empty))
print(type(pool_of_empty))


# In[58]:

# print(len(x.vs))
# print(334 * 334)


# In[32]:

# print(x.es[0].tuple)


# In[44]:

# print(x.vs.find('236').index)


# In[91]:

# try:
#     print(x.get_eid(x.vs.find('0'), x.vs.find('83'), directed=False))
# except InternalError:
#     print("Edge doesn't exist")


# In[30]:

# print(type(x.vs))
# q = set(x.vs['name'])
# print(len(q))
# print(len(nodes))
# print(q.pop())
# print(nodes.pop())


# In[36]:

# print(x.get_eid(x.vs.find('236'), x.vs.find('236'), directed=False))

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Social Networks: Link Prediction

### Capstone Project: Data and Knowledge Engineering

#### Video link: [Social Network Analysis: Link Prediction](https://youtu.be/XRMhgxW-C_M)

---

### ✨Contributors: Group 1✨

- Pranjal Mathur (1410110296)
- Prerna (1410110306)
- Saketh Vallakatla (1410110352)

------

## Problem Statement

> Given a set of nodes (users) in a social network graph, the aim is to find the influential (important) users and to predict the likelihood of a future association (edge) between two nodes, given that no association exists between them in the current state of the graph.


## Motivation

The edges described in the problem statement can take many forms: friendship, collaboration, following, or mutual interests. Here we specifically study and build our model on Facebook's social network, motivated by the following applications:

* Recommending friends to a particular user.
* Predicting hidden links in a social network formed by terrorists, along with identifying their leaders/key influencers.
* Targeted marketing of products: marketing through highly influential individuals and identifying plausible customers.
* Suggesting promising interactions or collaborations within an organization that have not yet been identified.
* In bioinformatics, link prediction can be used to find interactions between proteins.

The model can be extended or modified to cater to other social networks such as Twitter, Google+, Foursquare, etc.

## Knowledge Engineering Process

Discussed below are the four major knowledge engineering tasks:

### Acquisition & Learning
##### Data:
* Acquired from http://snap.stanford.edu/data/egonets-Facebook.html

* This dataset consists of 'circles' (or 'friends lists') from Facebook.

* This anonymized dataset includes node features (profiles), circles, and ego networks.

* The edges are **undirected**.

* 10 ego-networks, consisting of 193 circles and 4,039 users.

* Features of the nodes are described in the following format: `[Type]:[Subtype]:attributeName`

* The following figure shows an example of the attributes and how the feature array is formed.

##### Domain Knowledge:

The following insights are useful when building a model for link prediction:

* The idea is to assign a connection weight `score(x, y)` to each pair of nodes `⟨x, y⟩`, based on the input graph.

* The approaches adopted so far can be classified into:
    * *Methods based on node neighborhoods*: A number of approaches are based on the idea that two nodes x and y are more likely to form a link in the future if their sets of neighbors Γ(x) and Γ(y) have large overlap. Examples:
        * Common neighbors
        * Jaccard's coefficient
        * Preferential attachment
    * *Methods based on the ensemble of all paths*: A number of methods refine the notion of shortest-path distance by implicitly considering the ensemble of all paths between two nodes.

* Since we had multiple features associated with each node in an ego network, we based our experiment on the **similarity of features between the two nodes**.

* Machine-learning models such as the Support Vector Machine can classify node pairs into two classes: (1) connection and (2) no connection; neural networks and regression techniques can be used for the same task.


##### Task:

> Given an unweighted, undirected graph `G = ⟨V,E⟩` representing the topological structure of a social network, in which each edge `e = ⟨u,v⟩ ∈ E` represents an interaction between u and v that took place at a particular time `t(e)`, the two tasks can be described as:
>
> * **Find the highly influencing/central node set N.**
>
>
> * **For a time T greater than t(e), predict and output the list of edges not present at t(e).**

### Representation:
##### Data:

* To represent the complex data structure of a graph with various features attached to each node, `python-igraph` is used.
* A `dictionary` data structure stores the corresponding feature vector of each node, as sketched below.
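
A minimal sketch of this representation (assuming ego network `0` and the `Datasets/facebook/...` layout used throughout the repository; not code lifted from the scripts):

```python
import numpy as np
from igraph import Graph

# Graph: one igraph vertex per node name, edges read from the .edges file.
g = Graph()
with open('Datasets/facebook/edges/0.edges') as f:
    for line in f:
        a, b = line.split()
        for name in (a, b):
            if 'name' not in g.vertex_attributes() or name not in g.vs['name']:
                g.add_vertex(name=name)
        g.add_edge(a, b)
g.simplify()  # drop duplicate edges and self-loops

# Dictionary: node id -> feature vector, read from the .feat file.
features = {}
for row in np.loadtxt('Datasets/facebook/0.feat'):
    features[row[0]] = row[1:]
```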
### Development and Explanation:
##### Approach:

* ***Measures for Centrality***: As part of our analysis, we used the following four centrality measures:
    * `Degree of nodes`:
        - Core idea: find the nodes with the highest number of immediate neighbors (degree).
        - Input: graph and a node
        - Output: degree of the node.
    * `Closeness Centrality`:
        * Core idea: a central node is one that is close, on average, to other nodes.
        * Input: graph and a node
        * Output: a value in [0,1] after standardization (1 being highly central)
    * `Betweenness Centrality`:
        * Core idea: a central actor is one that acts as a bridge, broker, or gatekeeper.
        * Input: graph and a node
        * Output: a value in [0,1] after normalization (1 being highly central)
    * `Eigenvector centrality`:
        * Core idea: a central actor is connected to other central actors.
        * Input: graph
        * Output: a value in [0,1]
* ***Link Prediction***: Based on our survey [1], usability criteria, and experiments, we used the following machine-learning approach:
    * `Support Vector Machine` classification algorithm:
        * Core idea: segregating the two classes with a hyperplane.
        * Here, the two classes are: linked and unlinked.
        * Input:
            * graph dataset (separately for each ego network), with labels attached
            * features (~230) dictionary
        * Output: predicted association between two nodes `[x,y]`:
            * 0 if no association
            * 1 otherwise.
    * We divide our dataset in the ratio **2:1:1** for **Train:Validation:Test**.



##### Python Libraries used:

* `Plotly`: Graphing library for making interactive, publication-quality graphs online.
* `IGraph`: A collection of network-analysis tools with an emphasis on efficiency, portability, and ease of use. igraph is open source and free.
* `Numpy`: Adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays. (scikit-learn dependency)
* `Scipy`: An open-source Python library used for scientific and technical computing. (scikit-learn dependency)
* `Scikit-Learn`: Simple and efficient tools for data mining and data analysis. Used for dimensionality reduction and for implementing the machine-learning algorithms.

### Validation: Performance Evaluation:
##### Evaluation Interpretation:

| Criteria | Formula | Score | Interpretation |
| --------------- | ---------------------------------------- | ----- | ---------------------------------------- |
| Accuracy | ![ACC = (TP+TN)/(P+N)](https://wikimedia.org/api/rest_v1/media/math/render/svg/31f7e08f6490e7182038c4ce27b87c483d6c3b4a) | 70% | The predicted results were correct 70% of the time. |
| Precision Score | ![PPV = TP/(TP+FP)](https://wikimedia.org/api/rest_v1/media/math/render/svg/699fcdb880b7f6a92742bc0845b8b60b59806a98) | 68% | The links (1) predicted were correct 68% of the time. |
| F1 Score | ![F1 = 2TP/(2TP+FP+FN)](https://wikimedia.org/api/rest_v1/media/math/render/svg/8b64097b6362d28387a0c4650f2fed2bc5ea9fe9) | 65% | Considers both the precision and the recall of the test to compute the score. |
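
The notebooks above report log-loss rather than these metrics; the scores in the table can be reproduced from the validation split with `scikit-learn`'s metric helpers. A minimal sketch, assuming `clf_svm`, `valid_X`, and `valid_Y` as defined in `Link_Prediction.py`:

```python
from sklearn.metrics import accuracy_score, precision_score, f1_score

preds = clf_svm.predict(valid_X)   # hard 0/1 predictions
y_true = valid_Y.ravel()           # flatten the (n, 1) label column

print('Accuracy :', accuracy_score(y_true, preds))   # (TP+TN)/(P+N)
print('Precision:', precision_score(y_true, preds))  # TP/(TP+FP)
print('F1 score :', f1_score(y_true, preds))         # 2TP/(2TP+FP+FN)
```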

## Replicating the results

To run the code provided in `SocialNetworkAnalysis.zip`:

* Unzip the file.

* Set up the execution environment:

  * Python 2.7 (the scripts use `urllib2`, which does not exist on Python 3).
  * Install the Python libraries described in the previous section.

* Set up the dataset:

  ```
  python2.7 GenerateOtherDatasets.py
  ```

* To compute the centrality measures and visualize the network:

  ```
  python2.7 Centrality.py
  ```

* For link prediction and evaluation:

  ```
  python2.7 Link_Prediction.py
  ```

------

[1] Mahdi Jalili et al., "Link prediction in multiplex online social networks."
--------------------------------------------------------------------------------