├── .gitignore
├── 3d_network_plot.py
├── Centrality.py
├── Link_Prediction.py
├── GenerateOtherDatasets.py
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vscode/

--------------------------------------------------------------------------------
/3d_network_plot.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[7]:

import igraph as ig
import json
import urllib2  # Python 2 only; on Python 3 use urllib.request instead

# Download the Les Miserables co-appearance network (used as a format example).
data = []
req = urllib2.Request("https://raw.githubusercontent.com/plotly/datasets/master/miserables.json")
opener = urllib2.build_opener()
f = opener.open(req)
data = json.loads(f.read())


# In[16]:

L = len(data['links'])
Edges = [(data['links'][k]['source'], data['links'][k]['target']) for k in range(L)]

Gp = ig.Graph(Edges, directed=False)


# In[19]:

print(Edges[0])


# In[3]:

labels = []
group = []

for node in data['nodes']:
    labels.append(node['name'])
    group.append(node['group'])


# In[25]:

from igraph import *

G = Graph()

def addVertex(g, name_str):
    # Add a vertex by name if it is not already present.
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    # Map an (index, index) edge tuple to the vertices' names.
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


def load_dataset(fileName, g):
    # NOTE: the fileName argument is ignored; the path is rebuilt from fileNums.
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
        print('fileName=', fileName)
        f = open(fileName)
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()  # drop duplicate edges and self-loops
    return

load_dataset('abd', G)


N = len(G.vs)
layt = G.layout('kk', dim=3)  # 3D Kamada-Kawai layout

labels = []
print(type(labels))
for eachNde in G.vs:
    labels.append(eachNde['name'])

Edges = list()
print(type(Edges))
for eachTuple in G.es:
    Edges.append(eachTuple.tuple)

Xn = [layt[k][0] for k in range(N)]  # x-coordinates of nodes
Yn = [layt[k][1] for k in range(N)]  # y-coordinates
Zn = [layt[k][2] for k in range(N)]  # z-coordinates
Xe = []
Ye = []
Ze = []

for e in Edges:
    Xe += [layt[e[0]][0], layt[e[1]][0], None]  # x-coordinates of edge ends
    Ye += [layt[e[0]][1], layt[e[1]][1], None]
    Ze += [layt[e[0]][2], layt[e[1]][2], None]

import plotly
plotly.tools.set_credentials_file(username='prerna_237', api_key='DXXXKP8XPO3FBUWsH4NY')


# In[63]:

# The original notebook left `l` undefined; colour the nodes by degree so the
# marker colour scale in the plot below has something meaningful to show.
l = G.degree()


# In[65]:

print(len(l))


# In[71]:

import plotly.plotly as py
from plotly.graph_objs import *


trace1 = Scatter3d(x=Xe,
                   y=Ye,
                   z=Ze,
                   mode='lines',
                   line=Line(color='rgb(125,125,125)', width=1),
                   hoverinfo='none'
                   )

trace2 = Scatter3d(x=Xn,
                   y=Yn,
                   z=Zn,
                   mode='markers',
                   name='actors',
                   marker=Marker(symbol='dot',
                                 color=l,
                                 size=6,
                                 colorbar=ColorBar(
                                     title='Colorbar'
                                 ),
                                 colorscale='Viridis',
                                 line=Line(color='rgb(158,18,130)', width=0.5)
                                 ),
                   text=labels,
                   hoverinfo='text'
                   )

axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title=''
            )

layout = Layout(
    title="3D Visualization of the Facebook nodes",
    width=1000,
    height=1000,
    showlegend=False,
    scene=Scene(
        xaxis=XAxis(axis),
        yaxis=YAxis(axis),
        zaxis=ZAxis(axis),
    ),
    margin=Margin(
        t=100
    ),
    hovermode='closest',
    annotations=Annotations([
        Annotation(
            showarrow=False,
            # text="Data source: [1] miserables.json",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=Font(
                size=14
            )
        )
    ]), )

data = Data([trace1, trace2])
fig = Figure(data=data, layout=layout)

py.iplot(fig)

--------------------------------------------------------------------------------
/Centrality.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[ ]:

# Visualize the centrality of the graph. This section gives an idea of the
# structure of the graph.


# In[2]:

from igraph import *

g = Graph()


# In[3]:

def addVertex(g, name_str):
    # Add a vertex by name if it is not already present.
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


# In[4]:

def load_dataset(fileName, g):
    # NOTE: the fileName argument is ignored; the path is rebuilt from fileNums.
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
        print('fileName=', fileName)
        f = open(fileName)
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return


# In[5]:

load_dataset('abd', g)
print(len(g.vs))


# In[7]:

print(len(g.vs))


# In[20]:

def calculate_eigen(g):
    # Print the five nodes with the highest eigenvector centrality.
    eigen = g.evcent(directed=False)
    # Rank vertex indices by score instead of removing items from the list:
    # the original remove()-based loop shifted the remaining indices, so the
    # names printed for ranks 2-5 could belong to the wrong vertices.
    ranked = sorted(range(len(eigen)), key=lambda v: eigen[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with score of ', eigen[v])
    return eigen


# In[21]:

def calculate_closeness(g):
    # Print the five nodes with the highest closeness centrality.
    close = g.closeness(g.vs)
    ranked = sorted(range(len(close)), key=lambda v: close[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with score of ', close[v])
    return close


# In[22]:

def calculate_between(g):
    # Print the five nodes with the highest betweenness centrality.
    between = g.betweenness(g.vs)
    ranked = sorted(range(len(between)), key=lambda v: between[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with score of ', between[v])
    return between
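

# In[ ]:

# Sketch (not in the original script): the README lists degree centrality as
# a fourth measure, which igraph exposes directly. A minimal version in the
# same style as the functions above; it is not called elsewhere in this file.
def calculate_degree(g):
    deg = g.degree()
    ranked = sorted(range(len(deg)), key=lambda v: deg[v], reverse=True)
    for i, v in enumerate(ranked[:5]):
        print(i + 1, '==node', g.vs[v]['name'], ' with degree ', deg[v])
    return deg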
# In[23]:

print('Eigen Vector')
global eigen
eigen = calculate_eigen(g)

global close
global between
print('Closeness')
close = calculate_closeness(g)
print('Betweenness')
between = calculate_between(g)


# In[24]:

print(close)


# In[25]:

from igraph import *

G = Graph()

load_dataset('abd', G)


N = len(G.vs)
layt = G.layout('kk', dim=3)  # 3D Kamada-Kawai layout

labels = []
print(type(labels))
for eachNde in G.vs:
    labels.append(eachNde['name'])

Edges = list()
print(type(Edges))
for eachTuple in G.es:
    Edges.append(eachTuple.tuple)

Xn = [layt[k][0] for k in range(N)]  # x-coordinates of nodes
Yn = [layt[k][1] for k in range(N)]  # y-coordinates
Zn = [layt[k][2] for k in range(N)]  # z-coordinates
Xe = []
Ye = []
Ze = []

for e in Edges:
    Xe += [layt[e[0]][0], layt[e[1]][0], None]  # x-coordinates of edge ends
    Ye += [layt[e[0]][1], layt[e[1]][1], None]
    Ze += [layt[e[0]][2], layt[e[1]][2], None]

import plotly
plotly.tools.set_credentials_file(username='prerna_237', api_key='DXXXKP8XPO3FBUWsH4NY')


# In[26]:

import plotly.plotly as py
from plotly.graph_objs import *


trace1 = Scatter3d(x=Xe,
                   y=Ye,
                   z=Ze,
                   mode='lines',
                   line=Line(color='rgb(125,125,125)', width=1),
                   hoverinfo='none'
                   )

trace2 = Scatter3d(x=Xn,
                   y=Yn,
                   z=Zn,
                   mode='markers',
                   name='actors',
                   marker=Marker(symbol='dot',
                                 color=eigen,  # colour by eigenvector centrality
                                 size=6,
                                 colorbar=ColorBar(
                                     title='Colorbar'
                                 ),
                                 colorscale='Viridis',
                                 line=Line(color='rgb(158,18,130)', width=0.5)
                                 ),
                   text=labels,
                   hoverinfo='text'
                   )

axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title=''
            )

layout = Layout(
    title="3D Visualization of the Facebook nodes",
    width=1000,
    height=1000,
    showlegend=False,
    scene=Scene(
        xaxis=XAxis(axis),
        yaxis=YAxis(axis),
        zaxis=ZAxis(axis),
    ),
    margin=Margin(
        t=100
    ),
    hovermode='closest',
    annotations=Annotations([
        Annotation(
            showarrow=False,
            # text="Data source: [1] miserables.json",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=Font(
                size=14
            )
        )
    ]), )

data = Data([trace1, trace2])
fig = Figure(data=data, layout=layout)

py.iplot(fig)

--------------------------------------------------------------------------------
/Link_Prediction.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[2]:

# Read input feature values using numpy.
import numpy as np
from igraph import *

global num_of_feat
num_of_feat = 347
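

# In[ ]:

# Note (a sketch, not part of the original notebook): the README also lists
# neighborhood-based heuristics as classical link-prediction baselines. With
# an undirected igraph Graph `g` and vertex ids `u`, `v`, they would be:
#
#     common = len(set(g.neighbors(u)) & set(g.neighbors(v)))   # common neighbors
#     jaccard = g.similarity_jaccard(pairs=[(u, v)])[0]         # Jaccard's coefficient
#     pref = g.degree(u) * g.degree(v)                          # preferential attachment
#
# Ranking candidate pairs by such a score is an alternative to the trained
# classifier built below.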
# In[3]:

def load_dataset(fileName, g):
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        print('fileName=', fileName)
        f = open(fileName)
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return

# NOTE: load_neg_dataset duplicates load_dataset (nodeID is never used);
# it is kept as-is, but load_dataset can be called instead.
def load_neg_dataset(fileName, g):
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        print('fileName=', fileName)
        f = open(fileName)
        nodeID = eachNum
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return

def load_and_shape_input(file_name):
    # Load a .feat file and drop the leading node-id column.
    a = np.loadtxt(fname=file_name)
    slice_D = [a[i][1:] for i in range(0, num_of_feat)]
    c = np.asarray(slice_D)
    return c

def load_shape_input(file_name_array):
    # Build a dictionary mapping node id -> feature vector.
    features = dict()
    for eachname in file_name_array:
        file_name = 'Datasets/facebook/' + str(eachname) + '.feat'
        a = np.loadtxt(file_name)
        for eachFeat in a:
            features[eachFeat[0]] = np.asarray(eachFeat[1:])
    return features


def addVertex(g, name_str):
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


# In[4]:

# Load feature vectors for ego network 0.
li = {0}
node_feat = load_shape_input(li)


# In[5]:

g = Graph()
load_dataset('Datasets/Self_Datasets/sample_train.edges', g)


not_g = Graph()
load_dataset('Datasets/Self_Datasets/negative_train.edges', not_g)


# In[6]:

print(type(node_feat))
for eachKey in node_feat.values():
    print(len(eachKey))
    print(type(eachKey))


# In[7]:

# print(node_feat[np.float64(0)])


# In[8]:

# print('positive edges', len(g.es))
# print('negative edges', len(not_g.es))
# t = retrieve_edge_name_tuple(g, (0, 1))
# node_feat[np.float64(t[0])]


# In[9]:

def make_class_arrays(g, datalabel):
    # One training example per edge: the element-wise sum of the two endpoint
    # feature vectors. (datalabel is unused; class labels are attached later.)
    output_list = list()
    edgeSet = g.es
    for eachTuple in edgeSet:
        tuple_name = retrieve_edge_name_tuple(g, eachTuple.tuple)
        print('eachTuple=', tuple_name)
        output = np.add(node_feat[np.float64(tuple_name[0])], node_feat[np.float64(tuple_name[1])])
        output_list.append(output)
    return np.asarray(output_list)


# In[10]:

valid_g = Graph()
load_dataset('Datasets/Self_Datasets/sample_valid.edges', valid_g)
# node_feat = load_and_shape_input("Datasets/facebook/0.feat")


valid_not_g = Graph()
load_dataset('Datasets/Self_Datasets/negative_valid.edges', valid_not_g)


# In[11]:

# print(len(node_feat[np.float64(345)]))


# In[12]:

x_positive = make_class_arrays(g, 1)
x_negative = make_class_arrays(not_g, 1)


# In[13]:

print(x_positive.shape)
print(x_negative.shape)


# In[14]:

valid_x_positive = make_class_arrays(valid_g, 1)
valid_x_negative = make_class_arrays(valid_not_g, 1)


# In[15]:

print(valid_x_positive.shape)
print(valid_x_negative.shape)


# In[16]:

y_positive = np.full(shape=(x_positive.shape[0], 1), fill_value=1.0)
y_negative = np.full(shape=(x_negative.shape[0], 1), fill_value=0.0)


# In[17]:

print(y_positive.shape)
print(y_negative.shape)


# In[18]:

valid_y_positive = np.full(shape=(valid_x_positive.shape[0], 1), fill_value=1.0)
valid_y_negative = np.full(shape=(valid_x_negative.shape[0], 1), fill_value=0.0)


# In[19]:

print(valid_x_positive.shape)
print(valid_x_negative.shape)
print(valid_y_positive.shape)
print(valid_y_negative.shape)


# In[20]:

print(valid_y_positive.shape)


# In[21]:

train_X = np.append(x_positive, x_negative, axis=0)
train_Y = np.append(y_positive, y_negative, axis=0)

valid_X = np.append(valid_x_positive, valid_x_negative, axis=0)
valid_Y = np.append(valid_y_positive, valid_y_negative, axis=0)


# In[22]:

print(type(x_positive))
print(valid_X.shape)
print(type(x_negative))
print(valid_Y.shape)
print(type(y_positive))
print(y_positive.shape)
print(train_X.shape)
print(1592 + 1748)


# In[23]:

from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)


# In[97]:

# clf.fit(digits.data[:-1], digits.target[:-1])
reg.fit(X=train_X[:-1], y=train_Y[:-1])


# In[98]:

reg.predict(train_X[-1:])


# In[91]:

len(reg.predict(valid_X))


# In[100]:

np.mean((reg.predict(valid_X) - valid_Y) ** 2)


# In[24]:

from sklearn.metrics import log_loss
# Ridge outputs are not probabilities; clip them into [0, 1] before
# computing the log loss.
log_loss(valid_Y, np.clip(reg.predict(valid_X), 0.0, 1.0))


# In[29]:

from sklearn import svm
clf_svm = svm.SVC()
clf_svm.fit(X=train_X[:-1], y=train_Y[:-1].ravel())  # SVC expects a 1-d label array


# In[31]:

from sklearn.metrics import log_loss
# predict() returns hard 0/1 labels; decision_function() would give a
# smoother estimate, but the hard-label loss is kept as in the original.
log_loss(valid_Y, clf_svm.predict(valid_X))


# In[ ]:

from sklearn.neighbors import NearestNeighbors

--------------------------------------------------------------------------------
/GenerateOtherDatasets.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[9]:

# Load the training dataset.
from igraph import *

def load_dataset(g):
    fileNums = [0]
    for i, eachNum in enumerate(fileNums):
        print(eachNum)
        fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
        print('fileName=', fileName)
        # Open read-only; the original 'a+' append mode is unnecessary for
        # reading and positions reads at end-of-file on some platforms.
        f = open(fileName)
        nodeID = eachNum
        line = f.readline()
        while line != '':
            c = line.split()
            g = addVertex(g, c[0])
            g = addVertex(g, c[1])
            print('Adding ', c[0], '-->', c[1])
            g.add_edge(c[0], c[1])
            line = f.readline()
    g.simplify()
    return

def addVertex(g, name_str):
    try:
        if name_str not in g.vs['name']:
            print('Inserted node ', name_str)
            g.add_vertex(name=name_str)
        else:
            print('Node ', name_str, ' already present')
            print(g.vs.find(name_str).index)
    except KeyError:
        # An empty graph has no 'name' attribute yet.
        g.add_vertex(name=name_str)
    return g


def write_tuple_to_file(f, t):
    string = str(t[0]) + ' ' + str(t[1]) + '\n'
    f.write(string)

def retrieve_edge_name_tuple(g, t):
    a = (g.vs[t[0]]['name'], g.vs[t[1]]['name'])
    return a


# In[10]:

g = Graph()
# load_dataset(g)


# In[3]:

# d = open("Datasets/Self_Datasets/some.txt", 'a+')
# d.write('Hello')
# d.close()
# d = open("Datasets/Self_Datasets/some.txt", 'a+')
# d.write("sucker")
# d.close()


# In[11]:

import random

def generate_datasets(g, num, train_filename, valid_filename, test_filename):
    # Split the edge set by repeatedly deleting a random edge and writing it
    # to the train/test/validation files: 50% train, 25% test, 15% validation.
    # (The num argument is unused; 'a+' appends, so re-running accumulates edges.)
    load_dataset(g)
    f = open(train_filename, 'a+')
    global train_num
    train_num = int(len(g.es) * 0.5)
    print('train length=', train_num)
    global test_num
    test_num = int(len(g.es) * 0.25)
    global valid_num
    valid_num = int(len(g.es) * 0.15)
    print('valid num=', valid_num)
    for i in range(train_num):
        edgeSet = g.es
        r = random.randint(0, len(edgeSet) - 1)
        t = edgeSet[r].tuple
        g.delete_edges(t)
        print('len of es=', len(edgeSet))
        write_tuple_to_file(f, retrieve_edge_name_tuple(g, t))
    f.close()
    f = open(test_filename, 'a+')
    for i in range(test_num):
        edgeSet = g.es
        r = random.randint(0, len(edgeSet) - 1)
        print('r=', r)
        t = edgeSet[r].tuple
        g.delete_edges(t)
        print('len of es=', len(edgeSet))
        write_tuple_to_file(f, retrieve_edge_name_tuple(g, t))
    f.close()
    f = open(valid_filename, 'a+')
    for i in range(valid_num):
        edgeSet = g.es
        if len(edgeSet) == 0:
            break
        print('len of es=', len(edgeSet))
        r = random.randint(0, len(edgeSet) - 1)
        print('r=', r)
        t = edgeSet[r].tuple
        g.delete_edges(t)
        write_tuple_to_file(f, retrieve_edge_name_tuple(g, t))
    # Close unconditionally; the original only closed this file when the
    # graph ran out of edges.
    f.close()
    print('I am done')


# In[12]:

generate_datasets(g, len(g.es) / 10, 'Datasets/Self_Datasets/sample_train.edges', 'Datasets/Self_Datasets/sample_valid.edges', 'Datasets/Self_Datasets/sample_test.edges')


# In[13]:

# train length=1426 valid=427
print(train_num)


# In[15]:

# Generate negative examples (class label 0.0): collect every unconnected
# ordered pair of distinct vertices from the adjacency matrix.
mat = g.get_adjacency()

pool_of_empty = list()
count = 0
for i, entireNode in enumerate(mat):
    for j, eachVal in enumerate(entireNode):
        if eachVal == 0 and i != j:
            count += 1
            pool_of_empty.append((i, j))
print('count=', count)


# In[20]:

# Drop pairs starting at vertex 0. The original called remove() while
# iterating over the same list, which skips elements; rebuild it instead.
pool_of_empty = [each for each in pool_of_empty if each[0] != 0]


# In[21]:

import random

def generate_negative_examples(pool, trainfilename, trainnum, validfilename, validnum, testfilename, testnum):
    f = open(trainfilename, 'a+')
    for i in range(0, trainnum):
        r = random.randint(0, len(pool) - 1)
        t = pool.pop(r)  # pop by index instead of remove-by-value
        f.write(str(t[0]) + ' ' + str(t[1]) + '\n')
    f.close()
    f = open(validfilename, 'a+')
    for i in range(0, validnum):
        r = random.randint(0, len(pool) - 1)
        t = pool.pop(r)
        f.write(str(t[0]) + ' ' + str(t[1]) + '\n')
    f.close()
    f = open(testfilename, 'a+')
    for i in range(0, testnum):
        r = random.randint(0, len(pool) - 1)
        t = pool.pop(r)
        f.write(str(t[0]) + ' ' + str(t[1]) + '\n')
    f.close()


# In[22]:

generate_negative_examples(pool_of_empty, 'Datasets/Self_Datasets/negative_train.edges', train_num, 'Datasets/Self_Datasets/negative_valid.edges', valid_num, 'Datasets/Self_Datasets/negative_test.edges', test_num)


# In[28]:

# NOT NEEDED

# Code to generate the negative-edge graph.
from igraph import *

nodes = set()
fileNums = [0]
for i, eachNum in enumerate(fileNums):
    print(eachNum)
    fileName = "Datasets/facebook/edges/" + str(eachNum) + ".edges"
    print('fileName=', fileName)
    f = open(fileName)
    nodes.add(eachNum)
    line = f.readline()
    while line != '':
        c = line.split()
        nodes.add(c[0])
        print('added ', c[0])
        nodes.add(c[1])
        print('added ', c[1])
        line = f.readline()
print('Length=', len(nodes))
print(nodes)


# The scratch cells below referenced a graph `x` that was only ever built in
# an interactive session and is undefined in this script; they are commented
# out so the file runs end to end.

# In[49]:

# print(x.vs[2]['name'])


# In[82]:

print(len(pool_of_empty))
print(type(pool_of_empty))


# In[58]:

# print(len(x.vs))
# print(334 * 334)


# In[32]:

# print(x.es[0].tuple)


# In[44]:

# print(x.vs.find('236').index)


# In[91]:

# try:
#     print(x.get_eid(x.vs.find('0'), x.vs.find('83'), directed=False))
# except InternalError:
#     print("Edge doesn't exist")


# In[30]:

# print(type(x.vs))
# q = set(x.vs['name'])
# print(len(q))
# print(len(nodes))
# print(q.pop())
# print(nodes.pop())


# In[36]:

# print(x.get_eid(x.vs.find('236'), x.vs.find('236'), directed=False))

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Social Networks: Link Prediction

### Capstone Project: Data and Knowledge Engineering

#### Video link: [Social Network Analysis: Link Prediction](https://youtu.be/XRMhgxW-C_M)

---

### ✨Contributors: Group 1✨

- Pranjal Mathur (1410110296)
- Prerna (1410110306)
- Saketh Vallakatla (1410110352)

------

## Problem Statement

> Given a set of nodes (users) in a social network graph, the aim is to find the influential (important) users and to predict the likelihood of a future association (edge) between two nodes, given that no association exists between them in the current state of the graph.


## Motivation

The edges described in the problem statement can take many forms: friendship, collaboration, following, or mutual interests. Here we specifically study and build our model on Facebook's social network, motivated by the following applications:

* Recommending friends to a particular user.
* Predicting hidden links in a social network formed by terrorists, along with identifying their leaders/key influencers.
* Targeted marketing of products: marketing through highly influential individuals and identifying plausible customers.
* Suggesting promising interactions or collaborations within an organization that have not yet been identified.
* In bioinformatics, link prediction can be used to find interactions between proteins.

The model can be extended or modified to cater to other social networks such as Twitter, Google+, Foursquare, etc.

## Knowledge Engineering Process

Discussed below are the four major knowledge engineering tasks:

### Acquisition & Learning
##### Data:
* Acquired from http://snap.stanford.edu/data/egonets-Facebook.html

* This dataset consists of 'circles' (or 'friends lists') from Facebook.

* This anonymized dataset includes node features (profiles), circles, and ego networks.

* The edges are **undirected**.

* 10 ego-networks, consisting of 193 circles and 4,039 users.

* Features of the nodes are described in the following format: `[Type]:[Subtype]:attributeName`

* The following figure shows an example of the attributes and how the feature array is formed.

##### Domain Knowledge:

The following insights are useful when building a model for link prediction:

* The idea is to assign a connection weight `score(x, y)` to each pair of nodes `⟨x, y⟩`, based on the input graph.

* The approaches adopted so far can be classified into:
    * *Methods based on node neighborhoods*: A number of approaches are based on the idea that two nodes x and y are more likely to form a link in the future if their sets of neighbors Γ(x) and Γ(y) have large overlap. Examples:
        * Common neighbors
        * Jaccard's coefficient
        * Preferential attachment
    * *Methods based on the ensemble of all paths*: A number of methods refine the notion of shortest-path distance by implicitly considering the ensemble of all paths between two nodes.

* Since we had multiple features associated with each node in an ego network, we based our experiment on the **similarity of features between the two nodes**.

* Machine-learning models such as the Support Vector Machine can classify node pairs into two classes: (1) connection and (2) no connection; neural networks and regression techniques can be used for the same task.


##### Task:

> Given an unweighted, undirected graph `G = ⟨V,E⟩` representing the topological structure of a social network, in which each edge `e = ⟨u,v⟩ ∈ E` represents an interaction between u and v that took place at a particular time `t(e)`, the two tasks can be described as:
>
> * **Find the highly influencing/central node set N.**
>
>
> * **For a time T greater than t(e), predict and output the list of edges not present at t(e).**

### Representation:
##### Data:

* To represent the complex data structure of a graph with various features attached to each node, `python-igraph` is used.
* A `dictionary` data structure stores the corresponding feature vector of each node, as sketched below.
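
A minimal sketch of this representation (assuming ego network `0` and the `Datasets/facebook/...` layout used throughout the repository; not code lifted from the scripts):

```python
import numpy as np
from igraph import Graph

# Graph: one igraph vertex per node name, edges read from the .edges file.
g = Graph()
with open('Datasets/facebook/edges/0.edges') as f:
    for line in f:
        a, b = line.split()
        for name in (a, b):
            if 'name' not in g.vertex_attributes() or name not in g.vs['name']:
                g.add_vertex(name=name)
        g.add_edge(a, b)
g.simplify()  # drop duplicate edges and self-loops

# Dictionary: node id -> feature vector, read from the .feat file.
features = {}
for row in np.loadtxt('Datasets/facebook/0.feat'):
    features[row[0]] = row[1:]
```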
### Development and Explanation:
##### Approach:

* ***Measures for Centrality***: As part of our analysis, we used the following four centrality measures:
    * `Degree of nodes`:
        - Core idea: find the nodes with the highest number of immediate neighbors (degree).
        - Input: graph and a node
        - Output: degree of the node.
    * `Closeness Centrality`:
        * Core idea: a central node is one that is close, on average, to other nodes.
        * Input: graph and a node
        * Output: a value in [0,1] after standardization (1 being highly central)
    * `Betweenness Centrality`:
        * Core idea: a central actor is one that acts as a bridge, broker, or gatekeeper.
        * Input: graph and a node
        * Output: a value in [0,1] after normalization (1 being highly central)
    * `Eigenvector centrality`:
        * Core idea: a central actor is connected to other central actors.
        * Input: graph
        * Output: a value in [0,1]
* ***Link Prediction***: Based on our survey [1], usability criteria, and experiments, we used the following machine-learning approach:
    * `Support Vector Machine` classification algorithm:
        * Core idea: segregating the two classes with a hyperplane.
        * Here, the two classes are: linked and unlinked.
        * Input:
            * graph dataset (separately for each ego network), with labels attached
            * features (~230) dictionary
        * Output: predicted association between two nodes `[x,y]`:
            * 0 if no association
            * 1 otherwise.
    * We divide our dataset in the ratio **2:1:1** for **Train:Validation:Test**.



##### Python Libraries used:

* `Plotly`: Graphing library for making interactive, publication-quality graphs online.
* `IGraph`: A collection of network-analysis tools with an emphasis on efficiency, portability, and ease of use. igraph is open source and free.
* `Numpy`: Adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays. (scikit-learn dependency)
* `Scipy`: An open-source Python library used for scientific and technical computing. (scikit-learn dependency)
* `Scikit-Learn`: Simple and efficient tools for data mining and data analysis. Used for dimensionality reduction and for implementing the machine-learning algorithms.

### Validation: Performance Evaluation:
##### Evaluation Interpretation:

| Criteria | Formula | Score | Interpretation |
| --------------- | ---------------------------------------- | ----- | ---------------------------------------- |
| Accuracy | ![ACC = (TP+TN)/(P+N)](https://wikimedia.org/api/rest_v1/media/math/render/svg/31f7e08f6490e7182038c4ce27b87c483d6c3b4a) | 70% | The predicted results were correct 70% of the time. |
| Precision Score | ![PPV = TP/(TP+FP)](https://wikimedia.org/api/rest_v1/media/math/render/svg/699fcdb880b7f6a92742bc0845b8b60b59806a98) | 68% | The links (1) predicted were correct 68% of the time. |
| F1 Score | ![F1 = 2TP/(2TP+FP+FN)](https://wikimedia.org/api/rest_v1/media/math/render/svg/8b64097b6362d28387a0c4650f2fed2bc5ea9fe9) | 65% | Considers both the precision and the recall of the test to compute the score. |
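
The notebooks above report log-loss rather than these metrics; the scores in the table can be reproduced from the validation split with `scikit-learn`'s metric helpers. A minimal sketch, assuming `clf_svm`, `valid_X`, and `valid_Y` as defined in `Link_Prediction.py`:

```python
from sklearn.metrics import accuracy_score, precision_score, f1_score

preds = clf_svm.predict(valid_X)   # hard 0/1 predictions
y_true = valid_Y.ravel()           # flatten the (n, 1) label column

print('Accuracy :', accuracy_score(y_true, preds))   # (TP+TN)/(P+N)
print('Precision:', precision_score(y_true, preds))  # TP/(TP+FP)
print('F1 score :', f1_score(y_true, preds))         # 2TP/(2TP+FP+FN)
```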

## Replicating the results

To run the code provided in `SocialNetworkAnalysis.zip`:

* Unzip the file.

* Set up the execution environment:

  * Python 2.7 (the scripts use `urllib2`, which does not exist on Python 3).
  * Install the Python libraries described in the previous section.

* Set up the dataset:

  ```
  python2.7 GenerateOtherDatasets.py
  ```

* To compute the centrality measures and visualize the network:

  ```
  python2.7 Centrality.py
  ```

* For link prediction and evaluation:

  ```
  python2.7 Link_Prediction.py
  ```

------

[1] Mahdi Jalili et al., "Link prediction in multiplex online social networks."
--------------------------------------------------------------------------------