├── D2AGE ├── symmetric │ ├── toolsFunction.py │ ├── evaluateTools.py │ ├── proxEmbedBySubgraphProcessModelBatch.py │ ├── directedGraphLSTMModel.py │ ├── pythonParamsConfig │ ├── proxEmbedBySubgraphModel.py │ ├── proxEmbedBySubgraphProcessAndAssess.py │ ├── experimentForOneFileByParams.py │ ├── prepareSubgraphsWithAllSubpaths.py │ ├── dataProcessTools.py │ └── proxEmbedBySubgraphs.py └── asymmetric │ ├── proxEmbedBySubgraphProcessModelBatch.py │ ├── evaluateTools.py │ ├── toolsFunction.py │ ├── directedGraphLSTMModel.py │ ├── pythonParamsConfig │ ├── proxEmbedBySubgraphModel.py │ ├── proxEmbedBySubgraphProcessAndAssess.py │ ├── prepareSubgraphsWithAllSubpaths.py │ ├── experimentForOneFileByParams.py │ ├── dataProcessTools.py │ └── proxEmbedBySubgraphs.py └── README.md /D2AGE/symmetric/toolsFunction.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | ''' 4 | 5 | import numpy 6 | import theano 7 | from theano import tensor 8 | from theano.ifelse import ifelse 9 | 10 | def mapSortByValueDESC(map,top): 11 | """ 12 | sort DESC 13 | """ 14 | if top>len(map): 15 | top=len(map) 16 | items=map.items() 17 | backitems=[[v[1],v[0]] for v in items] 18 | backitems.sort(reverse=True) 19 | # backitems.sort() 20 | e=[ backitems[i][1] for i in range(top)] 21 | return e 22 | 23 | 24 | def mapSortByValueASC(map,top): 25 | """ 26 | sort ASC 27 | """ 28 | if top>len(map): 29 | top=len(map) 30 | items=map.items() 31 | backitems=[[v[1],v[0]] for v in items] 32 | # backitems.sort(reverse=True) 33 | backitems.sort() 34 | e=[ backitems[i][1] for i in range(top)] 35 | return e 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # D2AGE 2 | 3 | This is the source code for our paper: \ 4 | Distance-aware DAG Embedding for Proximity Search on Heterogeneous Graphs. AAAI, 2018. 5 | 6 | ============================================================ 7 | 8 | Files list:\ 9 | 1). D2AGE : the main directory for the source code \ 10 | 2). readme : this file 11 | 12 | The code is written in Python 2.7, and we use Theano for model development. You should generate the subpaths yourself, that is: \ 13 | 1) run random walks on the given graph, \ 14 | 2) truncate the subpaths from the sampled paths and save them to a file. 15 | 16 | After this step, you can use this code to first generate the DAGs, and then model them with D2AGE. 17 | 18 | ============================================================ 19 | 20 | D2AGE directory 21 | 22 | There are two directories in /D2AGE/: symmetric and asymmetric. 23 | The symmetric dir contains the source code for symmetric relations, while the asymmetric dir contains the source code for asymmetric relations. Below we use the symmetric version to explain the details. 24 | 25 | In /D2AGE/symmetric, \ 26 | 1) pythonParamsConfig : sets all the parameters used in the model; each parameter is explained in that file. \ 27 | 2) prepareSubgraphsWithAllSubpaths.py : generates the DAGs between (q,v) from the given sampled subpaths. \ 28 | 3) experimentForOneFileByParams.py : after DAG generation, use this file to train the model, and then test it. 29 | 30 | The methods in the other files are called from the above three files.
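As a rough, hypothetical sketch of steps 1) and 2) above: the helper below is not part of this repository, and the in-memory graph format, function names, and truncation rule are assumptions; the resulting subpaths still have to be written out in whatever on-disk format dataProcessTools.loadAllSubPaths expects.

```python
import random

def sample_random_walks(graph, start, walk_len=20, num_walks=10):
    """Sample fixed-length random walks from `start`; graph is {node: [neighbors]}."""
    walks = []
    for _ in range(num_walks):
        walk = [start]
        for _ in range(walk_len - 1):
            neighbors = graph.get(walk[-1], [])
            if not neighbors:
                break
            walk.append(random.choice(neighbors))
        walks.append(walk)
    return walks

def truncate_subpaths(walks, query, candidate, maxlen_subpaths=1000):
    """Keep the walk segments running between query q and candidate v as subpaths."""
    subpaths = []
    for walk in walks:
        if query in walk and candidate in walk:
            i, j = walk.index(query), walk.index(candidate)
            lo, hi = min(i, j), max(i, j)
            segment = walk[lo:hi + 1]
            if 2 <= len(segment) <= maxlen_subpaths:
                subpaths.append(segment)
    return subpaths
```

Each subpath here is simply the segment of a sampled walk lying between the query node q and a candidate node v, kept only if it is no longer than maxlen_subpaths.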
31 | 32 | ============================================================ 33 | 34 | If you use the code, please cite our paper: 35 | 36 | @inproceedings{liu2018distance, \ 37 | title={Distance-aware DAG Embedding for Proximity Search on Heterogeneous Graphs}, \ 38 | author={Liu, Zemin and Zheng, Vincent W and Zhao, Zhou and Zhu, Fanwei and Chang, Kevin Chen-Chuan and Wu, Minghui and Ying, Jing}, \ 39 | year={2018}, \ 40 | organization={AAAI} \ 41 | } 42 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphProcessModelBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | the processing model 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | import directedGraphLSTMModel 10 | from theano.ifelse import ifelse 11 | 12 | def proxEmbedBySubgraphProcessModel(options, tparams): 13 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 14 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples * maxlen * maxlen 15 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 16 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 17 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 18 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 19 | nodesLens=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 20 | 21 | def _processSubgraph(i): 22 | length=lengths[i] 23 | x=xs[i,:length] 24 | mask=masks[i,:length,:length] 25 | nodesLen=nodesLens[i,:length] 26 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 27 | return emb 28 | 29 | embx=None 30 | rval,update=theano.scan( 31 | _processSubgraph, 32 | sequences=tensor.arange(lengths.shape[0]), 33 | ) 34 | rval=discountModel(options['discount_alpha'], subgraph_lens)[:,None]*rval 35 | embx=rval.max(axis=0) 36 | 37 | score=tensor.dot(embx,tparams['w']) 38 | 39 | return xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score 40 | 41 | def discountModel(alpha,length): 42 | """ 43 | discount 44 | """ 45 | return tensor.exp(alpha*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/symmetric/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluate criteria 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | """ 11 | ideal=set(ideal) 12 | accumulation=0.0 13 | count=0 14 | for i in range(len(test)): 15 | if i>=k: 16 | break 17 | if test[i] in ideal: 18 | count+=1 19 | accumulation+=count/(i+1.0) 20 | m=len(ideal) 21 | n=k 22 | x=0 23 | if m>n: 24 | x=n 25 | else: 26 | x=m 27 | if x==0: 28 | return 0 29 | return accumulation/x 30 | 31 | 32 | def get_MAP(k,ideal_map,test_map): 33 | """ 34 | """ 35 | accumulation=0.0 36 | for key in ideal_map.keys(): 37 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 38 | if len(ideal_map)==0: 39 | return 0 40 | return accumulation/len(ideal_map) 41 | 42 | 43 | def get_nDCG(k,ideal,test): 44 | """ 45 | """ 46 | ideal=set(ideal) 47 | accumulation=0.0 48 | for i in range(len(test)): 49 | if i>=k: 50 | break 51 | if test[i] in ideal: 52 | if i==0: 53 | accumulation+=1.0 54 
| else: 55 | accumulation+=1.0/numpy.log2(i+1) 56 | normalization=0.0 57 | for i in range(len(ideal)): 58 | if i>=k: 59 | break 60 | if i==0: 61 | normalization+=1.0 62 | else: 63 | normalization+=1.0/numpy.log2(i+1) 64 | if normalization==0: 65 | return 0 66 | return accumulation/normalization 67 | 68 | def get_MnDCG(k,ideal_map,test_map): 69 | """ 70 | """ 71 | accumulation=0.0 72 | for key in ideal_map.keys(): 73 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 74 | if len(ideal_map)==0: 75 | return 0 76 | return accumulation/len(ideal_map) 77 | 78 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluate tools 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | """ 11 | ideal=set(ideal) 12 | accumulation=0.0 13 | count=0 14 | for i in range(len(test)): 15 | if i>=k: 16 | break 17 | if test[i] in ideal: 18 | count+=1 19 | accumulation+=count/(i+1.0) 20 | m=len(ideal) 21 | n=k 22 | x=0 23 | if m>n: 24 | x=n 25 | else: 26 | x=m 27 | if x==0: 28 | return 0 29 | return accumulation/x 30 | 31 | 32 | def get_MAP(k,ideal_map,test_map): 33 | """ 34 | """ 35 | accumulation=0.0 36 | for key in ideal_map.keys(): 37 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 38 | if len(ideal_map)==0: 39 | return 0 40 | return accumulation/len(ideal_map) 41 | 42 | 43 | def get_nDCG(k,ideal,test): 44 | """ 45 | """ 46 | ideal=set(ideal) 47 | accumulation=0.0 48 | for i in range(len(test)): 49 | if i>=k: 50 | break 51 | if test[i] in ideal: 52 | if i==0: 53 | accumulation+=1.0 54 | else: 55 | accumulation+=1.0/numpy.log2(i+1) 56 | normalization=0.0 57 | for i in range(len(ideal)): 58 | if i>=k: 59 | break 60 | if i==0: 61 | normalization+=1.0 62 | else: 63 | normalization+=1.0/numpy.log2(i+1) 64 | if normalization==0: 65 | return 0 66 | return accumulation/normalization 67 | 68 | def get_MnDCG(k,ideal_map,test_map): 69 | """ 70 | """ 71 | accumulation=0.0 72 | for key in ideal_map.keys(): 73 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 74 | if len(ideal_map)==0: 75 | return 0 76 | return accumulation/len(ideal_map) 77 | 78 | 79 | -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphProcessModelBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | process model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | import directedGraphLSTMModel 12 | from theano.ifelse import ifelse 13 | 14 | def proxEmbedBySubgraphProcessModel(options, tparams): 15 | """ 16 | """ 17 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 18 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples * maxlen * maxlen 19 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 20 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 21 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 22 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 23 | nodesLens=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 24 | 25 | def 
_processSubgraph(i): 26 | length=lengths[i] 27 | x=xs[i,:length] 28 | mask=masks[i,:length,:length] 29 | nodesLen=nodesLens[i,:length] 30 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 31 | return emb 32 | 33 | embx=None 34 | rval,update=theano.scan( 35 | _processSubgraph, 36 | sequences=tensor.arange(lengths.shape[0]), 37 | ) 38 | 39 | rval=discountModel(options['discount_alpha'], subgraph_lens)[:,None]*rval 40 | embx=rval.max(axis=0) 41 | 42 | score=tensor.dot(embx,tparams['w']) 43 | 44 | return xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score 45 | 46 | def discountModel(alpha,length): 47 | """ 48 | discount 49 | """ 50 | return tensor.exp(alpha*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/asymmetric/toolsFunction.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2016-08-09 4 | 5 | @author: Administrator 6 | 7 | This file mainly contains some utility (tool) methods. 8 | ''' 9 | 10 | import numpy 11 | import theano 12 | from theano import tensor 13 | from theano.ifelse import ifelse 14 | 15 | def mapSortByValueDESC(map,top): 16 | """ 17 | Sort the map by value in descending order and return the top keys, 18 | where top<=len(map). 19 | Tested. 20 | """ 21 | if top>len(map): # if the given top is larger than the size of the map, shrink top to len(map) 22 | top=len(map) 23 | items=map.items() 24 | backitems=[[v[1],v[0]] for v in items] # swap key and value 25 | backitems.sort(reverse=True) # reverse=True means descending order 26 | # backitems.sort() # ascending order 27 | e=[ backitems[i][1] for i in range(top)] # return the keys in order 28 | return e 29 | 30 | 31 | def mapSortByValueASC(map,top): 32 | """ 33 | Sort the map by value in ascending order and return the top keys, 34 | where top<=len(map). 35 | Tested. 36 | """ 37 | if top>len(map): # if the given top is larger than the size of the map, shrink top to len(map) 38 | top=len(map) 39 | items=map.items() 40 | backitems=[[v[1],v[0]] for v in items] # swap key and value 41 | # backitems.sort(reverse=True) # reverse=True means descending order 42 | backitems.sort() # ascending order 43 | e=[ backitems[i][1] for i in range(top)] # return the keys in order 44 | return e 45 | 46 | 47 | def max_poolingForMatrix(x): 48 | """ 49 | Use theano.scan to compute max-pooling. 50 | x is the matrix to pool; the pooling keeps the element with the largest absolute value column-wise. 51 | Tested. 52 | """ 53 | def _funcForRow(row,max_array): 54 | """ 55 | process each row 56 | """ 57 | def _funcForElement(element,max_value): 58 | """ 59 | process each element 60 | """ 61 | # return tensor.switch(tensor.gt(tensor.abs_(element), tensor.abs_(max_value)), element, max_value) 62 | return ifelse(tensor.gt(tensor.abs_(element), tensor.abs_(max_value)), element, max_value) 63 | 64 | r,u=theano.scan( 65 | fn=_funcForElement, 66 | sequences=[row,max_array], 67 | ) 68 | # r is the running max_array after processing this row 69 | return r 70 | 71 | rval,update=theano.scan( 72 | fn=_funcForRow, 73 | sequences=x, 74 | outputs_info=tensor.alloc(numpy.asarray(0., dtype=theano.config.floatX), # a zero vector of length x.shape[1] @UndefinedVariable 75 | x.shape[1], 76 | ), 77 | ) 78 | # the last entry of rval is the final column-wise abs-max result 79 | return rval[-1] -------------------------------------------------------------------------------- /D2AGE/asymmetric/directedGraphLSTMModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | DAG LSTM model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | 12 | def directedGraphLSTMModel(options, tparams, x, mask, wemb, buffer_tensor, nodesLen): 13 | 14 | length=x.shape[0] 15 | dimension=wemb.shape[1] 16 | 17 |
proj=wemb[x] 18 | discount_vector=discountModel(options['discount_beta'], nodesLen) 19 | 20 | def _step(index,hArr,cArr): 21 | 22 | hi_sum=None # shape=dimension*0 23 | 24 | discount=mask[index]*discount_vector # shape=maxlen*0 25 | hi_sum=(discount[:,None] * hArr).max(axis=0) 26 | 27 | # input gate, vector, shape= lstm_dimension * 0 28 | i=tensor.nnet.sigmoid(tensor.dot(tparams['Wi'], proj[index]) + tensor.dot(tparams['Ui'], hi_sum) + tparams['bi']) 29 | # forget gate, vector, shape= maxlen*lstm_dimension 30 | f=tensor.nnet.sigmoid(tensor.dot(tparams['Wf'], proj[index]) + tensor.dot((mask[index])[:,None]*hArr, tparams['Uf']) + tparams['bf']) 31 | # output gate, vector, shape= lstm_dimension * 0 32 | o=tensor.nnet.sigmoid(tensor.dot(tparams['Wo'], proj[index]) + tensor.dot(tparams['Uo'], hi_sum) + tparams['bo']) 33 | # new temp cell, vector, shape= lstm_dimension * 0 34 | c_=tensor.tanh(tensor.dot(tparams['Wc'], proj[index]) + tensor.dot(tparams['Uc'], hi_sum) + tparams['bc']) 35 | 36 | c=None 37 | 38 | c=i*c_ + (discount[:,None] * (f * ((mask[index])[:,None]*cArr))).max(axis=0) 39 | 40 | h=o*tensor.tanh(c) 41 | 42 | hArr=tensor.set_subtensor(hArr[index, :], h) 43 | cArr=tensor.set_subtensor(cArr[index, :], c) 44 | 45 | return hArr, cArr 46 | 47 | rval, update=theano.scan( 48 | _step, 49 | sequences=tensor.arange(x.shape[0]), 50 | outputs_info=[tensor.alloc(numpy_floatX(0.), length, options['dimension']),# @UndefinedVariable 51 | tensor.alloc(numpy_floatX(0.), length, options['dimension'])],# @UndefinedVariable 52 | ) 53 | if options['h_output_method']=='h': 54 | return rval[0][-1][-1] 55 | elif options['h_output_method']=='mean-pooling': 56 | return rval[0][-1].mean(axis=0) 57 | elif options['h_output_method']=='max-pooling': 58 | return rval[0][-1].max(axis=0) 59 | else: 60 | return rval[0][-1][-1] 61 | 62 | def numpy_floatX(data): 63 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 64 | 65 | def discountModel(beta,length): 66 | """ 67 | """ 68 | return tensor.exp(beta*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/symmetric/directedGraphLSTMModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | DAG LSTM model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | 12 | def directedGraphLSTMModel(options, tparams, x, mask, wemb, buffer_tensor, nodesLen): 13 | 14 | """ 15 | """ 16 | 17 | length=x.shape[0] 18 | dimension=wemb.shape[1] 19 | 20 | proj=wemb[x] 21 | discount_vector=discountModel(options['discount_beta'], nodesLen) 22 | 23 | def _step(index,hArr,cArr): 24 | 25 | 26 | hi_sum=None 27 | 28 | discount=mask[index]*discount_vector # shape=maxlen*0 29 | hi_sum=(discount[:,None] * hArr).max(axis=0) 30 | 31 | # input gate, vector, shape= lstm_dimension * 0 32 | i=tensor.nnet.sigmoid(tensor.dot(tparams['Wi'], proj[index]) + tensor.dot(tparams['Ui'], hi_sum) + tparams['bi']) 33 | # forget gate, vector, shape= maxlen*lstm_dimension 34 | f=tensor.nnet.sigmoid(tensor.dot(tparams['Wf'], proj[index]) + tensor.dot((mask[index])[:,None]*hArr, tparams['Uf']) + tparams['bf']) 35 | # output gate, vector, shape= lstm_dimension * 0 36 | o=tensor.nnet.sigmoid(tensor.dot(tparams['Wo'], proj[index]) + tensor.dot(tparams['Uo'], hi_sum) + tparams['bo']) 37 | # new temp cell, vector, shape= lstm_dimension * 0 38 | c_=tensor.tanh(tensor.dot(tparams['Wc'], proj[index]) + 
tensor.dot(tparams['Uc'], hi_sum) + tparams['bc']) 39 | 40 | c=None 41 | c=i*c_ + (discount[:,None] * (f * ((mask[index])[:,None]*cArr))).max(axis=0) 42 | 43 | h=o*tensor.tanh(c) 44 | 45 | hArr=tensor.set_subtensor(hArr[index, :], h) 46 | cArr=tensor.set_subtensor(cArr[index, :], c) 47 | 48 | return hArr, cArr 49 | 50 | rval, update=theano.scan( 51 | _step, 52 | sequences=tensor.arange(x.shape[0]), 53 | outputs_info=[tensor.alloc(numpy_floatX(0.), length, options['dimension']),# @UndefinedVariable h 54 | tensor.alloc(numpy_floatX(0.), length, options['dimension'])],# @UndefinedVariable c 55 | ) 56 | if options['h_output_method']=='h': 57 | return rval[0][-1][-1] 58 | elif options['h_output_method']=='mean-pooling': 59 | return rval[0][-1].mean(axis=0) 60 | elif options['h_output_method']=='max-pooling': 61 | return rval[0][-1].max(axis=0) 62 | else: 63 | return rval[0][-1][-1] 64 | 65 | def numpy_floatX(data): 66 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 67 | 68 | def discountModel(beta,length): 69 | """ 70 | """ 71 | return tensor.exp(beta*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/asymmetric/pythonParamsConfig: -------------------------------------------------------------------------------- 1 | [param] 2 | 3 | ############################################ 4 | # training data dictory 5 | ############################################ 6 | # main work dir 7 | root_dir = D:/dataset/dataset 8 | # the name of one dataset, such as linkedin, facebook 9 | dataset_name = facebook 10 | # number of labels for each dataset, such as 10,100,1000 11 | suffix = 100 12 | # relatin name, such as classmate,family 13 | class_name = classmate 14 | # the index of the dataset file 15 | index = 1 16 | 17 | ############################################ 18 | # paths for some prepared data 19 | ############################################ 20 | # words embeddings path 21 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/nodesFeatures 22 | # sub-paths save file 23 | subpaths_file = %(root_dir)s/%(dataset_name)s/subpathsSaveFile 24 | # DAGs save files 25 | subgraphSaveFile = %(root_dir)s/%(dataset_name)s/subgraphSaveFile 26 | 27 | ############################################ 28 | # experiment parameters - do not need to change frequently 29 | ############################################ 30 | # the max length for sub-paths 31 | maxlen_subpaths = 1000 32 | # the size of words vocabulary 33 | wordsSize = 10000000 34 | # Sequence longer than this get ignored 35 | maxlen = 1000 36 | # use a batch for training. This is the size of this batch. 37 | batch_size = 10 38 | # if need shuffle for training 39 | is_shuffle_for_batch = True 40 | # the frequences for display 41 | dispFreq = 5 42 | # the frequences for saving the parameters 43 | saveFreq = 5 44 | # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. It will be generated in the code. 45 | saveto = 46 | # the top num to predict 47 | top_num = 10 48 | 49 | ############################################ 50 | # experiment parameters 51 | ############################################ 52 | # learning rate 53 | lrate = 0.0001 54 | # dimension of words embeddings 55 | word_dimension = 10 56 | # the dimension of paths embeddings 57 | dimension = 18 58 | 59 | # discount parameter alpha 60 | discount_alpha = 0.3 61 | # discount parameter beta 62 | discount_beta = 0.3 63 | # the output way of DAG-LSTM. 
There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 64 | h_output_method = max-pooling 65 | # loss function, we use sigmoid 66 | objective_function_method = sigmoid 67 | # the parameter in loss function, mu 68 | objective_function_param = 0.5 69 | # the max epochs for training 70 | max_epochs = 50 71 | # decay parameter lambda 72 | decay = 0.0001 73 | 74 | ############################################ 75 | # DAG generation parameters 76 | ############################################ 77 | # the number of generating DAGs between (q,v) 78 | subgraphNum = 0 79 | # if subgraphNum = 0, then use this proportion to generate the DAGs. subgraphNum=proportion * #(subpaths) 80 | proportion = 0.8 -------------------------------------------------------------------------------- /D2AGE/symmetric/pythonParamsConfig: -------------------------------------------------------------------------------- 1 | [param] 2 | 3 | ############################################ 4 | # training data dictory 5 | ############################################ 6 | # main work dir 7 | root_dir = D:/dataset/dataset 8 | # the name of one dataset, such as linkedin, facebook 9 | dataset_name = facebook 10 | # number of labels for each dataset, such as 10,100,1000 11 | suffix = 100 12 | # relatin name, such as classmate,family 13 | class_name = classmate 14 | # the index of the dataset file 15 | index = 1 16 | 17 | ############################################ 18 | # paths for some prepared data 19 | ############################################ 20 | # words embeddings path 21 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/nodesFeatures 22 | # sub-paths save file 23 | subpaths_file = %(root_dir)s/%(dataset_name)s/subpathsSaveFile 24 | # DAGs save files 25 | subgraphSaveFile = %(root_dir)s/%(dataset_name)s/subgraphSaveFile 26 | 27 | ############################################ 28 | # experiment parameters - do not need to change frequently 29 | ############################################ 30 | # the max length for sub-paths 31 | maxlen_subpaths = 1000 32 | # the size of words vocabulary 33 | wordsSize = 10000000 34 | # Sequence longer than this get ignored 35 | maxlen = 1000 36 | # use a batch for training. This is the size of this batch. 37 | batch_size = 10 38 | # if need shuffle for training 39 | is_shuffle_for_batch = True 40 | # the frequences for display 41 | dispFreq = 5 42 | # the frequences for saving the parameters 43 | saveFreq = 5 44 | # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. It will be generated in the code. 45 | saveto = 46 | # the top num to predict 47 | top_num = 10 48 | 49 | ############################################ 50 | # experiment parameters 51 | ############################################ 52 | # learning rate 53 | lrate = 0.0001 54 | # dimension of words embeddings 55 | word_dimension = 10 56 | # the dimension of paths embeddings 57 | dimension = 18 58 | 59 | # discount parameter alpha 60 | discount_alpha = 0.3 61 | # discount parameter beta 62 | discount_beta = 0.3 63 | # the output way of DAG-LSTM. 
There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 64 | h_output_method = max-pooling 65 | # loss function, we use sigmoid 66 | objective_function_method = sigmoid 67 | # the parameter in loss function, mu 68 | objective_function_param = 0.5 69 | # the max epochs for training 70 | max_epochs = 50 71 | # decay parameter lambda 72 | decay = 0.0001 73 | 74 | ############################################ 75 | # DAG generation parameters 76 | ############################################ 77 | # the number of generating DAGs between (q,v) 78 | subgraphNum = 0 79 | # if subgraphNum = 0, then use this proportion to generate the DAGs. subgraphNum=proportion * #(subpaths) 80 | proportion = 0.8 -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | Training Model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | from theano.ifelse import ifelse 12 | import directedGraphLSTMModel 13 | 14 | def proxEmbedBySubgraphModel(options, tparams): 15 | """ 16 | """ 17 | trainingPairs=tensor.tensor3('trainingPairs',dtype='int64') # 3D tensor,shape=#(triples)*4*2 18 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 19 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples*maxlen*maxlen 20 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 21 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 22 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 23 | 24 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 25 | nodesLens=tensor.matrix('nodesLens', dtype='int64') # shape=nsamples*maxlen 26 | 27 | def _processTuple(index , lossSum): 28 | tuple=trainingPairs[index] 29 | 30 | def _processSubgraph(i): 31 | length=lengths[i] 32 | x=xs[i,:length] 33 | mask=masks[i,:length,:length] 34 | nodesLen=nodesLens[i,:length] 35 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 36 | return emb 37 | 38 | def iftFunc(): 39 | embx=tensor.zeros(options['dimension'],).astype(theano.config.floatX) # @UndefinedVariable 40 | return embx 41 | 42 | def iffFunc(start, end): 43 | embx=None 44 | rval,update=theano.scan( 45 | _processSubgraph, 46 | sequences=tensor.arange(start,end), 47 | ) 48 | subgraph_len=subgraph_lens[start:end] 49 | 50 | rval=discountModel(options['discount_alpha'], subgraph_len)[:,None]*rval 51 | embx=rval.max(axis=0) 52 | 53 | return embx 54 | 55 | start=tuple[0][0] 56 | end=tuple[0][1] 57 | emb1=None 58 | emb1=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) 59 | 60 | start=tuple[2][0] 61 | end=tuple[2][1] 62 | emb2=None 63 | emb2=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) 64 | 65 | loss=0 66 | param=options['objective_function_param'] 67 | if options['objective_function_method']=='sigmoid': 68 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # 
sigmoid 69 | else: # hinge-loss 70 | value=param + tensor.dot(emb2,tparams['w']) - tensor.dot(emb1,tparams['w']) 71 | loss=value*(value>0) 72 | 73 | return loss+lossSum 74 | 75 | rval, update=theano.scan( 76 | _processTuple, 77 | sequences=tensor.arange(trainingPairs.shape[0]), 78 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable 79 | ) 80 | 81 | cost=rval[-1] 82 | cost+=options['decay']*(tparams['Wi'] ** 2).sum() 83 | cost+=options['decay']*(tparams['Wf'] ** 2).sum() 84 | cost+=options['decay']*(tparams['Wo'] ** 2).sum() 85 | cost+=options['decay']*(tparams['Wc'] ** 2).sum() 86 | cost+=options['decay']*(tparams['Ui'] ** 2).sum() 87 | cost+=options['decay']*(tparams['Uf'] ** 2).sum() 88 | cost+=options['decay']*(tparams['Uo'] ** 2).sum() 89 | cost+=options['decay']*(tparams['Uc'] ** 2).sum() 90 | cost+=options['decay']*(tparams['bi'] ** 2).sum() 91 | cost+=options['decay']*(tparams['bf'] ** 2).sum() 92 | cost+=options['decay']*(tparams['bo'] ** 2).sum() 93 | cost+=options['decay']*(tparams['bc'] ** 2).sum() 94 | 95 | return trainingPairs, xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost 96 | 97 | 98 | def discountModel(alpha,length): 99 | return tensor.exp(alpha*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | from theano.ifelse import ifelse 12 | import directedGraphLSTMModel 13 | 14 | def proxEmbedBySubgraphModel(options, tparams): 15 | """ 16 | """ 17 | trainingPairs=tensor.tensor3('trainingPairs',dtype='int64') # 3D tensor,shape=#(triples)*4*2 18 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 19 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples*maxlen*maxlen 20 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 21 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 22 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 23 | 24 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 25 | nodesLens=tensor.matrix('nodesLens', dtype='int64') # shape=nsamples*maxlen 26 | 27 | def _processTuple(index , lossSum): 28 | tuple=trainingPairs[index] 29 | 30 | def _processSubgraph(i): 31 | length=lengths[i] 32 | x=xs[i,:length] 33 | mask=masks[i,:length,:length] 34 | nodesLen=nodesLens[i,:length] 35 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 36 | return emb 37 | 38 | def iftFunc(): 39 | embx=tensor.zeros(options['dimension'],).astype(theano.config.floatX) # @UndefinedVariable 40 | return embx 41 | 42 | def iffFunc(start, end): 43 | embx=None 44 | rval,update=theano.scan( 45 | _processSubgraph, 46 | sequences=tensor.arange(start,end), 47 | ) 48 | subgraph_len=subgraph_lens[start:end] 49 | 50 | rval=discountModel(options['discount_alpha'], subgraph_len)[:,None]*rval 51 | embx=rval.max(axis=0) 52 | 53 | return embx 54 | 55 | start=tuple[0][0] 56 | end=tuple[1][1] 57 | emb1=None 58 | emb1=iffFunc(start,end) 59 | 60 | start=tuple[2][0] 61 | end=tuple[3][1] 
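# Note: each training tuple appears to pack four (start, end) index ranges into the flattened
# DAG batch xs. emb1 above is the distance-discounted, max-pooled embedding over the DAGs of
# the positive pair (ranges tuple[0]..tuple[1]), and emb2 below is the same quantity for the
# negative pair (ranges tuple[2]..tuple[3]); the loss that follows pushes dot(emb1, w) above
# dot(emb2, w).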
62 | emb2=None 63 | emb2=iffFunc(start,end) 64 | 65 | loss=0 66 | param=options['objective_function_param'] 67 | if options['objective_function_method']=='sigmoid': 68 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # sigmoid 69 | else: # hinge-loss 70 | value=param + tensor.dot(emb2,tparams['w']) - tensor.dot(emb1,tparams['w']) 71 | loss=value*(value>0) 72 | 73 | return tensor.cast(loss+lossSum, theano.config.floatX) # @UndefinedVariable 74 | 75 | rval, update=theano.scan( 76 | _processTuple, 77 | sequences=tensor.arange(trainingPairs.shape[0]), 78 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable 79 | ) 80 | 81 | cost=rval[-1] 82 | cost+=options['decay']*(tparams['Wi'] ** 2).sum() 83 | cost+=options['decay']*(tparams['Wf'] ** 2).sum() 84 | cost+=options['decay']*(tparams['Wo'] ** 2).sum() 85 | cost+=options['decay']*(tparams['Wc'] ** 2).sum() 86 | cost+=options['decay']*(tparams['Ui'] ** 2).sum() 87 | cost+=options['decay']*(tparams['Uf'] ** 2).sum() 88 | cost+=options['decay']*(tparams['Uo'] ** 2).sum() 89 | cost+=options['decay']*(tparams['Uc'] ** 2).sum() 90 | cost+=options['decay']*(tparams['bi'] ** 2).sum() 91 | cost+=options['decay']*(tparams['bf'] ** 2).sum() 92 | cost+=options['decay']*(tparams['bo'] ** 2).sum() 93 | cost+=options['decay']*(tparams['bc'] ** 2).sum() 94 | 95 | return trainingPairs, xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost 96 | 97 | 98 | def discountModel(alpha,length): 99 | """ 100 | discount 101 | """ 102 | return tensor.exp(alpha*length*(-1)) 103 | -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphProcessAndAssess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from collections import OrderedDict 11 | import proxEmbedBySubgraphProcessModelBatch 12 | import dataProcessTools 13 | import toolsFunction 14 | import evaluateTools 15 | 16 | def load_params(path, params): 17 | """ 18 | load all params from file 19 | """ 20 | pp = numpy.load(path) 21 | for kk, vv in params.items(): 22 | if kk not in pp: 23 | raise Warning('%s is not in the archive' % kk) 24 | params[kk] = pp[kk] 25 | 26 | return params 27 | 28 | 29 | def get_proxEmbedBySubgraphModel( 30 | 31 | model_params_path='', 32 | word_dimension=0, 33 | dimension=0, 34 | discount_alpha=0.3, 35 | discount_beta=0.3, 36 | h_output_method='max-pooling', 37 | ): 38 | """ 39 | """ 40 | model_options = locals().copy() 41 | 42 | tparams = OrderedDict() 43 | tparams['Wi']=None 44 | tparams['Wf']=None 45 | tparams['Wo']=None 46 | tparams['Wc']=None 47 | tparams['Ui']=None 48 | tparams['Uf']=None 49 | tparams['Uo']=None 50 | tparams['Uc']=None 51 | tparams['bi']=None 52 | tparams['bf']=None 53 | tparams['bo']=None 54 | tparams['bc']=None 55 | tparams['w']=None 56 | tparams=load_params(model_params_path, tparams) 57 | 58 | sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score=proxEmbedBySubgraphProcessModelBatch.proxEmbedBySubgraphProcessModel(model_options, tparams) 59 | func=theano.function([sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], score, on_unused_input='ignore') 60 | return func 61 | 62 | 63 | def compute_proxEmbedBySubgraph( 64 | wordsEmbeddings=None, 65 | 
wordsEmbeddings_path=None, 66 | word_dimension=0, 67 | dimension=0, 68 | wordsSize=0, 69 | subpaths_map=None, 70 | subpaths_file=None, 71 | subgraphs_file='', 72 | maxlen_subpaths=1000, 73 | maxlen=100, # Sequence longer then this get ignored 74 | 75 | test_data_file='', 76 | top_num=10, 77 | ideal_data_file='', 78 | func=None, 79 | ): 80 | model_options = locals().copy() 81 | 82 | if wordsEmbeddings is None: 83 | if wordsEmbeddings_path is not None: 84 | wordsEmbeddings,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 85 | else: 86 | exit(0) 87 | 88 | subgraphs_map=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphs_file) 89 | 90 | line_count=0 91 | test_map={} 92 | print 'Compute MAP and nDCG for file ',test_data_file 93 | with open(test_data_file) as f: 94 | for l in f: 95 | arr=l.strip().split() 96 | query=int(arr[0]) 97 | map={} 98 | for i in range(1,len(arr)): 99 | candidate=int(arr[i]) 100 | sequences_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data,nodesLens_data=dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengths(query, candidate, subgraphs_map, dimension) 101 | if sequences_data is None and mask_data is None and lens_data is None: 102 | map[candidate]=-1000. 103 | else: 104 | value=func(sequences_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings, buffer_tensor_data, nodesLens_data) 105 | map[candidate]=value 106 | 107 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) 108 | test_map[line_count]=tops_in_line 109 | line_count+=1 110 | 111 | line_count=0 112 | ideal_map={} 113 | with open(ideal_data_file) as f: 114 | for l in f: 115 | arr=l.strip().split() 116 | arr=[int(x) for x in arr] 117 | ideal_map[line_count]=arr[1:] 118 | line_count+=1 119 | 120 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) 121 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) 122 | 123 | return MAP,MnDCG 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphProcessAndAssess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2017年2月3日 4 | @author: Liu Zemin 5 | Functions and Application : 6 | 7 | ''' 8 | 9 | import numpy 10 | import theano 11 | from collections import OrderedDict 12 | import proxEmbedBySubgraphProcessModelBatch 13 | import dataProcessTools 14 | import toolsFunction 15 | import evaluateTools 16 | 17 | 18 | 19 | def load_params(path, params): 20 | """ 21 | load parameters from file 22 | """ 23 | pp = numpy.load(path) 24 | for kk, vv in params.items(): 25 | if kk not in pp: 26 | raise Warning('%s is not in the archive' % kk) 27 | params[kk] = pp[kk] 28 | 29 | return params 30 | 31 | 32 | def get_proxEmbedBySubgraphModel( 33 | 34 | model_params_path='', 35 | word_dimension=0, 36 | dimension=0, 37 | discount_alpha=0.3, 38 | discount_beta=0.3, 39 | h_output_method='max-pooling', 40 | ): 41 | """ 42 | the processing model 43 | """ 44 | model_options = locals().copy() 45 | 46 | tparams = OrderedDict() 47 | tparams['Wi']=None 48 | tparams['Wf']=None 49 | tparams['Wo']=None 50 | tparams['Wc']=None 51 | tparams['Ui']=None 52 | tparams['Uf']=None 53 | tparams['Uo']=None 54 | tparams['Uc']=None 55 | tparams['bi']=None 56 | tparams['bf']=None 57 | tparams['bo']=None 58 | tparams['bc']=None 59 | tparams['w']=None 60 | tparams=load_params(model_params_path, tparams) 61 | 62 | sequences, masks, 
lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score=proxEmbedBySubgraphProcessModelBatch.proxEmbedBySubgraphProcessModel(model_options, tparams) 63 | func=theano.function([sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], score, on_unused_input='ignore') 64 | return func 65 | 66 | 67 | def compute_proxEmbedBySubgraph( 68 | wordsEmbeddings=None, 69 | wordsEmbeddings_path=None, 70 | word_dimension=0, 71 | dimension=0, 72 | wordsSize=0, 73 | subpaths_map=None, 74 | subpaths_file=None, 75 | subgraphs_file='', 76 | maxlen_subpaths=1000, 77 | maxlen=100, # Sequence longer then this get ignored 78 | 79 | test_data_file='', 80 | top_num=10, 81 | ideal_data_file='', 82 | func=None, 83 | ): 84 | model_options = locals().copy() 85 | 86 | if wordsEmbeddings is None: 87 | if wordsEmbeddings_path is not None: 88 | wordsEmbeddings,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 89 | else: 90 | exit(0) 91 | 92 | subgraphs_map=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphs_file) 93 | 94 | line_count=0 95 | test_map={} 96 | print 'Compute MAP and nDCG for file ',test_data_file 97 | with open(test_data_file) as f: 98 | for l in f: 99 | arr=l.strip().split() 100 | query=int(arr[0]) 101 | map={} 102 | for i in range(1,len(arr)): 103 | candidate=int(arr[i]) 104 | sequences_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data,nodesLens_data=dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(query, candidate, subgraphs_map, dimension) 105 | if sequences_data is None and mask_data is None and lens_data is None: 106 | map[candidate]=-1000. 107 | else: 108 | value=func(sequences_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings, buffer_tensor_data, nodesLens_data) 109 | map[candidate]=value 110 | 111 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) 112 | test_map[line_count]=tops_in_line 113 | line_count+=1 114 | 115 | line_count=0 116 | ideal_map={} 117 | with open(ideal_data_file) as f: 118 | for l in f: 119 | arr=l.strip().split() 120 | arr=[int(x) for x in arr] 121 | ideal_map[line_count]=arr[1:] 122 | line_count+=1 123 | 124 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) 125 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) 126 | 127 | return MAP,MnDCG 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/prepareSubgraphsWithAllSubpaths.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | DAG generation. 
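In outline: all (q, v) tuples are read from the train/test split files, the sampled subpaths are loaded,
and for every tuple the subpaths are repeatedly shuffled and merged edge by edge into a DAG (duplicate
edges are skipped); each DAG is written to subgraphSaveFile as one line of the form
q-v#edge list#ordered node sequence#node-level list.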
6 | ''' 7 | 8 | import numpy 9 | import random 10 | import dataProcessTools 11 | import ConfigParser 12 | import string, os, sys 13 | import time 14 | import math 15 | 16 | SEED = 123 17 | random.seed(SEED) 18 | 19 | cf = ConfigParser.SafeConfigParser() 20 | cf.read("pythonParamsConfig") 21 | 22 | rootdir=cf.get("param", "root_dir") 23 | datasetName=cf.get("param", "dataset_name") 24 | relationName=cf.get("param", "class_name") 25 | # sampleTimes=cf.getint("param", "sampleTimes") 26 | subgraphNum=cf.getint("param", "subgraphNum") 27 | DAGSaveFile=cf.get("param", "subgraphSaveFile") 28 | subpaths_file=cf.get("param", "subpaths_file") 29 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") 30 | proportion=cf.getfloat("param", "proportion") 31 | upperLimit=cf.getint("param", "upperLimit") 32 | 33 | 34 | def getAlltuplesForSingleDirection(rootdir, datasetName, relationName): 35 | """ 36 | get all tuples for asymmetric 37 | """ 38 | folder=rootdir+'/'+datasetName+'.splits/' 39 | tuples=set() 40 | folder_train10=folder+'train.10/' 41 | for i in range(1,11): 42 | path=folder_train10+'train_'+relationName+'_'+bytes(i) 43 | with open(path) as f: 44 | for l in f: 45 | tmp=l.strip().split() 46 | if len(tmp)<=0: 47 | continue 48 | tuples.add(tmp[0]+'-'+tmp[1]) 49 | tuples.add(tmp[0]+'-'+tmp[2]) 50 | f.close() 51 | f=None 52 | # training data 100 53 | folder_train100=folder+'train.100/' 54 | for i in range(1,11): 55 | path=folder_train100+'train_'+relationName+'_'+bytes(i) 56 | with open(path) as f: 57 | for l in f: 58 | tmp=l.strip().split() 59 | if len(tmp)<=0: 60 | continue 61 | tuples.add(tmp[0]+'-'+tmp[1]) 62 | tuples.add(tmp[0]+'-'+tmp[2]) 63 | f.close() 64 | f=None 65 | # training data 1000 66 | folder_train1000=folder+'train.1000/' 67 | for i in range(1,11): 68 | path=folder_train1000+'train_'+relationName+'_'+bytes(i) 69 | with open(path) as f: 70 | for l in f: 71 | tmp=l.strip().split() 72 | if len(tmp)<=0: 73 | continue 74 | tuples.add(tmp[0]+'-'+tmp[1]) 75 | tuples.add(tmp[0]+'-'+tmp[2]) 76 | f.close() 77 | f=None 78 | # test data 79 | folder_test=folder+'test/' 80 | for i in range(1,11): 81 | path=folder_test+'test_'+relationName+'_'+bytes(i) 82 | with open(path) as f: 83 | for l in f: 84 | tmp=l.strip().split() 85 | if len(tmp)<=0: 86 | continue 87 | for j in range(1,len(tmp)): 88 | tuples.add(tmp[0]+'-'+tmp[j]) 89 | f.close() 90 | f=None 91 | return tuples 92 | 93 | 94 | def generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit): 95 | """ 96 | generate DAGs by subpaths 97 | """ 98 | output = open(DAGSaveFile, 'w') 99 | for tuple in tuples: 100 | arr=tuple.strip().split('-') 101 | start=int(arr[0]) 102 | end=int(arr[1]) 103 | if tuple not in subpathsMap: 104 | continue 105 | subpaths=subpathsMap[tuple] 106 | indexes=range(len(subpaths)) 107 | number=0 108 | if subgraphNum>0: 109 | number=subgraphNum 110 | else: 111 | number=int(math.ceil(len(subpaths)*proportion)) 112 | if upperLimit>0: 113 | number=min(number, upperLimit) 114 | for i in range(number): 115 | map={} 116 | mapCheck={} 117 | random.shuffle(indexes) 118 | for j in indexes: 119 | subpath=subpaths[j] 120 | for x in range(len(subpath)-1): 121 | if subpath[x] in map: 122 | if subpath[x+1] not in mapCheck[subpath[x]]: 123 | map[subpath[x]].append(subpath[x+1]) 124 | mapCheck[subpath[x]].add(subpath[x+1]) 125 | else: 126 | map[subpath[x]]=[subpath[x+1]] 127 | mapCheck[subpath[x]]=set([subpath[x+1]]) 128 | dependency, sequence, 
nodesLevel=dataProcessTools.subgraphToOrderedSequence(map, start, end) 129 | str=bytes(start)+'-'+bytes(end)+'#' 130 | for depend in dependency: 131 | str+=bytes(depend[0])+'-'+bytes(depend[1])+'\t' 132 | str+='#' 133 | for id in sequence: 134 | str+=bytes(id)+'\t' 135 | str+='#' 136 | for id in sequence: 137 | str+=bytes(id)+'-'+bytes(nodesLevel[id])+'\t' 138 | str+='\n' 139 | output.write(str) 140 | output.flush() 141 | output.close() 142 | output=None 143 | 144 | if __name__=='__main__': 145 | print 'Read all tuples from files..........' 146 | start_time = time.time() 147 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 148 | tuples=getAlltuplesForSingleDirection(rootdir, datasetName, relationName) # asymmetric 149 | 150 | print '-------------------------------------------------------------------------------' 151 | print 'Read all subpaths from files..........' 152 | start_time = time.time() 153 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 154 | subpathsMap=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 155 | 156 | print '-------------------------------------------------------------------------------' 157 | print 'Generate subgraphs and save them to file..........' 158 | start_time = time.time() 159 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 160 | 161 | generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit) 162 | 163 | print '-------------------------------------------------------------------------------' 164 | print 'Finished!!!' 165 | start_time = time.time() 166 | print 'End time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 167 | 168 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | 10 | import ConfigParser 11 | import string, os, sys 12 | import proxEmbedBySubgraphs 13 | import proxEmbedBySubgraphProcessAndAssess 14 | import time 15 | import subprocess 16 | 17 | if __name__=='__main__': 18 | 19 | cf = ConfigParser.SafeConfigParser() 20 | cf.read("pythonParamsConfig") 21 | 22 | main_dir=cf.get("param", "root_dir") # main work dir 23 | dataset_name=cf.get("param", "dataset_name") # dataset name, such as facebook 24 | suffix=cf.get("param", "suffix") # number of labels for each dataset, such as 10,100,1000 25 | class_name=cf.get("param", "class_name") # relatin name, such as classmate,family 26 | index=cf.get("param", "index") # the index of the dataset file 27 | 28 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 
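# For example, with the defaults in pythonParamsConfig (root_dir=D:/dataset/dataset, dataset_name=facebook,
# suffix=100, class_name=classmate, index=1), trainingDataFile resolves to roughly
# D:/dataset/dataset/facebook.splits/train.100/train_classmate_1 (the exact separators depend on the OS).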
29 | 30 | wordsEmbeddings=None # the file path of words embeddings 31 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # dimension of words embeddings 32 | subpaths_map=None # the map to save all the subpaths 33 | subpaths_file=cf.get("param", "subpaths_file") # the file to save all the subpaths 34 | subgraphSaveFile=cf.get("param", "subgraphSaveFile") # the file to save all the DAGs 35 | 36 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 37 | wordsSize=cf.getint("param", "wordsSize") # the max size of words vocabulary 38 | maxlen=cf.getint("param", "maxlen") # Sequence longer then this get ignored 39 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 40 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 41 | 42 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 43 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 44 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 45 | 46 | lrate=cf.getfloat("param", "lrate") # learning rate 47 | word_dimension=cf.getint("param", "word_dimension") # dimension of words embeddings 48 | dimension=cf.getint("param", "dimension") # the dimension of paths embeddings 49 | discount_alpha=cf.getfloat("param", "discount_alpha") # parameter alpha 50 | discount_beta=cf.getfloat("param", "discount_beta") # parameter beta 51 | h_output_method=cf.get("param", "h_output_method") # the way of output for each DAG, we use the hidden state of the end node in a DAG as its output 52 | objective_function_method=cf.get("param", "objective_function_method") # the objective function, here we use sigmoid 53 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter mu for sigmoid 54 | max_epochs=cf.getint("param", "max_epochs") # the max epoches for training 55 | 56 | decay=cf.getfloat("param", "decay") # the decay parameter lambda 57 | 58 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # the file of test data 59 | top_num=cf.getint("param", "top_num") # the top num to predict 60 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # the file of ground truth 61 | 62 | # training 63 | proxEmbedBySubgraphs.proxEmbedBySubgraphs( 64 | trainingDataFile, 65 | wordsEmbeddings, 66 | wordsEmbeddings_path, 67 | subpaths_map, 68 | subpaths_file, 69 | subgraphSaveFile, 70 | maxlen_subpaths, 71 | wordsSize, 72 | maxlen, 73 | batch_size, 74 | is_shuffle_for_batch, 75 | dispFreq, 76 | saveFreq, 77 | saveto, 78 | lrate, 79 | word_dimension, 80 | dimension, 81 | discount_alpha, 82 | discount_beta, 83 | h_output_method, 84 | objective_function_method, 85 | objective_function_param, 86 | max_epochs, 87 | decay) 88 | 89 | time.sleep(5) 90 | 91 | print '------------------------------------------------------------------------------' 92 | print 'Start to generate process model..........' 
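# What follows: get_proxEmbedBySubgraphModel reloads the trained parameters from the saved .npz file (saveto)
# and compiles a Theano scoring function; compute_proxEmbedBySubgraph then scores every (query, candidate)
# pair in test_data_file via the discounted max over its DAG embeddings, ranks the candidates, and compares
# the top_num results against ideal_data_file to produce MAP and nDCG.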
93 | start_time = time.time() 94 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 95 | func=proxEmbedBySubgraphProcessAndAssess.get_proxEmbedBySubgraphModel( 96 | saveto, 97 | word_dimension, 98 | dimension, 99 | discount_alpha, 100 | discount_beta, 101 | h_output_method) 102 | 103 | print 'Start to process and evaluate the model..........' 104 | start_time = time.time() 105 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 106 | MAP, NDCG=proxEmbedBySubgraphProcessAndAssess.compute_proxEmbedBySubgraph( 107 | wordsEmbeddings, 108 | wordsEmbeddings_path, 109 | word_dimension, 110 | dimension, 111 | wordsSize, 112 | subpaths_map, 113 | subpaths_file, 114 | subgraphSaveFile, 115 | maxlen_subpaths, 116 | maxlen, 117 | test_data_file, 118 | top_num, 119 | ideal_data_file, 120 | func) 121 | print '------------------------------------------------------------------------------' 122 | start_time = time.time() 123 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 124 | print 'MAP =', MAP 125 | print 'NDCG =', NDCG -------------------------------------------------------------------------------- /D2AGE/symmetric/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | ''' 6 | 7 | 8 | import ConfigParser 9 | import string, os, sys 10 | import proxEmbedBySubgraphs 11 | import proxEmbedBySubgraphProcessAndAssess 12 | import time 13 | 14 | if __name__=='__main__': 15 | 16 | cf = ConfigParser.SafeConfigParser() 17 | cf.read("pythonParamsConfig") 18 | 19 | main_dir=cf.get("param", "root_dir") # main work dir 20 | dataset_name=cf.get("param", "dataset_name") # dataset name, such as facebook 21 | suffix=cf.get("param", "suffix") # number of labels for each dataset, such as 10,100,1000 22 | class_name=cf.get("param", "class_name") # relatin name, such as classmate,family 23 | index=cf.get("param", "index") # the index of the dataset file 24 | 25 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 26 | 27 | wordsEmbeddings=None # the file path of words embeddings 28 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # dimension of words embeddings 29 | subpaths_map=None # the map to save all the subpaths 30 | subpaths_file=cf.get("param", "subpaths_file") # the file to save all the subpaths 31 | subgraphSaveFile=cf.get("param", "subgraphSaveFile") # the file to save all the DAGs 32 | 33 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 34 | wordsSize=cf.getint("param", "wordsSize") # the max size of words vocabulary 35 | maxlen=cf.getint("param", "maxlen") # Sequence longer than this get ignored 36 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 37 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 38 | 39 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 40 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 41 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. 
It is generated by main_dir, dataset_name, suffix, class_name and index. 42 | 43 | lrate=cf.getfloat("param", "lrate") # learning rate 44 | word_dimension=cf.getint("param", "word_dimension") # dimension of words embeddings 45 | dimension=cf.getint("param", "dimension") # the dimension of paths embeddings 46 | discount_alpha=cf.getfloat("param", "discount_alpha") # parameter alpha 47 | discount_beta=cf.getfloat("param", "discount_beta") # parameter beta 48 | h_output_method=cf.get("param", "h_output_method") # the way of output for each DAG, we use the hidden state of the end node in a DAG as its output 49 | objective_function_method=cf.get("param", "objective_function_method") # the objective function, here we use sigmoid 50 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter mu for sigmoid 51 | max_epochs=cf.getint("param", "max_epochs") # the max epoches for training 52 | 53 | decay=cf.getfloat("param", "decay") # the decay parameter lambda 54 | 55 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # the file of test data 56 | top_num=cf.getint("param", "top_num") # the top num to predict 57 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # the file of ground truth 58 | 59 | # training 60 | proxEmbedBySubgraphs.proxEmbedBySubgraphs( 61 | trainingDataFile, 62 | wordsEmbeddings, 63 | wordsEmbeddings_path, 64 | subpaths_map, 65 | subpaths_file, 66 | subgraphSaveFile, 67 | maxlen_subpaths, 68 | wordsSize, 69 | maxlen, 70 | batch_size, 71 | is_shuffle_for_batch, 72 | dispFreq, 73 | saveFreq, 74 | saveto, 75 | lrate, 76 | word_dimension, 77 | dimension, 78 | discount_alpha, 79 | discount_beta, 80 | h_output_method, 81 | objective_function_method, 82 | objective_function_param, 83 | max_epochs, 84 | decay) 85 | 86 | time.sleep(5) # sleep 87 | 88 | 89 | print '------------------------------------------------------------------------------' 90 | print 'Start to generate process model..........' 91 | start_time = time.time() 92 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 93 | # get the model for process 94 | func=proxEmbedBySubgraphProcessAndAssess.get_proxEmbedBySubgraphModel( 95 | saveto, 96 | word_dimension, 97 | dimension, 98 | discount_alpha, 99 | discount_beta, 100 | h_output_method) 101 | 102 | print 'Start to process and evaluate the model..........' 
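# The metrics come from evaluateTools: AP@top_num accumulates precision at every hit and divides by
# min(len(ideal), top_num), and MAP averages this over queries; nDCG@top_num scores a hit at rank i with
# 1/log2(i+1) (1.0 at rank 0), normalizes by the ideal DCG, and is likewise averaged over queries.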
103 | start_time = time.time() 104 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 105 | # calculate the results 106 | MAP, NDCG=proxEmbedBySubgraphProcessAndAssess.compute_proxEmbedBySubgraph( 107 | wordsEmbeddings, 108 | wordsEmbeddings_path, 109 | word_dimension, 110 | dimension, 111 | wordsSize, 112 | subpaths_map, 113 | subpaths_file, 114 | subgraphSaveFile, 115 | maxlen_subpaths, 116 | maxlen, 117 | test_data_file, 118 | top_num, 119 | ideal_data_file, 120 | func) 121 | print '------------------------------------------------------------------------------' 122 | start_time = time.time() 123 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 124 | print 'MAP =', MAP 125 | print 'NDCG =', NDCG -------------------------------------------------------------------------------- /D2AGE/symmetric/prepareSubgraphsWithAllSubpaths.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | generate DAGs with subpaths 6 | ''' 7 | 8 | import numpy 9 | import random 10 | import dataProcessTools 11 | import ConfigParser 12 | import string, os, sys 13 | import time 14 | import math 15 | 16 | SEED = 123 17 | random.seed(SEED) 18 | 19 | cf = ConfigParser.SafeConfigParser() 20 | cf.read("/usr/pythonParamsConfig") 21 | 22 | rootdir=cf.get("param", "root_dir") 23 | datasetName=cf.get("param", "dataset_name") 24 | relationName=cf.get("param", "class_name") 25 | subgraphNum=cf.getint("param", "subgraphNum") 26 | DAGSaveFile=cf.get("param", "subgraphSaveFile") 27 | subpaths_file=cf.get("param", "subpaths_file") 28 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") 29 | proportion=cf.getfloat("param", "proportion") 30 | upperLimit=cf.getint("param", "upperLimit") 31 | 32 | def getAlltuples(rootdir, datasetName, relationName): 33 | """ 34 | get all tuples from training data 35 | """ 36 | folder=rootdir+'/'+datasetName+'.splits/' 37 | tuples=set() 38 | folder_train10=folder+'train.10/' 39 | for i in range(1,11): 40 | path=folder_train10+'train_'+relationName+'_'+bytes(i) 41 | with open(path) as f: 42 | for l in f: 43 | tmp=l.strip().split() 44 | if len(tmp)<=0: 45 | continue 46 | tuples.add(tmp[0]+'-'+tmp[1]) 47 | tuples.add(tmp[1]+'-'+tmp[0]) 48 | tuples.add(tmp[0]+'-'+tmp[2]) 49 | tuples.add(tmp[2]+'-'+tmp[0]) 50 | f.close() 51 | f=None 52 | # training data 100 53 | folder_train100=folder+'train.100/' 54 | for i in range(1,11): 55 | path=folder_train100+'train_'+relationName+'_'+bytes(i) 56 | with open(path) as f: 57 | for l in f: 58 | tmp=l.strip().split() 59 | if len(tmp)<=0: 60 | continue 61 | tuples.add(tmp[0]+'-'+tmp[1]) 62 | tuples.add(tmp[1]+'-'+tmp[0]) 63 | tuples.add(tmp[0]+'-'+tmp[2]) 64 | tuples.add(tmp[2]+'-'+tmp[0]) 65 | f.close() 66 | f=None 67 | # training data 1000 68 | folder_train1000=folder+'train.1000/' 69 | for i in range(1,11): 70 | path=folder_train1000+'train_'+relationName+'_'+bytes(i) 71 | with open(path) as f: 72 | for l in f: 73 | tmp=l.strip().split() 74 | if len(tmp)<=0: 75 | continue 76 | tuples.add(tmp[0]+'-'+tmp[1]) 77 | tuples.add(tmp[1]+'-'+tmp[0]) 78 | tuples.add(tmp[0]+'-'+tmp[2]) 79 | tuples.add(tmp[2]+'-'+tmp[0]) 80 | f.close() 81 | f=None 82 | # test data 83 | folder_test=folder+'test/' 84 | for i in range(1,11): 85 | path=folder_test+'test_'+relationName+'_'+bytes(i) 86 | with open(path) as f: 87 | for l in f: 88 | tmp=l.strip().split() 89 | if len(tmp)<=0: 90 | continue 91 | for j in 
range(1,len(tmp)): 92 | tuples.add(tmp[0]+'-'+tmp[j]) 93 | tuples.add(tmp[j]+'-'+tmp[0]) 94 | f.close() 95 | f=None 96 | return tuples 97 | 98 | def getAlltuplesForSingleDirection(rootdir, datasetName, relationName): 99 | """ 100 | get all tuples for asymmetric relation 101 | """ 102 | folder=rootdir+'/'+datasetName+'.splits/' 103 | tuples=set() 104 | folder_train10=folder+'train.10/' 105 | for i in range(1,11): 106 | path=folder_train10+'train_'+relationName+'_'+bytes(i) 107 | with open(path) as f: 108 | for l in f: 109 | tmp=l.strip().split() 110 | if len(tmp)<=0: 111 | continue 112 | tuples.add(tmp[0]+'-'+tmp[1]) 113 | tuples.add(tmp[0]+'-'+tmp[2]) 114 | f.close() 115 | f=None 116 | # training data 100 117 | folder_train100=folder+'train.100/' 118 | for i in range(1,11): 119 | path=folder_train100+'train_'+relationName+'_'+bytes(i) 120 | with open(path) as f: 121 | for l in f: 122 | tmp=l.strip().split() 123 | if len(tmp)<=0: 124 | continue 125 | tuples.add(tmp[0]+'-'+tmp[1]) 126 | tuples.add(tmp[0]+'-'+tmp[2]) 127 | f.close() 128 | f=None 129 | # training data 1000 130 | folder_train1000=folder+'train.1000/' 131 | for i in range(1,11): 132 | path=folder_train1000+'train_'+relationName+'_'+bytes(i) 133 | with open(path) as f: 134 | for l in f: 135 | tmp=l.strip().split() 136 | if len(tmp)<=0: 137 | continue 138 | tuples.add(tmp[0]+'-'+tmp[1]) 139 | # tuples.add(tmp[1]+'-'+tmp[0]) 140 | tuples.add(tmp[0]+'-'+tmp[2]) 141 | # tuples.add(tmp[2]+'-'+tmp[0]) 142 | f.close() 143 | f=None 144 | # test data 145 | folder_test=folder+'test/' 146 | for i in range(1,11): 147 | path=folder_test+'test_'+relationName+'_'+bytes(i) 148 | with open(path) as f: 149 | for l in f: 150 | tmp=l.strip().split() 151 | if len(tmp)<=0: 152 | continue 153 | for j in range(1,len(tmp)): 154 | tuples.add(tmp[0]+'-'+tmp[j]) 155 | # tuples.add(tmp[j]+'-'+tmp[0]) 156 | f.close() 157 | f=None 158 | return tuples 159 | 160 | 161 | def generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit): 162 | """ 163 | generate DAGs by all subpaths, then save to file 164 | """ 165 | output = open(DAGSaveFile, 'w') 166 | for tuple in tuples: 167 | arr=tuple.strip().split('-') 168 | start=int(arr[0]) 169 | end=int(arr[1]) 170 | if tuple not in subpathsMap: 171 | continue 172 | subpaths=subpathsMap[tuple] 173 | indexes=range(len(subpaths)) 174 | number=0 175 | if subgraphNum>0: 176 | number=subgraphNum 177 | else: 178 | number=int(math.ceil(len(subpaths)*proportion)) 179 | if upperLimit>0: 180 | number=min(number, upperLimit) 181 | for i in range(number): 182 | map={} 183 | mapCheck={} 184 | random.shuffle(indexes) 185 | for j in indexes: 186 | subpath=subpaths[j] 187 | for x in range(len(subpath)-1): 188 | if subpath[x] in map: 189 | if subpath[x+1] not in mapCheck[subpath[x]]: 190 | map[subpath[x]].append(subpath[x+1]) 191 | mapCheck[subpath[x]].add(subpath[x+1]) 192 | else: 193 | map[subpath[x]]=[subpath[x+1]] 194 | mapCheck[subpath[x]]=set([subpath[x+1]]) 195 | dependency, sequence, nodesLevel=dataProcessTools.subgraphToOrderedSequence(map, start, end) 196 | str=bytes(start)+'-'+bytes(end)+'#' 197 | for depend in dependency: 198 | str+=bytes(depend[0])+'-'+bytes(depend[1])+'\t' 199 | str+='#' 200 | for id in sequence: 201 | str+=bytes(id)+'\t' 202 | str+='#' 203 | for id in sequence: 204 | str+=bytes(id)+'-'+bytes(nodesLevel[id])+'\t' 205 | str+='\n' 206 | output.write(str) 207 | output.flush() 208 | output.close() 209 | output=None 210 | 211 | 212 | if __name__=='__main__': 
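# Main entry point. The script runs in three stages: (1) getAlltuples collects every (q,v) pair
# that appears in the train/test splits, (2) dataProcessTools.loadAllSubPaths reads all sampled
# subpaths, keyed by node pair, and (3) generateSubgraphsByAllSubpathsDirectlyAndSave merges each
# pair's subpaths into DAGs and writes them to DAGSaveFile, one DAG per line in the form
#   <q>-<v>#<parent>-<child>\t...#<node>\t...#<node>-<level>\t...
# i.e. the node pair, the DAG edges, the ordered node sequence, and each node's distance level.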
213 | print 'Read all tuples from files..........' 214 | start_time = time.time() 215 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 216 | tuples=getAlltuples(rootdir, datasetName, relationName) # symmetric 217 | # tuples=getAlltuplesForSingleDirection(rootdir, datasetName, relationName) # asymmetric 218 | 219 | print '-------------------------------------------------------------------------------' 220 | print 'Read all subpaths from files..........' 221 | start_time = time.time() 222 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 223 | subpathsMap=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 224 | 225 | print '-------------------------------------------------------------------------------' 226 | print 'Generate subgraphs and save them to file..........' 227 | start_time = time.time() 228 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 229 | 230 | generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit) 231 | 232 | print '-------------------------------------------------------------------------------' 233 | print 'Finished!!!' 234 | start_time = time.time() 235 | print 'End time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 236 | 237 | -------------------------------------------------------------------------------- /D2AGE/symmetric/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | data processing tools 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | get training data from file 16 | ''' 17 | data=[] 18 | pairs=[] 19 | with open(trainingDataFile) as f: 20 | for l in f: 21 | tmp=l.strip().split() 22 | if len(tmp)<=0: 23 | continue 24 | arr=[] 25 | arr.append(tmp[0]+'-'+tmp[1]) 26 | arr.append(tmp[1]+'-'+tmp[0]) 27 | arr.append(tmp[0]+'-'+tmp[2]) 28 | arr.append(tmp[2]+'-'+tmp[0]) 29 | pairs.append(arr) 30 | tmp=[int(x) for x in tmp] 31 | data.append(tmp) 32 | 33 | return data,pairs 34 | 35 | def getWordsEmbeddings(wordsEmbeddings_path): 36 | """ 37 | get word embeddings 38 | """ 39 | size=0 40 | dimension=0 41 | wemb=[] 42 | with open(wordsEmbeddings_path) as f: 43 | for l in f: 44 | arr=l.strip().split() 45 | if len(arr)==2: 46 | size=int(arr[0]) 47 | dimension=int(arr[1]) 48 | wemb=numpy.zeros((size,dimension)).astype(theano.config.floatX) # @UndefinedVariable 49 | continue 50 | id=int(arr[0]) 51 | for i in range(0,dimension): 52 | # wemb[id][i]=theano.config.floatX(arr[i+1]) # @UndefinedVariable 53 | wemb[id][i]=float(arr[i+1]) 54 | return wemb,dimension,size 55 | 56 | def loadAllSubPaths(subpaths_file,maxlen=1000): 57 | """ 58 | load all subpaths 59 | """ 60 | map={} 61 | with open(subpaths_file) as f: 62 | for l in f: 63 | splitByTab=l.strip().split('\t') 64 | key=splitByTab[0]+'-'+splitByTab[1] 65 | sentence=[int(y) for y in splitByTab[2].split()[:]] 66 | if len(sentence)>maxlen: 67 | continue 68 | if key in map: 69 | map[key].append(sentence) 70 | else: 71 | tmp=[] 72 | tmp.append(sentence) 73 | map[key]=tmp 74 | return map 75 | 76 | def prepareDataForTestForSubgraphSingleSequenceWithLengths(query,candidate,subgraphs_map,dimension): 77 | """ 78 | prepare data for test 79 | """ 80 | key1=bytes(query)+'-'+bytes(candidate) 81 | 
key2=bytes(candidate)+'-'+bytes(query) 82 | if key1 not in subgraphs_map and key2 not in subgraphs_map: 83 | return None,None,None 84 | subgraphs=[] 85 | if key1 in subgraphs_map: 86 | subgraphs.append(subgraphs_map[key1]) 87 | if key2 in subgraphs_map: 88 | subgraphs.append(subgraphs_map[key2]) 89 | maxlen=0 90 | nsamples=0 91 | for value in subgraphs: 92 | for sequence in value[1]: 93 | nsamples+=1 94 | if maxlen0: 182 | now=queue.pop(0) 183 | children=edges[now] 184 | for node in children: 185 | if node==end: 186 | results.append([now,node]) 187 | if endNodeLevel==-1: 188 | endNodeLevel=nodesLevel[now]+1 189 | elif nodesLevel[node]==-1: 190 | queue.append(node) 191 | nodesLevel[node]=nodesLevel[now]+1 192 | nodesSeq[node]=len(nodesSeq) 193 | results.append([now,node]) 194 | elif nodesSeq[node]>nodesSeq[now]: 195 | results.append([now,node]) 196 | nodesSeq[end]=len(nodesSeq) 197 | items=nodesSeq.items() 198 | backitems=[[v[1],v[0]] for v in items] 199 | backitems.sort() 200 | sequence=[ backitems[i][1] for i in range(len(items))] 201 | nodesLevel[end]=endNodeLevel 202 | return results, sequence, nodesLevel 203 | 204 | 205 | def readAllSubgraphDependencyAndSequencesWithLengths(filepath): 206 | """ 207 | read all DAGs 208 | """ 209 | map={} 210 | with open(filepath) as f: 211 | for l in f: 212 | tmp=l.strip().split('#') 213 | if len(tmp)<=0: 214 | continue 215 | depend=tmp[1].strip().split('\t') 216 | dependint=[] 217 | for edge in depend: 218 | arr=edge.strip().split('-') 219 | dependint.append([int(arr[0]),int(arr[1])]) 220 | sequence=tmp[2].strip().split('\t') 221 | sequenceint=[int(x) for x in sequence] 222 | lenArr=tmp[3].strip().split('\t') 223 | lengths={} 224 | for l in lenArr: 225 | lArr=l.strip().split('-') 226 | lengths[int(lArr[0])]=int(lArr[1]) 227 | if tmp[0] in map: 228 | value=map[tmp[0]] 229 | value[0].append(dependint) 230 | value[1].append(sequenceint) 231 | value[2].append(lengths) 232 | else: 233 | map[tmp[0]]=[[dependint],[sequenceint],[lengths]] 234 | return map 235 | 236 | def generateSequenceAndMasksForSingleSequenceWithLength(tuples, tupleFourPairs, subgraphs, dimension): 237 | """ 238 | generate sequence and masks 239 | """ 240 | maxlen=0 241 | graphNum=0 242 | for tuple in tupleFourPairs: 243 | for pair in tuple: 244 | if pair not in subgraphs: 245 | continue 246 | value=subgraphs[pair] 247 | sequences=value[1] 248 | graphNum+=len(sequences) 249 | for seq in sequences: 250 | if len(seq)>maxlen: 251 | maxlen=len(seq) 252 | tuples3DMatrix=numpy.zeros((len(tuples),4,2)).astype('int64') 253 | x=numpy.zeros((graphNum,maxlen)).astype('int64') 254 | mask=numpy.zeros((graphNum,maxlen,maxlen)).astype(theano.config.floatX) # @UndefinedVariable 255 | lens=numpy.zeros((graphNum,)).astype('int64') 256 | subgraph_lens=numpy.zeros((graphNum,)).astype('int64') 257 | nodesLens=numpy.zeros((graphNum,maxlen)).astype('int64') 258 | current_index=0 259 | for i in range(len(tuples)): 260 | tuple=tuples[i] 261 | fourPairs=tupleFourPairs[i] 262 | for j in range(len(fourPairs)): 263 | if fourPairs[j] not in subgraphs: 264 | tuples3DMatrix[i][j][0]=current_index 265 | tuples3DMatrix[i][j][1]=current_index 266 | continue 267 | value=subgraphs[fourPairs[j]] 268 | dependency=value[0] 269 | sequences=value[1] 270 | lengths=value[2] 271 | tuples3DMatrix[i][j][0]=current_index 272 | for index in range(len(sequences)): 273 | map={} 274 | seq=sequences[index] 275 | length=lengths[index] 276 | for s in range(len(seq)): 277 | x[current_index][s]=seq[s] 278 | 
nodesLens[current_index][s]=length[seq[s]] 279 | map[seq[s]]=s 280 | depend=dependency[index] 281 | for d in range(len(depend)): 282 | dep=depend[d] 283 | mask[current_index][map[dep[1]]][map[dep[0]]]=1. 284 | lens[current_index]=len(seq) 285 | subgraph_lens[current_index]=length[seq[-1]] 286 | current_index+=1 287 | tuples3DMatrix[i][j][1]=current_index 288 | 289 | for i in range(graphNum): 290 | for j in range(maxlen): 291 | if mask[i][j].sum()==0: 292 | mask[i][j][j]=1. 293 | buffer_tensor=numpy.zeros([maxlen, maxlen, dimension]).astype(theano.config.floatX) # @UndefinedVariable 294 | for i in range(maxlen): 295 | for j in range(dimension): 296 | buffer_tensor[i][i][j]=1. 297 | 298 | return tuples3DMatrix, x, mask, lens, subgraph_lens, buffer_tensor, nodesLens -------------------------------------------------------------------------------- /D2AGE/asymmetric/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | data process tools 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | get training data from file 16 | ''' 17 | data=[] 18 | pairs=[] 19 | with open(trainingDataFile) as f: 20 | for l in f: 21 | tmp=l.strip().split() 22 | if len(tmp)<=0: 23 | continue 24 | arr=[] 25 | arr.append(tmp[0]+'-'+tmp[1]) 26 | arr.append(tmp[1]+'-'+tmp[0]) 27 | arr.append(tmp[0]+'-'+tmp[2]) 28 | arr.append(tmp[2]+'-'+tmp[0]) 29 | pairs.append(arr) 30 | tmp=[int(x) for x in tmp] 31 | data.append(tmp) 32 | 33 | return data,pairs 34 | 35 | def getWordsEmbeddings(wordsEmbeddings_path): 36 | """ 37 | get word embeddings 38 | """ 39 | size=0 40 | dimension=0 41 | wemb=[] 42 | with open(wordsEmbeddings_path) as f: 43 | for l in f: 44 | arr=l.strip().split() 45 | if len(arr)==2: 46 | size=int(arr[0]) 47 | dimension=int(arr[1]) 48 | 49 | wemb=numpy.zeros((size,dimension)).astype(theano.config.floatX) # @UndefinedVariable 50 | continue 51 | id=int(arr[0]) 52 | for i in range(0,dimension): 53 | wemb[id][i]=float(arr[i+1]) 54 | return wemb,dimension,size 55 | 56 | def loadAllSubPaths(subpaths_file,maxlen=1000): 57 | """ 58 | load all subpaths from file 59 | """ 60 | map={} 61 | with open(subpaths_file) as f: 62 | for l in f: 63 | splitByTab=l.strip().split('\t') 64 | key=splitByTab[0]+'-'+splitByTab[1] 65 | sentence=[int(y) for y in splitByTab[2].split()[:]] 66 | if len(sentence)>maxlen: 67 | continue 68 | if key in map: 69 | map[key].append(sentence) 70 | else: 71 | tmp=[] 72 | tmp.append(sentence) 73 | map[key]=tmp 74 | return map 75 | 76 | 77 | def prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(query,candidate,subgraphs_map,dimension): 78 | """ 79 | prepare data for test 80 | """ 81 | key1=bytes(query)+'-'+bytes(candidate) 82 | if key1 not in subgraphs_map : 83 | return None,None,None,None,None,None 84 | subgraphs=[] 85 | if key1 in subgraphs_map: 86 | subgraphs.append(subgraphs_map[key1]) 87 | maxlen=0 88 | nsamples=0 89 | for value in subgraphs: 90 | for sequence in value[1]: 91 | nsamples+=1 92 | if maxlen0: 181 | now=queue.pop(0) 182 | children=edges[now] 183 | for node in children: 184 | if node==end: 185 | results.append([now,node]) 186 | if endNodeLevel==-1: 187 | endNodeLevel=nodesLevel[now]+1 188 | elif nodesLevel[node]==-1: 189 | queue.append(node) 190 | nodesLevel[node]=nodesLevel[now]+1 191 | nodesSeq[node]=len(nodesSeq) 192 | 
results.append([now,node]) 193 | elif nodesSeq[node]>nodesSeq[now]: 194 | results.append([now,node]) 195 | nodesSeq[end]=len(nodesSeq) 196 | items=nodesSeq.items() 197 | backitems=[[v[1],v[0]] for v in items] 198 | backitems.sort() 199 | sequence=[ backitems[i][1] for i in range(len(items))] 200 | nodesLevel[end]=endNodeLevel 201 | return results, sequence, nodesLevel 202 | 203 | 204 | def readAllSubgraphDependencyAndSequencesWithLengths(filepath): 205 | """ 206 | read all DAGs from file 207 | """ 208 | map={} 209 | with open(filepath) as f: 210 | for l in f: 211 | tmp=l.strip().split('#') 212 | if len(tmp)<=0: 213 | continue 214 | depend=tmp[1].strip().split('\t') 215 | dependint=[] 216 | for edge in depend: 217 | arr=edge.strip().split('-') 218 | dependint.append([int(arr[0]),int(arr[1])]) 219 | sequence=tmp[2].strip().split('\t') 220 | sequenceint=[int(x) for x in sequence] 221 | lenArr=tmp[3].strip().split('\t') 222 | lengths={} 223 | for l in lenArr: 224 | lArr=l.strip().split('-') 225 | lengths[int(lArr[0])]=int(lArr[1]) 226 | if tmp[0] in map: 227 | value=map[tmp[0]] 228 | value[0].append(dependint) 229 | value[1].append(sequenceint) 230 | value[2].append(lengths) 231 | else: 232 | map[tmp[0]]=[[dependint],[sequenceint],[lengths]] 233 | return map 234 | 235 | def generateSequenceAndMasksForSingleSequenceWithLengthAsymmetric(tuples, tupleFourPairs, subgraphs, dimension): 236 | """ 237 | generate data for training 238 | """ 239 | maxlen=0 240 | graphNum=0 241 | for tuple in tupleFourPairs: 242 | for pair in tuple: 243 | if pair not in subgraphs: 244 | continue 245 | value=subgraphs[pair] 246 | sequences=value[1] 247 | graphNum+=len(sequences) 248 | for seq in sequences: 249 | if len(seq)>maxlen: 250 | maxlen=len(seq) 251 | tuples3DMatrix=numpy.zeros((len(tuples),4,2)).astype('int64') 252 | x=numpy.zeros((graphNum,maxlen)).astype('int64') 253 | mask=numpy.zeros((graphNum,maxlen,maxlen)).astype(theano.config.floatX) # @UndefinedVariable 254 | lens=numpy.zeros((graphNum,)).astype('int64') # shape=graphNum*0 255 | subgraph_lens=numpy.zeros((graphNum,)).astype('int64') 256 | nodesLens=numpy.zeros((graphNum,maxlen)).astype('int64') 257 | current_index=0 258 | for i in range(len(tuples)): 259 | tuple=tuples[i] 260 | fourPairs=tupleFourPairs[i] 261 | for j in range(len(fourPairs)): 262 | if fourPairs[j] not in subgraphs: 263 | tuples3DMatrix[i][j][0]=current_index 264 | tuples3DMatrix[i][j][1]=current_index 265 | continue 266 | value=subgraphs[fourPairs[j]] 267 | dependency=value[0] 268 | sequences=value[1] 269 | lengths=value[2] 270 | tuples3DMatrix[i][j][0]=current_index 271 | for index in range(len(sequences)): 272 | map={} 273 | seq=sequences[index] 274 | length=lengths[index] 275 | for s in range(len(seq)): 276 | x[current_index][s]=seq[s] 277 | nodesLens[current_index][s]=length[seq[s]] 278 | map[seq[s]]=s 279 | depend=dependency[index] 280 | for d in range(len(depend)): 281 | dep=depend[d] 282 | mask[current_index][map[dep[1]]][map[dep[0]]]=1. 
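# For the DAG just encoded at row current_index: x holds the node ids in sequence order,
# nodesLens holds each node's distance level, and mask is a maxlen x maxlen matrix with
# mask[pos(child)][pos(parent)]=1 for every DAG edge (rows with no parent get a self-loop
# further below, so every node has at least one predecessor entry); these adjacency rows are
# what the directed-graph LSTM uses to aggregate predecessor states. The two lines below record
# the number of nodes and the level of the end node (its distance from the query), which is
# later used for the distance discount.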
283 | lens[current_index]=len(seq) 284 | subgraph_lens[current_index]=length[seq[-1]] 285 | current_index+=1 286 | tuples3DMatrix[i][j][1]=current_index 287 | 288 | count=0 289 | for i in range(len(tuples3DMatrix)): 290 | if tuples3DMatrix[i][0][0]!=tuples3DMatrix[i][0][1] and tuples3DMatrix[i][2][0]!=tuples3DMatrix[i][2][1]: 291 | count+=1 292 | tuples3DMatrix_new=numpy.zeros((count,4,2)).astype('int64') 293 | index=0 294 | for i in range(len(tuples3DMatrix)): 295 | if tuples3DMatrix[i][0][0]!=tuples3DMatrix[i][0][1] and tuples3DMatrix[i][2][0]!=tuples3DMatrix[i][2][1]: 296 | tuples3DMatrix_new[index]=tuples3DMatrix[i] 297 | index+=1 298 | tuples3DMatrix=tuples3DMatrix_new 299 | 300 | for i in range(graphNum): 301 | for j in range(maxlen): 302 | if mask[i][j].sum()==0: 303 | mask[i][j][j]=1. 304 | buffer_tensor=numpy.zeros([maxlen, maxlen, dimension]).astype(theano.config.floatX) # @UndefinedVariable # cast to floatX so it matches the declared dtype of the buffer_tensor input (as in the symmetric version) 305 | for i in range(maxlen): 306 | for j in range(dimension): 307 | buffer_tensor[i][i][j]=1. 308 | 309 | return tuples3DMatrix, x, mask, lens, subgraph_lens, buffer_tensor, nodesLens -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphs.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2017-02-01 4 | @author: Liu Zemin 5 | Functions and Application : 6 | 7 | ''' 8 | 9 | import numpy 10 | import theano 11 | from theano import tensor 12 | from theano import config 13 | from collections import OrderedDict 14 | import dataProcessTools 15 | import time 16 | import proxEmbedBySubgraphModel 17 | import gc 18 | import six.moves.cPickle as pickle # @UnresolvedImport 19 | 20 | SEED = 123 21 | numpy.random.seed(SEED) 22 | 23 | def numpy_floatX(data): 24 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 25 | 26 | def gradientDescentGroup(learning_rate,tparams,grads,trainingPairs, sequences, masks, lengths, wordsEmbeddings, cost): 27 | """ 28 | """ 29 | update=[(shared,shared-learning_rate*g) for g,shared in zip(grads,tparams.values())] 30 | func=theano.function([trainingPairs, sequences, masks, lengths, wordsEmbeddings],cost,updates=update,on_unused_input='ignore',mode='FAST_RUN') 31 | return func 32 | 33 | def adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost): 34 | """ 35 | An adaptive learning rate optimizer 36 | Parameters 37 | ---------- 38 | lr : Theano SharedVariable 39 | Initial learning rate 40 | tparams: Theano SharedVariable 41 | Model parameters 42 | grads: Theano variable 43 | Gradients of cost w.r.t. parameters 44 | x: Theano variable 45 | Model inputs 46 | mask: Theano variable 47 | Sequence mask 48 | y: Theano variable 49 | Targets 50 | cost: Theano variable 51 | Objective function to minimize 52 | 53 | Notes 54 | ----- 55 | For more information, see [ADADELTA]_. 56 | 57 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 58 | Rate Method*, arXiv:1212.5701.
59 | """ 60 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 61 | name='%s_grad' % k) 62 | for k, p in tparams.items()] 63 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 64 | name='%s_rup2' % k) 65 | for k, p in tparams.items()] 66 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 67 | name='%s_rgrad2' % k) 68 | for k, p in tparams.items()] 69 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 70 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 71 | for rg2, g in zip(running_grads2, grads)] 72 | f_grad_shared = theano.function([trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], cost, updates=zgup + rg2up, 73 | on_unused_input='ignore', 74 | name='adadelta_f_grad_shared') 75 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 76 | for zg, ru2, rg2 in zip(zipped_grads, 77 | running_up2, 78 | running_grads2)] 79 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 80 | for ru2, ud in zip(running_up2, updir)] 81 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 82 | f_update = theano.function([lr], [], updates=ru2up + param_up, 83 | on_unused_input='ignore', 84 | name='adadelta_f_update') 85 | 86 | return f_grad_shared, f_update 87 | 88 | 89 | def sgd(lr, tparams, grads, x, mask, y, cost): 90 | """ Stochastic Gradient Descent 91 | 92 | :note: A more complicated version of sgd then needed. This is 93 | done like that for adadelta and rmsprop. 94 | 95 | """ 96 | # New set of shared variable that will contain the gradient 97 | # for a mini-batch. 98 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 99 | for k, p in tparams.items()] 100 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 101 | 102 | # Function that computes gradients for a mini-batch, but do not 103 | # updates the weights. 104 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 105 | name='sgd_f_grad_shared') 106 | 107 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 108 | 109 | # Function that updates the weights from the previously computed 110 | # gradient. 111 | f_update = theano.function([lr], [], updates=pup, 112 | name='sgd_f_update') 113 | 114 | return f_grad_shared, f_update 115 | 116 | def ortho_weight(ndim): 117 | """ 118 | """ 119 | W = numpy.random.randn(ndim, ndim) 120 | u, s, v = numpy.linalg.svd(W) 121 | return u.astype(theano.config.floatX) # @UndefinedVariable 122 | 123 | def init_params_weight(row,column): 124 | """ 125 | """ 126 | W = numpy.random.rand(row, column) 127 | W = W*2.0-1.0 128 | return W.astype(theano.config.floatX) # @UndefinedVariable 129 | 130 | def init_sharedVariables(options): 131 | """ 132 | """ 133 | print 'init shared Variables......' 
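# The shared variables created below are the model parameters: Wi/Wf/Wo/Wc (dimension x
# word_dimension) project a node's input embedding, Ui/Uf/Uo/Uc (dimension x dimension,
# orthogonal init) are the recurrent weights, bi/bf/bo/bc are the gate biases, and w is the
# vector that maps a DAG embedding to a scalar proximity score. By their names these are the
# input/forget/output/cell gates of the DAG-LSTM defined in directedGraphLSTMModel.py.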
134 | params = OrderedDict() 135 | Wi=init_params_weight(options['dimension'],options['word_dimension']) 136 | Wf=init_params_weight(options['dimension'],options['word_dimension']) 137 | Wo=init_params_weight(options['dimension'],options['word_dimension']) 138 | Wc=init_params_weight(options['dimension'],options['word_dimension']) 139 | 140 | Ui=ortho_weight(options['dimension']) 141 | Uf=ortho_weight(options['dimension']) 142 | Uo=ortho_weight(options['dimension']) 143 | Uc=ortho_weight(options['dimension']) 144 | 145 | bi=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 146 | bf=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 147 | bo=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 148 | bc=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 149 | 150 | w = numpy.random.rand(options['dimension'], ).astype(config.floatX) # @UndefinedVariable 151 | 152 | params['Wi']=Wi 153 | params['Wf']=Wf 154 | params['Wo']=Wo 155 | params['Wc']=Wc 156 | params['Ui']=Ui 157 | params['Uf']=Uf 158 | params['Uo']=Uo 159 | params['Uc']=Uc 160 | params['bi']=bi 161 | params['bf']=bf 162 | params['bo']=bo 163 | params['bc']=bc 164 | 165 | params['w']=w 166 | 167 | return params 168 | 169 | 170 | def init_tparams(params): 171 | tparams = OrderedDict() 172 | for kk, pp in params.items(): 173 | tparams[kk] = theano.shared(params[kk], name=kk) 174 | return tparams 175 | 176 | def unzip(zipped): 177 | """ 178 | """ 179 | new_params = OrderedDict() 180 | for kk, vv in zipped.items(): 181 | new_params[kk] = vv.get_value() 182 | return new_params 183 | 184 | main_dir='D:/dataset/test/icde2016_metagraph/' 185 | def proxEmbedBySubgraphs( 186 | trainingDataFile=main_dir+'train_classmate', 187 | wordsEmbeddings_data=None, 188 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', 189 | subpaths_map=None, 190 | subpaths_file=main_dir+'facebook/subpathsSaveFile', 191 | subgraphSaveFile='', 192 | maxlen_subpaths=1000, 193 | wordsSize=1000000, 194 | 195 | maxlen=100, 196 | batch_size=1, 197 | is_shuffle_for_batch=False, 198 | dispFreq=5, 199 | saveFreq=5, 200 | saveto=main_dir+'facebook/path2vec-modelParams.npz', 201 | 202 | lrate=0.0001, 203 | word_dimension=22, 204 | dimension=64, 205 | discount_alpha=0.3, 206 | discount_beta=0.3, 207 | h_output_method='max-pooling', 208 | objective_function_method='hinge-loss', 209 | objective_function_param=0, 210 | max_epochs=10, 211 | 212 | decay=0.01, 213 | ): 214 | model_options = locals().copy() 215 | 216 | if wordsEmbeddings_data is None: 217 | if wordsEmbeddings_path is not None: 218 | wordsEmbeddings_data,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 219 | else: 220 | exit(0) 221 | trainingData,trainingPairs_data=dataProcessTools.getTrainingData(trainingDataFile) 222 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 223 | 224 | subgraphs=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphSaveFile) 225 | 226 | params=init_sharedVariables(model_options) 227 | tparams=init_tparams(params) 228 | print 'Generate models ......' 229 | 230 | trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost=proxEmbedBySubgraphModel.proxEmbedBySubgraphModel(model_options, tparams) 231 | 232 | print 'Generate gradients ......' 
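# Differentiate the cost w.r.t. all shared parameters and compile the two Adadelta functions:
# f_grad_shared runs one mini-batch forward/backward and accumulates the gradient statistics,
# f_update(lrate) applies the resulting parameter update.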
233 | grads=tensor.grad(cost,wrt=list(tparams.values())) 234 | print 'Using Adadelta to generate functions ......' 235 | this_time = time.time() 236 | print 'Start to compile and optimize, time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 237 | lr = tensor.scalar(name='lr') 238 | f_grad_shared, f_update=adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost) 239 | 240 | print 'Start training models ......' 241 | best_p = None 242 | history_cost=[] 243 | 244 | start_time = time.time() 245 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 246 | uidx=0 247 | for eidx in range(max_epochs): 248 | for _, batch in allBatches: 249 | uidx += 1 250 | trainingDataForBatch=[trainingData[i] for i in batch] 251 | trainingPairsForBatch=[trainingPairs_data[i] for i in batch] 252 | tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data, nodesLens_data=dataProcessTools.generateSequenceAndMasksForSingleSequenceWithLength(trainingDataForBatch, trainingPairsForBatch, subgraphs, dimension) 253 | cost=f_grad_shared(tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings_data, buffer_tensor_data, nodesLens_data) 254 | f_update(lrate) 255 | 256 | if numpy.isnan(cost) or numpy.isinf(cost): 257 | print('bad cost detected: ', cost) 258 | return 259 | if numpy.mod(uidx, dispFreq) == 0: 260 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 261 | this_time = time.time() 262 | print 'Time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 263 | if saveto and numpy.mod(uidx, saveFreq) == 0: 264 | print('Saving...') 265 | if best_p is not None: 266 | params = best_p 267 | else: 268 | params = unzip(tparams) 269 | numpy.savez(saveto, history_errs=history_cost, **params) 270 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 271 | print('Done') 272 | gc.collect() 273 | 274 | end_time = time.time() 275 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 276 | print 'Training finished! 
Cost time == ', end_time-start_time,' s' 277 | 278 | 279 | 280 | 281 | 282 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphs.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | training 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | from theano import config 12 | from collections import OrderedDict 13 | import dataProcessTools 14 | import time 15 | import proxEmbedBySubgraphModel 16 | import gc 17 | import six.moves.cPickle as pickle # @UnresolvedImport 18 | 19 | SEED = 123 20 | numpy.random.seed(SEED) 21 | 22 | def numpy_floatX(data): 23 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 24 | 25 | def gradientDescentGroup(learning_rate,tparams,grads,trainingPairs, sequences, masks, lengths, wordsEmbeddings, cost): 26 | """ 27 | """ 28 | update=[(shared,shared-learning_rate*g) for g,shared in zip(grads,tparams.values())] 29 | func=theano.function([trainingPairs, sequences, masks, lengths, wordsEmbeddings],cost,updates=update,on_unused_input='ignore',mode='FAST_RUN') 30 | return func 31 | 32 | def adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost): 33 | """ 34 | An adaptive learning rate optimizer 35 | Parameters 36 | ---------- 37 | lr : Theano SharedVariable 38 | Initial learning rate 39 | tpramas: Theano SharedVariable 40 | Model parameters 41 | grads: Theano variable 42 | Gradients of cost w.r.t to parameres 43 | x: Theano variable 44 | Model inputs 45 | mask: Theano variable 46 | Sequence mask 47 | y: Theano variable 48 | Targets 49 | cost: Theano variable 50 | Objective fucntion to minimize 51 | 52 | Notes 53 | ----- 54 | For more information, see [ADADELTA]_. 55 | 56 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 57 | Rate Method*, arXiv:1212.5701. 58 | """ 59 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 60 | name='%s_grad' % k) 61 | for k, p in tparams.items()] 62 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 63 | name='%s_rup2' % k) 64 | for k, p in tparams.items()] 65 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 66 | name='%s_rgrad2' % k) 67 | for k, p in tparams.items()] 68 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 69 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 70 | for rg2, g in zip(running_grads2, grads)] 71 | f_grad_shared = theano.function([trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], cost, updates=zgup + rg2up, 72 | on_unused_input='ignore', 73 | name='adadelta_f_grad_shared') 74 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 75 | for zg, ru2, rg2 in zip(zipped_grads, 76 | running_up2, 77 | running_grads2)] 78 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 79 | for ru2, ud in zip(running_up2, updir)] 80 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 81 | f_update = theano.function([lr], [], updates=ru2up + param_up, 82 | on_unused_input='ignore', 83 | name='adadelta_f_update') 84 | 85 | return f_grad_shared, f_update 86 | 87 | 88 | def sgd(lr, tparams, grads, x, mask, y, cost): 89 | """ Stochastic Gradient Descent 90 | 91 | :note: A more complicated version of sgd then needed. This is 92 | done like that for adadelta and rmsprop. 
93 | 94 | """ 95 | # New set of shared variable that will contain the gradient 96 | # for a mini-batch. 97 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 98 | for k, p in tparams.items()] 99 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 100 | 101 | # Function that computes gradients for a mini-batch, but do not 102 | # updates the weights. 103 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 104 | name='sgd_f_grad_shared') 105 | 106 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 107 | 108 | # Function that updates the weights from the previously computed 109 | # gradient. 110 | f_update = theano.function([lr], [], updates=pup, 111 | name='sgd_f_update') 112 | 113 | return f_grad_shared, f_update 114 | 115 | def ortho_weight(ndim): 116 | """ 117 | """ 118 | W = numpy.random.randn(ndim, ndim) 119 | u, s, v = numpy.linalg.svd(W) 120 | return u.astype(theano.config.floatX) # @UndefinedVariable 121 | 122 | def init_params_weight(row,column): 123 | """ 124 | """ 125 | W = numpy.random.rand(row, column) 126 | W = W*2.0-1.0 127 | return W.astype(theano.config.floatX) # @UndefinedVariable 128 | 129 | def init_sharedVariables(options): 130 | """ 131 | """ 132 | print 'init shared Variables......' 133 | params = OrderedDict() 134 | Wi=init_params_weight(options['dimension'],options['word_dimension']) 135 | Wf=init_params_weight(options['dimension'],options['word_dimension']) 136 | Wo=init_params_weight(options['dimension'],options['word_dimension']) 137 | Wc=init_params_weight(options['dimension'],options['word_dimension']) 138 | 139 | Ui=ortho_weight(options['dimension']) 140 | Uf=ortho_weight(options['dimension']) 141 | Uo=ortho_weight(options['dimension']) 142 | Uc=ortho_weight(options['dimension']) 143 | 144 | bi=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 145 | bf=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 146 | bo=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 147 | bc=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 148 | 149 | w = numpy.random.rand(options['dimension'], ).astype(config.floatX) # @UndefinedVariable # initialize w with random values in (0,1) 150 | 151 | params['Wi']=Wi 152 | params['Wf']=Wf 153 | params['Wo']=Wo 154 | params['Wc']=Wc 155 | params['Ui']=Ui 156 | params['Uf']=Uf 157 | params['Uo']=Uo 158 | params['Uc']=Uc 159 | params['bi']=bi 160 | params['bf']=bf 161 | params['bo']=bo 162 | params['bc']=bc 163 | 164 | params['w']=w 165 | 166 | return params 167 | 168 | 169 | def init_tparams(params): 170 | tparams = OrderedDict() 171 | for kk, pp in params.items(): 172 | tparams[kk] = theano.shared(params[kk], name=kk) 173 | return tparams 174 | 175 | def unzip(zipped): 176 | """ 177 | When we pickle the model. Needed for the GPU stuff.
178 | """ 179 | new_params = OrderedDict() 180 | for kk, vv in zipped.items(): 181 | new_params[kk] = vv.get_value() 182 | return new_params 183 | 184 | main_dir='D:/dataset/test/' 185 | def proxEmbedBySubgraphs( 186 | trainingDataFile=main_dir+'train_classmate_1', 187 | wordsEmbeddings_data=None, 188 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', 189 | subpaths_map=None, 190 | subpaths_file=main_dir+'facebook/subpathsSaveFile', 191 | subgraphSaveFile='', 192 | maxlen_subpaths=1000, 193 | wordsSize=1000000, 194 | 195 | maxlen=100, # Sequence longer then this get ignored 196 | batch_size=1, 197 | is_shuffle_for_batch=False, 198 | dispFreq=5, 199 | saveFreq=5, 200 | saveto=main_dir+'facebook/path2vec-modelParams.npz', 201 | 202 | lrate=0.0001, 203 | word_dimension=22, 204 | dimension=64, 205 | discount_alpha=0.3, 206 | discount_beta=0.3, 207 | h_output_method='max-pooling', 208 | objective_function_method='hinge-loss', 209 | objective_function_param=0, 210 | max_epochs=10, 211 | 212 | decay=0.01, 213 | ): 214 | model_options = locals().copy() 215 | 216 | if wordsEmbeddings_data is None: 217 | if wordsEmbeddings_path is not None: 218 | wordsEmbeddings_data,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 219 | else: 220 | exit(0) 221 | trainingData,trainingPairs_data=dataProcessTools.getTrainingData(trainingDataFile) 222 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 223 | 224 | subgraphs=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphSaveFile) 225 | 226 | params=init_sharedVariables(model_options) 227 | tparams=init_tparams(params) 228 | print 'Generate models ......' 229 | 230 | trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost=proxEmbedBySubgraphModel.proxEmbedBySubgraphModel(model_options, tparams) 231 | 232 | print 'Generate gradients ......' 233 | grads=tensor.grad(cost,wrt=list(tparams.values())) 234 | print 'Using Adadelta to generate functions ......' 235 | this_time = time.time() 236 | print 'Start to compile and optimize, time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 237 | lr = tensor.scalar(name='lr') 238 | f_grad_shared, f_update=adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost) 239 | 240 | print 'Start training models ......' 
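# Training loop: for each mini-batch, build the DAG tensors with
# generateSequenceAndMasksForSingleSequenceWithLengthAsymmetric, compute the cost and gradient
# statistics with f_grad_shared, then apply the Adadelta update with f_update(lrate). Training
# stops early on a NaN/Inf cost; progress is printed every dispFreq updates and the parameters
# are saved to `saveto` (plus a pickled options file) every saveFreq updates.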
241 | best_p = None 242 | history_cost=[] 243 | 244 | start_time = time.time() 245 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 246 | uidx=0 247 | for eidx in range(max_epochs): 248 | for _, batch in allBatches: 249 | uidx += 1 250 | trainingDataForBatch=[trainingData[i] for i in batch] 251 | trainingPairsForBatch=[trainingPairs_data[i] for i in batch] 252 | tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data, nodesLens_data=dataProcessTools.generateSequenceAndMasksForSingleSequenceWithLengthAsymmetric(trainingDataForBatch, trainingPairsForBatch, subgraphs, dimension) 253 | cost=f_grad_shared(tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings_data, buffer_tensor_data, nodesLens_data) 254 | f_update(lrate) 255 | 256 | if numpy.isnan(cost) or numpy.isinf(cost): 257 | print('bad cost detected: ', cost) 258 | return 259 | if numpy.mod(uidx, dispFreq) == 0: 260 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 261 | this_time = time.time() 262 | print 'Time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 263 | if saveto and numpy.mod(uidx, saveFreq) == 0: 264 | print('Saving...') 265 | if best_p is not None: 266 | params = best_p 267 | else: 268 | params = unzip(tparams) 269 | numpy.savez(saveto, history_errs=history_cost, **params) 270 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 271 | print('Done') 272 | gc.collect() 273 | 274 | end_time = time.time() 275 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 276 | print 'Training finished! Cost time == ', end_time-start_time,' s' 277 | 278 | 279 | 280 | 281 | 282 | --------------------------------------------------------------------------------