├── D2AGE ├── symmetric │ ├── toolsFunction.py │ ├── evaluateTools.py │ ├── proxEmbedBySubgraphProcessModelBatch.py │ ├── directedGraphLSTMModel.py │ ├── pythonParamsConfig │ ├── proxEmbedBySubgraphModel.py │ ├── proxEmbedBySubgraphProcessAndAssess.py │ ├── experimentForOneFileByParams.py │ ├── prepareSubgraphsWithAllSubpaths.py │ ├── dataProcessTools.py │ └── proxEmbedBySubgraphs.py └── asymmetric │ ├── proxEmbedBySubgraphProcessModelBatch.py │ ├── evaluateTools.py │ ├── toolsFunction.py │ ├── directedGraphLSTMModel.py │ ├── pythonParamsConfig │ ├── proxEmbedBySubgraphModel.py │ ├── proxEmbedBySubgraphProcessAndAssess.py │ ├── prepareSubgraphsWithAllSubpaths.py │ ├── experimentForOneFileByParams.py │ ├── dataProcessTools.py │ └── proxEmbedBySubgraphs.py └── README.md /D2AGE/symmetric/toolsFunction.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | ''' 4 | 5 | import numpy 6 | import theano 7 | from theano import tensor 8 | from theano.ifelse import ifelse 9 | 10 | def mapSortByValueDESC(map,top): 11 | """ 12 | sort DESC 13 | """ 14 | if top>len(map): 15 | top=len(map) 16 | items=map.items() 17 | backitems=[[v[1],v[0]] for v in items] 18 | backitems.sort(reverse=True) 19 | # backitems.sort() 20 | e=[ backitems[i][1] for i in range(top)] 21 | return e 22 | 23 | 24 | def mapSortByValueASC(map,top): 25 | """ 26 | sort ASC 27 | """ 28 | if top>len(map): 29 | top=len(map) 30 | items=map.items() 31 | backitems=[[v[1],v[0]] for v in items] 32 | # backitems.sort(reverse=True) 33 | backitems.sort() 34 | e=[ backitems[i][1] for i in range(top)] 35 | return e 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # D2AGE 2 | 3 | This is the source code for our paper: \ 4 | Distance-aware DAG Embedding for Proximity Search on Heterogeneous Graphs. AAAI, 2018. 5 | 6 | ============================================================ 7 | 8 | Files list:\ 9 | 1). D2AGE : the main directory for the source code \ 10 | 2). readme : this file 11 | 12 | The code is written in Python 2.7, and we use Theano for model development. You should generate the subpaths yourself, that is: \ 13 | 1) run random walks on the given graph, \ 14 | 2) truncate the subpaths from the sampled paths and save them to a file. 15 | 16 | After this step, you can use this code to first generate the DAGs, and then model them with D2AGE. 17 | 18 | ============================================================ 19 | 20 | D2AGE directory 21 | 22 | There are two directories in /D2AGE/: symmetric and asymmetric. 23 | The symmetric dir contains the source code for symmetric relations, while the asymmetric dir contains the source code for asymmetric relations. Below we use the symmetric version to explain the details. 24 | 25 | In /D2AGE/symmetric, \ 26 | 1) pythonParamsConfig : sets all the parameters used in the model; each parameter is explained in that file. \ 27 | 2) prepareSubgraphsWithAllSubpaths.py : generates the DAGs between (q,v) from the given sampled subpaths. \ 28 | 3) experimentForOneFileByParams.py : after DAG generation, use this file to train the model, and then test it. 29 | 30 | The methods in the other files are called from the above three files.
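As a rough, hypothetical sketch of steps 1) and 2) above: the helper below is not part of this repository, and the in-memory graph format, function names, and truncation rule are assumptions; the resulting subpaths still have to be written out in whatever on-disk format dataProcessTools.loadAllSubPaths expects.

```python
import random

def sample_random_walks(graph, start, walk_len=20, num_walks=10):
    """Sample fixed-length random walks from `start`; graph is {node: [neighbors]}."""
    walks = []
    for _ in range(num_walks):
        walk = [start]
        for _ in range(walk_len - 1):
            neighbors = graph.get(walk[-1], [])
            if not neighbors:
                break
            walk.append(random.choice(neighbors))
        walks.append(walk)
    return walks

def truncate_subpaths(walks, query, candidate, maxlen_subpaths=1000):
    """Keep the walk segments running between query q and candidate v as subpaths."""
    subpaths = []
    for walk in walks:
        if query in walk and candidate in walk:
            i, j = walk.index(query), walk.index(candidate)
            lo, hi = min(i, j), max(i, j)
            segment = walk[lo:hi + 1]
            if 2 <= len(segment) <= maxlen_subpaths:
                subpaths.append(segment)
    return subpaths
```

Each subpath here is simply the segment of a sampled walk lying between the query node q and a candidate node v, kept only if it is no longer than maxlen_subpaths.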
31 | 32 | ============================================================ 33 | 34 | If you use the code, please cite our paper: 35 | 36 | @inproceedings{liu2018distance, \ 37 | title={Distance-aware DAG Embedding for Proximity Search on Heterogeneous Graphs}, \ 38 | author={Liu, Zemin and Zheng, Vincent W and Zhao, Zhou and Zhu, Fanwei and Chang, Kevin Chen-Chuan and Wu, Minghui and Ying, Jing}, \ 39 | year={2018}, \ 40 | organization={AAAI} \ 41 | } 42 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphProcessModelBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | the processing model 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | import directedGraphLSTMModel 10 | from theano.ifelse import ifelse 11 | 12 | def proxEmbedBySubgraphProcessModel(options, tparams): 13 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 14 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples * maxlen * maxlen 15 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 16 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 17 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 18 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 19 | nodesLens=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 20 | 21 | def _processSubgraph(i): 22 | length=lengths[i] 23 | x=xs[i,:length] 24 | mask=masks[i,:length,:length] 25 | nodesLen=nodesLens[i,:length] 26 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 27 | return emb 28 | 29 | embx=None 30 | rval,update=theano.scan( 31 | _processSubgraph, 32 | sequences=tensor.arange(lengths.shape[0]), 33 | ) 34 | rval=discountModel(options['discount_alpha'], subgraph_lens)[:,None]*rval 35 | embx=rval.max(axis=0) 36 | 37 | score=tensor.dot(embx,tparams['w']) 38 | 39 | return xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score 40 | 41 | def discountModel(alpha,length): 42 | """ 43 | discount 44 | """ 45 | return tensor.exp(alpha*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/symmetric/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluate criteria 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | """ 11 | ideal=set(ideal) 12 | accumulation=0.0 13 | count=0 14 | for i in range(len(test)): 15 | if i>=k: 16 | break 17 | if test[i] in ideal: 18 | count+=1 19 | accumulation+=count/(i+1.0) 20 | m=len(ideal) 21 | n=k 22 | x=0 23 | if m>n: 24 | x=n 25 | else: 26 | x=m 27 | if x==0: 28 | return 0 29 | return accumulation/x 30 | 31 | 32 | def get_MAP(k,ideal_map,test_map): 33 | """ 34 | """ 35 | accumulation=0.0 36 | for key in ideal_map.keys(): 37 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 38 | if len(ideal_map)==0: 39 | return 0 40 | return accumulation/len(ideal_map) 41 | 42 | 43 | def get_nDCG(k,ideal,test): 44 | """ 45 | """ 46 | ideal=set(ideal) 47 | accumulation=0.0 48 | for i in range(len(test)): 49 | if i>=k: 50 | break 51 | if test[i] in ideal: 52 | if i==0: 53 | accumulation+=1.0 54 
| else: 55 | accumulation+=1.0/numpy.log2(i+1) 56 | normalization=0.0 57 | for i in range(len(ideal)): 58 | if i>=k: 59 | break 60 | if i==0: 61 | normalization+=1.0 62 | else: 63 | normalization+=1.0/numpy.log2(i+1) 64 | if normalization==0: 65 | return 0 66 | return accumulation/normalization 67 | 68 | def get_MnDCG(k,ideal_map,test_map): 69 | """ 70 | """ 71 | accumulation=0.0 72 | for key in ideal_map.keys(): 73 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 74 | if len(ideal_map)==0: 75 | return 0 76 | return accumulation/len(ideal_map) 77 | 78 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluate tools 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | """ 11 | ideal=set(ideal) 12 | accumulation=0.0 13 | count=0 14 | for i in range(len(test)): 15 | if i>=k: 16 | break 17 | if test[i] in ideal: 18 | count+=1 19 | accumulation+=count/(i+1.0) 20 | m=len(ideal) 21 | n=k 22 | x=0 23 | if m>n: 24 | x=n 25 | else: 26 | x=m 27 | if x==0: 28 | return 0 29 | return accumulation/x 30 | 31 | 32 | def get_MAP(k,ideal_map,test_map): 33 | """ 34 | """ 35 | accumulation=0.0 36 | for key in ideal_map.keys(): 37 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 38 | if len(ideal_map)==0: 39 | return 0 40 | return accumulation/len(ideal_map) 41 | 42 | 43 | def get_nDCG(k,ideal,test): 44 | """ 45 | """ 46 | ideal=set(ideal) 47 | accumulation=0.0 48 | for i in range(len(test)): 49 | if i>=k: 50 | break 51 | if test[i] in ideal: 52 | if i==0: 53 | accumulation+=1.0 54 | else: 55 | accumulation+=1.0/numpy.log2(i+1) 56 | normalization=0.0 57 | for i in range(len(ideal)): 58 | if i>=k: 59 | break 60 | if i==0: 61 | normalization+=1.0 62 | else: 63 | normalization+=1.0/numpy.log2(i+1) 64 | if normalization==0: 65 | return 0 66 | return accumulation/normalization 67 | 68 | def get_MnDCG(k,ideal_map,test_map): 69 | """ 70 | """ 71 | accumulation=0.0 72 | for key in ideal_map.keys(): 73 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 74 | if len(ideal_map)==0: 75 | return 0 76 | return accumulation/len(ideal_map) 77 | 78 | 79 | -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphProcessModelBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | process model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | import directedGraphLSTMModel 12 | from theano.ifelse import ifelse 13 | 14 | def proxEmbedBySubgraphProcessModel(options, tparams): 15 | """ 16 | """ 17 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 18 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples * maxlen * maxlen 19 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 20 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 21 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 22 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 23 | nodesLens=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 24 | 25 | def 
_processSubgraph(i): 26 | length=lengths[i] 27 | x=xs[i,:length] 28 | mask=masks[i,:length,:length] 29 | nodesLen=nodesLens[i,:length] 30 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 31 | return emb 32 | 33 | embx=None 34 | rval,update=theano.scan( 35 | _processSubgraph, 36 | sequences=tensor.arange(lengths.shape[0]), 37 | ) 38 | 39 | rval=discountModel(options['discount_alpha'], subgraph_lens)[:,None]*rval 40 | embx=rval.max(axis=0) 41 | 42 | score=tensor.dot(embx,tparams['w']) 43 | 44 | return xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score 45 | 46 | def discountModel(alpha,length): 47 | """ 48 | discount 49 | """ 50 | return tensor.exp(alpha*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/asymmetric/toolsFunction.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2016-08-09 4 | 5 | @author: Administrator 6 | 7 | This file mainly contains some utility (tool) methods. 8 | ''' 9 | 10 | import numpy 11 | import theano 12 | from theano import tensor 13 | from theano.ifelse import ifelse 14 | 15 | def mapSortByValueDESC(map,top): 16 | """ 17 | Sort the map by value in descending order and return the top keys, 18 | where top<=len(map). 19 | Tested. 20 | """ 21 | if top>len(map): # if the given top is larger than the size of the map, shrink top to len(map) 22 | top=len(map) 23 | items=map.items() 24 | backitems=[[v[1],v[0]] for v in items] # swap key and value 25 | backitems.sort(reverse=True) # reverse=True means descending order 26 | # backitems.sort() # ascending order 27 | e=[ backitems[i][1] for i in range(top)] # return the keys in order 28 | return e 29 | 30 | 31 | def mapSortByValueASC(map,top): 32 | """ 33 | Sort the map by value in ascending order and return the top keys, 34 | where top<=len(map). 35 | Tested. 36 | """ 37 | if top>len(map): # if the given top is larger than the size of the map, shrink top to len(map) 38 | top=len(map) 39 | items=map.items() 40 | backitems=[[v[1],v[0]] for v in items] # swap key and value 41 | # backitems.sort(reverse=True) # reverse=True means descending order 42 | backitems.sort() # ascending order 43 | e=[ backitems[i][1] for i in range(top)] # return the keys in order 44 | return e 45 | 46 | 47 | def max_poolingForMatrix(x): 48 | """ 49 | Use theano.scan to compute max-pooling. 50 | x is the matrix to pool; the pooling keeps the element with the largest absolute value column-wise. 51 | Tested. 52 | """ 53 | def _funcForRow(row,max_array): 54 | """ 55 | process each row 56 | """ 57 | def _funcForElement(element,max_value): 58 | """ 59 | process each element 60 | """ 61 | # return tensor.switch(tensor.gt(tensor.abs_(element), tensor.abs_(max_value)), element, max_value) 62 | return ifelse(tensor.gt(tensor.abs_(element), tensor.abs_(max_value)), element, max_value) 63 | 64 | r,u=theano.scan( 65 | fn=_funcForElement, 66 | sequences=[row,max_array], 67 | ) 68 | # r is the running max_array after processing this row 69 | return r 70 | 71 | rval,update=theano.scan( 72 | fn=_funcForRow, 73 | sequences=x, 74 | outputs_info=tensor.alloc(numpy.asarray(0., dtype=theano.config.floatX), # a zero vector of length x.shape[1] @UndefinedVariable 75 | x.shape[1], 76 | ), 77 | ) 78 | # the last entry of rval is the final column-wise abs-max result 79 | return rval[-1] -------------------------------------------------------------------------------- /D2AGE/asymmetric/directedGraphLSTMModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | DAG LSTM model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | 12 | def directedGraphLSTMModel(options, tparams, x, mask, wemb, buffer_tensor, nodesLen): 13 | 14 | length=x.shape[0] 15 | dimension=wemb.shape[1] 16 | 17 |
proj=wemb[x] 18 | discount_vector=discountModel(options['discount_beta'], nodesLen) 19 | 20 | def _step(index,hArr,cArr): 21 | 22 | hi_sum=None # shape=dimension*0 23 | 24 | discount=mask[index]*discount_vector # shape=maxlen*0 25 | hi_sum=(discount[:,None] * hArr).max(axis=0) 26 | 27 | # input gate, vector, shape= lstm_dimension * 0 28 | i=tensor.nnet.sigmoid(tensor.dot(tparams['Wi'], proj[index]) + tensor.dot(tparams['Ui'], hi_sum) + tparams['bi']) 29 | # forget gate, vector, shape= maxlen*lstm_dimension 30 | f=tensor.nnet.sigmoid(tensor.dot(tparams['Wf'], proj[index]) + tensor.dot((mask[index])[:,None]*hArr, tparams['Uf']) + tparams['bf']) 31 | # output gate, vector, shape= lstm_dimension * 0 32 | o=tensor.nnet.sigmoid(tensor.dot(tparams['Wo'], proj[index]) + tensor.dot(tparams['Uo'], hi_sum) + tparams['bo']) 33 | # new temp cell, vector, shape= lstm_dimension * 0 34 | c_=tensor.tanh(tensor.dot(tparams['Wc'], proj[index]) + tensor.dot(tparams['Uc'], hi_sum) + tparams['bc']) 35 | 36 | c=None 37 | 38 | c=i*c_ + (discount[:,None] * (f * ((mask[index])[:,None]*cArr))).max(axis=0) 39 | 40 | h=o*tensor.tanh(c) 41 | 42 | hArr=tensor.set_subtensor(hArr[index, :], h) 43 | cArr=tensor.set_subtensor(cArr[index, :], c) 44 | 45 | return hArr, cArr 46 | 47 | rval, update=theano.scan( 48 | _step, 49 | sequences=tensor.arange(x.shape[0]), 50 | outputs_info=[tensor.alloc(numpy_floatX(0.), length, options['dimension']),# @UndefinedVariable 51 | tensor.alloc(numpy_floatX(0.), length, options['dimension'])],# @UndefinedVariable 52 | ) 53 | if options['h_output_method']=='h': 54 | return rval[0][-1][-1] 55 | elif options['h_output_method']=='mean-pooling': 56 | return rval[0][-1].mean(axis=0) 57 | elif options['h_output_method']=='max-pooling': 58 | return rval[0][-1].max(axis=0) 59 | else: 60 | return rval[0][-1][-1] 61 | 62 | def numpy_floatX(data): 63 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 64 | 65 | def discountModel(beta,length): 66 | """ 67 | """ 68 | return tensor.exp(beta*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/symmetric/directedGraphLSTMModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | DAG LSTM model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | 12 | def directedGraphLSTMModel(options, tparams, x, mask, wemb, buffer_tensor, nodesLen): 13 | 14 | """ 15 | """ 16 | 17 | length=x.shape[0] 18 | dimension=wemb.shape[1] 19 | 20 | proj=wemb[x] 21 | discount_vector=discountModel(options['discount_beta'], nodesLen) 22 | 23 | def _step(index,hArr,cArr): 24 | 25 | 26 | hi_sum=None 27 | 28 | discount=mask[index]*discount_vector # shape=maxlen*0 29 | hi_sum=(discount[:,None] * hArr).max(axis=0) 30 | 31 | # input gate, vector, shape= lstm_dimension * 0 32 | i=tensor.nnet.sigmoid(tensor.dot(tparams['Wi'], proj[index]) + tensor.dot(tparams['Ui'], hi_sum) + tparams['bi']) 33 | # forget gate, vector, shape= maxlen*lstm_dimension 34 | f=tensor.nnet.sigmoid(tensor.dot(tparams['Wf'], proj[index]) + tensor.dot((mask[index])[:,None]*hArr, tparams['Uf']) + tparams['bf']) 35 | # output gate, vector, shape= lstm_dimension * 0 36 | o=tensor.nnet.sigmoid(tensor.dot(tparams['Wo'], proj[index]) + tensor.dot(tparams['Uo'], hi_sum) + tparams['bo']) 37 | # new temp cell, vector, shape= lstm_dimension * 0 38 | c_=tensor.tanh(tensor.dot(tparams['Wc'], proj[index]) + 
tensor.dot(tparams['Uc'], hi_sum) + tparams['bc']) 39 | 40 | c=None 41 | c=i*c_ + (discount[:,None] * (f * ((mask[index])[:,None]*cArr))).max(axis=0) 42 | 43 | h=o*tensor.tanh(c) 44 | 45 | hArr=tensor.set_subtensor(hArr[index, :], h) 46 | cArr=tensor.set_subtensor(cArr[index, :], c) 47 | 48 | return hArr, cArr 49 | 50 | rval, update=theano.scan( 51 | _step, 52 | sequences=tensor.arange(x.shape[0]), 53 | outputs_info=[tensor.alloc(numpy_floatX(0.), length, options['dimension']),# @UndefinedVariable h 54 | tensor.alloc(numpy_floatX(0.), length, options['dimension'])],# @UndefinedVariable c 55 | ) 56 | if options['h_output_method']=='h': 57 | return rval[0][-1][-1] 58 | elif options['h_output_method']=='mean-pooling': 59 | return rval[0][-1].mean(axis=0) 60 | elif options['h_output_method']=='max-pooling': 61 | return rval[0][-1].max(axis=0) 62 | else: 63 | return rval[0][-1][-1] 64 | 65 | def numpy_floatX(data): 66 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 67 | 68 | def discountModel(beta,length): 69 | """ 70 | """ 71 | return tensor.exp(beta*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/asymmetric/pythonParamsConfig: -------------------------------------------------------------------------------- 1 | [param] 2 | 3 | ############################################ 4 | # training data dictory 5 | ############################################ 6 | # main work dir 7 | root_dir = D:/dataset/dataset 8 | # the name of one dataset, such as linkedin, facebook 9 | dataset_name = facebook 10 | # number of labels for each dataset, such as 10,100,1000 11 | suffix = 100 12 | # relatin name, such as classmate,family 13 | class_name = classmate 14 | # the index of the dataset file 15 | index = 1 16 | 17 | ############################################ 18 | # paths for some prepared data 19 | ############################################ 20 | # words embeddings path 21 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/nodesFeatures 22 | # sub-paths save file 23 | subpaths_file = %(root_dir)s/%(dataset_name)s/subpathsSaveFile 24 | # DAGs save files 25 | subgraphSaveFile = %(root_dir)s/%(dataset_name)s/subgraphSaveFile 26 | 27 | ############################################ 28 | # experiment parameters - do not need to change frequently 29 | ############################################ 30 | # the max length for sub-paths 31 | maxlen_subpaths = 1000 32 | # the size of words vocabulary 33 | wordsSize = 10000000 34 | # Sequence longer than this get ignored 35 | maxlen = 1000 36 | # use a batch for training. This is the size of this batch. 37 | batch_size = 10 38 | # if need shuffle for training 39 | is_shuffle_for_batch = True 40 | # the frequences for display 41 | dispFreq = 5 42 | # the frequences for saving the parameters 43 | saveFreq = 5 44 | # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. It will be generated in the code. 45 | saveto = 46 | # the top num to predict 47 | top_num = 10 48 | 49 | ############################################ 50 | # experiment parameters 51 | ############################################ 52 | # learning rate 53 | lrate = 0.0001 54 | # dimension of words embeddings 55 | word_dimension = 10 56 | # the dimension of paths embeddings 57 | dimension = 18 58 | 59 | # discount parameter alpha 60 | discount_alpha = 0.3 61 | # discount parameter beta 62 | discount_beta = 0.3 63 | # the output way of DAG-LSTM. 
There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 64 | h_output_method = max-pooling 65 | # loss function, we use sigmoid 66 | objective_function_method = sigmoid 67 | # the parameter in loss function, mu 68 | objective_function_param = 0.5 69 | # the max epochs for training 70 | max_epochs = 50 71 | # decay parameter lambda 72 | decay = 0.0001 73 | 74 | ############################################ 75 | # DAG generation parameters 76 | ############################################ 77 | # the number of generating DAGs between (q,v) 78 | subgraphNum = 0 79 | # if subgraphNum = 0, then use this proportion to generate the DAGs. subgraphNum=proportion * #(subpaths) 80 | proportion = 0.8 -------------------------------------------------------------------------------- /D2AGE/symmetric/pythonParamsConfig: -------------------------------------------------------------------------------- 1 | [param] 2 | 3 | ############################################ 4 | # training data dictory 5 | ############################################ 6 | # main work dir 7 | root_dir = D:/dataset/dataset 8 | # the name of one dataset, such as linkedin, facebook 9 | dataset_name = facebook 10 | # number of labels for each dataset, such as 10,100,1000 11 | suffix = 100 12 | # relatin name, such as classmate,family 13 | class_name = classmate 14 | # the index of the dataset file 15 | index = 1 16 | 17 | ############################################ 18 | # paths for some prepared data 19 | ############################################ 20 | # words embeddings path 21 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/nodesFeatures 22 | # sub-paths save file 23 | subpaths_file = %(root_dir)s/%(dataset_name)s/subpathsSaveFile 24 | # DAGs save files 25 | subgraphSaveFile = %(root_dir)s/%(dataset_name)s/subgraphSaveFile 26 | 27 | ############################################ 28 | # experiment parameters - do not need to change frequently 29 | ############################################ 30 | # the max length for sub-paths 31 | maxlen_subpaths = 1000 32 | # the size of words vocabulary 33 | wordsSize = 10000000 34 | # Sequence longer than this get ignored 35 | maxlen = 1000 36 | # use a batch for training. This is the size of this batch. 37 | batch_size = 10 38 | # if need shuffle for training 39 | is_shuffle_for_batch = True 40 | # the frequences for display 41 | dispFreq = 5 42 | # the frequences for saving the parameters 43 | saveFreq = 5 44 | # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. It will be generated in the code. 45 | saveto = 46 | # the top num to predict 47 | top_num = 10 48 | 49 | ############################################ 50 | # experiment parameters 51 | ############################################ 52 | # learning rate 53 | lrate = 0.0001 54 | # dimension of words embeddings 55 | word_dimension = 10 56 | # the dimension of paths embeddings 57 | dimension = 18 58 | 59 | # discount parameter alpha 60 | discount_alpha = 0.3 61 | # discount parameter beta 62 | discount_beta = 0.3 63 | # the output way of DAG-LSTM. 
There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 64 | h_output_method = max-pooling 65 | # loss function, we use sigmoid 66 | objective_function_method = sigmoid 67 | # the parameter in loss function, mu 68 | objective_function_param = 0.5 69 | # the max epochs for training 70 | max_epochs = 50 71 | # decay parameter lambda 72 | decay = 0.0001 73 | 74 | ############################################ 75 | # DAG generation parameters 76 | ############################################ 77 | # the number of generating DAGs between (q,v) 78 | subgraphNum = 0 79 | # if subgraphNum = 0, then use this proportion to generate the DAGs. subgraphNum=proportion * #(subpaths) 80 | proportion = 0.8 -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | Training Model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | from theano.ifelse import ifelse 12 | import directedGraphLSTMModel 13 | 14 | def proxEmbedBySubgraphModel(options, tparams): 15 | """ 16 | """ 17 | trainingPairs=tensor.tensor3('trainingPairs',dtype='int64') # 3D tensor,shape=#(triples)*4*2 18 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 19 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples*maxlen*maxlen 20 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 21 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 22 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 23 | 24 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 25 | nodesLens=tensor.matrix('nodesLens', dtype='int64') # shape=nsamples*maxlen 26 | 27 | def _processTuple(index , lossSum): 28 | tuple=trainingPairs[index] 29 | 30 | def _processSubgraph(i): 31 | length=lengths[i] 32 | x=xs[i,:length] 33 | mask=masks[i,:length,:length] 34 | nodesLen=nodesLens[i,:length] 35 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 36 | return emb 37 | 38 | def iftFunc(): 39 | embx=tensor.zeros(options['dimension'],).astype(theano.config.floatX) # @UndefinedVariable 40 | return embx 41 | 42 | def iffFunc(start, end): 43 | embx=None 44 | rval,update=theano.scan( 45 | _processSubgraph, 46 | sequences=tensor.arange(start,end), 47 | ) 48 | subgraph_len=subgraph_lens[start:end] 49 | 50 | rval=discountModel(options['discount_alpha'], subgraph_len)[:,None]*rval 51 | embx=rval.max(axis=0) 52 | 53 | return embx 54 | 55 | start=tuple[0][0] 56 | end=tuple[0][1] 57 | emb1=None 58 | emb1=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) 59 | 60 | start=tuple[2][0] 61 | end=tuple[2][1] 62 | emb2=None 63 | emb2=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) 64 | 65 | loss=0 66 | param=options['objective_function_param'] 67 | if options['objective_function_method']=='sigmoid': 68 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # 
sigmoid 69 | else: # hinge-loss 70 | value=param + tensor.dot(emb2,tparams['w']) - tensor.dot(emb1,tparams['w']) 71 | loss=value*(value>0) 72 | 73 | return loss+lossSum 74 | 75 | rval, update=theano.scan( 76 | _processTuple, 77 | sequences=tensor.arange(trainingPairs.shape[0]), 78 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable 79 | ) 80 | 81 | cost=rval[-1] 82 | cost+=options['decay']*(tparams['Wi'] ** 2).sum() 83 | cost+=options['decay']*(tparams['Wf'] ** 2).sum() 84 | cost+=options['decay']*(tparams['Wo'] ** 2).sum() 85 | cost+=options['decay']*(tparams['Wc'] ** 2).sum() 86 | cost+=options['decay']*(tparams['Ui'] ** 2).sum() 87 | cost+=options['decay']*(tparams['Uf'] ** 2).sum() 88 | cost+=options['decay']*(tparams['Uo'] ** 2).sum() 89 | cost+=options['decay']*(tparams['Uc'] ** 2).sum() 90 | cost+=options['decay']*(tparams['bi'] ** 2).sum() 91 | cost+=options['decay']*(tparams['bf'] ** 2).sum() 92 | cost+=options['decay']*(tparams['bo'] ** 2).sum() 93 | cost+=options['decay']*(tparams['bc'] ** 2).sum() 94 | 95 | return trainingPairs, xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost 96 | 97 | 98 | def discountModel(alpha,length): 99 | return tensor.exp(alpha*length*(-1)) -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | model 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | from theano.ifelse import ifelse 12 | import directedGraphLSTMModel 13 | 14 | def proxEmbedBySubgraphModel(options, tparams): 15 | """ 16 | """ 17 | trainingPairs=tensor.tensor3('trainingPairs',dtype='int64') # 3D tensor,shape=#(triples)*4*2 18 | xs=tensor.matrix('xs', dtype='int64') # shape=nsamples*maxlen 19 | masks=tensor.tensor3('masks', dtype=theano.config.floatX) # @UndefinedVariable # shape=nsamples*maxlen*maxlen 20 | subgraph_lens=tensor.vector('subgraph_lens', dtype='int64') # shape=nsamples*0 21 | lengths=tensor.vector('lengths',dtype='int64') # shape=#(xs) * 0 22 | wordsEmbeddings=tensor.matrix('wordsEmbeddings', dtype=theano.config.floatX) # @UndefinedVariable # shape=#(words) * wordsDimension 23 | 24 | buffer_tensor=tensor.tensor3('buffer_tensor', dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*maxlen*dimension 25 | nodesLens=tensor.matrix('nodesLens', dtype='int64') # shape=nsamples*maxlen 26 | 27 | def _processTuple(index , lossSum): 28 | tuple=trainingPairs[index] 29 | 30 | def _processSubgraph(i): 31 | length=lengths[i] 32 | x=xs[i,:length] 33 | mask=masks[i,:length,:length] 34 | nodesLen=nodesLens[i,:length] 35 | emb=directedGraphLSTMModel.directedGraphLSTMModel(options, tparams, x, mask, wordsEmbeddings, buffer_tensor, nodesLen) 36 | return emb 37 | 38 | def iftFunc(): 39 | embx=tensor.zeros(options['dimension'],).astype(theano.config.floatX) # @UndefinedVariable 40 | return embx 41 | 42 | def iffFunc(start, end): 43 | embx=None 44 | rval,update=theano.scan( 45 | _processSubgraph, 46 | sequences=tensor.arange(start,end), 47 | ) 48 | subgraph_len=subgraph_lens[start:end] 49 | 50 | rval=discountModel(options['discount_alpha'], subgraph_len)[:,None]*rval 51 | embx=rval.max(axis=0) 52 | 53 | return embx 54 | 55 | start=tuple[0][0] 56 | end=tuple[1][1] 57 | emb1=None 58 | emb1=iffFunc(start,end) 59 | 60 | start=tuple[2][0] 61 | end=tuple[3][1] 
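# Note: each training tuple appears to pack four (start, end) index ranges into the flattened
# DAG batch xs. emb1 above is the distance-discounted, max-pooled embedding over the DAGs of
# the positive pair (ranges tuple[0]..tuple[1]), and emb2 below is the same quantity for the
# negative pair (ranges tuple[2]..tuple[3]); the loss that follows pushes dot(emb1, w) above
# dot(emb2, w).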
62 | emb2=None 63 | emb2=iffFunc(start,end) 64 | 65 | loss=0 66 | param=options['objective_function_param'] 67 | if options['objective_function_method']=='sigmoid': 68 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # sigmoid 69 | else: # hinge-loss 70 | value=param + tensor.dot(emb2,tparams['w']) - tensor.dot(emb1,tparams['w']) 71 | loss=value*(value>0) 72 | 73 | return tensor.cast(loss+lossSum, theano.config.floatX) # @UndefinedVariable 74 | 75 | rval, update=theano.scan( 76 | _processTuple, 77 | sequences=tensor.arange(trainingPairs.shape[0]), 78 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable 79 | ) 80 | 81 | cost=rval[-1] 82 | cost+=options['decay']*(tparams['Wi'] ** 2).sum() 83 | cost+=options['decay']*(tparams['Wf'] ** 2).sum() 84 | cost+=options['decay']*(tparams['Wo'] ** 2).sum() 85 | cost+=options['decay']*(tparams['Wc'] ** 2).sum() 86 | cost+=options['decay']*(tparams['Ui'] ** 2).sum() 87 | cost+=options['decay']*(tparams['Uf'] ** 2).sum() 88 | cost+=options['decay']*(tparams['Uo'] ** 2).sum() 89 | cost+=options['decay']*(tparams['Uc'] ** 2).sum() 90 | cost+=options['decay']*(tparams['bi'] ** 2).sum() 91 | cost+=options['decay']*(tparams['bf'] ** 2).sum() 92 | cost+=options['decay']*(tparams['bo'] ** 2).sum() 93 | cost+=options['decay']*(tparams['bc'] ** 2).sum() 94 | 95 | return trainingPairs, xs, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost 96 | 97 | 98 | def discountModel(alpha,length): 99 | """ 100 | discount 101 | """ 102 | return tensor.exp(alpha*length*(-1)) 103 | -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphProcessAndAssess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from collections import OrderedDict 11 | import proxEmbedBySubgraphProcessModelBatch 12 | import dataProcessTools 13 | import toolsFunction 14 | import evaluateTools 15 | 16 | def load_params(path, params): 17 | """ 18 | load all params from file 19 | """ 20 | pp = numpy.load(path) 21 | for kk, vv in params.items(): 22 | if kk not in pp: 23 | raise Warning('%s is not in the archive' % kk) 24 | params[kk] = pp[kk] 25 | 26 | return params 27 | 28 | 29 | def get_proxEmbedBySubgraphModel( 30 | 31 | model_params_path='', 32 | word_dimension=0, 33 | dimension=0, 34 | discount_alpha=0.3, 35 | discount_beta=0.3, 36 | h_output_method='max-pooling', 37 | ): 38 | """ 39 | """ 40 | model_options = locals().copy() 41 | 42 | tparams = OrderedDict() 43 | tparams['Wi']=None 44 | tparams['Wf']=None 45 | tparams['Wo']=None 46 | tparams['Wc']=None 47 | tparams['Ui']=None 48 | tparams['Uf']=None 49 | tparams['Uo']=None 50 | tparams['Uc']=None 51 | tparams['bi']=None 52 | tparams['bf']=None 53 | tparams['bo']=None 54 | tparams['bc']=None 55 | tparams['w']=None 56 | tparams=load_params(model_params_path, tparams) 57 | 58 | sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score=proxEmbedBySubgraphProcessModelBatch.proxEmbedBySubgraphProcessModel(model_options, tparams) 59 | func=theano.function([sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], score, on_unused_input='ignore') 60 | return func 61 | 62 | 63 | def compute_proxEmbedBySubgraph( 64 | wordsEmbeddings=None, 65 | 
wordsEmbeddings_path=None, 66 | word_dimension=0, 67 | dimension=0, 68 | wordsSize=0, 69 | subpaths_map=None, 70 | subpaths_file=None, 71 | subgraphs_file='', 72 | maxlen_subpaths=1000, 73 | maxlen=100, # Sequence longer then this get ignored 74 | 75 | test_data_file='', 76 | top_num=10, 77 | ideal_data_file='', 78 | func=None, 79 | ): 80 | model_options = locals().copy() 81 | 82 | if wordsEmbeddings is None: 83 | if wordsEmbeddings_path is not None: 84 | wordsEmbeddings,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 85 | else: 86 | exit(0) 87 | 88 | subgraphs_map=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphs_file) 89 | 90 | line_count=0 91 | test_map={} 92 | print 'Compute MAP and nDCG for file ',test_data_file 93 | with open(test_data_file) as f: 94 | for l in f: 95 | arr=l.strip().split() 96 | query=int(arr[0]) 97 | map={} 98 | for i in range(1,len(arr)): 99 | candidate=int(arr[i]) 100 | sequences_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data,nodesLens_data=dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengths(query, candidate, subgraphs_map, dimension) 101 | if sequences_data is None and mask_data is None and lens_data is None: 102 | map[candidate]=-1000. 103 | else: 104 | value=func(sequences_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings, buffer_tensor_data, nodesLens_data) 105 | map[candidate]=value 106 | 107 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) 108 | test_map[line_count]=tops_in_line 109 | line_count+=1 110 | 111 | line_count=0 112 | ideal_map={} 113 | with open(ideal_data_file) as f: 114 | for l in f: 115 | arr=l.strip().split() 116 | arr=[int(x) for x in arr] 117 | ideal_map[line_count]=arr[1:] 118 | line_count+=1 119 | 120 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) 121 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) 122 | 123 | return MAP,MnDCG 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphProcessAndAssess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2017年2月3日 4 | @author: Liu Zemin 5 | Functions and Application : 6 | 7 | ''' 8 | 9 | import numpy 10 | import theano 11 | from collections import OrderedDict 12 | import proxEmbedBySubgraphProcessModelBatch 13 | import dataProcessTools 14 | import toolsFunction 15 | import evaluateTools 16 | 17 | 18 | 19 | def load_params(path, params): 20 | """ 21 | load parameters from file 22 | """ 23 | pp = numpy.load(path) 24 | for kk, vv in params.items(): 25 | if kk not in pp: 26 | raise Warning('%s is not in the archive' % kk) 27 | params[kk] = pp[kk] 28 | 29 | return params 30 | 31 | 32 | def get_proxEmbedBySubgraphModel( 33 | 34 | model_params_path='', 35 | word_dimension=0, 36 | dimension=0, 37 | discount_alpha=0.3, 38 | discount_beta=0.3, 39 | h_output_method='max-pooling', 40 | ): 41 | """ 42 | the processing model 43 | """ 44 | model_options = locals().copy() 45 | 46 | tparams = OrderedDict() 47 | tparams['Wi']=None 48 | tparams['Wf']=None 49 | tparams['Wo']=None 50 | tparams['Wc']=None 51 | tparams['Ui']=None 52 | tparams['Uf']=None 53 | tparams['Uo']=None 54 | tparams['Uc']=None 55 | tparams['bi']=None 56 | tparams['bf']=None 57 | tparams['bo']=None 58 | tparams['bc']=None 59 | tparams['w']=None 60 | tparams=load_params(model_params_path, tparams) 61 | 62 | sequences, masks, 
lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, score=proxEmbedBySubgraphProcessModelBatch.proxEmbedBySubgraphProcessModel(model_options, tparams) 63 | func=theano.function([sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], score, on_unused_input='ignore') 64 | return func 65 | 66 | 67 | def compute_proxEmbedBySubgraph( 68 | wordsEmbeddings=None, 69 | wordsEmbeddings_path=None, 70 | word_dimension=0, 71 | dimension=0, 72 | wordsSize=0, 73 | subpaths_map=None, 74 | subpaths_file=None, 75 | subgraphs_file='', 76 | maxlen_subpaths=1000, 77 | maxlen=100, # Sequence longer then this get ignored 78 | 79 | test_data_file='', 80 | top_num=10, 81 | ideal_data_file='', 82 | func=None, 83 | ): 84 | model_options = locals().copy() 85 | 86 | if wordsEmbeddings is None: 87 | if wordsEmbeddings_path is not None: 88 | wordsEmbeddings,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 89 | else: 90 | exit(0) 91 | 92 | subgraphs_map=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphs_file) 93 | 94 | line_count=0 95 | test_map={} 96 | print 'Compute MAP and nDCG for file ',test_data_file 97 | with open(test_data_file) as f: 98 | for l in f: 99 | arr=l.strip().split() 100 | query=int(arr[0]) 101 | map={} 102 | for i in range(1,len(arr)): 103 | candidate=int(arr[i]) 104 | sequences_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data,nodesLens_data=dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(query, candidate, subgraphs_map, dimension) 105 | if sequences_data is None and mask_data is None and lens_data is None: 106 | map[candidate]=-1000. 107 | else: 108 | value=func(sequences_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings, buffer_tensor_data, nodesLens_data) 109 | map[candidate]=value 110 | 111 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) 112 | test_map[line_count]=tops_in_line 113 | line_count+=1 114 | 115 | line_count=0 116 | ideal_map={} 117 | with open(ideal_data_file) as f: 118 | for l in f: 119 | arr=l.strip().split() 120 | arr=[int(x) for x in arr] 121 | ideal_map[line_count]=arr[1:] 122 | line_count+=1 123 | 124 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) 125 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) 126 | 127 | return MAP,MnDCG 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/prepareSubgraphsWithAllSubpaths.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | DAG generation. 
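In outline: all (q, v) tuples are read from the train/test split files, the sampled subpaths are loaded,
and for every tuple the subpaths are repeatedly shuffled and merged edge by edge into a DAG (duplicate
edges are skipped); each DAG is written to subgraphSaveFile as one line of the form
q-v#edge list#ordered node sequence#node-level list.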
6 | ''' 7 | 8 | import numpy 9 | import random 10 | import dataProcessTools 11 | import ConfigParser 12 | import string, os, sys 13 | import time 14 | import math 15 | 16 | SEED = 123 17 | random.seed(SEED) 18 | 19 | cf = ConfigParser.SafeConfigParser() 20 | cf.read("pythonParamsConfig") 21 | 22 | rootdir=cf.get("param", "root_dir") 23 | datasetName=cf.get("param", "dataset_name") 24 | relationName=cf.get("param", "class_name") 25 | # sampleTimes=cf.getint("param", "sampleTimes") 26 | subgraphNum=cf.getint("param", "subgraphNum") 27 | DAGSaveFile=cf.get("param", "subgraphSaveFile") 28 | subpaths_file=cf.get("param", "subpaths_file") 29 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") 30 | proportion=cf.getfloat("param", "proportion") 31 | upperLimit=cf.getint("param", "upperLimit") 32 | 33 | 34 | def getAlltuplesForSingleDirection(rootdir, datasetName, relationName): 35 | """ 36 | get all tuples for asymmetric 37 | """ 38 | folder=rootdir+'/'+datasetName+'.splits/' 39 | tuples=set() 40 | folder_train10=folder+'train.10/' 41 | for i in range(1,11): 42 | path=folder_train10+'train_'+relationName+'_'+bytes(i) 43 | with open(path) as f: 44 | for l in f: 45 | tmp=l.strip().split() 46 | if len(tmp)<=0: 47 | continue 48 | tuples.add(tmp[0]+'-'+tmp[1]) 49 | tuples.add(tmp[0]+'-'+tmp[2]) 50 | f.close() 51 | f=None 52 | # training data 100 53 | folder_train100=folder+'train.100/' 54 | for i in range(1,11): 55 | path=folder_train100+'train_'+relationName+'_'+bytes(i) 56 | with open(path) as f: 57 | for l in f: 58 | tmp=l.strip().split() 59 | if len(tmp)<=0: 60 | continue 61 | tuples.add(tmp[0]+'-'+tmp[1]) 62 | tuples.add(tmp[0]+'-'+tmp[2]) 63 | f.close() 64 | f=None 65 | # training data 1000 66 | folder_train1000=folder+'train.1000/' 67 | for i in range(1,11): 68 | path=folder_train1000+'train_'+relationName+'_'+bytes(i) 69 | with open(path) as f: 70 | for l in f: 71 | tmp=l.strip().split() 72 | if len(tmp)<=0: 73 | continue 74 | tuples.add(tmp[0]+'-'+tmp[1]) 75 | tuples.add(tmp[0]+'-'+tmp[2]) 76 | f.close() 77 | f=None 78 | # test data 79 | folder_test=folder+'test/' 80 | for i in range(1,11): 81 | path=folder_test+'test_'+relationName+'_'+bytes(i) 82 | with open(path) as f: 83 | for l in f: 84 | tmp=l.strip().split() 85 | if len(tmp)<=0: 86 | continue 87 | for j in range(1,len(tmp)): 88 | tuples.add(tmp[0]+'-'+tmp[j]) 89 | f.close() 90 | f=None 91 | return tuples 92 | 93 | 94 | def generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit): 95 | """ 96 | generate DAGs by subpaths 97 | """ 98 | output = open(DAGSaveFile, 'w') 99 | for tuple in tuples: 100 | arr=tuple.strip().split('-') 101 | start=int(arr[0]) 102 | end=int(arr[1]) 103 | if tuple not in subpathsMap: 104 | continue 105 | subpaths=subpathsMap[tuple] 106 | indexes=range(len(subpaths)) 107 | number=0 108 | if subgraphNum>0: 109 | number=subgraphNum 110 | else: 111 | number=int(math.ceil(len(subpaths)*proportion)) 112 | if upperLimit>0: 113 | number=min(number, upperLimit) 114 | for i in range(number): 115 | map={} 116 | mapCheck={} 117 | random.shuffle(indexes) 118 | for j in indexes: 119 | subpath=subpaths[j] 120 | for x in range(len(subpath)-1): 121 | if subpath[x] in map: 122 | if subpath[x+1] not in mapCheck[subpath[x]]: 123 | map[subpath[x]].append(subpath[x+1]) 124 | mapCheck[subpath[x]].add(subpath[x+1]) 125 | else: 126 | map[subpath[x]]=[subpath[x+1]] 127 | mapCheck[subpath[x]]=set([subpath[x+1]]) 128 | dependency, sequence, 
nodesLevel=dataProcessTools.subgraphToOrderedSequence(map, start, end) 129 | str=bytes(start)+'-'+bytes(end)+'#' 130 | for depend in dependency: 131 | str+=bytes(depend[0])+'-'+bytes(depend[1])+'\t' 132 | str+='#' 133 | for id in sequence: 134 | str+=bytes(id)+'\t' 135 | str+='#' 136 | for id in sequence: 137 | str+=bytes(id)+'-'+bytes(nodesLevel[id])+'\t' 138 | str+='\n' 139 | output.write(str) 140 | output.flush() 141 | output.close() 142 | output=None 143 | 144 | if __name__=='__main__': 145 | print 'Read all tuples from files..........' 146 | start_time = time.time() 147 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 148 | tuples=getAlltuplesForSingleDirection(rootdir, datasetName, relationName) # asymmetric 149 | 150 | print '-------------------------------------------------------------------------------' 151 | print 'Read all subpaths from files..........' 152 | start_time = time.time() 153 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 154 | subpathsMap=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 155 | 156 | print '-------------------------------------------------------------------------------' 157 | print 'Generate subgraphs and save them to file..........' 158 | start_time = time.time() 159 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 160 | 161 | generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit) 162 | 163 | print '-------------------------------------------------------------------------------' 164 | print 'Finished!!!' 165 | start_time = time.time() 166 | print 'End time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 167 | 168 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | 10 | import ConfigParser 11 | import string, os, sys 12 | import proxEmbedBySubgraphs 13 | import proxEmbedBySubgraphProcessAndAssess 14 | import time 15 | import subprocess 16 | 17 | if __name__=='__main__': 18 | 19 | cf = ConfigParser.SafeConfigParser() 20 | cf.read("pythonParamsConfig") 21 | 22 | main_dir=cf.get("param", "root_dir") # main work dir 23 | dataset_name=cf.get("param", "dataset_name") # dataset name, such as facebook 24 | suffix=cf.get("param", "suffix") # number of labels for each dataset, such as 10,100,1000 25 | class_name=cf.get("param", "class_name") # relatin name, such as classmate,family 26 | index=cf.get("param", "index") # the index of the dataset file 27 | 28 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 
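# For example, with the defaults in pythonParamsConfig (root_dir=D:/dataset/dataset, dataset_name=facebook,
# suffix=100, class_name=classmate, index=1), trainingDataFile resolves to roughly
# D:/dataset/dataset/facebook.splits/train.100/train_classmate_1 (the exact separators depend on the OS).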
29 | 30 | wordsEmbeddings=None # the file path of words embeddings 31 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # dimension of words embeddings 32 | subpaths_map=None # the map to save all the subpaths 33 | subpaths_file=cf.get("param", "subpaths_file") # the file to save all the subpaths 34 | subgraphSaveFile=cf.get("param", "subgraphSaveFile") # the file to save all the DAGs 35 | 36 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 37 | wordsSize=cf.getint("param", "wordsSize") # the max size of words vocabulary 38 | maxlen=cf.getint("param", "maxlen") # Sequence longer then this get ignored 39 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 40 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 41 | 42 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 43 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 44 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 45 | 46 | lrate=cf.getfloat("param", "lrate") # learning rate 47 | word_dimension=cf.getint("param", "word_dimension") # dimension of words embeddings 48 | dimension=cf.getint("param", "dimension") # the dimension of paths embeddings 49 | discount_alpha=cf.getfloat("param", "discount_alpha") # parameter alpha 50 | discount_beta=cf.getfloat("param", "discount_beta") # parameter beta 51 | h_output_method=cf.get("param", "h_output_method") # the way of output for each DAG, we use the hidden state of the end node in a DAG as its output 52 | objective_function_method=cf.get("param", "objective_function_method") # the objective function, here we use sigmoid 53 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter mu for sigmoid 54 | max_epochs=cf.getint("param", "max_epochs") # the max epoches for training 55 | 56 | decay=cf.getfloat("param", "decay") # the decay parameter lambda 57 | 58 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # the file of test data 59 | top_num=cf.getint("param", "top_num") # the top num to predict 60 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # the file of ground truth 61 | 62 | # training 63 | proxEmbedBySubgraphs.proxEmbedBySubgraphs( 64 | trainingDataFile, 65 | wordsEmbeddings, 66 | wordsEmbeddings_path, 67 | subpaths_map, 68 | subpaths_file, 69 | subgraphSaveFile, 70 | maxlen_subpaths, 71 | wordsSize, 72 | maxlen, 73 | batch_size, 74 | is_shuffle_for_batch, 75 | dispFreq, 76 | saveFreq, 77 | saveto, 78 | lrate, 79 | word_dimension, 80 | dimension, 81 | discount_alpha, 82 | discount_beta, 83 | h_output_method, 84 | objective_function_method, 85 | objective_function_param, 86 | max_epochs, 87 | decay) 88 | 89 | time.sleep(5) 90 | 91 | print '------------------------------------------------------------------------------' 92 | print 'Start to generate process model..........' 
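# What follows: get_proxEmbedBySubgraphModel reloads the trained parameters from the saved .npz file (saveto)
# and compiles a Theano scoring function; compute_proxEmbedBySubgraph then scores every (query, candidate)
# pair in test_data_file via the discounted max over its DAG embeddings, ranks the candidates, and compares
# the top_num results against ideal_data_file to produce MAP and nDCG.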
93 | start_time = time.time() 94 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 95 | func=proxEmbedBySubgraphProcessAndAssess.get_proxEmbedBySubgraphModel( 96 | saveto, 97 | word_dimension, 98 | dimension, 99 | discount_alpha, 100 | discount_beta, 101 | h_output_method) 102 | 103 | print 'Start to process and evaluate the model..........' 104 | start_time = time.time() 105 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 106 | MAP, NDCG=proxEmbedBySubgraphProcessAndAssess.compute_proxEmbedBySubgraph( 107 | wordsEmbeddings, 108 | wordsEmbeddings_path, 109 | word_dimension, 110 | dimension, 111 | wordsSize, 112 | subpaths_map, 113 | subpaths_file, 114 | subgraphSaveFile, 115 | maxlen_subpaths, 116 | maxlen, 117 | test_data_file, 118 | top_num, 119 | ideal_data_file, 120 | func) 121 | print '------------------------------------------------------------------------------' 122 | start_time = time.time() 123 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 124 | print 'MAP =', MAP 125 | print 'NDCG =', NDCG -------------------------------------------------------------------------------- /D2AGE/symmetric/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | ''' 6 | 7 | 8 | import ConfigParser 9 | import string, os, sys 10 | import proxEmbedBySubgraphs 11 | import proxEmbedBySubgraphProcessAndAssess 12 | import time 13 | 14 | if __name__=='__main__': 15 | 16 | cf = ConfigParser.SafeConfigParser() 17 | cf.read("pythonParamsConfig") 18 | 19 | main_dir=cf.get("param", "root_dir") # main work dir 20 | dataset_name=cf.get("param", "dataset_name") # dataset name, such as facebook 21 | suffix=cf.get("param", "suffix") # number of labels for each dataset, such as 10,100,1000 22 | class_name=cf.get("param", "class_name") # relatin name, such as classmate,family 23 | index=cf.get("param", "index") # the index of the dataset file 24 | 25 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 26 | 27 | wordsEmbeddings=None # the file path of words embeddings 28 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # dimension of words embeddings 29 | subpaths_map=None # the map to save all the subpaths 30 | subpaths_file=cf.get("param", "subpaths_file") # the file to save all the subpaths 31 | subgraphSaveFile=cf.get("param", "subgraphSaveFile") # the file to save all the DAGs 32 | 33 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 34 | wordsSize=cf.getint("param", "wordsSize") # the max size of words vocabulary 35 | maxlen=cf.getint("param", "maxlen") # Sequence longer than this get ignored 36 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 37 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 38 | 39 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 40 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 41 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. 
It is generated by main_dir, dataset_name, suffix, class_name and index. 42 | 43 | lrate=cf.getfloat("param", "lrate") # learning rate 44 | word_dimension=cf.getint("param", "word_dimension") # dimension of words embeddings 45 | dimension=cf.getint("param", "dimension") # the dimension of paths embeddings 46 | discount_alpha=cf.getfloat("param", "discount_alpha") # parameter alpha 47 | discount_beta=cf.getfloat("param", "discount_beta") # parameter beta 48 | h_output_method=cf.get("param", "h_output_method") # the way of output for each DAG, we use the hidden state of the end node in a DAG as its output 49 | objective_function_method=cf.get("param", "objective_function_method") # the objective function, here we use sigmoid 50 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter mu for sigmoid 51 | max_epochs=cf.getint("param", "max_epochs") # the max epoches for training 52 | 53 | decay=cf.getfloat("param", "decay") # the decay parameter lambda 54 | 55 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # the file of test data 56 | top_num=cf.getint("param", "top_num") # the top num to predict 57 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # the file of ground truth 58 | 59 | # training 60 | proxEmbedBySubgraphs.proxEmbedBySubgraphs( 61 | trainingDataFile, 62 | wordsEmbeddings, 63 | wordsEmbeddings_path, 64 | subpaths_map, 65 | subpaths_file, 66 | subgraphSaveFile, 67 | maxlen_subpaths, 68 | wordsSize, 69 | maxlen, 70 | batch_size, 71 | is_shuffle_for_batch, 72 | dispFreq, 73 | saveFreq, 74 | saveto, 75 | lrate, 76 | word_dimension, 77 | dimension, 78 | discount_alpha, 79 | discount_beta, 80 | h_output_method, 81 | objective_function_method, 82 | objective_function_param, 83 | max_epochs, 84 | decay) 85 | 86 | time.sleep(5) # sleep 87 | 88 | 89 | print '------------------------------------------------------------------------------' 90 | print 'Start to generate process model..........' 91 | start_time = time.time() 92 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 93 | # get the model for process 94 | func=proxEmbedBySubgraphProcessAndAssess.get_proxEmbedBySubgraphModel( 95 | saveto, 96 | word_dimension, 97 | dimension, 98 | discount_alpha, 99 | discount_beta, 100 | h_output_method) 101 | 102 | print 'Start to process and evaluate the model..........' 
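# The metrics come from evaluateTools: AP@top_num accumulates precision at every hit and divides by
# min(len(ideal), top_num), and MAP averages this over queries; nDCG@top_num scores a hit at rank i with
# 1/log2(i+1) (1.0 at rank 0), normalizes by the ideal DCG, and is likewise averaged over queries.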
103 | start_time = time.time() 104 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 105 | # calculate the results 106 | MAP, NDCG=proxEmbedBySubgraphProcessAndAssess.compute_proxEmbedBySubgraph( 107 | wordsEmbeddings, 108 | wordsEmbeddings_path, 109 | word_dimension, 110 | dimension, 111 | wordsSize, 112 | subpaths_map, 113 | subpaths_file, 114 | subgraphSaveFile, 115 | maxlen_subpaths, 116 | maxlen, 117 | test_data_file, 118 | top_num, 119 | ideal_data_file, 120 | func) 121 | print '------------------------------------------------------------------------------' 122 | start_time = time.time() 123 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 124 | print 'MAP =', MAP 125 | print 'NDCG =', NDCG -------------------------------------------------------------------------------- /D2AGE/symmetric/prepareSubgraphsWithAllSubpaths.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | generate DAGs with subpaths 6 | ''' 7 | 8 | import numpy 9 | import random 10 | import dataProcessTools 11 | import ConfigParser 12 | import string, os, sys 13 | import time 14 | import math 15 | 16 | SEED = 123 17 | random.seed(SEED) 18 | 19 | cf = ConfigParser.SafeConfigParser() 20 | cf.read("/usr/pythonParamsConfig") 21 | 22 | rootdir=cf.get("param", "root_dir") 23 | datasetName=cf.get("param", "dataset_name") 24 | relationName=cf.get("param", "class_name") 25 | subgraphNum=cf.getint("param", "subgraphNum") 26 | DAGSaveFile=cf.get("param", "subgraphSaveFile") 27 | subpaths_file=cf.get("param", "subpaths_file") 28 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") 29 | proportion=cf.getfloat("param", "proportion") 30 | upperLimit=cf.getint("param", "upperLimit") 31 | 32 | def getAlltuples(rootdir, datasetName, relationName): 33 | """ 34 | get all tuples from training data 35 | """ 36 | folder=rootdir+'/'+datasetName+'.splits/' 37 | tuples=set() 38 | folder_train10=folder+'train.10/' 39 | for i in range(1,11): 40 | path=folder_train10+'train_'+relationName+'_'+bytes(i) 41 | with open(path) as f: 42 | for l in f: 43 | tmp=l.strip().split() 44 | if len(tmp)<=0: 45 | continue 46 | tuples.add(tmp[0]+'-'+tmp[1]) 47 | tuples.add(tmp[1]+'-'+tmp[0]) 48 | tuples.add(tmp[0]+'-'+tmp[2]) 49 | tuples.add(tmp[2]+'-'+tmp[0]) 50 | f.close() 51 | f=None 52 | # training data 100 53 | folder_train100=folder+'train.100/' 54 | for i in range(1,11): 55 | path=folder_train100+'train_'+relationName+'_'+bytes(i) 56 | with open(path) as f: 57 | for l in f: 58 | tmp=l.strip().split() 59 | if len(tmp)<=0: 60 | continue 61 | tuples.add(tmp[0]+'-'+tmp[1]) 62 | tuples.add(tmp[1]+'-'+tmp[0]) 63 | tuples.add(tmp[0]+'-'+tmp[2]) 64 | tuples.add(tmp[2]+'-'+tmp[0]) 65 | f.close() 66 | f=None 67 | # training data 1000 68 | folder_train1000=folder+'train.1000/' 69 | for i in range(1,11): 70 | path=folder_train1000+'train_'+relationName+'_'+bytes(i) 71 | with open(path) as f: 72 | for l in f: 73 | tmp=l.strip().split() 74 | if len(tmp)<=0: 75 | continue 76 | tuples.add(tmp[0]+'-'+tmp[1]) 77 | tuples.add(tmp[1]+'-'+tmp[0]) 78 | tuples.add(tmp[0]+'-'+tmp[2]) 79 | tuples.add(tmp[2]+'-'+tmp[0]) 80 | f.close() 81 | f=None 82 | # test data 83 | folder_test=folder+'test/' 84 | for i in range(1,11): 85 | path=folder_test+'test_'+relationName+'_'+bytes(i) 86 | with open(path) as f: 87 | for l in f: 88 | tmp=l.strip().split() 89 | if len(tmp)<=0: 90 | continue 91 | for j in 
range(1,len(tmp)): 92 | tuples.add(tmp[0]+'-'+tmp[j]) 93 | tuples.add(tmp[j]+'-'+tmp[0]) 94 | f.close() 95 | f=None 96 | return tuples 97 | 98 | def getAlltuplesForSingleDirection(rootdir, datasetName, relationName): 99 | """ 100 | get all tuples for asymmetric relation 101 | """ 102 | folder=rootdir+'/'+datasetName+'.splits/' 103 | tuples=set() 104 | folder_train10=folder+'train.10/' 105 | for i in range(1,11): 106 | path=folder_train10+'train_'+relationName+'_'+bytes(i) 107 | with open(path) as f: 108 | for l in f: 109 | tmp=l.strip().split() 110 | if len(tmp)<=0: 111 | continue 112 | tuples.add(tmp[0]+'-'+tmp[1]) 113 | tuples.add(tmp[0]+'-'+tmp[2]) 114 | f.close() 115 | f=None 116 | # training data 100 117 | folder_train100=folder+'train.100/' 118 | for i in range(1,11): 119 | path=folder_train100+'train_'+relationName+'_'+bytes(i) 120 | with open(path) as f: 121 | for l in f: 122 | tmp=l.strip().split() 123 | if len(tmp)<=0: 124 | continue 125 | tuples.add(tmp[0]+'-'+tmp[1]) 126 | tuples.add(tmp[0]+'-'+tmp[2]) 127 | f.close() 128 | f=None 129 | # training data 1000 130 | folder_train1000=folder+'train.1000/' 131 | for i in range(1,11): 132 | path=folder_train1000+'train_'+relationName+'_'+bytes(i) 133 | with open(path) as f: 134 | for l in f: 135 | tmp=l.strip().split() 136 | if len(tmp)<=0: 137 | continue 138 | tuples.add(tmp[0]+'-'+tmp[1]) 139 | # tuples.add(tmp[1]+'-'+tmp[0]) 140 | tuples.add(tmp[0]+'-'+tmp[2]) 141 | # tuples.add(tmp[2]+'-'+tmp[0]) 142 | f.close() 143 | f=None 144 | # test data 145 | folder_test=folder+'test/' 146 | for i in range(1,11): 147 | path=folder_test+'test_'+relationName+'_'+bytes(i) 148 | with open(path) as f: 149 | for l in f: 150 | tmp=l.strip().split() 151 | if len(tmp)<=0: 152 | continue 153 | for j in range(1,len(tmp)): 154 | tuples.add(tmp[0]+'-'+tmp[j]) 155 | # tuples.add(tmp[j]+'-'+tmp[0]) 156 | f.close() 157 | f=None 158 | return tuples 159 | 160 | 161 | def generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit): 162 | """ 163 | generate DAGs by all subpaths, then save to file 164 | """ 165 | output = open(DAGSaveFile, 'w') 166 | for tuple in tuples: 167 | arr=tuple.strip().split('-') 168 | start=int(arr[0]) 169 | end=int(arr[1]) 170 | if tuple not in subpathsMap: 171 | continue 172 | subpaths=subpathsMap[tuple] 173 | indexes=range(len(subpaths)) 174 | number=0 175 | if subgraphNum>0: 176 | number=subgraphNum 177 | else: 178 | number=int(math.ceil(len(subpaths)*proportion)) 179 | if upperLimit>0: 180 | number=min(number, upperLimit) 181 | for i in range(number): 182 | map={} 183 | mapCheck={} 184 | random.shuffle(indexes) 185 | for j in indexes: 186 | subpath=subpaths[j] 187 | for x in range(len(subpath)-1): 188 | if subpath[x] in map: 189 | if subpath[x+1] not in mapCheck[subpath[x]]: 190 | map[subpath[x]].append(subpath[x+1]) 191 | mapCheck[subpath[x]].add(subpath[x+1]) 192 | else: 193 | map[subpath[x]]=[subpath[x+1]] 194 | mapCheck[subpath[x]]=set([subpath[x+1]]) 195 | dependency, sequence, nodesLevel=dataProcessTools.subgraphToOrderedSequence(map, start, end) 196 | str=bytes(start)+'-'+bytes(end)+'#' 197 | for depend in dependency: 198 | str+=bytes(depend[0])+'-'+bytes(depend[1])+'\t' 199 | str+='#' 200 | for id in sequence: 201 | str+=bytes(id)+'\t' 202 | str+='#' 203 | for id in sequence: 204 | str+=bytes(id)+'-'+bytes(nodesLevel[id])+'\t' 205 | str+='\n' 206 | output.write(str) 207 | output.flush() 208 | output.close() 209 | output=None 210 | 211 | 212 | if __name__=='__main__': 
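# Main entry point. The script runs in three stages: (1) getAlltuples collects every (q,v) pair
# that appears in the train/test splits, (2) dataProcessTools.loadAllSubPaths reads all sampled
# subpaths, keyed by node pair, and (3) generateSubgraphsByAllSubpathsDirectlyAndSave merges each
# pair's subpaths into DAGs and writes them to DAGSaveFile, one DAG per line in the form
#   <q>-<v>#<parent>-<child>\t...#<node>\t...#<node>-<level>\t...
# i.e. the node pair, the DAG edges, the ordered node sequence, and each node's distance level.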
213 | print 'Read all tuples from files..........' 214 | start_time = time.time() 215 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 216 | tuples=getAlltuples(rootdir, datasetName, relationName) # symmetric 217 | # tuples=getAlltuplesForSingleDirection(rootdir, datasetName, relationName) # asymmetric 218 | 219 | print '-------------------------------------------------------------------------------' 220 | print 'Read all subpaths from files..........' 221 | start_time = time.time() 222 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 223 | subpathsMap=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 224 | 225 | print '-------------------------------------------------------------------------------' 226 | print 'Generate subgraphs and save them to file..........' 227 | start_time = time.time() 228 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 229 | 230 | generateSubgraphsByAllSubpathsDirectlyAndSave(tuples, subpathsMap, subgraphNum, proportion, DAGSaveFile, upperLimit) 231 | 232 | print '-------------------------------------------------------------------------------' 233 | print 'Finished!!!' 234 | start_time = time.time() 235 | print 'End time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 236 | 237 | -------------------------------------------------------------------------------- /D2AGE/symmetric/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | data processing tools 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | get training data from file 16 | ''' 17 | data=[] 18 | pairs=[] 19 | with open(trainingDataFile) as f: 20 | for l in f: 21 | tmp=l.strip().split() 22 | if len(tmp)<=0: 23 | continue 24 | arr=[] 25 | arr.append(tmp[0]+'-'+tmp[1]) 26 | arr.append(tmp[1]+'-'+tmp[0]) 27 | arr.append(tmp[0]+'-'+tmp[2]) 28 | arr.append(tmp[2]+'-'+tmp[0]) 29 | pairs.append(arr) 30 | tmp=[int(x) for x in tmp] 31 | data.append(tmp) 32 | 33 | return data,pairs 34 | 35 | def getWordsEmbeddings(wordsEmbeddings_path): 36 | """ 37 | get word embeddings 38 | """ 39 | size=0 40 | dimension=0 41 | wemb=[] 42 | with open(wordsEmbeddings_path) as f: 43 | for l in f: 44 | arr=l.strip().split() 45 | if len(arr)==2: 46 | size=int(arr[0]) 47 | dimension=int(arr[1]) 48 | wemb=numpy.zeros((size,dimension)).astype(theano.config.floatX) # @UndefinedVariable 49 | continue 50 | id=int(arr[0]) 51 | for i in range(0,dimension): 52 | # wemb[id][i]=theano.config.floatX(arr[i+1]) # @UndefinedVariable 53 | wemb[id][i]=float(arr[i+1]) 54 | return wemb,dimension,size 55 | 56 | def loadAllSubPaths(subpaths_file,maxlen=1000): 57 | """ 58 | load all subpaths 59 | """ 60 | map={} 61 | with open(subpaths_file) as f: 62 | for l in f: 63 | splitByTab=l.strip().split('\t') 64 | key=splitByTab[0]+'-'+splitByTab[1] 65 | sentence=[int(y) for y in splitByTab[2].split()[:]] 66 | if len(sentence)>maxlen: 67 | continue 68 | if key in map: 69 | map[key].append(sentence) 70 | else: 71 | tmp=[] 72 | tmp.append(sentence) 73 | map[key]=tmp 74 | return map 75 | 76 | def prepareDataForTestForSubgraphSingleSequenceWithLengths(query,candidate,subgraphs_map,dimension): 77 | """ 78 | prepare data for test 79 | """ 80 | key1=bytes(query)+'-'+bytes(candidate) 81 | 
key2=bytes(candidate)+'-'+bytes(query) 82 | if key1 not in subgraphs_map and key2 not in subgraphs_map: 83 | return None,None,None 84 | subgraphs=[] 85 | if key1 in subgraphs_map: 86 | subgraphs.append(subgraphs_map[key1]) 87 | if key2 in subgraphs_map: 88 | subgraphs.append(subgraphs_map[key2]) 89 | maxlen=0 90 | nsamples=0 91 | for value in subgraphs: 92 | for sequence in value[1]: 93 | nsamples+=1 94 | if maxlen0: 182 | now=queue.pop(0) 183 | children=edges[now] 184 | for node in children: 185 | if node==end: 186 | results.append([now,node]) 187 | if endNodeLevel==-1: 188 | endNodeLevel=nodesLevel[now]+1 189 | elif nodesLevel[node]==-1: 190 | queue.append(node) 191 | nodesLevel[node]=nodesLevel[now]+1 192 | nodesSeq[node]=len(nodesSeq) 193 | results.append([now,node]) 194 | elif nodesSeq[node]>nodesSeq[now]: 195 | results.append([now,node]) 196 | nodesSeq[end]=len(nodesSeq) 197 | items=nodesSeq.items() 198 | backitems=[[v[1],v[0]] for v in items] 199 | backitems.sort() 200 | sequence=[ backitems[i][1] for i in range(len(items))] 201 | nodesLevel[end]=endNodeLevel 202 | return results, sequence, nodesLevel 203 | 204 | 205 | def readAllSubgraphDependencyAndSequencesWithLengths(filepath): 206 | """ 207 | read all DAGs 208 | """ 209 | map={} 210 | with open(filepath) as f: 211 | for l in f: 212 | tmp=l.strip().split('#') 213 | if len(tmp)<=0: 214 | continue 215 | depend=tmp[1].strip().split('\t') 216 | dependint=[] 217 | for edge in depend: 218 | arr=edge.strip().split('-') 219 | dependint.append([int(arr[0]),int(arr[1])]) 220 | sequence=tmp[2].strip().split('\t') 221 | sequenceint=[int(x) for x in sequence] 222 | lenArr=tmp[3].strip().split('\t') 223 | lengths={} 224 | for l in lenArr: 225 | lArr=l.strip().split('-') 226 | lengths[int(lArr[0])]=int(lArr[1]) 227 | if tmp[0] in map: 228 | value=map[tmp[0]] 229 | value[0].append(dependint) 230 | value[1].append(sequenceint) 231 | value[2].append(lengths) 232 | else: 233 | map[tmp[0]]=[[dependint],[sequenceint],[lengths]] 234 | return map 235 | 236 | def generateSequenceAndMasksForSingleSequenceWithLength(tuples, tupleFourPairs, subgraphs, dimension): 237 | """ 238 | generate sequence and masks 239 | """ 240 | maxlen=0 241 | graphNum=0 242 | for tuple in tupleFourPairs: 243 | for pair in tuple: 244 | if pair not in subgraphs: 245 | continue 246 | value=subgraphs[pair] 247 | sequences=value[1] 248 | graphNum+=len(sequences) 249 | for seq in sequences: 250 | if len(seq)>maxlen: 251 | maxlen=len(seq) 252 | tuples3DMatrix=numpy.zeros((len(tuples),4,2)).astype('int64') 253 | x=numpy.zeros((graphNum,maxlen)).astype('int64') 254 | mask=numpy.zeros((graphNum,maxlen,maxlen)).astype(theano.config.floatX) # @UndefinedVariable 255 | lens=numpy.zeros((graphNum,)).astype('int64') 256 | subgraph_lens=numpy.zeros((graphNum,)).astype('int64') 257 | nodesLens=numpy.zeros((graphNum,maxlen)).astype('int64') 258 | current_index=0 259 | for i in range(len(tuples)): 260 | tuple=tuples[i] 261 | fourPairs=tupleFourPairs[i] 262 | for j in range(len(fourPairs)): 263 | if fourPairs[j] not in subgraphs: 264 | tuples3DMatrix[i][j][0]=current_index 265 | tuples3DMatrix[i][j][1]=current_index 266 | continue 267 | value=subgraphs[fourPairs[j]] 268 | dependency=value[0] 269 | sequences=value[1] 270 | lengths=value[2] 271 | tuples3DMatrix[i][j][0]=current_index 272 | for index in range(len(sequences)): 273 | map={} 274 | seq=sequences[index] 275 | length=lengths[index] 276 | for s in range(len(seq)): 277 | x[current_index][s]=seq[s] 278 | 
nodesLens[current_index][s]=length[seq[s]] 279 | map[seq[s]]=s 280 | depend=dependency[index] 281 | for d in range(len(depend)): 282 | dep=depend[d] 283 | mask[current_index][map[dep[1]]][map[dep[0]]]=1. 284 | lens[current_index]=len(seq) 285 | subgraph_lens[current_index]=length[seq[-1]] 286 | current_index+=1 287 | tuples3DMatrix[i][j][1]=current_index 288 | 289 | for i in range(graphNum): 290 | for j in range(maxlen): 291 | if mask[i][j].sum()==0: 292 | mask[i][j][j]=1. 293 | buffer_tensor=numpy.zeros([maxlen, maxlen, dimension]).astype(theano.config.floatX) # @UndefinedVariable 294 | for i in range(maxlen): 295 | for j in range(dimension): 296 | buffer_tensor[i][i][j]=1. 297 | 298 | return tuples3DMatrix, x, mask, lens, subgraph_lens, buffer_tensor, nodesLens -------------------------------------------------------------------------------- /D2AGE/asymmetric/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | data process tools 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | get training data from file 16 | ''' 17 | data=[] 18 | pairs=[] 19 | with open(trainingDataFile) as f: 20 | for l in f: 21 | tmp=l.strip().split() 22 | if len(tmp)<=0: 23 | continue 24 | arr=[] 25 | arr.append(tmp[0]+'-'+tmp[1]) 26 | arr.append(tmp[1]+'-'+tmp[0]) 27 | arr.append(tmp[0]+'-'+tmp[2]) 28 | arr.append(tmp[2]+'-'+tmp[0]) 29 | pairs.append(arr) 30 | tmp=[int(x) for x in tmp] 31 | data.append(tmp) 32 | 33 | return data,pairs 34 | 35 | def getWordsEmbeddings(wordsEmbeddings_path): 36 | """ 37 | get word embeddings 38 | """ 39 | size=0 40 | dimension=0 41 | wemb=[] 42 | with open(wordsEmbeddings_path) as f: 43 | for l in f: 44 | arr=l.strip().split() 45 | if len(arr)==2: 46 | size=int(arr[0]) 47 | dimension=int(arr[1]) 48 | 49 | wemb=numpy.zeros((size,dimension)).astype(theano.config.floatX) # @UndefinedVariable 50 | continue 51 | id=int(arr[0]) 52 | for i in range(0,dimension): 53 | wemb[id][i]=float(arr[i+1]) 54 | return wemb,dimension,size 55 | 56 | def loadAllSubPaths(subpaths_file,maxlen=1000): 57 | """ 58 | load all subpaths from file 59 | """ 60 | map={} 61 | with open(subpaths_file) as f: 62 | for l in f: 63 | splitByTab=l.strip().split('\t') 64 | key=splitByTab[0]+'-'+splitByTab[1] 65 | sentence=[int(y) for y in splitByTab[2].split()[:]] 66 | if len(sentence)>maxlen: 67 | continue 68 | if key in map: 69 | map[key].append(sentence) 70 | else: 71 | tmp=[] 72 | tmp.append(sentence) 73 | map[key]=tmp 74 | return map 75 | 76 | 77 | def prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(query,candidate,subgraphs_map,dimension): 78 | """ 79 | prepare data for test 80 | """ 81 | key1=bytes(query)+'-'+bytes(candidate) 82 | if key1 not in subgraphs_map : 83 | return None,None,None,None,None,None 84 | subgraphs=[] 85 | if key1 in subgraphs_map: 86 | subgraphs.append(subgraphs_map[key1]) 87 | maxlen=0 88 | nsamples=0 89 | for value in subgraphs: 90 | for sequence in value[1]: 91 | nsamples+=1 92 | if maxlen0: 181 | now=queue.pop(0) 182 | children=edges[now] 183 | for node in children: 184 | if node==end: 185 | results.append([now,node]) 186 | if endNodeLevel==-1: 187 | endNodeLevel=nodesLevel[now]+1 188 | elif nodesLevel[node]==-1: 189 | queue.append(node) 190 | nodesLevel[node]=nodesLevel[now]+1 191 | nodesSeq[node]=len(nodesSeq) 192 | 
results.append([now,node]) 193 | elif nodesSeq[node]>nodesSeq[now]: 194 | results.append([now,node]) 195 | nodesSeq[end]=len(nodesSeq) 196 | items=nodesSeq.items() 197 | backitems=[[v[1],v[0]] for v in items] 198 | backitems.sort() 199 | sequence=[ backitems[i][1] for i in range(len(items))] 200 | nodesLevel[end]=endNodeLevel 201 | return results, sequence, nodesLevel 202 | 203 | 204 | def readAllSubgraphDependencyAndSequencesWithLengths(filepath): 205 | """ 206 | read all DAGs from file 207 | """ 208 | map={} 209 | with open(filepath) as f: 210 | for l in f: 211 | tmp=l.strip().split('#') 212 | if len(tmp)<=0: 213 | continue 214 | depend=tmp[1].strip().split('\t') 215 | dependint=[] 216 | for edge in depend: 217 | arr=edge.strip().split('-') 218 | dependint.append([int(arr[0]),int(arr[1])]) 219 | sequence=tmp[2].strip().split('\t') 220 | sequenceint=[int(x) for x in sequence] 221 | lenArr=tmp[3].strip().split('\t') 222 | lengths={} 223 | for l in lenArr: 224 | lArr=l.strip().split('-') 225 | lengths[int(lArr[0])]=int(lArr[1]) 226 | if tmp[0] in map: 227 | value=map[tmp[0]] 228 | value[0].append(dependint) 229 | value[1].append(sequenceint) 230 | value[2].append(lengths) 231 | else: 232 | map[tmp[0]]=[[dependint],[sequenceint],[lengths]] 233 | return map 234 | 235 | def generateSequenceAndMasksForSingleSequenceWithLengthAsymmetric(tuples, tupleFourPairs, subgraphs, dimension): 236 | """ 237 | generate data for training 238 | """ 239 | maxlen=0 240 | graphNum=0 241 | for tuple in tupleFourPairs: 242 | for pair in tuple: 243 | if pair not in subgraphs: 244 | continue 245 | value=subgraphs[pair] 246 | sequences=value[1] 247 | graphNum+=len(sequences) 248 | for seq in sequences: 249 | if len(seq)>maxlen: 250 | maxlen=len(seq) 251 | tuples3DMatrix=numpy.zeros((len(tuples),4,2)).astype('int64') 252 | x=numpy.zeros((graphNum,maxlen)).astype('int64') 253 | mask=numpy.zeros((graphNum,maxlen,maxlen)).astype(theano.config.floatX) # @UndefinedVariable 254 | lens=numpy.zeros((graphNum,)).astype('int64') # shape=graphNum*0 255 | subgraph_lens=numpy.zeros((graphNum,)).astype('int64') 256 | nodesLens=numpy.zeros((graphNum,maxlen)).astype('int64') 257 | current_index=0 258 | for i in range(len(tuples)): 259 | tuple=tuples[i] 260 | fourPairs=tupleFourPairs[i] 261 | for j in range(len(fourPairs)): 262 | if fourPairs[j] not in subgraphs: 263 | tuples3DMatrix[i][j][0]=current_index 264 | tuples3DMatrix[i][j][1]=current_index 265 | continue 266 | value=subgraphs[fourPairs[j]] 267 | dependency=value[0] 268 | sequences=value[1] 269 | lengths=value[2] 270 | tuples3DMatrix[i][j][0]=current_index 271 | for index in range(len(sequences)): 272 | map={} 273 | seq=sequences[index] 274 | length=lengths[index] 275 | for s in range(len(seq)): 276 | x[current_index][s]=seq[s] 277 | nodesLens[current_index][s]=length[seq[s]] 278 | map[seq[s]]=s 279 | depend=dependency[index] 280 | for d in range(len(depend)): 281 | dep=depend[d] 282 | mask[current_index][map[dep[1]]][map[dep[0]]]=1. 
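# For the DAG just encoded at row current_index: x holds the node ids in sequence order,
# nodesLens holds each node's distance level, and mask is a maxlen x maxlen matrix with
# mask[pos(child)][pos(parent)]=1 for every DAG edge (rows with no parent get a self-loop
# further below, so every node has at least one predecessor entry); these adjacency rows are
# what the directed-graph LSTM uses to aggregate predecessor states. The two lines below record
# the number of nodes and the level of the end node (its distance from the query), which is
# later used for the distance discount.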
283 | lens[current_index]=len(seq) 284 | subgraph_lens[current_index]=length[seq[-1]] 285 | current_index+=1 286 | tuples3DMatrix[i][j][1]=current_index 287 | 288 | count=0 289 | for i in range(len(tuples3DMatrix)): 290 | if tuples3DMatrix[i][0][0]!=tuples3DMatrix[i][0][1] and tuples3DMatrix[i][2][0]!=tuples3DMatrix[i][2][1]: 291 | count+=1 292 | tuples3DMatrix_new=numpy.zeros((count,4,2)).astype('int64') 293 | index=0 294 | for i in range(len(tuples3DMatrix)): 295 | if tuples3DMatrix[i][0][0]!=tuples3DMatrix[i][0][1] and tuples3DMatrix[i][2][0]!=tuples3DMatrix[i][2][1]: 296 | tuples3DMatrix_new[index]=tuples3DMatrix[i] 297 | index+=1 298 | tuples3DMatrix=tuples3DMatrix_new 299 | 300 | for i in range(graphNum): 301 | for j in range(maxlen): 302 | if mask[i][j].sum()==0: 303 | mask[i][j][j]=1. 304 | buffer_tensor=numpy.zeros([maxlen, maxlen, dimension]).astype(theano.config.floatX) # @UndefinedVariable # cast to floatX so it matches the declared dtype of the buffer_tensor input (as in the symmetric version) 305 | for i in range(maxlen): 306 | for j in range(dimension): 307 | buffer_tensor[i][i][j]=1. 308 | 309 | return tuples3DMatrix, x, mask, lens, subgraph_lens, buffer_tensor, nodesLens -------------------------------------------------------------------------------- /D2AGE/symmetric/proxEmbedBySubgraphs.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2017-02-01 4 | @author: Liu Zemin 5 | Functions and Application : 6 | 7 | ''' 8 | 9 | import numpy 10 | import theano 11 | from theano import tensor 12 | from theano import config 13 | from collections import OrderedDict 14 | import dataProcessTools 15 | import time 16 | import proxEmbedBySubgraphModel 17 | import gc 18 | import six.moves.cPickle as pickle # @UnresolvedImport 19 | 20 | SEED = 123 21 | numpy.random.seed(SEED) 22 | 23 | def numpy_floatX(data): 24 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 25 | 26 | def gradientDescentGroup(learning_rate,tparams,grads,trainingPairs, sequences, masks, lengths, wordsEmbeddings, cost): 27 | """ 28 | """ 29 | update=[(shared,shared-learning_rate*g) for g,shared in zip(grads,tparams.values())] 30 | func=theano.function([trainingPairs, sequences, masks, lengths, wordsEmbeddings],cost,updates=update,on_unused_input='ignore',mode='FAST_RUN') 31 | return func 32 | 33 | def adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost): 34 | """ 35 | An adaptive learning rate optimizer 36 | Parameters 37 | ---------- 38 | lr : Theano SharedVariable 39 | Initial learning rate 40 | tparams: Theano SharedVariable 41 | Model parameters 42 | grads: Theano variable 43 | Gradients of cost w.r.t. parameters 44 | x: Theano variable 45 | Model inputs 46 | mask: Theano variable 47 | Sequence mask 48 | y: Theano variable 49 | Targets 50 | cost: Theano variable 51 | Objective function to minimize 52 | 53 | Notes 54 | ----- 55 | For more information, see [ADADELTA]_. 56 | 57 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 58 | Rate Method*, arXiv:1212.5701.
59 | """ 60 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 61 | name='%s_grad' % k) 62 | for k, p in tparams.items()] 63 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 64 | name='%s_rup2' % k) 65 | for k, p in tparams.items()] 66 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 67 | name='%s_rgrad2' % k) 68 | for k, p in tparams.items()] 69 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 70 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 71 | for rg2, g in zip(running_grads2, grads)] 72 | f_grad_shared = theano.function([trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], cost, updates=zgup + rg2up, 73 | on_unused_input='ignore', 74 | name='adadelta_f_grad_shared') 75 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 76 | for zg, ru2, rg2 in zip(zipped_grads, 77 | running_up2, 78 | running_grads2)] 79 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 80 | for ru2, ud in zip(running_up2, updir)] 81 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 82 | f_update = theano.function([lr], [], updates=ru2up + param_up, 83 | on_unused_input='ignore', 84 | name='adadelta_f_update') 85 | 86 | return f_grad_shared, f_update 87 | 88 | 89 | def sgd(lr, tparams, grads, x, mask, y, cost): 90 | """ Stochastic Gradient Descent 91 | 92 | :note: A more complicated version of sgd then needed. This is 93 | done like that for adadelta and rmsprop. 94 | 95 | """ 96 | # New set of shared variable that will contain the gradient 97 | # for a mini-batch. 98 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 99 | for k, p in tparams.items()] 100 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 101 | 102 | # Function that computes gradients for a mini-batch, but do not 103 | # updates the weights. 104 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 105 | name='sgd_f_grad_shared') 106 | 107 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 108 | 109 | # Function that updates the weights from the previously computed 110 | # gradient. 111 | f_update = theano.function([lr], [], updates=pup, 112 | name='sgd_f_update') 113 | 114 | return f_grad_shared, f_update 115 | 116 | def ortho_weight(ndim): 117 | """ 118 | """ 119 | W = numpy.random.randn(ndim, ndim) 120 | u, s, v = numpy.linalg.svd(W) 121 | return u.astype(theano.config.floatX) # @UndefinedVariable 122 | 123 | def init_params_weight(row,column): 124 | """ 125 | """ 126 | W = numpy.random.rand(row, column) 127 | W = W*2.0-1.0 128 | return W.astype(theano.config.floatX) # @UndefinedVariable 129 | 130 | def init_sharedVariables(options): 131 | """ 132 | """ 133 | print 'init shared Variables......' 
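# The shared variables created below are the model parameters: Wi/Wf/Wo/Wc (dimension x
# word_dimension) project a node's input embedding, Ui/Uf/Uo/Uc (dimension x dimension,
# orthogonal init) are the recurrent weights, bi/bf/bo/bc are the gate biases, and w is the
# vector that maps a DAG embedding to a scalar proximity score. By their names these are the
# input/forget/output/cell gates of the DAG-LSTM defined in directedGraphLSTMModel.py.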
134 | params = OrderedDict() 135 | Wi=init_params_weight(options['dimension'],options['word_dimension']) 136 | Wf=init_params_weight(options['dimension'],options['word_dimension']) 137 | Wo=init_params_weight(options['dimension'],options['word_dimension']) 138 | Wc=init_params_weight(options['dimension'],options['word_dimension']) 139 | 140 | Ui=ortho_weight(options['dimension']) 141 | Uf=ortho_weight(options['dimension']) 142 | Uo=ortho_weight(options['dimension']) 143 | Uc=ortho_weight(options['dimension']) 144 | 145 | bi=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 146 | bf=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 147 | bo=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 148 | bc=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 149 | 150 | w = numpy.random.rand(options['dimension'], ).astype(config.floatX) # @UndefinedVariable 151 | 152 | params['Wi']=Wi 153 | params['Wf']=Wf 154 | params['Wo']=Wo 155 | params['Wc']=Wc 156 | params['Ui']=Ui 157 | params['Uf']=Uf 158 | params['Uo']=Uo 159 | params['Uc']=Uc 160 | params['bi']=bi 161 | params['bf']=bf 162 | params['bo']=bo 163 | params['bc']=bc 164 | 165 | params['w']=w 166 | 167 | return params 168 | 169 | 170 | def init_tparams(params): 171 | tparams = OrderedDict() 172 | for kk, pp in params.items(): 173 | tparams[kk] = theano.shared(params[kk], name=kk) 174 | return tparams 175 | 176 | def unzip(zipped): 177 | """ 178 | """ 179 | new_params = OrderedDict() 180 | for kk, vv in zipped.items(): 181 | new_params[kk] = vv.get_value() 182 | return new_params 183 | 184 | main_dir='D:/dataset/test/icde2016_metagraph/' 185 | def proxEmbedBySubgraphs( 186 | trainingDataFile=main_dir+'train_classmate', 187 | wordsEmbeddings_data=None, 188 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', 189 | subpaths_map=None, 190 | subpaths_file=main_dir+'facebook/subpathsSaveFile', 191 | subgraphSaveFile='', 192 | maxlen_subpaths=1000, 193 | wordsSize=1000000, 194 | 195 | maxlen=100, 196 | batch_size=1, 197 | is_shuffle_for_batch=False, 198 | dispFreq=5, 199 | saveFreq=5, 200 | saveto=main_dir+'facebook/path2vec-modelParams.npz', 201 | 202 | lrate=0.0001, 203 | word_dimension=22, 204 | dimension=64, 205 | discount_alpha=0.3, 206 | discount_beta=0.3, 207 | h_output_method='max-pooling', 208 | objective_function_method='hinge-loss', 209 | objective_function_param=0, 210 | max_epochs=10, 211 | 212 | decay=0.01, 213 | ): 214 | model_options = locals().copy() 215 | 216 | if wordsEmbeddings_data is None: 217 | if wordsEmbeddings_path is not None: 218 | wordsEmbeddings_data,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 219 | else: 220 | exit(0) 221 | trainingData,trainingPairs_data=dataProcessTools.getTrainingData(trainingDataFile) 222 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 223 | 224 | subgraphs=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphSaveFile) 225 | 226 | params=init_sharedVariables(model_options) 227 | tparams=init_tparams(params) 228 | print 'Generate models ......' 229 | 230 | trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost=proxEmbedBySubgraphModel.proxEmbedBySubgraphModel(model_options, tparams) 231 | 232 | print 'Generate gradients ......' 
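# Differentiate the cost w.r.t. all shared parameters and compile the two Adadelta functions:
# f_grad_shared runs one mini-batch forward/backward and accumulates the gradient statistics,
# f_update(lrate) applies the resulting parameter update.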
233 | grads=tensor.grad(cost,wrt=list(tparams.values())) 234 | print 'Using Adadelta to generate functions ......' 235 | this_time = time.time() 236 | print 'Start to compile and optimize, time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 237 | lr = tensor.scalar(name='lr') 238 | f_grad_shared, f_update=adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost) 239 | 240 | print 'Start training models ......' 241 | best_p = None 242 | history_cost=[] 243 | 244 | start_time = time.time() 245 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 246 | uidx=0 247 | for eidx in range(max_epochs): 248 | for _, batch in allBatches: 249 | uidx += 1 250 | trainingDataForBatch=[trainingData[i] for i in batch] 251 | trainingPairsForBatch=[trainingPairs_data[i] for i in batch] 252 | tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data, nodesLens_data=dataProcessTools.generateSequenceAndMasksForSingleSequenceWithLength(trainingDataForBatch, trainingPairsForBatch, subgraphs, dimension) 253 | cost=f_grad_shared(tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings_data, buffer_tensor_data, nodesLens_data) 254 | f_update(lrate) 255 | 256 | if numpy.isnan(cost) or numpy.isinf(cost): 257 | print('bad cost detected: ', cost) 258 | return 259 | if numpy.mod(uidx, dispFreq) == 0: 260 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 261 | this_time = time.time() 262 | print 'Time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 263 | if saveto and numpy.mod(uidx, saveFreq) == 0: 264 | print('Saving...') 265 | if best_p is not None: 266 | params = best_p 267 | else: 268 | params = unzip(tparams) 269 | numpy.savez(saveto, history_errs=history_cost, **params) 270 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 271 | print('Done') 272 | gc.collect() 273 | 274 | end_time = time.time() 275 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 276 | print 'Training finished! 
Cost time == ', end_time-start_time,' s' 277 | 278 | 279 | 280 | 281 | 282 | -------------------------------------------------------------------------------- /D2AGE/asymmetric/proxEmbedBySubgraphs.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | @author: Liu Zemin 4 | Functions and Application : 5 | training 6 | ''' 7 | 8 | import numpy 9 | import theano 10 | from theano import tensor 11 | from theano import config 12 | from collections import OrderedDict 13 | import dataProcessTools 14 | import time 15 | import proxEmbedBySubgraphModel 16 | import gc 17 | import six.moves.cPickle as pickle # @UnresolvedImport 18 | 19 | SEED = 123 20 | numpy.random.seed(SEED) 21 | 22 | def numpy_floatX(data): 23 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 24 | 25 | def gradientDescentGroup(learning_rate,tparams,grads,trainingPairs, sequences, masks, lengths, wordsEmbeddings, cost): 26 | """ 27 | """ 28 | update=[(shared,shared-learning_rate*g) for g,shared in zip(grads,tparams.values())] 29 | func=theano.function([trainingPairs, sequences, masks, lengths, wordsEmbeddings],cost,updates=update,on_unused_input='ignore',mode='FAST_RUN') 30 | return func 31 | 32 | def adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost): 33 | """ 34 | An adaptive learning rate optimizer 35 | Parameters 36 | ---------- 37 | lr : Theano SharedVariable 38 | Initial learning rate 39 | tpramas: Theano SharedVariable 40 | Model parameters 41 | grads: Theano variable 42 | Gradients of cost w.r.t to parameres 43 | x: Theano variable 44 | Model inputs 45 | mask: Theano variable 46 | Sequence mask 47 | y: Theano variable 48 | Targets 49 | cost: Theano variable 50 | Objective fucntion to minimize 51 | 52 | Notes 53 | ----- 54 | For more information, see [ADADELTA]_. 55 | 56 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 57 | Rate Method*, arXiv:1212.5701. 58 | """ 59 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 60 | name='%s_grad' % k) 61 | for k, p in tparams.items()] 62 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 63 | name='%s_rup2' % k) 64 | for k, p in tparams.items()] 65 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 66 | name='%s_rgrad2' % k) 67 | for k, p in tparams.items()] 68 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 69 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 70 | for rg2, g in zip(running_grads2, grads)] 71 | f_grad_shared = theano.function([trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens], cost, updates=zgup + rg2up, 72 | on_unused_input='ignore', 73 | name='adadelta_f_grad_shared') 74 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 75 | for zg, ru2, rg2 in zip(zipped_grads, 76 | running_up2, 77 | running_grads2)] 78 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 79 | for ru2, ud in zip(running_up2, updir)] 80 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 81 | f_update = theano.function([lr], [], updates=ru2up + param_up, 82 | on_unused_input='ignore', 83 | name='adadelta_f_update') 84 | 85 | return f_grad_shared, f_update 86 | 87 | 88 | def sgd(lr, tparams, grads, x, mask, y, cost): 89 | """ Stochastic Gradient Descent 90 | 91 | :note: A more complicated version of sgd then needed. This is 92 | done like that for adadelta and rmsprop. 
93 | 94 | """ 95 | # New set of shared variable that will contain the gradient 96 | # for a mini-batch. 97 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 98 | for k, p in tparams.items()] 99 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 100 | 101 | # Function that computes gradients for a mini-batch, but do not 102 | # updates the weights. 103 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 104 | name='sgd_f_grad_shared') 105 | 106 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 107 | 108 | # Function that updates the weights from the previously computed 109 | # gradient. 110 | f_update = theano.function([lr], [], updates=pup, 111 | name='sgd_f_update') 112 | 113 | return f_grad_shared, f_update 114 | 115 | def ortho_weight(ndim): 116 | """ 117 | """ 118 | W = numpy.random.randn(ndim, ndim) 119 | u, s, v = numpy.linalg.svd(W) 120 | return u.astype(theano.config.floatX) # @UndefinedVariable 121 | 122 | def init_params_weight(row,column): 123 | """ 124 | """ 125 | W = numpy.random.rand(row, column) 126 | W = W*2.0-1.0 127 | return W.astype(theano.config.floatX) # @UndefinedVariable 128 | 129 | def init_sharedVariables(options): 130 | """ 131 | """ 132 | print 'init shared Variables......' 133 | params = OrderedDict() 134 | Wi=init_params_weight(options['dimension'],options['word_dimension']) 135 | Wf=init_params_weight(options['dimension'],options['word_dimension']) 136 | Wo=init_params_weight(options['dimension'],options['word_dimension']) 137 | Wc=init_params_weight(options['dimension'],options['word_dimension']) 138 | 139 | Ui=ortho_weight(options['dimension']) 140 | Uf=ortho_weight(options['dimension']) 141 | Uo=ortho_weight(options['dimension']) 142 | Uc=ortho_weight(options['dimension']) 143 | 144 | bi=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 145 | bf=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 146 | bo=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 147 | bc=numpy.zeros((options['dimension'],)).astype(config.floatX) # @UndefinedVariable 148 | 149 | w = numpy.random.rand(options['dimension'], ).astype(config.floatX) # @UndefinedVariable # initialize w with random values in (0,1) 150 | 151 | params['Wi']=Wi 152 | params['Wf']=Wf 153 | params['Wo']=Wo 154 | params['Wc']=Wc 155 | params['Ui']=Ui 156 | params['Uf']=Uf 157 | params['Uo']=Uo 158 | params['Uc']=Uc 159 | params['bi']=bi 160 | params['bf']=bf 161 | params['bo']=bo 162 | params['bc']=bc 163 | 164 | params['w']=w 165 | 166 | return params 167 | 168 | 169 | def init_tparams(params): 170 | tparams = OrderedDict() 171 | for kk, pp in params.items(): 172 | tparams[kk] = theano.shared(params[kk], name=kk) 173 | return tparams 174 | 175 | def unzip(zipped): 176 | """ 177 | When we pickle the model. Needed for the GPU stuff.
178 | """ 179 | new_params = OrderedDict() 180 | for kk, vv in zipped.items(): 181 | new_params[kk] = vv.get_value() 182 | return new_params 183 | 184 | main_dir='D:/dataset/test/' 185 | def proxEmbedBySubgraphs( 186 | trainingDataFile=main_dir+'train_classmate_1', 187 | wordsEmbeddings_data=None, 188 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', 189 | subpaths_map=None, 190 | subpaths_file=main_dir+'facebook/subpathsSaveFile', 191 | subgraphSaveFile='', 192 | maxlen_subpaths=1000, 193 | wordsSize=1000000, 194 | 195 | maxlen=100, # Sequence longer then this get ignored 196 | batch_size=1, 197 | is_shuffle_for_batch=False, 198 | dispFreq=5, 199 | saveFreq=5, 200 | saveto=main_dir+'facebook/path2vec-modelParams.npz', 201 | 202 | lrate=0.0001, 203 | word_dimension=22, 204 | dimension=64, 205 | discount_alpha=0.3, 206 | discount_beta=0.3, 207 | h_output_method='max-pooling', 208 | objective_function_method='hinge-loss', 209 | objective_function_param=0, 210 | max_epochs=10, 211 | 212 | decay=0.01, 213 | ): 214 | model_options = locals().copy() 215 | 216 | if wordsEmbeddings_data is None: 217 | if wordsEmbeddings_path is not None: 218 | wordsEmbeddings_data,word_dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 219 | else: 220 | exit(0) 221 | trainingData,trainingPairs_data=dataProcessTools.getTrainingData(trainingDataFile) 222 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 223 | 224 | subgraphs=dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(subgraphSaveFile) 225 | 226 | params=init_sharedVariables(model_options) 227 | tparams=init_tparams(params) 228 | print 'Generate models ......' 229 | 230 | trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost=proxEmbedBySubgraphModel.proxEmbedBySubgraphModel(model_options, tparams) 231 | 232 | print 'Generate gradients ......' 233 | grads=tensor.grad(cost,wrt=list(tparams.values())) 234 | print 'Using Adadelta to generate functions ......' 235 | this_time = time.time() 236 | print 'Start to compile and optimize, time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 237 | lr = tensor.scalar(name='lr') 238 | f_grad_shared, f_update=adadelta(lr, tparams, grads, trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost) 239 | 240 | print 'Start training models ......' 
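# Training loop: for each mini-batch, build the DAG tensors with
# generateSequenceAndMasksForSingleSequenceWithLengthAsymmetric, compute the cost and gradient
# statistics with f_grad_shared, then apply the Adadelta update with f_update(lrate). Training
# stops early on a NaN/Inf cost; progress is printed every dispFreq updates and the parameters
# are saved to `saveto` (plus a pickled options file) every saveFreq updates.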
241 | best_p = None 242 | history_cost=[] 243 | 244 | start_time = time.time() 245 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 246 | uidx=0 247 | for eidx in range(max_epochs): 248 | for _, batch in allBatches: 249 | uidx += 1 250 | trainingDataForBatch=[trainingData[i] for i in batch] 251 | trainingPairsForBatch=[trainingPairs_data[i] for i in batch] 252 | tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data, nodesLens_data=dataProcessTools.generateSequenceAndMasksForSingleSequenceWithLengthAsymmetric(trainingDataForBatch, trainingPairsForBatch, subgraphs, dimension) 253 | cost=f_grad_shared(tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, wordsEmbeddings_data, buffer_tensor_data, nodesLens_data) 254 | f_update(lrate) 255 | 256 | if numpy.isnan(cost) or numpy.isinf(cost): 257 | print('bad cost detected: ', cost) 258 | return 259 | if numpy.mod(uidx, dispFreq) == 0: 260 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 261 | this_time = time.time() 262 | print 'Time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 263 | if saveto and numpy.mod(uidx, saveFreq) == 0: 264 | print('Saving...') 265 | if best_p is not None: 266 | params = best_p 267 | else: 268 | params = unzip(tparams) 269 | numpy.savez(saveto, history_errs=history_cost, **params) 270 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 271 | print('Done') 272 | gc.collect() 273 | 274 | end_time = time.time() 275 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 276 | print 'Training finished! Cost time == ', end_time-start_time,' s' 277 | 278 | 279 | 280 | 281 | 282 | --------------------------------------------------------------------------------