├── dataset preparation-SPE
│   ├── AnalyseTrainingAndTestData.java
│   ├── Config.java
│   ├── GenerateSubgraphAndFeatureVector.java
│   ├── GenerateSubpathsBySampling.java
│   ├── Main.java
│   ├── Node.java
│   ├── ReadWholeGraph.java
│   ├── SamplingFromSourceGraph.java
│   └── javaParams.properties
├── model-SPE
│   ├── attentionBatch.py
│   ├── dataProcessTools.py
│   ├── evaluateTools.py
│   ├── experimentForOneFileByParams.py
│   ├── lstmModel.py
│   ├── modelProcessAndAssess.py
│   ├── pythonParamsConfig
│   ├── subgraphAttentionModelLSTMBatch.py
│   ├── subgraphAttentionProcessModelLSTMBatch.py
│   └── toolsFunction.py
├── model-autoencoder
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── autoencoderCalculate.py
│   ├── autoencoderModel.py
│   ├── autoencoderTraining.py
│   └── dataToolsForAutoencoder.py
├── readMe
└── readMe~
/dataset preparation-SPE/AnalyseTrainingAndTestData.java: -------------------------------------------------------------------------------- 1 | package SPE; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashSet; 9 | import java.util.Set; 10 | 11 | /** 12 | * For convenience, we get all the (q,v) from training data and test data, so that we only need to check and store the paths between q and v. 13 | */ 14 | public class AnalyseTrainingAndTestData { 15 | 16 | static String root_dir=Config.ROOT;//dataset root dir 17 | static String dataset_name=Config.DATASET_NAME;//dataset name, such as linkedin 18 | static String relation_class=Config.RELATION_CLASS;//relation class name, such as classmate 19 | static String allQueryPairs=Config.ALL_QUERY_PAIRS_PATH;//query pairs file 20 | 21 | static Set allNodes=new HashSet(); 22 | 23 | public static void main(String[] args) { 24 | AnalyseTrainingAndTestData atatd=new AnalyseTrainingAndTestData(); 25 | 26 | Set set1=atatd.analyseQueryTuples( 27 | "D:/dataset/icde2016/dataset/", 28 | "facebook", 29 | "classmate", 30 | "allQueryPairs"); 31 | System.out.println(set1.size()); 32 | Set set2=atatd.analyseQueryTuples( 33 | "D:/dataset/icde2016/dataset/", 34 | "facebook", 35 | "family", 36 | "allQueryPairs"); 37 | System.out.println(set2.size()); 38 | Set set=new HashSet(); 39 | set.addAll(set1); 40 | set.addAll(set2); 41 | System.out.println(set.size()); 42 | System.out.println(allNodes.size()); 43 | } 44 | 45 | /** 46 | * we get all the (q,a) from training and test dataset, and then save them.
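 * Based on the loops in this method, the expected layout is
 * <mainFolder><datasetName>.splits/train.{10,100,1000}/train_<relationName>_<i> and
 * <mainFolder><datasetName>.splits/test/test_<relationName>_<i> with i = 1..10;
 * the collected pairs are written, one per line, to <mainFolder><datasetName>/<saveFileName>.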
47 | */ 48 | public Set analyseQueryTuples(String mainFolder, String datasetName, String relationName, String saveFileName){ 49 | String train_10=mainFolder+datasetName+".splits/train.10/"; 50 | String train_100=mainFolder+datasetName+".splits/train.100/"; 51 | String train_1000=mainFolder+datasetName+".splits/train.1000/"; 52 | String test=mainFolder+datasetName+".splits/test/"; 53 | Set allPairs=new HashSet(); 54 | String filePath=null; 55 | for(int i=1;i<=10;i++){ 56 | filePath=train_10+"train_"+relationName+"_"+i; 57 | Set set=analyseQueryTuplesForOneFile(filePath); 58 | allPairs.addAll(set); 59 | } 60 | for(int i=1;i<=10;i++){ 61 | filePath=train_100+"train_"+relationName+"_"+i; 62 | Set set=analyseQueryTuplesForOneFile(filePath); 63 | allPairs.addAll(set); 64 | } 65 | for(int i=1;i<=10;i++){ 66 | filePath=train_1000+"train_"+relationName+"_"+i; 67 | Set set=analyseQueryTuplesForOneFile(filePath); 68 | allPairs.addAll(set); 69 | } 70 | for(int i=1;i<=10;i++){ 71 | filePath=test+"test_"+relationName+"_"+i; 72 | Set set=analyseQueryTuplesForOneFile(filePath); 73 | allPairs.addAll(set); 74 | } 75 | 76 | FileWriter writer =null; 77 | String saveFile=mainFolder+datasetName+"/"+saveFileName; 78 | try { 79 | writer = new FileWriter(saveFile); 80 | for(String pair:allPairs){ 81 | writer.write(pair+"\r\n"); 82 | writer.flush(); 83 | } 84 | } catch (IOException e) { 85 | e.printStackTrace(); 86 | } 87 | finally{ 88 | try { 89 | if(writer!=null){ 90 | writer.close(); 91 | writer=null; 92 | } 93 | } catch (Exception e2) { 94 | // TODO: handle exception 95 | e2.printStackTrace(); 96 | } 97 | } 98 | return allPairs; 99 | } 100 | 101 | /** 102 | * get all (q,a) from one file 103 | */ 104 | private Set analyseQueryTuplesForOneFile(String filePath){ 105 | Set result=new HashSet(); 106 | BufferedReader br=null; 107 | String[] arr=null; 108 | try {//读文件 109 | br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), "UTF-8")); 110 | String temp = null; 111 | while ((temp = br.readLine()) != null ) { 112 | temp=temp.trim(); 113 | if(temp.length()>0){ 114 | arr=temp.split("\t"); 115 | allNodes.add(arr[0]); 116 | for(int i=1;i vectors=new HashMap(); 31 | static Map userPair2Index=new HashMap(); 32 | static Map index2UserPair=new HashMap(); 33 | 34 | public static void main(String[] args) { 35 | // TODO Auto-generated method stub 36 | // AnalyseTrainingAndTestData atatd=new AnalyseTrainingAndTestData(); 37 | // Set queryTuples=atatd.analyseQueryTuples(mainFolder, datasetName, relationName, saveFileName); 38 | 39 | // Set queryTuples=new HashSet(); 40 | // queryTuples.add("1\t13"); 41 | // queryTuples.add("5\t6"); 42 | // 43 | // 44 | GenerateSubgraphAndFeatureVector gmfv=new GenerateSubgraphAndFeatureVector(); 45 | // gmfv.analyseSubpathsAndChnForDblpDirected(queryTuples, subpathsFile, newSubpathsSaveFile, index2UserpairSaveFile); 46 | // System.out.println("OK"); 47 | // System.out.println(vectors); 48 | // System.out.println(userPair2Index); 49 | // System.out.println(index2UserPair); 50 | 51 | index2UserPair.put(0, "0\t3"); 52 | index2UserPair.put(1, "5\t2"); 53 | index2UserPair.put(2, "4\t0"); 54 | index2UserPair.put(3, "8\t9"); 55 | index2UserPair.put(4, "0\t4"); 56 | index2UserPair.put(5, "9\t8"); 57 | userPair2Index.put("0\t3",0); 58 | userPair2Index.put("5\t2",1); 59 | userPair2Index.put("4\t0",2); 60 | userPair2Index.put("8\t9",3); 61 | userPair2Index.put("0\t4",4); 62 | userPair2Index.put("9\t8",5); 63 | vectors.put("0\t3", null); 64 | vectors.put("5\t2", null); 65 | 
vectors.put("4\t0", null); 66 | vectors.put("8\t9", null); 67 | vectors.put("0\t4", null); 68 | vectors.put("9\t8", null); 69 | vectors.put("0", null); 70 | vectors.put("2", null); 71 | vectors.put("3", null); 72 | vectors.put("4", null); 73 | vectors.put("5", null); 74 | vectors.put("8", null); 75 | vectors.put("9", null); 76 | gmfv.analyseInstancesGenerateStatNum("D:/test/modeling-subgraph/instancesToydata/","D:/test/modeling-subgraph/vectors"); 77 | for(String key:vectors.keySet()){ 78 | System.out.println(key); 79 | System.out.println(Arrays.toString(vectors.get(key))); 80 | System.out.println("-------------------------------"); 81 | } 82 | // gmfv.generateVectorForUndirected(vectorSaveFile); 83 | // gmfv.generateVectorForDirectedWithNodeFeature("D:/test/modeling-subgraph/nodeFeature", vectorSaveFile); 84 | 85 | // gmfv.generateSingleNodeVectorBySubgraph("D:/test/modeling-subgraph/nodeFeature"); 86 | } 87 | 88 | /** 89 | * change user-only paths to m-paths 90 | * @param queryTuples all query tuples (q,a) 91 | * @param subpathsFile user-only subpahts 92 | * @param newSubpathsSaveFile file for m-paths 93 | * @param index2UserpairSaveFile file for m-node IDs and the corresponding user-pairs 94 | */ 95 | public void analyseSubpathsAndChnForFbAndLiUndirected(Set queryTuples, String subpathsFile, String newSubpathsSaveFile, String index2UserpairSaveFile){ 96 | BufferedReader br=null; 97 | FileWriter writer = null; 98 | String[] arr=null; 99 | String[] arr1=null; 100 | String queryTuple1=null; 101 | String queryTuple2=null; 102 | String userpair1=null; 103 | String userpair2=null; 104 | StringBuilder sb=new StringBuilder(); 105 | int index=0; 106 | try { 107 | br = new BufferedReader(new InputStreamReader(new FileInputStream(subpathsFile), "UTF-8")); 108 | writer = new FileWriter(newSubpathsSaveFile); 109 | String temp = null; 110 | while ((temp = br.readLine()) != null ) { 111 | temp=temp.trim(); 112 | if(temp.length()>0){ 113 | arr=temp.split("\t"); 114 | queryTuple1=arr[0]+"\t"+arr[1]; 115 | queryTuple2=arr[1]+"\t"+arr[0]; 116 | if(queryTuples.contains(queryTuple1) || queryTuples.contains(queryTuple2)){ 117 | sb.delete( 0, sb.length() ); 118 | sb.append(arr[0]+"\t"+arr[1]+"\t"); 119 | arr1=arr[2].split(" "); 120 | for(int i=0;i<(arr1.length-1);i++){ 121 | userpair1=arr1[i]+"\t"+arr1[i+1]; 122 | userpair2=arr1[i+1]+"\t"+arr1[i]; 123 | vectors.put(arr1[i], null); 124 | vectors.put(arr1[i+1], null); 125 | if(!vectors.containsKey(userpair1) && !vectors.containsKey(userpair2)){ 126 | vectors.put(userpair1, null); 127 | userPair2Index.put(userpair1, userPair2Index.size()); 128 | index2UserPair.put(index2UserPair.size(), userpair1); 129 | index=userPair2Index.get(userpair1); 130 | } 131 | else{ 132 | if(vectors.containsKey(userpair1)){ 133 | index=userPair2Index.get(userpair1); 134 | } 135 | else{ 136 | index=userPair2Index.get(userpair2); 137 | } 138 | } 139 | sb.append(index+" "); 140 | } 141 | sb.append("\r\n"); 142 | writer.write(sb.toString()); 143 | writer.flush(); 144 | } 145 | } 146 | } 147 | } catch (Exception e2) { 148 | e2.printStackTrace(); 149 | } 150 | finally{ 151 | try { 152 | if(writer!=null){ 153 | writer.close(); 154 | writer=null; 155 | } 156 | if(br!=null){ 157 | br.close(); 158 | br=null; 159 | } 160 | } catch (IOException e) { 161 | e.printStackTrace(); 162 | } 163 | } 164 | 165 | try { 166 | writer = new FileWriter(index2UserpairSaveFile); 167 | for(int id:index2UserPair.keySet()){ 168 | writer.write(id+"\t"+index2UserPair.get(id)+"\r\n"); 169 | writer.flush(); 170 | } 171 
| } catch (Exception e) { 172 | // TODO: handle exception 173 | e.printStackTrace(); 174 | } 175 | finally{ 176 | try { 177 | if(writer!=null){ 178 | writer.close(); 179 | writer=null; 180 | } 181 | } catch (Exception e2) { 182 | // TODO: handle exception 183 | e2.printStackTrace(); 184 | } 185 | } 186 | } 187 | 188 | 189 | /** 190 | * analyse instances for subgraphs and get the statistical numbers 191 | * @param instanceFolder instance folder 192 | * @param statNumSaveFile statistical numbers file 193 | */ 194 | public void analyseInstancesGenerateStatNum(String instanceFolder,String statNumSaveFile){ 195 | File folder = new File(instanceFolder); 196 | File[] files = folder.listFiles(); 197 | int dimensionSubgraph=files.length; 198 | for(String key:vectors.keySet()){ 199 | vectors.put(key, new int[dimensionSubgraph]); 200 | } 201 | String filePath=null; 202 | BufferedReader br=null; 203 | String[] arr=null; 204 | int[] intArr=null; 205 | String userPair1=null; 206 | String userPair2=null; 207 | for(int i=0;i0){ 215 | arr=temp.split("\t"); 216 | for(String s:arr){ 217 | if(vectors.containsKey(s)){ 218 | intArr=vectors.get(s); 219 | intArr[i]+=1; 220 | } 221 | } 222 | for(int a=0;a result=new HashMap(); 299 | int index=0; 300 | String[] arr=null; 301 | double[] vector=null; 302 | int[] statNum=null; 303 | int[] statNum_0=null; 304 | int[] statNum_1=null; 305 | int dimension=0; 306 | for(String key:vectors.keySet()){ 307 | if(key.contains("\t")){ 308 | index=userPair2Index.get(key); 309 | statNum=vectors.get(key); 310 | dimension=statNum.length; 311 | vector=new double[statNum.length]; 312 | arr=key.split("\t"); 313 | statNum_0=vectors.get(arr[0]); 314 | statNum_1=vectors.get(arr[1]); 315 | for(int i=0;i0){ 363 | return 1.0; 364 | } 365 | return 0.0; 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /dataset preparation-SPE/GenerateSubpathsBySampling.java: -------------------------------------------------------------------------------- 1 | package SPE; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.HashMap; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Set; 15 | 16 | /** 17 | * generate user-only paths by ramdom walk samplings 18 | */ 19 | public class GenerateSubpathsBySampling { 20 | 21 | 22 | static String samplingsPath=Config.SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS;//the save file for random walk sampling 23 | static String subPathsSavePath=Config.SUBPATHS_SAVE_PATH;//the save file for subpaths(user-only paths) 24 | static int window_maxlen=Config.LONGEST_ANALYSE_LENGTH_FOR_SAMPLING;//we use a window to analyse a path to generate subpath (o-path), this is the width of this window 25 | static int subpath_maxlen=Config.LONGEST_LENGTH_FOR_SUBPATHS;//the max length for a subpath (user-only path between two nodes) 26 | static int subpath_minlen=Config.SHORTEST_LENGTH_FOR_SUBPATHS;//the min length for a subpath (user-only path between two nodes) 27 | 28 | public static void main(String[] args) { 29 | 30 | GenerateSubpathsBySampling gsbs=new GenerateSubpathsBySampling(); 31 | gsbs.generateSubPathsFromSamplings(samplingsPath, subPathsSavePath, window_maxlen, subpath_maxlen, subpath_minlen); 32 | } 33 | 34 | /** 35 | * generate subpaths (user-only paths) between the query node q 
and another candidate node v. 36 | */ 37 | public void generateSubPathsFromSamplings(String samplingsPath, String subPathsSavePath,int window_maxlen,int subpath_maxlen,int subpath_minlen){ 38 | BufferedReader br=null; 39 | String[] arr=null; 40 | FileWriter writer =null; 41 | String t=null; 42 | List path=new ArrayList(); 43 | try { 44 | br = new BufferedReader(new InputStreamReader(new FileInputStream(samplingsPath), "UTF-8")); 45 | writer = new FileWriter(subPathsSavePath); 46 | String temp = null; 47 | while ((temp = br.readLine()) != null ) { 48 | temp=temp.trim(); 49 | if(temp.length()>0){ 50 | path.clear(); 51 | arr=temp.split(" "); 52 | for(String s:arr){ 53 | path.add(Integer.parseInt(s)); 54 | } 55 | t=analyseOnePath(path, window_maxlen, subpath_maxlen, subpath_minlen); 56 | if(t.length()>0){ 57 | writer.write(t); 58 | writer.flush(); 59 | } 60 | } 61 | } 62 | } catch (Exception e2) { 63 | e2.printStackTrace(); 64 | } 65 | finally{ 66 | try { 67 | if(writer!=null){ 68 | writer.close(); 69 | writer=null; 70 | } 71 | if(br!=null){ 72 | br.close(); 73 | br=null; 74 | } 75 | } catch (IOException e) { 76 | e.printStackTrace(); 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * generate all the subpaths for a given random walk sampling path 83 | */ 84 | private String analyseOnePath(List path, int maxWindowLen, int maxSubpathLen, int subpath_minlen){ 85 | StringBuilder sb=new StringBuilder(); 86 | List subpath=new ArrayList(); 87 | for(int i=0;i0 && (j-i)>maxWindowLen){ 90 | break; 91 | } 92 | 93 | subpath.clear(); 94 | for(int x=i;x<=j;x++){ 95 | subpath.add(path.get(x)+0); 96 | } 97 | List subpathNoRepeat=deleteRepeat(subpath); 98 | if(subpathNoRepeat.size()0 && subpathNoRepeat.size()>maxSubpathLen){ 104 | continue; 105 | } 106 | 107 | sb.append(path.get(i)+"\t"+path.get(j)+"\t"); 108 | for(int x=0;x deleteRepeat(List path){ 122 | Map map=new HashMap(); 123 | int node=0; 124 | List result=new ArrayList(); 125 | int formerIndex=0; 126 | for(int i=0;i queryTuples = atatd.analyseQueryTuples(root_dir, dataset_name, relation_class, allQueryPairs); 47 | 48 | // generate m-paths, and calculate the instances number between q and v 49 | GenerateSubgraphAndFeatureVector gmfv = new GenerateSubgraphAndFeatureVector(); 50 | 51 | // find all the user-pairs 52 | gmfv.analyseSubpathsAndChnForFbAndLiUndirected(queryTuples, subpathsFile, newSubpathsSaveFile, 53 | index2UserpairSaveFile); 54 | // calculate the subgraph instances number and save 55 | gmfv.analyseInstancesGenerateStatNum(instanceFolder, statNumSaveFile); 56 | // generate the final m-node 57 | gmfv.generateVectorForUndirected(vectorSaveFile); 58 | 59 | System.out.println("Finished。。。Time == " + new Date()); 60 | long endtime = System.currentTimeMillis(); 61 | System.out.println("Cost time == " + (endtime - starttime) / 1000 + " s"); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /dataset preparation-SPE/Node.java: -------------------------------------------------------------------------------- 1 | package SPE; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.HashSet; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.Set; 9 | 10 | /** 11 | * Node class 12 | */ 13 | public class Node { 14 | 15 | /** 16 | * node id 17 | */ 18 | private int id=-1; 19 | /** 20 | * node type 21 | */ 22 | private String type=null; 23 | /** 24 | * node type id 25 | */ 26 | private int typeId=-1; 27 | /** 28 | * in neighbours 29 | */ 30 | public List 
in_nodes=new ArrayList(); 31 | /** 32 | * out neighbours 33 | */ 34 | public List out_nodes=new ArrayList(); 35 | /** 36 | * in neighbours ids 37 | */ 38 | public List in_ids=new ArrayList(); 39 | /** 40 | * out neighbours ids 41 | */ 42 | public List out_ids=new ArrayList(); 43 | /** 44 | * all neighbours 45 | */ 46 | public Set neighbours=new HashSet(); 47 | 48 | public int getId() { 49 | return id; 50 | } 51 | 52 | public void setId(int id) { 53 | this.id = id; 54 | } 55 | 56 | public String getType() { 57 | return type; 58 | } 59 | 60 | public void setType(String type) { 61 | this.type = type; 62 | } 63 | 64 | public int getTypeId() { 65 | return typeId; 66 | } 67 | 68 | public void setTypeId(int typeId) { 69 | this.typeId = typeId; 70 | } 71 | 72 | @Override 73 | public int hashCode() { 74 | // TODO Auto-generated method stub 75 | return this.id; 76 | } 77 | 78 | @Override 79 | public boolean equals(Object obj) { 80 | // TODO Auto-generated method stub 81 | if(obj instanceof Node){ 82 | Node node=(Node) obj; 83 | if(node.getId()==this.id){ 84 | return true; 85 | } 86 | } 87 | return false; 88 | } 89 | 90 | @Override 91 | public String toString() { 92 | // TODO Auto-generated method stub 93 | // return "id=="+id+",in_ids=="+in_ids.toString()+",out_ids=="+out_ids.toString(); 94 | return "[id="+id+",neighbours=["+getNeighboursInfo()+"]]"; 95 | } 96 | 97 | /** 98 | * get nieghbours info 99 | * @return 100 | */ 101 | private String getNeighboursInfo(){ 102 | StringBuilder sb=new StringBuilder(); 103 | if(neighbours.size()==0){ 104 | return ""; 105 | } 106 | else{ 107 | for(Node n:neighbours){ 108 | sb.append(n.id+","); 109 | } 110 | return sb.toString(); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /dataset preparation-SPE/ReadWholeGraph.java: -------------------------------------------------------------------------------- 1 | package SPE; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * read whole graph info 13 | */ 14 | public class ReadWholeGraph { 15 | 16 | static Map typeid2Type=new HashMap(); 17 | static Map type2Typeid=new HashMap(); 18 | 19 | public static void main(String[] args) { 20 | // TODO Auto-generated method stub 21 | 22 | } 23 | 24 | /** 25 | * read whole graph info 26 | * @param nodesPath nodes path 27 | * @param edgesPath edges path 28 | * @param typeAndTypeIdPath file to save type and typeIDs 29 | * @return 30 | */ 31 | public Map readDataFromFile(String nodesPath,String edgesPath,String typeAndTypeIdPath){ 32 | Map data=new HashMap(); 33 | BufferedReader br=null; 34 | String[] arr=null; 35 | Node node=null; 36 | try { 37 | br = new BufferedReader(new InputStreamReader(new FileInputStream(nodesPath), "UTF-8")); 38 | String temp = null; 39 | while ((temp = br.readLine()) != null ) { 40 | temp=temp.trim(); 41 | if(temp.length()>0){ 42 | arr=temp.split("\t"); 43 | node=new Node(); 44 | node.setId(Integer.parseInt(arr[0])); 45 | node.setType(arr[1]); 46 | if(type2Typeid.containsKey(arr[1])){ 47 | node.setTypeId(type2Typeid.get(arr[1])); 48 | } 49 | else{ 50 | type2Typeid.put(arr[1], type2Typeid.size()); 51 | typeid2Type.put(typeid2Type.size(), arr[1]); 52 | node.setTypeId(type2Typeid.get(arr[1])); 53 | } 54 | data.put(Integer.parseInt(arr[0]), node); 55 | } 56 | } 57 | } catch (Exception e2) { 58 | // TODO: 
handle exception 59 | e2.printStackTrace(); 60 | } 61 | finally{ 62 | try { 63 | if(br!=null){ 64 | br.close(); 65 | br=null; 66 | } 67 | } catch (IOException e) { 68 | // TODO Auto-generated catch block 69 | e.printStackTrace(); 70 | } 71 | } 72 | int start=0; 73 | int end=0; 74 | Node startNode=null; 75 | Node endNode=null; 76 | try { 77 | br = new BufferedReader(new InputStreamReader(new FileInputStream(edgesPath), "UTF-8")); 78 | String temp = null; 79 | while ((temp = br.readLine()) != null ) { 80 | temp=temp.trim(); 81 | if(temp.length()>0){ 82 | arr=temp.split("\t"); 83 | start=Integer.parseInt(arr[0]); 84 | end=Integer.parseInt(arr[1]); 85 | startNode=data.get(start); 86 | endNode=data.get(end); 87 | startNode.out_ids.add(end); 88 | startNode.out_nodes.add(endNode); 89 | endNode.in_ids.add(start); 90 | endNode.in_nodes.add(startNode); 91 | } 92 | } 93 | } catch (Exception e2) { 94 | e2.printStackTrace(); 95 | } 96 | finally{ 97 | try { 98 | if(br!=null){ 99 | br.close(); 100 | br=null; 101 | } 102 | } catch (IOException e) { 103 | e.printStackTrace(); 104 | } 105 | } 106 | FileWriter writer = null; 107 | try { 108 | writer = new FileWriter(typeAndTypeIdPath); 109 | for(String type:type2Typeid.keySet()){ 110 | writer.write(type+" "+type2Typeid.get(type)+"\r\n"); 111 | writer.flush(); 112 | } 113 | } catch (Exception e) { 114 | e.printStackTrace(); 115 | } 116 | finally{ 117 | try { 118 | if(writer!=null){ 119 | writer.close(); 120 | writer=null; 121 | } 122 | } catch (Exception e2) { 123 | e2.printStackTrace(); 124 | } 125 | } 126 | 127 | return data; 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /dataset preparation-SPE/SamplingFromSourceGraph.java: -------------------------------------------------------------------------------- 1 | package SPE; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Random; 11 | import java.util.Set; 12 | 13 | 14 | /** 15 | * random walk sampling from source graph, and chenge them into user-only paths and then save them into a file. 
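 * Concretely (see the Config-backed fields below): every node launches K = SAMPLING_TIMES_PER_NODE
 * random walks of up to L = SAMPLING_LENGTH_PER_PATH steps each, and the resulting paths are
 * written to SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS; SHORTEST_LENGTH_FOR_SAMPLING is presumably the
 * minimum walk length that is kept.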
16 | */ 17 | public class SamplingFromSourceGraph { 18 | 19 | /** 20 | * random generator, to keep invariant 21 | */ 22 | private Random random=new Random(123); 23 | //nodes path 24 | static String nodesPath=Config.NODES_PATH; 25 | //edges path 26 | static String edgesPath=Config.EDGES_PATH; 27 | //type and typesid save file 28 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 29 | static String randomWalkSampling_savePath=Config.SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS;//file path to save random walk samplings 30 | static int K=Config.SAMPLING_TIMES_PER_NODE;//random walk sampling times for each node 31 | static int L=Config.SAMPLING_LENGTH_PER_PATH;//random walk sampling length for each path (or walker) 32 | static int shortest_path_length=Config.SHORTEST_LENGTH_FOR_SAMPLING;//the shortest length for paths in random walk sampling 33 | 34 | 35 | public static void main(String[] args) { 36 | // TODO Auto-generated method stub 37 | //get the whole graph 38 | ReadWholeGraph rwg=new ReadWholeGraph(); 39 | Map graph=rwg.readDataFromFile(nodesPath, edgesPath, typeAndTypeIdPath); 40 | //sampling and then save 41 | SamplingFromSourceGraph sfa=new SamplingFromSourceGraph(); 42 | sfa.randomWalkSampling(graph, K, L, randomWalkSampling_savePath); 43 | } 44 | 45 | /** 46 | * random walk sampling 47 | * @param data dataset 48 | * @param k random walk sampling times for each node 49 | * @param l random walk sampling length for each path (or walker) 50 | * @param pathsFile savefile 51 | */ 52 | public void randomWalkSampling(Map data,int k,int l,String pathsSaveFile){ 53 | List path=null; 54 | FileWriter writer=null; 55 | StringBuilder sb=new StringBuilder(); 56 | try { 57 | writer=new FileWriter(pathsSaveFile); 58 | } catch (IOException e) { 59 | e.printStackTrace(); 60 | } 61 | for(Node node:data.values()){ 62 | for(int i=0;i randomWalkPath(Node start,int l,double prob_user,Map data){ 94 | List path=new ArrayList(l+1); 95 | path.add(start); 96 | Node now=start; 97 | Set types_set=new HashSet(); 98 | List types=new ArrayList(); 99 | Map> neighbours=new HashMap>(); 100 | int type=-1; 101 | List list=null; 102 | for(int i=0;i ids=new ArrayList(); 116 | ids.add(n.getId()); 117 | neighbours.put(n.getTypeId(), ids); 118 | } 119 | } 120 | types.addAll(types_set); 121 | if(prob_user==-1){ 122 | type=types.get(random.nextInt(types.size())); 123 | list=neighbours.get(type); 124 | now=data.get(list.get(random.nextInt(list.size()))); 125 | } 126 | else{ 127 | now=now.out_nodes.get(random.nextInt(now.out_nodes.size())); 128 | } 129 | path.add(now); 130 | } 131 | return path; 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /dataset preparation-SPE/javaParams.properties: -------------------------------------------------------------------------------- 1 | 2 | # dataset root dir 3 | ROOT = D:/dataset/icde2016/dataset/ 4 | 5 | # dataset name 6 | DATASET_NAME= facebook 7 | 8 | # relation class name 9 | RELATION_CLASS = classmate 10 | 11 | # Need to add graph node feature to m-node? default=False 12 | IS_SUBGRAPH_COMBINE_FEATURE = false 13 | 14 | # Need to use m-node to replace the node feature? 
default=False 15 | IS_SUBGRAPH_REPLACE_NODE_FEATURE = true 16 | 17 | # random walk sampling times for each node 18 | SAMPLING_TIMES_PER_NODE = 20 19 | 20 | # random walk sampling length for each path (or walker) 21 | SAMPLING_LENGTH_PER_PATH = 20 22 | 23 | # the shortest length for paths in random walk sampling 24 | SHORTEST_LENGTH_FOR_SAMPLING = 2 25 | 26 | # we use a window to analyse a path to generate subpath (o-path), this is the width of this window 27 | LONGEST_ANALYSE_LENGTH_FOR_SAMPLING = 20 28 | 29 | # the max length for a subpath (user-only path between two nodes) 30 | LONGEST_LENGTH_FOR_SUBPATHS = 5 31 | 32 | # the min length for a subpath (user-only path between two nodes) 33 | SHORTEST_LENGTH_FOR_SUBPATHS = 2 34 | 35 | # path for subgraph instance 36 | INSTANCE_FOLDER = /usr/SPE/subgraph-instances/instance-source-unzip-now/ 37 | 38 | -------------------------------------------------------------------------------- /model-SPE/attentionBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | MPE model 5 | ''' 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | import dataProcessTools 10 | from theano import config 11 | from collections import OrderedDict 12 | import time 13 | import six.moves.cPickle as pickle # @UnresolvedImport 14 | import gc 15 | import subgraphAttentionModelLSTMBatch 16 | # theano.config.floatX = 'float32' 17 | 18 | SEED = 123 19 | numpy.random.seed(SEED) 20 | 21 | def numpy_floatX(data): 22 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 23 | 24 | def gradientDescentGroup(learning_rate,tparams,grads,metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wordsEmbeddings, cost): 25 | update=[(shared,shared-learning_rate*g) for g,shared in zip(grads,tparams.values())] 26 | func=theano.function([metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wordsEmbeddings],cost,updates=update,on_unused_input='ignore',mode='FAST_RUN') 27 | return func 28 | 29 | def adadelta(lr, tparams, grads, metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost): 30 | """ 31 | An adaptive learning rate optimizer 32 | Parameters 33 | ---------- 34 | lr : Theano SharedVariable 35 | Initial learning rate 36 | tparams: Theano SharedVariable 37 | Model parameters 38 | grads: Theano variable 39 | Gradients of cost w.r.t. parameters 40 | x: Theano variable 41 | Model inputs 42 | mask: Theano variable 43 | Sequence mask 44 | y: Theano variable 45 | Targets 46 | cost: Theano variable 47 | Objective function to minimize 48 | 49 | Notes 50 | ----- 51 | For more information, see [ADADELTA]_. 52 | 53 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 54 | Rate Method*, arXiv:1212.5701.
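    The updates built below follow that rule, with the decay and smoothing constants
    hard-coded in this function (rho = 0.95, epsilon = 1e-6):

        E[g^2]_t  = 0.95 * E[g^2]_{t-1} + 0.05 * g_t^2
        dx_t      = - sqrt(E[dx^2]_{t-1} + 1e-6) / sqrt(E[g^2]_t + 1e-6) * g_t
        E[dx^2]_t = 0.95 * E[dx^2]_{t-1} + 0.05 * dx_t^2
        x_t       = x_{t-1} + dx_t

    f_grad_shared computes the cost and refreshes the gradient accumulators;
    f_update applies the parameter step (its lr argument is accepted but unused,
    hence on_unused_input='ignore').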
55 | """ 56 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 57 | name='%s_grad' % k) 58 | for k, p in tparams.items()] 59 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 60 | name='%s_rup2' % k) 61 | for k, p in tparams.items()] 62 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 63 | name='%s_rgrad2' % k) 64 | for k, p in tparams.items()] 65 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 66 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 67 | for rg2, g in zip(running_grads2, grads)] 68 | f_grad_shared = theano.function([metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings], cost, updates=zgup + rg2up, 69 | on_unused_input='ignore', 70 | name='adadelta_f_grad_shared') 71 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 72 | for zg, ru2, rg2 in zip(zipped_grads, 73 | running_up2, 74 | running_grads2)] 75 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 76 | for ru2, ud in zip(running_up2, updir)] 77 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 78 | f_update = theano.function([lr], [], updates=ru2up + param_up, 79 | on_unused_input='ignore', 80 | name='adadelta_f_update') 81 | 82 | return f_grad_shared, f_update 83 | 84 | 85 | def sgd(lr, tparams, grads, x, mask, y, cost): 86 | """ Stochastic Gradient Descent 87 | 88 | :note: A more complicated version of sgd then needed. This is 89 | done like that for adadelta and rmsprop. 90 | 91 | """ 92 | # New set of shared variable that will contain the gradient 93 | # for a mini-batch. 94 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 95 | for k, p in tparams.items()] 96 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 97 | 98 | # Function that computes gradients for a mini-batch, but do not 99 | # updates the weights. 100 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 101 | name='sgd_f_grad_shared') 102 | 103 | pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)] 104 | 105 | # Function that updates the weights from the previously computed 106 | # gradient. 107 | f_update = theano.function([lr], [], updates=pup, 108 | name='sgd_f_update') 109 | 110 | return f_grad_shared, f_update 111 | 112 | def ortho_weight(ndim): 113 | """ 114 | init a matrix by svd 115 | """ 116 | W = numpy.random.randn(ndim, ndim) 117 | u, s, v = numpy.linalg.svd(W) 118 | return u.astype(theano.config.floatX) # @UndefinedVariable 119 | 120 | def init_params_weight(row,column): 121 | """ 122 | init a matrix 123 | """ 124 | W = numpy.random.rand(row, column) 125 | W = W*2.0-1.0 126 | return W.astype(theano.config.floatX) # @UndefinedVariable 127 | 128 | 129 | def init_sharedVariables(options): 130 | """ 131 | inti the shared variables 132 | """ 133 | print 'init shared Variables......' 
134 | params = OrderedDict() 135 | # Q_A 136 | Q_A=init_params_weight(options['metagraph_embedding_dimension'],options['dimension_A']) # (-1,1) 137 | params['Q_A']=Q_A 138 | # b_A 139 | b_A=numpy.random.rand(options['dimension_A'], ) 140 | params['b_A']=b_A 141 | # eta_A 142 | eta_A=numpy.random.rand(options['dimension_A'], ) 143 | params['eta_A']=eta_A 144 | 145 | 146 | lstm_W=numpy.concatenate([ 147 | init_params_weight(options['metagraph_embedding_dimension'],options['dimension_lstm']), 148 | init_params_weight(options['metagraph_embedding_dimension'],options['dimension_lstm']), 149 | init_params_weight(options['metagraph_embedding_dimension'],options['dimension_lstm']), 150 | init_params_weight(options['metagraph_embedding_dimension'],options['dimension_lstm']) 151 | ],axis=1) 152 | params['lstm_W'] = lstm_W 153 | lstm_U = numpy.concatenate([ortho_weight(options['dimension_lstm']), 154 | ortho_weight(options['dimension_lstm']), 155 | ortho_weight(options['dimension_lstm']), 156 | ortho_weight(options['dimension_lstm'])], axis=1) 157 | params['lstm_U'] = lstm_U 158 | lstm_b = numpy.zeros((4 * options['dimension_lstm'],)) 159 | params['lstm_b'] = lstm_b.astype(theano.config.floatX) # @UndefinedVariable 160 | 161 | # Q_B 162 | Q_B=init_params_weight(options['dimension_lstm'],options['dimension_B']) # (-1,1) 163 | params['Q_B']=Q_B 164 | # b_B 165 | b_B=numpy.random.rand(options['dimension_B'], ) # (0,1) 166 | params['b_B']=b_B 167 | # eta_B 168 | eta_B=numpy.random.rand(options['dimension_B'], ) # (0,1) 169 | params['eta_B']=eta_B 170 | 171 | # Q_C 172 | Q_C=init_params_weight(options['dimension_lstm'],options['dimension_C']) # (-1,1) 173 | params['Q_C']=Q_C 174 | # b_C 175 | b_C=numpy.random.rand(options['dimension_C'], ) # (0,1) 176 | params['b_C']=b_C 177 | # eta_C 178 | eta_C=numpy.random.rand(options['dimension_C'], ) # (0,1) 179 | params['eta_C']=eta_C 180 | 181 | w = numpy.random.rand(options['dimension_lstm'], ) # (0,1) 182 | params['w']=w.astype(theano.config.floatX) # @UndefinedVariable 183 | 184 | return params 185 | 186 | def init_tparams(params): # set shared variables 187 | tparams = OrderedDict() 188 | for kk, pp in params.items(): 189 | tparams[kk] = theano.shared(params[kk], name=kk) 190 | return tparams 191 | 192 | def unzip(zipped): 193 | new_params = OrderedDict() 194 | for kk, vv in zipped.items(): 195 | new_params[kk] = vv.get_value() 196 | return new_params 197 | 198 | main_dir='D:/dataset/test/icde2016_metagraph/' 199 | def metagraphAttentionTraining( 200 | 201 | trainingDataFile=main_dir+'facebook.splits/train.10/train_classmate_1', # the full path of training data file 202 | metagraphEmbeddings_path='', # the file path of metagraph embeddings 203 | wordsEmbeddings_data=None, # words embeddings 204 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', # the file path of words embeddings 205 | wordsSize=1000000, # the size of words vocabulary 206 | subpaths_map=None, # contains sub-paths 207 | subpaths_file=main_dir+'facebook/subpathsSaveFile',# the file which contains sub-paths 208 | maxlen_subpaths=1000, # the max length for sub-paths 209 | maxlen=100, # Sequence longer then this get ignored 210 | batch_size=10, # use a batch for training. This is the size of this batch. 
211 | is_shuffle_for_batch=True, # if need shuffle for training 212 | objective_function_method='sigmoid', # loss function, we use sigmoid here 213 | objective_function_param=0, # the parameter in loss function, beta 214 | lrate=0.0001, # learning rate 215 | max_epochs=100, # the max epochs for training 216 | 217 | dispFreq=5, # the frequences for display 218 | saveFreq=5, # the frequences for saving the parameters 219 | saveto=main_dir+'facebook/path2vec-modelParams.npz', # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 220 | 221 | # all dimensions parameters 222 | metagraph_embedding_dimension=10, # metagraph embedding dimension 223 | dimension_A=10, # the dimension of attention when computing the m-node embedding 224 | dimension_lstm=10, # dimension of lstm parameters 225 | dimension_B=10, # the dimension of attention when computing the m-path embedding 226 | dimension_C=10, # the dimension of attention when computing the m-paths embedding 227 | 228 | # decay parameters 229 | decay_Q_A=0.001, 230 | decay_b_A=0.001, 231 | decay_eta_A=0.001, 232 | decay_lstm_W=0.001, 233 | decay_lstm_U=0.001, 234 | decay_lstm_b=0.001, 235 | decay_Q_B=0.001, 236 | decay_b_B=0.001, 237 | decay_eta_B=0.001, 238 | decay_Q_C=0.001, 239 | decay_b_C=0.001, 240 | decay_eta_C=0.001, 241 | decay_w=0.001, 242 | 243 | ): 244 | # get all parameters 245 | model_options = locals().copy() 246 | 247 | if wordsEmbeddings_data is None: 248 | if wordsEmbeddings_path is not None: 249 | wordsEmbeddings_data,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 250 | else: 251 | print 'There is not path for wordsEmbeddings, exit!!!' 252 | exit(0) 253 | 254 | if subpaths_map is None: 255 | if subpaths_file is not None: 256 | subpaths_map=dataProcessTools.loadAllSubPathsRomove0Path(subpaths_file, maxlen_subpaths, wordsEmbeddings_data) 257 | else: 258 | print 'There is not path for sub-paths, exit!!!' 259 | exit(0) 260 | 261 | metagraphEmbedding_data, metagraphDimension, metagraphSize=dataProcessTools.getMetagraphEmbeddings(metagraphEmbeddings_path) 262 | 263 | trainingData,trainingPairs_data=dataProcessTools.getTrainingData(trainingDataFile) 264 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 265 | 266 | ''' 267 | init shared variables 268 | ''' 269 | params=init_sharedVariables(model_options) 270 | tparams=init_tparams(params) 271 | print 'Generate models ......' 272 | 273 | metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost=subgraphAttentionModelLSTMBatch.metagraphAttentionModel(model_options, tparams) 274 | 275 | print 'Generate gradients ......' 276 | grads=tensor.grad(cost,wrt=list(tparams.values())) 277 | print 'Using Adadelta to generate functions ......' 278 | this_time = time.time() 279 | print 'Start to compile and optimize, time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 280 | lr = tensor.scalar(name='lr') 281 | f_grad_shared, f_update=adadelta(lr, tparams, grads, metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost) 282 | 283 | print 'Start training models ......' 
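    # Training loop below: for every epoch and every minibatch, the batch is packed by
    # dataProcessTools.prepareDataForTraining, f_grad_shared computes the cost and updates the
    # adadelta accumulators, and f_update(lrate) applies the parameter step. A NaN/Inf cost aborts
    # training; progress is printed every dispFreq updates, and every saveFreq updates the
    # parameters are dumped with numpy.savez plus a pickled copy of model_options.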
284 | best_p = None 285 | history_cost=[] # not use 286 | 287 | start_time = time.time() 288 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 289 | uidx=0 290 | for eidx in range(max_epochs): 291 | for _, batch in allBatches: 292 | uidx += 1 293 | # prepare data for this model 294 | trainingDataForBatch=[trainingData[i] for i in batch] 295 | trainingPairsForBatch=[trainingPairs_data[i] for i in batch] 296 | triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data=dataProcessTools.prepareDataForTraining(trainingDataForBatch, trainingPairsForBatch, subpaths_map) 297 | cost=0 298 | cost=f_grad_shared(metagraphEmbedding_data, triples_matrix_data, subPaths_matrix_data, subPaths_mask_data,wordsEmbeddings_data) 299 | f_update(lrate) 300 | 301 | trainingDataForBatch=None 302 | trainingPairsForBatch=None 303 | del triples_matrix_data 304 | del subPaths_matrix_data 305 | del subPaths_mask_data 306 | del subPaths_lens_data 307 | 308 | if numpy.isnan(cost) or numpy.isinf(cost): 309 | print('bad cost detected: ', cost) 310 | return 311 | if numpy.mod(uidx, dispFreq) == 0: 312 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 313 | this_time = time.time() 314 | print 'Time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(this_time)) 315 | if saveto and numpy.mod(uidx, saveFreq) == 0: 316 | print('Saving...') 317 | if best_p is not None: 318 | params = best_p 319 | else: 320 | params = unzip(tparams) 321 | 322 | numpy.savez(saveto, history_errs=history_cost, **params) 323 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 324 | print('Done') 325 | gc.collect() 326 | 327 | end_time = time.time() 328 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 329 | print 'Training finished! 
Cost time == ', end_time-start_time,' s' 330 | 331 | -------------------------------------------------------------------------------- /model-SPE/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | data processing methods 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | get all training data, (q,a,b) tuples 16 | :type string 17 | :param trainingDataFile path 18 | ''' 19 | data=[] 20 | pairs=[] 21 | with open(trainingDataFile) as f: 22 | for l in f: 23 | tmp=l.strip().split() 24 | if len(tmp)<=0: 25 | continue 26 | arr=[] 27 | arr.append(tmp[0]+'-'+tmp[1]) 28 | arr.append(tmp[1]+'-'+tmp[0]) 29 | arr.append(tmp[0]+'-'+tmp[2]) 30 | arr.append(tmp[2]+'-'+tmp[0]) 31 | pairs.append(arr) 32 | tmp=[int(x) for x in tmp] 33 | data.append(tmp) 34 | 35 | return data,pairs 36 | 37 | def getMetagraphEmbeddings(metagraphEmbeddings_path): 38 | """ 39 | get metagraph embeddings from file 40 | :type String 41 | :param metagraphEmbeddings_path file 42 | """ 43 | size=0 44 | dimension=0 45 | wemb=[] 46 | with open(metagraphEmbeddings_path) as f: 47 | for l in f: 48 | arr=l.strip().split() 49 | if len(arr)==2: 50 | size=int(arr[0]) 51 | dimension=int(arr[1]) 52 | wemb=numpy.zeros((size,dimension)) 53 | continue 54 | id=int(arr[0]) 55 | for i in range(0,dimension): 56 | wemb[id][i]=float(arr[i+1]) 57 | return wemb,dimension,size 58 | 59 | def getWordsEmbeddings(wordsEmbeddings_path): 60 | """ 61 | get words embeddings 62 | :type String 63 | :param wordsEmbeddings_path file 64 | """ 65 | size=0 66 | dimension=0 67 | wemb=[] 68 | with open(wordsEmbeddings_path) as f: 69 | for l in f: 70 | arr=l.strip().split() 71 | if len(arr)==2: 72 | size=int(arr[0]) 73 | dimension=int(arr[1]) 74 | wemb=numpy.zeros((size,dimension)) 75 | continue 76 | id=int(arr[0]) 77 | for i in range(0,dimension): 78 | wemb[id][i]=float(arr[i+1]) 79 | return wemb,dimension,size 80 | 81 | def loadAllSubPaths(subpaths_file,maxlen=1000): 82 | """ 83 | get all subpaths (the m-paths) 84 | """ 85 | map={} 86 | with open(subpaths_file) as f: 87 | for l in f: 88 | splitByTab=l.strip().split('\t') 89 | key=splitByTab[0]+'-'+splitByTab[1] 90 | sentence=[int(y) for y in splitByTab[2].split()[:]] 91 | if len(sentence)>maxlen: 92 | continue 93 | if key in map: 94 | map[key].append(sentence) 95 | else: 96 | tmp=[] 97 | tmp.append(sentence) 98 | map[key]=tmp 99 | return map 100 | 101 | def loadAllSubPathsRomove0Path(subpaths_file,maxlen=1000,wordsEmbeddings=None): 102 | """ 103 | get all subpaths and return them in a map 104 | """ 105 | set_0=set() 106 | wemb_sum=wordsEmbeddings.sum(axis=1) 107 | for i in range(len(wemb_sum)): 108 | if wemb_sum[i]<1.: 109 | set_0.add(i) 110 | print '# of subpaths whose all elements are 0s is ',len(set_0) 111 | 112 | map={} 113 | with open(subpaths_file) as f: 114 | for l in f: 115 | splitByTab=l.strip().split('\t') 116 | key=splitByTab[0]+'-'+splitByTab[1] 117 | sentence=[int(y) for y in splitByTab[2].split()[:]] 118 | if len(sentence)>maxlen: 119 | continue 120 | flag=True 121 | for i in sentence: 122 | if i in set_0: 123 | flag=False 124 | break 125 | if not flag: 126 | continue 127 | if key in map: 128 | map[key].append(sentence) 129 | else: 130 | tmp=[] 131 | tmp.append(sentence) 132 | map[key]=tmp 133 | return map 134 | 135 | def 
prepareDataForTraining(trainingDataTriples,trainingDataPairs,subpaths_map): 136 | """ 137 | prepare data for the model 138 | """ 139 | n_triples=len(trainingDataTriples) 140 | 141 | triples_matrix=numpy.zeros([n_triples,4,2]).astype(theano.config.floatX) # @UndefinedVariable 142 | 143 | maxlen=0 144 | n_subpaths=0 145 | allPairs=[] 146 | for list in trainingDataPairs: 147 | for l in list: 148 | allPairs.append(l) 149 | for key in allPairs: 150 | if key not in subpaths_map: 151 | continue; 152 | list=subpaths_map[key] 153 | n_subpaths+=len(list) 154 | for l in list: 155 | if len(l)>maxlen: 156 | maxlen=len(l) 157 | 158 | subPaths_matrix=numpy.zeros([maxlen,n_subpaths]).astype('int64') 159 | 160 | subPaths_mask=numpy.zeros([maxlen,n_subpaths]).astype(theano.config.floatX) # @UndefinedVariable 161 | 162 | subPaths_lens=numpy.zeros([n_subpaths,]).astype('int64') 163 | 164 | current_index=0 165 | path_index=0 166 | valid_triples_count=0 167 | for i in range(len(trainingDataPairs)): 168 | pairs=trainingDataPairs[i] 169 | 170 | valid_triples_count+=1 171 | for j in range(len(pairs)): 172 | pair=pairs[j] 173 | list=None 174 | if pair in subpaths_map: 175 | list=subpaths_map[pair] 176 | if list is not None: 177 | triples_matrix[i][j][0]=current_index 178 | current_index+=len(list) 179 | triples_matrix[i][j][1]=current_index 180 | for x in range(len(list)): 181 | index=path_index+x 182 | path=list[x] 183 | subPaths_lens[index]=len(path) 184 | for y in range(len(path)): 185 | subPaths_matrix[y][index]=path[y] 186 | subPaths_mask[y][index]=1. 187 | for y in range(maxlen-len(path)): 188 | subPaths_matrix[len(path)+y][index]=path[0] 189 | path_index+=len(list) 190 | else : 191 | triples_matrix[i][j][0]=current_index 192 | current_index+=0 193 | triples_matrix[i][j][1]=current_index 194 | 195 | count=0 196 | for i in range(len(triples_matrix)): 197 | if triples_matrix[i][0][0]!=triples_matrix[i][1][1] and triples_matrix[i][2][0]!=triples_matrix[i][3][1]: 198 | count+=1 199 | triples_matrix_new=numpy.zeros([count,4,2]).astype('int64') 200 | index=0 201 | for i in range(len(triples_matrix)): 202 | if triples_matrix[i][0][0]!=triples_matrix[i][1][1] and triples_matrix[i][2][0]!=triples_matrix[i][3][1]: 203 | triples_matrix_new[index]=triples_matrix[i] 204 | index+=1 205 | triples_matrix=triples_matrix_new 206 | 207 | return triples_matrix, subPaths_matrix, subPaths_mask, subPaths_lens 208 | 209 | 210 | def prepareDataForTest(query,candidate,subpaths_map): 211 | """ 212 | prepare the data for test 213 | """ 214 | key1=bytes(query)+'-'+bytes(candidate) 215 | key2=bytes(candidate)+'-'+bytes(query) 216 | if key1 not in subpaths_map and key2 not in subpaths_map: 217 | return None,None,None 218 | subpaths=[] 219 | if key1 in subpaths_map: 220 | subpaths.extend(subpaths_map[key1]) 221 | if key2 in subpaths_map: 222 | subpaths.extend(subpaths_map[key2]) 223 | maxlen=0 224 | for subpath in subpaths: 225 | if len(subpath)>maxlen: 226 | maxlen=len(subpath) 227 | subPaths_matrix=numpy.zeros([maxlen,len(subpaths)]).astype('int64') 228 | subPaths_mask=numpy.zeros([maxlen,len(subpaths)]).astype(theano.config.floatX) # @UndefinedVariable 229 | subPaths_lens=numpy.zeros([len(subpaths),]).astype('int64') 230 | for i in range(len(subpaths)): 231 | subpath=subpaths[i] 232 | subPaths_lens[i]=len(subpath) 233 | for j in range(len(subpath)): 234 | subPaths_matrix[j][i]=subpath[j] 235 | subPaths_mask[j][i]=1. 
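    # After these loops, column i of subPaths_matrix holds the i-th sub-path between query and
    # candidate as node ids (zero-padded up to maxlen), subPaths_mask marks the valid positions
    # of that column with 1., and subPaths_lens[i] stores its true length.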
236 | 237 | return subPaths_matrix,subPaths_mask,subPaths_lens 238 | 239 | 240 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 241 | """ 242 | Used to shuffle the dataset at each iteration. 243 | """ 244 | idx_list = numpy.arange(n, dtype="int64") 245 | 246 | if shuffle: 247 | numpy.random.shuffle(idx_list) 248 | 249 | minibatches = [] 250 | minibatch_start = 0 251 | for i in range(n // minibatch_size): 252 | minibatches.append(idx_list[minibatch_start: 253 | minibatch_start + minibatch_size]) 254 | minibatch_start += minibatch_size 255 | 256 | if (minibatch_start != n): 257 | # Make a minibatch out of what is left 258 | minibatches.append(idx_list[minibatch_start:]) 259 | 260 | return zip(range(len(minibatches)), minibatches) 261 | 262 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /model-SPE/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluation tools 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | compute AP 11 | """ 12 | ideal=set(ideal) 13 | accumulation=0.0 14 | count=0 15 | for i in range(len(test)): 16 | if i>=k: 17 | break 18 | if test[i] in ideal: 19 | count+=1 20 | accumulation+=count/(i+1.0) 21 | m=len(ideal) 22 | n=k 23 | x=0 24 | if m>n: 25 | x=n 26 | else: 27 | x=m 28 | if x==0: 29 | return 0 30 | return accumulation/x 31 | 32 | 33 | def get_MAP(k,ideal_map,test_map): 34 | """ 35 | compute MAP 36 | """ 37 | accumulation=0.0 38 | for key in ideal_map.keys(): 39 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 40 | if len(ideal_map)==0: 41 | return 0 42 | return accumulation/len(ideal_map) 43 | 44 | 45 | def get_nDCG(k,ideal,test): 46 | """ 47 | compute NDCG 48 | """ 49 | ideal=set(ideal) 50 | accumulation=0.0 51 | for i in range(len(test)): 52 | if i>=k: 53 | break 54 | if test[i] in ideal: 55 | if i==0: 56 | accumulation+=1.0 57 | else: 58 | accumulation+=1.0/numpy.log2(i+1) 59 | normalization=0.0 60 | for i in range(len(ideal)): 61 | if i>=k: 62 | break 63 | if i==0: 64 | normalization+=1.0 65 | else: 66 | normalization+=1.0/numpy.log2(i+1) 67 | if normalization==0: 68 | return 0 69 | return accumulation/normalization 70 | 71 | def get_MnDCG(k,ideal_map,test_map): 72 | """ 73 | compute mean NDCG 74 | """ 75 | accumulation=0.0 76 | for key in ideal_map.keys(): 77 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 78 | if len(ideal_map)==0: 79 | return 0 80 | return accumulation/len(ideal_map) 81 | 82 | 83 | if __name__=='__main__': 84 | ideal=['a'] 85 | test=['b','a'] 86 | k=10 87 | print get_nDCG(k, ideal, test) 88 | # ideal={'q':['a','b','c'],'p':['a','b','c','d','e']} 89 | # test={'q':['b','a','m','c','d','n'],'p':['b','a','m','c','d','n']} 90 | # k=4 91 | # print get_MnDCG(k, ideal, test) -------------------------------------------------------------------------------- /model-SPE/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Main file of MPE 4 | ''' 5 | 6 | import ConfigParser 7 | import string, os, sys 8 | import attentionBatch 9 | import modelProcessAndAssess 10 | import gc 11 | import time 12 | import subprocess 13 | 14 | if __name__=='__main__': 15 | 16 | # read config file 17 | cf = ConfigParser.SafeConfigParser() 18 | cf.read("pythonParamsConfig") 19 | 20 | main_dir=cf.get("param", "root_dir") # main work dir 21 | dataset_name=cf.get("param", "dataset_name") # the name of 
one dataset 22 | suffix=cf.get("param", "suffix") # the suffix of dataset, such as 10,100,1000 23 | class_name=cf.get("param", "class_name") # the relation name of data 24 | index=cf.get("param", "index") # the index of the dataset file 25 | 26 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 27 | metagraphEmbeddings_path=cf.get("param", "metagraphEmbeddings_path") # the file path of metagraph embeddings 28 | wordsEmbeddings_data=None # words embeddings 29 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # the file path of words embeddings 30 | 31 | wordsSize=cf.getint("param", "wordsSize") # the size of words vocabulary 32 | subpaths_map=None # contains sub-paths 33 | subpaths_file=cf.get("param", "subpaths_file") # the file which contains sub-paths 34 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 35 | maxlen=cf.getint("param", "maxlen") # Sequence longer then this get ignored 36 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 37 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 38 | objective_function_method=cf.get("param", "objective_function_method") # loss function, we use sigmoid here 39 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter in loss function, beta 40 | lrate=cf.getfloat("param", "lrate") # learning rate 41 | max_epochs=cf.getint("param", "max_epochs") # the max epochs for training 42 | 43 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 44 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 45 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 
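    # For example, with the defaults in pythonParamsConfig (root_dir=D:/dataset/icde2016/dataset,
    # dataset_name=facebook, suffix=10, class_name=classmate, index=1) the composed paths are roughly:
    #   trainingDataFile -> D:/dataset/icde2016/dataset/facebook.splits/train.10/train_classmate_1
    #   saveto           -> D:/dataset/icde2016/dataset/facebook.trainModels/train.10/train_classmate_1.npz
    # (os.path.join uses the platform separator, so the actual separators may differ.)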
46 | 47 | metagraph_embedding_dimension=cf.getint("param", "metagraph_embedding_dimension") # metagraph embedding dimension 48 | dimension_A=cf.getint("param", "dimension_A") # the dimension of attention when computing the m-node embedding 49 | dimension_lstm=cf.getint("param", "dimension_lstm") # dimension of lstm parameters 50 | dimension_B=cf.getint("param", "dimension_B") # the dimension of attention when computing the m-path embedding 51 | dimension_C=cf.getint("param", "dimension_C") # the dimension of attention when computing the m-paths embedding 52 | 53 | # decay parameters 54 | decay_Q_A=cf.getfloat("param", "decay_Q_A") 55 | decay_b_A=cf.getfloat("param", "decay_b_A") 56 | decay_eta_A=cf.getfloat("param", "decay_eta_A") 57 | 58 | decay_lstm_W=cf.getfloat("param", "decay_lstm_W") 59 | decay_lstm_U=cf.getfloat("param", "decay_lstm_U") 60 | decay_lstm_b=cf.getfloat("param", "decay_lstm_b") 61 | decay_w=cf.getfloat("param", "decay_w") 62 | 63 | decay_Q_B=cf.getfloat("param", "decay_Q_B") 64 | decay_b_B=cf.getfloat("param", "decay_b_B") 65 | decay_eta_B=cf.getfloat("param", "decay_eta_B") 66 | 67 | decay_Q_C=cf.getfloat("param", "decay_Q_C") 68 | decay_b_C=cf.getfloat("param", "decay_b_C") 69 | decay_eta_C=cf.getfloat("param", "decay_eta_C") 70 | 71 | # test and ideal data 72 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # test data file 73 | top_num=cf.getint("param", "top_num") # top num in experiments 74 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # ideal data file 75 | 76 | # training 77 | attentionBatch.metagraphAttentionTraining( 78 | trainingDataFile, 79 | metagraphEmbeddings_path, 80 | wordsEmbeddings_data, 81 | wordsEmbeddings_path, 82 | wordsSize, 83 | subpaths_map, 84 | subpaths_file, 85 | maxlen_subpaths, 86 | maxlen, 87 | batch_size, 88 | is_shuffle_for_batch, 89 | objective_function_method, 90 | objective_function_param, 91 | lrate, 92 | max_epochs, 93 | dispFreq, 94 | saveFreq, 95 | saveto, 96 | metagraph_embedding_dimension, 97 | dimension_A, 98 | dimension_lstm, 99 | dimension_B, 100 | dimension_C, 101 | decay_Q_A, 102 | decay_b_A, 103 | decay_eta_A, 104 | decay_lstm_W, 105 | decay_lstm_U, 106 | decay_lstm_b, 107 | decay_Q_B, 108 | decay_b_B, 109 | decay_eta_B, 110 | decay_Q_C, 111 | decay_b_C, 112 | decay_eta_C, 113 | decay_w) 114 | 115 | time.sleep(5) # sleep 116 | 117 | # child = subprocess.Popen("nohup python experimentForOneFileByParams_second.py &",shell=True) 118 | 119 | start_time = time.time() 120 | print 'This time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 121 | a=gc.collect() 122 | print 'First time to release number of objects from RAM ==', a 123 | b=gc.collect() 124 | print 'Second time to release number of objects from RAM ==', b 125 | c=gc.collect() 126 | print 'Third time to release number of objects from RAM ==', c 127 | 128 | # get the process model 129 | func=modelProcessAndAssess.get_metagraphAttentionModel( 130 | saveto, 131 | metagraph_embedding_dimension, 132 | dimension_A, 133 | dimension_lstm, 134 | dimension_B, 135 | dimension_C) 136 | 137 | # gc 138 | gc.collect() 139 | gc.collect() 140 | gc.collect() 141 | 142 | # test and get the results 143 | MAP,MnDCG=modelProcessAndAssess.compute_metagraphAttention( 144 | wordsEmbeddings_data, 145 | wordsEmbeddings_path, 146 | metagraphEmbeddings_path, 147 | wordsSize, 148 | subpaths_map, 149 | subpaths_file, 150 | maxlen_subpaths, 151 | test_data_file, 152 | 
top_num, 153 | ideal_data_file, 154 | func) 155 | 156 | print '......' 157 | end_time = time.time() 158 | print 'Final time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 159 | print 'MAP =', MAP 160 | print 'NDCG =', MnDCG -------------------------------------------------------------------------------- /model-SPE/lstmModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from __future__ import print_function 3 | import os 4 | 5 | import six.moves.cPickle as pickle # @UnresolvedImport 6 | # import six.moves as moves 7 | 8 | from collections import OrderedDict 9 | import sys 10 | import time 11 | 12 | import numpy 13 | import theano 14 | from theano import config 15 | import theano.tensor as tensor 16 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 17 | 18 | 19 | # theano.config.floatX = 'float32' 20 | # Set the random number generators' seeds for consistency 21 | # SEED = 123 22 | # numpy.random.seed(SEED) 23 | 24 | def numpy_floatX(data): 25 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable 26 | 27 | 28 | def _p(pp, name): 29 | return '%s_%s' % (pp, name) 30 | 31 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 32 | nsteps = state_below.shape[0] 33 | if state_below.ndim == 3: 34 | n_samples = state_below.shape[1] 35 | else: 36 | n_samples = 1 37 | 38 | assert mask is not None 39 | 40 | def _slice(_x, n, dim): 41 | if _x.ndim == 3: 42 | return _x[:, :, n * dim:(n + 1) * dim] 43 | return _x[:, n * dim:(n + 1) * dim] 44 | 45 | def _step(m_, x_, h_, c_): 46 | preact = tensor.dot(h_, tparams['lstm_U']) 47 | preact += x_ 48 | 49 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dimension_lstm'])) # input gate 50 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dimension_lstm'])) # forget gate 51 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dimension_lstm'])) # output gate 52 | c = tensor.tanh(_slice(preact, 3, options['dimension_lstm'])) # init cell 53 | 54 | c = f * c_ + i * c 55 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 56 | 57 | h = o * tensor.tanh(c) 58 | h = m_[:, None] * h + (1. 
- m_)[:, None] * h_ 59 | 60 | return h, c 61 | state_below = (tensor.dot(state_below, tparams['lstm_W']) + tparams['lstm_b']) 62 | 63 | dim_proj = options['dimension_lstm'] 64 | rval, updates = theano.scan(_step, 65 | sequences=[mask, state_below], 66 | outputs_info=[tensor.alloc(numpy_floatX(0.), 67 | n_samples, 68 | dim_proj), 69 | tensor.alloc(numpy_floatX(0.), 70 | n_samples, 71 | dim_proj)], 72 | name=_p(prefix, '_layers'), 73 | n_steps=nsteps) # maxlen 74 | return rval[0] 75 | 76 | def build_model(tparams, options, x, mask): 77 | 78 | proj = lstm_layer(tparams, x, options, 79 | prefix='lstm', 80 | mask=mask) 81 | return proj 82 | 83 | 84 | def get_lstm( 85 | model_options, # model config parameters 86 | tparams, # theano shared variables 87 | x, # a sequence 88 | x_mask, # mask matrix 89 | ): 90 | 91 | proj = build_model(tparams, model_options, x, x_mask) 92 | return proj 93 | -------------------------------------------------------------------------------- /model-SPE/modelProcessAndAssess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | Get the process model and test 5 | ''' 6 | 7 | import numpy 8 | import theano 9 | from theano import tensor 10 | from collections import OrderedDict 11 | import dataProcessTools 12 | import toolsFunction 13 | import evaluateTools 14 | import gc 15 | import subgraphAttentionProcessModelLSTMBatch 16 | 17 | 18 | def get_metagraphAttentionModel( 19 | 20 | model_params_path='', # model save path 21 | metagraph_embedding_dimension=10, # metagraph embedding dimension 22 | dimension_A=10, # the dimension of attention when computing the m-node embedding 23 | dimension_lstm=10, # dimension of lstm parameters 24 | dimension_B=10, # the dimension of attention when computing the m-path embedding 25 | dimension_C=10, # the dimension of attention when computing the m-paths embedding 26 | ): 27 | """ 28 | get the MPE process Model 29 | """ 30 | model_options = locals().copy() 31 | 32 | tparams = OrderedDict() 33 | tparams['Q_A']=None 34 | tparams['b_A']=None 35 | tparams['eta_A']=None 36 | tparams['lstm_W']=None 37 | tparams['lstm_U']=None 38 | tparams['lstm_b']=None 39 | tparams['Q_B']=None 40 | tparams['b_B']=None 41 | tparams['eta_B']=None 42 | tparams['Q_C']=None 43 | tparams['b_C']=None 44 | tparams['eta_C']=None 45 | tparams['w']=None 46 | tparams=load_params(model_params_path, tparams) 47 | 48 | metagraphEmbeddings, subPaths_matrix, subPaths_mask, wordsEmbeddings, score=subgraphAttentionProcessModelLSTMBatch.metagraphAttentionProcessModel(model_options, tparams) 49 | func=theano.function([metagraphEmbeddings, subPaths_matrix, subPaths_mask, wordsEmbeddings], score) 50 | return func 51 | 52 | def load_params(path, params): 53 | """ 54 | load all the parameters 55 | """ 56 | pp = numpy.load(path) 57 | for kk, vv in params.items(): 58 | if kk not in pp: 59 | raise Warning('%s is not in the archive' % kk) 60 | params[kk] = pp[kk] 61 | 62 | return params 63 | 64 | def compute_metagraphAttention( 65 | wordsEmbeddings=None, # words embeddings 66 | wordsEmbeddings_path=None, # the file path of words embeddings 67 | metagraphEmbeddings_path=None, # the file path of metagraph embeddings 68 | wordsSize=0, # the size of words vocabulary 69 | subpaths_map=None, # contains sub-paths 70 | subpaths_file=None, # the file which contains sub-paths 71 | maxlen_subpaths=1000, # the max length for sub-paths 72 | 73 | test_data_file='', # test data file 74 | top_num=10, # top num in 
experiments 75 | ideal_data_file='', # ideal data file 76 | func=None, # the MPE process model 77 | ): 78 | """ 79 | evaluate the MPE model 80 | """ 81 | model_options = locals().copy() 82 | 83 | if wordsEmbeddings is None: 84 | if wordsEmbeddings_path is not None: 85 | wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 86 | else: 87 | print 'There is not path for wordsEmbeddings, exit!!!' 88 | exit(0) 89 | 90 | if subpaths_map is None: 91 | if subpaths_file is not None: 92 | subpaths_map=dataProcessTools.loadAllSubPathsRomove0Path(subpaths_file, maxlen_subpaths, wordsEmbeddings) 93 | else: 94 | print 'There is not path for sub-paths, exit!!!' 95 | exit(0) 96 | 97 | metagraphEmbedding_data, metagraphDimension, metagraphSize=dataProcessTools.getMetagraphEmbeddings(metagraphEmbeddings_path) 98 | 99 | line_count=0 100 | test_map={} 101 | print 'Compute MAP and nDCG for file ',test_data_file 102 | with open(test_data_file) as f: 103 | for l in f: 104 | arr=l.strip().split() 105 | query=int(arr[0]) 106 | map={} 107 | for i in range(1,len(arr)): 108 | candidate=int(arr[i]) 109 | subPaths_matrix_data,subPaths_mask_data,subPaths_lens_data=dataProcessTools.prepareDataForTest(query, candidate, subpaths_map) 110 | if subPaths_matrix_data is None and subPaths_mask_data is None and subPaths_lens_data is None: 111 | map[candidate]=-1000. 112 | else: 113 | value=func(metagraphEmbedding_data, subPaths_matrix_data, subPaths_mask_data, wordsEmbeddings) 114 | map[candidate]=value 115 | del subPaths_matrix_data 116 | del subPaths_mask_data 117 | del subPaths_lens_data 118 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) 119 | test_map[line_count]=tops_in_line 120 | line_count+=1 121 | map=None 122 | gc.collect() 123 | 124 | 125 | line_count=0 126 | ideal_map={} 127 | with open(ideal_data_file) as f: 128 | for l in f: 129 | arr=l.strip().split() 130 | arr=[int(x) for x in arr] 131 | ideal_map[line_count]=arr[1:] 132 | line_count+=1 133 | 134 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) 135 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) 136 | 137 | return MAP,MnDCG 138 | 139 | -------------------------------------------------------------------------------- /model-SPE/pythonParamsConfig: -------------------------------------------------------------------------------- 1 | [param] 2 | 3 | ############################################ 4 | # training data config 5 | ############################################ 6 | # main work dir 7 | root_dir = D:/dataset/icde2016/dataset 8 | # the name of one dataset, such as facebook, linkedin 9 | dataset_name = facebook 10 | # the suffix of dataset, such as 10,100,1000 11 | suffix = 10 12 | # the relation name of data, such as classmate,family 13 | class_name = classmate 14 | # the index of the dataset file 15 | index = 1 16 | 17 | ############################################ 18 | # offline results files 19 | ############################################ 20 | # metagraph embedding path 21 | metagraphEmbeddings_path = %(root_dir)s/metagraph-structural-similarity/%(dataset_name)s-metagraph.embedding 22 | # words embeddings path 23 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/vectorSaveFile 24 | # sub-paths paths 25 | subpaths_file = %(root_dir)s/%(dataset_name)s/newSubpathsSaveFile 26 | 27 | ############################################ 28 | # experiment parameters - do not need to change frequently 29 | ############################################ 30 | # the max length for sub-paths 31 | 
maxlen_subpaths = 1000 32 | # the size of words vocabulary 33 | wordsSize = 1000000 34 | # Sequence longer than this get ignored 35 | maxlen = 1000 36 | # use a batch for training. This is the size of this batch. 37 | batch_size = 10 38 | # if need shuffle for training 39 | is_shuffle_for_batch = True 40 | # the frequences for display 41 | dispFreq = 5 42 | # the frequences for saving the parameters 43 | saveFreq = 5 44 | # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. It will be generated in the code. 45 | saveto = 46 | # the top num to predict 47 | top_num = 10 48 | # results file 49 | result_save_file = %(root_dir)s/%(dataset_name)s.results/train.%(suffix)s/class_name 50 | 51 | ############################################ 52 | # experiment parameters - need to tune frequently 53 | ############################################ 54 | # learning rate 55 | lrate = 0.0001 56 | # metagraph embedding dimension 57 | metagraph_embedding_dimension = 32 58 | # the dimension of attention when computing the m-node embedding 59 | dimension_A = 12 60 | # dimension of lstm parameters 61 | dimension_lstm = 12 62 | # the dimension of attention when computing the m-path embedding 63 | dimension_B = 12 64 | # the dimension of attention when computing the m-paths embedding 65 | dimension_C = 12 66 | 67 | # loss function, we use sigmoid here 68 | objective_function_method = sigmoid 69 | # the parameter in loss function, beta 70 | objective_function_param = 0.1 71 | # the max epochs for training 72 | max_epochs = 100 73 | 74 | # decay 75 | decay_Q_A=0.00001 76 | decay_b_A=0.00001 77 | decay_eta_A=0.00001 78 | 79 | decay_lstm_W = 0.00001 80 | decay_lstm_U = 0.00001 81 | decay_lstm_b = 0.00001 82 | 83 | decay_Q_B=0.00001 84 | decay_b_B=0.00001 85 | decay_eta_B=0.00001 86 | 87 | decay_Q_C=0.00001 88 | decay_b_C=0.00001 89 | decay_eta_C=0.00001 90 | 91 | decay_w = 0.00001 92 | 93 | -------------------------------------------------------------------------------- /model-SPE/subgraphAttentionModelLSTMBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | the MPE model 5 | ''' 6 | import os 7 | import numpy 8 | import theano 9 | from theano import tensor 10 | from theano.ifelse import ifelse 11 | import lstmModel 12 | 13 | def metagraphAttentionModel(options,tparams): 14 | """ 15 | the MPE model 16 | """ 17 | metagraphEmbeddings=tensor.matrix('metagraphEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable # shape=#(metagraph)*len(metaEmbeding) 18 | trainingParis=tensor.tensor3('trainingParis',dtype='int64') # 3D tensor,shape=#(triples)*4*2 19 | subPaths_matrix=tensor.matrix('subPaths_matrix',dtype='int64') # shape=maxlen*#(sub-paths) 20 | subPaths_mask=tensor.matrix('subPaths_mask',dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*#(sub-paths) 21 | wordsEmbeddings=tensor.matrix('wordsEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable # #(words)*word_dimension 22 | 23 | metagraphBeta=tensor.exp(tensor.dot(tensor.nnet.sigmoid(tensor.dot(metagraphEmbeddings, tparams['Q_A'])+tparams['b_A']), tparams['eta_A'])) 24 | 25 | def _processTriple(fourPairs,lossSum): 26 | 27 | def _processSubpathsBatch(start, end): 28 | x=subPaths_matrix[:,start:end] # shape=maxlen*nsamples 29 | x_mask=subPaths_mask[:,start:end] 30 | 31 | wordEmb=wordsEmbeddings[x] # shape=maxlen*nsamples*len(wordsEmbeddings) 32 | tmp=wordEmb*metagraphBeta 33 | 
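# Note (added comment) on the next two statements, the subgraph-level attention behind the m-node
# embedding: the last axis of wordEmb is aligned with the set of subgraphs, since it broadcasts
# against metagraphBeta above and against metagraphEmbeddings just below. Normalizing tmp along
# that axis gives softmax0, a per-node attention distribution over the subgraphs; weighting
# metagraphEmbeddings by (softmax0*wordEmb) and summing over the subgraph axis then yields one
# m-node embedding of size metagraph_embedding_dimension for every position of every sub-path in
# the batch.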
softmax0=tmp/tmp.sum(axis=-1, keepdims=True) # shape=maxlen*nsamples*len(wordsEmbeddings) 34 | subpaths=((softmax0*wordEmb)[:,:,:,None]*metagraphEmbeddings).sum(axis=2) 35 | 36 | h3Dmatrix=lstmModel.get_lstm(options, tparams, subpaths, x_mask) 37 | beta1=tensor.exp(tensor.dot(tensor.nnet.sigmoid(tensor.dot(h3Dmatrix, tparams['Q_B'])+tparams['b_B']), tparams['eta_B'])) 38 | temp=x_mask*beta1 39 | # softmax1 shape=maxlen*nsamples 40 | softmax1=temp/temp.sum(axis=0, keepdims=True) 41 | # shape=nsamples*lstm_dimension 42 | pathsEmb=(softmax1[:,:,None]*h3Dmatrix).sum(axis=0) 43 | 44 | return pathsEmb 45 | 46 | def iftFunc(): 47 | embx=numpy.zeros(options['dimension_lstm'],).astype(theano.config.floatX) # @UndefinedVariable 48 | return embx 49 | 50 | def iffFunc(start,end): 51 | embx=None 52 | rval2=_processSubpathsBatch(start, end) 53 | # beta shape=paths_num * 0 54 | beta=tensor.dot(tensor.nnet.sigmoid(tensor.dot(rval2, tparams['Q_C'])+tparams['b_C']), tparams['eta_C']) 55 | softmax2=tensor.nnet.softmax(beta)[0] 56 | embx=(softmax2[:,None]*rval2).sum(axis=0) # shape=dimension_lstm*0 57 | 58 | return embx 59 | 60 | # get emb1 61 | start=fourPairs[0][0] 62 | end=fourPairs[1][1] 63 | emb1=None 64 | emb1=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) 65 | # get emb2 66 | start=fourPairs[2][0] 67 | end=fourPairs[3][1] 68 | emb2=None 69 | emb2=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) 70 | 71 | loss=0 72 | param=options['objective_function_param'] 73 | if options['objective_function_method']=='sigmoid': 74 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # sigmoid 75 | else: # hinge-loss 76 | value=param + tensor.dot(emb2,tparams['w']) - tensor.dot(emb1,tparams['w']) 77 | loss=value*(value>0) 78 | 79 | return loss+lossSum 80 | 81 | rval,update=theano.scan( 82 | _processTriple, 83 | sequences=trainingParis, 84 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable 85 | ) 86 | cost=rval[-1] 87 | 88 | cost+=options['decay_Q_A']*(tparams['Q_A'] ** 2).sum() 89 | cost+=options['decay_Q_A']*(tparams['b_A'] ** 2).sum() 90 | cost+=options['decay_Q_A']*(tparams['eta_A'] ** 2).sum() 91 | cost+=options['decay_Q_A']*(tparams['lstm_W'] ** 2).sum() 92 | cost+=options['decay_Q_A']*(tparams['lstm_U'] ** 2).sum() 93 | cost+=options['decay_Q_A']*(tparams['lstm_b'] ** 2).sum() 94 | cost+=options['decay_Q_A']*(tparams['Q_B'] ** 2).sum() 95 | cost+=options['decay_Q_A']*(tparams['b_B'] ** 2).sum() 96 | cost+=options['decay_Q_A']*(tparams['eta_B'] ** 2).sum() 97 | cost+=options['decay_Q_A']*(tparams['Q_C'] ** 2).sum() 98 | cost+=options['decay_Q_A']*(tparams['b_C'] ** 2).sum() 99 | cost+=options['decay_Q_A']*(tparams['eta_C'] ** 2).sum() 100 | cost+=options['decay_Q_A']*(tparams['w'] ** 2).sum() 101 | 102 | # return MPE model 103 | return metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost 104 | -------------------------------------------------------------------------------- /model-SPE/subgraphAttentionProcessModelLSTMBatch.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | attention based metagraph process model 5 | ''' 6 | 7 | import numpy 8 | import theano 9 | from theano import tensor 10 | from theano.ifelse import ifelse 11 | import lstmModel 12 | # theano.config.floatX = 'float32' 13 | 14 | def metagraphAttentionProcessModel(options,tparams): 15 | """ 16 | the MPE 
process model 17 | """ 18 | metagraphEmbeddings=tensor.matrix('metagraphEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable # shape=#(metagraph)*len(metaEmbeding) 19 | subPaths_matrix=tensor.matrix('subPaths_matrix',dtype='int64') # shape=maxlen*#(sub-paths) 20 | subPaths_mask=tensor.matrix('subPaths_mask',dtype=theano.config.floatX) # @UndefinedVariable # shape=maxlen*#(sub-paths) 21 | wordsEmbeddings=tensor.matrix('wordsEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable # #(words)*word_dimension 22 | 23 | metagraphBeta=tensor.exp(tensor.dot(tensor.nnet.sigmoid(tensor.dot(metagraphEmbeddings, tparams['Q_A'])+tparams['b_A']), tparams['eta_A'])) 24 | 25 | def _processSubpathsBatch(): 26 | x=subPaths_matrix 27 | x_mask=subPaths_mask 28 | 29 | wordEmb=wordsEmbeddings[x] # shape=len(path)*1*len(wordsEmbeddings) 30 | tmp=wordEmb*metagraphBeta 31 | softmax0=tmp/tmp.sum(axis=-1, keepdims=True) # shape=len(path)*1*len(wordsEmbeddings) 32 | subpaths=((softmax0*wordEmb)[:,:,:,None]*metagraphEmbeddings).sum(axis=2) 33 | 34 | 35 | h3Dmatrix=lstmModel.get_lstm(options, tparams, subpaths, x_mask) 36 | 37 | beta1=tensor.exp(tensor.dot(tensor.nnet.sigmoid(tensor.dot(h3Dmatrix, tparams['Q_B'])+tparams['b_B']), tparams['eta_B'])) 38 | temp=x_mask*beta1 39 | softmax1=temp/temp.sum(axis=0, keepdims=True) 40 | # shape=dimension_lstm*0 41 | pathsEmb=(softmax1[:,:,None]*h3Dmatrix).sum(axis=0) 42 | 43 | return pathsEmb 44 | 45 | rval2=_processSubpathsBatch() 46 | beta=tensor.dot(tensor.nnet.sigmoid(tensor.dot(rval2, tparams['Q_C'])+tparams['b_C']), tparams['eta_C']) 47 | softmax2=tensor.nnet.softmax(beta)[0] 48 | embx=(softmax2[:,None]*rval2).sum(axis=0) # shape=dimension_lstm*0 49 | 50 | score=tensor.dot(embx,tparams['w']) 51 | 52 | return metagraphEmbeddings, subPaths_matrix, subPaths_mask, wordsEmbeddings, score -------------------------------------------------------------------------------- /model-SPE/toolsFunction.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | some tools methods 4 | ''' 5 | 6 | def mapSortByValueDESC(map,top): 7 | """ 8 | sort by value desc 9 | """ 10 | if top>len(map): 11 | top=len(map) 12 | items=map.items() 13 | backitems=[[v[1],v[0]] for v in items] 14 | backitems.sort(reverse=True) 15 | e=[ backitems[i][1] for i in range(top)] 16 | return e 17 | 18 | 19 | def mapSortByValueASC(map,top): 20 | """ 21 | sort by value asc 22 | """ 23 | if top>len(map): 24 | top=len(map) 25 | items=map.items() 26 | backitems=[[v[1],v[0]] for v in items] 27 | backitems.sort() 28 | e=[ backitems[i][1] for i in range(top)] 29 | return e 30 | 31 | -------------------------------------------------------------------------------- /model-autoencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vwz/SPE/17fc878bd394569586073747d0bd43ce9b558f26/model-autoencoder/__init__.py -------------------------------------------------------------------------------- /model-autoencoder/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vwz/SPE/17fc878bd394569586073747d0bd43ce9b558f26/model-autoencoder/__init__.pyc -------------------------------------------------------------------------------- /model-autoencoder/autoencoderCalculate.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Created on 2017年1月13日 4 | @author: Liu 
Zemin 5 | Functions and Application : 6 | 7 | ''' 8 | 9 | import numpy 10 | import theano 11 | from theano import tensor 12 | 13 | 14 | def autoencoderCalculateModel(tparams): 15 | """ 16 | aotuencoder compute model 17 | """ 18 | 19 | x=tensor.vector('x',dtype=theano.config.floatX) # @UndefinedVariable 20 | 21 | y=tensor.nnet.sigmoid(tensor.dot(tparams['w'],x)+tparams['b1']) 22 | 23 | # 返回结果 24 | return x, y 25 | -------------------------------------------------------------------------------- /model-autoencoder/autoencoderModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | autoencoder model 5 | ''' 6 | 7 | import numpy 8 | import theano 9 | from theano import tensor 10 | 11 | 12 | def autoencoderModel(options, tparams): 13 | """ 14 | autoencoder model 15 | """ 16 | 17 | x=tensor.matrix('x',dtype=theano.config.floatX) # @UndefinedVariable 18 | 19 | def _objectiveFunc(index, loss): 20 | y=tensor.nnet.sigmoid(tensor.dot(tparams['w'],x[index])+tparams['b1']) 21 | # _x=tensor.nnet.sigmoid(tensor.dot(tensor.transpose(tparams['w']),y)+tparams['b2']) 22 | _x=tensor.nnet.sigmoid(tensor.dot(tparams['w2'],y)+tparams['b2']) 23 | p=((x[index]-_x)**2).sum() 24 | return loss+p 25 | 26 | rval, update = theano.scan( 27 | _objectiveFunc, 28 | sequences=tensor.arange(x.shape[0]), 29 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable 30 | ) 31 | 32 | cost=rval[-1] 33 | # cost=0 34 | 35 | cost+=options['decay_w']*tparams['w'].norm(2) 36 | cost+=options['decay_w']*tparams['w2'].norm(2) 37 | cost+=options['decay_b1']*tparams['b1'].norm(2) 38 | cost+=options['decay_b2']*tparams['b2'].norm(2) 39 | 40 | return x, cost 41 | 42 | -------------------------------------------------------------------------------- /model-autoencoder/autoencoderTraining.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | autoencoder training for metagraph embedding 5 | ''' 6 | 7 | import numpy 8 | import theano 9 | from theano import tensor 10 | from collections import OrderedDict 11 | import time 12 | import six.moves.cPickle as pickle # @UnresolvedImport 13 | from theano import config 14 | import dataToolsForAutoencoder 15 | import autoencoderModel 16 | import autoencoderCalculate 17 | 18 | # Set the random number generators' seeds for consistency 19 | SEED = 123 20 | numpy.random.seed(SEED) 21 | 22 | def numpy_floatX(data): 23 | return numpy.asarray(data, dtype=config.floatX) # @UndefinedVariable 24 | 25 | 26 | def gradientDescentGroup(learning_rate, tparams, grads, x, cost): 27 | update=[(shared,shared-learning_rate*g) for g,shared in zip(grads,tparams.values())] 28 | func=theano.function([x],cost,updates=update,on_unused_input='ignore') 29 | return func 30 | 31 | def adadelta(lr, tparams, grads, x, cost): 32 | """ 33 | An adaptive learning rate optimizer 34 | ---------- 35 | lr : Theano SharedVariable 36 | Initial learning rate 37 | tpramas: Theano SharedVariable 38 | Model parameters 39 | grads: Theano variable 40 | Gradients of cost w.r.t to parameres 41 | x: Theano variable 42 | Model inputs 43 | mask: Theano variable 44 | Sequence mask 45 | y: Theano variable 46 | Targets 47 | cost: Theano variable 48 | Objective fucntion to minimize 49 | 50 | Notes 51 | ----- 52 | For more information, see [ADADELTA]_. 53 | 54 | .. [ADADELTA] Matthew D. 
Zeiler, *ADADELTA: An Adaptive Learning 55 | Rate Method*, arXiv:1212.5701. 56 | """ 57 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 58 | name='%s_grad' % k) 59 | for k, p in tparams.items()] 60 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 61 | name='%s_rup2' % k) 62 | for k, p in tparams.items()] 63 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 64 | name='%s_rgrad2' % k) 65 | for k, p in tparams.items()] 66 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 67 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 68 | for rg2, g in zip(running_grads2, grads)] 69 | f_grad_shared = theano.function([x], cost, updates=zgup + rg2up, 70 | on_unused_input='ignore', 71 | name='adadelta_f_grad_shared') 72 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 73 | for zg, ru2, rg2 in zip(zipped_grads, 74 | running_up2, 75 | running_grads2)] 76 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 77 | for ru2, ud in zip(running_up2, updir)] 78 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 79 | f_update = theano.function([lr], [], updates=ru2up + param_up, 80 | on_unused_input='ignore', 81 | name='adadelta_f_update') 82 | 83 | return f_grad_shared, f_update 84 | 85 | 86 | def init_sharedVariables(options): 87 | """ 88 | init shared variables 89 | """ 90 | print 'init shared Variables......' 91 | params = OrderedDict() 92 | w = numpy.random.rand(options['dimension2'], options['dimension1']) 93 | w = w*2.0-1.0 94 | params['w']=w.astype(config.floatX) # @UndefinedVariable 95 | w2 = numpy.random.rand(options['dimension1'], options['dimension2']) 96 | w2 = w2*2.0-1.0 97 | params['w2']=w2.astype(config.floatX) # @UndefinedVariable 98 | 99 | 100 | b1 = numpy.random.rand(options['dimension2'], ) 101 | b1 = b1*2.0-1.0 102 | params['b1']=b1.astype(config.floatX) # @UndefinedVariable 103 | b2 = numpy.random.rand(options['dimension1'], ) 104 | b2 = b2*2.0-1.0 105 | params['b2']=b2.astype(config.floatX) # @UndefinedVariable 106 | return params 107 | 108 | def init_tparams(params): 109 | tparams = OrderedDict() 110 | for kk, pp in params.items(): 111 | tparams[kk] = theano.shared(params[kk], name=kk) 112 | return tparams 113 | 114 | def unzip(zipped): 115 | """ 116 | When we pickle the model. Needed for the GPU stuff. 
117 | """ 118 | new_params = OrderedDict() 119 | for kk, vv in zipped.items(): 120 | new_params[kk] = vv.get_value() 121 | return new_params 122 | 123 | def load_params(path, params): 124 | """ 125 | load the trained parameters from file 126 | """ 127 | pp = numpy.load(path) 128 | for kk, vv in params.items(): 129 | if kk not in pp: 130 | raise Warning('%s is not in the archive' % kk) 131 | params[kk] = pp[kk] 132 | 133 | return params 134 | 135 | def autoencoderTraining( 136 | metagraphStructuralSimFilePath='', # metagraph structural similarity file 137 | dimension1=981, # dimension of input vectors 138 | dimension2=30, # dimension of the result 139 | lrate=0.0001, # learning rate 140 | max_epochs=100, # epochs 141 | decay_w=0.0001, # decay 142 | decay_b1=0.0001, # decay 143 | decay_b2=0.0001, # decay 144 | batch_size=20, # training batch size 145 | is_shuffle_for_batch=True, # is shuffle for batch 146 | dispFreq=5, # display frequence 147 | saveFreq=5, # parameters save frequence 148 | saveto='', # save destination 149 | embeddingsSaveFile='', # result file 150 | ): 151 | """ 152 | training method 153 | """ 154 | model_options = locals().copy() 155 | 156 | """ 157 | get all data 158 | """ 159 | # 的structural similarity matrix 160 | x_allData=dataToolsForAutoencoder.readMetagraphStructuralSim(metagraphStructuralSimFilePath) 161 | 162 | # batch size 163 | allBatches=dataToolsForAutoencoder.get_minibatches_idx(len(x_allData), batch_size, is_shuffle_for_batch) 164 | 165 | """ 166 | init shared variables 167 | """ 168 | params=init_sharedVariables(model_options) 169 | tparams=init_tparams(params) 170 | print 'Generate SelectSignificantFeatures model ......' 171 | x, cost=autoencoderModel.autoencoderModel(model_options, tparams) 172 | 173 | print 'Generate gradients ......' 174 | grads=tensor.grad(cost,wrt=list(tparams.values())) 175 | 176 | print 'Using Adadelta to generate functions ......' 177 | lr = tensor.scalar(name='lr') 178 | f_grad_shared, f_update=adadelta(lr, tparams, grads, x, cost) 179 | 180 | """ 181 | training 182 | """ 183 | start_time = time.time() 184 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 185 | best_p = None 186 | history_cost=[] 187 | uidx=0 # update index 188 | for eidx in range(max_epochs): 189 | for _, batch in allBatches: 190 | uidx+=1 191 | x_data=[x_allData[i] for i in batch] 192 | x_data=numpy.asarray(x_data) 193 | 194 | cost_data=f_grad_shared(x_data) 195 | f_update(lrate) 196 | 197 | if numpy.isnan(cost_data) or numpy.isinf(cost_data): 198 | print('bad cost detected: ', cost_data) 199 | return 200 | if numpy.mod(uidx, dispFreq) == 0: 201 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost_data 202 | if saveto and numpy.mod(uidx, saveFreq) == 0: 203 | print('Saving...') 204 | if best_p is not None: 205 | params = best_p 206 | else: 207 | params = unzip(tparams) 208 | 209 | numpy.savez(saveto, history_errs=history_cost, **params) 210 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 211 | print('Done') 212 | 213 | end_time = time.time() 214 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 215 | print 'Training finished! 
Cost time == ', end_time-start_time,' s' 216 | 217 | x,y=autoencoderCalculate.autoencoderCalculateModel(tparams) 218 | calculateF=theano.function([x],y) 219 | output = open(embeddingsSaveFile, 'w') 220 | output.write(bytes(dimension1)+'\t'+bytes(dimension2)+'\n') 221 | for i in range(len(x_allData)): 222 | embedding=calculateF(x_allData[i]) 223 | output.write(bytes(i)+'\t') 224 | for j in embedding: 225 | output.write(bytes(j)+'\t') 226 | output.write('\n') 227 | output.flush() 228 | output.close() 229 | print 'Complete writing !!!' 230 | 231 | root_dir='D:/dataset/icde2016/dataset/metagraph-structural-similarity/' 232 | datasetName='linkedin' 233 | if __name__=='__main__': 234 | autoencoderTraining( 235 | metagraphStructuralSimFilePath=root_dir+datasetName+'-metagraph.sim', 236 | dimension1=173, # dimension of input vectors 237 | dimension2=64, # dimension of the result 238 | lrate=0.1, # learning rate 239 | max_epochs=50, # max epochs 240 | decay_w=0.1, # decay 241 | decay_b1=0.1, # decay 242 | decay_b2=0.1, # decay 243 | batch_size=50, # batch size 244 | is_shuffle_for_batch=True, # is shuffle for batch 245 | dispFreq=5, # display frequence 246 | saveFreq=5, # save frequence 247 | saveto=root_dir+datasetName+'.saveto.npz', # parameters save file 248 | embeddingsSaveFile=root_dir+datasetName+'-metagraph.embedding', # embedding results 249 | ) -------------------------------------------------------------------------------- /model-autoencoder/dataToolsForAutoencoder.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Functions and Application : 4 | data process methods 5 | ''' 6 | 7 | import numpy 8 | import theano 9 | from theano import tensor 10 | 11 | def readMetagraphStructuralSim(filepath): 12 | """ 13 | read metagraph structural similarity 14 | """ 15 | sim=None 16 | dimension=0 17 | row=0 18 | column=0 19 | with open(filepath) as f: 20 | for l in f: 21 | tmp=l.strip().split() 22 | if len(tmp)>0: 23 | if len(tmp)==1: 24 | dimension=int(tmp[0]) 25 | sim=numpy.ones((dimension,dimension)).astype(theano.config.floatX) # @UndefinedVariable 26 | continue 27 | else: 28 | row=int(tmp[0]) 29 | column=int(tmp[1]) 30 | sim[row][column]=float(tmp[2]) 31 | sim[column][row]=float(tmp[2]) 32 | return sim 33 | 34 | 35 | 36 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 37 | """ 38 | Used to shuffle the dataset at each iteration. 39 | """ 40 | idx_list = numpy.arange(n, dtype="int32") 41 | 42 | if shuffle: 43 | numpy.random.shuffle(idx_list) 44 | 45 | minibatches = [] 46 | minibatch_start = 0 47 | for i in range(n // minibatch_size): 48 | minibatches.append(idx_list[minibatch_start: 49 | minibatch_start + minibatch_size]) 50 | minibatch_start += minibatch_size 51 | 52 | if (minibatch_start != n): 53 | # Make a minibatch out of what is left 54 | minibatches.append(idx_list[minibatch_start:]) 55 | 56 | return zip(range(len(minibatches)), minibatches) 57 | 58 | 59 | if __name__=='__main__': 60 | filepath='d:/test/write' 61 | sim=readMetagraphStructuralSim(filepath) 62 | print sim[0] 63 | print sim 64 | 65 | -------------------------------------------------------------------------------- /readMe: -------------------------------------------------------------------------------- 1 | Reference: 2 | Zemin Liu, Vincent W. Zheng, Zhou Zhao, Hongxia Yang, Kevin Chen-Chuan Chang, Minghui Wu, and Jing Ying. 2018. Subgraph-augmented Path Embedding for Semantic User Search on Heterogeneous Social Network. 
In Proceedings of The 2018 Web Conference (WWW 2018). ACM, New York, NY, USA, 10 pages. 3 | 4 | This Python project implements the SPE model proposed in the above paper. 5 | Please refer to the above paper for all the notations used in this readme file. 6 | If you use it for scientific experiments, please cite this paper: 7 | @inproceedings{LiuZ18, 8 | author = {Zemin Liu and 9 | Vincent W. Zheng and 10 | Zhou Zhao and 11 | Hongxia Yang and 12 | Kevin Chen-Chuan Chang and 13 | Minghui Wu and 14 | Jing Ying}, 15 | title = {Subgraph-augmented Path Embedding for Semantic User Search on Heterogeneous Social Network}, 16 | booktitle = {The 2018 Web Conference (WWW 2018)}, 17 | year = {2018} 18 | } 19 | 20 | **************************** 21 | 22 | FileList: 23 | 24 | source-code : The source code of the SPE model. 25 | readMe : This file. 26 | 27 | 28 | ======================================================================= 29 | source-code 30 | ======================================================================= 31 | FileList in source-code folder: 32 | dataset preparation-SPE : Code used to prepare the datasets offline for SPE. 33 | model-autoencoder : Code used to learn the subgraph embeddings from a subgraph structural similarity matrix. 34 | model-SPE : Code used to train the SPE model and evaluate it. 35 | 36 | 1. dataset preparation-SPE 37 | This code is written in Java and prepares the datasets offline for SPE. 38 | First, we use the code from [1] to obtain all the subgraphs in a graph and the instances of each subgraph. 39 | Then we use dataset preparation-SPE to sample o-paths from the graph, count the subgraph instances between any pair of nodes u and v, and finally generate the m-paths. 40 | All of this is done offline and saved in a database (or in files), so the results can be used directly during training and testing. 41 | 42 | Main.java is the entry class, and the program reads its parameters from Config.java. 43 | 44 | 2. model-autoencoder 45 | This code is written in Python and learns the subgraph embeddings from a subgraph structural similarity matrix. 46 | Here we use an autoencoder to learn an embedding x for each subgraph m. 47 | 48 | autoencoderTraining.py is the entry point of the autoencoder model, and its parameters are set directly in that file. 49 | 50 | 3. model-SPE 51 | This code is written in Python and is used to train the SPE model and evaluate it. It takes the output of steps 1 and 2 as its input. 52 | In this folder, experimentForOneFileByParams.py is the entry point. It reads the parameters from the file pythonParamsConfig, then trains the model and tests it. 53 | pythonParamsConfig is the configuration file of this model; all parameters can be set there. 54 | Finally, the model outputs the NDCG and MAP results. 55 | 56 | ======================================================================= 57 | References 58 | ======================================================================= 59 | [1] Fang, Yuan, et al. "Semantic proximity search on graphs with metagraph-based learning." 2016 IEEE 32nd International Conference on Data Engineering (ICDE). IEEE, 2016. 60 | --------------------------------------------------------------------------------
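The readMe states that model-SPE reports MAP and NDCG. In modelProcessAndAssess.py these numbers come from evaluateTools.get_MAP(top_num, ideal_map, test_map) and evaluateTools.get_MnDCG(top_num, ideal_map, test_map), where test_map maps each test line to its ranked top-k candidates and ideal_map maps the same line to the ground-truth candidates. evaluateTools.py itself is not reproduced in this listing, so the sketch below is only an assumed illustration of how MAP@k and nDCG@k over those structures are commonly computed with binary relevance; the function names carry a _sketch suffix to make clear they are not the repository's implementation, which may differ.

import math

# Hypothetical sketch only -- not the repository's evaluateTools.py.
def get_MAP_sketch(k, ideal_map, test_map):
    # mean average precision at cutoff k, with binary relevance
    ap_values = []
    for qid in ideal_map:
        relevant = set(ideal_map[qid])
        ranked = test_map.get(qid, [])[:k]
        hits = 0
        precision_sum = 0.0
        for rank, candidate in enumerate(ranked, start=1):
            if candidate in relevant:
                hits += 1
                precision_sum += float(hits) / rank
        denom = min(len(relevant), k)
        ap_values.append(precision_sum / denom if denom > 0 else 0.0)
    return sum(ap_values) / len(ap_values) if ap_values else 0.0

def get_MnDCG_sketch(k, ideal_map, test_map):
    # mean nDCG at cutoff k, with binary relevance
    ndcg_values = []
    for qid in ideal_map:
        relevant = set(ideal_map[qid])
        ranked = test_map.get(qid, [])[:k]
        dcg = 0.0
        for rank, candidate in enumerate(ranked, start=1):
            if candidate in relevant:
                dcg += 1.0 / math.log(rank + 1, 2)
        ideal_hits = min(len(relevant), k)
        idcg = sum(1.0 / math.log(r + 1, 2) for r in range(1, ideal_hits + 1))
        ndcg_values.append(dcg / idcg if idcg > 0 else 0.0)
    return sum(ndcg_values) / len(ndcg_values) if ndcg_values else 0.0

For example, get_MAP_sketch(10, {0: [3, 7]}, {0: [7, 1, 3]}) evaluates one query whose ground truth is {3, 7} against the ranked list [7, 1, 3] and yields an average precision of (1 + 2/3)/2 = 5/6.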