├── ReadMe
├── code
│   ├── asymmetric
│   │   ├── java - prepare data for model
│   │   │   ├── Config.java
│   │   │   ├── GenerateEntitiesFeatureByTypes.java
│   │   │   ├── GenerateEntitiesFeaturesByGraph.java
│   │   │   ├── GenerateSubPathsFromSamplings.java
│   │   │   ├── Main.java
│   │   │   ├── Node.java
│   │   │   ├── RandomWalkSampling.java
│   │   │   ├── ReadWholeGraph.java
│   │   │   └── javaParams.properties
│   │   └── python - model
│   │       ├── dataProcessTools.py
│   │       ├── evaluateTools.py
│   │       ├── experimentForOneFileByParams.py
│   │       ├── lstmModel.py
│   │       ├── proxEmbed.py
│   │       ├── proxEmbedModelMulti.py
│   │       ├── proxEmbedProcessAndAssess.py
│   │       ├── proxEmbedProcessModel.py
│   │       ├── pythonParamsConfig
│   │       └── toolsFunction.py
│   └── symmetric
│       ├── java - prepare data for model
│       │   ├── Config.java
│       │   ├── GenerateEntitiesFeatureByTypes.java
│       │   ├── GenerateEntitiesFeaturesByGraph.java
│       │   ├── GenerateSubPathsFromSamplings.java
│       │   ├── Main.java
│       │   ├── Node.java
│       │   ├── RandomWalkSampling.java
│       │   ├── ReadWholeGraph.java
│       │   └── javaParams.properties
│       └── python - model
│           ├── dataProcessTools.py
│           ├── evaluateTools.py
│           ├── experimentForOneFileByParams.py
│           ├── lstmModel.py
│           ├── proxEmbed.py
│           ├── proxEmbedModelMulti.py
│           ├── proxEmbedProcessAndAssess.py
│           ├── proxEmbedProcessModel.py
│           ├── pythonParamsConfig
│           └── toolsFunction.py
└── toy_data
    ├── ReadMe.txt
    ├── dblp.splits
    │   ├── ideal
    │   │   ├── ideal_advisee_1
    │   │   ├── ideal_advisee_2
    │   │   ├── ideal_advisee_3
    │   │   ├── ideal_advisor_1
    │   │   ├── ideal_advisor_2
    │   │   └── ideal_advisor_3
    │   ├── test
    │   │   ├── test_advisee_1
    │   │   ├── test_advisee_2
    │   │   ├── test_advisee_3
    │   │   ├── test_advisor_1
    │   │   ├── test_advisor_2
    │   │   └── test_advisor_3
    │   └── train.4
    │       ├── train_advisee_1
    │       ├── train_advisee_2
    │       ├── train_advisee_3
    │       ├── train_advisor_1
    │       ├── train_advisor_2
    │       └── train_advisor_3
    ├── dblp
    │   ├── graph.edge
    │   └── graph.node
    ├── linkedin.splits
    │   ├── ideal
    │   │   ├── ideal_school_1
    │   │   ├── ideal_school_2
    │   │   ├── ideal_school_3
    │   │   ├── ideal_work_1
    │   │   ├── ideal_work_2
    │   │   └── ideal_work_3
    │   ├── test
    │   │   ├── test_school_1
    │   │   ├── test_school_2
    │   │   ├── test_school_3
    │   │   ├── test_work_1
    │   │   ├── test_work_2
    │   │   └── test_work_3
    │   └── train.4
    │       ├── train_school_1
    │       ├── train_school_2
    │       ├── train_school_3
    │       ├── train_work_1
    │       ├── train_work_2
    │       └── train_work_3
    └── linkedin
        ├── graph.edge
        └── graph.node
/ReadMe: --------------------------------------------------------------------------------
1 | This directory contains the source code of the ProxEmbed model.
2 | ========================================================================================================
3 | @inproceedings{LiuZZZCWY17,
4 |   author = {Liu, Zemin and Zheng, Vincent W. and Zhao, Zhou and Zhu, Fanwei and Chang, Kevin Chen-Chuan and Wu, Minghui and Ying, Jing},
5 |   title = {Semantic Proximity Search on Heterogeneous Graph by Proximity Embedding},
6 |   booktitle = {Proc. of the 31st AAAI Conference on Artificial Intelligence},
7 |   series = {AAAI '17},
8 |   year = {2017}
9 | }
10 |
11 | Please cite the above reference when using our code.
12 |
13 | For inquiries, please contact:
14 | Zemin Liu (liuzemin@zju.edu.cn)
15 | Vincent Zheng (vincent.zheng@adsc.com.sg)
16 | ========================================================================================================
17 |
18 | This experiment consists of two parts: symmetric and asymmetric.
19 | The symmetric part handles the facebook and linkedin datasets, whose relations are symmetric,
20 | while the asymmetric part handles the dblp dataset, whose relation is asymmetric.
21 |
22 | Each part (for example, the symmetric directory) contains two directories: "java - prepare data for model" and "python - model".
23 | The "java - prepare data for model" directory prepares the input data for the ProxEmbed model, while the "python - model" directory trains and evaluates the ProxEmbed model.
24 |
25 | In "java - prepare data for model",
26 | Main.java is the entry class, and the program reads its parameters from the file javaParams.properties.
27 |
28 | In "python - model", the entry file is experimentForOneFileByParams.py. It reads its parameters from the file pythonParamsConfig, then trains the model and tests it.
29 | For instance, in the file pythonParamsConfig, if we set
30 | root_dir = ./toy_data/,
31 | dataset_name = linkedin,
32 | suffix = 4,
33 | class_name = school,
34 | index = 1,
35 | then the model will use ./toy_data/linkedin.splits/train.4/train_school_1 as the training data,
36 | ./toy_data/linkedin.splits/test/test_school_1 as the test data,
37 | and ./toy_data/linkedin.splits/ideal/ideal_school_1 as the ideal data (ground truth).
38 |
39 | Finally, the model outputs the NDCG and MAP results.
40 |
41 | The "java - prepare data for model" program reads data from the main folder of the dataset (which contains graph.node and graph.edge), such as the "linkedin" folder, and writes the intermediate data into the same folder.
42 | The "python - model" code then takes these files as part of its input.
43 |
-------------------------------------------------------------------------------- /code/asymmetric/java - prepare data for model/Config.java: --------------------------------------------------------------------------------
1 | package dataPrepare.ProxEmbed;
2 |
3 | import java.io.FileInputStream;
4 | import java.util.Properties;
5 |
6 | /**
7 |  * Parameters for ProxEmbed to prepare data for training.
8 |  */
9 | public class Config {
10 |
11 | 	/**
12 | 	 * The main dataset directory.
13 | 	 * E.g., "D:/dataset/icde2016/dataset/linkedin/" is the directory of the linkedin dataset.
14 | 	 */
15 | 	public static String MAIN_DIR="D:/test/test/toydata/dblp/";
16 | 	/**
17 | 	 * Path of the nodes file.
18 | 	 */
19 | 	public static String NODES_PATH=MAIN_DIR+"graph.node";
20 | 	/**
21 | 	 * Path of the edges file.
22 | 	 */
23 | 	public static String EDGES_PATH=MAIN_DIR+"graph.edge";
24 | 	/**
25 | 	 * The result file of random walk sampling.
26 | 	 */
27 | 	public static String SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS=MAIN_DIR+"randomWalkSamplingPaths";
28 | 	/**
29 | 	 * The file which contains the mapping between type and typeid.
30 | 	 */
31 | 	public static String TYPE_TYPEID_SAVEFILE=MAIN_DIR+"typeAndTypeIDSavePath";
32 | 	/**
33 | 	 * The file which contains the node features.
34 | 	 */
35 | 	public static String NODES_FEATURE_SAVE_PATH=MAIN_DIR+"nodesFeatures";
36 | 	/**
37 | 	 * Truncate sub-paths from samplings by this node type.
38 | 	 */
39 | 	public static String TRUNCATED_TYPE_NAME="user";
40 | 	/**
41 | 	 * Sub-paths save path.
42 | 	 */
43 | 	public static String SUBPATHS_SAVE_PATH=MAIN_DIR+"subpathsSaveFile";
44 | 	/**
45 | 	 * The longest length of a sampling path that is analysed when truncating sub-paths.
46 | 	 */
47 | 	public static int LONGEST_ANALYSE_LENGTH_FOR_SAMPLING=20;
48 | 	/**
49 | 	 * Longest length for sub-paths.
50 | 	 */
51 | 	public static int LONGEST_LENGTH_FOR_SUBPATHS=5;
52 | 	/**
53 | 	 * The shortest length for each path in the sampling results.
54 | 	 */
55 | 	public static int SHORTEST_LENGTH_FOR_SAMPLING=0;
56 | 	/**
57 | 	 * Sampling times per node in random walk sampling.
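	 * (In RandomWalkSampling this is the number K of walks started from every node,
	 * so together with SAMPLING_LENGTH_PER_PATH it bounds the total amount of sampled path data.)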
58 | */ 59 | public static int SAMPLING_TIMES_PER_NODE=5; 60 | /** 61 | * Sampling length for per node in random walk sampling. 62 | */ 63 | public static int SAMPLING_LENGTH_PER_PATH=5; 64 | /** 65 | * When generate user features by neighbours' information, the value we set for type information when this node belongs to this kind of type. 66 | */ 67 | public static double FEATURE_TYPE_VALUE=1.0; 68 | 69 | //initialize 70 | static{ 71 | 72 | Properties prop = new Properties(); 73 | FileInputStream in=null; 74 | try { 75 | //The path of properties file 76 | in = new FileInputStream("/usr/lzmExperiment/path2vec/paramsSet/javaParams.properties"); 77 | prop.load(in); 78 | 79 | MAIN_DIR=prop.getProperty("MAIN_DIR"); 80 | NODES_PATH=MAIN_DIR+prop.getProperty("NODES_PATH"); 81 | EDGES_PATH=MAIN_DIR+prop.getProperty("EDGES_PATH"); 82 | SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS=MAIN_DIR+prop.getProperty("SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS"); 83 | TYPE_TYPEID_SAVEFILE=MAIN_DIR+prop.getProperty("TYPE_TYPEID_SAVEFILE"); 84 | NODES_FEATURE_SAVE_PATH=MAIN_DIR+prop.getProperty("NODES_FEATURE_SAVE_PATH"); 85 | TRUNCATED_TYPE_NAME=prop.getProperty("TRUNCATED_TYPE_NAME"); 86 | SUBPATHS_SAVE_PATH=MAIN_DIR+prop.getProperty("SUBPATHS_SAVE_PATH"); 87 | LONGEST_ANALYSE_LENGTH_FOR_SAMPLING=Integer.parseInt(prop.getProperty("LONGEST_ANALYSE_LENGTH_FOR_SAMPLING")); 88 | LONGEST_LENGTH_FOR_SUBPATHS=Integer.parseInt(prop.getProperty("LONGEST_LENGTH_FOR_SUBPATHS")); 89 | SHORTEST_LENGTH_FOR_SAMPLING=Integer.parseInt(prop.getProperty("SHORTEST_LENGTH_FOR_SAMPLING")); 90 | SAMPLING_TIMES_PER_NODE=Integer.parseInt(prop.getProperty("SAMPLING_TIMES_PER_NODE")); 91 | SAMPLING_LENGTH_PER_PATH=Integer.parseInt(prop.getProperty("SAMPLING_LENGTH_PER_PATH")); 92 | FEATURE_TYPE_VALUE=Double.parseDouble(prop.getProperty("FEATURE_TYPE_VALUE")); 93 | 94 | //Print these parameters 95 | System.out.println("Java parameters is :"); 96 | System.out.println("MAIN_DIR : "+MAIN_DIR); 97 | System.out.println("NODES_PATH : "+NODES_PATH); 98 | System.out.println("EDGES_PATH : "+EDGES_PATH); 99 | System.out.println("SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS : "+SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS); 100 | System.out.println("TYPE_TYPEID_SAVEFILE : "+TYPE_TYPEID_SAVEFILE); 101 | System.out.println("NODES_FEATURE_SAVE_PATH : "+NODES_FEATURE_SAVE_PATH); 102 | System.out.println("TRUNCATED_TYPE_NAME : "+TRUNCATED_TYPE_NAME); 103 | System.out.println("SUBPATHS_SAVE_PATH : "+SUBPATHS_SAVE_PATH); 104 | System.out.println("LONGEST_ANALYSE_LENGTH_FOR_SAMPLING : "+LONGEST_ANALYSE_LENGTH_FOR_SAMPLING); 105 | System.out.println("LONGEST_LENGTH_FOR_SUBPATHS : "+LONGEST_LENGTH_FOR_SUBPATHS); 106 | System.out.println("SHORTEST_LENGTH_FOR_SAMPLING : "+SHORTEST_LENGTH_FOR_SAMPLING); 107 | System.out.println("SAMPLING_TIMES_PER_NODE : "+SAMPLING_TIMES_PER_NODE); 108 | System.out.println("SAMPLING_LENGTH_PER_PATH : "+SAMPLING_LENGTH_PER_PATH); 109 | System.out.println("FEATURE_TYPE_VALUE : "+FEATURE_TYPE_VALUE); 110 | } catch (Exception e) { 111 | e.printStackTrace(); 112 | } 113 | finally{ 114 | try { 115 | if(in!=null){ 116 | in.close(); 117 | in=null; 118 | } 119 | } catch (Exception e2) { 120 | e2.printStackTrace(); 121 | } 122 | } 123 | 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /code/asymmetric/java - prepare data for model/GenerateEntitiesFeatureByTypes.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.FileWriter; 4 | 
import java.util.HashMap; 5 | import java.util.HashSet; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | /** 10 | * Generate entity features by information from neighbours -- just for asymmetric 11 | */ 12 | public class GenerateEntitiesFeatureByTypes { 13 | 14 | private Set types=new HashSet(); 15 | private Map type2Typeid=new HashMap(); 16 | private Map typeid2Type=new HashMap(); 17 | 18 | static String nodes_path=Config.NODES_PATH; 19 | static String edges_path=Config.EDGES_PATH; 20 | static String entities_feature_file=Config.NODES_FEATURE_SAVE_PATH; 21 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 22 | static double feature_type_value=Config.FEATURE_TYPE_VALUE; 23 | 24 | public static void main(String[] args) { 25 | ReadWholeGraph rwg=new ReadWholeGraph(); 26 | Map graph=rwg.readDataFromFile(nodes_path, edges_path, typeAndTypeIdPath); 27 | GenerateEntitiesFeatureByTypes gefb=new GenerateEntitiesFeatureByTypes(); 28 | gefb.analyseTypes(graph); 29 | gefb.generateFeaturesByGraph(graph, entities_feature_file,feature_type_value); 30 | } 31 | 32 | 33 | /** 34 | * Analyse this graph. 35 | * @param graph 36 | */ 37 | public void analyseTypes(Map graph){ 38 | for(Node n:graph.values()){ 39 | types.add(n.getType()); 40 | if(!type2Typeid.containsKey(n.getType())){ 41 | type2Typeid.put(n.getType(), type2Typeid.size()); 42 | typeid2Type.put(typeid2Type.size(), n.getType()); 43 | } 44 | } 45 | } 46 | 47 | /** 48 | * Generate nodes features. 49 | * @param graph 50 | * @param saveFile 51 | */ 52 | public void generateFeaturesByGraph(Map graph,String saveFile,double typeValue){ 53 | int dimension=types.size(); 54 | int nodesNum=graph.size(); 55 | StringBuilder sb=new StringBuilder(); 56 | String type=null; 57 | int typeId=0; 58 | Map typesNum=new HashMap(); 59 | FileWriter writer = null; 60 | try { 61 | writer = new FileWriter(saveFile); 62 | writer.write(nodesNum+" "+dimension+"\r\n"); 63 | writer.flush(); 64 | for(Node now:graph.values()){ 65 | sb.delete( 0, sb.length() ); 66 | typesNum.clear(); 67 | 68 | sb.append(now.getId()+" "); 69 | type=now.getType(); 70 | typeId=type2Typeid.get(type); 71 | 72 | for(int i=0;i types=new HashSet(); 15 | private Map type2Typeid=new HashMap(); 16 | private Map typeid2Type=new HashMap(); 17 | static String nodes_path=Config.NODES_PATH; 18 | static String edges_path=Config.EDGES_PATH; 19 | static String entities_feature_file=Config.NODES_FEATURE_SAVE_PATH; 20 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 21 | static double feature_type_value=Config.FEATURE_TYPE_VALUE; 22 | 23 | public static void main(String[] args) { 24 | ReadWholeGraph rwg=new ReadWholeGraph(); 25 | Map graph=rwg.readDataFromFile(nodes_path, edges_path, typeAndTypeIdPath); 26 | GenerateEntitiesFeaturesByGraph gefb=new GenerateEntitiesFeaturesByGraph(); 27 | gefb.analyseTypes(graph); 28 | gefb.generateFeaturesByGraph(graph, entities_feature_file,feature_type_value); 29 | } 30 | 31 | /** 32 | * Analyse nodes types 33 | */ 34 | public void analyseTypes(Map graph){ 35 | for(Node n:graph.values()){ 36 | types.add(n.getType()); 37 | if(!type2Typeid.containsKey(n.getType())){ 38 | type2Typeid.put(n.getType(), type2Typeid.size()); 39 | typeid2Type.put(typeid2Type.size(), n.getType()); 40 | } 41 | } 42 | } 43 | 44 | /** 45 | * Generate nodes features 46 | */ 47 | public void generateFeaturesByGraph(Map graph,String saveFile,double typeValue){ 48 | int dimension=types.size()+1+types.size()+1; 49 | int nodesNum=graph.size(); 50 | StringBuilder sb=new 
StringBuilder(); 51 | String type=null; 52 | int typeId=0; 53 | double value=0; 54 | double sum=0; 55 | Map typesNum=new HashMap(); 56 | FileWriter writer = null; 57 | try { 58 | writer = new FileWriter(saveFile); 59 | writer.write(nodesNum+" "+dimension+"\r\n"); 60 | writer.flush(); 61 | for(Node now:graph.values()){ 62 | sb.delete( 0, sb.length() ); 63 | typesNum.clear(); 64 | 65 | sb.append(now.getId()+" "); 66 | type=now.getType(); 67 | typeId=type2Typeid.get(type); 68 | 69 | for(int i=0;i truncatedNodeIds=new HashSet(); 44 | Set truncatedTypes=new HashSet(); 45 | String[] arr=truncatedNodeType.split(" "); 46 | truncatedTypes.addAll(Arrays.asList(arr)); 47 | BufferedReader br=null; 48 | arr=null; 49 | try { 50 | br = new BufferedReader(new InputStreamReader(new FileInputStream(nodesPath), "UTF-8")); 51 | String temp = null; 52 | while ((temp = br.readLine()) != null ) { 53 | temp=temp.trim(); 54 | if(temp.length()>0){ 55 | arr=temp.split(" "); 56 | if(truncatedTypes.contains(arr[1])){ 57 | truncatedNodeIds.add(Integer.parseInt(arr[0])); 58 | } 59 | } 60 | } 61 | } catch (Exception e2) { 62 | e2.printStackTrace(); 63 | } 64 | finally{ 65 | try { 66 | if(br!=null){ 67 | br.close(); 68 | br=null; 69 | } 70 | } catch (IOException e) { 71 | e.printStackTrace(); 72 | } 73 | } 74 | FileWriter writer =null; 75 | String t=null; 76 | List path=new ArrayList(); 77 | try { 78 | br = new BufferedReader(new InputStreamReader(new FileInputStream(samplingsPath), "UTF-8")); 79 | writer = new FileWriter(subPathsSavePath); 80 | String temp = null; 81 | while ((temp = br.readLine()) != null ) { 82 | temp=temp.trim(); 83 | if(temp.length()>0){ 84 | path.clear(); 85 | arr=temp.split(" "); 86 | for(String s:arr){ 87 | path.add(Integer.parseInt(s)); 88 | } 89 | t=analyseOnePath(path, truncatedNodeIds, window_maxlen, subpath_maxlen); 90 | if(t.length()>0){ 91 | writer.write(t); 92 | writer.flush(); 93 | } 94 | } 95 | } 96 | } catch (Exception e2) { 97 | e2.printStackTrace(); 98 | } 99 | finally{ 100 | try { 101 | if(writer!=null){ 102 | writer.close(); 103 | writer=null; 104 | } 105 | if(br!=null){ 106 | br.close(); 107 | br=null; 108 | } 109 | } catch (IOException e) { 110 | e.printStackTrace(); 111 | } 112 | } 113 | } 114 | 115 | /** 116 | * Generate sub-paths by one specific sampling path. 
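	 * Roughly: for every ordered pair of positions (i, j) in the path whose nodes both belong to
	 * the truncated (anchor) type and whose distance does not exceed the window limit, the node
	 * sequence between them is taken, immediate repetitions are removed, and the result is written
	 * out together with its two endpoint ids, provided it still contains at least two distinct
	 * nodes and does not exceed the sub-path length limit.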
117 | */ 118 | private String analyseOnePath(List path,Set truncatedNodeIds,int maxWindowLen,int maxSubpathLen){ 119 | StringBuilder sb=new StringBuilder(); 120 | int start=0; 121 | int end=0; 122 | List subpath=new ArrayList(); 123 | for(int i=0;i0 && (j-i)>maxWindowLen){ 135 | break; 136 | } 137 | 138 | subpath.clear(); 139 | for(int x=i;x<=j;x++){ 140 | subpath.add(path.get(x)+0); 141 | } 142 | List subpathNoRepeat=deleteRepeat(subpath); 143 | if(subpathNoRepeat.size()<2){ 144 | subpathNoRepeat=null; 145 | continue; 146 | } 147 | 148 | if(maxSubpathLen>0 && subpathNoRepeat.size()>maxSubpathLen){ 149 | continue; 150 | } 151 | 152 | sb.append(path.get(i)+" "+path.get(j)+" "); 153 | for(int x=0;x deleteRepeat(List path){ 167 | Map map=new HashMap(); 168 | int node=0; 169 | List result=new ArrayList(); 170 | int formerIndex=0; 171 | for(int i=0;i in_nodes=new ArrayList(); 22 | public List out_nodes=new ArrayList(); 23 | public List in_ids=new ArrayList(); 24 | public List out_ids=new ArrayList(); 25 | public Map>> typePaths=new HashMap>>(); 26 | public Set neighbours=new HashSet(); 27 | 28 | public int getId() { 29 | return id; 30 | } 31 | 32 | public void setId(int id) { 33 | this.id = id; 34 | } 35 | 36 | public String getType() { 37 | return type; 38 | } 39 | 40 | public void setType(String type) { 41 | this.type = type; 42 | } 43 | 44 | public int getTypeId() { 45 | return typeId; 46 | } 47 | 48 | public void setTypeId(int typeId) { 49 | this.typeId = typeId; 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return this.id; 55 | } 56 | 57 | @Override 58 | public boolean equals(Object obj) { 59 | if(obj instanceof Node){ 60 | Node node=(Node) obj; 61 | if(node.getId()==this.id){ 62 | return true; 63 | } 64 | } 65 | return false; 66 | } 67 | 68 | @Override 69 | public String toString() { 70 | return "[id="+id+",neighbours=["+getNeighboursInfo()+"]]"; 71 | } 72 | 73 | private String getNeighboursInfo(){ 74 | StringBuilder sb=new StringBuilder(); 75 | if(neighbours.size()==0){ 76 | return ""; 77 | } 78 | else{ 79 | for(Node n:neighbours){ 80 | sb.append(n.id+","); 81 | } 82 | return sb.toString(); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /code/asymmetric/java - prepare data for model/RandomWalkSampling.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Random; 11 | import java.util.Set; 12 | 13 | 14 | /** 15 | * Generate samplings by random walk samplings. 16 | * 17 | * Procedure: 18 | * 1.Read the whole graph 19 | * 2.Generate samplings by random walk. 
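	 * 3.Write each sampled path to the samplings file, one space-separated list of node ids per line.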
20 | */ 21 | public class RandomWalkSampling { 22 | 23 | /** 24 | * Random number generator 25 | */ 26 | private Random random=new Random(123); 27 | 28 | static String nodesPath=Config.NODES_PATH; 29 | static String edgesPath=Config.EDGES_PATH; 30 | static String savePath=Config.SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS; 31 | static int K=Config.SAMPLING_TIMES_PER_NODE; 32 | static int L=Config.SAMPLING_LENGTH_PER_PATH; 33 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 34 | static int shortest_path_length=Config.SHORTEST_LENGTH_FOR_SAMPLING; 35 | 36 | public static void main(String[] args) { 37 | ReadWholeGraph rwg=new ReadWholeGraph(); 38 | //1.Read the whole graph 39 | Map data=rwg.readDataFromFile( 40 | nodesPath, 41 | edgesPath, 42 | typeAndTypeIdPath); 43 | //2.Generate samplings by random walk. 44 | RandomWalkSampling crws=new RandomWalkSampling(); 45 | crws.randomWalkSampling(data, K, L, savePath); 46 | } 47 | 48 | /** 49 | * Generate samplings by random walk. 50 | * @param data 51 | * @param k 52 | * @param l 53 | * @param pathsFile 54 | */ 55 | public void randomWalkSampling(Map data,int k,int l,String pathsFile){ 56 | List path=null; 57 | FileWriter writer=null; 58 | StringBuilder sb=new StringBuilder(); 59 | try { 60 | writer=new FileWriter(pathsFile); 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | for(Node node:data.values()){ 65 | for(int i=0;i randomWalkPath(Node start,int l, Map data){ 94 | List path=new ArrayList(l+1); 95 | path.add(start); 96 | Node now=start; 97 | Set types_set=new HashSet(); 98 | List types=new ArrayList(); 99 | Map> neighbours=new HashMap>(); 100 | int type=-1; 101 | List list=null; 102 | for(int i=0;i ids=new ArrayList(); 116 | ids.add(n.getId()); 117 | neighbours.put(n.getTypeId(), ids); 118 | } 119 | } 120 | types.addAll(types_set); 121 | type=types.get(random.nextInt(types.size())); 122 | list=neighbours.get(type); 123 | now=data.get(list.get(random.nextInt(list.size()))); 124 | path.add(now); 125 | } 126 | return path; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /code/asymmetric/java - prepare data for model/ReadWholeGraph.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * Read the while graph, and then save the info into Map 13 | */ 14 | public class ReadWholeGraph { 15 | 16 | static Map typeid2Type=new HashMap(); 17 | static Map type2Typeid=new HashMap(); 18 | 19 | /** 20 | * Read whole graph info 21 | * @param nodesPath 22 | * @param edgesPath 23 | * @param typeAndTypeIdPath 24 | * @return 25 | */ 26 | public Map readDataFromFile(String nodesPath,String edgesPath,String typeAndTypeIdPath){ 27 | Map data=new HashMap(); 28 | BufferedReader br=null; 29 | String[] arr=null; 30 | Node node=null; 31 | try { 32 | br = new BufferedReader(new InputStreamReader(new FileInputStream(nodesPath), "UTF-8")); 33 | String temp = null; 34 | while ((temp = br.readLine()) != null ) { 35 | temp=temp.trim(); 36 | if(temp.length()>0){ 37 | arr=temp.split("\t"); 38 | node=new Node(); 39 | node.setId(Integer.parseInt(arr[0])); 40 | node.setType(arr[1]); 41 | if(type2Typeid.containsKey(arr[1])){ 42 | node.setTypeId(type2Typeid.get(arr[1])); 43 | } 44 | else{ 45 | 
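						// first time this node type appears: assign it the next free type id and record the mapping in both directions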
type2Typeid.put(arr[1], type2Typeid.size()); 46 | typeid2Type.put(typeid2Type.size(), arr[1]); 47 | node.setTypeId(type2Typeid.get(arr[1])); 48 | } 49 | data.put(Integer.parseInt(arr[0]), node); 50 | } 51 | } 52 | } catch (Exception e2) { 53 | e2.printStackTrace(); 54 | } 55 | finally{ 56 | try { 57 | if(br!=null){ 58 | br.close(); 59 | br=null; 60 | } 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | } 65 | int start=0; 66 | int end=0; 67 | Node startNode=null; 68 | Node endNode=null; 69 | try { 70 | br = new BufferedReader(new InputStreamReader(new FileInputStream(edgesPath), "UTF-8")); 71 | String temp = null; 72 | while ((temp = br.readLine()) != null ) { 73 | temp=temp.trim(); 74 | if(temp.length()>0){ 75 | arr=temp.split("\t"); 76 | start=Integer.parseInt(arr[0]); 77 | end=Integer.parseInt(arr[1]); 78 | startNode=data.get(start); 79 | endNode=data.get(end); 80 | startNode.out_ids.add(end); 81 | startNode.out_nodes.add(endNode); 82 | endNode.in_ids.add(start); 83 | endNode.in_nodes.add(startNode); 84 | } 85 | } 86 | } catch (Exception e2) { 87 | e2.printStackTrace(); 88 | } 89 | finally{ 90 | try { 91 | if(br!=null){ 92 | br.close(); 93 | br=null; 94 | } 95 | } catch (IOException e) { 96 | e.printStackTrace(); 97 | } 98 | } 99 | FileWriter writer = null; 100 | try { 101 | writer = new FileWriter(typeAndTypeIdPath); 102 | for(String type:type2Typeid.keySet()){ 103 | writer.write(type+" "+type2Typeid.get(type)+"\r\n"); 104 | writer.flush(); 105 | } 106 | } catch (Exception e) { 107 | e.printStackTrace(); 108 | } 109 | finally{ 110 | try { 111 | if(writer!=null){ 112 | writer.close(); 113 | writer=null; 114 | } 115 | } catch (Exception e2) { 116 | e2.printStackTrace(); 117 | } 118 | } 119 | 120 | return data; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /code/asymmetric/java - prepare data for model/javaParams.properties: -------------------------------------------------------------------------------- 1 | 2 | # Main dataset directory 3 | MAIN_DIR = /usr/lzmExperiment/path2vec/dataset/dblp/ 4 | 5 | # Truncate sub-paths from samplings by this type. 6 | TRUNCATED_TYPE_NAME = user 7 | 8 | # The longest length for sampling to truncate sub-paths. 9 | LONGEST_ANALYSE_LENGTH_FOR_SAMPLING = 20 10 | 11 | # Longest length for sub-paths 12 | LONGEST_LENGTH_FOR_SUBPATHS = 5 13 | 14 | # The shortest length for each path in sampling results. 15 | SHORTEST_LENGTH_FOR_SAMPLING = 0 16 | 17 | # Sampling times for per node in random walk sampling. 18 | SAMPLING_TIMES_PER_NODE = 5 19 | 20 | # Sampling length for per node in random walk sampling. 21 | SAMPLING_LENGTH_PER_PATH = 5 22 | 23 | # When generate user features by neighbours' information, the value we set for type information when this node belongs to this kind of type. 24 | FEATURE_TYPE_VALUE = 1.0 25 | 26 | ######################################## 27 | # 不太需要改动的参数 28 | ######################################## 29 | # file name of nodes 30 | NODES_PATH = graph.node 31 | 32 | # file name of edges 33 | EDGES_PATH = graph.edge 34 | 35 | # file name of random walk sampling paths 36 | SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS = randomWalkSamplingPaths 37 | 38 | # The file name which contains the map relation of type and typeid. 
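# (this mapping file is written out automatically by ReadWholeGraph when the graph is loaded)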
39 | TYPE_TYPEID_SAVEFILE = typeAndTypeIDSavePath 40 | 41 | # file name of node features 42 | NODES_FEATURE_SAVE_PATH = nodesFeatures 43 | 44 | # file name of sub-paths save file 45 | SUBPATHS_SAVE_PATH = subpathsSaveFile -------------------------------------------------------------------------------- /code/asymmetric/python - model/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | methods for processing data 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | read training data from file 16 | :type string 17 | :param trainingDataFile 18 | ''' 19 | data=[] 20 | pairs=[] 21 | with open(trainingDataFile) as f: 22 | for l in f: 23 | tmp=l.strip().split() 24 | if len(tmp)<=0: 25 | continue 26 | arr=[] 27 | arr.append(tmp[0]+'-'+tmp[1]) 28 | arr.append(tmp[1]+'-'+tmp[0]) 29 | arr.append(tmp[0]+'-'+tmp[2]) 30 | arr.append(tmp[2]+'-'+tmp[0]) 31 | pairs.append(arr) 32 | tmp=[int(x) for x in tmp] 33 | data.append(tmp) 34 | 35 | return data,pairs 36 | 37 | def getWordsEmbeddings(wordsEmbeddings_path): 38 | """ 39 | read words embeddings from file 40 | a b 41 | c d e f .... 42 | g h j k .... 43 | a means the num(line) of the data,b means the dimension of the data 44 | c and g are the index of the corresponding words 45 | d,e,f,h,j,k,... are the content of embeddings 46 | :type String 47 | :param wordsEmbeddings_path 48 | """ 49 | size=0 50 | dimension=0 51 | wemb=[] 52 | with open(wordsEmbeddings_path) as f: 53 | for l in f: 54 | arr=l.strip().split() 55 | if len(arr)==2: 56 | size=int(arr[0]) 57 | dimension=int(arr[1]) 58 | wemb=numpy.zeros((size,dimension)) # @UndefinedVariable 59 | continue 60 | id=int(arr[0]) 61 | for i in range(0,dimension): 62 | wemb[id][i]=float(arr[i+1]) 63 | return wemb,dimension,size 64 | 65 | def loadAllSubPaths(subpaths_file,maxlen=1000): 66 | """ 67 | read all subpaths from file 68 | :type subpaths_file: String 69 | :param subpaths_file:file path 70 | 71 | :type maxlen:int 72 | :param maxlen: 73 | 74 | the return value is a map, and the key of this map is made of startNodeId-endNodeId. 75 | the value of this map is a list made of startNodeId aId bId cId dId... 
endNodeId 76 | """ 77 | map={} 78 | with open(subpaths_file) as f: 79 | for l in f: 80 | splitByTab=l.strip().split('\t') 81 | key=splitByTab[0]+'-'+splitByTab[1] 82 | sentence=[int(y) for y in splitByTab[2].split()[:]] 83 | if len(sentence)>maxlen: 84 | continue 85 | if key in map: 86 | map[key].append(sentence) 87 | else: 88 | tmp=[] 89 | tmp.append(sentence) 90 | map[key]=tmp 91 | return map 92 | 93 | def prepareDataForTraining(trainingDataTriples,trainingDataPairs,subpaths_map): 94 | """ 95 | prepare data for training 96 | """ 97 | n_triples=len(trainingDataTriples) 98 | 99 | triples_matrix=numpy.zeros([n_triples,4,2]).astype('int64') 100 | 101 | maxlen=0 102 | n_subpaths=0 103 | allPairs=[] 104 | for list in trainingDataPairs: 105 | for l in list: 106 | allPairs.append(l) 107 | for key in allPairs: 108 | if key not in subpaths_map: 109 | continue; 110 | list=subpaths_map[key] 111 | n_subpaths+=len(list) 112 | for l in list: 113 | if len(l)>maxlen: 114 | maxlen=len(l) 115 | 116 | subPaths_matrix=numpy.zeros([maxlen,n_subpaths]).astype('int64') 117 | 118 | subPaths_mask=numpy.zeros([maxlen,n_subpaths]).astype(theano.config.floatX) # @UndefinedVariable 119 | 120 | subPaths_lens=numpy.zeros([n_subpaths,]).astype('int64') 121 | 122 | current_index=0 123 | path_index=0 124 | valid_triples_count=0 125 | for i in range(len(trainingDataPairs)): 126 | pairs=trainingDataPairs[i] 127 | 128 | valid_triples_count+=1 129 | for j in range(len(pairs)): 130 | pair=pairs[j] 131 | list=None 132 | if pair in subpaths_map: 133 | list=subpaths_map[pair] 134 | if list is not None: 135 | triples_matrix[i][j][0]=current_index 136 | current_index+=len(list) 137 | triples_matrix[i][j][1]=current_index 138 | for x in range(len(list)): 139 | index=path_index+x 140 | path=list[x] 141 | subPaths_lens[index]=len(path) 142 | for y in range(len(path)): 143 | subPaths_matrix[y][index]=path[y] 144 | subPaths_mask[y][index]=1. 
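                # each column of subPaths_matrix holds one sub-path (padded with zeros below its real length);
                # subPaths_mask marks the valid positions of that column with 1.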
145 | path_index+=len(list) 146 | else : 147 | triples_matrix[i][j][0]=current_index 148 | current_index+=0 149 | triples_matrix[i][j][1]=current_index 150 | 151 | count=0 152 | for i in range(len(triples_matrix)): 153 | if triples_matrix[i][0][0]!=triples_matrix[i][0][1] and triples_matrix[i][2][0]!=triples_matrix[i][2][1]: 154 | count+=1 155 | triples_matrix_new=numpy.zeros([count,4,2]).astype('int64') 156 | index=0 157 | for i in range(len(triples_matrix)): 158 | if triples_matrix[i][0][0]!=triples_matrix[i][0][1] and triples_matrix[i][2][0]!=triples_matrix[i][2][1]: 159 | triples_matrix_new[index]=triples_matrix[i] 160 | index+=1 161 | triples_matrix=triples_matrix_new 162 | 163 | return triples_matrix, subPaths_matrix, subPaths_mask, subPaths_lens 164 | 165 | 166 | def prepareDataForTest(query,candidate,subpaths_map): 167 | """ 168 | prepare data for test 169 | """ 170 | key1=bytes(query)+'-'+bytes(candidate) 171 | key2=bytes(candidate)+'-'+bytes(query) 172 | if key1 not in subpaths_map and key2 not in subpaths_map: 173 | return None,None,None 174 | subpaths=[] 175 | if key1 in subpaths_map: 176 | subpaths.extend(subpaths_map[key1]) 177 | if key2 in subpaths_map: 178 | subpaths.extend(subpaths_map[key2]) 179 | maxlen=0 180 | for subpath in subpaths: 181 | if len(subpath)>maxlen: 182 | maxlen=len(subpath) 183 | subPaths_matrix=numpy.zeros([maxlen,len(subpaths)]).astype('int64') 184 | subPaths_mask=numpy.zeros([maxlen,len(subpaths)]).astype(theano.config.floatX) # @UndefinedVariable 185 | subPaths_lens=numpy.zeros([len(subpaths),]).astype('int64') 186 | for i in range(len(subpaths)): 187 | subpath=subpaths[i] 188 | subPaths_lens[i]=len(subpath) 189 | for j in range(len(subpath)): 190 | subPaths_matrix[j][i]=subpath[j] 191 | subPaths_mask[j][i]=1. 192 | 193 | return subPaths_matrix,subPaths_mask,subPaths_lens 194 | 195 | 196 | def prepareDataForTestAsymmetric(query,candidate,subpaths_map): 197 | """ 198 | prepare data for asymmetric test 199 | """ 200 | key1=bytes(query)+'-'+bytes(candidate) 201 | if key1 not in subpaths_map : 202 | return None,None,None 203 | subpaths=[] 204 | if key1 in subpaths_map: 205 | subpaths.extend(subpaths_map[key1]) 206 | maxlen=0 207 | for subpath in subpaths: 208 | if len(subpath)>maxlen: 209 | maxlen=len(subpath) 210 | subPaths_matrix=numpy.zeros([maxlen,len(subpaths)]).astype('int64') 211 | subPaths_mask=numpy.zeros([maxlen,len(subpaths)]).astype(theano.config.floatX) # @UndefinedVariable 212 | subPaths_lens=numpy.zeros([len(subpaths),]).astype('int64') 213 | for i in range(len(subpaths)): 214 | subpath=subpaths[i] 215 | subPaths_lens[i]=len(subpath) 216 | for j in range(len(subpath)): 217 | subPaths_matrix[j][i]=subpath[j] 218 | subPaths_mask[j][i]=1. 219 | 220 | return subPaths_matrix,subPaths_mask,subPaths_lens 221 | 222 | 223 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 224 | """ 225 | Used to shuffle the dataset at each iteration. 
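    Returns a list of (minibatch_index, array_of_example_indices) pairs; the final minibatch
    may be smaller when n is not divisible by minibatch_size.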
226 | """ 227 | idx_list = numpy.arange(n, dtype="int32") 228 | 229 | if shuffle: 230 | numpy.random.shuffle(idx_list) 231 | 232 | minibatches = [] 233 | minibatch_start = 0 234 | for i in range(n // minibatch_size): 235 | minibatches.append(idx_list[minibatch_start: 236 | minibatch_start + minibatch_size]) 237 | minibatch_start += minibatch_size 238 | 239 | if (minibatch_start != n): 240 | # Make a minibatch out of what is left 241 | minibatches.append(idx_list[minibatch_start:]) 242 | 243 | return zip(range(len(minibatches)), minibatches) 244 | 245 | -------------------------------------------------------------------------------- /code/asymmetric/python - model/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluation tools 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | compute AP 11 | """ 12 | ideal=set(ideal) 13 | accumulation=0.0 14 | count=0 15 | for i in range(len(test)): 16 | if i>=k: 17 | break 18 | if test[i] in ideal: 19 | count+=1 20 | accumulation+=count/(i+1.0) 21 | m=len(ideal) 22 | n=k 23 | x=0 24 | if m>n: 25 | x=n 26 | else: 27 | x=m 28 | if x==0: 29 | return 0 30 | return accumulation/x 31 | 32 | 33 | def get_MAP(k,ideal_map,test_map): 34 | """ 35 | compute MAP 36 | """ 37 | accumulation=0.0 38 | for key in ideal_map.keys(): 39 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 40 | if len(ideal_map)==0: 41 | return 0 42 | return accumulation/len(ideal_map) 43 | 44 | 45 | def get_nDCG(k,ideal,test): 46 | """ 47 | compute NDCG 48 | """ 49 | ideal=set(ideal) 50 | accumulation=0.0 51 | for i in range(len(test)): 52 | if i>=k: 53 | break 54 | if test[i] in ideal: 55 | if i==0: 56 | accumulation+=1.0 57 | else: 58 | accumulation+=1.0/numpy.log2(i+1) 59 | normalization=0.0 60 | for i in range(len(ideal)): 61 | if i>=k: 62 | break 63 | if i==0: 64 | normalization+=1.0 65 | else: 66 | normalization+=1.0/numpy.log2(i+1) 67 | if normalization==0: 68 | return 0 69 | return accumulation/normalization 70 | 71 | def get_MnDCG(k,ideal_map,test_map): 72 | """ 73 | compute mean NDCG 74 | """ 75 | accumulation=0.0 76 | for key in ideal_map.keys(): 77 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 78 | if len(ideal_map)==0: 79 | return 0 80 | return accumulation/len(ideal_map) 81 | 82 | 83 | if __name__=='__main__': 84 | ideal=['a'] 85 | test=['b','a'] 86 | k=10 87 | print get_nDCG(k, ideal, test) 88 | # ideal={'q':['a','b','c'],'p':['a','b','c','d','e']} 89 | # test={'q':['b','a','m','c','d','n'],'p':['b','a','m','c','d','n']} 90 | # k=4 91 | # print get_MnDCG(k, ideal, test) -------------------------------------------------------------------------------- /code/asymmetric/python - model/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Training one dataset and then test NDCG and MAP. 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | 10 | import proxEmbed 11 | import proxEmbedProcessAndAssess 12 | import os 13 | 14 | import ConfigParser 15 | import string, os, sys 16 | 17 | 18 | if __name__=='__main__': 19 | 20 | cf = ConfigParser.SafeConfigParser() 21 | # read the parameters file. 
22 | # cf.read("/usr/lzmExperiment/proxEmbed/paramsSet/pythonParamsConfig") 23 | cf.read("pythonParamsConfig") 24 | 25 | main_dir=cf.get("param", "root_dir") # main work dir 26 | dataset_name=cf.get("param", "dataset_name") # the name of one dataset 27 | suffix=cf.get("param", "suffix") # the suffix of dataset, such as 10,100,1000 28 | class_name=cf.get("param", "class_name") # the relation name of data 29 | index=cf.get("param", "index") # the index of the dataset file 30 | 31 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 32 | wordsEmbeddings=None # words embeddings 33 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # the file path of words embeddings 34 | word_dimension=cf.getint("param", "word_dimension") # dimension of words embeddings 35 | dimension=cf.getint("param", "dimension") # the dimension of paths embeddings 36 | wordsSize=cf.getint("param", "wordsSize") # the size of words vocabulary 37 | subpaths_map=None # contains sub-paths 38 | subpaths_file=cf.get("param", "subpaths_file") # the file which contains sub-paths 39 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 40 | h_output_method=cf.get("param", "h_output_method") # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 41 | maxlen=cf.getint("param", "maxlen") # Sequence longer than this get ignored 42 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 43 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 44 | discount_alpha=cf.getfloat("param", "discount_alpha") # the parameter alpha for discount. The longer the subpath, the little will the weight be. 45 | subpaths_pooling_method=cf.get("param", "subpaths_pooling_method") # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. 46 | objective_function_method=cf.get("param", "objective_function_method") # loss function, we use sigmoid 47 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter in loss function, beta 48 | lrate=cf.getfloat("param", "lrate") # learning rate 49 | max_epochs=cf.getint("param", "max_epochs") # the max epochs for training 50 | 51 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 52 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 53 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 
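    # With the ReadMe example (root_dir=./toy_data/, dataset_name=linkedin, suffix=4, class_name=school, index=1)
    # the paths above resolve to ./toy_data/linkedin.splits/train.4/train_school_1 for training and
    # ./toy_data/linkedin.trainModels/train.4/train_school_1.npz for the saved parameters.
    # Note that numpy.savez does not create missing directories, so the <dataset>.trainModels/train.<suffix>/
    # directory should exist before training starts.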
54 | 55 | # the normalization of this model, l2-norm of all parameters 56 | decay_lstm_W=cf.getfloat("param", "decay_lstm_W") 57 | decay_lstm_U=cf.getfloat("param", "decay_lstm_U") 58 | decay_lstm_b=cf.getfloat("param", "decay_lstm_b") 59 | decay_w=cf.getfloat("param", "decay_w") 60 | 61 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # the file of test data 62 | top_num=cf.getint("param", "top_num") # the top num to predict 63 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # the file of ground truth 64 | 65 | # 首先训练模型 66 | proxEmbed.proxEmbedTraining( 67 | trainingDataFile, 68 | wordsEmbeddings, 69 | wordsEmbeddings_path, 70 | word_dimension, 71 | dimension, 72 | wordsSize, 73 | subpaths_map, 74 | subpaths_file, 75 | maxlen_subpaths, 76 | h_output_method, 77 | maxlen, 78 | batch_size, 79 | is_shuffle_for_batch, 80 | discount_alpha, 81 | subpaths_pooling_method, 82 | objective_function_method, 83 | objective_function_param, 84 | lrate, 85 | max_epochs, 86 | 87 | dispFreq, 88 | saveFreq, 89 | saveto, 90 | 91 | decay_lstm_W, 92 | decay_lstm_U, 93 | decay_lstm_b, 94 | decay_w, 95 | ) 96 | 97 | # load the function which is trained beforehand 98 | computeFunc=proxEmbedProcessAndAssess.get_proxEmbedModel( 99 | saveto, 100 | word_dimension, 101 | dimension, 102 | h_output_method, 103 | discount_alpha, 104 | subpaths_pooling_method, 105 | ) 106 | # test the model 107 | MAP,MnDCG=proxEmbedProcessAndAssess.compute_proxEmbed( 108 | wordsEmbeddings, 109 | wordsEmbeddings_path, 110 | word_dimension, 111 | dimension, 112 | wordsSize, 113 | subpaths_map, 114 | subpaths_file, 115 | maxlen_subpaths, 116 | maxlen, 117 | 118 | test_data_file, 119 | top_num, 120 | ideal_data_file, 121 | func=computeFunc, 122 | ) 123 | 124 | print 'MAP==',MAP 125 | print 'MnDCG==',MnDCG -------------------------------------------------------------------------------- /code/asymmetric/python - model/lstmModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | from __future__ import print_function 4 | import six.moves.cPickle as pickle # @UnresolvedImport 5 | 6 | from collections import OrderedDict 7 | import sys 8 | import time 9 | 10 | import numpy 11 | import theano 12 | from theano import config 13 | import theano.tensor as tensor 14 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 15 | import toolsFunction 16 | 17 | def numpy_floatX(data): 18 | return numpy.asarray(data, dtype=config.floatX) # @UndefinedVariable 19 | 20 | 21 | def _p(pp, name): 22 | return '%s_%s' % (pp, name) 23 | 24 | 25 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 26 | """ 27 | generate lstm 28 | """ 29 | nsteps = state_below.shape[0] 30 | if state_below.ndim == 3: 31 | n_samples = state_below.shape[1] 32 | else: 33 | n_samples = 1 34 | 35 | assert mask is not None 36 | 37 | def _slice(_x, n, dim): 38 | if _x.ndim == 3: 39 | return _x[:, :, n * dim:(n + 1) * dim] 40 | return _x[:, n * dim:(n + 1) * dim] 41 | 42 | def _step(m_, x_, h_, c_): 43 | preact = tensor.dot(h_, tparams['lstm_U']) 44 | preact += x_ 45 | 46 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dimension'])) # input gate 47 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dimension'])) # forget gate 48 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dimension'])) # output gate 49 | c = tensor.tanh(_slice(preact, 3, options['dimension'])) # cell 50 | 51 
| c = f * c_ + i * c 52 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 53 | 54 | h = o * tensor.tanh(c) 55 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 56 | 57 | return h, c 58 | state_below = (tensor.dot(state_below, tparams['lstm_W']) + tparams['lstm_b']) 59 | 60 | dim_proj = options['dimension'] 61 | rval, updates = theano.scan(_step, 62 | sequences=[mask, state_below], 63 | outputs_info=[tensor.alloc(numpy_floatX(0.), 64 | n_samples, 65 | dim_proj), 66 | tensor.alloc(numpy_floatX(0.), 67 | n_samples, 68 | dim_proj)], 69 | name=_p(prefix, '_layers'), 70 | n_steps=nsteps) # maxlen 71 | return rval[0] 72 | 73 | 74 | 75 | def build_model(tparams, options, x, mask, wordsemb): 76 | """ 77 | build the model 78 | """ 79 | n_timesteps = x.shape[0] 80 | n_samples = x.shape[1] 81 | emb = wordsemb[x.flatten()].reshape([n_timesteps, 82 | n_samples, 83 | options['word_dimension']]) 84 | proj = lstm_layer(tparams, emb, options, 85 | prefix='lstm', 86 | mask=mask) 87 | output=None 88 | if options['h_output_method'] == 'h': # the last h as the output 89 | temp=proj[-1] 90 | output=temp[0] 91 | elif options['h_output_method'] == 'mean-pooling': # mean-pooling as the output 92 | temp1 = (proj * mask[:, :, None]).sum(axis=0) 93 | temp2 = temp1 / mask.sum(axis=0)[:, None] 94 | output=temp2[0] 95 | elif options['h_output_method'] == 'max-pooling': # max-pooling as the output 96 | temp1=proj * mask[:, :, None] 97 | temp2=temp1.sum(axis=1) 98 | output = temp2.max(axis=0) 99 | else : # default, the last h as the output 100 | temp=proj[-1] 101 | output=temp[0] 102 | return output 103 | 104 | 105 | # get lstm model by parameters 106 | def get_lstm( 107 | model_options, # the options parameters for the model 108 | tparams, # theano shared variables 109 | x, # a sub-path 110 | x_mask, # the mask of this sub-path 111 | wordsemb, # embeddings of all words 112 | ): 113 | 114 | # build the model 115 | proj = build_model(tparams, model_options, x, x_mask, wordsemb) 116 | return proj 117 | -------------------------------------------------------------------------------- /code/asymmetric/python - model/proxEmbed.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | import dataProcessTools 4 | import numpy 5 | import theano 6 | from theano import tensor 7 | from theano import config 8 | from collections import OrderedDict 9 | import time 10 | import six.moves.cPickle as pickle # @UnresolvedImport 11 | import proxEmbedModelMulti 12 | 13 | 14 | # Set the random number generators' seeds for consistency 15 | SEED = 123 16 | numpy.random.seed(SEED) 17 | 18 | 19 | def numpy_floatX(data): 20 | return numpy.asarray(data, dtype=config.floatX) # @UndefinedVariable 21 | 22 | def adadelta(lr, tparams, grads, fourPairs, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost): 23 | """ 24 | An adaptive learning rate optimizer adadelta 25 | 26 | Parameters 27 | ---------- 28 | lr : Theano SharedVariable 29 | Initial learning rate 30 | tpramas: Theano SharedVariable 31 | Model parameters 32 | grads: Theano variable 33 | Gradients of cost w.r.t to parameres 34 | x: Theano variable 35 | Model inputs 36 | mask: Theano variable 37 | Sequence mask 38 | y: Theano variable 39 | Targets 40 | cost: Theano variable 41 | Objective fucntion to minimize 42 | 43 | Notes 44 | ----- 45 | For more information, see [ADADELTA]_. 46 | 47 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 48 | Rate Method*, arXiv:1212.5701. 
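    Returns
    -------
    f_grad_shared: computes the cost for one minibatch and updates the accumulated gradient statistics.
    f_update: applies the ADADELTA parameter update (the lr argument is accepted but not used by the update rule).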
49 | """ 50 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 51 | name='%s_grad' % k) 52 | for k, p in tparams.items()] 53 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 54 | name='%s_rup2' % k) 55 | for k, p in tparams.items()] 56 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 57 | name='%s_rgrad2' % k) 58 | for k, p in tparams.items()] 59 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 60 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 61 | for rg2, g in zip(running_grads2, grads)] 62 | f_grad_shared = theano.function([fourPairs, subPaths_matrix, subPaths_mask, subPaths_lens, wemb], cost, updates=zgup + rg2up, 63 | on_unused_input='ignore', 64 | name='adadelta_f_grad_shared') 65 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 66 | for zg, ru2, rg2 in zip(zipped_grads, 67 | running_up2, 68 | running_grads2)] 69 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 70 | for ru2, ud in zip(running_up2, updir)] 71 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 72 | f_update = theano.function([lr], [], updates=ru2up + param_up, 73 | on_unused_input='ignore', 74 | name='adadelta_f_update') 75 | 76 | return f_grad_shared, f_update 77 | 78 | 79 | def ortho_weight(ndim): 80 | """ 81 | initialize a matrix 82 | """ 83 | W = numpy.random.randn(ndim, ndim) 84 | u, s, v = numpy.linalg.svd(W) 85 | return u.astype(config.floatX) # @UndefinedVariable 86 | 87 | def init_params_weight(row,column): 88 | """ 89 | initialize matrix parameters by row and column 90 | """ 91 | lstm_W = numpy.random.rand(row, column) 92 | return lstm_W.astype(config.floatX) # @UndefinedVariable 93 | 94 | 95 | def init_sharedVariables(options): 96 | """ 97 | initialize all the shared parameters 98 | """ 99 | print 'init shared Variables......' 
100 | params = OrderedDict() 101 | lstm_W=numpy.concatenate([ 102 | init_params_weight(options['word_dimension'],options['dimension']), 103 | init_params_weight(options['word_dimension'],options['dimension']), 104 | init_params_weight(options['word_dimension'],options['dimension']), 105 | init_params_weight(options['word_dimension'],options['dimension']) 106 | ],axis=1) 107 | params['lstm_W'] = lstm_W 108 | lstm_U = numpy.concatenate([ortho_weight(options['dimension']), 109 | ortho_weight(options['dimension']), 110 | ortho_weight(options['dimension']), 111 | ortho_weight(options['dimension'])], axis=1) 112 | params['lstm_U'] = lstm_U 113 | lstm_b = numpy.zeros((4 * options['dimension'],)) 114 | params['lstm_b'] = lstm_b.astype(config.floatX) # @UndefinedVariable 115 | w = numpy.random.rand(options['dimension'], ) 116 | params['w']=w.astype(config.floatX) # @UndefinedVariable 117 | 118 | return params 119 | 120 | 121 | def init_tparams(params): 122 | tparams = OrderedDict() 123 | for kk, pp in params.items(): 124 | tparams[kk] = theano.shared(params[kk], name=kk) 125 | return tparams 126 | 127 | def unzip(zipped): 128 | new_params = OrderedDict() 129 | for kk, vv in zipped.items(): 130 | new_params[kk] = vv.get_value() 131 | return new_params 132 | 133 | main_dir='D:/dataset/test/icde2016_metagraph/' 134 | def proxEmbedTraining( 135 | trainingDataFile=main_dir+'facebook.splits/train.10/train_classmate_1', # the full path of training data file 136 | wordsEmbeddings=None, # words embeddings 137 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', # the file path of words embeddings 138 | word_dimension=22, # dimension of words embeddings 139 | dimension=64, # the dimension of paths embeddings 140 | wordsSize=1000000, # the size of words vocabulary 141 | subpaths_map=None, # contains sub-paths 142 | subpaths_file=main_dir+'facebook/subpathsSaveFile',# the file which contains sub-paths 143 | maxlen_subpaths=1000, # the max length for sub-paths 144 | h_output_method='mean-pooling', # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 145 | maxlen=100, # Sequence longer then this get ignored 146 | batch_size=1, # use a batch for training. This is the size of this batch. 147 | is_shuffle_for_batch=False, # if need shuffle for training 148 | discount_alpha=0.1, # the parameter alpha for discount. The longer the subpath, the little will the weight be. 149 | subpaths_pooling_method='max-pooling', # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. 150 | objective_function_method='hinge-loss', # loss function, we use sigmoid 151 | objective_function_param=0, # the parameter in loss function, beta 152 | lrate=0.0001, # learning rate 153 | max_epochs=10, # the max epochs for training 154 | 155 | dispFreq=5, # the frequences for display 156 | saveFreq=5, # the frequences for saving the parameters 157 | saveto=main_dir+'facebook/proxEmbed-modelParams.npz', # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 
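    # (the defaults above and below are only placeholders; experimentForOneFileByParams.py overrides them
    #  with the values read from pythonParamsConfig)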
158 | 159 | # the normalization of this model, l2-norm of all parameters 160 | decay_lstm_W=0.01, 161 | decay_lstm_U=0.01, 162 | decay_lstm_b=0.01, 163 | decay_w=0.01, 164 | 165 | ): 166 | """ 167 | The training stage of ProxEmbed 168 | """ 169 | model_options = locals().copy() 170 | 171 | if wordsEmbeddings is None: 172 | if wordsEmbeddings_path is not None: 173 | wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 174 | else: 175 | print 'There is not path for wordsEmbeddings, exit!!!' 176 | exit(0) 177 | 178 | if subpaths_map is None: 179 | if subpaths_file is not None: 180 | subpaths_map=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 181 | else: 182 | print 'There is not path for sub-paths, exit!!!' 183 | exit(0) 184 | 185 | trainingData,trainingPairs=dataProcessTools.getTrainingData(trainingDataFile) 186 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 187 | 188 | params=init_sharedVariables(model_options) 189 | tparams=init_tparams(params) 190 | print 'Generate models ......' 191 | 192 | trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost=proxEmbedModelMulti.proxEmbedModel(model_options, tparams) 193 | 194 | print 'Generate gradients ......' 195 | grads=tensor.grad(cost,wrt=list(tparams.values())) 196 | print 'Using Adadelta to generate functions ......' 197 | lr = tensor.scalar(name='lr') 198 | f_grad_shared, f_update=adadelta(lr, tparams, grads, trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost) 199 | 200 | print 'Start training models ......' 201 | best_p = None 202 | history_cost=[] 203 | 204 | models_count=[0,0,0,0] 205 | 206 | start_time = time.time() 207 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 208 | uidx=0 209 | for eidx in range(max_epochs): 210 | for _, batch in allBatches: 211 | uidx += 1 212 | trainingDataForBatch=[trainingData[i] for i in batch] 213 | trainingPairsForBatch=[trainingPairs[i] for i in batch] 214 | triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data=dataProcessTools.prepareDataForTraining(trainingDataForBatch, trainingPairsForBatch, subpaths_map) 215 | cost=0 216 | cost=f_grad_shared(triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data,wordsEmbeddings) 217 | f_update(lrate) 218 | 219 | if numpy.isnan(cost) or numpy.isinf(cost): 220 | print('bad cost detected: ', cost) 221 | return 222 | if numpy.mod(uidx, dispFreq) == 0: 223 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 224 | print 'models_count ==',models_count 225 | if saveto and numpy.mod(uidx, saveFreq) == 0: 226 | print('Saving...') 227 | if best_p is not None: 228 | params = best_p 229 | else: 230 | params = unzip(tparams) 231 | numpy.savez(saveto, history_errs=history_cost, **params) 232 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 233 | print('Done') 234 | end_time = time.time() 235 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 236 | print 'Training finished! Cost time == ', end_time-start_time,' s' 237 | 238 | 239 | if __name__=='__main__': 240 | print 'Start running proxEmbedTraining......' 
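    # running this module directly trains with the hard-coded facebook defaults above;
    # in normal use, training is driven by experimentForOneFileByParams.py instead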
241 | proxEmbedTraining() -------------------------------------------------------------------------------- /code/asymmetric/python - model/proxEmbedModelMulti.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Generate ProxEmbed Model 4 | ''' 5 | import numpy 6 | import theano 7 | from theano import tensor 8 | import lstmModel 9 | from theano.ifelse import ifelse 10 | 11 | 12 | def proxEmbedModel(model_options,tparams): 13 | """ 14 | generate proxEmbed model 15 | """ 16 | trainingParis=tensor.tensor3('trainingParis',dtype='int64') 17 | subPaths_matrix=tensor.matrix('subPaths_matrix',dtype='int64') 18 | subPaths_mask=tensor.matrix('subPaths_mask',dtype=theano.config.floatX) # @UndefinedVariable 19 | subPaths_lens=tensor.vector('subPaths_lens',dtype='int64') 20 | wordsEmbeddings=tensor.matrix('wordsEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable 21 | 22 | def _processTriple(fourPairs,lossSum): 23 | 24 | def _processSubpath(index): 25 | length=subPaths_lens[index] 26 | x=subPaths_matrix[:length,index:index+1] 27 | x_mask=subPaths_mask[:length,index:index+1] 28 | emb=lstmModel.get_lstm(model_options, tparams, x, x_mask, wordsEmbeddings) 29 | emb=emb*discountModel(model_options['discount_alpha'], length) 30 | return emb 31 | 32 | def iftFunc(): 33 | embx=numpy.zeros(model_options['dimension'],) 34 | embx.astype(theano.config.floatX) # @UndefinedVariable 35 | return embx 36 | 37 | def iffFunc(start,end): 38 | embx=None 39 | rval,update=theano.scan( 40 | _processSubpath, 41 | sequences=tensor.arange(start,end), 42 | ) 43 | if model_options['subpaths_pooling_method']=='mean-pooling': # mean-pooling 44 | embx = rval.sum(axis=0) 45 | embx = embx / rval.shape[0] 46 | elif model_options['subpaths_pooling_method']=='max-pooling': # max-pooling 47 | embx = rval.max(axis=0) 48 | else: # default, mean-pooling 49 | embx = rval.sum(axis=0) 50 | embx = embx / rval.shape[0] 51 | 52 | return embx 53 | 54 | start=fourPairs[0][0] 55 | end=fourPairs[0][1] 56 | emb1=None 57 | emb1=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) # 先选一个,然后计算这个值 58 | 59 | start=fourPairs[2][0] 60 | end=fourPairs[2][1] 61 | emb2=None 62 | emb2=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end)) # 先选一个,然后计算这个值 63 | 64 | loss=0 65 | param=model_options['objective_function_param'] 66 | if model_options['objective_function_method']=='sigmoid': 67 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # sigmoid 68 | 69 | return loss+lossSum 70 | 71 | rval,update=theano.scan( 72 | _processTriple, 73 | sequences=trainingParis, 74 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable # 输出是这个triple的loss 75 | ) 76 | cost=rval[-1] 77 | cost+=model_options['decay_lstm_W']*(tparams['lstm_W'] ** 2).sum() 78 | cost+=model_options['decay_lstm_U']*(tparams['lstm_U'] ** 2).sum() 79 | cost+=model_options['decay_lstm_b']*(tparams['lstm_b'] ** 2).sum() 80 | cost+=model_options['decay_w']*(tparams['w'] ** 2).sum() 81 | return trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens , wordsEmbeddings, cost 82 | 83 | 84 | def discountModel(alpha,length): 85 | """ 86 | discount model 87 | """ 88 | return tensor.exp(alpha*length*(-1)) 89 | 90 | def numpy_floatX(data): 91 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable -------------------------------------------------------------------------------- /code/asymmetric/python - 
model/proxEmbedProcessAndAssess.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | process dataset by proxEmbed model and then assess 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | from collections import OrderedDict 10 | import proxEmbedProcessModel 11 | import dataProcessTools 12 | import toolsFunction 13 | import evaluateTools 14 | 15 | 16 | def load_params(path, params): 17 | """ 18 | load model params from file 19 | """ 20 | pp = numpy.load(path) 21 | for kk, vv in params.items(): 22 | if kk not in pp: 23 | raise Warning('%s is not in the archive' % kk) 24 | params[kk] = pp[kk] 25 | 26 | return params 27 | 28 | 29 | def get_proxEmbedModel( 30 | 31 | model_params_path='', # the path of model parameters 32 | word_dimension=0, # the dimension of words embedding 33 | dimension=0, # the dimension of path embedding 34 | h_output_method='h', # the output way of lstm 35 | discount_alpha=0.1, # discount alpha 36 | subpaths_pooling_method='max-pooling', # the combine way of sub-paths 37 | ): 38 | """ 39 | get model from file 40 | """ 41 | model_options = locals().copy() 42 | 43 | tparams = OrderedDict() 44 | tparams['lstm_W']=None 45 | tparams['lstm_U']=None 46 | tparams['lstm_b']=None 47 | tparams['w']=None 48 | tparams=load_params(model_params_path, tparams) 49 | 50 | subPaths_matrix,subPaths_mask,subPaths_lens,wemb,score=proxEmbedProcessModel.proxEmbedModel(model_options, tparams) 51 | func=theano.function([subPaths_matrix,subPaths_mask,subPaths_lens,wemb], score) 52 | 53 | return func 54 | 55 | 56 | def compute_proxEmbed( 57 | wordsEmbeddings=None, # words embeddings 58 | wordsEmbeddings_path=None, # the file path of words embeddings 59 | word_dimension=0, # dimension of words embeddings 60 | dimension=0, # the dimension of paths embeddings 61 | wordsSize=0, # the size of words vocabulary 62 | subpaths_map=None, # contains sub-paths 63 | subpaths_file=None, # the file which contains sub-paths 64 | maxlen_subpaths=1000, # the max length for sub-paths 65 | maxlen=100, # Sequence longer then this get ignored 66 | 67 | test_data_file='', # the file path of test data 68 | top_num=10, # the top num to predict 69 | ideal_data_file='', # ground truth 70 | func=None, # model function 71 | ): 72 | """ 73 | compute the result of the model 74 | """ 75 | 76 | model_options = locals().copy() 77 | 78 | if wordsEmbeddings is None: 79 | if wordsEmbeddings_path is not None: 80 | wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 81 | else: 82 | print 'There is not path for wordsEmbeddings, exit!!!' 83 | exit(0) 84 | 85 | if subpaths_map is None: 86 | if subpaths_file is not None: 87 | subpaths_map=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 88 | else: 89 | print 'There is not path for sub-paths, exit!!!' 90 | exit(0) 91 | 92 | line_count=0 93 | test_map={} 94 | print 'Compute MAP and nDCG for file ',test_data_file 95 | with open(test_data_file) as f: 96 | for l in f: 97 | arr=l.strip().split() 98 | query=int(arr[0]) 99 | map={} 100 | for i in range(1,len(arr)): 101 | candidate=int(arr[i]) 102 | subPaths_matrix_data,subPaths_mask_data,subPaths_lens_data=dataProcessTools.prepareDataForTestAsymmetric(query, candidate, subpaths_map) 103 | if subPaths_matrix_data is None and subPaths_mask_data is None and subPaths_lens_data is None: 104 | map[candidate]=-1000. 
105 | else: 106 | value=func(subPaths_matrix_data,subPaths_mask_data,subPaths_lens_data,wordsEmbeddings) 107 | map[candidate]=value 108 | 109 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num) 110 | test_map[line_count]=tops_in_line 111 | line_count+=1 112 | 113 | line_count=0 114 | ideal_map={} 115 | with open(ideal_data_file) as f: 116 | for l in f: 117 | arr=l.strip().split() 118 | arr=[int(x) for x in arr] 119 | ideal_map[line_count]=arr[1:] 120 | line_count+=1 121 | 122 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map) 123 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map) 124 | 125 | return MAP,MnDCG 126 | -------------------------------------------------------------------------------- /code/asymmetric/python - model/proxEmbedProcessModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | proxEmbed model for compute some dataset 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | import lstmModel 10 | 11 | 12 | def proxEmbedModel(model_options,tparams): 13 | """ 14 | build ProxEmbed model 15 | """ 16 | subPaths_matrix=tensor.matrix('subPaths_matrix',dtype='int64') 17 | subPaths_mask=tensor.matrix('subPaths_mask',dtype=theano.config.floatX) # @UndefinedVariable 18 | subPaths_lens=tensor.vector('subPaths_lens',dtype='int64') 19 | wordsEmbeddings=tensor.matrix('wordsEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable 20 | 21 | def _processSubpath(index): 22 | length=subPaths_lens[index] 23 | x=subPaths_matrix[:length,index:index+1] 24 | x_mask=subPaths_mask[:length,index:index+1] 25 | emb=lstmModel.get_lstm(model_options, tparams, x, x_mask, wordsEmbeddings) 26 | emb=emb*discountModel(model_options['discount_alpha'], length) 27 | return emb 28 | 29 | rval,update=theano.scan( 30 | _processSubpath, 31 | sequences=tensor.arange(subPaths_lens.shape[0]), 32 | ) 33 | emb=0 34 | if model_options['subpaths_pooling_method']=='mean-pooling': # mean-pooling 35 | emb = rval.sum(axis=0) 36 | emb = emb / rval.shape[0] 37 | elif model_options['subpaths_pooling_method']=='max-pooling': # max-pooling 38 | emb = rval.max(axis=0) 39 | else: # default, mean-pooling 40 | emb = rval.sum(axis=0) 41 | emb = emb / rval.shape[0] 42 | 43 | score=tensor.dot(emb,tparams['w']) 44 | 45 | return subPaths_matrix,subPaths_mask,subPaths_lens,wordsEmbeddings,score 46 | 47 | 48 | def discountModel(alpha,length): 49 | """ 50 | discount model 51 | """ 52 | return tensor.exp(alpha*length*(-1)) 53 | -------------------------------------------------------------------------------- /code/asymmetric/python - model/pythonParamsConfig: -------------------------------------------------------------------------------- 1 | [param] 2 | 3 | ############################################ 4 | # training data dictory 5 | ############################################ 6 | # main work dir 7 | root_dir = D:/test/test/toydata 8 | # the name of one dataset, such as linkedin, dblp 9 | dataset_name = dblp 10 | # the suffix of dataset, such as 10,100,1000 11 | suffix = 4 12 | # the relation name of data, such as classmate,family 13 | class_name = advisor 14 | # the index of the dataset file 15 | index = 3 16 | 17 | ############################################ 18 | # paths for some prepared data 19 | ############################################ 20 | # the file path of words embeddings 21 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/nodesFeatures 22 | # the file which contains sub-paths 23 | subpaths_file = 
%(root_dir)s/%(dataset_name)s/subpathsSaveFile 24 | 25 | ############################################ 26 | # experiment parameters - do not need to change frequently 27 | ############################################ 28 | # the max length for sub-paths 29 | maxlen_subpaths = 1000 30 | # the size of words vocabulary 31 | wordsSize = 1000000 32 | # Sequence longer than this get ignored 33 | maxlen = 1000 34 | # use a batch for training. This is the size of this batch. 35 | batch_size = 4 36 | # if need shuffle for training 37 | is_shuffle_for_batch = True 38 | # the frequences for display 39 | dispFreq = 2 40 | # the frequences for saving the parameters 41 | saveFreq = 2 42 | # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. It will be generated in the code. 43 | saveto = 44 | # the top num to predict 45 | top_num = 1 46 | 47 | ############################################ 48 | # experiment parameters - need to tune 49 | ############################################ 50 | # learning rate 51 | lrate = 0.0001 52 | # dimension of words embeddings 53 | word_dimension = 12 54 | # the dimension of paths embeddings 55 | dimension = 5 56 | # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 57 | h_output_method = max-pooling 58 | # the parameter alpha for discount. The longer the subpath, the little will the weight be. 59 | discount_alpha = 0.3 60 | # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. 61 | subpaths_pooling_method = max-pooling 62 | # loss function, we use sigmoid 63 | objective_function_method = sigmoid 64 | # the parameter in loss function, beta 65 | objective_function_param = 0.5 66 | # the max epochs for training 67 | max_epochs = 50 68 | # decay for lstm_W 69 | decay_lstm_W = 0.0001 70 | # decay for lstm_U 71 | decay_lstm_U = 0.0001 72 | # decay for lstm_b 73 | decay_lstm_b = 0.0001 74 | # decay for w 75 | decay_w = 0.0001 76 | -------------------------------------------------------------------------------- /code/asymmetric/python - model/toolsFunction.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | some tools methods 4 | ''' 5 | 6 | def mapSortByValueDESC(map,top): 7 | """ 8 | sort by value desc 9 | """ 10 | if top>len(map): 11 | top=len(map) 12 | items=map.items() 13 | backitems=[[v[1],v[0]] for v in items] 14 | backitems.sort(reverse=True) 15 | e=[ backitems[i][1] for i in range(top)] 16 | return e 17 | 18 | 19 | def mapSortByValueASC(map,top): 20 | """ 21 | sort by value asc 22 | """ 23 | if top>len(map): 24 | top=len(map) 25 | items=map.items() 26 | backitems=[[v[1],v[0]] for v in items] 27 | backitems.sort() 28 | e=[ backitems[i][1] for i in range(top)] 29 | return e 30 | 31 | -------------------------------------------------------------------------------- /code/symmetric/java - prepare data for model/Config.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.FileInputStream; 4 | import java.util.Properties; 5 | 6 | /** 7 | * Parameters for ProxEmbed to prepare data for training. 
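 * Note: the values assigned below are only compile-time defaults; the static
 * initializer at the bottom of this class overwrites them with the values
 * read from javaParams.properties.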
8 | */ 9 | public class Config { 10 | 11 | /** 12 | * The main dataset directory. 13 | * Eg, "D:/dataset/icde2016/dataset/linkedin/" means the directory of linkedin. 14 | */ 15 | public static String MAIN_DIR="D:/test/test/toydata/linkedin/"; 16 | /** 17 | * Path of nodes file. 18 | */ 19 | public static String NODES_PATH=MAIN_DIR+"graph.node"; 20 | /** 21 | * Path of edges file. 22 | */ 23 | public static String EDGES_PATH=MAIN_DIR+"graph.edge"; 24 | /** 25 | * The result file of random walk sampling. 26 | */ 27 | public static String SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS=MAIN_DIR+"randomWalkSamplingPaths"; 28 | /** 29 | * The file which contains the map relation of type and typeid. 30 | */ 31 | public static String TYPE_TYPEID_SAVEFILE=MAIN_DIR+"typeAndTypeIDSavePath"; 32 | /** 33 | * The file which contains the nodes features. 34 | */ 35 | public static String NODES_FEATURE_SAVE_PATH=MAIN_DIR+"nodesFeatures"; 36 | /** 37 | * Truncate sub-paths from samplings by this type. 38 | */ 39 | public static String TRUNCATED_TYPE_NAME="user"; 40 | /** 41 | * Sub-paths save path. 42 | */ 43 | public static String SUBPATHS_SAVE_PATH=MAIN_DIR+"subpathsSaveFile"; 44 | /** 45 | * The longest length for sampling to truncate sub-paths. 46 | */ 47 | public static int LONGEST_ANALYSE_LENGTH_FOR_SAMPLING=20; 48 | /** 49 | * Longest length for sub-paths 50 | */ 51 | public static int LONGEST_LENGTH_FOR_SUBPATHS=5; 52 | /** 53 | * The shortest length for each path in sampling results. 54 | */ 55 | public static int SHORTEST_LENGTH_FOR_SAMPLING=0; 56 | /** 57 | * Sampling times for per node in random walk sampling. 58 | */ 59 | public static int SAMPLING_TIMES_PER_NODE=5; 60 | /** 61 | * Sampling length for per node in random walk sampling. 62 | */ 63 | public static int SAMPLING_LENGTH_PER_PATH=5; 64 | /** 65 | * When generate user features by neighbours' information, the value we set for type information when this node belongs to this kind of type. 
66 | */ 67 | public static double FEATURE_TYPE_VALUE=1.0; 68 | 69 | //initialize 70 | static{ 71 | 72 | Properties prop = new Properties(); 73 | FileInputStream in=null; 74 | try { 75 | //The path of properties file 76 | in = new FileInputStream("/usr/lzmExperiment/path2vec/paramsSet/javaParams.properties"); 77 | prop.load(in); 78 | 79 | MAIN_DIR=prop.getProperty("MAIN_DIR"); 80 | NODES_PATH=MAIN_DIR+prop.getProperty("NODES_PATH"); 81 | EDGES_PATH=MAIN_DIR+prop.getProperty("EDGES_PATH"); 82 | SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS=MAIN_DIR+prop.getProperty("SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS"); 83 | TYPE_TYPEID_SAVEFILE=MAIN_DIR+prop.getProperty("TYPE_TYPEID_SAVEFILE"); 84 | NODES_FEATURE_SAVE_PATH=MAIN_DIR+prop.getProperty("NODES_FEATURE_SAVE_PATH"); 85 | TRUNCATED_TYPE_NAME=prop.getProperty("TRUNCATED_TYPE_NAME"); 86 | SUBPATHS_SAVE_PATH=MAIN_DIR+prop.getProperty("SUBPATHS_SAVE_PATH"); 87 | LONGEST_ANALYSE_LENGTH_FOR_SAMPLING=Integer.parseInt(prop.getProperty("LONGEST_ANALYSE_LENGTH_FOR_SAMPLING")); 88 | LONGEST_LENGTH_FOR_SUBPATHS=Integer.parseInt(prop.getProperty("LONGEST_LENGTH_FOR_SUBPATHS")); 89 | SHORTEST_LENGTH_FOR_SAMPLING=Integer.parseInt(prop.getProperty("SHORTEST_LENGTH_FOR_SAMPLING")); 90 | SAMPLING_TIMES_PER_NODE=Integer.parseInt(prop.getProperty("SAMPLING_TIMES_PER_NODE")); 91 | SAMPLING_LENGTH_PER_PATH=Integer.parseInt(prop.getProperty("SAMPLING_LENGTH_PER_PATH")); 92 | FEATURE_TYPE_VALUE=Double.parseDouble(prop.getProperty("FEATURE_TYPE_VALUE")); 93 | 94 | //Print these parameters 95 | System.out.println("Java parameters is :"); 96 | System.out.println("MAIN_DIR : "+MAIN_DIR); 97 | System.out.println("NODES_PATH : "+NODES_PATH); 98 | System.out.println("EDGES_PATH : "+EDGES_PATH); 99 | System.out.println("SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS : "+SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS); 100 | System.out.println("TYPE_TYPEID_SAVEFILE : "+TYPE_TYPEID_SAVEFILE); 101 | System.out.println("NODES_FEATURE_SAVE_PATH : "+NODES_FEATURE_SAVE_PATH); 102 | System.out.println("TRUNCATED_TYPE_NAME : "+TRUNCATED_TYPE_NAME); 103 | System.out.println("SUBPATHS_SAVE_PATH : "+SUBPATHS_SAVE_PATH); 104 | System.out.println("LONGEST_ANALYSE_LENGTH_FOR_SAMPLING : "+LONGEST_ANALYSE_LENGTH_FOR_SAMPLING); 105 | System.out.println("LONGEST_LENGTH_FOR_SUBPATHS : "+LONGEST_LENGTH_FOR_SUBPATHS); 106 | System.out.println("SHORTEST_LENGTH_FOR_SAMPLING : "+SHORTEST_LENGTH_FOR_SAMPLING); 107 | System.out.println("SAMPLING_TIMES_PER_NODE : "+SAMPLING_TIMES_PER_NODE); 108 | System.out.println("SAMPLING_LENGTH_PER_PATH : "+SAMPLING_LENGTH_PER_PATH); 109 | System.out.println("FEATURE_TYPE_VALUE : "+FEATURE_TYPE_VALUE); 110 | } catch (Exception e) { 111 | e.printStackTrace(); 112 | } 113 | finally{ 114 | try { 115 | if(in!=null){ 116 | in.close(); 117 | in=null; 118 | } 119 | } catch (Exception e2) { 120 | e2.printStackTrace(); 121 | } 122 | } 123 | 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /code/symmetric/java - prepare data for model/GenerateEntitiesFeatureByTypes.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.FileWriter; 4 | import java.util.HashMap; 5 | import java.util.HashSet; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | /** 10 | * Generate entity features by information from neighbours -- just for asymmetric 11 | */ 12 | public class GenerateEntitiesFeatureByTypes { 13 | 14 | private Set types=new HashSet(); 15 | private Map type2Typeid=new 
HashMap(); 16 | private Map typeid2Type=new HashMap(); 17 | 18 | static String nodes_path=Config.NODES_PATH; 19 | static String edges_path=Config.EDGES_PATH; 20 | static String entities_feature_file=Config.NODES_FEATURE_SAVE_PATH; 21 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 22 | static double feature_type_value=Config.FEATURE_TYPE_VALUE; 23 | 24 | public static void main(String[] args) { 25 | ReadWholeGraph rwg=new ReadWholeGraph(); 26 | Map graph=rwg.readDataFromFile(nodes_path, edges_path, typeAndTypeIdPath); 27 | GenerateEntitiesFeatureByTypes gefb=new GenerateEntitiesFeatureByTypes(); 28 | gefb.analyseTypes(graph); 29 | gefb.generateFeaturesByGraph(graph, entities_feature_file,feature_type_value); 30 | } 31 | 32 | 33 | /** 34 | * Analyse this graph. 35 | * @param graph 36 | */ 37 | public void analyseTypes(Map graph){ 38 | for(Node n:graph.values()){ 39 | types.add(n.getType()); 40 | if(!type2Typeid.containsKey(n.getType())){ 41 | type2Typeid.put(n.getType(), type2Typeid.size()); 42 | typeid2Type.put(typeid2Type.size(), n.getType()); 43 | } 44 | } 45 | } 46 | 47 | /** 48 | * Generate nodes features. 49 | * @param graph 50 | * @param saveFile 51 | */ 52 | public void generateFeaturesByGraph(Map graph,String saveFile,double typeValue){ 53 | int dimension=types.size(); 54 | int nodesNum=graph.size(); 55 | StringBuilder sb=new StringBuilder(); 56 | String type=null; 57 | int typeId=0; 58 | Map typesNum=new HashMap(); 59 | FileWriter writer = null; 60 | try { 61 | writer = new FileWriter(saveFile); 62 | writer.write(nodesNum+" "+dimension+"\r\n"); 63 | writer.flush(); 64 | for(Node now:graph.values()){ 65 | sb.delete( 0, sb.length() ); 66 | typesNum.clear(); 67 | 68 | sb.append(now.getId()+" "); 69 | type=now.getType(); 70 | typeId=type2Typeid.get(type); 71 | 72 | for(int i=0;i types=new HashSet(); 15 | private Map type2Typeid=new HashMap(); 16 | private Map typeid2Type=new HashMap(); 17 | static String nodes_path=Config.NODES_PATH; 18 | static String edges_path=Config.EDGES_PATH; 19 | static String entities_feature_file=Config.NODES_FEATURE_SAVE_PATH; 20 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 21 | static double feature_type_value=Config.FEATURE_TYPE_VALUE; 22 | 23 | public static void main(String[] args) { 24 | ReadWholeGraph rwg=new ReadWholeGraph(); 25 | Map graph=rwg.readDataFromFile(nodes_path, edges_path, typeAndTypeIdPath); 26 | GenerateEntitiesFeaturesByGraph gefb=new GenerateEntitiesFeaturesByGraph(); 27 | gefb.analyseTypes(graph); 28 | gefb.generateFeaturesByGraph(graph, entities_feature_file,feature_type_value); 29 | } 30 | 31 | /** 32 | * Analyse nodes types 33 | */ 34 | public void analyseTypes(Map graph){ 35 | for(Node n:graph.values()){ 36 | types.add(n.getType()); 37 | if(!type2Typeid.containsKey(n.getType())){ 38 | type2Typeid.put(n.getType(), type2Typeid.size()); 39 | typeid2Type.put(typeid2Type.size(), n.getType()); 40 | } 41 | } 42 | } 43 | 44 | /** 45 | * Generate nodes features 46 | */ 47 | public void generateFeaturesByGraph(Map graph,String saveFile,double typeValue){ 48 | int dimension=types.size()+1+types.size()+1; 49 | int nodesNum=graph.size(); 50 | StringBuilder sb=new StringBuilder(); 51 | String type=null; 52 | int typeId=0; 53 | double value=0; 54 | double sum=0; 55 | Map typesNum=new HashMap(); 56 | FileWriter writer = null; 57 | try { 58 | writer = new FileWriter(saveFile); 59 | writer.write(nodesNum+" "+dimension+"\r\n"); 60 | writer.flush(); 61 | for(Node now:graph.values()){ 62 | sb.delete( 0, sb.length() ); 
63 | typesNum.clear(); 64 | 65 | sb.append(now.getId()+" "); 66 | type=now.getType(); 67 | typeId=type2Typeid.get(type); 68 | 69 | for(int i=0;i truncatedNodeIds=new HashSet(); 44 | Set truncatedTypes=new HashSet(); 45 | String[] arr=truncatedNodeType.split(" "); 46 | truncatedTypes.addAll(Arrays.asList(arr)); 47 | BufferedReader br=null; 48 | arr=null; 49 | try { 50 | br = new BufferedReader(new InputStreamReader(new FileInputStream(nodesPath), "UTF-8")); 51 | String temp = null; 52 | while ((temp = br.readLine()) != null ) { 53 | temp=temp.trim(); 54 | if(temp.length()>0){ 55 | arr=temp.split(" "); 56 | if(truncatedTypes.contains(arr[1])){ 57 | truncatedNodeIds.add(Integer.parseInt(arr[0])); 58 | } 59 | } 60 | } 61 | } catch (Exception e2) { 62 | e2.printStackTrace(); 63 | } 64 | finally{ 65 | try { 66 | if(br!=null){ 67 | br.close(); 68 | br=null; 69 | } 70 | } catch (IOException e) { 71 | e.printStackTrace(); 72 | } 73 | } 74 | FileWriter writer =null; 75 | String t=null; 76 | List path=new ArrayList(); 77 | try { 78 | br = new BufferedReader(new InputStreamReader(new FileInputStream(samplingsPath), "UTF-8")); 79 | writer = new FileWriter(subPathsSavePath); 80 | String temp = null; 81 | while ((temp = br.readLine()) != null ) { 82 | temp=temp.trim(); 83 | if(temp.length()>0){ 84 | path.clear(); 85 | arr=temp.split(" "); 86 | for(String s:arr){ 87 | path.add(Integer.parseInt(s)); 88 | } 89 | t=analyseOnePath(path, truncatedNodeIds, window_maxlen, subpath_maxlen); 90 | if(t.length()>0){ 91 | writer.write(t); 92 | writer.flush(); 93 | } 94 | } 95 | } 96 | } catch (Exception e2) { 97 | e2.printStackTrace(); 98 | } 99 | finally{ 100 | try { 101 | if(writer!=null){ 102 | writer.close(); 103 | writer=null; 104 | } 105 | if(br!=null){ 106 | br.close(); 107 | br=null; 108 | } 109 | } catch (IOException e) { 110 | e.printStackTrace(); 111 | } 112 | } 113 | } 114 | 115 | /** 116 | * Generate sub-paths by one specific sampling path. 
117 | */ 118 | private String analyseOnePath(List path,Set truncatedNodeIds,int maxWindowLen,int maxSubpathLen){ 119 | StringBuilder sb=new StringBuilder(); 120 | int start=0; 121 | int end=0; 122 | List subpath=new ArrayList(); 123 | for(int i=0;i0 && (j-i)>maxWindowLen){ 135 | break; 136 | } 137 | 138 | subpath.clear(); 139 | for(int x=i;x<=j;x++){ 140 | subpath.add(path.get(x)+0); 141 | } 142 | List subpathNoRepeat=deleteRepeat(subpath); 143 | if(subpathNoRepeat.size()<2){ 144 | subpathNoRepeat=null; 145 | continue; 146 | } 147 | 148 | if(maxSubpathLen>0 && subpathNoRepeat.size()>maxSubpathLen){ 149 | continue; 150 | } 151 | 152 | sb.append(path.get(i)+" "+path.get(j)+" "); 153 | for(int x=0;x deleteRepeat(List path){ 167 | Map map=new HashMap(); 168 | int node=0; 169 | List result=new ArrayList(); 170 | int formerIndex=0; 171 | for(int i=0;i in_nodes=new ArrayList(); 22 | public List out_nodes=new ArrayList(); 23 | public List in_ids=new ArrayList(); 24 | public List out_ids=new ArrayList(); 25 | public Map>> typePaths=new HashMap>>(); 26 | public Set neighbours=new HashSet(); 27 | 28 | public int getId() { 29 | return id; 30 | } 31 | 32 | public void setId(int id) { 33 | this.id = id; 34 | } 35 | 36 | public String getType() { 37 | return type; 38 | } 39 | 40 | public void setType(String type) { 41 | this.type = type; 42 | } 43 | 44 | public int getTypeId() { 45 | return typeId; 46 | } 47 | 48 | public void setTypeId(int typeId) { 49 | this.typeId = typeId; 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return this.id; 55 | } 56 | 57 | @Override 58 | public boolean equals(Object obj) { 59 | if(obj instanceof Node){ 60 | Node node=(Node) obj; 61 | if(node.getId()==this.id){ 62 | return true; 63 | } 64 | } 65 | return false; 66 | } 67 | 68 | @Override 69 | public String toString() { 70 | return "[id="+id+",neighbours=["+getNeighboursInfo()+"]]"; 71 | } 72 | 73 | private String getNeighboursInfo(){ 74 | StringBuilder sb=new StringBuilder(); 75 | if(neighbours.size()==0){ 76 | return ""; 77 | } 78 | else{ 79 | for(Node n:neighbours){ 80 | sb.append(n.id+","); 81 | } 82 | return sb.toString(); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /code/symmetric/java - prepare data for model/RandomWalkSampling.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Random; 11 | import java.util.Set; 12 | 13 | 14 | /** 15 | * Generate samplings by random walk samplings. 16 | * 17 | * Procedure: 18 | * 1.Read the whole graph 19 | * 2.Generate samplings by random walk. 
20 | */ 21 | public class RandomWalkSampling { 22 | 23 | /** 24 | * Random number generator 25 | */ 26 | private Random random=new Random(123); 27 | 28 | static String nodesPath=Config.NODES_PATH; 29 | static String edgesPath=Config.EDGES_PATH; 30 | static String savePath=Config.SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS; 31 | static int K=Config.SAMPLING_TIMES_PER_NODE; 32 | static int L=Config.SAMPLING_LENGTH_PER_PATH; 33 | static String typeAndTypeIdPath=Config.TYPE_TYPEID_SAVEFILE; 34 | static int shortest_path_length=Config.SHORTEST_LENGTH_FOR_SAMPLING; 35 | 36 | public static void main(String[] args) { 37 | ReadWholeGraph rwg=new ReadWholeGraph(); 38 | //1.Read the whole graph 39 | Map data=rwg.readDataFromFile( 40 | nodesPath, 41 | edgesPath, 42 | typeAndTypeIdPath); 43 | //2.Generate samplings by random walk. 44 | RandomWalkSampling crws=new RandomWalkSampling(); 45 | crws.randomWalkSampling(data, K, L, savePath); 46 | } 47 | 48 | /** 49 | * Generate samplings by random walk. 50 | * @param data 51 | * @param k 52 | * @param l 53 | * @param pathsFile 54 | */ 55 | public void randomWalkSampling(Map data,int k,int l,String pathsFile){ 56 | List path=null; 57 | FileWriter writer=null; 58 | StringBuilder sb=new StringBuilder(); 59 | try { 60 | writer=new FileWriter(pathsFile); 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | for(Node node:data.values()){ 65 | for(int i=0;i randomWalkPath(Node start,int l, Map data){ 94 | List path=new ArrayList(l+1); 95 | path.add(start); 96 | Node now=start; 97 | Set types_set=new HashSet(); 98 | List types=new ArrayList(); 99 | Map> neighbours=new HashMap>(); 100 | int type=-1; 101 | List list=null; 102 | for(int i=0;i ids=new ArrayList(); 116 | ids.add(n.getId()); 117 | neighbours.put(n.getTypeId(), ids); 118 | } 119 | } 120 | types.addAll(types_set); 121 | type=types.get(random.nextInt(types.size())); 122 | list=neighbours.get(type); 123 | now=data.get(list.get(random.nextInt(list.size()))); 124 | path.add(now); 125 | } 126 | return path; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /code/symmetric/java - prepare data for model/ReadWholeGraph.java: -------------------------------------------------------------------------------- 1 | package dataPrepare.ProxEmbed; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * Read the while graph, and then save the info into Map 13 | */ 14 | public class ReadWholeGraph { 15 | 16 | static Map typeid2Type=new HashMap(); 17 | static Map type2Typeid=new HashMap(); 18 | 19 | /** 20 | * Read whole graph info 21 | * @param nodesPath 22 | * @param edgesPath 23 | * @param typeAndTypeIdPath 24 | * @return 25 | */ 26 | public Map readDataFromFile(String nodesPath,String edgesPath,String typeAndTypeIdPath){ 27 | Map data=new HashMap(); 28 | BufferedReader br=null; 29 | String[] arr=null; 30 | Node node=null; 31 | try { 32 | br = new BufferedReader(new InputStreamReader(new FileInputStream(nodesPath), "UTF-8")); 33 | String temp = null; 34 | while ((temp = br.readLine()) != null ) { 35 | temp=temp.trim(); 36 | if(temp.length()>0){ 37 | arr=temp.split("\t"); 38 | node=new Node(); 39 | node.setId(Integer.parseInt(arr[0])); 40 | node.setType(arr[1]); 41 | if(type2Typeid.containsKey(arr[1])){ 42 | node.setTypeId(type2Typeid.get(arr[1])); 43 | } 44 | else{ 45 | 
type2Typeid.put(arr[1], type2Typeid.size()); 46 | typeid2Type.put(typeid2Type.size(), arr[1]); 47 | node.setTypeId(type2Typeid.get(arr[1])); 48 | } 49 | data.put(Integer.parseInt(arr[0]), node); 50 | } 51 | } 52 | } catch (Exception e2) { 53 | e2.printStackTrace(); 54 | } 55 | finally{ 56 | try { 57 | if(br!=null){ 58 | br.close(); 59 | br=null; 60 | } 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | } 65 | int start=0; 66 | int end=0; 67 | Node startNode=null; 68 | Node endNode=null; 69 | try { 70 | br = new BufferedReader(new InputStreamReader(new FileInputStream(edgesPath), "UTF-8")); 71 | String temp = null; 72 | while ((temp = br.readLine()) != null ) { 73 | temp=temp.trim(); 74 | if(temp.length()>0){ 75 | arr=temp.split("\t"); 76 | start=Integer.parseInt(arr[0]); 77 | end=Integer.parseInt(arr[1]); 78 | startNode=data.get(start); 79 | endNode=data.get(end); 80 | startNode.out_ids.add(end); 81 | startNode.out_nodes.add(endNode); 82 | endNode.in_ids.add(start); 83 | endNode.in_nodes.add(startNode); 84 | } 85 | } 86 | } catch (Exception e2) { 87 | e2.printStackTrace(); 88 | } 89 | finally{ 90 | try { 91 | if(br!=null){ 92 | br.close(); 93 | br=null; 94 | } 95 | } catch (IOException e) { 96 | e.printStackTrace(); 97 | } 98 | } 99 | FileWriter writer = null; 100 | try { 101 | writer = new FileWriter(typeAndTypeIdPath); 102 | for(String type:type2Typeid.keySet()){ 103 | writer.write(type+" "+type2Typeid.get(type)+"\r\n"); 104 | writer.flush(); 105 | } 106 | } catch (Exception e) { 107 | e.printStackTrace(); 108 | } 109 | finally{ 110 | try { 111 | if(writer!=null){ 112 | writer.close(); 113 | writer=null; 114 | } 115 | } catch (Exception e2) { 116 | e2.printStackTrace(); 117 | } 118 | } 119 | 120 | return data; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /code/symmetric/java - prepare data for model/javaParams.properties: -------------------------------------------------------------------------------- 1 | 2 | # Main dataset directory 3 | MAIN_DIR = /usr/lzmExperiment/path2vec/dataset/linkedin/ 4 | 5 | # Truncate sub-paths from samplings by this type. 6 | TRUNCATED_TYPE_NAME = user 7 | 8 | # The longest length for sampling to truncate sub-paths. 9 | LONGEST_ANALYSE_LENGTH_FOR_SAMPLING = 20 10 | 11 | # Longest length for sub-paths 12 | LONGEST_LENGTH_FOR_SUBPATHS = 5 13 | 14 | # The shortest length for each path in sampling results. 15 | SHORTEST_LENGTH_FOR_SAMPLING = 0 16 | 17 | # Sampling times for per node in random walk sampling. 18 | SAMPLING_TIMES_PER_NODE = 5 19 | 20 | # Sampling length for per node in random walk sampling. 21 | SAMPLING_LENGTH_PER_PATH = 5 22 | 23 | # When generate user features by neighbours' information, the value we set for type information when this node belongs to this kind of type. 24 | FEATURE_TYPE_VALUE = 1.0 25 | 26 | ######################################## 27 | # 不太需要改动的参数 28 | ######################################## 29 | # file name of nodes 30 | NODES_PATH = graph.node 31 | 32 | # file name of edges 33 | EDGES_PATH = graph.edge 34 | 35 | # file name of random walk sampling paths 36 | SAVE_PATH_FOR_RANDOMWALK_SAMPLINGS = randomWalkSamplingPaths 37 | 38 | # The file name which contains the map relation of type and typeid. 
39 | TYPE_TYPEID_SAVEFILE = typeAndTypeIDSavePath 40 | 41 | # file name of node features 42 | NODES_FEATURE_SAVE_PATH = nodesFeatures 43 | 44 | # file name of sub-paths save file 45 | SUBPATHS_SAVE_PATH = subpathsSaveFile -------------------------------------------------------------------------------- /code/symmetric/python - model/dataProcessTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | methods for processing data 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | 9 | # Set the random number generators' seeds for consistency 10 | SEED = 123 11 | numpy.random.seed(SEED) 12 | 13 | def getTrainingData(trainingDataFile): 14 | ''' 15 | read training data from file 16 | :type string 17 | :param trainingDataFile 18 | ''' 19 | data=[] 20 | pairs=[] 21 | with open(trainingDataFile) as f: 22 | for l in f: 23 | tmp=l.strip().split() 24 | if len(tmp)<=0: 25 | continue 26 | arr=[] 27 | arr.append(tmp[0]+'-'+tmp[1]) 28 | arr.append(tmp[1]+'-'+tmp[0]) 29 | arr.append(tmp[0]+'-'+tmp[2]) 30 | arr.append(tmp[2]+'-'+tmp[0]) 31 | pairs.append(arr) 32 | tmp=[int(x) for x in tmp] 33 | data.append(tmp) 34 | 35 | return data,pairs 36 | 37 | def getWordsEmbeddings(wordsEmbeddings_path): 38 | """ 39 | read words embeddings from file 40 | a b 41 | c d e f .... 42 | g h j k .... 43 | a means the num(line) of the data,b means the dimension of the data 44 | c and g are the index of the corresponding words 45 | d,e,f,h,j,k,... are the content of embeddings 46 | :type String 47 | :param wordsEmbeddings_path 48 | """ 49 | size=0 50 | dimension=0 51 | wemb=[] 52 | with open(wordsEmbeddings_path) as f: 53 | for l in f: 54 | arr=l.strip().split() 55 | if len(arr)==2: 56 | size=int(arr[0]) 57 | dimension=int(arr[1]) 58 | wemb=numpy.zeros((size,dimension)) # @UndefinedVariable 59 | continue 60 | id=int(arr[0]) 61 | for i in range(0,dimension): 62 | wemb[id][i]=float(arr[i+1]) 63 | return wemb,dimension,size 64 | 65 | def loadAllSubPaths(subpaths_file,maxlen=1000): 66 | """ 67 | read all subpaths from file 68 | :type subpaths_file: String 69 | :param subpaths_file:file path 70 | 71 | :type maxlen:int 72 | :param maxlen: 73 | 74 | the return value is a map, and the key of this map is made of startNodeId-endNodeId. 75 | the value of this map is a list made of startNodeId aId bId cId dId... 
endNodeId 76 | """ 77 | map={} 78 | with open(subpaths_file) as f: 79 | for l in f: 80 | splitByTab=l.strip().split('\t') 81 | key=splitByTab[0]+'-'+splitByTab[1] 82 | sentence=[int(y) for y in splitByTab[2].split()[:]] 83 | if len(sentence)>maxlen: 84 | continue 85 | if key in map: 86 | map[key].append(sentence) 87 | else: 88 | tmp=[] 89 | tmp.append(sentence) 90 | map[key]=tmp 91 | return map 92 | 93 | def prepareDataForTraining(trainingDataTriples,trainingDataPairs,subpaths_map): 94 | """ 95 | prepare data for training 96 | """ 97 | n_triples=len(trainingDataTriples) 98 | 99 | triples_matrix=numpy.zeros([n_triples,4,2]).astype('int64') 100 | 101 | maxlen=0 102 | n_subpaths=0 103 | allPairs=[] 104 | for list in trainingDataPairs: 105 | for l in list: 106 | allPairs.append(l) 107 | for key in allPairs: 108 | if key not in subpaths_map: 109 | continue; 110 | list=subpaths_map[key] 111 | n_subpaths+=len(list) 112 | for l in list: 113 | if len(l)>maxlen: 114 | maxlen=len(l) 115 | 116 | subPaths_matrix=numpy.zeros([maxlen,n_subpaths]).astype('int64') 117 | 118 | subPaths_mask=numpy.zeros([maxlen,n_subpaths]).astype(theano.config.floatX) # @UndefinedVariable 119 | 120 | subPaths_lens=numpy.zeros([n_subpaths,]).astype('int64') 121 | 122 | current_index=0 123 | path_index=0 124 | valid_triples_count=0 125 | for i in range(len(trainingDataPairs)): 126 | pairs=trainingDataPairs[i] 127 | 128 | valid_triples_count+=1 129 | for j in range(len(pairs)): 130 | pair=pairs[j] 131 | list=None 132 | if pair in subpaths_map: 133 | list=subpaths_map[pair] 134 | if list is not None: 135 | triples_matrix[i][j][0]=current_index 136 | current_index+=len(list) 137 | triples_matrix[i][j][1]=current_index 138 | for x in range(len(list)): 139 | index=path_index+x 140 | path=list[x] 141 | subPaths_lens[index]=len(path) 142 | for y in range(len(path)): 143 | subPaths_matrix[y][index]=path[y] 144 | subPaths_mask[y][index]=1. 
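# layout built here: triples_matrix[i][j] holds the half-open column range
# [start, end) of subPaths_matrix that belongs to pair j of triple i;
# subPaths_matrix packs one sub-path per column (zero-padded to maxlen),
# subPaths_mask marks the valid positions with 1. and subPaths_lens keeps
# each sub-path's true length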
145 | path_index+=len(list) 146 | else : 147 | triples_matrix[i][j][0]=current_index 148 | current_index+=0 149 | triples_matrix[i][j][1]=current_index 150 | 151 | count=0 152 | for i in range(len(triples_matrix)): 153 | if triples_matrix[i][0][0]!=triples_matrix[i][1][1] and triples_matrix[i][2][0]!=triples_matrix[i][3][1]: 154 | count+=1 155 | triples_matrix_new=numpy.zeros([count,4,2]).astype('int64') 156 | index=0 157 | for i in range(len(triples_matrix)): 158 | if triples_matrix[i][0][0]!=triples_matrix[i][1][1] and triples_matrix[i][2][0]!=triples_matrix[i][3][1]: 159 | triples_matrix_new[index]=triples_matrix[i] 160 | index+=1 161 | triples_matrix=triples_matrix_new 162 | 163 | return triples_matrix, subPaths_matrix, subPaths_mask, subPaths_lens 164 | 165 | 166 | def prepareDataForTest(query,candidate,subpaths_map): 167 | """ 168 | prepare data for test 169 | """ 170 | key1=bytes(query)+'-'+bytes(candidate) 171 | key2=bytes(candidate)+'-'+bytes(query) 172 | if key1 not in subpaths_map and key2 not in subpaths_map: 173 | return None,None,None 174 | subpaths=[] 175 | if key1 in subpaths_map: 176 | subpaths.extend(subpaths_map[key1]) 177 | if key2 in subpaths_map: 178 | subpaths.extend(subpaths_map[key2]) 179 | maxlen=0 180 | for subpath in subpaths: 181 | if len(subpath)>maxlen: 182 | maxlen=len(subpath) 183 | subPaths_matrix=numpy.zeros([maxlen,len(subpaths)]).astype('int64') 184 | subPaths_mask=numpy.zeros([maxlen,len(subpaths)]).astype(theano.config.floatX) # @UndefinedVariable 185 | subPaths_lens=numpy.zeros([len(subpaths),]).astype('int64') 186 | for i in range(len(subpaths)): 187 | subpath=subpaths[i] 188 | subPaths_lens[i]=len(subpath) 189 | for j in range(len(subpath)): 190 | subPaths_matrix[j][i]=subpath[j] 191 | subPaths_mask[j][i]=1. 192 | 193 | return subPaths_matrix,subPaths_mask,subPaths_lens 194 | 195 | 196 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 197 | """ 198 | Used to shuffle the dataset at each iteration. 
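Returns a list of (minibatch_index, array of example indices) tuples.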
199 | """ 200 | idx_list = numpy.arange(n, dtype="int32") 201 | 202 | if shuffle: 203 | numpy.random.shuffle(idx_list) 204 | 205 | minibatches = [] 206 | minibatch_start = 0 207 | for i in range(n // minibatch_size): 208 | minibatches.append(idx_list[minibatch_start: 209 | minibatch_start + minibatch_size]) 210 | minibatch_start += minibatch_size 211 | 212 | if (minibatch_start != n): 213 | # Make a minibatch out of what is left 214 | minibatches.append(idx_list[minibatch_start:]) 215 | 216 | return zip(range(len(minibatches)), minibatches)# 将(index,minibatch)设置成tuple(所有的tuple组成一个list),然后返回这个list 217 | 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /code/symmetric/python - model/evaluateTools.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | evaluation tools 4 | ''' 5 | 6 | import numpy 7 | 8 | def get_AP(k,ideal,test): 9 | """ 10 | compute AP 11 | """ 12 | ideal=set(ideal) 13 | accumulation=0.0 14 | count=0 15 | for i in range(len(test)): 16 | if i>=k: 17 | break 18 | if test[i] in ideal: 19 | count+=1 20 | accumulation+=count/(i+1.0) 21 | m=len(ideal) 22 | n=k 23 | x=0 24 | if m>n: 25 | x=n 26 | else: 27 | x=m 28 | if x==0: 29 | return 0 30 | return accumulation/x 31 | 32 | 33 | def get_MAP(k,ideal_map,test_map): 34 | """ 35 | compute MAP 36 | """ 37 | accumulation=0.0 38 | for key in ideal_map.keys(): 39 | accumulation+=get_AP(k, ideal_map[key], test_map[key]) 40 | if len(ideal_map)==0: 41 | return 0 42 | return accumulation/len(ideal_map) 43 | 44 | 45 | def get_nDCG(k,ideal,test): 46 | """ 47 | compute NDCG 48 | """ 49 | ideal=set(ideal) 50 | accumulation=0.0 51 | for i in range(len(test)): 52 | if i>=k: 53 | break 54 | if test[i] in ideal: 55 | if i==0: 56 | accumulation+=1.0 57 | else: 58 | accumulation+=1.0/numpy.log2(i+1) 59 | normalization=0.0 60 | for i in range(len(ideal)): 61 | if i>=k: 62 | break 63 | if i==0: 64 | normalization+=1.0 65 | else: 66 | normalization+=1.0/numpy.log2(i+1) 67 | if normalization==0: 68 | return 0 69 | return accumulation/normalization 70 | 71 | def get_MnDCG(k,ideal_map,test_map): 72 | """ 73 | compute mean NDCG 74 | """ 75 | accumulation=0.0 76 | for key in ideal_map.keys(): 77 | accumulation+=get_nDCG(k, ideal_map[key], test_map[key]) 78 | if len(ideal_map)==0: 79 | return 0 80 | return accumulation/len(ideal_map) 81 | 82 | 83 | if __name__=='__main__': 84 | ideal=['a'] 85 | test=['b','a'] 86 | k=10 87 | print get_nDCG(k, ideal, test) 88 | # ideal={'q':['a','b','c'],'p':['a','b','c','d','e']} 89 | # test={'q':['b','a','m','c','d','n'],'p':['b','a','m','c','d','n']} 90 | # k=4 91 | # print get_MnDCG(k, ideal, test) -------------------------------------------------------------------------------- /code/symmetric/python - model/experimentForOneFileByParams.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | ''' 3 | Training one dataset and then test NDCG and MAP. 4 | ''' 5 | 6 | import numpy 7 | import theano 8 | from theano import tensor 9 | 10 | import proxEmbed 11 | import proxEmbedProcessAndAssess 12 | import os 13 | 14 | import ConfigParser 15 | import string, os, sys 16 | 17 | 18 | if __name__=='__main__': 19 | 20 | cf = ConfigParser.SafeConfigParser() 21 | # read the parameters file. 
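# the five values read below (root_dir, dataset_name, suffix, class_name, index)
# are combined further down into the training/test/ideal file paths, following
# the pattern <root_dir>/<dataset_name>.splits/train.<suffix>/train_<class_name>_<index>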
22 | # cf.read("/usr/lzmExperiment/proxEmbed/paramsSet/pythonParamsConfig") 23 | cf.read("pythonParamsConfig") 24 | 25 | main_dir=cf.get("param", "root_dir") # main work dir 26 | dataset_name=cf.get("param", "dataset_name") # the name of one dataset 27 | suffix=cf.get("param", "suffix") # the suffix of dataset, such as 10,100,1000 28 | class_name=cf.get("param", "class_name") # the relation name of data 29 | index=cf.get("param", "index") # the index of the dataset file 30 | 31 | trainingDataFile=os.path.join(main_dir+'/',dataset_name+'.splits','train.'+suffix,'train_'+class_name+'_'+index) # the full path of training data file. This path will be generated by main_dir, dataset_name, suffix, class_name and index. 32 | wordsEmbeddings=None # words embeddings 33 | wordsEmbeddings_path=cf.get("param", "wordsEmbeddings_path") # the file path of words embeddings 34 | word_dimension=cf.getint("param", "word_dimension") # dimension of words embeddings 35 | dimension=cf.getint("param", "dimension") # the dimension of paths embeddings 36 | wordsSize=cf.getint("param", "wordsSize") # the size of words vocabulary 37 | subpaths_map=None # contains sub-paths 38 | subpaths_file=cf.get("param", "subpaths_file") # the file which contains sub-paths 39 | maxlen_subpaths=cf.getint("param", "maxlen_subpaths") # the max length for sub-paths 40 | h_output_method=cf.get("param", "h_output_method") # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 41 | maxlen=cf.getint("param", "maxlen") # Sequence longer than this get ignored 42 | batch_size=cf.getint("param", "batch_size") # use a batch for training. This is the size of this batch. 43 | is_shuffle_for_batch=cf.getboolean("param", "is_shuffle_for_batch") # if need shuffle for training 44 | discount_alpha=cf.getfloat("param", "discount_alpha") # the parameter alpha for discount. The longer the subpath, the little will the weight be. 45 | subpaths_pooling_method=cf.get("param", "subpaths_pooling_method") # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. 46 | objective_function_method=cf.get("param", "objective_function_method") # loss function, we use sigmoid 47 | objective_function_param=cf.getfloat("param", "objective_function_param") # the parameter in loss function, beta 48 | lrate=cf.getfloat("param", "lrate") # learning rate 49 | max_epochs=cf.getint("param", "max_epochs") # the max epochs for training 50 | 51 | dispFreq=cf.getint("param", "dispFreq") # the frequences for display 52 | saveFreq=cf.getint("param", "saveFreq") # the frequences for saving the parameters 53 | saveto=os.path.join(main_dir+'/',dataset_name+'.trainModels','train.'+suffix,'train_'+class_name+'_'+index+'.npz') # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 
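# note: numpy.savez (used when saving the trained parameters) does not create
# missing directories, so <dataset_name>.trainModels/train.<suffix>/ has to
# exist before training starts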
54 | 55 | # the normalization of this model, l2-norm of all parameters 56 | decay_lstm_W=cf.getfloat("param", "decay_lstm_W") 57 | decay_lstm_U=cf.getfloat("param", "decay_lstm_U") 58 | decay_lstm_b=cf.getfloat("param", "decay_lstm_b") 59 | decay_w=cf.getfloat("param", "decay_w") 60 | 61 | test_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','test','test_'+class_name+'_'+index) # the file of test data 62 | top_num=cf.getint("param", "top_num") # the top num to predict 63 | ideal_data_file=os.path.join(main_dir+'/',dataset_name+'.splits','ideal','ideal_'+class_name+'_'+index) # the file of ground truth 64 | 65 | # training 66 | proxEmbed.proxEmbedTraining( 67 | trainingDataFile, 68 | wordsEmbeddings, 69 | wordsEmbeddings_path, 70 | word_dimension, 71 | dimension, 72 | wordsSize, 73 | subpaths_map, 74 | subpaths_file, 75 | maxlen_subpaths, 76 | h_output_method, 77 | maxlen, 78 | batch_size, 79 | is_shuffle_for_batch, 80 | discount_alpha, 81 | subpaths_pooling_method, 82 | objective_function_method, 83 | objective_function_param, 84 | lrate, 85 | max_epochs, 86 | 87 | dispFreq, 88 | saveFreq, 89 | saveto, 90 | 91 | decay_lstm_W, 92 | decay_lstm_U, 93 | decay_lstm_b, 94 | decay_w, 95 | ) 96 | 97 | # load the function which is trained beforehand 98 | computeFunc=proxEmbedProcessAndAssess.get_proxEmbedModel( 99 | saveto, 100 | word_dimension, 101 | dimension, 102 | h_output_method, 103 | discount_alpha, 104 | subpaths_pooling_method, 105 | ) 106 | # test the model 107 | MAP,MnDCG=proxEmbedProcessAndAssess.compute_proxEmbed( 108 | wordsEmbeddings, 109 | wordsEmbeddings_path, 110 | word_dimension, 111 | dimension, 112 | wordsSize, 113 | subpaths_map, 114 | subpaths_file, 115 | maxlen_subpaths, 116 | maxlen, 117 | 118 | test_data_file, 119 | top_num, 120 | ideal_data_file, 121 | func=computeFunc, 122 | ) 123 | 124 | print 'MAP==',MAP 125 | print 'MnDCG==',MnDCG -------------------------------------------------------------------------------- /code/symmetric/python - model/lstmModel.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | from __future__ import print_function 4 | import six.moves.cPickle as pickle # @UnresolvedImport 5 | 6 | from collections import OrderedDict 7 | import sys 8 | import time 9 | 10 | import numpy 11 | import theano 12 | from theano import config 13 | import theano.tensor as tensor 14 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 15 | import toolsFunction 16 | 17 | def numpy_floatX(data): 18 | return numpy.asarray(data, dtype=config.floatX) # @UndefinedVariable 19 | 20 | 21 | def _p(pp, name): 22 | return '%s_%s' % (pp, name) 23 | 24 | 25 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): 26 | """ 27 | generate lstm 28 | """ 29 | nsteps = state_below.shape[0] 30 | if state_below.ndim == 3: 31 | n_samples = state_below.shape[1] 32 | else: 33 | n_samples = 1 34 | 35 | assert mask is not None 36 | 37 | def _slice(_x, n, dim): 38 | if _x.ndim == 3: 39 | return _x[:, :, n * dim:(n + 1) * dim] 40 | return _x[:, n * dim:(n + 1) * dim] 41 | 42 | def _step(m_, x_, h_, c_): 43 | preact = tensor.dot(h_, tparams['lstm_U']) 44 | preact += x_ 45 | 46 | i = tensor.nnet.sigmoid(_slice(preact, 0, options['dimension'])) # input gate 47 | f = tensor.nnet.sigmoid(_slice(preact, 1, options['dimension'])) # forget gate 48 | o = tensor.nnet.sigmoid(_slice(preact, 2, options['dimension'])) # output gate 49 | c = tensor.tanh(_slice(preact, 3, options['dimension'])) 50 | 51 | c = 
f * c_ + i * c 52 | c = m_[:, None] * c + (1. - m_)[:, None] * c_ 53 | 54 | h = o * tensor.tanh(c) 55 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 56 | 57 | return h, c 58 | state_below = (tensor.dot(state_below, tparams['lstm_W']) + tparams['lstm_b']) 59 | 60 | dim_proj = options['dimension'] 61 | rval, updates = theano.scan(_step, 62 | sequences=[mask, state_below], 63 | outputs_info=[tensor.alloc(numpy_floatX(0.), 64 | n_samples, 65 | dim_proj), 66 | tensor.alloc(numpy_floatX(0.), 67 | n_samples, 68 | dim_proj)], 69 | name=_p(prefix, '_layers'), 70 | n_steps=nsteps) 71 | return rval[0] 72 | 73 | 74 | 75 | def build_model(tparams, options, x, mask, wordsemb): 76 | """ 77 | build the model 78 | """ 79 | n_timesteps = x.shape[0] 80 | n_samples = x.shape[1] 81 | emb = wordsemb[x.flatten()].reshape([n_timesteps, 82 | n_samples, 83 | options['word_dimension']]) 84 | proj = lstm_layer(tparams, emb, options, 85 | prefix='lstm', 86 | mask=mask) 87 | output=None 88 | if options['h_output_method'] == 'h': # the last h as the output 89 | temp=proj[-1] 90 | output=temp[0] 91 | elif options['h_output_method'] == 'mean-pooling': # mean-pooling as the output 92 | temp1 = (proj * mask[:, :, None]).sum(axis=0) 93 | temp2 = temp1 / mask.sum(axis=0)[:, None] 94 | output=temp2[0] 95 | elif options['h_output_method'] == 'max-pooling': # max-pooling as the output 96 | temp1=proj * mask[:, :, None] 97 | temp2=temp1.sum(axis=1) 98 | output = temp2.max(axis=0) 99 | else : # default, the last h as the output 100 | temp=proj[-1] 101 | output=temp[0] 102 | return output 103 | 104 | 105 | # get lstm model by parameters 106 | def get_lstm( 107 | model_options, # the options parameters for the model 108 | tparams, # theano shared variables 109 | x, # a sub-path 110 | x_mask, # the mask of this sub-path 111 | wordsemb, # embeddings of all words 112 | ): 113 | 114 | # build the model 115 | proj = build_model(tparams, model_options, x, x_mask, wordsemb) 116 | return proj 117 | -------------------------------------------------------------------------------- /code/symmetric/python - model/proxEmbed.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | import dataProcessTools 4 | import numpy 5 | import theano 6 | from theano import tensor 7 | from theano import config 8 | from collections import OrderedDict 9 | import time 10 | import six.moves.cPickle as pickle # @UnresolvedImport 11 | import proxEmbedModelMulti 12 | 13 | 14 | # Set the random number generators' seeds for consistency 15 | SEED = 123 16 | numpy.random.seed(SEED) 17 | 18 | 19 | def numpy_floatX(data): 20 | return numpy.asarray(data, dtype=config.floatX) # @UndefinedVariable 21 | 22 | def adadelta(lr, tparams, grads, fourPairs, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost): 23 | """ 24 | An adaptive learning rate optimizer adadelta 25 | 26 | Parameters 27 | ---------- 28 | lr : Theano SharedVariable 29 | Initial learning rate 30 | tpramas: Theano SharedVariable 31 | Model parameters 32 | grads: Theano variable 33 | Gradients of cost w.r.t to parameres 34 | x: Theano variable 35 | Model inputs 36 | mask: Theano variable 37 | Sequence mask 38 | y: Theano variable 39 | Targets 40 | cost: Theano variable 41 | Objective fucntion to minimize 42 | 43 | Notes 44 | ----- 45 | For more information, see [ADADELTA]_. 46 | 47 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning 48 | Rate Method*, arXiv:1212.5701. 
49 | """ 50 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), 51 | name='%s_grad' % k) 52 | for k, p in tparams.items()] 53 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), 54 | name='%s_rup2' % k) 55 | for k, p in tparams.items()] 56 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), 57 | name='%s_rgrad2' % k) 58 | for k, p in tparams.items()] 59 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 60 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 61 | for rg2, g in zip(running_grads2, grads)] 62 | f_grad_shared = theano.function([fourPairs, subPaths_matrix, subPaths_mask, subPaths_lens, wemb], cost, updates=zgup + rg2up, 63 | on_unused_input='ignore', 64 | name='adadelta_f_grad_shared') 65 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 66 | for zg, ru2, rg2 in zip(zipped_grads, 67 | running_up2, 68 | running_grads2)] 69 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 70 | for ru2, ud in zip(running_up2, updir)] 71 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 72 | f_update = theano.function([lr], [], updates=ru2up + param_up, 73 | on_unused_input='ignore', 74 | name='adadelta_f_update') 75 | 76 | return f_grad_shared, f_update 77 | 78 | 79 | def ortho_weight(ndim): 80 | """ 81 | initialize a matrix 82 | """ 83 | W = numpy.random.randn(ndim, ndim) 84 | u, s, v = numpy.linalg.svd(W) 85 | return u.astype(config.floatX) # @UndefinedVariable 86 | 87 | def init_params_weight(row,column): 88 | """ 89 | initialize matrix parameters by row and column 90 | """ 91 | lstm_W = numpy.random.rand(row, column) 92 | return lstm_W.astype(config.floatX) # @UndefinedVariable 93 | 94 | 95 | def init_sharedVariables(options): 96 | """ 97 | initialize all the shared parameters 98 | """ 99 | print 'init shared Variables......' 
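# shapes of the shared parameters created below:
#   lstm_W : (word_dimension, 4*dimension)  input-to-hidden weights, four gates stacked
#   lstm_U : (dimension, 4*dimension)       hidden-to-hidden weights (orthogonal blocks)
#   lstm_b : (4*dimension,)                 gate biases, initialised to zero
#   w      : (dimension,)                   scoring vector applied to the pooled embedding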
100 | params = OrderedDict() 101 | lstm_W=numpy.concatenate([ 102 | init_params_weight(options['word_dimension'],options['dimension']), 103 | init_params_weight(options['word_dimension'],options['dimension']), 104 | init_params_weight(options['word_dimension'],options['dimension']), 105 | init_params_weight(options['word_dimension'],options['dimension']) 106 | ],axis=1) 107 | params['lstm_W'] = lstm_W 108 | lstm_U = numpy.concatenate([ortho_weight(options['dimension']), 109 | ortho_weight(options['dimension']), 110 | ortho_weight(options['dimension']), 111 | ortho_weight(options['dimension'])], axis=1) 112 | params['lstm_U'] = lstm_U 113 | lstm_b = numpy.zeros((4 * options['dimension'],)) 114 | params['lstm_b'] = lstm_b.astype(config.floatX) # @UndefinedVariable 115 | w = numpy.random.rand(options['dimension'], ) 116 | params['w']=w.astype(config.floatX) # @UndefinedVariable 117 | 118 | return params 119 | 120 | 121 | def init_tparams(params): 122 | tparams = OrderedDict() 123 | for kk, pp in params.items(): 124 | tparams[kk] = theano.shared(params[kk], name=kk) 125 | return tparams 126 | 127 | def unzip(zipped): 128 | new_params = OrderedDict() 129 | for kk, vv in zipped.items(): 130 | new_params[kk] = vv.get_value() 131 | return new_params 132 | 133 | main_dir='D:/dataset/test/icde2016_metagraph/' 134 | def proxEmbedTraining( 135 | trainingDataFile=main_dir+'facebook.splits/train.10/train_classmate_1', # the full path of training data file 136 | wordsEmbeddings=None, # words embeddings 137 | wordsEmbeddings_path=main_dir+'facebook/nodesFeatures', # the file path of words embeddings 138 | word_dimension=22, # dimension of words embeddings 139 | dimension=64, # the dimension of paths embeddings 140 | wordsSize=1000000, # the size of words vocabulary 141 | subpaths_map=None, # contains sub-paths 142 | subpaths_file=main_dir+'facebook/subpathsSaveFile',# the file which contains sub-paths 143 | maxlen_subpaths=1000, # the max length for sub-paths 144 | h_output_method='mean-pooling', # the output way of lstm. There are three ways, "h" only uses the last output h as the output of lstm for one path; "mean-pooling" uses the mean-pooling of all hi as the output of lstm for one path; "max-pooling" uses the max-pooling of all hi as the output of lstm for one path. 145 | maxlen=100, # Sequence longer then this get ignored 146 | batch_size=1, # use a batch for training. This is the size of this batch. 147 | is_shuffle_for_batch=False, # if need shuffle for training 148 | discount_alpha=0.1, # the parameter alpha for discount. The longer the subpath, the little will the weight be. 149 | subpaths_pooling_method='max-pooling', # the ways to combine several subpaths to one. "mean-pooling" means to combine all subpaths to one by mean-pooling; "max-pooling" means to combine all subpaths to one by max-pooling. 150 | objective_function_method='hinge-loss', # loss function, we use sigmoid 151 | objective_function_param=0, # the parameter in loss function, beta 152 | lrate=0.0001, # learning rate 153 | max_epochs=10, # the max epochs for training 154 | 155 | dispFreq=5, # the frequences for display 156 | saveFreq=5, # the frequences for saving the parameters 157 | saveto=main_dir+'facebook/proxEmbed-modelParams.npz', # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index. 
158 | 159 | # the normalization of this model, l2-norm of all parameters 160 | decay_lstm_W=0.01, 161 | decay_lstm_U=0.01, 162 | decay_lstm_b=0.01, 163 | decay_w=0.01, 164 | 165 | ): 166 | """ 167 | The training stage of ProxEmbed 168 | """ 169 | model_options = locals().copy() 170 | 171 | if wordsEmbeddings is None: 172 | if wordsEmbeddings_path is not None: 173 | wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path) 174 | else: 175 | print 'There is not path for wordsEmbeddings, exit!!!' 176 | exit(0) 177 | 178 | if subpaths_map is None: 179 | if subpaths_file is not None: 180 | subpaths_map=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths) 181 | else: 182 | print 'There is not path for sub-paths, exit!!!' 183 | exit(0) 184 | 185 | trainingData,trainingPairs=dataProcessTools.getTrainingData(trainingDataFile) 186 | allBatches=dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch) 187 | 188 | params=init_sharedVariables(model_options) 189 | tparams=init_tparams(params) 190 | print 'Generate models ......' 191 | 192 | trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost=proxEmbedModelMulti.proxEmbedModel(model_options, tparams) 193 | 194 | print 'Generate gradients ......' 195 | grads=tensor.grad(cost,wrt=list(tparams.values())) 196 | print 'Using Adadelta to generate functions ......' 197 | lr = tensor.scalar(name='lr') 198 | f_grad_shared, f_update=adadelta(lr, tparams, grads, trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost) 199 | 200 | print 'Start training models ......' 201 | best_p = None 202 | history_cost=[] 203 | 204 | models_count=[0,0,0,0] 205 | 206 | start_time = time.time() 207 | print 'start time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(start_time)) 208 | uidx=0 209 | for eidx in range(max_epochs): 210 | for _, batch in allBatches: 211 | uidx += 1 212 | trainingDataForBatch=[trainingData[i] for i in batch] 213 | trainingPairsForBatch=[trainingPairs[i] for i in batch] 214 | triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data=dataProcessTools.prepareDataForTraining(trainingDataForBatch, trainingPairsForBatch, subpaths_map) 215 | cost=0 216 | cost=f_grad_shared(triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data,wordsEmbeddings) 217 | f_update(lrate) 218 | 219 | if numpy.isnan(cost) or numpy.isinf(cost): 220 | print('bad cost detected: ', cost) 221 | return 222 | if numpy.mod(uidx, dispFreq) == 0: 223 | print 'Epoch =', eidx, ', Update =', uidx, ', Cost =', cost 224 | print 'models_count ==',models_count 225 | if saveto and numpy.mod(uidx, saveFreq) == 0: 226 | print('Saving...') 227 | if best_p is not None: 228 | params = best_p 229 | else: 230 | params = unzip(tparams) 231 | numpy.savez(saveto, history_errs=history_cost, **params) 232 | pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) 233 | print('Done') 234 | end_time = time.time() 235 | print 'end time ==',time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(end_time)) 236 | print 'Training finished! Cost time == ', end_time-start_time,' s' 237 | 238 | 239 | if __name__=='__main__': 240 | print 'Start running proxEmbedTraining......' 
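# Calling proxEmbedTraining() with no arguments falls back to the hard-coded
# defaults in the signature above (a local facebook dataset path and
# objective_function_method='hinge-loss'); proxEmbedModelMulti only produces a
# non-zero loss for the 'sigmoid' objective, so in practice the parameters are
# passed in by experimentForOneFileByParams.py, which reads them from
# pythonParamsConfig.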
241 | proxEmbedTraining()
--------------------------------------------------------------------------------
/code/symmetric/python - model/proxEmbedModelMulti.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | '''
3 | Generate the ProxEmbed training model
4 | '''
5 | import numpy
6 | import theano
7 | from theano import tensor
8 | import lstmModel
9 | from theano.ifelse import ifelse
10 | 
11 | 
12 | def proxEmbedModel(model_options,tparams):
13 | """
14 | build the symbolic ProxEmbed training graph
15 | """
16 | trainingParis=tensor.tensor3('trainingParis',dtype='int64')
17 | subPaths_matrix=tensor.matrix('subPaths_matrix',dtype='int64')
18 | subPaths_mask=tensor.matrix('subPaths_mask',dtype=theano.config.floatX) # @UndefinedVariable
19 | subPaths_lens=tensor.vector('subPaths_lens',dtype='int64')
20 | wordsEmbeddings=tensor.matrix('wordsEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable
21 | 
22 | def _processTriple(fourPairs,lossSum):
23 | 
24 | def _processSubpath(index):
25 | length=subPaths_lens[index]
26 | x=subPaths_matrix[:length,index:index+1]
27 | x_mask=subPaths_mask[:length,index:index+1]
28 | emb=lstmModel.get_lstm(model_options, tparams, x, x_mask, wordsEmbeddings)
29 | emb=emb*discountModel(model_options['discount_alpha'], length)
30 | return emb
31 | 
32 | def iftFunc():
33 | embx=numpy.zeros(model_options['dimension'],)
34 | embx=embx.astype(theano.config.floatX) # @UndefinedVariable
35 | return embx
36 | 
37 | def iffFunc(start,end):
38 | embx=None
39 | rval,update=theano.scan(
40 | _processSubpath,
41 | sequences=tensor.arange(start,end),
42 | )
43 | if model_options['subpaths_pooling_method']=='mean-pooling': # mean-pooling
44 | embx = rval.sum(axis=0)
45 | embx = embx / rval.shape[0]
46 | elif model_options['subpaths_pooling_method']=='max-pooling': # max-pooling
47 | embx = rval.max(axis=0)
48 | else: # default, mean-pooling
49 | embx = rval.sum(axis=0)
50 | embx = embx / rval.shape[0]
51 | 
52 | return embx
53 | 
54 | start=fourPairs[0][0]
55 | end=fourPairs[1][1]
56 | emb1=None
57 | emb1=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end))
58 | 
59 | start=fourPairs[2][0]
60 | end=fourPairs[3][1]
61 | emb2=None
62 | emb2=ifelse(tensor.eq(start,end),iftFunc(),iffFunc(start,end))
63 | 
64 | loss=0
65 | param=model_options['objective_function_param']
66 | if model_options['objective_function_method']=='sigmoid': # use sigmoid
67 | loss=-tensor.log(tensor.nnet.sigmoid(param*(tensor.dot(emb1,tparams['w'])-tensor.dot(emb2,tparams['w'])))) # sigmoid
68 | 
69 | return loss+lossSum
70 | 
71 | rval,update=theano.scan(
72 | _processTriple,
73 | sequences=trainingParis,
74 | outputs_info=tensor.constant(0., dtype=theano.config.floatX), # @UndefinedVariable
75 | )
76 | cost=rval[-1]
77 | cost+=model_options['decay_lstm_W']*(tparams['lstm_W'] ** 2).sum()
78 | cost+=model_options['decay_lstm_U']*(tparams['lstm_U'] ** 2).sum()
79 | cost+=model_options['decay_lstm_b']*(tparams['lstm_b'] ** 2).sum()
80 | cost+=model_options['decay_w']*(tparams['w'] ** 2).sum()
81 | return trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens , wordsEmbeddings, cost
82 | 
83 | 
84 | def discountModel(alpha,length):
85 | """
86 | discount model: weight = exp(-alpha * length)
87 | """
88 | return tensor.exp(alpha*length*(-1))
89 | 
90 | def numpy_floatX(data):
91 | return numpy.asarray(data, dtype=theano.config.floatX) # @UndefinedVariable
--------------------------------------------------------------------------------
/code/symmetric/python - model/proxEmbedProcessAndAssess.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | '''
3 | Apply a trained ProxEmbed model to a dataset and then evaluate it
4 | '''
5 | 
6 | import numpy
7 | import theano
8 | from theano import tensor
9 | from collections import OrderedDict
10 | import proxEmbedProcessModel
11 | import dataProcessTools
12 | import toolsFunction
13 | import evaluateTools
14 | 
15 | 
16 | def load_params(path, params):
17 | """
18 | load model params from file
19 | """
20 | pp = numpy.load(path)
21 | for kk, vv in params.items():
22 | if kk not in pp:
23 | raise Warning('%s is not in the archive' % kk)
24 | params[kk] = pp[kk]
25 | 
26 | return params
27 | 
28 | 
29 | def get_proxEmbedModel(
30 | 
31 | model_params_path='', # the path of the model parameters
32 | word_dimension=0, # the dimension of words embeddings
33 | dimension=0, # the dimension of path embeddings
34 | h_output_method='h', # the output method of the LSTM
35 | discount_alpha=0.1, # discount alpha
36 | subpaths_pooling_method='max-pooling', # how to combine sub-paths
37 | ):
38 | """
39 | get model from file
40 | """
41 | model_options = locals().copy()
42 | 
43 | tparams = OrderedDict()
44 | tparams['lstm_W']=None
45 | tparams['lstm_U']=None
46 | tparams['lstm_b']=None
47 | tparams['w']=None
48 | tparams=load_params(model_params_path, tparams)
49 | 
50 | subPaths_matrix,subPaths_mask,subPaths_lens,wemb,score=proxEmbedProcessModel.proxEmbedModel(model_options, tparams)
51 | func=theano.function([subPaths_matrix,subPaths_mask,subPaths_lens,wemb], score)
52 | 
53 | return func
54 | 
55 | 
56 | def compute_proxEmbed(
57 | wordsEmbeddings=None, # words embeddings
58 | wordsEmbeddings_path=None, # the file path of words embeddings
59 | word_dimension=0, # dimension of words embeddings
60 | dimension=0, # the dimension of path embeddings
61 | wordsSize=0, # the size of the words vocabulary
62 | subpaths_map=None, # contains sub-paths
63 | subpaths_file=None,# the file which contains sub-paths
64 | maxlen_subpaths=1000, # the max length for sub-paths
65 | maxlen=100, # sequences longer than this are ignored
66 | 
67 | test_data_file='', # the file path of the test data
68 | top_num=10, # the number of top candidates to predict
69 | ideal_data_file='', # ground truth
70 | func=None, # model function
71 | ):
72 | """
73 | compute the result of the model
74 | """
75 | 
76 | model_options = locals().copy()
77 | 
78 | if wordsEmbeddings is None:
79 | if wordsEmbeddings_path is not None:
80 | wordsEmbeddings,dimension,wordsSize=dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)
81 | else:
82 | print 'No path given for wordsEmbeddings, exiting!'
83 | exit(0)
84 | 
85 | if subpaths_map is None:
86 | if subpaths_file is not None:
87 | subpaths_map=dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths)
88 | else:
89 | print 'No path given for sub-paths, exiting!'
90 | exit(0)
91 | 
92 | line_count=0
93 | test_map={}
94 | print 'Computing MAP and nDCG for file ',test_data_file
95 | with open(test_data_file) as f:
96 | for l in f:
97 | arr=l.strip().split()
98 | query=int(arr[0])
99 | map={}
100 | for i in range(1,len(arr)):
101 | candidate=int(arr[i])
102 | subPaths_matrix_data,subPaths_mask_data,subPaths_lens_data=dataProcessTools.prepareDataForTest(query, candidate, subpaths_map)
103 | if subPaths_matrix_data is None and subPaths_mask_data is None and subPaths_lens_data is None:
104 | map[candidate]=-1000.
105 | else:
106 | value=func(subPaths_matrix_data,subPaths_mask_data,subPaths_lens_data,wordsEmbeddings)
107 | map[candidate]=value
108 | 
109 | tops_in_line=toolsFunction.mapSortByValueDESC(map, top_num)
110 | test_map[line_count]=tops_in_line
111 | line_count+=1
112 | 
113 | line_count=0
114 | ideal_map={}
115 | with open(ideal_data_file) as f:
116 | for l in f:
117 | arr=l.strip().split()
118 | arr=[int(x) for x in arr]
119 | ideal_map[line_count]=arr[1:]
120 | line_count+=1
121 | 
122 | MAP=evaluateTools.get_MAP(top_num, ideal_map, test_map)
123 | MnDCG=evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
124 | 
125 | return MAP,MnDCG
126 | 
--------------------------------------------------------------------------------
/code/symmetric/python - model/proxEmbedProcessModel.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | '''
3 | ProxEmbed model for scoring a dataset (inference)
4 | '''
5 | 
6 | import numpy
7 | import theano
8 | from theano import tensor
9 | import lstmModel
10 | 
11 | 
12 | def proxEmbedModel(model_options,tparams):
13 | """
14 | build the ProxEmbed scoring model
15 | """
16 | subPaths_matrix=tensor.matrix('subPaths_matrix',dtype='int64')
17 | subPaths_mask=tensor.matrix('subPaths_mask',dtype=theano.config.floatX) # @UndefinedVariable
18 | subPaths_lens=tensor.vector('subPaths_lens',dtype='int64')
19 | wordsEmbeddings=tensor.matrix('wordsEmbeddings',dtype=theano.config.floatX) # @UndefinedVariable
20 | 
21 | def _processSubpath(index):
22 | length=subPaths_lens[index]
23 | x=subPaths_matrix[:length,index:index+1]
24 | x_mask=subPaths_mask[:length,index:index+1]
25 | emb=lstmModel.get_lstm(model_options, tparams, x, x_mask, wordsEmbeddings)
26 | emb=emb*discountModel(model_options['discount_alpha'], length)
27 | return emb
28 | 
29 | rval,update=theano.scan(
30 | _processSubpath,
31 | sequences=tensor.arange(subPaths_lens.shape[0]),
32 | )
33 | emb=0
34 | if model_options['subpaths_pooling_method']=='mean-pooling': # mean-pooling
35 | emb = rval.sum(axis=0)
36 | emb = emb / rval.shape[0]
37 | elif model_options['subpaths_pooling_method']=='max-pooling': # max-pooling
38 | emb = rval.max(axis=0)
39 | else: # default, mean-pooling
40 | emb = rval.sum(axis=0)
41 | emb = emb / rval.shape[0]
42 | 
43 | score=tensor.dot(emb,tparams['w'])
44 | 
45 | return subPaths_matrix,subPaths_mask,subPaths_lens,wordsEmbeddings,score
46 | 
47 | 
48 | def discountModel(alpha,length):
49 | """
50 | discount model: weight = exp(-alpha * length)
51 | """
52 | return tensor.exp(alpha*length*(-1))
53 | 
--------------------------------------------------------------------------------
/code/symmetric/python - model/pythonParamsConfig:
--------------------------------------------------------------------------------
1 | [param]
2 | 
3 | ############################################
4 | # training data directory
5 | ############################################
6 | # main work dir
7 | root_dir = D:/test/test/toydata
8 | # the name of one dataset, such as facebook, linkedin
9 | dataset_name = linkedin
10 | # the suffix of the dataset, such as 10, 100, 1000
11 | suffix = 4
12 | # the relation name of the data, such as classmate, family, school, work
13 | class_name = work
14 | # the index of the dataset file
15 | index = 3
16 | 
17 | ############################################
18 | # paths for some prepared data
19 | ############################################
20 | # the file path of words embeddings
21 | wordsEmbeddings_path = %(root_dir)s/%(dataset_name)s/nodesFeatures
22 | # the file which contains sub-paths
23 | subpaths_file = %(root_dir)s/%(dataset_name)s/subpathsSaveFile
24 | 
25 | ############################################
26 | # experiment parameters - do not need to change frequently
27 | ############################################
28 | # the max length for sub-paths
29 | maxlen_subpaths = 1000
30 | # the size of the words vocabulary
31 | wordsSize = 1000000
32 | # sequences longer than this are ignored
33 | maxlen = 1000
34 | # the mini-batch size for training
35 | batch_size = 4
36 | # whether to shuffle the training data when building batches
37 | is_shuffle_for_batch = True
38 | # how often (in updates) to display progress
39 | dispFreq = 2
40 | # how often (in updates) to save the parameters
41 | saveFreq = 2
42 | # the path for saving parameters. It is generated from main_dir, dataset_name, suffix, class_name and index, and will be built in the code.
43 | saveto = 
44 | # the number of top candidates to predict (top-k)
45 | top_num = 1
46 | 
47 | ############################################
48 | # experiment parameters - need to tune
49 | ############################################
50 | # learning rate
51 | lrate = 0.0001
52 | # dimension of words embeddings
53 | word_dimension = 4
54 | # the dimension of path embeddings
55 | dimension = 5
56 | # the output of the LSTM for one path. There are three options: "h" uses only the last hidden state h; "mean-pooling" uses the mean-pooling of all hi; "max-pooling" uses the max-pooling of all hi.
57 | h_output_method = max-pooling
58 | # the discount parameter alpha. The longer the sub-path, the smaller its weight.
59 | discount_alpha = 0.3
60 | # how to combine several sub-path embeddings into one: "mean-pooling" averages them; "max-pooling" takes the element-wise maximum.
61 | subpaths_pooling_method = max-pooling
62 | # loss function; we use sigmoid
63 | objective_function_method = sigmoid
64 | # the parameter beta in the loss function
65 | objective_function_param = 0.5
66 | # the max number of epochs for training
67 | max_epochs = 50
68 | # decay for lstm_W
69 | decay_lstm_W = 0.0001
70 | # decay for lstm_U
71 | decay_lstm_U = 0.0001
72 | # decay for lstm_b
73 | decay_lstm_b = 0.0001
74 | # decay for w
75 | decay_w = 0.0001
76 | 
--------------------------------------------------------------------------------
/code/symmetric/python - model/toolsFunction.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | '''
3 | some utility functions
4 | '''
5 | 
6 | def mapSortByValueDESC(map,top):
7 | """
8 | sort a dict by value in descending order and return the top keys
9 | """
10 | if top>len(map):
11 | top=len(map)
12 | items=map.items()
13 | backitems=[[v[1],v[0]] for v in items]
14 | backitems.sort(reverse=True)
15 | e=[ backitems[i][1] for i in range(top)]
16 | return e
17 | 
18 | 
19 | def mapSortByValueASC(map,top):
20 | """
21 | sort a dict by value in ascending order and return the top keys
22 | """
23 | if top>len(map):
24 | top=len(map)
25 | items=map.items()
26 | backitems=[[v[1],v[0]] for v in items]
27 | backitems.sort()
28 | e=[ backitems[i][1] for i in range(top)]
29 | return e
30 | 
31 | 
--------------------------------------------------------------------------------
/toy_data/ReadMe.txt:
--------------------------------------------------------------------------------
1 | This directory contains a toy data set for the ProxEmbed model.
2 | The toy data set is constructed according to Figure 1 in the following paper:
3 | 
4 | @inproceedings{LiuZZZCWY17,
5 | author = {Liu, Zemin and Zheng, Vincent W. and Zhao, Zhou and Zhu, Fanwei and Chang, Kevin Chen-Chuan and Wu, Minghui and Ying, Jing},
6 | title = {Semantic Proximity Search on Heterogeneous Graph by Proximity Embedding},
7 | booktitle = {Proc. of the 31st AAAI Conference on Artificial Intelligence},
8 | series = {AAAI '17},
9 | year = {2017}
10 | }
11 | 
12 | Please cite the above reference when using our code and data.
13 | ========================================================================================================
14 | 
15 | 1. In each folder 'dataset' (where 'dataset' = linkedin or dblp)
16 | 
17 | File "graph.node" : The nodes in each graph. Each row has 3 columns: column 1 is the node ID, column 2 is the node type, column 3 is the node value. (Only columns 1 and 2 are used.)
18 | 
19 | File "graph.edge" : The edges in each graph. Each row indicates a directed edge.
20 | 
21 | ******************************************
22 | 
23 | 2. In each folder 'dataset.splits' (where 'dataset' = linkedin or dblp)
24 | 
25 | 2.1. In sub-folder 'train.labelSize' (where 'labelSize' = 4, 100 or 1000) (in our real dataset, 'labelSize' = 10, 100 or 1000)
26 | 
27 | File "train_relation_splitId" : One split of training data for a relation. 'splitId' runs from 1 to 3, so we train our model on 3 different training data sets. (In our real dataset, 'splitId' runs from 1 to 10.)
28 | 
29 | Example : for "./linkedin.splits/train.4/train_school_1"
30 | a) we have 4 labels, where each label is a tuple of <queryNodeId, targetNodeId_1, targetNodeId_2>, meaning: given queryNodeId, targetNodeId_1 is closer to queryNode than targetNodeId_2.
31 | b) it is the 1st split of training data for relation 'school'.
32 | 
33 | We do not provide 'labelSize' = 100 and 1000, because the toy data is too small.
34 | 
35 | 2.2. In sub-folder 'test'
36 | 
37 | File "test_relation_splitId" : One split of test data for a relation.
38 | 
39 | Example : for "./linkedin.splits/test/test_school_1"
40 | a) it is the 1st split of test data for relation 'school', thus used by all the "./linkedin.splits/train.labelSize/train_school_1" files (where 'labelSize' = 4, 100 or 1000). (In our real dataset, 'labelSize' = 10, 100 or 1000.)
41 | b) each line of the file is <queryNodeId, targetNodeId_1, targetNodeId_2, ..., targetNodeId_m>, meaning: given queryNodeId, we want to apply our model and generate a ranking list over targetNodeId_1, targetNodeId_2, ..., targetNodeId_m. For different lines, m can be different.
42 | 
43 | 
44 | 2.3. In sub-folder 'ideal'
45 | 
46 | File "ideal_relation_splitId" : One split of ideal ranking for a relation.
47 | 
48 | Example : for "./linkedin.splits/ideal/ideal_school_1"
49 | a) it is the 1st split of ideal ranking for relation 'school', which is used to evaluate the ranking prediction for "./linkedin.splits/test/test_school_1";
50 | b) each line of the file is <queryNodeId, targetNodeId_g1, targetNodeId_g2, ..., targetNodeId_gn>, meaning: given queryNodeId, the ground-truth relevant target nodes are targetNodeId_g1, targetNodeId_g2, ..., targetNodeId_gn. Note that gn can be different from m; i.e., for the queryNodeId, there are m test targetNodeId's for us to rank, but in the end only a subset of them are relevant by ground truth.
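To make the file formats above concrete, here is a minimal Python sketch of how they could be read. It is illustrative only: the helper names (load_nodes, load_edges, load_id_lists) are not part of the released code, which has its own loaders.

# Illustrative helpers only -- not part of the released code.
def load_nodes(path):
    # graph.node: each row is "nodeId nodeType nodeValue"; only columns 1 and 2 are used
    nodes = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 2:
                nodes[int(parts[0])] = parts[1]
    return nodes

def load_edges(path):
    # graph.edge: each row is "srcNodeId dstNodeId" (a directed edge)
    with open(path) as f:
        return [tuple(int(x) for x in line.split()) for line in f if line.strip()]

def load_id_lists(path):
    # train/test/ideal splits: each row is a whitespace-separated list of node IDs,
    # starting with queryNodeId (see the per-folder descriptions above)
    with open(path) as f:
        return [[int(x) for x in line.split()] for line in f if line.strip()]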
51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/ideal/ideal_advisee_1: -------------------------------------------------------------------------------- 1 | 5 8 2 | 13 0 3 | 13 11 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/ideal/ideal_advisee_2: -------------------------------------------------------------------------------- 1 | 5 8 2 | 13 0 3 | 13 11 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/ideal/ideal_advisee_3: -------------------------------------------------------------------------------- 1 | 13 11 2 | 5 8 3 | 13 0 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/ideal/ideal_advisor_1: -------------------------------------------------------------------------------- 1 | 11 13 2 | 0 13 3 | 8 5 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/ideal/ideal_advisor_2: -------------------------------------------------------------------------------- 1 | 8 5 2 | 11 13 3 | 0 13 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/ideal/ideal_advisor_3: -------------------------------------------------------------------------------- 1 | 0 13 2 | 8 5 3 | 11 13 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/test/test_advisee_1: -------------------------------------------------------------------------------- 1 | 5 8 0 13 11 2 | 13 0 5 8 3 | 13 5 11 8 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/test/test_advisee_2: -------------------------------------------------------------------------------- 1 | 5 11 8 13 0 2 | 13 0 5 8 3 | 13 5 8 11 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/test/test_advisee_3: -------------------------------------------------------------------------------- 1 | 13 11 5 8 2 | 5 0 13 11 8 3 | 13 5 8 0 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/test/test_advisor_1: -------------------------------------------------------------------------------- 1 | 11 5 8 0 13 2 | 0 5 8 13 11 3 | 8 11 5 0 13 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/test/test_advisor_2: -------------------------------------------------------------------------------- 1 | 8 11 13 0 5 2 | 11 8 5 13 0 3 | 0 13 5 8 11 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/test/test_advisor_3: -------------------------------------------------------------------------------- 1 | 0 5 13 8 11 2 | 8 5 0 13 11 3 | 11 5 8 0 13 4 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/train.4/train_advisee_1: -------------------------------------------------------------------------------- 1 | 13 0 8 2 | 5 8 11 3 | 13 11 5 4 | 5 8 0 5 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/train.4/train_advisee_2: -------------------------------------------------------------------------------- 1 | 5 8 13 2 | 13 11 5 3 | 13 11 8 4 | 5 8 0 5 | 
-------------------------------------------------------------------------------- /toy_data/dblp.splits/train.4/train_advisee_3: -------------------------------------------------------------------------------- 1 | 13 11 8 2 | 13 0 5 3 | 5 8 13 4 | 5 8 0 5 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/train.4/train_advisor_1: -------------------------------------------------------------------------------- 1 | 8 5 11 2 | 11 13 5 3 | 0 13 5 4 | 11 13 8 5 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/train.4/train_advisor_2: -------------------------------------------------------------------------------- 1 | 11 13 5 2 | 0 13 8 3 | 8 5 13 4 | 8 5 11 5 | -------------------------------------------------------------------------------- /toy_data/dblp.splits/train.4/train_advisor_3: -------------------------------------------------------------------------------- 1 | 0 13 11 2 | 0 13 5 3 | 11 13 8 4 | 8 5 13 5 | -------------------------------------------------------------------------------- /toy_data/dblp/graph.edge: -------------------------------------------------------------------------------- 1 | 0 14 2 | 14 0 3 | 1 2 4 | 2 1 5 | 1 3 6 | 3 1 7 | 2 6 8 | 6 2 9 | 3 4 10 | 4 3 11 | 4 5 12 | 5 4 13 | 4 13 14 | 13 4 15 | 5 6 16 | 6 5 17 | 5 9 18 | 9 5 19 | 6 7 20 | 7 6 21 | 7 9 22 | 9 7 23 | 8 9 24 | 9 8 25 | 9 10 26 | 10 9 27 | 10 12 28 | 12 10 29 | 11 12 30 | 12 11 31 | 12 13 32 | 13 12 33 | 13 14 34 | 14 13 35 | -------------------------------------------------------------------------------- /toy_data/dblp/graph.node: -------------------------------------------------------------------------------- 1 | 0 user Ivan 2 | 1 paper Paper-A 3 | 2 conference AAAI 4 | 3 keyword embedding 5 | 4 paper Paper-C 6 | 5 user Helen 7 | 6 paper Paper-B 8 | 7 keyword graph 9 | 8 user Karl 10 | 9 paper Paper-D 11 | 10 year 2016 12 | 11 user Larry 13 | 12 paper Paper-F 14 | 13 user Jane 15 | 14 paper Paper-E 16 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/ideal/ideal_school_1: -------------------------------------------------------------------------------- 1 | 3 0 2 | 12 9 3 | 9 12 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/ideal/ideal_school_2: -------------------------------------------------------------------------------- 1 | 0 3 2 | 9 12 3 | 12 9 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/ideal/ideal_school_3: -------------------------------------------------------------------------------- 1 | 9 12 2 | 3 0 3 | 0 3 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/ideal/ideal_work_1: -------------------------------------------------------------------------------- 1 | 4 6 2 | 9 12 3 | 7 9 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/ideal/ideal_work_2: -------------------------------------------------------------------------------- 1 | 6 4 2 | 7 9 3 | 12 9 -------------------------------------------------------------------------------- /toy_data/linkedin.splits/ideal/ideal_work_3: -------------------------------------------------------------------------------- 1 | 4 6 2 | 6 4 3 | 9 7 4 | -------------------------------------------------------------------------------- 
/toy_data/linkedin.splits/test/test_school_1: -------------------------------------------------------------------------------- 1 | 3 7 9 0 12 6 2 | 12 6 4 7 9 3 3 | 9 7 3 12 4 0 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/test/test_school_2: -------------------------------------------------------------------------------- 1 | 0 9 3 12 4 6 2 | 9 12 7 3 6 4 3 | 12 6 4 3 7 9 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/test/test_school_3: -------------------------------------------------------------------------------- 1 | 9 7 3 12 0 4 2 | 3 7 9 12 0 6 3 | 0 6 3 9 12 7 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/test/test_work_1: -------------------------------------------------------------------------------- 1 | 4 0 3 6 7 9 2 | 9 12 3 0 4 6 3 | 7 3 0 9 4 6 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/test/test_work_2: -------------------------------------------------------------------------------- 1 | 6 4 3 7 9 12 2 | 7 0 9 12 6 4 3 | 12 4 0 3 7 9 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/test/test_work_3: -------------------------------------------------------------------------------- 1 | 4 0 3 6 7 9 2 | 6 7 3 4 0 9 3 | 9 7 3 0 4 6 4 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/train.4/train_school_1: -------------------------------------------------------------------------------- 1 | 0 3 4 2 | 12 9 4 3 | 3 0 12 4 | 9 12 7 5 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/train.4/train_school_2: -------------------------------------------------------------------------------- 1 | 9 12 7 2 | 3 0 6 3 | 0 3 4 4 | 12 9 6 5 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/train.4/train_school_3: -------------------------------------------------------------------------------- 1 | 0 3 4 2 | 3 0 6 3 | 9 12 7 4 | 12 9 7 5 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/train.4/train_work_1: -------------------------------------------------------------------------------- 1 | 4 6 3 2 | 12 9 0 3 | 9 7 4 4 | 7 9 12 5 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/train.4/train_work_2: -------------------------------------------------------------------------------- 1 | 7 9 0 2 | 6 4 3 3 | 4 6 0 4 | 12 9 3 5 | -------------------------------------------------------------------------------- /toy_data/linkedin.splits/train.4/train_work_3: -------------------------------------------------------------------------------- 1 | 12 9 3 2 | 7 9 6 3 | 4 6 3 4 | 6 4 12 5 | -------------------------------------------------------------------------------- /toy_data/linkedin/graph.edge: -------------------------------------------------------------------------------- 1 | 0 1 2 | 1 0 3 | 0 2 4 | 2 0 5 | 0 3 6 | 3 0 7 | 1 3 8 | 3 1 9 | 2 3 10 | 3 2 11 | 2 4 12 | 4 2 13 | 2 6 14 | 6 2 15 | 3 7 16 | 7 3 17 | 4 5 18 | 5 4 19 | 4 6 20 | 6 4 21 | 5 6 22 | 6 5 23 | 6 10 24 | 10 6 25 | 6 12 26 | 12 6 27 | 7 8 28 | 8 7 29 | 7 9 30 | 9 7 31 | 8 9 32 | 9 8 33 | 9 10 34 | 10 9 35 | 9 11 36 | 11 9 37 | 10 12 
38 | 12 10 39 | 11 12 40 | 12 11 41 | -------------------------------------------------------------------------------- /toy_data/linkedin/graph.node: -------------------------------------------------------------------------------- 1 | 0 user Alice 2 | 1 college UCLA 3 | 2 location L.A. 4 | 3 user Bob 5 | 4 user Emily 6 | 5 employer Google 7 | 6 user Frances 8 | 7 user Chris 9 | 8 employer Facebook 10 | 9 user Donna 11 | 10 employer Apple 12 | 11 college UIUC 13 | 12 user Glen 14 | --------------------------------------------------------------------------------
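End note: the sketch below shows one way the settings in pythonParamsConfig could be read and turned into the concrete file paths used for training and evaluation. The real entry point is experimentForOneFileByParams.py, whose implementation is not shown here, so treat this purely as an illustration (written in Python 2 to match the model code above).

# Illustration only -- experimentForOneFileByParams.py is the actual entry point.
import ConfigParser  # Python 2 module name

cp = ConfigParser.ConfigParser()
cp.read('pythonParamsConfig')

root_dir = cp.get('param', 'root_dir')
dataset_name = cp.get('param', 'dataset_name')
suffix = cp.get('param', 'suffix')
class_name = cp.get('param', 'class_name')
index = cp.get('param', 'index')

# values written as %(root_dir)s/... are interpolated automatically by ConfigParser
wordsEmbeddings_path = cp.get('param', 'wordsEmbeddings_path')
subpaths_file = cp.get('param', 'subpaths_file')

# derived paths, following the naming convention of the toy data splits
trainingDataFile = '%s/%s.splits/train.%s/train_%s_%s' % (root_dir, dataset_name, suffix, class_name, index)
test_data_file = '%s/%s.splits/test/test_%s_%s' % (root_dir, dataset_name, class_name, index)
ideal_data_file = '%s/%s.splits/ideal/ideal_%s_%s' % (root_dir, dataset_name, class_name, index)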