├── .gitignore ├── Makefile ├── README.md ├── aux ├── archive │ ├── downsample_dblp_hin.py │ ├── filter_eval_data.py │ ├── find_downsampled_embedding.py │ ├── match_eval_file_to_downsampled_hin.py │ └── remove_keyword.py ├── calc_edge_type_correlation_jaccard_nparray.py ├── downsample_eval_file.py ├── filter_hin_and_eval_file_for_aspem.py ├── find_inner.py ├── logit.py ├── logit_fast.py ├── logit_yago.py ├── logit_yago_fast.py ├── merge_edges_with_all_types.py ├── normalize_edge_weight_aspem.py ├── plot_aw_vs_ay.py ├── plot_from_nparray.py ├── preprocess_dblp_for_aspem.py └── separate_edges_by_types.py ├── eval ├── README.md ├── archive │ ├── edge_knock.py │ ├── edge_rec_eval_inner_prod.py │ ├── edge_rec_eval_score_provided.py │ ├── edge_rec_eval_temp.py │ ├── edgeknock.py │ ├── mrr_from_embedding_output_more.py │ ├── mrr_from_score_output_more.py │ ├── yago_mrr_from_embedding.py │ └── yago_mrr_from_score.py ├── mrr_from_embedding.py └── mrr_from_score.py ├── preprocessing ├── Readme.md ├── edge_knock │ ├── Readme.md │ ├── find_center_paper.py │ ├── gen_eval_file_from_knocked_out_hin.py │ ├── gen_training_file_for_logit.py │ ├── knock_out_hin.py │ └── ko_hin.py └── ko_hin.py ├── pretrain ├── line.cpp ├── makefile ├── ransampl.c └── ransampl.h ├── run ├── archive │ ├── aspem_batch_eval_dblp_0.2.sh │ ├── aspem_eval_dblp_0.2.sh │ ├── batch_eval_dblp_from_score.sh │ ├── batch_eval_yago_from_score.sh │ ├── edge_rec_eval_for_others_dblp_0.2.sh │ ├── edge_rec_eval_for_others_dblp_0.2_downsampled.sh │ ├── edge_rec_eval_for_others_dblp_0.4.sh │ ├── edge_rec_eval_for_others_yago_0.1.sh │ ├── edge_rec_eval_for_others_yago_0.4.sh │ ├── edge_rec_eval_pytorch_out.sh │ ├── pretrain_and_eval_yago_hins.sh │ ├── pretrain_and_eval_yago_hins_qi_filtered.sh │ ├── pretrain_dblp_aspem.sh │ ├── pretrain_eval.sh │ ├── pretrain_eval_0.2_downsampled.sh │ ├── pretrain_more_dblp_varying_knowout_rate.sh │ ├── pretrain_more_hins.sh │ ├── pretrain_more_hins_0.2_downsampled.sh │ ├── pretrain_more_hins_0.2_downsampled_keyless.sh │ ├── pretrain_more_yago_varying_knowout_rate.sh │ ├── pretrain_no_gender_yago_varying_knowout_rate.sh │ ├── re_eval_dblp_all_ko_rates.sh │ └── rename_edge_types.sh ├── eval_heer.sh └── knock_out_hin_and_pretrain.sh └── src ├── case.sh ├── decoder.py ├── emb_lib.py ├── eval.sh ├── main.py ├── neg.py ├── network.py ├── pred.py ├── pred_case.py ├── run.sh ├── test.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | .idea/ 4 | data/ 5 | input_data/ 6 | intermediate_data/ 7 | log/ 8 | output/ 9 | pretrain/line 10 | *.o 11 | archive/ 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INPUTDIR = ./input_data 2 | INTMEDIR = ./intermediate_data 3 | MODELDIR = ./intermediate_data/model 4 | PERTYPEDIR = ./intermediate_data/per_type_temp 5 | OUTPUTDIR = ./output 6 | LOGDIR = ./log 7 | 8 | all: 9 | mkdir -p $(INPUTDIR) $(INTMEDIR) $(MODELDIR) $(PERTYPEDIR) $(OUTPUTDIR) $(LOGDIR) 10 | $(MAKE) -C pretrain/ 11 | 12 | clean: # intentionally omitted make clean to avoid accidentally deleting all data 13 | $(MAKE) -C pretrain/ clean 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HEER: Easing Embedding Learning by Comprehensive Transcription of Heterogeneous 
Information Networks 2 | 3 | Source code and data for the KDD'18 paper *[Easing Embedding Learning by Comprehensive Transcription of 4 | Heterogeneous Information Networks](https://arxiv.org/abs/1807.03490)*. 5 | 6 | KDD promotional video: *[Click Here!](https://m.youtube.com/watch?v=LsOHdQ1Xdn8&feature=youtu.be)* 7 | ## Dependencies 8 | * python 2.7 9 | * python 3.5 10 | * [PyTorch 0.4](https://pytorch.org/) with GPU support 11 | 12 | ## Data 13 | We use two publicly available real-world HIN datasets, DBLP and YAGO, and provide links to the processed data for reproducing our results. 14 | * **DBLP** ([Tang et al., 2008](https://dl.acm.org/citation.cfm?id=1402008)): DBLP is a bibliographical network in the computer science domain. There are five types of nodes in the network: author, paper, key term, venue, and year. The edge types include the authorship (aut.), term usage (term), publishing venue (ven.), and publishing year (year) of a paper, as well as the reference relationship from one paper to another (ref.). [[download](https://s3.us-east-2.amazonaws.com/heer-data/dblp.zip)] [[pretrained LINE embeddings](https://s3.us-east-2.amazonaws.com/heer-data/pretrained_dblp_emb.zip)] 15 | * **YAGO** ([Suchanek et al., 2007](https://suchanek.name/work/publications/www2007.pdf)): YAGO is a large-scale knowledge graph derived from Wikipedia, WordNet, and GeoNames. There are seven types of nodes in the network: person, location, organization, piece of work, prize, position, and event. A total of 24 edge types exist in the network, five of which are directed and the rest undirected. [[download](https://s3.us-east-2.amazonaws.com/heer-data/yago.zip)] [[pretrained LINE embeddings](https://s3.us-east-2.amazonaws.com/heer-data/pretrained_yago_emb.zip)] 16 | 17 | ## Preparation 18 | Download the zipped data and pretrained embeddings via the links above, then create the working directories and compile the LINE pre-training code: 19 | ``` 20 | $ make 21 | ``` 22 | Place the unzipped pretrained embeddings under intermediate_data/ and the network files under input_data/. 23 | 24 | ## Train HEER 25 | HEER takes two arguments: the network name and the number of epochs. For our proposed edge reconstruction task, $network is formatted as *"$data-name"*\_*ko*\_*"$ko-rate"*, e.g. yago_ko_0.4. Both the DBLP and YAGO datasets are available with knock-out rates from 0.1 to 0.9 via the links above. 26 | ### Example Usage 27 | ``` 28 | $ ./src/run.sh $network $epoch 29 | ``` 30 | ### Default Run & Parameters 31 | The command below trains HEER on the YAGO dataset with a knock-out rate of 0.4 for 61 epochs. The default model dump interval is 6 epochs, so this run produces 10 model checkpoints. 32 | ``` 33 | $ ./src/run.sh yago_ko_0.4 61 34 | ``` 35 | 36 | ## Evaluation 37 | Similar to training, the command below evaluates HEER on the YAGO dataset with a knock-out rate of 0.4. The micro-MRR, macro-MRR, and per-edge-type MRR can be found in the evaluation result files under output/. 38 | ``` 39 | $ ./src/eval.sh yago_ko_0.4 61 40 | ``` 41 | ## Play with Your Own Data 42 | We also provide tools to generate training and test data from any HIN. You can find detailed instructions under [preprocessing/](https://github.com/GentleZhu/HEER/tree/master/preprocessing). In short, you need to prepare a formatted edge list and a data-specific config file, then pre-train LINE embeddings via [pretrain/](https://github.com/GentleZhu/HEER/tree/master/pretrain); the overall pipeline is sketched below.
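A minimal end-to-end sketch of that pipeline (the names `my_hin.net` and `my_net`, the output path, and the resulting network name `my_net_ko_0.4` are placeholders; the exact LINE pre-training invocation is documented in [pretrain/](https://github.com/GentleZhu/HEER/tree/master/pretrain)):
```
# 1. Knock out edges to create the training HIN and the evaluation file (arguments explained below)
$ python ./preprocessing/ko_hin.py --input-hin-file my_hin.net --data-set-name my_net --path-output ./input_data/ --ko-rate 0.4
# 2. Pre-train LINE embeddings on the knocked-out HIN and place them under intermediate_data/
# 3. Train and evaluate HEER on the resulting network
$ ./src/run.sh my_net_ko_0.4 61
$ ./src/eval.sh my_net_ko_0.4 61
```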
Take **yago.config** as an example: 43 | 44 | [[0, 1], [0, 2], [0, 2], [0, 1], [0, 3], [0, 4], [4, 4], [0, 4], [0, 4], [0, 1], [0, 4], [0, 0], [0, 0], [0, 1], [0, 5], [0, 0], [2, 4], [0, 2], [0, 0], [6, 4], [0, 1], [0, 4], [0, 0], [4, 4]] 45 | ['PE', 'WO', 'AS', 'PR', 'AD', 'PO', 'EV'] 46 | [':u', ':u', ':u', ':u', ':u', ':u', ':d', ':u', ':u', ':u', ':u', ':d', ':u', ':u', ':u', ':d', ':u', ':u', ':u', ':u', ':u', ':u', ':d', ':d'] 47 | [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1] 48 | 49 | The first line gives the left and right node types of each edge type, written as indices into the node type list. The second line is the node type list itself. The third and fourth lines describe the edge types and their directions: for example, the first `:u` means the corresponding relation (`created`) is undirected, and the first `0` in the fourth line marks it as undirected as well. 50 | 51 | You can create your own training and evaluation files using the command below; please refer to [preprocessing/](https://github.com/GentleZhu/HEER/blob/master/preprocessing/Readme.md) for more details: 52 | ``` 53 | $ python ./preprocessing/ko_hin.py --input-hin-file your-data --data-set-name preferred-network-name --path-output output-path --ko-rate 0.x 54 | ``` 55 | -------------------------------------------------------------------------------- /aux/archive/downsample_dblp_hin.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | 4 | input_hin = sys.argv[1] 5 | output_hin = sys.argv[2] 6 | 7 | smp_rate = 0.1 8 | 9 | paper_set = set() 10 | with open(input_hin, "r") as f_in: 11 | for line in f_in: 12 | node_1, node_2, _ = line.strip().split() 13 | if "P" in node_1: 14 | paper_set.add(node_1) 15 | if "P" in node_2: 16 | paper_set.add(node_2) 17 | 18 | remaining_set = set(random.sample(paper_set, int(len(paper_set)*smp_rate))) 19 | 20 | with open(input_hin, "r") as f_in, open(output_hin, "w") as f_out: 21 | for line in f_in: 22 | node_1, node_2, _ = line.strip().split() 23 | if ("P" in node_1 and node_1 not in remaining_set) or ("P" in node_2 and node_2 not in remaining_set): 24 | continue 25 | 26 | f_out.write(line) 27 | -------------------------------------------------------------------------------- /aux/archive/filter_eval_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | input_hin = sys.argv[1] 4 | input_eval = sys.argv[2] 5 | output_eval = sys.argv[3] 6 | 7 | neg_size = 10 8 | 9 | node_set = set() 10 | with open(input_hin, "r") as f_in_hin: 11 | for line in f_in_hin: 12 | node_1, node_2, _ = line.strip().split() 13 | node_set.add(node_1) 14 | node_set.add(node_2) 15 | 16 | with open(input_eval, "r") as f_in_eval, open(output_eval, "w") as f_out_eval: 17 | cur_batch = "" 18 | cur_validity = True 19 | for idx, line in enumerate(f_in_eval): 20 | cur_batch += line 21 | node_1, node_2, _, __ = line.strip().split() 22 | if node_1 not in node_set or node_2 not in node_set: 23 | cur_validity = False 24 | 25 | if (idx + 1) % (2 * neg_size + 1) == 0: 26 | if cur_validity is True: 27 | f_out_eval.write(cur_batch) 28 | cur_batch = "" 29 | cur_validity = True 30 | -------------------------------------------------------------------------------- /aux/archive/find_downsampled_embedding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | 4 | input_hin = sys.argv[1] 5 | input_emb = sys.argv[2] 6 | output_emb = sys.argv[3] 7 |
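# The first pass below collects every node ID that appears in the downsampled HIN; the second pass
# keeps only the embedding rows of those nodes and rewrites the header with the new node count.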
8 | node_set = set() 9 | with open(input_hin, "r") as f_in: 10 | for line in f_in: 11 | node_1, node_2, _ = line.strip().split() 12 | node_set.add(node_1) 13 | node_set.add(node_2) 14 | 15 | with open(input_emb, "r") as f_in, open(output_emb, "w") as f_out: 16 | first_line_split = f_in.readline().strip().split() 17 | valid_lines = [] 18 | for line in f_in: 19 | line_split = line.strip().split() 20 | if line_split[0] not in node_set: 21 | continue 22 | 23 | valid_lines.append(line) 24 | 25 | num_nodes = len(valid_lines) 26 | f_out.write(str(num_nodes) + " " + first_line_split[1] + "\n") 27 | f_out.writelines(valid_lines) 28 | -------------------------------------------------------------------------------- /aux/archive/match_eval_file_to_downsampled_hin.py: -------------------------------------------------------------------------------- 1 | # the negative generation methods are approximate 2 | import argparse 3 | import random 4 | 5 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 6 | parser.add_argument("--input-eval", nargs="?", help="Input evaluation filename.", type=str) 7 | parser.add_argument("--input-hin", nargs="?", help="Input HIN filename.", type=str) 8 | parser.add_argument("--output-file", nargs="?", help="Output matched evaluation filename.", type=str) 9 | parser.add_argument('--neg-size', nargs="?", type=int, default=10, help="Negative pairs for each positive pair in one direction, default 10.") 10 | args = parser.parse_args() 11 | 12 | input_eval = args.input_eval 13 | input_hin = args.input_hin 14 | output_eval = args.output_file 15 | neg_size = args.neg_size 16 | 17 | threshold_for_valid_neg = 2 18 | 19 | node_set = set() 20 | with open(input_hin, "r") as f_in_hin: 21 | for line in f_in_hin: 22 | node_1, node_2, _ = line.strip().split() 23 | node_set.add(node_1) 24 | node_set.add(node_2) 25 | 26 | with open(input_eval, "r") as f_in_eval, open(output_eval, "w") as f_out_eval: 27 | for idx, line in enumerate(f_in_eval): 28 | if idx % (2 * neg_size + 1) == 0: 29 | neg_left_line_list = [] 30 | neg_right_line_list = [] 31 | 32 | pos_line = line 33 | node_1, node_2, _, __ = line.strip().split() 34 | if node_1 not in node_set or node_2 not in node_set: 35 | pos_line_keep = False 36 | else: 37 | pos_line_keep = True 38 | elif 0 < idx % (2 * neg_size + 1) <= neg_size: # neg_left 39 | node_1, node_2, _, __ = line.strip().split() 40 | if node_1 in node_set and node_2 in node_set: 41 | neg_left_line_list.append(line) 42 | elif neg_size < idx % (2 * neg_size + 1) <= 2*neg_size: # neg_right 43 | node_1, node_2, _, __ = line.strip().split() 44 | if node_1 in node_set and node_2 in node_set: 45 | neg_right_line_list.append(line) 46 | else: 47 | raise Exception("Wrong index.") 48 | 49 | if idx % (2 * neg_size + 1) == 2*neg_size: # on end of block 50 | # continue is the pos in this bag is invalid 51 | if pos_line_keep is False: 52 | continue 53 | 54 | # continue if either left or right is invalid 55 | if len(neg_left_line_list) < threshold_for_valid_neg or len(neg_right_line_list) < threshold_for_valid_neg: 56 | continue 57 | 58 | len_neg_left = len(neg_left_line_list) 59 | new_neg_left_line_list = neg_left_line_list * int(neg_size/len_neg_left) + neg_left_line_list[:neg_size%len_neg_left] 60 | 61 | len_neg_right = len(neg_right_line_list) 62 | new_neg_right_line_list = neg_right_line_list * int(neg_size/len_neg_right) + neg_right_line_list[:neg_size%len_neg_right] 63 | 64 | f_out_eval.write(pos_line) 65 | f_out_eval.writelines(new_neg_left_line_list) 66 | 
f_out_eval.writelines(new_neg_right_line_list) 67 | 68 | -------------------------------------------------------------------------------- /aux/archive/remove_keyword.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | input_file = sys.argv[1] 4 | output_file = sys.argv[2] 5 | 6 | with open(input_file, "r") as f_in, open(output_file, "w") as f_out: 7 | for line in f_in: 8 | entry_1, entry_2 = line.strip().split()[:2] 9 | if "W:" in entry_1 or "W:" in entry_2: 10 | continue 11 | f_out.write(line) -------------------------------------------------------------------------------- /aux/calc_edge_type_correlation_jaccard_nparray.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compute jaccard-like correlation, weighted and unweighted, between each pair of edge types 3 | """ 4 | 5 | import argparse 6 | from collections import defaultdict 7 | from random import random 8 | import numpy as np 9 | 10 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 11 | parser.add_argument("--input", nargs="?", help="Input HIN filename.", type=str) 12 | parser.add_argument("--output-prefix", nargs="?", help="Output correlation filename prefix.", type=str) 13 | parser.add_argument("--sample-rate", default=1.0, nargs="?", help="Output correlation filename.", type=float) 14 | args = parser.parse_args() 15 | 16 | """ 17 | First pass on input file to find normalization factor for each edge type 18 | """ 19 | total_weights_dict = defaultdict(float) 20 | normalization_multipliers_dict = {} 21 | with open(args.input, "r") as f_in: 22 | for line in f_in: 23 | attr_node, center_node, weight_str, edge_type = line.strip().split() 24 | total_weights_dict[edge_type] += float(weight_str) 25 | 26 | for edge_type in total_weights_dict: 27 | normalization_multipliers_dict[edge_type] = 1./total_weights_dict[edge_type] 28 | 29 | """ 30 | Second pass on input file to find egde type set and core center nodes 31 | """ 32 | edge_type_set = set() 33 | center_node_edge_dict = defaultdict(dict) # {center_node: {edge_type: {attr_node: weight}}} 34 | attr_node_edge_dict = defaultdict(dict) # {attr_node: {edge_type: {center_node: weight}}} 35 | with open(args.input, "r") as f_in: 36 | for idx, line in enumerate(f_in): 37 | attr_node, center_node, weight_str, edge_type = line.strip().split() 38 | 39 | edge_type_set.add(edge_type) 40 | if edge_type not in center_node_edge_dict[center_node]: 41 | center_node_edge_dict[center_node][edge_type] = defaultdict(float) 42 | if edge_type not in attr_node_edge_dict[attr_node]: 43 | attr_node_edge_dict[attr_node][edge_type] = defaultdict(float) 44 | 45 | center_node_edge_dict[center_node][edge_type][attr_node] += float(weight_str) * normalization_multipliers_dict[edge_type] 46 | attr_node_edge_dict[attr_node][edge_type][center_node] += float(weight_str) * normalization_multipliers_dict[edge_type] 47 | 48 | if idx % 10000 == 0: 49 | print "Line %d processed." 
% idx 50 | 51 | edge_type_list = list(edge_type_set) 52 | 53 | """ 54 | Third pass on input file to compute measures 55 | """ 56 | for i, edge_type_i in enumerate(edge_type_list): 57 | for edge_type_j in edge_type_list[i:]: 58 | num_center_node = len(center_node_edge_dict) 59 | num_center_node_processed = 0 60 | weighted_jac_list = [] 61 | unweighted_jac_list = [] 62 | for center_node in center_node_edge_dict: 63 | if random() > args.sample_rate: 64 | continue 65 | 66 | path_count_i_dict = defaultdict(float) 67 | path_count_j_dict = defaultdict(float) 68 | 69 | if (edge_type_i not in center_node_edge_dict[center_node]) or (edge_type_j not in center_node_edge_dict[center_node]): 70 | continue 71 | 72 | for attr_node_i in center_node_edge_dict[center_node][edge_type_i]: 73 | cur_weight = center_node_edge_dict[center_node][edge_type_i][attr_node_i] 74 | for linked_center_node in attr_node_edge_dict[attr_node_i][edge_type_i]: 75 | if linked_center_node == center_node: # do not consider itself 76 | continue 77 | path_count_i_dict[linked_center_node] += attr_node_edge_dict[attr_node_i][edge_type_i][linked_center_node] * cur_weight 78 | 79 | for attr_node_j in center_node_edge_dict[center_node][edge_type_j]: 80 | cur_weight = center_node_edge_dict[center_node][edge_type_j][attr_node_j] 81 | for linked_center_node in attr_node_edge_dict[attr_node_j][edge_type_j]: 82 | if linked_center_node == center_node: # do not consider itself 83 | continue 84 | path_count_j_dict[linked_center_node] += attr_node_edge_dict[attr_node_j][edge_type_j][linked_center_node] * cur_weight 85 | 86 | linked_center_node_union_set = set(path_count_i_dict) | set(path_count_j_dict) 87 | if len(linked_center_node_union_set) == 0: 88 | continue 89 | 90 | weighted_numerator = 0. 91 | weighted_denominator = 0. 92 | unweighted_numerator = 0. 93 | unweighted_denominator = 0. 94 | for linked_center_node in linked_center_node_union_set: 95 | cur_path_count_i = path_count_i_dict[linked_center_node] 96 | cur_path_count_j = path_count_j_dict[linked_center_node] 97 | assert cur_path_count_i > 0 or cur_path_count_j > 0. 98 | weighted_numerator += min(cur_path_count_i, cur_path_count_j) 99 | weighted_denominator += max(cur_path_count_i, cur_path_count_j) 100 | unweighted_numerator += 1. if min(cur_path_count_i, cur_path_count_j) > 0. else 0. 101 | unweighted_denominator += 1. 
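# For this center node, the weighted score is a generalized Jaccard coefficient over path weights
# (sum of element-wise minima over sum of maxima), while the unweighted score is the plain overlap
# ratio of center nodes reachable via the two edge types.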
102 | 103 | weighted_jac_list.append(weighted_numerator/(1.*weighted_denominator)) 104 | unweighted_jac_list.append(unweighted_numerator/(1.*unweighted_denominator)) 105 | 106 | num_center_node_processed += 1 107 | 108 | if num_center_node_processed % 1000 == 0: 109 | print "%d out of %d * %f center nodes processed for edge type pair %s and %s" % (num_center_node_processed, num_center_node, args.sample_rate, edge_type_i, edge_type_j) 110 | 111 | np.savez(args.output_prefix+"_"+edge_type_i+"_"+edge_type_j+".npz", 112 | weighted=np.asarray(weighted_jac_list), unweighted=np.asarray(unweighted_jac_list)) 113 | -------------------------------------------------------------------------------- /aux/downsample_eval_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import random 4 | 5 | def len_file(input_file): 6 | with open(input_file) as f: 7 | for i, l in enumerate(f): 8 | pass 9 | return i + 1 10 | 11 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 12 | parser.add_argument("--input-file", nargs="?", help="Input evaluation filename.", type=str) 13 | parser.add_argument("--output-file", nargs="?", help="Output downsampled evaluation filename.", type=str) 14 | #parser.add_argument('--sample-rate', nargs="?", type=float, default=0.1, help="Sample rate, default 0.1.") 15 | parser.add_argument('--sample-number', nargs="?", type=int, default=4000000, 16 | help="Sample number to approximate, inc. both pos and neg, 4000000.") 17 | parser.add_argument('--neg-size', nargs="?", type=int, default=10, help="Negative pairs for each positive pair in one direction, default 10.") 18 | args = parser.parse_args() 19 | 20 | input_eval = args.input_file 21 | output_eval = args.output_file 22 | neg_size = args.neg_size 23 | sample_number = args.sample_number 24 | 25 | input_eval_file_len = len_file(input_eval) 26 | smp_rate = 1.*sample_number/input_eval_file_len 27 | 28 | lines_to_write = [] 29 | with open(input_eval, "r") as f_in_eval, open(output_eval, "w") as f_out_eval: 30 | first_line = f_in_eval.readline() 31 | lines_to_write.append(first_line) # the number of batches is to be updated 32 | 33 | neg_size_from_file, num_batches_from_file = map(int, first_line.strip().split()) 34 | assert neg_size == neg_size_from_file 35 | 36 | new_num_batches = 0 37 | cur_batch = "" 38 | for idx, line in enumerate(f_in_eval): 39 | cur_batch += line 40 | if (idx + 1) % (2 * neg_size + 1) == 0: 41 | if random.random() < smp_rate: 42 | lines_to_write.append(cur_batch) 43 | new_num_batches += 1 44 | cur_batch = "" 45 | assert (idx + 1)/(2*neg_size + 1) == num_batches_from_file, "Number of positive edges does not agree." 
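# Rewrite the header with the new number of sampled evaluation batches (neg_size is unchanged),
# then dump all kept batches in a single write.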
46 | 47 | lines_to_write[0] = str(neg_size) + " " + str(new_num_batches) + "\n" 48 | f_out_eval.writelines(lines_to_write) 49 | -------------------------------------------------------------------------------- /aux/filter_hin_and_eval_file_for_aspem.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | input_file_aspect = sys.argv[1] 4 | input_file_hin = sys.argv[2] 5 | output_file_hin = sys.argv[3] 6 | input_file_eval = sys.argv[4] 7 | output_file_eval = sys.argv[5] 8 | 9 | edge_type_set = set() 10 | with open(input_file_aspect, "r") as f_in: 11 | for line in f_in: 12 | edge_type_set.add(line.strip()) 13 | 14 | with open(input_file_hin, "r") as f_in, open(output_file_hin, "w") as f_out: 15 | for line in f_in: 16 | edge_type = line.strip().split()[3].strip("-1") 17 | if edge_type in edge_type_set: 18 | f_out.write(line) 19 | 20 | with open(input_file_eval, "r") as f_in, open(output_file_eval, "w") as f_out: 21 | neg_rate, num_pos = map(int, f_in.readline().strip().split()) 22 | lines_out = [] 23 | for line in f_in: 24 | edge_type = line.strip().split()[3].strip("-1") 25 | if edge_type in edge_type_set: 26 | lines_out.append(line) 27 | 28 | f_out.writelines([str(neg_rate) + " " + str(len(lines_out)/(neg_rate*2+1))+ "\n"] + lines_out) 29 | 30 | 31 | -------------------------------------------------------------------------------- /aux/find_inner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | input_eval = sys.argv[1] 5 | input_emb = sys.argv[2] 6 | output_score = sys.argv[3] 7 | 8 | 9 | emb_dict={} 10 | with open(input_emb, "r") as f_in: 11 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 12 | count=0 13 | for line in f_in: 14 | line_split = line.strip().split() 15 | a=list(map(float, line_split[1:])) 16 | emb_dict[line_split[0]] = np.asarray(a) 17 | assert len(emb_dict) == num_nodes, "Number of nodes does not agree." 
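# With all embeddings in memory, each evaluation pair below is scored by the inner product of its
# two node vectors and written back in the same four-column format.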
18 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_emb) 19 | 20 | with open(input_eval, "r") as f_in, open(output_score, "w") as f_out: 21 | for line in f_in: 22 | node_1, node_2, _, edge_type = line.strip().split() 23 | 24 | cur_score = emb_dict[node_1].dot(emb_dict[node_2]) 25 | 26 | f_out.write(" ".join([node_1, node_2, str(cur_score), edge_type])+"\n") -------------------------------------------------------------------------------- /aux/logit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression # use this package 4 | from sklearn.utils import shuffle 5 | import time 6 | import threading 7 | import argparse 8 | 9 | 10 | # for counting file lines 11 | def file_len(f_name): 12 | with open(f_name) as f: 13 | for i, l in enumerate(f): 14 | pass 15 | return i + 1 16 | 17 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 18 | parser.add_argument("--input-label-train", nargs="?", help="Input label train.", type=str) 19 | parser.add_argument("--input-label-test", nargs="?", help="Input label test.", type=str) 20 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding", type=str) 21 | parser.add_argument("--output-file", nargs="?", help="Output filename.", type=str) 22 | args = parser.parse_args() 23 | 24 | 25 | input_label_train = args.input_label_train 26 | input_label_test = args.input_label_test 27 | input_embedding=args.input_embedding 28 | output_test_score = args.output_file 29 | 30 | 31 | 32 | 33 | 34 | """ 35 | Train logit model 36 | """ 37 | start_time = time.time() 38 | embedding_dict={} 39 | with open(input_embedding, "r") as f_in: 40 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 41 | count=0 42 | for line in f_in: 43 | line_split = line.strip().split() 44 | a=list(map(float, line_split[1:])) 45 | embedding_dict[line_split[0]] = np.asarray(a) 46 | count+=1 47 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 
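# The pass below builds per-edge-type training features: each (node_1, node_2) pair is represented
# by the element-wise (Hadamard) product of its two embeddings, and the same feature is also added
# under the reversed edge type (except the symmetric 'PP') so that both directions get a model.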
48 | 49 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 50 | 51 | feature_train_dic={} 52 | feature_train_list = [] 53 | with open(input_label_train, "r") as f_in: 54 | count=0 55 | for line in f_in: 56 | line=line.strip().split() 57 | node_1=embedding_dict[line[0]] 58 | 59 | node_2=embedding_dict[line[1]] 60 | edge=line[-1] 61 | y_value=line[2] 62 | if edge not in feature_train_dic: 63 | feature_train_dic[edge]={} 64 | feature_train_dic[edge]['tuple']=[] 65 | feature_train_dic[edge]['yvalue']=[] 66 | feature_train_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 67 | feature_train_dic[edge]['yvalue'].append(y_value) 68 | if edge!='PP': 69 | edge_reverse=edge[::-1] 70 | if edge_reverse not in feature_train_dic: 71 | feature_train_dic[edge_reverse]={} 72 | feature_train_dic[edge_reverse]['tuple']=[] 73 | feature_train_dic[edge_reverse]['yvalue']=[] 74 | feature_train_dic[edge_reverse]['tuple'].append(np.multiply(node_1,node_2)) 75 | feature_train_dic[edge_reverse]['yvalue'].append(y_value) 76 | count+=1 77 | if count%100000==0: 78 | print(count,' tuples read') 79 | 80 | for edge in feature_train_dic.keys(): 81 | f=[] 82 | f.append(np.array(feature_train_dic[edge]['tuple'])) 83 | feature_train_dic[edge]['Xtrain']= np.hstack(tuple(f)) 84 | num_instance_train = len(feature_train_dic[edge]['tuple']) 85 | feature_train_dic[edge]['tuple']=[] 86 | assert num_instance_train == feature_train_dic[edge]['Xtrain'].shape[0], "Train instance numbers do not match." 87 | y_train = np.zeros(num_instance_train) 88 | for i in range(num_instance_train): 89 | y_train[i] = int(feature_train_dic[edge]['yvalue'][i]) 90 | feature_train_dic[edge]['ytrain']=y_train 91 | print(edge,' finished') 92 | 93 | end_time = time.time() 94 | print ("Train features loading and stacking done. Time: {0}s seconds. ".format((end_time - start_time))) 95 | 96 | start_time = time.time() 97 | for edge in feature_train_dic.keys(): 98 | print('now training ',edge) 99 | logit_model = LogisticRegression(solver="sag",max_iter=1000) 100 | feature_train_dic[edge]['model']=logit_model 101 | 102 | threads=[] 103 | for edge in feature_train_dic.keys(): 104 | t = threading.Thread(target=feature_train_dic[edge]['model'].fit, args=(feature_train_dic[edge]['Xtrain'], feature_train_dic[edge]['ytrain']),name=edge) 105 | threads.append(t) 106 | t.start() 107 | ''' 108 | X_shuf, Y_shuf = shuffle(feature_train_dic[edge]['Xtrain'], feature_train_dic[edge]['ytrain']) 109 | logit_model = logit_model.fit(X_shuf, Y_shuf) 110 | feature_train_dic[edge]['model']=logit_model 111 | print(edge,' training is done.')''' 112 | has_running = True 113 | while has_running: 114 | num_done = 0 115 | for t in threads: 116 | if not t.isAlive(): 117 | # get results from thtead 118 | print(t.getName(),' training is done') 119 | num_done += 1 120 | if num_done == len(threads): 121 | break 122 | else: 123 | time.sleep(3) 124 | end_time = time.time() 125 | 126 | print ("Logit model fitting done. 
Training time: %s seconds" % (end_time - start_time)) 127 | 128 | 129 | 130 | """ 131 | Predict on test 132 | """ 133 | start_time = time.time() 134 | feature_test_dic={} 135 | with open(input_label_test, "r") as f_in: 136 | count=0 137 | for line in f_in: 138 | line=line.strip().split() 139 | node_1=embedding_dict[line[0]] 140 | node_2=embedding_dict[line[1]] 141 | edge=line[-1] 142 | yvalue=line[2] 143 | if edge not in feature_test_dic: 144 | feature_test_dic[edge]={} 145 | feature_test_dic[edge]['tuple']=[] 146 | #feature_test_dic[edge]['line']=[] 147 | feature_test_dic[edge]['current']=0 148 | feature_test_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 149 | #feature_test_dic[edge]['line'].append([line[0],line[1]]) 150 | end_time = time.time() 151 | print('finished reading test file, time: ', (end_time - start_time)) 152 | 153 | 154 | for edge in feature_test_dic.keys(): 155 | f=[] 156 | f.append(np.array(feature_test_dic[edge]['tuple'])) 157 | feature_test_dic[edge]['Xtest']=np.hstack(tuple(f)) 158 | num_instance_test = len(feature_test_dic[edge]['tuple']) 159 | assert num_instance_test == feature_test_dic[edge]['Xtest'].shape[0], "Test instance numbers do not match." 160 | # compute predicted value for file_2; a row of X_test is the vector -- emb(node_1) 161 | #hadamard-prod emb(node_2) -- where node_1 and node_2 are the two nodes on a line of file *2* 162 | proba_test = logit_model.predict_proba(feature_test_dic[edge]['Xtest']) 163 | #print(proba_test[:,1]) 164 | feature_test_dic[edge]['p_test']=proba_test[:,1] 165 | feature_test_dic[edge]['Xtest']=[] 166 | print('finished proba: ', edge) 167 | 168 | 169 | ## output a file with same format as file_2, with the third column replaced by your predicted value as in proba_test 170 | 171 | ## summary: input -- file_2, file_3, embedding file; 172 | ## output -- the file similar to file_2 with third column replace and 173 | ## note: please be careful that for each edge type (r), a different model will be trained and used for prediction 174 | ## example files: file_2 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt 175 | ## file_3 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_for_logit_training.txt 176 | ## emb_file -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_line_samples100000_alpha0.1_dim128.emb 177 | 178 | with open(input_label_test, "r") as f_in,open(output_test_score, "w+") as f_out: 179 | content="" 180 | rd=0 181 | for line in f_in: 182 | line=line.strip().split() 183 | edge=line[-1] 184 | current=feature_test_dic[edge]['current'] 185 | temp=line[0]+' '+line[1]+' '+str(feature_test_dic[edge]['p_test'][current])+' '+edge+'\n' 186 | #print(temp) 187 | current+=1 188 | feature_test_dic[edge]['current']=current 189 | content=content+temp 190 | rd+=1 191 | if rd%50000==0: 192 | print (rd,'lines finished') 193 | f_out.write(content) 194 | content='' 195 | f_out.write(content) 196 | f_out.close() 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /aux/logit_fast.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression # use this package 4 | from sklearn.utils import shuffle 5 | import time 6 | import threading 7 | import argparse 8 | 9 | 10 | # for counting file lines 11 | def file_len(f_name): 12 | with open(f_name) as f: 13 | for i, l in enumerate(f): 14 | 
pass 15 | return i + 1 16 | 17 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 18 | parser.add_argument("--input-label-train", nargs="?", help="Input label train.", type=str) 19 | parser.add_argument("--input-label-test", nargs="?", help="Input label test.", type=str) 20 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding", type=str) 21 | parser.add_argument("--output-file", nargs="?", help="Output filename.", type=str) 22 | args = parser.parse_args() 23 | 24 | 25 | input_label_train = args.input_label_train 26 | input_label_test = args.input_label_test 27 | input_embedding=args.input_embedding 28 | output_test_score = args.output_file 29 | 30 | 31 | 32 | 33 | 34 | """ 35 | Train logit model 36 | """ 37 | start_time = time.time() 38 | embedding_dict={} 39 | with open(input_embedding, "r") as f_in: 40 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 41 | count=0 42 | for line in f_in: 43 | line_split = line.strip().split() 44 | a=list(map(float, line_split[1:])) 45 | embedding_dict[line_split[0]] = np.asarray(a) 46 | count+=1 47 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 48 | 49 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 50 | 51 | feature_train_dic={} 52 | feature_train_list = [] 53 | with open(input_label_train, "r") as f_in: 54 | count=0 55 | for line in f_in: 56 | line=line.strip().split() 57 | node_1=embedding_dict[line[0]] 58 | 59 | node_2=embedding_dict[line[1]] 60 | edge=line[-1] 61 | y_value=line[2] 62 | if edge not in feature_train_dic: 63 | feature_train_dic[edge]={} 64 | feature_train_dic[edge]['tuple']=[] 65 | feature_train_dic[edge]['yvalue']=[] 66 | feature_train_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 67 | feature_train_dic[edge]['yvalue'].append(y_value) 68 | if edge!='PP': 69 | edge_reverse=edge[::-1] 70 | if edge_reverse not in feature_train_dic: 71 | feature_train_dic[edge_reverse]={} 72 | feature_train_dic[edge_reverse]['tuple']=[] 73 | feature_train_dic[edge_reverse]['yvalue']=[] 74 | feature_train_dic[edge_reverse]['tuple'].append(np.multiply(node_1,node_2)) 75 | feature_train_dic[edge_reverse]['yvalue'].append(y_value) 76 | count+=1 77 | if count%100000==0: 78 | print(count,' tuples read') 79 | 80 | for edge in feature_train_dic.keys(): 81 | f=[] 82 | f.append(np.array(feature_train_dic[edge]['tuple'])) 83 | feature_train_dic[edge]['Xtrain']= np.hstack(tuple(f)) 84 | num_instance_train = len(feature_train_dic[edge]['tuple']) 85 | feature_train_dic[edge]['tuple']=[] 86 | assert num_instance_train == feature_train_dic[edge]['Xtrain'].shape[0], "Train instance numbers do not match." 87 | y_train = np.zeros(num_instance_train) 88 | for i in range(num_instance_train): 89 | y_train[i] = int(feature_train_dic[edge]['yvalue'][i]) 90 | feature_train_dic[edge]['ytrain']=y_train 91 | print(edge,' finished') 92 | 93 | end_time = time.time() 94 | print ("Train features loading and stacking done. Time: {0}s seconds. 
".format((end_time - start_time))) 95 | 96 | start_time = time.time() 97 | for edge in feature_train_dic.keys(): 98 | print('now training ',edge) 99 | logit_model = LogisticRegression(solver="sag",max_iter=10) 100 | feature_train_dic[edge]['model']=logit_model 101 | 102 | threads=[] 103 | for edge in feature_train_dic.keys(): 104 | t = threading.Thread(target=feature_train_dic[edge]['model'].fit, args=(feature_train_dic[edge]['Xtrain'], feature_train_dic[edge]['ytrain']),name=edge) 105 | threads.append(t) 106 | t.start() 107 | ''' 108 | X_shuf, Y_shuf = shuffle(feature_train_dic[edge]['Xtrain'], feature_train_dic[edge]['ytrain']) 109 | logit_model = logit_model.fit(X_shuf, Y_shuf) 110 | feature_train_dic[edge]['model']=logit_model 111 | print(edge,' training is done.')''' 112 | has_running = True 113 | while has_running: 114 | num_done = 0 115 | for t in threads: 116 | if not t.isAlive(): 117 | # get results from thtead 118 | print(t.getName(),' training is done') 119 | num_done += 1 120 | if num_done == len(threads): 121 | break 122 | else: 123 | time.sleep(3) 124 | end_time = time.time() 125 | 126 | print ("Logit model fitting done. Training time: %s seconds" % (end_time - start_time)) 127 | 128 | 129 | 130 | """ 131 | Predict on test 132 | """ 133 | start_time = time.time() 134 | feature_test_dic={} 135 | with open(input_label_test, "r") as f_in: 136 | count=0 137 | for line in f_in: 138 | line=line.strip().split() 139 | node_1=embedding_dict[line[0]] 140 | node_2=embedding_dict[line[1]] 141 | edge=line[-1] 142 | yvalue=line[2] 143 | if edge not in feature_test_dic: 144 | feature_test_dic[edge]={} 145 | feature_test_dic[edge]['tuple']=[] 146 | #feature_test_dic[edge]['line']=[] 147 | feature_test_dic[edge]['current']=0 148 | feature_test_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 149 | #feature_test_dic[edge]['line'].append([line[0],line[1]]) 150 | end_time = time.time() 151 | print('finished reading test file, time: ', (end_time - start_time)) 152 | 153 | 154 | for edge in feature_test_dic.keys(): 155 | f=[] 156 | f.append(np.array(feature_test_dic[edge]['tuple'])) 157 | feature_test_dic[edge]['Xtest']=np.hstack(tuple(f)) 158 | num_instance_test = len(feature_test_dic[edge]['tuple']) 159 | assert num_instance_test == feature_test_dic[edge]['Xtest'].shape[0], "Test instance numbers do not match." 
160 | # compute predicted value for file_2; a row of X_test is the vector -- emb(node_1) 161 | #hadamard-prod emb(node_2) -- where node_1 and node_2 are the two nodes on a line of file *2* 162 | proba_test = logit_model.predict_proba(feature_test_dic[edge]['Xtest']) 163 | #print(proba_test[:,1]) 164 | feature_test_dic[edge]['p_test']=proba_test[:,1] 165 | feature_test_dic[edge]['Xtest']=[] 166 | print('finished proba: ', edge) 167 | 168 | 169 | ## output a file with same format as file_2, with the third column replaced by your predicted value as in proba_test 170 | 171 | ## summary: input -- file_2, file_3, embedding file; 172 | ## output -- the file similar to file_2 with third column replace and 173 | ## note: please be careful that for each edge type (r), a different model will be trained and used for prediction 174 | ## example files: file_2 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt 175 | ## file_3 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_for_logit_training.txt 176 | ## emb_file -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_line_samples100000_alpha0.1_dim128.emb 177 | 178 | with open(input_label_test, "r") as f_in,open(output_test_score, "w+") as f_out: 179 | content="" 180 | rd=0 181 | for line in f_in: 182 | line=line.strip().split() 183 | edge=line[-1] 184 | current=feature_test_dic[edge]['current'] 185 | temp=line[0]+' '+line[1]+' '+str(feature_test_dic[edge]['p_test'][current])+' '+edge+'\n' 186 | #print(temp) 187 | current+=1 188 | feature_test_dic[edge]['current']=current 189 | content=content+temp 190 | rd+=1 191 | if rd%50000==0: 192 | print (rd,'lines finished') 193 | f_out.write(content) 194 | content='' 195 | f_out.write(content) 196 | f_out.close() 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /aux/logit_yago.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression # use this package 4 | from sklearn.utils import shuffle 5 | import time 6 | import threading 7 | import argparse 8 | 9 | 10 | # for counting file lines 11 | def file_len(f_name): 12 | with open(f_name) as f: 13 | for i, l in enumerate(f): 14 | pass 15 | return i + 1 16 | 17 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 18 | parser.add_argument("--input-label-train", nargs="?", help="Input label train.", type=str) 19 | parser.add_argument("--input-label-test", nargs="?", help="Input label test.", type=str) 20 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding", type=str) 21 | parser.add_argument("--output-file", nargs="?", help="Output filename.", type=str) 22 | args = parser.parse_args() 23 | 24 | 25 | input_label_train = args.input_label_train 26 | input_label_test = args.input_label_test 27 | input_embedding=args.input_embedding 28 | output_test_score = args.output_file 29 | 30 | 31 | 32 | 33 | 34 | """ 35 | Train logit model 36 | """ 37 | start_time = time.time() 38 | embedding_dict={} 39 | 40 | with open(input_embedding, "r") as f_in: 41 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 42 | count=0 43 | for line in f_in: 44 | line_split = line.strip().split() 45 | a=list(map(float, line_split[1:])) 46 | embedding_dict[line_split[0]] = np.asarray(a) 47 | count+=1 48 | assert len(embedding_dict) == num_nodes, "Number of 
nodes does not agree." 49 | 50 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 51 | 52 | feature_train_dic={} 53 | feature_train_list = [] 54 | with open(input_label_train, "r") as f_in: 55 | count=0 56 | for line in f_in: 57 | line=line.strip().split() 58 | if line[0] not in embedding_dict or line[1] not in embedding_dict: 59 | continue 60 | node_1=embedding_dict[line[0]] 61 | node_2=embedding_dict[line[1]] 62 | edge=line[-1] 63 | y_value=line[2] 64 | if '-1' not in edge: 65 | edge_neg=edge+'-1' 66 | if edge not in feature_train_dic: 67 | feature_train_dic[edge]={} 68 | feature_train_dic[edge]['tuple']=[] 69 | feature_train_dic[edge]['yvalue']=[] 70 | feature_train_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 71 | feature_train_dic[edge]['yvalue'].append(y_value) 72 | if edge_neg not in feature_train_dic: 73 | feature_train_dic[edge_neg]={} 74 | feature_train_dic[edge_neg]['tuple']=[] 75 | feature_train_dic[edge_neg]['yvalue']=[] 76 | feature_train_dic[edge_neg]['tuple'].append(np.multiply(node_1,node_2)) 77 | feature_train_dic[edge_neg]['yvalue'].append(y_value) 78 | else: 79 | edge_pos=edge[0:-2] 80 | feature_train_dic[edge_pos]['tuple'].append(np.multiply(node_1,node_2)) 81 | feature_train_dic[edge_pos]['yvalue'].append(y_value) 82 | feature_train_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 83 | feature_train_dic[edge]['yvalue'].append(y_value) 84 | count+=1 85 | if count==50000: 86 | print(count,' tuples read') 87 | 88 | for edge in feature_train_dic.keys(): 89 | f=[] 90 | f.append(np.array(feature_train_dic[edge]['tuple'])) 91 | feature_train_dic[edge]['Xtrain']= np.hstack(tuple(f)) 92 | num_instance_train = len(feature_train_dic[edge]['tuple']) 93 | feature_train_dic[edge]['tuple']=[] 94 | assert num_instance_train == feature_train_dic[edge]['Xtrain'].shape[0], "Train instance numbers do not match." 95 | y_train = np.zeros(num_instance_train) 96 | for i in range(num_instance_train): 97 | y_train[i] = int(feature_train_dic[edge]['yvalue'][i]) 98 | feature_train_dic[edge]['ytrain']=y_train 99 | print(edge,' finished') 100 | 101 | end_time = time.time() 102 | print ("Train features loading and stacking done. Time: {0}s seconds. ".format((end_time - start_time))) 103 | 104 | start_time = time.time() 105 | for edge in feature_train_dic.keys(): 106 | print('now training ',edge) 107 | logit_model = LogisticRegression(solver="sag",max_iter=1000) 108 | feature_train_dic[edge]['model']=logit_model 109 | 110 | threads=[] 111 | for edge in feature_train_dic.keys(): 112 | t = threading.Thread(target=feature_train_dic[edge]['model'].fit, args=(feature_train_dic[edge]['Xtrain'], feature_train_dic[edge]['ytrain']),name=edge) 113 | threads.append(t) 114 | t.start() 115 | 116 | '''has_running = True 117 | while has_running: 118 | num_done = 0 119 | for t in threads: 120 | if not t.isAlive(): 121 | # get results from thtead 122 | print(t.getName(),' training is done') 123 | num_done += 1 124 | if num_done == len(threads): 125 | break 126 | else: 127 | time.sleep(3)''' 128 | for t in threads: 129 | t.join() 130 | end_time = time.time() 131 | 132 | print ("Logit model fitting done. 
Training time: %s seconds" % (end_time - start_time)) 133 | 134 | 135 | 136 | """ 137 | Predict on test 138 | """ 139 | start_time = time.time() 140 | feature_test_dic={} 141 | with open(input_label_test, "r") as f_in: 142 | count=0 143 | for line in f_in: 144 | line=line.strip().split() 145 | if line[0] not in embedding_dict or line[1] not in embedding_dict: 146 | continue 147 | node_1=embedding_dict[line[0]] 148 | node_2=embedding_dict[line[1]] 149 | edge=line[-1] 150 | yvalue=line[2] 151 | if edge not in feature_test_dic: 152 | feature_test_dic[edge]={} 153 | feature_test_dic[edge]['tuple']=[] 154 | #feature_test_dic[edge]['line']=[] 155 | feature_test_dic[edge]['current']=0 156 | feature_test_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 157 | #feature_test_dic[edge]['line'].append([line[0],line[1]]) 158 | count+=1 159 | 160 | end_time = time.time() 161 | print('finished reading test file, time: ', (end_time - start_time)) 162 | 163 | 164 | for edge in feature_test_dic.keys(): 165 | f=[] 166 | f.append(np.array(feature_test_dic[edge]['tuple'])) 167 | feature_test_dic[edge]['Xtest']=np.hstack(tuple(f)) 168 | num_instance_test = len(feature_test_dic[edge]['tuple']) 169 | assert num_instance_test == feature_test_dic[edge]['Xtest'].shape[0], "Test instance numbers do not match." 170 | # compute predicted value for file_2; a row of X_test is the vector -- emb(node_1) 171 | #hadamard-prod emb(node_2) -- where node_1 and node_2 are the two nodes on a line of file *2* 172 | proba_test = feature_train_dic[edge]['model'].predict_proba(feature_test_dic[edge]['Xtest']) 173 | #print(proba_test[:,1]) 174 | feature_test_dic[edge]['p_test']=proba_test[:,1] 175 | feature_test_dic[edge]['Xtest']=[] 176 | print('finished proba: ', edge) 177 | 178 | 179 | ## output a file with same format as file_2, with the third column replaced by your predicted value as in proba_test 180 | 181 | ## summary: input -- file_2, file_3, embedding file; 182 | ## output -- the file similar to file_2 with third column replace and 183 | ## note: please be careful that for each edge type (r), a different model will be trained and used for prediction 184 | ## example files: file_2 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt 185 | ## file_3 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_for_logit_training.txt 186 | ## emb_file -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_line_samples100000_alpha0.1_dim128.emb 187 | 188 | with open(input_label_test, "r") as f_in,open(output_test_score, "w+") as f_out: 189 | content="" 190 | rd=0 191 | for line in f_in: 192 | line=line.strip().split() 193 | edge=line[-1] 194 | if line[0] not in embedding_dict or line[1] not in embedding_dict: 195 | temp=line[0]+' '+line[1]+' '+'HERE'+' '+edge+'\n' 196 | continue 197 | current=feature_test_dic[edge]['current'] 198 | temp=line[0]+' '+line[1]+' '+str(feature_test_dic[edge]['p_test'][current])+' '+edge+'\n' 199 | #print(temp) 200 | current+=1 201 | feature_test_dic[edge]['current']=current 202 | content=content+temp 203 | rd+=1 204 | if rd%100000==0: 205 | print (rd,'lines finished') 206 | f_out.write(content) 207 | content='' 208 | f_out.write(content) 209 | f_out.close() 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /aux/logit_yago_fast.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from 
sklearn.linear_model import LogisticRegression # use this package 4 | from sklearn.utils import shuffle 5 | import time 6 | import threading 7 | import argparse 8 | 9 | 10 | # for counting file lines 11 | def file_len(f_name): 12 | with open(f_name) as f: 13 | for i, l in enumerate(f): 14 | pass 15 | return i + 1 16 | 17 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 18 | parser.add_argument("--input-label-train", nargs="?", help="Input label train.", type=str) 19 | parser.add_argument("--input-label-test", nargs="?", help="Input label test.", type=str) 20 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding", type=str) 21 | parser.add_argument("--output-file", nargs="?", help="Output filename.", type=str) 22 | args = parser.parse_args() 23 | 24 | 25 | input_label_train = args.input_label_train 26 | input_label_test = args.input_label_test 27 | input_embedding=args.input_embedding 28 | output_test_score = args.output_file 29 | 30 | 31 | 32 | 33 | 34 | """ 35 | Train logit model 36 | """ 37 | start_time = time.time() 38 | embedding_dict={} 39 | 40 | with open(input_embedding, "r") as f_in: 41 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 42 | count=0 43 | for line in f_in: 44 | line_split = line.strip().split() 45 | a=list(map(float, line_split[1:])) 46 | embedding_dict[line_split[0]] = np.asarray(a) 47 | count+=1 48 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 49 | 50 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 51 | 52 | feature_train_dic={} 53 | feature_train_list = [] 54 | with open(input_label_train, "r") as f_in: 55 | count=0 56 | for line in f_in: 57 | line=line.strip().split() 58 | if line[0] not in embedding_dict or line[1] not in embedding_dict: 59 | continue 60 | node_1=embedding_dict[line[0]] 61 | node_2=embedding_dict[line[1]] 62 | edge=line[-1] 63 | y_value=line[2] 64 | if '-1' not in edge: 65 | edge_neg=edge+'-1' 66 | if edge not in feature_train_dic: 67 | feature_train_dic[edge]={} 68 | feature_train_dic[edge]['tuple']=[] 69 | feature_train_dic[edge]['yvalue']=[] 70 | feature_train_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 71 | feature_train_dic[edge]['yvalue'].append(y_value) 72 | if edge_neg not in feature_train_dic: 73 | feature_train_dic[edge_neg]={} 74 | feature_train_dic[edge_neg]['tuple']=[] 75 | feature_train_dic[edge_neg]['yvalue']=[] 76 | feature_train_dic[edge_neg]['tuple'].append(np.multiply(node_1,node_2)) 77 | feature_train_dic[edge_neg]['yvalue'].append(y_value) 78 | else: 79 | edge_pos=edge[0:-2] 80 | feature_train_dic[edge_pos]['tuple'].append(np.multiply(node_1,node_2)) 81 | feature_train_dic[edge_pos]['yvalue'].append(y_value) 82 | feature_train_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 83 | feature_train_dic[edge]['yvalue'].append(y_value) 84 | count+=1 85 | if count==50000: 86 | print(count,' tuples read') 87 | 88 | for edge in feature_train_dic.keys(): 89 | f=[] 90 | f.append(np.array(feature_train_dic[edge]['tuple'])) 91 | feature_train_dic[edge]['Xtrain']= np.hstack(tuple(f)) 92 | num_instance_train = len(feature_train_dic[edge]['tuple']) 93 | feature_train_dic[edge]['tuple']=[] 94 | assert num_instance_train == feature_train_dic[edge]['Xtrain'].shape[0], "Train instance numbers do not match." 
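# Convert the label strings collected for this edge type into a numeric vector before fitting.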
95 | y_train = np.zeros(num_instance_train) 96 | for i in range(num_instance_train): 97 | y_train[i] = int(feature_train_dic[edge]['yvalue'][i]) 98 | feature_train_dic[edge]['ytrain']=y_train 99 | print(edge,' finished') 100 | 101 | end_time = time.time() 102 | print ("Train features loading and stacking done. Time: {0}s seconds. ".format((end_time - start_time))) 103 | 104 | start_time = time.time() 105 | for edge in feature_train_dic.keys(): 106 | print('now training ',edge) 107 | logit_model = LogisticRegression(solver="sag",max_iter=100) 108 | feature_train_dic[edge]['model']=logit_model 109 | 110 | threads=[] 111 | for edge in feature_train_dic.keys(): 112 | t = threading.Thread(target=feature_train_dic[edge]['model'].fit, args=(feature_train_dic[edge]['Xtrain'], feature_train_dic[edge]['ytrain']),name=edge) 113 | threads.append(t) 114 | t.start() 115 | 116 | '''has_running = True 117 | while has_running: 118 | num_done = 0 119 | for t in threads: 120 | if not t.isAlive(): 121 | # get results from thtead 122 | print(t.getName(),' training is done') 123 | num_done += 1 124 | if num_done == len(threads): 125 | break 126 | else: 127 | time.sleep(3)''' 128 | for t in threads: 129 | t.join() 130 | end_time = time.time() 131 | 132 | print ("Logit model fitting done. Training time: %s seconds" % (end_time - start_time)) 133 | 134 | 135 | 136 | """ 137 | Predict on test 138 | """ 139 | start_time = time.time() 140 | feature_test_dic={} 141 | with open(input_label_test, "r") as f_in: 142 | count=0 143 | for line in f_in: 144 | line=line.strip().split() 145 | if line[0] not in embedding_dict or line[1] not in embedding_dict: 146 | continue 147 | node_1=embedding_dict[line[0]] 148 | node_2=embedding_dict[line[1]] 149 | edge=line[-1] 150 | yvalue=line[2] 151 | if edge not in feature_test_dic: 152 | feature_test_dic[edge]={} 153 | feature_test_dic[edge]['tuple']=[] 154 | #feature_test_dic[edge]['line']=[] 155 | feature_test_dic[edge]['current']=0 156 | feature_test_dic[edge]['tuple'].append(np.multiply(node_1,node_2)) 157 | #feature_test_dic[edge]['line'].append([line[0],line[1]]) 158 | count+=1 159 | 160 | end_time = time.time() 161 | print('finished reading test file, time: ', (end_time - start_time)) 162 | 163 | 164 | for edge in feature_test_dic.keys(): 165 | f=[] 166 | f.append(np.array(feature_test_dic[edge]['tuple'])) 167 | feature_test_dic[edge]['Xtest']=np.hstack(tuple(f)) 168 | num_instance_test = len(feature_test_dic[edge]['tuple']) 169 | assert num_instance_test == feature_test_dic[edge]['Xtest'].shape[0], "Test instance numbers do not match." 
170 | # compute predicted value for file_2; a row of X_test is the vector -- emb(node_1) 171 | #hadamard-prod emb(node_2) -- where node_1 and node_2 are the two nodes on a line of file *2* 172 | proba_test = feature_train_dic[edge]['model'].predict_proba(feature_test_dic[edge]['Xtest']) 173 | #print(proba_test[:,1]) 174 | feature_test_dic[edge]['p_test']=proba_test[:,1] 175 | feature_test_dic[edge]['Xtest']=[] 176 | print('finished proba: ', edge) 177 | 178 | 179 | ## output a file with same format as file_2, with the third column replaced by your predicted value as in proba_test 180 | 181 | ## summary: input -- file_2, file_3, embedding file; 182 | ## output -- the file similar to file_2 with third column replace and 183 | ## note: please be careful that for each edge type (r), a different model will be trained and used for prediction 184 | ## example files: file_2 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt 185 | ## file_3 -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_for_logit_training.txt 186 | ## emb_file -- /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_line_samples100000_alpha0.1_dim128.emb 187 | 188 | with open(input_label_test, "r") as f_in,open(output_test_score, "w+") as f_out: 189 | content="" 190 | rd=0 191 | for line in f_in: 192 | line=line.strip().split() 193 | edge=line[-1] 194 | if line[0] not in embedding_dict or line[1] not in embedding_dict: 195 | temp=line[0]+' '+line[1]+' '+'HERE'+' '+edge+'\n' 196 | continue 197 | current=feature_test_dic[edge]['current'] 198 | temp=line[0]+' '+line[1]+' '+str(feature_test_dic[edge]['p_test'][current])+' '+edge+'\n' 199 | #print(temp) 200 | current+=1 201 | feature_test_dic[edge]['current']=current 202 | content=content+temp 203 | rd+=1 204 | if rd%100000==0: 205 | print (rd,'lines finished') 206 | f_out.write(content) 207 | content='' 208 | f_out.write(content) 209 | f_out.close() 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /aux/merge_edges_with_all_types.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 5 | parser.add_argument("--input-ref-file", nargs="?", help="Input query filename.", type=str) 6 | parser.add_argument("--input-score-dir", nargs="?", help="Directory for score files.", type=str) 7 | parser.add_argument("--input-score-keywords", nargs="?", help="Keyword contained by score files.", type=str) 8 | parser.add_argument("--output-file", nargs="?", help="Directory for output.", type=str) 9 | args = parser.parse_args() 10 | 11 | input_ref_file = args.input_ref_file 12 | input_score_dir = args.input_score_dir 13 | input_score_keywords = args.input_score_keywords 14 | output_file = args.output_file 15 | 16 | typed_node_pair_to_line_dict = {} 17 | # repeat for each file in the directory 18 | for input_score_file_basename in os.listdir(input_score_dir): 19 | # apply file type filter 20 | if input_score_keywords in input_score_file_basename: 21 | with open(os.path.join(input_score_dir, input_score_file_basename), "r") as f_in_score: 22 | for line in f_in_score: 23 | line_split = line.strip().split() 24 | typed_node_pair = "|".join([line_split[0],line_split[1],line_split[3]]) 25 | typed_node_pair_to_line_dict[typed_node_pair] = line 26 | 27 | with open(input_ref_file, "r") as f_in, open(output_file, "w") as 
f_out: 28 | f_out.write(f_in.readline()) # copy the first line used for sanity check: num of neg smp & num of eval batches 29 | for line in f_in: 30 | line_split = line.strip().split() 31 | typed_node_pair = "|".join([line_split[0],line_split[1],line_split[3]]) 32 | 33 | assert typed_node_pair in typed_node_pair_to_line_dict, "%s not in score files." % typed_node_pair 34 | f_out.write(typed_node_pair_to_line_dict[typed_node_pair]) 35 | -------------------------------------------------------------------------------- /aux/normalize_edge_weight_aspem.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | aspect = sys.argv[1] 5 | knockout_rate = sys.argv[2] 6 | input_hin = "../input_data/dblp_" + knockout_rate + "_out_for_aspem_" + aspect + ".net" 7 | output_hin = "../input_data/dblp_" + knockout_rate + "_out_for_aspem_" + aspect + "_normalized.net" 8 | 9 | total_weight_dict = defaultdict(float) 10 | with open(input_hin, "r") as f_in: 11 | for line in f_in: 12 | node_1, node_2, weight_str = line.strip().split() 13 | weight = float(weight_str) 14 | edge_type = node_1[0] + node_2[0] 15 | total_weight_dict[edge_type] += weight 16 | 17 | max_total_weight = max(total_weight_dict.values()) 18 | 19 | with open(input_hin, "r") as f_in, open(output_hin, "w") as f_out: 20 | for line in f_in: 21 | node_1, node_2, weight_str = line.strip().split() 22 | weight = float(weight_str) 23 | edge_type = node_1[0] + node_2[0] 24 | f_out.write(" ".join([node_1, node_2, str(weight * max_total_weight / total_weight_dict[edge_type])]) + "\n") 25 | -------------------------------------------------------------------------------- /aux/plot_aw_vs_ay.py: -------------------------------------------------------------------------------- 1 | from plot_from_nparray import array_to_cdf 2 | from pylab import * 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from matplotlib import rc 6 | 7 | aw = np.load("../data/dblp_stats/dblp_jac_a_w.npz") 8 | ay = np.load("../data/dblp_stats/dblp_jac_y_a.npz") 9 | aw_x, aw_cum_y = array_to_cdf(aw["weighted"]) 10 | ay_x, ay_cum_y = array_to_cdf(ay["weighted"]) 11 | 12 | fig = plt.figure() 13 | 14 | ax = fig.add_subplot(111) 15 | 16 | #ax.plot(aw_x, aw_cum_y, 'r') 17 | #ax.plot(ay_x, ay_cum_y, 'r--') 18 | lns11 = ax.plot(aw_x, aw_cum_y, 'r', label = 'Authorship vs. Term usage', linewidth=2) 19 | lns12 = ax.plot(ay_x, ay_cum_y, 'r--', label = 'Authorship vs. 
Publishing year', linewidth=2) 20 | 21 | 22 | # added these three lines 23 | lns = lns11+lns12 24 | labs = [l.get_label() for l in lns] 25 | ax.legend(lns, labs, loc=0) 26 | 27 | ax.grid() 28 | #ax.set_xscale('log') 29 | ax.set_xlabel(r"Generalized Jaccard coefficient", fontsize=16) 30 | ax.set_xlim(0, 0.0004) 31 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 32 | plt.xticks(fontsize=14) 33 | plt.yticks(fontsize=14) 34 | ax.set_ylabel(r"CDF", fontsize=16) 35 | ax.set_ylim(.0, 1.05) 36 | 37 | plt.subplots_adjust(left=0.15, right=0.85, top=0.95, bottom=0.15) 38 | 39 | plt.show() 40 | -------------------------------------------------------------------------------- /aux/plot_from_nparray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pylab import * 3 | from collections import Counter 4 | 5 | 6 | def plot_and_show_cdf(arr, theta=1.0, style="r", label=None): 7 | cntr =Counter(arr) 8 | x, y = zip(*sorted(cntr.iteritems())) 9 | y = map(float, y) 10 | x = np.asarray(x) 11 | y = np.asarray(y) 12 | y /= y.sum() 13 | if theta < 1.0: 14 | new_len = sum(x < theta) 15 | x = x[:new_len] 16 | y = y[:new_len] 17 | cum_y = np.cumsum(y) 18 | plot(x, cum_y, style, label=label) 19 | show() 20 | 21 | def plot_cdf(arr, theta=1.0, style="r", label=None): 22 | cntr =Counter(arr) 23 | x, y = zip(*sorted(cntr.iteritems())) 24 | y = map(float, y) 25 | x = np.asarray(x) 26 | y = np.asarray(y) 27 | y /= y.sum() 28 | if theta < 1.0: 29 | new_len = sum(x < theta) 30 | x = x[:new_len] 31 | y = y[:new_len] 32 | cum_y = np.cumsum(y) 33 | plot(x, cum_y, style, label=label) 34 | 35 | def array_to_cdf(arr, theta=1.0): 36 | cntr =Counter(arr) 37 | x, y = zip(*sorted(cntr.iteritems())) 38 | y = map(float, y) 39 | x = np.asarray(x) 40 | y = np.asarray(y) 41 | y /= y.sum() 42 | if theta < 1.0: 43 | new_len = sum(x < theta) 44 | x = x[:new_len] 45 | y = y[:new_len] 46 | cum_y = np.cumsum(y) 47 | return x, cum_y -------------------------------------------------------------------------------- /aux/preprocess_dblp_for_aspem.py: -------------------------------------------------------------------------------- 1 | input_hin = "../input_data/dblp_0.2_out.net" 2 | output_pay = "../input_data/dblp_0.2_out_for_aspem_apy.net" 3 | output_papvw = "../input_data/dblp_0.2_out_for_aspem_papvw.net" 4 | 5 | with open(input_hin, "r") as f_in, open(output_pay, "w") as f_out_pay, open(output_papvw, "w") as f_out_papvw: 6 | for line in f_in: 7 | node_1, node_2, _ = line.strip().split() 8 | assert "P" in node_1 9 | if "A" in node_2: # to both 10 | f_out_pay.write(line) 11 | f_out_papvw.write(line) 12 | elif "P" in node_2: # to papvw 13 | f_out_papvw.write(line) 14 | elif "V" in node_2: # to papvw 15 | f_out_papvw.write(line) 16 | elif "W" in node_2: # to papvw 17 | f_out_papvw.write(line) 18 | elif "Y" in node_2: # to pay 19 | f_out_pay.write(line) 20 | else: 21 | raise Exception("Inconsistent edge type") 22 | -------------------------------------------------------------------------------- /aux/separate_edges_by_types.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 5 | parser.add_argument("--input-file", nargs="?", help="Input query filename.", type=str) 6 | parser.add_argument("--output-dir", nargs="?", help="Directory for output.", type=str) 7 | args = parser.parse_args() 8 | 9 | input_file = args.input_file 
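# Added note (not part of the original script): input_file is a merged eval/score file
# whose fourth column is the edge type; the loop below writes each line to a per-type
# output file named "<edge_type>_<input basename>" under output_dir.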
10 | output_dir = args.output_dir 11 | 12 | f_out_dict = {} 13 | with open(input_file, "r") as f_in: 14 | f_in.readline() # ignore the first line used for sanity check: num of neg smp & num of eval batches 15 | for line in f_in: 16 | _, __, ___, edge_type = line.strip().split() 17 | 18 | if edge_type not in f_out_dict: 19 | cur_output_file_name = os.path.join(output_dir, edge_type+"_"+os.path.basename(input_file)) 20 | f_out_dict[edge_type] = open(cur_output_file_name, "w") 21 | 22 | f_out_dict[edge_type].write(line) 23 | 24 | for edge_type in f_out_dict: 25 | f_out_dict[edge_type].close() -------------------------------------------------------------------------------- /eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | This repository provides a reference implementation of the evaluation of data against an embedding/score file and the x_eval.txt file generated by the edge knockout process:
4 | 5 | 6 | ### Basic usage 7 | 8 | #### Input 9 | 10 | There are 2 required input files. 11 | 1. **input-embedding-file** contains the embedding results, in the format: 12 | 13 | edge vector // EX: P:20883 A:20345 [0.2, 1.2, -1.3, 0.8,...,0.2,-0.1] 14 | We also provide the **input-score-file** edition in this folder; the score file should be in the format: 15 | 16 | edge score // EX: P:20883 A:20345 0.88 17 | 18 | 19 | 2. **input-eval-file** is the evaluation file generated by the edge knockout process. 20 | 21 | 22 | 23 | There is also 1 required input argument: 24 | 25 | 1. **sample-number** is the number consistent with the input-eval-file, i.e., the number of negative edges generated for each node. 26 | 27 | #### Output 28 | 29 | It prints the average MRR for each edge type and the overall average. 30 | 31 | #### Execute and example 32 | 33 | _All the commands are executed from the project home directory, and we are using python3._
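Before running, here is a quick illustration of the numbers the scripts report (the values below are made up and not taken from the datasets): within one evaluation batch, the score of the true, knocked-out edge is ranked against the scores of its negative samples; if it ranks 3rd among 11 scores (1 positive + 10 negatives), its reciprocal rank is 1/3, and the per-edge-type MRR is the mean of these reciprocal ranks over all batches of that type. A minimal standalone sketch of this computation (mrr_from_embedding.py and mrr_from_score.py additionally average the reciprocal rank over tied scores):

    # toy reciprocal-rank computation; scores[0] is the true edge's score
    scores = [0.62, 0.95, 0.71, 0.10, 0.33]            # made-up scores
    rank = 1 + sum(s > scores[0] for s in scores[1:])  # two higher scores -> rank 3
    rr = 1.0 / rank                                    # 1/3
    # MRR for an edge type = mean of rr over all its evaluation batches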
34 | 35 | Here is an example of generating the output using an embedding file: 36 | 37 | python mrr_from_embedding.py --input-embedding-file xxx.emb --input-eval-file xxx.txt 38 | 39 | ## Citing 40 | 41 | 42 | ## Miscellaneous 43 | 44 | Please send any questions you might have about the code and/or the algorithm to . 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /eval/archive/edge_rec_eval_inner_prod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: edz 7 | """ 8 | import numpy as np 9 | import sys 10 | import argparse 11 | 12 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 13 | parser.add_argument("--eval-file", nargs="?", help="Input evaluation filename.", type=str) 14 | parser.add_argument("--emb-file", nargs="?", help="Input embedding filename.", type=str) 15 | args = parser.parse_args() 16 | 17 | input_embedding=args.emb_file 18 | input_newfile=args.eval_file # '../input_data/dblp_0.1_out_20neg_eval.txt' 19 | 20 | def calculate_rr(batch): 21 | target=batch[0] 22 | l=batch 23 | l.sort(reverse=True) 24 | rank=l.index(target)+1 25 | rr=1/rank 26 | return rr 27 | 28 | if __name__ == '__main__': 29 | embedding_dict={} 30 | #input_embedding='dblp_0.1_out_line_samples1000_dim128.emb' 31 | with open(input_embedding, "r") as f_in: 32 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 33 | count=0 34 | for line in f_in: 35 | line_split = line.strip().split() 36 | a=list(map(float, line_split[1:])) 37 | embedding_dict[line_split[0]] = np.asarray(a) 38 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree."
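# Added note (illustrative, not part of the original script): the embedding file parsed
# above is whitespace-separated -- a header line "num_nodes dim", then one line per node,
# e.g. "P:20883 0.2 1.2 -1.3 ..." with dim floats -- so embedding_dict maps a typed node
# id to a length-dim numpy array.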
39 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 40 | #input_newfile='test/file2.txt' 41 | with open(input_newfile, "r") as f_in: 42 | count=0 43 | total_mrr={} 44 | 45 | exist=False 46 | rd=0 47 | checksametype=False 48 | for line in f_in: 49 | line_split = line.split(' ') 50 | key1=line_split[0] #.lower() 51 | key2=line_split[1] #.lower() 52 | #print(key1[0],key2[0]) 53 | if count==0: 54 | current=[] 55 | if key1 in embedding_dict and key2 in embedding_dict: 56 | edge_type=key1[0]+key2[0] 57 | #print(edge_type,'exists') 58 | if edge_type==edge_type[::-1] : 59 | checksametype=True 60 | if edge_type not in total_mrr: 61 | total_mrr[edge_type]=[] 62 | else: 63 | if edge_type not in total_mrr: 64 | total_mrr[edge_type]=[] 65 | if edge_type[::-1] not in total_mrr: 66 | total_mrr[edge_type[::-1]]=[] 67 | exist =True 68 | target=embedding_dict[key1].dot(embedding_dict[key2]) 69 | current.append(target) 70 | #print(target) 71 | count+=1 72 | else: 73 | if exist: 74 | if key1 in embedding_dict and key2 in embedding_dict: 75 | current.append(embedding_dict[key1].dot(embedding_dict[key2])) 76 | if count==10 and checksametype==False: 77 | if exist: 78 | edge_type=key1[0]+key2[0] 79 | #print('10-',edge_type,current) 80 | rr=calculate_rr(current) 81 | total_mrr[edge_type].append(rr) 82 | current=[] 83 | current.append(target) 84 | if count==20: 85 | if exist: 86 | edge_type=key1[0]+key2[0] 87 | #print('20-',edge_type,current) 88 | rr=calculate_rr(current) 89 | total_mrr[edge_type].append(rr) 90 | exist=False 91 | checksametype=False 92 | count=0 93 | rd+=1 94 | else: 95 | count+=1 96 | total=0 97 | num_mrr=0 98 | for key in total_mrr: 99 | s=sum(total_mrr[key]) 100 | l=len(total_mrr[key]) 101 | total=total+s 102 | num_mrr=num_mrr+l 103 | print ('edge is ',key,'with avg mrr ',s/l) 104 | print ('# mrr is', num_mrr) 105 | print ('total avg is', total/num_mrr) 106 | 107 | 108 | -------------------------------------------------------------------------------- /eval/archive/edge_rec_eval_score_provided.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: edz 7 | """ 8 | import numpy as np 9 | import sys 10 | import time 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 14 | parser.add_argument("--eval-file", nargs="?", help="Input score filename.", type=str) 15 | parser.add_argument("--score-file", nargs="?", help="Input embedding filename.", type=str) 16 | args = parser.parse_args() 17 | 18 | input_scorefile=args.score_file 19 | input_recordfile=args.eval_file # '../input_data/dblp_0.1_out_20neg_eval.txt' 20 | 21 | def calculate_rr(batch): 22 | target=batch[0] 23 | l=batch 24 | l.sort(reverse=True) 25 | rank=l.index(target)+1 26 | rr=1/rank 27 | return rr 28 | 29 | if __name__ == '__main__': 30 | start_time = time.time() 31 | score_dict={} 32 | #input_scorefile=sys.argv[1] 33 | with open(input_scorefile, "r") as f_in: 34 | for line in f_in: 35 | line_split = line.strip().split() 36 | #key=line_split[0].lower()+' '+line_split[1].lower() 37 | key=line_split[0]+' '+line_split[1] 38 | score_dict[key]=line_split[2] 39 | print ("Loading done.", len(score_dict), "pairs from", input_scorefile) 40 | #input_recordfile='kn2/file2.txt' 41 | 42 | with open(input_recordfile, "r") as f_in: 43 | count=0 44 | total_mrr={} 45 | 46 | exist=False 47 | 
checksametype=False 48 | rd=0 49 | for line in f_in: 50 | line_split = line.split(' ') 51 | key1=line_split[0] #.lower() 52 | key2=line_split[1] #.lower() 53 | key=key1+' '+key2 54 | 55 | #print(key1[0],key2[0]) 56 | if count==0: 57 | current=[] 58 | if key in score_dict: 59 | edge_type=key1[0]+key2[0] 60 | if edge_type==edge_type[::-1] : 61 | checksametype=True 62 | if edge_type not in total_mrr: 63 | total_mrr[edge_type]=[] 64 | else: 65 | if edge_type not in total_mrr: 66 | total_mrr[edge_type]=[] 67 | if edge_type[::-1] not in total_mrr: 68 | total_mrr[edge_type[::-1]]=[] 69 | exist=True 70 | target=score_dict[key] 71 | current.append(float(target)) 72 | #print(target) 73 | 74 | count+=1 75 | else: 76 | if exist: 77 | if key in score_dict: 78 | current.append(float(score_dict[key])) 79 | if count==10 and checksametype==False: 80 | if exist: 81 | edge_type=key1[0]+key2[0] 82 | #print('10-',edge_type,current) 83 | rr=calculate_rr(current) 84 | total_mrr[edge_type].append(rr) 85 | 86 | current=[] 87 | current.append(float(target)) 88 | if count==20: 89 | if exist: 90 | edge_type=key1[0]+key2[0] 91 | #print('20-',edge_type,current) 92 | rr=calculate_rr(current) 93 | 94 | total_mrr[edge_type].append(rr) 95 | exist=False 96 | checksametype=False 97 | count=0 98 | rd+=1 99 | if rd % 100000 == 0: 100 | elapsed_time = time.time() - start_time 101 | print(rd,' batchs finished with time',elapsed_time) 102 | else: 103 | count+=1 104 | 105 | total=0 106 | num_mrr=0 107 | 108 | for key in total_mrr: 109 | s=sum(total_mrr[key]) 110 | l=len(total_mrr[key]) 111 | total=total+s 112 | num_mrr=num_mrr+l 113 | 114 | print ('edge is ',key,'with avg mrr ',s/l) 115 | print ('# mrr is', num_mrr) 116 | print ('total avg is', total/num_mrr) 117 | -------------------------------------------------------------------------------- /eval/archive/edge_rec_eval_temp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: edz 7 | """ 8 | import numpy as np 9 | import sys 10 | 11 | input_embedding=sys.argv[1] 12 | input_newfile='../input_data/dblp_0.1_out_20neg_eval.txt' 13 | 14 | def calculate_rr(batch): 15 | target=batch[0] 16 | l=batch 17 | l.sort(reverse=True) 18 | rank=l.index(target)+1 19 | rr=1/rank 20 | return rr 21 | 22 | if __name__ == '__main__': 23 | embedding_dict={} 24 | #input_embedding='dblp_0.1_out_line_samples1000_dim128.emb' 25 | with open(input_embedding, "r") as f_in: 26 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 27 | count=0 28 | for line in f_in: 29 | line_split = line.strip().split() 30 | a=list(map(float, line_split[1:])) 31 | embedding_dict[line_split[0]] = np.asarray(a) 32 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 
33 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 34 | #input_newfile='test/file2.txt' 35 | with open(input_newfile, "r") as f_in: 36 | count=0 37 | total_mrr={} 38 | 39 | exist=False 40 | rd=0 41 | checksametype=False 42 | for line in f_in: 43 | line_split = line.split(' ') 44 | key1=line_split[0].lower() 45 | key2=line_split[1].lower() 46 | #print(key1[0],key2[0]) 47 | if count==0: 48 | current=[] 49 | if key1 in embedding_dict and key2 in embedding_dict: 50 | edge_type=key1[0]+key2[0] 51 | #print(edge_type,'exists') 52 | if edge_type==edge_type[::-1] : 53 | checksametype=True 54 | if edge_type not in total_mrr: 55 | total_mrr[edge_type]=[] 56 | else: 57 | if edge_type not in total_mrr: 58 | total_mrr[edge_type]=[] 59 | if edge_type[::-1] not in total_mrr: 60 | total_mrr[edge_type[::-1]]=[] 61 | exist =True 62 | target=embedding_dict[key1].dot(embedding_dict[key2]) 63 | current.append(target) 64 | #print(target) 65 | count+=1 66 | else: 67 | if exist: 68 | if key1 in embedding_dict and key2 in embedding_dict: 69 | current.append(embedding_dict[key1].dot(embedding_dict[key2])) 70 | if count==10 and checksametype==False: 71 | if exist: 72 | edge_type=key1[0]+key2[0] 73 | #print('10-',edge_type,current) 74 | rr=calculate_rr(current) 75 | total_mrr[edge_type].append(rr) 76 | current=[] 77 | current.append(target) 78 | if count==20: 79 | if exist: 80 | edge_type=key1[0]+key2[0] 81 | #print('20-',edge_type,current) 82 | rr=calculate_rr(current) 83 | total_mrr[edge_type].append(rr) 84 | exist=False 85 | checksametype=False 86 | count=0 87 | rd+=1 88 | else: 89 | count+=1 90 | total=0 91 | num_mrr=0 92 | for key in total_mrr: 93 | s=sum(total_mrr[key]) 94 | l=len(total_mrr[key]) 95 | total=total+s 96 | num_mrr=num_mrr+l 97 | print ('edge is ',key,'with avg mrr ',s/l) 98 | print ('total avg is', total/num_mrr) 99 | 100 | -------------------------------------------------------------------------------- /eval/archive/edgeknock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Dec 11 13:12:43 2017 5 | 6 | @author: edz 7 | """ 8 | 9 | from itertools import islice 10 | 11 | from random import random,sample,choice 12 | import time 13 | import pandas as pd 14 | import math 15 | 16 | 17 | 18 | # ko_dic records all the tuple that have been knocked out, format: a: {p1,p2...} 19 | 20 | def ko_edge(tuple_list,ko_rate,a_dic,index2type): 21 | size=len(tuple_list) 22 | ko_dic={} 23 | ko_index_list=sample(range(size-1),math.ceil(size*ko_rate)) 24 | for i in ko_index_list: 25 | tuple_list[i][-1]='0' 26 | 27 | if tuple_list[i][1] not in ko_dic: 28 | ko_dic[tuple_list[i][1]]={} 29 | ko_dic[tuple_list[i][1]][tuple_list[i][0]]=1 30 | else: 31 | if tuple_list[i][0] not in ko_dic[tuple_list[i][1]]: 32 | ko_dic[tuple_list[i][1]][tuple_list[i][0]]=1 33 | #write result from tuple_list to file1 34 | file =open('file1.txt','w') 35 | count=0 36 | for i in tuple_list: 37 | if i[2]!='0': 38 | line=index2type[i[0]]+":"+i[0]+" "+index2type[i[1]]+":"+i[1]+" "+i[2] 39 | file.write(line) 40 | file.write('\n') 41 | else: 42 | count+=1 43 | file.close() 44 | print("finished file1") 45 | return ko_dic,tuple_list 46 | 47 | 48 | #neagtive sampling, one from ko_list with another 20 from random sampling with same author 49 | def build_file(ko_dic,a_dic,p_dic,o_dic,index2type,type_dic): 50 | p_list=list(p_dic) 51 | a_list=list(a_dic) 52 | file =open('file2.txt','w') 53 | #for 
key,value in ko_dic.items(): 54 | for key,dic in ko_dic.items(): 55 | 56 | for sub_key,edge in dic.items(): 57 | temp=index2type[key]+":"+key+" "+index2type[sub_key]+":"+sub_key+" "+str(edge)+" "+index2type[key]+index2type[sub_key] 58 | file.write(temp) 59 | file.write('\n') 60 | 61 | if (index2type[key]=="A"): 62 | #10 negative sampling with same author, but non-relative paper 63 | count=0 64 | while count<50: 65 | random_paper=choice(p_list) 66 | while random_paper in a_dic[key]: 67 | random_paper=choice(p_list) 68 | temp="A:"+key+" "+index2type[random_paper]+":"+str(random_paper)+" "+"0"+" "+"AP" 69 | file.write(temp) 70 | file.write('\n') 71 | count+=1 72 | #10 negative sampling with same paper, but random-author 73 | count=0 74 | while count<50: 75 | random_author=choice(a_list) 76 | while sub_key in a_dic[random_author]: 77 | random_author=choice(a_list) 78 | temp="P:"+sub_key+" "+"A:"+random_author+" "+"0"+" "+"PA" 79 | file.write(temp) 80 | file.write('\n') 81 | count+=1 82 | else: 83 | t_list=type_dic[index2type[key]] 84 | #when key is others: 85 | #10 negative sampling with same paper, but non-relative other 86 | count=0 87 | while count<50: 88 | random_o=choice(t_list) 89 | while random_o in p_dic[sub_key]: 90 | random_o=choice(t_list) 91 | temp="P:"+sub_key+" "+index2type[random_o]+":"+random_o+" "+"0"+" "+"P"+index2type[key] 92 | file.write(temp) 93 | file.write('\n') 94 | count+=1 95 | count=0 96 | while count<50: 97 | random_paper=choice(p_list) 98 | while key in p_dic[random_paper]: 99 | random_paper=choice(p_list) 100 | temp=index2type[key]+":"+key+" "+"P:"+random_paper+" "+"0"+" "+index2type[key]+"P" 101 | file.write(temp) 102 | file.write('\n') 103 | count+=1 104 | 105 | file.close() 106 | return 107 | 108 | 109 | #islice(file, 100000) 110 | if __name__ == '__main__': 111 | 112 | start_time = time.time() 113 | #build the index2name hash table 114 | 115 | filename0="index2name.txt" 116 | index2type={} 117 | type_dic={} 118 | with open(filename0,encoding="utf-8") as file: 119 | for line in file: 120 | line=line.split() 121 | second_part=line[1].split(".") 122 | itemtype=second_part[0] 123 | index2type[line[0]]=itemtype 124 | if second_part[0] not in type_dic: 125 | type_dic[itemtype]={} 126 | type_dic[itemtype][line[0]]=1 127 | for itemtype in type_dic: 128 | type_dic[itemtype]=list(type_dic[itemtype]) 129 | print("finished index2type and type_dic") 130 | 131 | #create p hash ,a hash and tuple list from p2a 132 | filename1 = "all_p2a.txt" 133 | tuple_list=[] 134 | p_dic={} 135 | a_dic={} 136 | with open(filename1) as file1: 137 | for line in file1: 138 | line=line.split() 139 | if(line[-1]!='0'): 140 | tuple_list.append(line) 141 | if line[0] not in p_dic: 142 | p_dic[line[0]]={} 143 | if line[1] not in a_dic: 144 | a_dic[line[1]]={} 145 | a_dic[line[1]][line[0]]=1 146 | else: 147 | if line[0] not in a_dic[line[1]]: 148 | a_dic[line[1]][line[0]]=1 149 | 150 | elapsed_time = time.time() - start_time 151 | print(elapsed_time) 152 | print("finished reading p2a") 153 | #get more into p hash and tuple list from p2o 154 | filename2 = "all_p2o.txt" 155 | o_dic={} 156 | with open(filename2) as file2: 157 | for line in file2: 158 | line=line.split() 159 | if(line[2]!='0'): 160 | tuple_list.append(line) 161 | if line[0] not in p_dic: 162 | p_dic[line[0]]={} 163 | if line[1] not in p_dic[line[0]]: 164 | p_dic[line[0]][line[1]]=line[2] 165 | if line[1] not in o_dic: 166 | o_dic[line[1]]=1 167 | #print(o_dic) 168 | #print(p_dic) 169 | elapsed_time = time.time() - start_time 170 | 
print(elapsed_time) 171 | print("finished reading p2o") 172 | ko_rate=0.1 173 | ko_dic,tuple_list=ko_edge(tuple_list,ko_rate,a_dic,index2type) 174 | build_file(ko_dic,a_dic,p_dic,o_dic,index2type,type_dic) 175 | elapsed_time = time.time() - start_time 176 | print(elapsed_time) 177 | 178 | ''' 179 | type_dic={} 180 | 181 | with open('file1.txt',encoding="utf-8") as file: 182 | count=0 183 | for line in file: 184 | line=line.split() 185 | part1=line[0] 186 | part2=line[1] 187 | part1=part1.split(":") 188 | if part1[0] not in type_dic: 189 | type_dic[part1[0]]={} 190 | if part1[1] not in type_dic[part1[0]]: 191 | type_dic[part1[0]][part1[1]]=1 192 | part2=part2.split(":") 193 | if part2[0] not in type_dic: 194 | type_dic[part2[0]]={} 195 | if part2[1] not in type_dic[part2[0]]: 196 | type_dic[part2[0]][part2[1]]=1 197 | with open('file2.txt',encoding="utf-8") as file: 198 | for line in file: 199 | line=line.split() 200 | if line[2]=='1': 201 | part1=line[0] 202 | part2=line[1] 203 | part1=part1.split(":") 204 | if part1[0] not in type_dic: 205 | type_dic[part1[0]]={} 206 | if part1[1] not in type_dic[part1[0]]: 207 | type_dic[part1[0]][part1[1]]=1 208 | part2=part2.split(":") 209 | if part2[0] not in type_dic: 210 | type_dic[part2[0]]={} 211 | if part2[1] not in type_dic[part2[0]]: 212 | type_dic[part2[0]][part2[1]]=1 213 | for ty in type_dic: 214 | print(ty,len(type_dic[ty]))''' 215 | -------------------------------------------------------------------------------- /eval/archive/mrr_from_embedding_output_more.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: Fang Guo 7 | """ 8 | import numpy as np 9 | import argparse 10 | import time 11 | import warnings 12 | 13 | def calculate_rr(batch): 14 | target=batch[0] 15 | l=batch 16 | l.sort(reverse=True) 17 | rank=l.index(target)+1 18 | rr=1/rank 19 | return rr 20 | 21 | if __name__ == '__main__': 22 | start_time = time.time() 23 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 24 | 25 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding filename.", type=str) 26 | 27 | parser.add_argument("--input-record", nargs="?", help="Input record filename.", type=str) 28 | 29 | parser.add_argument("--sample-number", nargs="?", help="Input sample number generated per node", type=int) 30 | 31 | args = parser.parse_args() 32 | embedding_dict={} 33 | input_embedding=args.input_embedding 34 | with open(input_embedding, "r") as f_in: 35 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 36 | count=0 37 | for line in f_in: 38 | line_split = line.strip().split() 39 | a=list(map(float, line_split[1:])) 40 | embedding_dict[line_split[0]] = np.asarray(a) 41 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 
42 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 43 | input_newfile=args.input_record 44 | 45 | with open(input_newfile, "r") as f_in: 46 | warnings.simplefilter('always', ImportWarning) 47 | count=0 48 | total_mrr={} 49 | 50 | exist=False 51 | rd=0 52 | checksametype=False 53 | sample_number=args.sample_number 54 | for line in f_in: 55 | line_split = line.split(' ') 56 | key1=line_split[0] 57 | key2=line_split[1] 58 | #print(key1[0],key2[0]) 59 | if count==0: 60 | current=[] 61 | if key1 in embedding_dict and key2 in embedding_dict: 62 | edge_type=key1[0]+key2[0] 63 | #print(edge_type,'exists') 64 | if edge_type==edge_type[::-1] : 65 | checksametype=True 66 | if edge_type not in total_mrr: 67 | total_mrr[edge_type]=[] 68 | else: 69 | if edge_type not in total_mrr: 70 | total_mrr[edge_type]=[] 71 | if edge_type[::-1] not in total_mrr: 72 | total_mrr[edge_type[::-1]]=[] 73 | exist =True 74 | target=embedding_dict[key1].dot(embedding_dict[key2]) 75 | current.append(float(target)) 76 | else: 77 | if key1 not in embedding_dict: 78 | warning_word=key1+' does not exist.' 79 | warnings.warn(warning_word) 80 | if key2 not in embedding_dict: 81 | warning_word=key2+' does not exist.' 82 | warnings.warn(warning_word) 83 | #print(target) 84 | count+=1 85 | else: 86 | if exist: 87 | if key1 in embedding_dict and key2 in embedding_dict: 88 | current.append(float(embedding_dict[key1].dot(embedding_dict[key2]))) 89 | else: 90 | if key1 not in embedding_dict: 91 | warning_word=key1+' does not exist.' 92 | warnings.warn(warning_word) 93 | if key2 not in embedding_dict: 94 | warning_word=key2+' does not exist.' 95 | warnings.warn(warning_word) 96 | if count==sample_number and checksametype==False: 97 | if exist: 98 | edge_type=key1[0]+key2[0] 99 | #print('10-',edge_type,current) 100 | print(current) 101 | rr=calculate_rr(current) 102 | total_mrr[edge_type].append(rr) 103 | current=[] 104 | current.append(float(target)) 105 | if count==(sample_number*2): 106 | if exist: 107 | edge_type=key1[0]+key2[0] 108 | #print('20-',edge_type,current) 109 | print(current) 110 | rr=calculate_rr(current) 111 | total_mrr[edge_type].append(rr) 112 | exist=False 113 | checksametype=False 114 | count=0 115 | rd+=1 116 | if rd % 100000 == 0: 117 | elapsed_time = time.time() - start_time 118 | print(rd,' batchs finished with time',elapsed_time) 119 | else: 120 | count+=1 121 | print (rd, "------------------------") 122 | if rd == 10: 123 | print(total_mrr) 124 | break 125 | 126 | total=0 127 | num_mrr=0 128 | for key in total_mrr: 129 | s=sum(total_mrr[key]) 130 | l=len(total_mrr[key]) 131 | total=total+s 132 | num_mrr=num_mrr+l 133 | if l > 0: 134 | print ('edge is ',key,'with avg mrr ',s/l) 135 | print ('total avg is', total/num_mrr) 136 | 137 | 138 | -------------------------------------------------------------------------------- /eval/archive/mrr_from_score_output_more.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: Fang Guo 7 | """ 8 | import numpy as np 9 | import sys 10 | import time 11 | import argparse 12 | import warnings 13 | 14 | def calculate_rr(batch): 15 | target=batch[0] 16 | l=batch 17 | l.sort(reverse=True) 18 | rank=l.index(target)+1 19 | rr=1/rank 20 | return rr 21 | 22 | if __name__ == '__main__': 23 | start_time = time.time() 24 | parser = argparse.ArgumentParser(description="Read in input and output 
filenames.") 25 | parser.add_argument("--input-score-file", nargs="?", help="Input score filename.", type=str) 26 | parser.add_argument("--input-record-file", nargs="?", help="Input record filename.", type=str) 27 | parser.add_argument("--sample-number", nargs="?", help="Input sample number generated per node", type=int) 28 | 29 | args = parser.parse_args() 30 | score_dict={} 31 | input_scorefile=args.input_score_file 32 | with open(input_scorefile, "r") as f_in: 33 | for line in f_in: 34 | line_split = line.strip().split() 35 | key=line_split[0]+' '+line_split[1] 36 | score_dict[key]=line_split[2] 37 | print ("Loading done.", len(score_dict), "pairs from", input_scorefile) 38 | input_recordfile=args.input_record_file 39 | 40 | with open(input_recordfile, "r") as f_in: 41 | warnings.simplefilter('always', ImportWarning) 42 | count=0 43 | total_mrr={} 44 | 45 | exist=False 46 | checksametype=False 47 | sample_number=args.sample_number 48 | 49 | rd=0 50 | for line in f_in: 51 | line_split = line.split(' ') 52 | key1=line_split[0] 53 | key2=line_split[1] 54 | key=key1+' '+key2 55 | 56 | #print(key1[0],key2[0]) 57 | if count==0: 58 | current=[] 59 | if key in score_dict: 60 | edge_type=key1[0]+key2[0] 61 | if edge_type==edge_type[::-1] : 62 | checksametype=True 63 | if edge_type not in total_mrr: 64 | total_mrr[edge_type]=[] 65 | else: 66 | if edge_type not in total_mrr: 67 | total_mrr[edge_type]=[] 68 | if edge_type[::-1] not in total_mrr: 69 | total_mrr[edge_type[::-1]]=[] 70 | exist=True 71 | target=score_dict[key] 72 | current.append(float(target)) 73 | #print(target) 74 | else: 75 | warning_word=key+" does not exist." 76 | warnings.warn(warning_word) 77 | 78 | count+=1 79 | else: 80 | if exist: 81 | if key in score_dict: 82 | current.append(float(score_dict[key])) 83 | else: 84 | warning_word=key+" does not exist." 
85 | warnings.warn(warning_word) 86 | if count==sample_number and checksametype==False: 87 | if exist: 88 | edge_type=key1[0]+key2[0] 89 | #print('10-',edge_type,current) 90 | print(current) 91 | rr=calculate_rr(current) 92 | total_mrr[edge_type].append(rr) 93 | 94 | current=[] 95 | current.append(float(target)) 96 | if count==(sample_number*2): 97 | if exist: 98 | edge_type=key1[0]+key2[0] 99 | #print('20-',edge_type,current) 100 | print(current) 101 | rr=calculate_rr(current) 102 | 103 | total_mrr[edge_type].append(rr) 104 | exist=False 105 | checksametype=False 106 | count=0 107 | rd+=1 108 | if rd % 100000 == 0: 109 | elapsed_time = time.time() - start_time 110 | print(rd,' batchs finished with time',elapsed_time) 111 | else: 112 | count+=1 113 | print (rd, "------------------------") 114 | if rd == 10: 115 | print(total_mrr) 116 | break 117 | 118 | total=0 119 | num_mrr=0 120 | 121 | for key in total_mrr: 122 | s=sum(total_mrr[key]) 123 | l=len(total_mrr[key]) 124 | total=total+s 125 | num_mrr=num_mrr+l 126 | 127 | print ('edge is ',key,'with avg mrr ',s/l) 128 | print ('total avg is', total/num_mrr) 129 | 130 | 131 | -------------------------------------------------------------------------------- /eval/archive/yago_mrr_from_embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: Fang Guo 7 | """ 8 | import numpy as np 9 | import argparse 10 | import time 11 | import warnings 12 | 13 | def calculate_rr(batch): 14 | target=batch[0] 15 | l=sorted(batch, reverse=True) 16 | #l.sort(reverse=True) 17 | rank=l.index(target)+1 18 | rr=1/rank 19 | return rr 20 | 21 | if __name__ == '__main__': 22 | start_time = time.time() 23 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 24 | 25 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding filename.", type=str) 26 | 27 | parser.add_argument("--input-record", nargs="?", help="Input record filename.", type=str) 28 | 29 | parser.add_argument("--sample-number", nargs="?", help="Input sample number generated per node", type=int) 30 | 31 | args = parser.parse_args() 32 | embedding_dict={} 33 | input_embedding=args.input_embedding 34 | with open(input_embedding, "r") as f_in: 35 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 36 | count=0 37 | for line in f_in: 38 | line_split = line.strip().split() 39 | a=list(map(float, line_split[1:])) 40 | embedding_dict[line_split[0]] = np.asarray(a) 41 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 
42 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 43 | input_newfile=args.input_record 44 | 45 | with open(input_newfile, "r") as f_in: 46 | warnings.simplefilter('always', ImportWarning) 47 | count=0 48 | total_mrr={} 49 | exist=False 50 | rd=0 51 | sample_number=args.sample_number 52 | for line in f_in: 53 | line_split = line.strip().split() 54 | key1=line_split[0] 55 | key2=line_split[1] 56 | #print(key1[0],key2[0]) 57 | if count==0: 58 | current=[] 59 | if key1 in embedding_dict and key2 in embedding_dict: 60 | edge_type=line_split[-1] 61 | #print(edge_type,'exists') 62 | edge_type_inverse=edge_type+'-1' 63 | if edge_type not in total_mrr: 64 | total_mrr[edge_type]=[] 65 | if edge_type_inverse not in total_mrr: 66 | total_mrr[edge_type_inverse]=[] 67 | exist =True 68 | target=embedding_dict[key1].dot(embedding_dict[key2]) 69 | current.append(target) 70 | else: 71 | if key1 not in embedding_dict: 72 | warning_word=key1+' does not exist.' 73 | warnings.warn(warning_word) 74 | if key2 not in embedding_dict: 75 | warning_word=key2+' does not exist.' 76 | warnings.warn(warning_word) 77 | #print(target) 78 | count+=1 79 | else: 80 | if exist: 81 | if key1 in embedding_dict and key2 in embedding_dict: 82 | current.append(embedding_dict[key1].dot(embedding_dict[key2])) 83 | else: 84 | if key1 not in embedding_dict: 85 | warning_word=key1+' does not exist.' 86 | warnings.warn(warning_word) 87 | if key2 not in embedding_dict: 88 | warning_word=key2+' does not exist.' 89 | warnings.warn(warning_word) 90 | if count==sample_number: 91 | if exist: 92 | edge_type=line_split[-1] 93 | #print('10-',edge_type,current) 94 | rr=calculate_rr(current) 95 | total_mrr[edge_type].append(rr) 96 | current=[] 97 | current.append(target) 98 | if count==(sample_number*2): 99 | if exist: 100 | edge_type=line_split[-1] 101 | #print('20-',edge_type,current) 102 | rr=calculate_rr(current) 103 | total_mrr[edge_type].append(rr) 104 | exist=False 105 | count=0 106 | rd+=1 107 | if rd % 100000 == 0: 108 | 109 | elapsed_time = time.time() - start_time 110 | print(rd,' batchs finished with time',elapsed_time) 111 | else: 112 | count+=1 113 | total=0 114 | num_mrr=0 115 | for key in total_mrr: 116 | s=sum(total_mrr[key]) 117 | l=len(total_mrr[key]) 118 | total=total+s 119 | num_mrr=num_mrr+l 120 | if l > 0: 121 | print ('edge is ',key,'with avg mrr ',s/l) 122 | print ('total avg is', total/num_mrr) 123 | 124 | 125 | -------------------------------------------------------------------------------- /eval/archive/yago_mrr_from_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: Fang Guo 7 | """ 8 | import numpy as np 9 | import sys 10 | import time 11 | import argparse 12 | import warnings 13 | 14 | def calculate_rr(batch): 15 | target=batch[0] 16 | l=sorted(batch,reverse=True) 17 | #l.sort(reverse=True) 18 | rank=l.index(target)+1 19 | rr=1/rank 20 | return rr 21 | 22 | if __name__ == '__main__': 23 | start_time = time.time() 24 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 25 | parser.add_argument("--input-score-file", nargs="?", help="Input score filename.", type=str) 26 | parser.add_argument("--input-record-file", nargs="?", help="Input record filename.", type=str) 27 | parser.add_argument("--sample-number", nargs="?", help="Input sample number generated per node", type=int) 28 | 29 | 
args = parser.parse_args() 30 | score_dict={} 31 | input_scorefile=args.input_score_file 32 | with open(input_scorefile, "r") as f_in: 33 | for line in f_in: 34 | line_split = line.strip().split() 35 | key=line_split[0]+' '+line_split[1] 36 | score_dict[key]=line_split[2] 37 | print ("Loading done.", len(score_dict), "pairs from", input_scorefile) 38 | input_recordfile=args.input_record_file 39 | 40 | with open(input_recordfile, "r") as f_in: 41 | warnings.simplefilter('always', ImportWarning) 42 | count=0 43 | total_mrr={} 44 | 45 | exist=False 46 | sample_number=args.sample_number 47 | 48 | rd=0 49 | for line in f_in: 50 | line_split = line.strip().split() 51 | key1=line_split[0] 52 | key2=line_split[1] 53 | key=key1+' '+key2 54 | 55 | #print(key1[0],key2[0]) 56 | if count==0: 57 | current=[] 58 | if key in score_dict: 59 | edge_type=line_split[-1] 60 | edge_type_inverse=edge_type+'-1' 61 | if edge_type not in total_mrr: 62 | total_mrr[edge_type]=[] 63 | if edge_type_inverse not in total_mrr: 64 | total_mrr[edge_type_inverse]=[] 65 | exist=True 66 | target=score_dict[key] 67 | current.append(float(target)) 68 | #print(target) 69 | else: 70 | warning_word=key+" does not exist." 71 | warnings.warn(warning_word) 72 | 73 | count+=1 74 | else: 75 | if exist: 76 | if key in score_dict: 77 | current.append(float(score_dict[key])) 78 | else: 79 | warning_word=key+" does not exist." 80 | warnings.warn(warning_word) 81 | if count==sample_number: 82 | if exist: 83 | edge_type=line_split[-1] 84 | #print('10-',edge_type,current) 85 | rr=calculate_rr(current) 86 | total_mrr[edge_type].append(rr) 87 | current=[] 88 | current.append(float(target)) 89 | if count==(sample_number*2): 90 | if exist: 91 | edge_type=line_split[-1] 92 | #print('20-',edge_type,current) 93 | rr=calculate_rr(current) 94 | total_mrr[edge_type].append(rr) 95 | exist=False 96 | count=0 97 | rd+=1 98 | if rd % 100000 == 0: 99 | elapsed_time = time.time() - start_time 100 | print(rd,' batchs finished with time',elapsed_time) 101 | else: 102 | count+=1 103 | 104 | total=0 105 | num_mrr=0 106 | 107 | mrr_list = [] 108 | for key in total_mrr: 109 | s=sum(total_mrr[key]) 110 | l=len(total_mrr[key]) 111 | total=total+s 112 | num_mrr=num_mrr+l 113 | mrr_list.append(s/l) 114 | print ('edge is ',key,'with avg mrr ',s/l) 115 | print ('macro mrr is', np.mean(mrr_list)) 116 | print ('micro mrr is', total/num_mrr) 117 | -------------------------------------------------------------------------------- /eval/mrr_from_embedding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: Fang Guo 7 | """ 8 | import numpy as np 9 | import argparse 10 | import time 11 | 12 | def calculate_rr(batch): 13 | target=batch[0] 14 | num_less, num_grtr = 0, 0 15 | for s in batch: 16 | if s < target: 17 | num_less += 1 18 | if s > target: 19 | num_grtr += 1 20 | rr_list = map(lambda x: 1./x, range(num_grtr+1, len(batch)-num_less+1)) 21 | # l=sorted(batch,reverse=True) 22 | # rank=l.index(target)+1 23 | rr = sum(rr_list) / (len(batch) - num_less - num_grtr) 24 | return rr 25 | 26 | if __name__ == '__main__': 27 | start_time = time.time() 28 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 29 | parser.add_argument("--input-embedding", nargs="?", help="Input embedding filename.", type=str) 30 | parser.add_argument("--input-eval-file", nargs="?", help="Input evaluation file.", type=str) 
31 | parser.add_argument("--sample-number", nargs="?", help="Input sample number generated per node", type=int,default=10) 32 | args = parser.parse_args() 33 | embedding_dict={} 34 | input_embedding=args.input_embedding 35 | with open(input_embedding, "r") as f_in: 36 | num_nodes, dim = map(int, f_in.readline().strip().split()) # first line is special 37 | count=0 38 | for line in f_in: 39 | line_split = line.strip().split() 40 | a=list(map(float, line_split[1:])) 41 | embedding_dict[line_split[0]] = np.asarray(a) 42 | assert len(embedding_dict) == num_nodes, "Number of nodes does not agree." 43 | print ("Embedding loading done.", num_nodes, "nodes with dim", dim, "from", input_embedding) 44 | input_eval_file=args.input_eval_file 45 | 46 | with open(input_eval_file, "r") as f_in: 47 | count=0 48 | total_mrr={} 49 | exist=False 50 | rd=0 51 | sample_number=args.sample_number 52 | #negative_sample_number,num_positive_edges= map(int, f_in.readline().strip().split()) 53 | #assert negative_sample_number==args.sample_number 54 | for idx, line in enumerate(f_in): 55 | line_split = line.strip().split() 56 | key1=line_split[0] 57 | key2=line_split[1] 58 | #print(key1[0],key2[0]) 59 | if count==0: 60 | current=[] 61 | assert key1 in embedding_dict, key1+' does not exist.' 62 | assert key2 in embedding_dict, key2+' does not exist.' 63 | 64 | edge_type=line_split[-1] 65 | edge_type_reverse=edge_type+'-1' 66 | if edge_type not in total_mrr: 67 | total_mrr[edge_type]=[] 68 | if edge_type_reverse not in total_mrr: 69 | total_mrr[edge_type_reverse]=[] 70 | exist =True 71 | target=embedding_dict[key1].dot(embedding_dict[key2]) 72 | current.append(target) 73 | count+=1 74 | else: 75 | if exist: 76 | assert key1 in embedding_dict, key1+' does not exist.' 77 | assert key2 in embedding_dict, key2+' does not exist.' 78 | current.append(embedding_dict[key1].dot(embedding_dict[key2])) 79 | if count==sample_number: 80 | if exist: 81 | edge_type=line_split[-1] 82 | #print('10-',edge_type,current) 83 | rr=calculate_rr(current) 84 | total_mrr[edge_type].append(rr) 85 | current=[] 86 | current.append(target) 87 | if count==(sample_number*2): 88 | if exist: 89 | edge_type=line_split[-1] 90 | #print('20-',edge_type,current) 91 | rr=calculate_rr(current) 92 | total_mrr[edge_type].append(rr) 93 | exist=False 94 | count=0 95 | rd+=1 96 | if rd % 100000 == 0: 97 | elapsed_time = time.time() - start_time 98 | print(rd,' batchs finished with time',elapsed_time) 99 | else: 100 | count+=1 101 | idx+=1 102 | #assert idx/(2*negative_sample_number+1) == num_positive_edges, "Number of positive edges does not agree." 
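# Added note (not part of the original script): the aggregation below reports macro-MRR
# (the unweighted mean of the per-edge-type average reciprocal ranks) and micro-MRR
# (the mean reciprocal rank over all evaluation batches, so frequent edge types weigh more).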
103 | 104 | total=0 105 | num_mrr=0 106 | macro_mrr=0 107 | key_list=[] 108 | for key in total_mrr: 109 | key_list.append(key) 110 | key_list.sort() 111 | for key in key_list: 112 | s=sum(total_mrr[key]) 113 | l=len(total_mrr[key]) 114 | macro_mrr+=s/l 115 | total=total+s 116 | num_mrr=num_mrr+l 117 | print('edge is '+key+'with avg mrr '+str(s/l)) 118 | print ('macro avg is', macro_mrr/len(total_mrr)) 119 | print ('micro avg is', total/num_mrr) 120 | 121 | 122 | -------------------------------------------------------------------------------- /eval/mrr_from_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jan 19 12:39:34 2018 5 | 6 | @author: Fang Guo 7 | """ 8 | import numpy as np 9 | import sys 10 | import time 11 | import argparse 12 | 13 | def calculate_rr(batch): 14 | target=batch[0] 15 | num_less, num_grtr = 0, 0 16 | for s in batch: 17 | if s < target: 18 | num_less += 1 19 | if s > target: 20 | num_grtr += 1 21 | rr_list = map(lambda x: 1./x, range(num_grtr+1, len(batch)-num_less+1)) 22 | # l=sorted(batch,reverse=True) 23 | # rank=l.index(target)+1 24 | rr = sum(rr_list) / (len(batch) - num_less - num_grtr) 25 | return rr 26 | 27 | if __name__ == '__main__': 28 | start_time = time.time() 29 | parser = argparse.ArgumentParser(description="Read in input and output filenames.") 30 | parser.add_argument("--input-score-file", nargs="?", help="Input score filename.", type=str) 31 | parser.add_argument("--input-eval-file", nargs="?", help="Input evaluaiton file.", type=str) 32 | parser.add_argument("--sample-number", nargs="?", help="Input sample number generated per node", type=int,default=10) 33 | 34 | args = parser.parse_args() 35 | score_dict={} 36 | input_scorefile=args.input_score_file 37 | with open(input_scorefile, "r") as f_in: 38 | negative_sample_number,num_positive_edges= map(int, f_in.readline().strip().split()) 39 | for idx,line in enumerate(f_in): 40 | line_split = line.strip().split() 41 | key=line_split[0]+' '+line_split[1] 42 | score_dict[key]=float(line_split[2]) 43 | print('here') 44 | print(idx, num_positive_edges) 45 | idx+=1 46 | assert idx/(2*negative_sample_number+1) == num_positive_edges, str(idx)+" "+str(num_positive_edges) 47 | print ("Loading done.", len(score_dict), "pairs from", input_scorefile) 48 | input_eval_file=args.input_eval_file 49 | 50 | with open(input_eval_file, "r") as f_in: 51 | count=0 52 | total_mrr={} 53 | exist=False 54 | checksametype=False 55 | sample_number=args.sample_number 56 | rd=0 57 | negative_sample_number,num_positive_edges= map(int, f_in.readline().strip().split()) 58 | assert negative_sample_number==args.sample_number 59 | for idx, line in enumerate(f_in): 60 | line_split = line.strip().split() 61 | key1=line_split[0] 62 | key2=line_split[1] 63 | key=key1+' '+key2 64 | if count==0: 65 | current=[] 66 | assert key in score_dict,key+" does not exist." 67 | edge_type=line_split[-1] 68 | edge_type_reverse=edge_type+'-1' 69 | if edge_type not in total_mrr: 70 | total_mrr[edge_type]=[] 71 | if edge_type_reverse not in total_mrr: 72 | total_mrr[edge_type_reverse ]=[] 73 | exist=True 74 | target=score_dict[key] 75 | current.append(float(target) ) 76 | #print(target) 77 | count+=1 78 | else: 79 | if exist: 80 | assert key in score_dict, key+" does not exist." 
81 | current.append(float(score_dict[key]) ) 82 | if count==sample_number: 83 | if exist: 84 | edge_type=line_split[-1] 85 | rr=calculate_rr(current) 86 | total_mrr[edge_type].append(rr) 87 | current=[] 88 | current.append(float(target)) 89 | if count==(sample_number*2): 90 | if exist: 91 | edge_type=line_split[-1] 92 | rr=calculate_rr(current) 93 | total_mrr[edge_type].append(rr) 94 | exist=False 95 | #checksametype=False 96 | count=0 97 | 98 | rd+=1 99 | if rd % 100000 == 0: 100 | elapsed_time = time.time() - start_time 101 | print(rd,' batchs finished with time',elapsed_time) 102 | else: 103 | count+=1 104 | idx+=1 105 | assert idx/(2*negative_sample_number+1) == num_positive_edges, "Number of positive edges does not agree." 106 | 107 | total=0 108 | num_mrr=0 109 | macro_mrr=0 110 | key_list=[] 111 | for key in total_mrr: 112 | key_list.append(key) 113 | key_list.sort() 114 | for key in key_list: 115 | s=sum(total_mrr[key]) 116 | l=len(total_mrr[key]) 117 | macro_mrr+=s/l 118 | total=total+s 119 | num_mrr=num_mrr+l 120 | print('edge is '+key+'with avg mrr '+str(s/l)) 121 | print ('macro avg is', macro_mrr/len(total_mrr)) 122 | print ('micro avg is', total/num_mrr) 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /preprocessing/Readme.md: -------------------------------------------------------------------------------- 1 | # Knockout HIN 2 | 3 | This repository provides a reference implementation of edge reconstruction for HEER as described in the paper:
4 | 5 | 6 | ### Basic usage 7 | 8 | #### Input 9 | 10 | There is **1** required input file. 11 | 1. **input-hin-file** is the file containing all edges of the input HIN; it should be in the format: 12 | 13 | node_name_1 node_name_2 edge_weight edge_type 14 | 15 | Note that node_name_1 and node_name_2 are in the format: 16 | 17 | node_type:node_value 18 | 19 | For example: 20 | 21 | P:20883 22 | 23 | If an edge is directed, its edge_type should end with ':d', for example: 24 | 25 | :d 26 | 27 | Otherwise it ends with ':u', for example: 28 | 29 | :u 30 | 31 | There are also **3** required input arguments: 32 | 33 | 1. **ko-rate** stands for the knockout rate. It is a float. For example, 0.1 means 10% of the original edges will be knocked out. 34 | 35 | 2. **dataset-name** is the prefix the output files will use. It is a string. For example, if the dataset-name is 'dblp', the output files will be named 'dblp_xxx'. 36 | 37 | 3. **path-output** is the path where the generator will write the output files. It is a string. 38 | 39 | There are also **2** optional input arguments: 40 | 41 | 1. (optional) **sample-number** stands for the number of new edges to generate per knocked-out edge. It is an integer. For example, for a knocked-out edge AB, sample-number = 10 means 10 new edges are generated with node A fixed and another 10 with node B fixed. The default sample-number is 10. 42 | 43 | 2. (optional) **buffer-size** is the size of the temporary chunk used for buffered output writing. It is an integer. The default buffer-size is 500000. 44 | 45 | 46 | #### Output 47 | 48 | There will be 3 files generated. 49 | **The first file** is named: 50 | 51 | dataset-name_ko_ko-rate.hin 52 | 53 | It contains all the edges from **input-hin-file** except the knocked-out ones, in the format: 54 | 55 | node_name_1 node_name_2 weight edgetype 56 | 57 | **The second file** is named: 58 | 59 | dataset-name_ko_ko-rate_eval.txt 60 | 61 | The first line of the **second file** contains the basic information: 62 | 63 | #_of_negative_example_per_direction_in_one_batch #_of_total_batches 64 | 65 | Note that one batch has (1 + **sample-number** * 2) edges: the first edge in the batch is the edge that was knocked out from 66 | **input-hin-file**, followed by **sample-number** * 2 node pairs that are not connected by an edge, sampled with respect to this knocked-out edge. 67 | The detailed generating rule is explained in our paper. 68 | 69 | Each edge is in the format: 70 | 71 | node_name_1 node_name_2 weight edgetype 72 | 73 | For an edgetype 'xxx', its reverse type is marked as 'xxx-1'. For example, if one edge type is 'hasChild', 74 | its reverse edge type is 'hasChild-1'. 75 | 76 | **The third file** is named 77 | 78 | dataset-name.config 79 | 80 | It contains the following information: the first line is a list of node type index pairs, where each pair lists the left and right node type indices of an edge type; the second line is a list of node types; the third line is a list of edge types; the fourth line indicates whether each edge type is directed, where 1 stands for directed and 0 stands for undirected.
Therefore, we might have a sample config file **DBLP.config** such as: 82 | 83 | [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4]] 84 | ['P', 'Y', 'W', 'A', 'V'] 85 | ['PP:d', 'PY:u', 'PW:u', 'PA:u', 'PV:u'] 86 | [1, 0, 0, 0, 0] 87 | 88 | Note that elements with the same index in the lists on the first, third, and fourth lines describe the same edge type. For example, the third element of each of these three lists is 89 | 90 | [0, 2], 'PW:u', 0 91 | 92 | This means the edge type 'PW:u' is undirected and connects a node of type index 0 ('P') with a node of type index 2 ('W'). 93 | 94 | #### Execute and example 95 | We are using python3.
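Before training, it can help to sanity-check the generated config file. A minimal sketch (an illustrative snippet, not one of the provided tools; it assumes the DBLP.config sample above):

    import ast
    with open("DBLP.config") as f:
        pairs, node_types, edge_types, directed = [ast.literal_eval(f.readline()) for _ in range(4)]
    assert len(pairs) == len(edge_types) == len(directed)
    # e.g. pairs[2] == [0, 2], edge_types[2] == 'PW:u', directed[2] == 0: an undirected
    # edge type between node_types[0] ('P') and node_types[2] ('W')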
96 | 97 | Here is an example of generating the output files: 98 | 99 | python ko_hin.py --input-hin-file input_data/dblp.hin --data-set-name dblp --path-output output --ko-rate 0.2 100 | 101 | ## Citing 102 | 103 | 104 | ## Miscellaneous 105 | 106 | Please send any questions you might have about the code and/or the algorithm to . 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /preprocessing/edge_knock/Readme.md: -------------------------------------------------------------------------------- 1 | # Edge Knockout 2 | 3 | This repository provides a reference implementation of edge reconstruction for HEER as described in the paper:
4 | 5 | 6 | ### Basic usage 7 | 8 | #### Input 9 | 10 | There are 3 required input files. 11 | 1. **p2afile** contains the edges between papers and authors, in the format: 12 | 13 | paper_index author_index edge_weight 14 | 15 | Note that edge_weight=0 means the edge does not exist. 16 | 17 | 2. **p2ofile** contains the edges between papers and nodes of other types, in the format: 18 | 19 | paper_index other_type_node_index edge_weight 20 | 21 | 3. **index2name** contains a mapping between node indices and node information, in the format: 22 | 23 | node_index node_info 24 | 25 | Note that node_info is formatted as 26 | 27 | node_type.node_name 28 | 29 | There are also the following input arguments: 30 | 31 | 1. **ko-rate** stands for the knockout rate. It is a float. For example, 0.1 means 10% of the original edges will be knocked out. 32 | 33 | 2. **sample-number** stands for the number of new edges to generate. It is an integer. For example, for a knocked-out edge AB, sample-number = 10 means 10 new edges are generated with node A fixed and another 10 with node B fixed. 34 | 35 | 3. **dataset-name** is the prefix the output files will use. It is a string. For example, if the dataset-name is 'DBLP', the output files will be named 'DBLP_xxx'. The default name is 'unknown'. 36 | 37 | 4. **path-output** is the path where the generator will write the output files. It is a string. The default value is ''. 38 | 39 | 5. **buffer-size** is the size of the temporary chunk used for buffered output writing. The default value is 50000. 40 | 41 | 42 | #### Output 43 | 44 | There will be 2 files generated. The first file is named 45 | 46 | dataset-name_ko_ko-rate.hin 47 | 48 | It contains the edges from the input network except the knocked-out ones, in the format: 49 | 50 | node_name node_name weight edgetype 51 | 52 | The edgetype is in the format "node1node2". 53 | 54 | The second file is named 55 | 56 | dataset-name_ko_ko-rate_sample-number_eval.txt 57 | 58 | It contains the knocked-out edges and the newly generated (negative) edges for the given ko-rate. It is in the format: 59 | 60 | node_name node_name weight edgetype 61 | 62 | For the edge type, if the edge truly exists in the network, it is in the format **'node1node2'**; if it was 63 | 64 | generated and does not exist in the network, it is in the format **'node1node2-1'** 65 | 66 | The first line of the second file contains the basic information: 67 | 68 | #of negative examples per direction in one batch, #total batches. 69 | 70 | 71 | 72 | #### Execute and example 73 | 74 | _All the commands are executed from the project home directory, and we are using python3._
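The scripts in this folder recover a node's type from **index2name** by splitting the node_info column on the first dot. An illustrative sketch of that convention (not one of the provided tools; the node_info value shown is made up):

    index2type = {}
    with open("data/index2name.txt", encoding="utf-8") as f:
        for line in f:
            parts = line.split()                           # e.g. ["20883", "P.some_paper_title"]
            index2type[parts[0]] = parts[1].split(".")[0]  # node type prefix, e.g. "P"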
75 | 76 | Here is an exmaple of generating output files 77 | 78 | python edge_knock.py --input-p2afile data/all_p2a.txt --input-p2ofile data/all_p2o.txt --input-index2name data/index2name.txt --ko-rate 0.2 --data-set-name DBLP --path-output data 79 | 80 | ## Citing 81 | 82 | 83 | ## Miscellaneous 84 | 85 | Please send any questions you might have about the codes and/or the algorithm to . 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /preprocessing/edge_knock/find_center_paper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Feb 5 12:56:40 2018 5 | 6 | @author: fangguo1 7 | """ 8 | 9 | if __name__ == '__main__': 10 | author_set_dic={} 11 | venue_set_dic={} 12 | paper_set_dic={}#key-paper ,value- papers that cite key 13 | year_set_dic={} 14 | paper_connect_dic={} 15 | 16 | index2type={} 17 | ''' 18 | First we build a dictionary, between index and type 19 | ''' 20 | with open('index2name.txt',encoding="utf-8") as file: 21 | for line in file: 22 | line=line.split() 23 | second_part=line[1].split(".") 24 | itemtype=second_part[0] 25 | index2type[line[0]]=itemtype 26 | print('finished index2name') 27 | 28 | 29 | ''' 30 | We start to save the edges in a dictionary called paper_connect_dic from reading p2a, 31 | And another dictionary called author_set_dic, each author will have a list of his papers 32 | ''' 33 | with open('all_p2a.txt',encoding="utf-8") as file: 34 | for line in file: 35 | line=line.split() 36 | paper=line[0] 37 | author=line[1] 38 | if author not in author_set_dic: 39 | author_set_dic[author]={} 40 | author_set_dic[author][paper]=1 41 | if paper not in paper_connect_dic: 42 | paper_connect_dic[paper]={} 43 | paper_connect_dic[paper]['author']={} 44 | paper_connect_dic[paper]['venue']={} 45 | paper_connect_dic[paper]['paper']={} 46 | paper_connect_dic[paper]['pap']={} 47 | paper_connect_dic[paper]['pvp']={} 48 | paper_connect_dic[paper]['ppp']={} 49 | paper_connect_dic[paper]['pyp']={} 50 | paper_connect_dic[paper]['totalp']=set() 51 | 52 | if paper not in paper_set_dic: 53 | paper_set_dic[paper]={} 54 | if author not in paper_connect_dic[paper]['author']: 55 | paper_connect_dic[paper]['author'][author]=1 56 | print('finished reading p2a') 57 | 58 | ''' 59 | After reading p2a, we will do pruning first, which will add qualified 60 | pap to a list called qualified_papers_pap 61 | ''' 62 | qualified_papers_pap=[] 63 | count=0 64 | for paper,dic in paper_connect_dic.items(): 65 | #print(paper) 66 | for author in dic['author']: 67 | #print(author) 68 | for i in author_set_dic[author].keys(): 69 | if i not in dic['pap'] and i != paper: 70 | dic['pap'][i]=1 71 | #print(len(dic['pap'])) 72 | if len(dic['pap']) >=200: 73 | qualified_papers_pap.append(paper) 74 | count+=1 75 | print(len(qualified_papers_pap)) 76 | 77 | ''' 78 | We start to reaing from p2o, for type paper, venue and year, we will build their own dictionary. 79 | Ex, for Venue, we will have venue_set_dic, each venue will have a list of its related papers. 
80 | And we continue adding edges to paper_connect_dic 81 | ''' 82 | with open('all_p2o.txt',encoding="utf-8") as file: 83 | count=0 84 | for line in file: 85 | line=line.strip().split() 86 | paper=line[0] 87 | node_2=line[1] 88 | name=index2type[node_2] 89 | tp=name[0] 90 | count+=1 91 | if tp!='P' and tp!='V' and tp!='Y': 92 | continue 93 | if tp=='V': 94 | venue=node_2 95 | if paper in paper_connect_dic: 96 | if venue not in venue_set_dic: 97 | venue_set_dic[venue]={} 98 | venue_set_dic[venue][paper]=1 99 | if venue not in paper_connect_dic[paper]['venue']: 100 | paper_connect_dic[paper]['venue'][venue]=1 101 | if tp=='P': 102 | #paper_cited is being cited 103 | paper_cited=node_2 104 | if paper_cited in paper_connect_dic: 105 | paper_set_dic[paper_cited][paper]=1 106 | if paper not in paper_connect_dic[paper_cited]['paper']: 107 | paper_connect_dic[paper_cited]['paper'][paper]=1 108 | if tp=='Y': 109 | year=node_2 110 | paper_connect_dic[paper]['year']=year 111 | if year not in year_set_dic: 112 | year_set_dic[year]={} 113 | if paper not in year_set_dic[year]: 114 | year_set_dic[year][paper]=1 115 | 116 | print('finished reading p2o') 117 | #print(paper_connect_dic['287144']['venue']) 118 | 119 | 120 | '''After we finished all above dicitonaries, we start to prune from qualified_papers_pap. First we 121 | do with pvp and will add qualified paper to qualified_papers_pvp untill the count is 60000. 122 | ''' 123 | count=0 124 | qualified_papers_pvp=[] 125 | for paper in qualified_papers_pap: 126 | pap=paper_connect_dic[paper]['pap'].keys() 127 | size_pap=len(pap) 128 | for venue in paper_connect_dic[paper]['venue']: 129 | for i in venue_set_dic[venue].keys(): 130 | if i not in paper_connect_dic[paper]['pap'] and i not in paper_connect_dic[paper]['pvp'] and i!=paper: 131 | paper_connect_dic[paper]['pvp'][i]=1 132 | size_pvp=len(paper_connect_dic[paper]['pvp']) 133 | 134 | if size_pvp>=200 and size_pvp/size_pap <=4 and size_pap/size_pvp <=4: 135 | qualified_papers_pvp.append(paper) 136 | #print(paper,size_pap, size_pvp) 137 | count+=1 138 | if(count%5000==0): 139 | print(count,' papers checked') 140 | if len(qualified_papers_pvp)==60000: 141 | break 142 | 143 | '''Then we do pruning with ppp, add papers from qualified_papers_pvp to qualified_papers_ppp until count 144 | is 1000 145 | ''' 146 | 147 | qualified_papers_ppp=[] 148 | for paper in qualified_papers_pvp: 149 | count=0 150 | for i in paper_connect_dic[paper]['paper']: 151 | if i not in paper_connect_dic[paper]['pvp'] and i not in paper_connect_dic[paper]['pap']: 152 | paper_connect_dic[paper]['ppp'][i]=1 153 | count+=1 154 | size_ppp=len(paper_connect_dic[paper]['ppp']) 155 | size_pap=len(paper_connect_dic[paper]['pap']) 156 | size_pvp=len(paper_connect_dic[paper]['pvp']) 157 | if count >=200 and size_ppp/size_pvp<=4 and size_pvp/size_ppp<=4 and size_pap/size_ppp<=4 and size_ppp/size_pap<=4: 158 | qualified_papers_ppp.append(paper) 159 | #print(paper,size_pap, paper_connect_dic[paper]['venue'],size_pvp,size_ppp) 160 | 161 | if len(qualified_papers_ppp)==1000: 162 | break 163 | print('finished ppp') 164 | 165 | ''' 166 | Finally we do pruning on pyp and stop when we have find 10 qualified papers. 
167 | ''' 168 | 169 | qualified_papers_pyp=[] 170 | count=0 171 | for paper in qualified_papers_ppp: 172 | if 'year' not in paper_connect_dic[paper]: 173 | print('year not exist ',paper) 174 | continue 175 | year=paper_connect_dic[paper]['year'] 176 | same_year_paper_count=0 177 | for i in year_set_dic[year].keys(): 178 | if i not in paper_connect_dic[paper]['pap'] and i not in paper_connect_dic[paper]['pvp'] and i not in paper_connect_dic[paper]['ppp']and i!=paper: 179 | paper_connect_dic[paper]['pyp'][i]=1 180 | same_year_paper_count+=1 181 | if same_year_paper_count==500: 182 | break 183 | if len(paper_connect_dic[paper]['pyp'])>=500: 184 | qualified_papers_pyp.append(paper) 185 | count+=1 186 | if count==10: 187 | break 188 | 189 | 190 | for paper in qualified_papers_pyp: 191 | file_to_write='path/center_paper_'+paper+'.txt' 192 | file=open(file_to_write,'w+') 193 | for reached_paper in paper_connect_dic[paper]['pap']: 194 | content=reached_paper+' PAP\n' 195 | file.write(content) 196 | for reached_paper in paper_connect_dic[paper]['pvp']: 197 | content=reached_paper+' PVP\n' 198 | file.write(content) 199 | for reached_paper in paper_connect_dic[paper]['ppp']: 200 | content=reached_paper+' PPP\n' 201 | file.write(content) 202 | for reached_paper in paper_connect_dic[paper]['pyp']: 203 | content=reached_paper+' PYP\n' 204 | file.write(content) 205 | file.close() 206 | print(paper,'finished writing') 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | -------------------------------------------------------------------------------- /preprocessing/edge_knock/gen_training_file_for_logit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Feb 6 17:41:05 2018 5 | 6 | @author: edz 7 | """ 8 | 9 | 10 | from random import random,sample,choice 11 | import time 12 | import math 13 | import argparse 14 | 15 | def ko_edge(tuple_list,a_dic,p_dic,index2type): 16 | size=len(tuple_list) 17 | pick_dic={} 18 | pick_index_list=sample(range(size-1),200000) 19 | for i in pick_index_list: 20 | 21 | tuple_list[i][-1]='0' 22 | node1_value=tuple_list[i][0][2:] #it must be in P type 23 | node2_value=tuple_list[i][1][2:] #it will be A or P or Other 24 | 25 | '''For both a_dic,p_dic and o_dic, if after knocking out, the node has an empty dictionary 26 | then the node will be poped in the dictionary which it belongs to. 
27 | ''' 28 | if node2_value not in pick_dic: 29 | pick_dic[node2_value]={} 30 | pick_dic[node2_value][node1_value]=1 31 | else: 32 | if node1_value not in pick_dic[node2_value]: 33 | pick_dic[node2_value][node1_value]=1 34 | 35 | print('finished pick_dic') 36 | 37 | return pick_dic,tuple_list 38 | 39 | def build_file(ko_dic,a_dic,p_dic,o_dic,index2type,sample_number,file_3,buffer_size): 40 | p_list=list(p_dic) 41 | a_list=list(a_dic) 42 | o_list={} 43 | for key in o_dic: 44 | o_list[key]=list(o_dic[key]) 45 | file =open(file_3,'w+') 46 | #for key,value in ko_dic.items(): 47 | print("Started writing to file3") 48 | elapsed_time = time.time() - start_time 49 | print(elapsed_time) 50 | content='' 51 | rd=0 52 | for key,dic in ko_dic.items(): 53 | 54 | for sub_key,edge in dic.items(): 55 | content_temp=[] 56 | temp=index2type[key]+":"+key+" "+index2type[sub_key]+":"+sub_key+" "+str(edge)+" "+index2type[key]+index2type[sub_key]+'\n' 57 | content_temp.append(temp) 58 | if (index2type[key]=="A"): 59 | #10 negative sampling with same author, but non-relative paper 60 | count=0 61 | while count 18 | #include 19 | #include 20 | #include "ransampl.h" 21 | 22 | //! Allocate workspace for random-number sampling. 23 | ransampl_ws* ransampl_alloc( integer n ) 24 | { 25 | ransampl_ws *ws; 26 | ws = (ransampl_ws *)malloc(sizeof(ransampl_ws)); 27 | ws->alias = (integer *)malloc(n*sizeof(integer)); 28 | ws->prob = (double *)malloc(n*sizeof(double)); 29 | if (ws == NULL || ws->alias == NULL || ws->prob == NULL) 30 | { 31 | fprintf( stderr, "ransampl: workspace allocation failed\n" ); 32 | exit(ENOMEM); 33 | } 34 | ws->n = n; 35 | return ws; 36 | } 37 | 38 | //! Initialize workspace by precompute alias tables from given probabilities. 39 | void ransampl_set( ransampl_ws *ws, double* p ) 40 | { 41 | integer n = ws->n; 42 | integer i, a, g; 43 | 44 | // Local workspace: 45 | double *P; 46 | integer *S, *L; 47 | if ( !(P = (double*) malloc( n*sizeof(double) ) ) || 48 | !(S = (integer*) malloc( n*sizeof(integer) ) ) || 49 | !(L = (integer*) malloc( n*sizeof(integer) ) ) ) { 50 | fprintf( stderr, "ransampl: temporary allocation failed\n" ); 51 | exit(ENOMEM); 52 | } 53 | 54 | // Normalise given probabilities: 55 | double sum=0; 56 | for ( i=0; i=0; --i ) { 73 | // at variance from Schwarz, we revert the index order 74 | if ( P[i]<1 ) 75 | S[nS++] = i; 76 | else 77 | L[nL++] = i; 78 | } 79 | 80 | // Work through index lists 81 | while ( nS && nL ) { 82 | a = S[--nS]; // Schwarz's l 83 | g = L[--nL]; // Schwarz's g 84 | ws->prob[a] = P[a]; 85 | ws->alias[a] = g; 86 | P[g] = P[g] + P[a] - 1; 87 | if ( P[g] < 1 ) 88 | S[nS++] = g; 89 | else 90 | L[nL++] = g; 91 | } 92 | 93 | while ( nL ) 94 | ws->prob[ L[--nL] ] = 1; 95 | 96 | while ( nS ) 97 | // can only happen through numeric instability 98 | ws->prob[ S[--nS] ] = 1; 99 | 100 | // Cleanup: 101 | free( P ); 102 | free( S ); 103 | free( L ); 104 | } 105 | 106 | //! Draw one random index, using two supplied uniform random numbers. 107 | integer ransampl_draw( ransampl_ws *ws, double ran1, double ran2 ) 108 | { 109 | integer i = (integer) ws->n * ran1; 110 | return ran2 < ws->prob[i] ? i : ws->alias[i]; 111 | } 112 | 113 | //! Free the random-number sampling workspace. 
114 | void ransampl_free( ransampl_ws *ws ) 115 | { 116 | free( ws->alias ); 117 | free( ws->prob ); 118 | free( ws ); 119 | } 120 | -------------------------------------------------------------------------------- /pretrain/ransampl.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Library: ransampl (random number sampling) 3 | * 4 | * File: ransampl.h 5 | * 6 | * Contents: Random-number sampling using the Walker-Vose alias method 7 | * 8 | * Copyright: Joachim Wuttke, Forschungszentrum Juelich GmbH (2013) 9 | * 10 | * License: see ../COPYING (FreeBSD) 11 | * 12 | * Homepage: apps.jcns.fz-juelich.de/ransampl 13 | */ 14 | 15 | #ifndef RANSAMPL_H 16 | #define RANSAMPL_H 17 | #undef __BEGIN_DECLS 18 | #undef __END_DECLS 19 | #ifdef __cplusplus 20 | # define __BEGIN_DECLS extern "C" { 21 | # define __END_DECLS } 22 | #else 23 | # define __BEGIN_DECLS /* empty */ 24 | # define __END_DECLS /* empty */ 25 | #endif 26 | __BEGIN_DECLS 27 | 28 | typedef int integer; 29 | 30 | typedef struct { 31 | integer n; 32 | integer* alias; 33 | double* prob; 34 | } ransampl_ws; 35 | 36 | ransampl_ws* ransampl_alloc( integer n ); 37 | 38 | void ransampl_set( ransampl_ws *ws, double *p ); 39 | 40 | integer ransampl_draw( ransampl_ws *ws, double ran1, double ran2 ); 41 | 42 | void ransampl_free( ransampl_ws *ws ); 43 | 44 | __END_DECLS 45 | #endif /* RANSAMPL_H */ 46 | -------------------------------------------------------------------------------- /run/archive/aspem_batch_eval_dblp_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #score_prefix=$1 # the directory containing all score files (9 in the DBLP case) your model generates 4 | #output_keyword=$2 5 | 6 | time_start=$(date +"%Y%m%d_%H%M%S") 7 | 8 | mkdir -p ../output/ 9 | 10 | for smp in 1000 2000 5000 10000 20000 11 | do 12 | ( 13 | score_prefix=../intermediate_data/dblp_0.2_out_aspem_samples"$smp" 14 | output_keyword=papvw_pay_smp"$smp" 15 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_pay_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_pay_"$time_start".txt 16 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_papvw_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_papvw_"$time_start".txt 17 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_mean_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_mean_"$time_start".txt 18 | ) & 19 | done 20 | 21 | 22 | for smp in 1000 2000 5000 10000 20000 23 | do 24 | ( 25 | score_prefix=../intermediate_data/dblp_0.2_out_aspem_samples"$smp"_normalized 26 | output_keyword=papvw_pay_normalized_smp"$smp" 27 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_pay_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_pay_"$time_start".txt 28 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_papvw_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_papvw_"$time_start".txt 29 | python3 ../eval/edge_rec_eval_score_provided.py 
--score-file "$score_prefix"_mean_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_mean_"$time_start".txt 30 | ) & 31 | done 32 | 33 | 34 | for smp in 10000 35 | do 36 | ( 37 | score_prefix=../intermediate_data/dblp_0.2_out_aspem_samples"$smp"_normalized_papvw_and_papvwy 38 | output_keyword=papvw_papvwy_normalized_smp"$smp" 39 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_pay_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_pay_"$time_start".txt 40 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_papvw_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_papvw_"$time_start".txt 41 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_mean_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_mean_"$time_start".txt 42 | ) & 43 | done 44 | wait 45 | 46 | -------------------------------------------------------------------------------- /run/archive/aspem_eval_dblp_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_prefix=$1 # the directory containing all score files (9 in the DBLP case) your model generates 4 | output_keyword=$2 5 | 6 | time_start=$(date +"%Y%m%d_%H%M%S") 7 | 8 | mkdir -p ../output/ 9 | 10 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_pay_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_pay_"$time_start".txt 11 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_papvw_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_papvw_"$time_start".txt 12 | python3 ../eval/edge_rec_eval_score_provided.py --score-file "$score_prefix"_mean_score.txt --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_aspem_"$output_keyword"_mean_"$time_start".txt 13 | 14 | -------------------------------------------------------------------------------- /run/archive/batch_eval_dblp_from_score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_file_dir=$1 4 | score_file_keyword=$2 5 | eval_file=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p ../output/ 10 | 11 | for score_file in "$score_file_dir"/*"$score_file_keyword"* 12 | do 13 | ( 14 | file_basename=`basename $score_file` 15 | python3 ../eval/mrr_from_score.py --input-score-file $score_file --input-record-file $eval_file --sample-number 10 > ../output/out_"$file_basename" 16 | ) & 17 | done 18 | wait 19 | -------------------------------------------------------------------------------- /run/archive/batch_eval_yago_from_score.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_file_dir=$1 4 | score_file_keyword=$2 5 | eval_file=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p ../output/ 10 | 11 | for score_file in "$score_file_dir"/*"$score_file_keyword"* 12 | do 13 | ( 14 | file_basename=`basename 
$score_file` 15 | python3 ../eval/yago_mrr_from_score.py --input-score-file $score_file --input-record-file $eval_file --sample-number 10 > ../output/out_"$file_basename" 16 | ) & 17 | done 18 | wait 19 | -------------------------------------------------------------------------------- /run/archive/edge_rec_eval_for_others_dblp_0.2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_dir=$1 # the directory containing all score files (9 in the DBLP case) your model generates 4 | score_keyword=$2 # the common keyword these score files have. Note that all files containing this keyword will be read in; so please make sure no other files in score_dir contains this keyword 5 | output_keyword=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p ../output/ 10 | 11 | python2 /shared/data/yushi2/edge_rep_codes/aux/merge_edges_with_all_types.py --input-ref-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file ../output/merged_score_"$output_keyword".temp 12 | python3 ../eval/edge_rec_eval_score_provided.py --score-file ../output/merged_score_"$output_keyword".temp --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_20neg_eval_fast.txt > ../output/out_"$output_keyword"_"$time_start".txt 13 | 14 | rm ../output/merged_score_"$output_keyword".temp 15 | -------------------------------------------------------------------------------- /run/archive/edge_rec_eval_for_others_dblp_0.2_downsampled.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_dir=$1 # the directory containing all score files (9 in the DBLP case) your model generates 4 | score_keyword=$2 # the common keyword these score files have. Note that all files containing this keyword will be read in; so please make sure no other files in score_dir contains this keyword 5 | output_keyword=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p ../output/ 10 | 11 | python2 /shared/data/yushi2/edge_rep_codes/aux/merge_edges_with_all_types.py --input-ref-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_downsampled_20neg_eval.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file ../output/merged_score_"$output_keyword".temp 12 | python3 ../eval/edge_rec_eval_score_provided.py --score-file ../output/merged_score_"$output_keyword".temp --eval-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.2_out_downsampled_20neg_eval.txt > ../output/out_"$output_keyword"_"$time_start".txt 13 | 14 | rm ../output/merged_score_"$output_keyword".temp 15 | -------------------------------------------------------------------------------- /run/archive/edge_rec_eval_for_others_dblp_0.4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_dir=$1 # the directory containing all score files (9 in the DBLP case) your model generates 4 | score_keyword=$2 # the common keyword these score files have. 
Note that all files containing this keyword will be read in; so please make sure no other files in score_dir contains this keyword 5 | output_keyword=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p /shared/data/qiz3/data/dblp/eval/output/ 10 | 11 | python2 /shared/data/yushi2/edge_rep_codes/aux/merge_edges_with_all_types.py --input-ref-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.4_out_20neg_eval_fast.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file /shared/data/qiz3/data/dblp/eval/output/merged_score_"$output_keyword".temp 12 | python3 /shared/data/yushi2/edge_rep_codes/eval/mrr_from_score.py --sample-number 10 --input-score-file /shared/data/qiz3/data/dblp/eval/output/merged_score_"$output_keyword".temp --input-record-file /shared/data/yushi2/edge_rep_codes/input_data/dblp_0.4_out_20neg_eval_fast.txt > /shared/data/qiz3/data/dblp/eval/result/out_"$output_keyword"_"$time_start".txt 13 | 14 | rm /shared/data/qiz3/data/dblp/eval/output/merged_score_"$output_keyword".temp 15 | -------------------------------------------------------------------------------- /run/archive/edge_rec_eval_for_others_yago_0.1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_dir=$1 # the directory containing all score files (9 in the YaGo case) your model generates 4 | score_keyword=$2 # the common keyword these score files have. Note that all files containing this keyword will be read in; so please make sure no other files in score_dir contains this keyword 5 | output_keyword=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p /shared/data/qiz3/data/yago/output/ 10 | 11 | python2 /shared/data/yushi2/edge_rep_codes/aux/merge_edges_with_all_types.py --input-ref-file /shared/data/qiz3/data/yago/new_yago_0.1_out_20neg_eval.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file /shared/data/qiz3/data/yago/output/merged_score_"$output_keyword".temp 12 | python3 /shared/data/yushi2/edge_rep_codes/eval/yago_mrr_from_score.py --sample-number 10 --input-score-file /shared/data/qiz3/data/yago/output/merged_score_"$output_keyword".temp --input-record-file /shared/data/qiz3/data/yago/new_yago_0.1_out_20neg_eval.txt > /shared/data/qiz3/data/yago/output/out_"$output_keyword"_"$time_start".txt 13 | 14 | rm /shared/data/qiz3/data/yago/output/merged_score_"$output_keyword".temp 15 | -------------------------------------------------------------------------------- /run/archive/edge_rec_eval_for_others_yago_0.4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_dir=$1 # the directory containing all score files (9 in the YaGo case) your model generates 4 | score_keyword=$2 # the common keyword these score files have. 
Note that all files containing this keyword will be read in; so please make sure no other files in score_dir contains this keyword 5 | output_keyword=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | mkdir -p /shared/data/qiz3/data/yago/output/ 10 | 11 | python2 /shared/data/yushi2/edge_rep_codes/aux/merge_edges_with_all_types.py --input-ref-file /shared/data/qiz3/data/yago/yago_0.4_out_20neg_eval.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file /shared/data/qiz3/data/yago/output/merged_score_"$output_keyword".temp 12 | python3 /shared/data/yushi2/edge_rep_codes/eval/yago_mrr_from_score.py --sample-number 10 --input-score-file /shared/data/qiz3/data/yago/output/merged_score_"$output_keyword".temp --input-record-file /shared/data/qiz3/data/yago/yago_0.4_out_20neg_eval.txt > /shared/data/qiz3/data/yago/output/out_"$output_keyword"_"$time_start".txt 13 | 14 | rm /shared/data/qiz3/data/yago/output/merged_score_"$output_keyword".temp 15 | -------------------------------------------------------------------------------- /run/archive/edge_rec_eval_pytorch_out.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | score_dir=$1 # the directory containing all score files (9 in the DBLP case) your model generates 4 | score_keyword=$2 # the common keyword these score files have. Note that all files containing this keyword will be read in; so please make sure no other files in score_dir contains this keyword 5 | output_keyword=$3 6 | 7 | time_start=$(date +"%Y%m%d_%H%M%S") 8 | 9 | #python2 ../aux/merge_edges_with_all_types.py --input-ref-file ../input_data/dblp_0.1_out_20neg_eval.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file ../intermediate_data/merged_score.temp 10 | #python3 ../eval/edge_rec_eval_score_provided.py --score-file ../intermediate_data/merged_score.temp --eval-file ../input_data/dblp_0.1_out_20neg_eval.txt > ../output/out_"$score_keyword"_"$time_start".txt 11 | #python2 ../aux/merge_edges_with_all_types.py --input-ref-file ../input_data/dblp_0.1_out_filtered_20neg_eval.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file ../intermediate_data/merged_score.temp 12 | #python3 ../eval/edge_rec_eval_score_provided.py --score-file ../intermediate_data/merged_score.temp --eval-file ../input_data/dblp_0.1_out_filtered_20neg_eval.txt > ../output/out_"$output_keyword"_"$time_start".txt 13 | 14 | 15 | python2 ../aux/merge_edges_with_all_types.py --input-ref-file ../input_data/dblp_0.2_out_downsampled_20neg_eval.txt --input-score-dir $score_dir --input-score-keywords $score_keyword --output-file ../intermediate_data/merged_score_"$output_keyword".temp 16 | python3 ../eval/edge_rec_eval_score_provided.py --score-file ../intermediate_data/merged_score_"$output_keyword".temp --eval-file ../input_data/dblp_0.2_out_downsampled_20neg_eval.txt > ../output/out_"$output_keyword"_"$time_start".txt 17 | 18 | rm ../intermediate_data/merged_score_"$output_keyword".temp 19 | #rm ../intermediate_data/merged_score.temp 20 | -------------------------------------------------------------------------------- /run/archive/pretrain_and_eval_yago_hins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 0.4 #0.6 0.8 #0.2 0.1 0.5 # 4 | do 5 | for smp in 5000 10000 6 | #for smp in 100 200 500 1000 2000 5000 10000 7 | do 8 | ( 9 | #/data/yushi2/aspect_embedding_codes/baselines/line/line -train 
../input_data/yago_"$i"_out_for_line.net -output ../intermediate_data/yago_"$i"_out_line_samples"$smp"_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 10 | python3 /shared/data/yushi2/edge_rep_codes/eval/yago_mrr_from_embedding.py --sample-number 10 --input-embedding ../intermediate_data/yago_"$i"_out_line_samples"$smp"_dim128.emb --input-record ../input_data/yago_"$i"_out_20neg_eval.txt > ../output/yago_"$i"_out_pretrain_"$smp"_128.txt 11 | ) & 12 | done 13 | done 14 | wait 15 | 16 | -------------------------------------------------------------------------------- /run/archive/pretrain_and_eval_yago_hins_qi_filtered.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 0.4 #0.6 0.8 #0.2 0.1 0.5 # 4 | do 5 | for smp in 5000 10000 6 | #for smp in 100 200 500 1000 2000 5000 10000 7 | do 8 | ( 9 | #/data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_"$i"_out_for_line.net -output ../intermediate_data/yago_"$i"_out_line_samples"$smp"_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 10 | python3 /shared/data/yushi2/edge_rep_codes/eval/yago_mrr_from_embedding.py --sample-number 10 --input-embedding ../intermediate_data/yago_"$i"_out_line_samples"$smp"_dim128.emb --input-record ../input_data/yago_0.4_out_20neg_eval_qi_filtered.txt > ../output/yago_"$i"_out_pretrain_qi_filtered_"$smp"_128.txt 11 | ) & 12 | done 13 | done 14 | wait 15 | 16 | -------------------------------------------------------------------------------- /run/archive/pretrain_dblp_aspem.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 0.2 4 | do 5 | #for smp in 5000 10000 20000 50000 100000 200000 6 | for smp in 500 1000 2000 5000 10000 20000 7 | do 8 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_"$i"_out_for_aspem_pay_normalized.net -output ../intermediate_data/dblp_"$i"_out_aspem_samples"$smp"_pay_normalized.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 9 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_"$i"_out_for_aspem_papvw_normalized.net -output ../intermediate_data/dblp_"$i"_out_aspem_samples"$smp"_papvw_normalized.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 10 | python ../aux/merge_score_aspem.py ../intermediate_data/dblp_"$i"_out_aspem_samples"$smp"_pay_normalized.emb ../intermediate_data/dblp_"$i"_out_aspem_samples"$smp"_papvw_normalized.emb ../intermediate_data/dblp_"$i"_out_aspem_samples"$smp"_normalized 11 | done 12 | done 13 | 14 | -------------------------------------------------------------------------------- /run/archive/pretrain_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | smp=$1 4 | out_rate=$2 5 | 6 | echo "No additional fillters applied." 
7 | 8 | python3 ../eval/edge_rec_eval_inner_prod.py --eval-file ../input_data/dblp_"$out_rate"_out_20neg_eval_fast.txt --emb-file ../intermediate_data/dblp_"$out_rate"_out_line_samples"$smp"_dim128.emb > ../output/"$out_rate"_out_"$smp"_128.txt 9 | #python3 ../eval/edge_rec_eval_inner_prod.py --eval-file ../input_data/dblp_"$out_rate"_out_filtered_20neg_eval.txt --emb-file ../intermediate_data/dblp_"$out_rate"_out_line_samples"$smp"_dim128.emb >> ../output/out_"$smp"_128.txt 10 | -------------------------------------------------------------------------------- /run/archive/pretrain_eval_0.2_downsampled.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | smp=$1 4 | out_rate=0.2 5 | 6 | 7 | python3 ../eval/edge_rec_eval_inner_prod.py --eval-file ../input_data/dblp_"$out_rate"_out_downsampled_20neg_eval.txt --emb-file ../intermediate_data/dblp_"$out_rate"_out_downsampled_line_samples"$smp"_dim128.emb > ../output/"$out_rate"_out_downsampled_"$smp"_128.txt 8 | 9 | -------------------------------------------------------------------------------- /run/archive/pretrain_more_dblp_varying_knowout_rate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 50000 4 | do 5 | #/data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.4_out.net -output ../intermediate_data/dblp_0.4_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 6 | #/data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.6_out.net -output ../intermediate_data/dblp_0.6_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 7 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.8_out.net -output ../intermediate_data/dblp_0.8_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 8 | done 9 | -------------------------------------------------------------------------------- /run/archive/pretrain_more_hins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 5000 10000 20000 50000 100000 200000 4 | do 5 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.2_out.net -output ../intermediate_data/dblp_0.2_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 20 6 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.5_out.net -output ../intermediate_data/dblp_0.5_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 20 7 | done 8 | -------------------------------------------------------------------------------- /run/archive/pretrain_more_hins_0.2_downsampled.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 500 1000 2000 5000 10000 20000 4 | do 5 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.2_out_downsampled.net -output ../intermediate_data/dblp_0.2_out_downsampled_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 20 6 | done 7 | -------------------------------------------------------------------------------- 
/run/archive/pretrain_more_hins_0.2_downsampled_keyless.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 500 1000 2000 5000 10000 20000 100000 4 | do 5 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/dblp_0.2_out_downsampled_keyless.net -output ../intermediate_data/dblp_0.2_out_downsampled_keyless_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 20 6 | done 7 | -------------------------------------------------------------------------------- /run/archive/pretrain_more_yago_varying_knowout_rate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 5000 10000 4 | do 5 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_0.4_out_for_line.net -output ../intermediate_data/yago_0.4_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 6 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_0.6_out_for_line.net -output ../intermediate_data/yago_0.6_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 7 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_0.8_out_for_line.net -output ../intermediate_data/yago_0.8_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 8 | done 9 | -------------------------------------------------------------------------------- /run/archive/pretrain_no_gender_yago_varying_knowout_rate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 5000 10000 4 | do 5 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_no_gender_0.2_out_for_line.net -output ../intermediate_data/yago_no_gender_0.2_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 6 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_no_gender_0.4_out_for_line.net -output ../intermediate_data/yago_no_gender_0.4_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 7 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_no_gender_0.6_out_for_line.net -output ../intermediate_data/yago_no_gender_0.6_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 8 | /data/yushi2/aspect_embedding_codes/baselines/line/line -train ../input_data/yago_no_gender_0.8_out_for_line.net -output ../intermediate_data/yago_no_gender_0.8_out_line_samples"$smp"_alpha0.1_dim128.emb -size 128 -order 1 -negative 5 -samples "$smp" -alpha 0.1 -threads 30 9 | done 10 | -------------------------------------------------------------------------------- /run/archive/re_eval_dblp_all_ko_rates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for smp in 5000 10000 20000 50000 100000 200000 4 | do 5 | for i in 0.2 0.4 0.6 0.8 6 | do 7 | python3 ../eval/mrr_from_embedding.py --input-record ../input_data/dblp_"$i"_out_20neg_eval_fast.txt --input-embedding ../intermediate_data/dblp_"$i"_out_line_samples"$smp"_alpha0.1_dim128.emb --sample-number 10 > 
../output/dblp_"$i"_out_pretrain_"$smp"_128.txt & 8 | done 9 | done 10 | wait 11 | -------------------------------------------------------------------------------- /run/archive/rename_edge_types.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # to be called from input_data/ 4 | 5 | for filename in *dblp* 6 | do 7 | ( 8 | sed -i 's/ PA/ PA:u/g' "$filename" 9 | sed -i 's/ PV/ PV:u/g' "$filename" 10 | sed -i 's/ PW/ PW:u/g' "$filename" 11 | sed -i 's/ PY/ PY:u/g' "$filename" 12 | sed -i 's/ PP/ PP:d/g' "$filename" 13 | ) & 14 | done 15 | wait 16 | 17 | for filename in *yago* 18 | do 19 | ( 20 | sed -i 's/ 1$/ :u/g' "$filename" 21 | sed -i 's/ 2$/ :u/g' "$filename" 22 | sed -i 's/ 3$/ :u/g' "$filename" 23 | sed -i 's/ 6$/ :u/g' "$filename" 24 | sed -i 's/ 9$/ :u/g' "$filename" 25 | sed -i 's/ 10$/ :u/g' "$filename" 26 | sed -i 's/ 11$/ :u/g' "$filename" 27 | sed -i 's/ 12$/ :u/g' "$filename" 28 | sed -i 's/ 13$/ :u/g' "$filename" 29 | sed -i 's/ 15$/ :d/g' "$filename" 30 | sed -i 's/ 17$/ :u/g' "$filename" 31 | sed -i 's/ 20$/ :u/g' "$filename" 32 | sed -i 's/ 21$/ :u/g' "$filename" 33 | sed -i 's/ 25$/ :u/g' "$filename" 34 | sed -i 's/ 26$/ :u/g' "$filename" 35 | sed -i 's/ 27$/ :u/g' "$filename" 36 | sed -i 's/ 29$/ :d/g' "$filename" 37 | sed -i 's/ 30$/ :d/g' "$filename" 38 | sed -i 's/ 31$/ :u/g' "$filename" 39 | sed -i 's/ 33$/ :u/g' "$filename" 40 | sed -i 's/ 35$/ :d/g' "$filename" 41 | sed -i 's/ 36$/ :u/g' "$filename" 42 | sed -i 's/ 38$/ :u/g' "$filename" 43 | sed -i 's/ 39$/ :d/g' "$filename" 44 | sed -i 's/ 1-1$/ :u-1/g' "$filename" 45 | sed -i 's/ 2-1$/ :u-1/g' "$filename" 46 | sed -i 's/ 3-1$/ :u-1/g' "$filename" 47 | sed -i 's/ 6-1$/ :u-1/g' "$filename" 48 | sed -i 's/ 9-1$/ :u-1/g' "$filename" 49 | sed -i 's/ 10-1$/ :u-1/g' "$filename" 50 | sed -i 's/ 11-1$/ :u-1/g' "$filename" 51 | sed -i 's/ 12-1$/ :u-1/g' "$filename" 52 | sed -i 's/ 13-1$/ :u-1/g' "$filename" 53 | sed -i 's/ 15-1$/ :d-1/g' "$filename" 54 | sed -i 's/ 17-1$/ :u-1/g' "$filename" 55 | sed -i 's/ 20-1$/ :u-1/g' "$filename" 56 | sed -i 's/ 21-1$/ :u-1/g' "$filename" 57 | sed -i 's/ 25-1$/ :u-1/g' "$filename" 58 | sed -i 's/ 26-1$/ :u-1/g' "$filename" 59 | sed -i 's/ 27-1$/ :u-1/g' "$filename" 60 | sed -i 's/ 29-1$/ :d-1/g' "$filename" 61 | sed -i 's/ 30-1$/ :d-1/g' "$filename" 62 | sed -i 's/ 31-1$/ :u-1/g' "$filename" 63 | sed -i 's/ 33-1$/ :u-1/g' "$filename" 64 | sed -i 's/ 35-1$/ :d-1/g' "$filename" 65 | sed -i 's/ 36-1$/ :u-1/g' "$filename" 66 | sed -i 's/ 38-1$/ :u-1/g' "$filename" 67 | sed -i 's/ 39-1$/ :d-1/g' "$filename" 68 | ) & 69 | done 70 | wait -------------------------------------------------------------------------------- /run/eval_heer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # e.g.: ./eval_heer.sh yago_ko_0.1 10 diag bn 4 | 5 | time_start=$(date +"%Y%m%d_%H%M%S") 6 | 7 | # find relative root directory 8 | SOURCE="${BASH_SOURCE[0]}" 9 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 10 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 11 | SOURCE="$(readlink "$SOURCE")" 12 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 13 | done 14 | script_dir="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 15 | root_dir="$( dirname $script_dir )" 16 | 17 | # input variables 18 | network=$1 # a.k.a. 
graph_name; e.g., yago_ko_0.2 19 | epoch=$2 # number of epochs 20 | operator=$3 # operator used to compose edge embedding from node embeddings 21 | map=$4 # mapping on top of edge embedding 22 | more_param=$5 # more customized parameters 23 | 24 | # optional argument specifying model evaluation start time when evaluating multiple epochs from the same model 25 | # null if evaluating a single file 26 | per_epoch_eval_time_start=${6:-null} 27 | 28 | # files 29 | score_file="$root_dir"/intermediate_data/heer_"$network"_"$epoch"_"$operator"_"$map"_"$more_param".txt 30 | fast_eval_file="$root_dir"/input_data/"$network"_eval_fast.txt 31 | if [ -f "$fast_eval_file" ]; then 32 | eval_file="$fast_eval_file" 33 | else 34 | echo "File $fast_eval_file does not exist. Using non-fast version for evaluation." 35 | eval_file="$root_dir"/input_data/"$network"_eval.txt 36 | fi 37 | output_file="$root_dir"/output/out_heer_"$network"_"$epoch"_"$operator"_"$map"_"$more_param"_"$time_start".txt 38 | 39 | python3 "$root_dir"/eval/mrr_from_score.py --input-score-file $score_file --input-eval-file $eval_file > "$output_file" 40 | 41 | # when the 5th arg, per_epoch_eval_time_start, is specified, multi-epoch is called by "$root_dir"/src/eval.sh; two more files should be generated 42 | if [ "$per_epoch_eval_time_start" != "null" ]; then 43 | tail -n 1 "$output_file" | awk '{print $NF}' >> "$root_dir"/output/mrr_micro_heer_"$network"_"$operator"_"$map"_"$more_param"_"$per_epoch_eval_time_start".txt 44 | tail -n 2 "$output_file" | head -n 1 | awk '{print $NF}' >> "$root_dir"/output/mrr_macro_heer_"$network"_"$operator"_"$map"_"$more_param"_"$per_epoch_eval_time_start".txt 45 | fi 46 | -------------------------------------------------------------------------------- /run/knock_out_hin_and_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # e.g.: ./knock_out_hin_and_pretrain.sh ../input_data/yago_original.hin yago 0.1 4 | 5 | # find relative root directory 6 | SOURCE="${BASH_SOURCE[0]}" 7 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 8 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 9 | SOURCE="$(readlink "$SOURCE")" 10 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 11 | done 12 | script_dir="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 13 | root_dir="$( dirname $script_dir )" 14 | 15 | # input variables 16 | input_hin=$1 # the path to the complete HIN with format of knocked_out_hin_file 17 | hin_name=$2 18 | ko_rate=$3 19 | if (( $# == 4 )); then 20 | num_edge_smp=$4 # number of million edges sampled by LINE 21 | else 22 | if [[ "$hin_name" = *"yago"* ]]; then 23 | num_edge_smp=10000 24 | else 25 | if [[ "$hin_name" = *"dblp"* ]]; then 26 | num_edge_smp=100000 27 | else 28 | echo "Sampling 100000 edges for pretrain using LINE. This variable can be specified as the fourth argument." 
29 | num_edge_smp=100000 30 | fi 31 | fi 32 | fi 33 | 34 | # files 35 | knocked_out_hin_file="$root_dir"/input_data/"$hin_name"_ko_"$ko_rate".hin 36 | eval_file="$root_dir"/input_data/"$hin_name"_ko_"$ko_rate"_eval.txt 37 | fast_eval_file="$root_dir"/input_data/"$hin_name"_ko_"$ko_rate"_eval_fast.txt 38 | knocked_out_hin_file_for_line="$knocked_out_hin_file".temp 39 | pretrained_emb="$root_dir"/intermediate_data/pretrained_"$hin_name"_ko_"$ko_rate".emb 40 | 41 | # knock out HIN 42 | python3 "$root_dir"/preprocessing/ko_hin.py --input-hin-file "$input_hin" --data-set-name "$hin_name" --path-output "$root_dir"/input_data --ko-rate "$ko_rate" 43 | 44 | # down sample eval file to generate the fast version 45 | python2 "$root_dir"/aux/downsample_eval_file.py --input-file "$eval_file" --output-file "$fast_eval_file" 46 | 47 | # pretrain by LINE 48 | awk '{print $1, $2, $3}' "$knocked_out_hin_file" > "$knocked_out_hin_file_for_line" 49 | "$root_dir"/pretrain/line -train "$knocked_out_hin_file_for_line" -output "$pretrained_emb" -size 128 -order 1 -negative 5 -samples "$num_edge_smp" -threads 10 50 | rm "$knocked_out_hin_file_for_line" 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/case.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #e.g. bash ./src/case.sh yago_ko_0.4 43 1 0 rescale_0.1_lr_10_lrr_10 input_data/0.1_46059_30292_full.hin 3 6 4 | 5 | time_start=$(date +"%Y%m%d_%H%M%S") 6 | 7 | green=`tput setaf 2` 8 | red=`tput setaf 1` 9 | yellow=`tput setaf 3` 10 | reset=`tput sgr0` 11 | 12 | # input variables 13 | network=$1 # a.k.a. graph_name; e.g., yago_ko_0.2 14 | epoch=$2 # number of epochs 15 | operator=$3 # operator used to compose edge embedding from node embeddings 16 | map=$4 # mapping on top of edge embedding 17 | more_param=$5 # more customized parameters 18 | 19 | sub_net=$6 20 | gpu=$7 # working gpu for prediction 21 | dump_timer=${8:-2} # default dump timer 22 | 23 | 24 | # find relative root directory 25 | SOURCE="${BASH_SOURCE[0]}" 26 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 27 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 28 | SOURCE="$(readlink "$SOURCE")" 29 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 30 | done 31 | script_dir="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 32 | root_dir="$( dirname $script_dir )" 33 | 34 | echo ${yellow}===HEER Testing===${reset} 35 | 36 | curr_step=0 37 | until [ $curr_step -gt $((epoch - 1)) ]; do 38 | echo $curr_step 39 | python2 "$root_dir"/src/pred_case.py --iter=$curr_step --batch-size=128 --dimensions=128 --graph-name=$network --data-dir="$root_dir"/intermediate_data/ --model-dir="$root_dir"/intermediate_data/model/ \ 40 | --pre-train-path="$root_dir"/intermediate_data/pretrained_"$network".emb --more-param="$more_param" \ 41 | --map_func=$map --gpu=$gpu --op=$operator --test-dir="$root_dir"/intermediate_data/ --sub-net=$sub_net 42 | let " curr_step += dump_timer " 43 | done -------------------------------------------------------------------------------- /src/decoder.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import numpy as np 3 | import cPickle 4 | import sys 5 | 6 | def myfmt(r): 7 | return "%.6f" % (r,) 8 | 9 | def decode(offset, out_mapping, address): 10 | offset = list(offset.iteritems()) 
11 | offset.sort(key=lambda x: x[1]) 12 | for k, i in enumerate(offset): 13 | if address >= i[1] and address < offset[k+1][1]: 14 | return i[0]+':'+str(out_mapping[i[0]][address-i[1]]) 15 | 16 | 17 | if __name__ == '__main__': 18 | model = t.load('/shared/data/qiz3/data/model/diag_' + sys.argv[1] +'.pt') 19 | prefix = 'dblp_' 20 | emb = model.input_embeddings() 21 | offset = cPickle.load(open('/shared/data/qiz3/data/' + prefix + 'offset.p')) 22 | out_mapping = cPickle.load(open('/shared/data/qiz3/data/' + prefix + 'out_mapping.p')) 23 | with open('/shared/data/qiz3/data/hin_' + sys.argv[2] + '.emb', 'w') as IN: 24 | for i in xrange(emb.shape[0]): 25 | prefix = decode(offset, out_mapping, i) 26 | if 'cp' in prefix: 27 | break 28 | #IN.write(decode(offset, out_mapping, i) +' '+ np.array2string(emb[i,:], prefix='',separator=' ', precision=6)+'\n') 29 | vecfmt = np.vectorize(myfmt) 30 | IN.write(prefix +' '+ ' '.join(vecfmt(emb[i,:]).tolist())+'\n') 31 | #print(np.ndarray.tolist(np.around(emb[i,:], decimals=6))) 32 | if i % 100000 == 0: 33 | print(i) 34 | #break -------------------------------------------------------------------------------- /src/emb_lib.py: -------------------------------------------------------------------------------- 1 | import neg 2 | import torch.optim as optim 3 | import torch as t 4 | import numpy as np 5 | import torch.utils.data as tdata 6 | import utils 7 | import math 8 | import cPickle 9 | from tqdm import tqdm 10 | 11 | # support heterogenous node type 12 | # first consider about first order proximity 13 | # different node type and edge type mapping function 14 | 15 | class SkipGram(object): 16 | """pytorch implementation for SkipGram""" 17 | def __init__(self, arg): 18 | super(SkipGram, self).__init__() 19 | type_offset = cPickle.load(open(arg['data_dir'] + arg['graph_name'] + '_offset.p')) 20 | self.neg_loss = neg.NEG_loss(type_offset=type_offset, node_types=arg['node_types'], edge_types=arg['edge_types'], 21 | embed_size=arg['emb_size'], pre_train_path=arg['pre_train'], 22 | graph_name=arg['graph_name'], mode=arg['mode'], map_mode=arg['map_mode']) 23 | self.input = arg['network'] 24 | #print('edge layer learning rate is:', arg['lr'] * (float(len(arg['edge_types'])) / type_offset['sum'])) 25 | edge_stats = cPickle.load(open(arg['data_dir'] + arg['graph_name'] + '_edge_stat.p')) 26 | #print(edge_stats) 27 | self.mode = arg['mode'] 28 | self.map_mode = arg['map_mode'] 29 | self.dump_timer = arg['dump_timer'] 30 | self.model_dir = arg['model_dir'] 31 | self.log_dir = arg['log_dir'] 32 | self.more_param = arg['more_param'] 33 | self.fine_tune = arg['fine_tune'] 34 | self.lr = arg['lr'] 35 | self._params = [] 36 | if self.map_mode != -1: 37 | self._params = [{'params': self.neg_loss.in_embed.parameters()}, 38 | {'params': self.neg_loss.out_embed.parameters()}] 39 | for i in xrange(len(self.neg_loss.edge_mapping)): 40 | self._params.append({'params': self.neg_loss.edge_mapping[i].parameters(), 41 | 'lr': arg['lr'] * arg['lr_ratio'] * (float(len(self.input))) / (type_offset['sum'] * edge_stats[i] + 1e-6)}) 42 | 43 | self.window_size = arg['window_size'] 44 | self.graph_name = arg['graph_name'] 45 | 46 | self.data = tdata.DataLoader(self.input, arg['batch_size'], shuffle=True) 47 | self.batch_size = arg['batch_size'] 48 | self.iter = arg['iter'] 49 | self.neg_ratio = arg['neg_ratio'] 50 | 51 | # support fine tune 52 | def freeze_embedding(self): 53 | self.SGD = optim.SGD(self.neg_loss.parameters(), lr = self.lr) 54 | for param in self.neg_loss.in_embed.parameters(): 55 
| param.requires_grad = False 56 | for param in self.neg_loss.out_embed.parameters(): 57 | param.requires_grad = False 58 | 59 | # support fine tune 60 | def update_embedding(self): 61 | for param in self.neg_loss.in_embed.parameters(): 62 | param.requires_grad = True 63 | for param in self.neg_loss.out_embed.parameters(): 64 | param.requires_grad = True 65 | if self.map_mode != -1: 66 | self.SGD = optim.SGD(self._params, lr = self.lr) 67 | else: 68 | self.SGD = optim.SGD(self.neg_loss.parameters(), lr = self.lr) 69 | 70 | def train(self): 71 | self.neg_loss.train() 72 | self.freeze_embedding() 73 | with open(self.log_dir + 'heer_' + self.graph_name + '_op_' + str(self.mode) + 74 | '_mode_' + str(self.map_mode)+ '_' + self.more_param + '.log', 'w') as LOG: 75 | for epoch in xrange(self.iter): 76 | loss_sum = 0 77 | if epoch == self.fine_tune: 78 | self.update_embedding() 79 | if self.map_mode > 1: 80 | print("finish fine tuning for deep edge archs") 81 | for i, data in enumerate(self.data, 0): 82 | inputs, labels = data 83 | loss, pure_loss = self.neg_loss(inputs, labels, self.neg_ratio) 84 | 85 | if np.isnan(loss.data.cpu().numpy()): 86 | return -1 87 | loss_sum += pure_loss * self.batch_size 88 | self.SGD.zero_grad() 89 | 90 | loss.backward() 91 | 92 | # utils.clip_sparse_grad_norm(self.neg_loss.in_embed.parameters(), 0.1) 93 | # utils.clip_sparse_grad_norm(self.neg_loss.out_embed.parameters(), 0.1) 94 | 95 | #for i in xrange(len(self.neg_loss.edge_mapping)): 96 | # utils.clip_grad_norm(self.neg_loss.edge_mapping[i].parameters(), 0.1) 97 | 98 | self.SGD.step() 99 | 100 | 101 | if epoch % self.dump_timer == 0: 102 | if self.more_param != 'None': 103 | model_path = self.model_dir + 'heer_' + self.graph_name + '_' + str(epoch) + '_op_' + str(self.mode) + \ 104 | '_mode_' + str(self.map_mode)+ '_' + self.more_param + '.pt' 105 | else: 106 | model_path = self.model_dir + 'heer_' + self.graph_name + '_' + str(epoch) + '_op_' + str(self.mode) + \ 107 | '_mode_' + str(self.map_mode)+ '.pt' 108 | t.save(self.neg_loss.state_dict(), model_path) 109 | 110 | LOG.write(str(epoch) + '\t' + str(np.asscalar(loss_sum.data.cpu().numpy())) + '\n') 111 | LOG.flush() 112 | 113 | #return i_sum, o_sum, e_sum 114 | #pbar.close() 115 | #print(self.neg_loss.input_embeddings()[0,:]) 116 | #print(epoch, loss_sum) 117 | 118 | def output(self): 119 | word_embeddings = self.neg_loss.input_embeddings() 120 | -------------------------------------------------------------------------------- /src/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #e.g. bash ./src/eval.sh yago_ko_0.4 0 1 -1 yago 7 4 | 5 | time_start=$(date +"%Y%m%d_%H%M%S") 6 | 7 | green=`tput setaf 2` 8 | red=`tput setaf 1` 9 | yellow=`tput setaf 3` 10 | reset=`tput sgr0` 11 | 12 | # input variables 13 | network=$1 # a.k.a. 
graph_name; e.g., yago_ko_0.2 14 | epoch=$2 # number of epochs 15 | operator=${3:-1} # operator used to compose edge embedding from node embeddings 16 | map=${4:-0} # mapping on top of edge embedding 17 | more_param=${5:-rescale_0.1_lr_10_lrr_10} # more customized parameters 18 | gpu=${6:-0} # working gpu for prediction 19 | dump_timer=${7:-6} # default dump timer 20 | 21 | # find relative root directory 22 | SOURCE="${BASH_SOURCE[0]}" 23 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 24 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 25 | SOURCE="$(readlink "$SOURCE")" 26 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 27 | done 28 | script_dir="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 29 | root_dir="$( dirname $script_dir )" 30 | 31 | curr_step=0 32 | until [ $curr_step -gt $((epoch - 1)) ]; do 33 | echo $curr_step 34 | 35 | python "$root_dir"/src/pred.py --iter=$curr_step --batch-size=128 --dimensions=128 --graph-name=$network --data-dir="$root_dir"/intermediate_data/ --model-dir="$root_dir"/intermediate_data/model/ \ 36 | --pre-train-path="$root_dir"/intermediate_data/pretrained_"$network".emb --more-param="$more_param" \ 37 | --map_func=$map --gpu=$gpu --op=$operator 38 | let " curr_step += dump_timer " 39 | done 40 | 41 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | from emb_lib import SkipGram 5 | import network as nx 6 | import torch as t 7 | import cPickle 8 | import utils 9 | import torch.utils.data as tdata 10 | 11 | global config 12 | 13 | def parse_args(): 14 | ''' 15 | Parses the heer arguments. 16 | ''' 17 | parser = argparse.ArgumentParser(description="Run heer.") 18 | 19 | parser.add_argument('--more-param', nargs='?', default='None', 20 | help='customized parameter setting') 21 | 22 | parser.add_argument('--input', nargs='?', 23 | help='Input graph path') 24 | 25 | parser.add_argument('--gpu', nargs='?', default='0', 26 | help='Embeddings path') 27 | 28 | parser.add_argument('--dimensions', type=int, default=128, 29 | help='Number of dimensions. Default is 128.') 30 | 31 | parser.add_argument('--batch-size', type=int, default=50, 32 | help='Batch size. Default is 50.') 33 | 34 | parser.add_argument('--window-size', type=int, default=1, 35 | help='Context size for optimization. 
Default is 10.') 36 | 37 | parser.add_argument('--pre-train-path', type=str, default='', 38 | help='embedding initialization') 39 | parser.add_argument('--pre-load-model', type=str, default=None, 40 | help='module initialization') 41 | 42 | parser.add_argument('--build-graph', type=bool, default=False, 43 | help='heterogeneous information network construction') 44 | 45 | parser.add_argument('--graph-name', type=str, default='', 46 | help='prefix of dumped data') 47 | parser.add_argument('--data-dir', type=str, default='', 48 | help='data directory') 49 | parser.add_argument('--model-dir', type=str, default='', 50 | help='model directory') 51 | parser.add_argument('--log-dir', type=str, default='', 52 | help='log directory') 53 | parser.add_argument('--fine-tune', type=int, default=0, 54 | help='fine tune phase') 55 | 56 | parser.add_argument('--node-types', type=list, default=['PR', 'AD', 'WO', 'AS', 'GE', 'PE', 'EV', 'PO']) 57 | 58 | parser.add_argument('--edge-types', type=list, default=[(1,0),(1,1),(1,2), (1,3),(1,4)]) 59 | #parser.add_argument('--edge-types', type=list, default=[(0,5),(1,5),(2,5),(3,5),(4,5)]) 60 | 61 | parser.add_argument('--iter', default=500, type=int, 62 | help='Number of epochs in SGD') 63 | parser.add_argument('--op', default=0, type=int) 64 | parser.add_argument('--map_func', default=0, type=int) 65 | parser.add_argument('--dump-timer', default=5, type=int) 66 | 67 | parser.add_argument('--weighted', dest='weighted', action='store_true', 68 | help='Boolean specifying (un)weighted. Default is unweighted.') 69 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 70 | parser.set_defaults(weighted=False) 71 | 72 | parser.add_argument('--directed', dest='directed', action='store_true', 73 | help='Graph is (un)directed. Default is undirected.') 74 | parser.add_argument('--undirected', dest='undirected', action='store_false') 75 | parser.set_defaults(directed=False) 76 | 77 | return parser.parse_args() 78 | 79 | 80 | def learn_embeddings(): 81 | ''' 82 | Learn embeddings by optimizing the Skipgram objective using SGD. 83 | ''' 84 | print('Network Spec:',config) 85 | 86 | # flexible param interface for tuning 87 | more_param = args.more_param # everything separated by underscore, e.g., rescale_0.1_lr_0.02 88 | more_param_dict = {} # {param_key: param_value_str} 89 | if more_param != 'None': 90 | more_param_list = more_param.split("_") 91 | assert len(more_param_list) % 2 == 0 92 | for i in xrange(0, len(more_param_list), 2): 93 | more_param_dict[more_param_list[i]] = more_param_list[i+1] 94 | rescale_factor = 1. if 'rescale' not in more_param_dict else float(more_param_dict['rescale']) 95 | learning_rate = 1. if 'lr' not in more_param_dict else float(more_param_dict['lr']) # please keep default values consistent with records on our google spreadsheet 96 | learning_rate_ratio = 16. 
if 'lrr' not in more_param_dict else float(more_param_dict['lrr']) # please keep default values consistent with records on our google spreadsheet 97 | 98 | _data = '' 99 | if len(args.pre_train_path) > 0: 100 | _data = rescale_factor * utils.load_emb(args.data_dir, args.pre_train_path, args.dimensions, args.graph_name, config['nodes']) 101 | _network = tdata.TensorDataset(t.LongTensor(cPickle.load(open(args.data_dir + args.graph_name + '_input.p'))), 102 | t.LongTensor(cPickle.load(open(args.data_dir + args.graph_name + '_output.p')))) 103 | model = SkipGram({'emb_size':args.dimensions, 104 | 'window_size':1, 'batch_size':args.batch_size, 'iter':args.iter, 'neg_ratio':5, 105 | 'graph_name':args.graph_name, 'dump_timer':args.dump_timer, 'model_dir':args.model_dir, 'log_dir':args.log_dir, 106 | 'data_dir':args.data_dir, 'mode':args.op, 'map_mode':args.map_func,'fine_tune':args.fine_tune, 107 | 'lr_ratio':learning_rate_ratio, 'lr': learning_rate, 'network':_network, 'more_param': args.more_param, 108 | 'pre_train':_data, 'node_types':config['nodes'], 'edge_types':config['edges']}) 109 | 110 | if args.pre_load_model: 111 | pre_load_model = t.load(args.pre_load_model, map_location=lambda storage, loc: storage) 112 | model.neg_loss.load_state_dict(pre_load_model) 113 | model.cuda() 114 | 115 | model.train() 116 | 117 | return 118 | 119 | def main(args): 120 | ''' 121 | Pipeline for representational learning for all nodes in a graph. 122 | ''' 123 | global config 124 | config_name = os.path.join(args.data_dir.replace('intermediate', 'input'), args.graph_name.split('_ko_')[0] + '.config') 125 | config = utils.read_config(config_name) 126 | if args.build_graph: 127 | #print(args.node_types) 128 | tmp = nx.HinLoader({'graph': args.input, 'types':config['nodes'], 'edge_types':config['edges']}) 129 | tmp.readHin(config['types']) 130 | tmp.encode() 131 | tmp.dump(args.data_dir + args.graph_name) 132 | #print(args.edge_types) 133 | else: 134 | learn_embeddings() 135 | 136 | #for YaGo 137 | def load_aspect(args): 138 | total_types = [(5, 2), (5, 5), (5, 2), (5, 2), (6, 1), (5, 5), (5, 3), (5, 1), (5, 3), 139 | (5, 7), (5, 2), (5, 4), (5, 1), (3, 1), (5, 3), (5, 1), (1, 1), (5, 0), (1, 1), (5, 1), 140 | (5, 1), (5, 5), (5, 5), (5, 2), (5, 5)] 141 | #edge_type_id = ['3' , '13', '17', '10', '33', '1' , '11', '2' , '26', '31', '29', '30', '35', '9' , '27', '25', '20', '36', '12', '6' , '39', '15', '38', '21'] 142 | edge_type_id = ['25', '26', '27', '20', '21', '29', '1', '3', '2', '6', '9', '8', '13', '38', '11', '10', '39', '12', '15', '17', '33', '31', '30', '36', '35'] 143 | aspects = { 144 | 'PE-LOC':['26', '29', '15', '39', '13', '10'], 145 | 'PE-WO':['20', '25', '27', '9', '36'], 146 | 'complex':['15', '39', '38', '1', '33', '13', '10', '6'] 147 | } 148 | for aspect in aspects: 149 | tmp = nx.HinLoader({'graph': args.input, 'types':args.node_types, 'edge_types':total_types, 'edge_ids':aspects[aspect]}) 150 | tmp.readHin() 151 | tmp.encode() 152 | tmp.dump('/shared/data/qiz3/data/' + args.graph_name+aspect+'_') 153 | 154 | if __name__ == "__main__": 155 | args = parse_args() 156 | #read_hin(args.input) 157 | t.cuda.set_device(int(args.gpu)) 158 | main(args) 159 | #load_aspect(args) 160 | -------------------------------------------------------------------------------- /src/network.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | 3 | class HinLoader(object): 4 | """docstring for HinLoader""" 5 | def __init__(self, arg): 6 | self.in_mapping = 
dict() 7 | self.out_mapping = dict() 8 | self.input = list() 9 | self.output = list() 10 | self.arg = arg 11 | self.edge_stat = [0] * len(self.arg['edge_types']) 12 | #print(arg['types']) 13 | for k in arg['types']: 14 | self.in_mapping[k] = dict() 15 | self.out_mapping[k] = dict() 16 | #print(self.in_mapping.keys()) 17 | #print(self.out_mapping.keys()) 18 | 19 | def inNodeMapping(self, key, type): 20 | if key not in self.in_mapping[type]: 21 | self.out_mapping[type][len(self.in_mapping[type])] = key 22 | self.in_mapping[type][key] = len(self.in_mapping[type]) 23 | 24 | return self.in_mapping[type][key] 25 | 26 | def readHin(self, _edge_types): 27 | #num_nodes = defaultdict(int) 28 | with open(self.arg['graph']) as INPUT: 29 | for line in INPUT: 30 | edge = line.strip().split(' ') 31 | edge_type = _edge_types.index(edge[-1]) 32 | node_a = edge[0].split(':') 33 | node_b = edge[1].split(':') 34 | node_a_type = self.arg['types'].index(node_a[0]) 35 | node_b_type = self.arg['types'].index(node_b[0]) 36 | #assert edge_type != 11 37 | self.edge_stat[edge_type] += 1 38 | assert [node_a_type, node_b_type] == self.arg['edge_types'][edge_type][:2] 39 | self.input.append([edge_type, self.inNodeMapping(node_a[1], node_a[0])]) 40 | self.output.append([self.arg['types'].index(node_b[0]), self.inNodeMapping(node_b[1], node_b[0])]) 41 | 42 | def encode(self): 43 | self.encoder = dict() 44 | offset = 0 45 | for k in self.arg['types']: 46 | self.encoder[k] = offset 47 | offset += len(self.in_mapping[k]) 48 | self.encoder['sum'] = offset 49 | print(self.encoder) 50 | for i,ie in enumerate(self.input): 51 | self.input[i][1] += self.encoder[self.arg['types'][self.arg['edge_types'][ie[0]][0]]] 52 | for i,ie in enumerate(self.output): 53 | self.output[i][1] += self.encoder[self.arg['types'][ie[0]]] 54 | 55 | 56 | def dump(self, dump_path): 57 | print(self.edge_stat) 58 | cPickle.dump(self.encoder, open(dump_path + '_offset.p', 'wb')) 59 | cPickle.dump(self.input, open(dump_path + '_input.p', 'wb')) 60 | cPickle.dump(self.output, open(dump_path + '_output.p', 'wb')) 61 | cPickle.dump(self.in_mapping, open(dump_path + '_in_mapping.p', 'wb')) 62 | cPickle.dump(self.out_mapping, open(dump_path + '_out_mapping.p', 'wb')) 63 | cPickle.dump(self.edge_stat, open(dump_path + '_edge_stat.p', 'wb')) -------------------------------------------------------------------------------- /src/pred.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import numpy as np 3 | import cPickle 4 | import sys,os 5 | import neg 6 | import argparse 7 | import torch.utils.data as tdata 8 | import utils 9 | from tqdm import tqdm 10 | 11 | def parse_args(): 12 | ''' 13 | Parses the heer arguments. 14 | ''' 15 | parser = argparse.ArgumentParser(description="Run heer.") 16 | 17 | parser.add_argument('--more-param', nargs='?', default='None', 18 | help='customized parameter setting') 19 | 20 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 21 | help='Input graph path') 22 | 23 | parser.add_argument('--gpu', nargs='?', default='0', 24 | help='Embeddings path') 25 | 26 | parser.add_argument('--dimensions', type=int, default=128, 27 | help='Number of dimensions. Default is 128.') 28 | 29 | parser.add_argument('--batch-size', type=int, default=50, 30 | help='Batch size. Default is 50.') 31 | 32 | parser.add_argument('--window-size', type=int, default=1, 33 | help='Context size for optimization. 
Default is 10.') 34 | 35 | parser.add_argument('--pre-train-path', type=str, default='', 36 | help='embedding initialization') 37 | 38 | parser.add_argument('--build-graph', type=bool, default=False, 39 | help='heterogeneous information network construction') 40 | 41 | parser.add_argument('--graph-name', type=str, default='', 42 | help='prefix of dumped data') 43 | parser.add_argument('--data-dir', type=str, default='', 44 | help='data directory') 45 | parser.add_argument('--model-dir', type=str, default='', 46 | help='model directory') 47 | parser.add_argument('--test-dir', type=str, default='', 48 | help='test directory') 49 | 50 | parser.add_argument('--iter', default=500, type=int, 51 | help='Number of epochs in SGD') 52 | parser.add_argument('--op', default=0, type=int) 53 | parser.add_argument('--map_func', default=0, type=int) 54 | parser.add_argument('--fast', default=1, type=int) 55 | parser.add_argument('--dump-timer', default=5, type=int) 56 | 57 | parser.add_argument('--weighted', dest='weighted', action='store_true', 58 | help='Boolean specifying (un)weighted. Default is unweighted.') 59 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 60 | parser.set_defaults(weighted=False) 61 | 62 | parser.add_argument('--directed', dest='directed', action='store_true', 63 | help='Graph is (un)directed. Default is undirected.') 64 | parser.add_argument('--undirected', dest='undirected', action='store_false') 65 | parser.set_defaults(directed=False) 66 | 67 | return parser.parse_args() 68 | 69 | def load_mapping(input_file): 70 | id2name = dict() 71 | with open(input_file) as IN: 72 | for line in IN: 73 | tmp = line.strip().split('\t') 74 | id2name[int(tmp[1])] = tmp[0] 75 | return id2name 76 | 77 | if __name__ == '__main__': 78 | args = parse_args() 79 | arg = {} 80 | _data = '' 81 | config_name = os.path.join(os.path.dirname(args.data_dir).replace('intermediate', 'input'), args.graph_name.split('_ko_')[0] + '.config') 82 | config = utils.read_config(config_name) 83 | #config['nodes'] = ['PR', 'AD', 'WO', 'AS', 'GE', 'PE', 'EV', 'PO'] 84 | #config['edges'] = [(5, 2), (5, 5), (5, 2), (5, 2), (6, 1), (5, 5), (5, 3), (5, 1), (5, 3), (5, 7), (5, 2), (5, 4), (5, 1), (3, 1), (5, 3), (5, 1), (1, 1), (5, 0), (1, 1), (5, 1), (5, 1), (5, 5), (5, 5), (5, 2), (5, 5)] 85 | 86 | # baseline score 87 | if args.op == -1: 88 | _data = utils.load_emb(args.data_dir, args.pre_train_path, args.dimensions, args.graph_name, config['nodes']) 89 | #args.op = 1 90 | #print(_data) 91 | t.cuda.set_device(int(args.gpu)) 92 | 93 | 94 | type_offset = cPickle.load(open(args.data_dir + args.graph_name + '_offset.p')) 95 | in_mapping = cPickle.load(open(args.data_dir + args.graph_name + '_in_mapping.p')) 96 | out_mapping = cPickle.load(open(args.data_dir + args.graph_name + '_out_mapping.p')) 97 | model = neg.NEG_loss(type_offset=type_offset, node_types=config['nodes'], edge_types=config['edges'], 98 | embed_size=args.dimensions, pre_train_path=_data, graph_name=args.graph_name, 99 | mode=args.op, map_mode=args.map_func) 100 | 101 | 102 | 103 | #print(model.in_embed.weight.sum()) 104 | if args.op != -1: 105 | if args.more_param != 'None': 106 | model_path = args.model_dir + 'heer_' + args.graph_name + '_' + str(args.iter) + '_op_' + str(args.op) + \ 107 | '_mode_' + str(args.map_func)+ '_' + args.more_param + '.pt' 108 | else: 109 | model_path = args.model_dir + 'heer_' + args.graph_name + '_' + str(args.iter) + '_op_' + str(args.op) + \ 110 | '_mode_' + str(args.map_func)+ '.pt' 111 | 
print('model path:',model_path) 112 | xxx = t.load(model_path, map_location=lambda storage, loc: storage) 113 | model.load_state_dict(xxx, False ) 114 | out_emb = model.output_embeddings() 115 | 116 | offset,prev_offset = 0,0 117 | print(type_offset) 118 | with open(args.data_dir + 'heer_' + args.graph_name + '_' + str(args.iter) + '_op_' + str(args.op) + \ 119 | '_mode_' + str(args.map_func)+ '_' + args.more_param + '.emb', 'w') as OUT: 120 | num_nodes, num_dim = out_emb.shape 121 | OUT.write(str(num_nodes)+' '+str(num_dim)+'\n') 122 | config['nodes'].append('sum') 123 | for idx,t in enumerate(config['nodes']): 124 | if t == 'sum': 125 | break 126 | tp = config['nodes'][idx+1] 127 | while offset < type_offset[tp]: 128 | OUT.write("{}:{} {}\n".format(t, out_mapping[t][offset-prev_offset], ' '.join(map(str,out_emb[offset].tolist())) )) 129 | offset += 1 130 | prev_offset = type_offset[tp] 131 | # out_mapping[] 132 | #type_offset[tp] 133 | 134 | 135 | 136 | #print(type_offset['D']) 137 | #print("{} {}".format(len(in_mapping['D'])+len(in_mapping['P']), 100)) 138 | #id2name = load_mapping('/shared/data/qiz3/text_summ/intermediate_data/nyt13_10k_9_25_kb.node') 139 | #for i in range(type_offset['D'], type_offset['D'] + len(in_mapping['D'])): 140 | # print("{} {}".format(id2name[int(out_mapping['D'][i - type_offset['D']])], ' '.join(map(str,out_emb[i].tolist())) )) 141 | #break 142 | #for i in range(type_offset['P'], type_offset['P'] + len(in_mapping['P'])): 143 | # print("{} {}".format(id2name[int(out_mapping['P'][i - type_offset['P']])], ' '.join(map(str,out_emb[i].tolist())) )) 144 | -------------------------------------------------------------------------------- /src/pred_case.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import numpy as np 3 | import cPickle 4 | import sys,os 5 | import neg 6 | import argparse 7 | import torch.utils.data as tdata 8 | import utils 9 | from tqdm import tqdm 10 | 11 | def parse_args(): 12 | ''' 13 | Parses the heer arguments. 14 | ''' 15 | parser = argparse.ArgumentParser(description="Run heer.") 16 | 17 | parser.add_argument('--more-param', nargs='?', default='None', 18 | help='customized parameter setting') 19 | 20 | parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', 21 | help='Input graph path') 22 | 23 | parser.add_argument('--gpu', nargs='?', default='0', 24 | help='Embeddings path') 25 | 26 | parser.add_argument('--dimensions', type=int, default=128, 27 | help='Number of dimensions. Default is 128.') 28 | 29 | parser.add_argument('--pre-train-path', type=str, default='', 30 | help='embedding initialization') 31 | parser.add_argument('--batch-size', type=int, default=50, 32 | help='Batch size. 
Default is 50.') 33 | parser.add_argument('--iter', default=500, type=int, 34 | help='Number of epochs in dumped model') 35 | parser.add_argument('--graph-name', type=str, default='', 36 | help='prefix of dumped data') 37 | parser.add_argument('--data-dir', type=str, default='', 38 | help='data directory') 39 | parser.add_argument('--model-dir', type=str, default='', 40 | help='model directory') 41 | parser.add_argument('--test-dir', type=str, default='', 42 | help='test directory') 43 | parser.add_argument('--sub-net', type=str, default='', 44 | help='sub network path') 45 | parser.add_argument('--dump-timer', default=5, type=int) 46 | parser.add_argument('--op', default=0, type=int) 47 | parser.add_argument('--map_func', default=0, type=int) 48 | parser.set_defaults(directed=False) 49 | 50 | return parser.parse_args() 51 | 52 | if __name__ == '__main__': 53 | args = parse_args() 54 | arg = {} 55 | _data = '' 56 | config_name = os.path.join(os.path.dirname(args.data_dir).replace('intermediate', 'input'), args.graph_name.split('_ko_')[0] + '.config') 57 | config = utils.read_config(config_name) 58 | #config['nodes'] = ['PR', 'AD', 'WO', 'AS', 'GE', 'PE', 'EV', 'PO'] 59 | #config['edges'] = [(5, 2), (5, 5), (5, 2), (5, 2), (6, 1), (5, 5), (5, 3), (5, 1), (5, 3), (5, 7), (5, 2), (5, 4), (5, 1), (3, 1), (5, 3), (5, 1), (1, 1), (5, 0), (1, 1), (5, 1), (5, 1), (5, 5), (5, 5), (5, 2), (5, 5)] 60 | 61 | # baseline score 62 | if args.op == -1: 63 | _data = utils.load_emb(args.data_dir, args.pre_train_path, args.dimensions, args.graph_name, config['nodes']) 64 | #args.op = 1 65 | #print(_data) 66 | t.cuda.set_device(int(args.gpu)) 67 | 68 | 69 | type_offset = cPickle.load(open(args.data_dir + args.graph_name + '_offset.p')) 70 | model = neg.NEG_loss(type_offset=type_offset, node_types=config['nodes'], edge_types=config['edges'], 71 | embed_size=args.dimensions, pre_train_path=_data, graph_name=args.graph_name, 72 | mode=args.op, map_mode=args.map_func) 73 | 74 | 75 | in_mapping = cPickle.load(open(args.data_dir + args.graph_name +'_in_mapping.p')) 76 | #print(model.in_embed.weight.sum()) 77 | if args.op != -1: 78 | if args.more_param != 'None': 79 | model_path = args.model_dir + 'heer_' + args.graph_name + '_' + str(args.iter) + '_op_' + str(args.op) + \ 80 | '_mode_' + str(args.map_func)+ '_' + args.more_param + '.pt' 81 | else: 82 | model_path = args.model_dir + 'heer_' + args.graph_name + '_' + str(args.iter) + '_op_' + str(args.op) + \ 83 | '_mode_' + str(args.map_func)+ '.pt' 84 | print('model path:',model_path) 85 | xxx = t.load(model_path, map_location=lambda storage, loc: storage) 86 | model.load_state_dict(xxx, False ) 87 | model.cuda() 88 | model.eval() 89 | 90 | # for later sanity check 91 | # model.map_mode = -1 92 | #if args.map_func == 1: 93 | # model.map_mode = 0 94 | #print(model.parameters()) 95 | 96 | #model.load_state_dict(t.load('/shared/data/qiz3/data/model/' + args.model_name +'.pt')) 97 | #print(model.in_embed.weight.sum()) 98 | #print(model.in_embed.weight[1000].data.cpu().numpy().tolist()) 99 | #for el in model.edge_mapping: 100 | # print(el, el.weight.data.cpu().numpy().tolist()) 101 | #print(model.) 
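# (added note, not in the original source) The block below scores a held-out
# sub-network: each line of the --sub-net file carries two TYPE:id endpoints
# and, as its last space-separated field, an edge-type label. Endpoints are
# mapped to global embedding indices via in_mapping[type][id] + type_offset[type];
# then, for every distinct edge type found in the file, all edges are scored with
# model.predict(..., config['types'].index(tp)), and the results are written to
# <sub_net>_<iter>_<edge_type>.txt with the third and fourth fields replaced by
# the predicted score and the scoring edge type.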
102 | 103 | 104 | #sys.exit(-1) 105 | 106 | print("Model Mode:", args.op) 107 | #pred_types = [0, 1, 2, 3, 4] 108 | with open(args.sub_net, 'r') as SUB: 109 | edge_types = set() 110 | _input = [] 111 | _output = [] 112 | for line in SUB: 113 | node = line.strip().split(' ') 114 | _type_a, _id_a = node[0].split(':') 115 | _type_b, _id_b = node[1].split(':') 116 | edge_types.add(node[-1]) 117 | _input.append(in_mapping[_type_a][_id_a] + type_offset[_type_a]) 118 | _output.append(in_mapping[_type_b][_id_b] + type_offset[_type_b]) 119 | print("#edge types: ", len(edge_types)) 120 | edge_types = list(edge_types) 121 | for tp in edge_types: 122 | input_data = tdata.TensorDataset(t.LongTensor(_input), t.LongTensor(_output)) 123 | #print(len(input_data)) 124 | 125 | data_reader = tdata.DataLoader(input_data, args.batch_size, shuffle=False) 126 | score = [] 127 | #model = neg.NEG_loss(type_offset=type_offset, node_types=args.node_types, edge_types=args.edge_types, embed_size=arg['emb_size'], pre_train_path=arg['pre_train'], graph_name=arg['graph_name']) 128 | #pbar = tqdm(total=len(data_reader) / args.batch_size) 129 | for i, data in enumerate(data_reader, 0): 130 | inputs, labels = data 131 | loss = model.predict(inputs, labels, config['types'].index(tp)) 132 | score += loss 133 | #pbar.update(1) 134 | #pbar.close() 135 | 136 | with open(args.sub_net) as INPUT, open(args.sub_net.replace('.hin', '_'+str(args.iter)+'_'+tp+'.txt'), 'w') as OUTPUT: 137 | for i, line in enumerate(INPUT): 138 | node = line.strip().split(' ') 139 | _type_a, _id_a = node[0].split(':') 140 | _type_b, _id_b = node[1].split(':') 141 | assert _id_a in in_mapping[_type_a] and _id_b in in_mapping[_type_b] 142 | node[2] = str(score[i]) 143 | node[3] = tp 144 | OUTPUT.write(' '.join(node) + '\n') 145 | 146 | -------------------------------------------------------------------------------- /src/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #e.g. bash ./src/run.sh yago_ko_0.4 20 1 0 yago 7 4 | # bash ./src/run.sh yago_ko_0.4 11 1 3 deep_exp 2 5 | 6 | green=`tput setaf 2` 7 | red=`tput setaf 1` 8 | yellow=`tput setaf 3` 9 | reset=`tput sgr0` 10 | 11 | # input variables 12 | network=$1 # a.k.a. graph_name; e.g., yago_ko_0.2 13 | epoch=$2 # number of epochs 14 | operator=${3:-1} # operator used to compose edge embedding from node embeddings 15 | map=${4:-0} # mapping on top of edge embedding 16 | more_param=${5:-rescale_0.1_lr_10_lrr_10} # more customized parameters 17 | gpu=${6:-0} # working gpu for training 18 | dump_timer=${7:-6} # default dump timer 19 | 20 | # find relative root directory 21 | SOURCE="${BASH_SOURCE[0]}" 22 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 23 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 24 | SOURCE="$(readlink "$SOURCE")" 25 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 26 | done 27 | script_dir="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 28 | root_dir="$( dirname $script_dir )" 29 | SUFFIX=_input.p 30 | 31 | 32 | echo ${green}===Constructing Training Net===${reset} 33 | if [ ! 
-e "$root_dir"/intermediate_data/"$network$SUFFIX" ]; then 34 | #python2 main.py --input=/shared/data/yushi2/edge_rep_codes/input_data/yago_no_gender_0.4_out.net --build-graph=True --graph-name=$GRAPH_NAME --data-dir=$DATA_DIR 35 | python2 ./src/main.py --input="$root_dir"/input_data/"$network".hin --build-graph=True \ 36 | --graph-name=$network --data-dir="$root_dir"/intermediate_data/ 37 | fi 38 | echo ${red}===HEER Training===${reset} 39 | python2 ./src/main.py --iter=$2 --batch-size=128 --dimensions=128 --graph-name=$network --data-dir="$root_dir"/intermediate_data/ --model-dir="$root_dir"/intermediate_data/model/ \ 40 | --dump-timer=$dump_timer --map_func=$map --op=$operator --gpu=$gpu --more-param="$more_param" --log-dir="$root_dir"/log/ \ 41 | --pre-train-path="$root_dir"/intermediate_data/pretrained_"$network".emb --fine-tune=0 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/test.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import sys 3 | 4 | if __name__ == '__main__': 5 | nodes = dict() 6 | with open(sys.argv[1]) as IN: 7 | IN.readline() 8 | 9 | for line in IN: 10 | nodes[line.strip().split(' ')[0]] = 1 11 | with open(sys.argv[2]) as IN: 12 | IN.readline() 13 | for line in IN: 14 | assert(line.strip().split(' ')[0] in nodes) -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import Parameter 5 | 6 | import numpy as np 7 | 8 | import cPickle 9 | import ast 10 | 11 | def read_config(conf_name): 12 | config = {} 13 | with open(conf_name) as IN: 14 | config['edges'] = ast.literal_eval(IN.readline()) 15 | config['nodes'] = ast.literal_eval(IN.readline()) 16 | config['types'] = ast.literal_eval(IN.readline()) 17 | for i,x in enumerate(ast.literal_eval(IN.readline())): 18 | config['edges'][i].append(int(x)) 19 | assert len(config['edges']) == len(config['types']) 20 | return config 21 | 22 | def load_emb(root_dir, emb_path, emb_size, graph_name, node_types): 23 | in_mapping = cPickle.load(open(root_dir + graph_name +'_in_mapping.p')) 24 | type_offset = cPickle.load(open(root_dir + graph_name + '_offset.p')) 25 | with open(emb_path, 'r') as INPUT: 26 | _data = np.zeros((type_offset['sum'], emb_size)) 27 | print(type_offset) 28 | INPUT.readline() 29 | INPUT.readline() 30 | for line in INPUT: 31 | node = line.strip().split(' ') 32 | _type, _id = node[0].split(':') 33 | _index = in_mapping[_type][_id] + type_offset[_type] 34 | _data[_index, :] = np.asarray(map(lambda x:float(x), node[1:])) 35 | return _data 36 | 37 | def clip_grad_norm(parameters, max_norm, norm_type=2): 38 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 39 | max_norm = float(max_norm) 40 | norm_type = float(norm_type) 41 | if norm_type == float('inf'): 42 | total_norm = max(p.grad.data.abs().max() for p in parameters) 43 | else: 44 | total_norm = 0 45 | for p in parameters: 46 | param_norm = p.grad.data.norm(norm_type) 47 | total_norm += param_norm ** norm_type 48 | total_norm = total_norm ** (1. 
/ norm_type) 49 | clip_coef = max_norm / (total_norm + 1e-6) 50 | if clip_coef < 1: 51 | for p in parameters: 52 | p.grad.data.mul_(clip_coef) 53 | return 1 54 | else: 55 | return 0 56 | 57 | def clip_sparse_grad_norm(parameters, max_norm, norm_type=2): 58 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 59 | max_norm = float(max_norm) 60 | norm_type = float(norm_type) 61 | if norm_type == float('inf'): 62 | total_norm = max(p.grad.data.abs().max() for p in parameters) 63 | else: 64 | for p in parameters: 65 | param_norm = p.grad.data._values().norm(norm_type, 1) 66 | if param_norm.max() > max_norm: 67 | param_norm.clamp_(min=max_norm).div_(max_norm).unsqueeze_(1) 68 | #1 how often cut 69 | #2 cut balanced not 70 | p.grad.data._values().div_(param_norm) 71 | return (param_norm > 1.0).sum() 72 | return 0 73 | 74 | class DiagLinear(nn.Module): 75 | def __init__(self, input_features): 76 | super(DiagLinear, self).__init__() 77 | self.input_features = input_features 78 | 79 | # nn.Parameter is a special kind of Variable, that will get 80 | # automatically registered as Module's parameter once it's assigned 81 | # as an attribute. Parameters and buffers need to be registered, or 82 | # they won't appear in .parameters() (doesn't apply to buffers), and 83 | # won't be converted when e.g. .cuda() is called. You can use 84 | # .register_buffer() to register buffers. 85 | # nn.Parameters can never be volatile and, different than Variables, 86 | # they require gradients by default. 87 | self.weight = nn.Parameter(torch.Tensor(input_features)) 88 | 89 | # Not a very smart way to initialize weights 90 | self.weight.data.uniform_(-0.1, 0.1) 91 | 92 | def forward(self, input): 93 | # See the autograd section for explanation of what happens here. 94 | return input * self.weight 95 | 96 | class SymmLinear(nn.Module): 97 | def __init__(self, input_features): 98 | super(SymmLinear, self).__init__() 99 | self.input_features = input_features 100 | 101 | # nn.Parameter is a special kind of Variable, that will get 102 | # automatically registered as Module's parameter once it's assigned 103 | # as an attribute. Parameters and buffers need to be registered, or 104 | # they won't appear in .parameters() (doesn't apply to buffers), and 105 | # won't be converted when e.g. .cuda() is called. You can use 106 | # .register_buffer() to register buffers. 107 | # nn.Parameters can never be volatile and, different than Variables, 108 | # they require gradients by default. 109 | self.weight = nn.Parameter(torch.Tensor(input_features, input_features)) 110 | 111 | # Not a very smart way to initialize weights 112 | self.weight.data.uniform_(-0.1, 0.1) 113 | 114 | def forward(self, input): 115 | # See the autograd section for explanation of what happens here. 
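# (added note, not in the original source) The `*` below is an elementwise,
# broadcast product of the input with the symmetrized weight W + W^T, not a
# matrix multiplication; the size print looks like leftover debugging output.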
116 | print(input.size(), self.weight.size()) 117 | return input * (self.weight.transpose(0, 1) + self.weight) 118 | 119 | class DeepSemantics(nn.Module): 120 | """ 121 | Multi-layer edge metrics 122 | """ 123 | def __init__(self, in_features, out_features, hidden_features, bias=False, norm=False): 124 | super(DeepSemantics, self).__init__() 125 | 126 | self.fc1 = nn.Linear(in_features, hidden_features, bias = bias) 127 | self.fc2 = nn.Linear(hidden_features, out_features, bias = bias) 128 | 129 | #self.fc1.weight.data.uniform_(-0.5, 0.5) 130 | #self.fc2.weight.data.uniform_(-0.5, 0.5) 131 | self.fc1_bn = nn.BatchNorm1d(hidden_features) 132 | self.fc2_bn = nn.BatchNorm1d(out_features) 133 | self.norm = norm 134 | if not bias: 135 | self.fc1_bn.register_parameter('bias', None) 136 | self.fc2_bn.register_parameter('bias', None) 137 | 138 | def forward(self, x): 139 | if x.size(0) == 1 or not self.norm: 140 | x = F.relu(self.fc1(x)) 141 | return self.fc2(x) 142 | else: 143 | x = F.relu(self.fc1_bn(self.fc1(x))) 144 | return self.fc2_bn(self.fc2(x)) 145 | --------------------------------------------------------------------------------
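Note on the data-specific config file (an illustrative sketch, not taken from the repo): `utils.read_config` above expects `input_data/$data-name.config` to hold four Python literals, one per line, in this order: the typed edge list as `[src_type_idx, dst_type_idx]` pairs, the node type names, the edge type labels, and a list of per-edge integer flags that get appended to each edge entry (the code only requires an int; judging from the YAGO description, it plausibly marks directed edge types). The toy example below uses invented type names purely to mimic that parsing; the real configs ship with the DBLP and YAGO downloads.

```python
# Illustrative only: a toy config in the four-line layout that read_config() parses.
# All names and indices here are made up; real configs come with the datasets.
import ast

toy_config_lines = [
    "[[0, 1], [0, 2]]",                  # edge types as [src_node_type_idx, dst_node_type_idx]
    "['P', 'A', 'V']",                   # node type names
    "['paper-author', 'paper-venue']",   # edge type labels, same order as line 1
    "['0', '1']",                        # per-edge int flag appended to each edge entry
]

config = {}
config['edges'] = ast.literal_eval(toy_config_lines[0])
config['nodes'] = ast.literal_eval(toy_config_lines[1])
config['types'] = ast.literal_eval(toy_config_lines[2])
for i, x in enumerate(ast.literal_eval(toy_config_lines[3])):
    config['edges'][i].append(int(x))
assert len(config['edges']) == len(config['types'])

print(config['edges'])   # [[0, 1, 0], [0, 2, 1]]
print(config['nodes'])   # ['P', 'A', 'V']
print(config['types'])   # ['paper-author', 'paper-venue']
```

With such a config in place, `main.py --build-graph=True` encodes a matching `.hin` edge list; `HinLoader.readHin` splits each line on spaces, taking the `TYPE:id` endpoints from the first two fields and the edge-type label from the last one (see preprocessing/ for the exact edge-list format).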