├── embedding_model
│   ├── parser.pyc
│   ├── sampler.pyc
│   ├── utility.pyc
│   ├── embedding.pyc
│   ├── eval_metric.pyc
│   ├── train_helper.pyc
│   ├── utility.py
│   ├── main.py
│   ├── eval_metric.py
│   ├── train_helper.py
│   ├── parser.py
│   ├── embedding.py
│   └── sampler.py
├── README.md
├── emb
│   └── doc_emb.txt
└── sampled_data
    └── data.xml

--------------------------------------------------------------------------------
/embedding_model/parser.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/parser.pyc

--------------------------------------------------------------------------------
/embedding_model/sampler.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/sampler.pyc

--------------------------------------------------------------------------------
/embedding_model/utility.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/utility.pyc

--------------------------------------------------------------------------------
/embedding_model/embedding.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/embedding.pyc

--------------------------------------------------------------------------------
/embedding_model/eval_metric.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/eval_metric.pyc

--------------------------------------------------------------------------------
/embedding_model/train_helper.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/train_helper.pyc

--------------------------------------------------------------------------------
/embedding_model/utility.py:
--------------------------------------------------------------------------------
import os
import math
import numpy as np


def sigmoid(x):
    return float(1) / (1 + math.exp(-x))


def construct_doc_matrix(emb_dict, paper_list):
    """
    Construct the learned embedding matrix for document clustering.
    emb_dict: {paper_index: numpy_array}
    """
    D_matrix = emb_dict[paper_list[0]]
    for idx in xrange(1, len(paper_list)):
        D_matrix = np.vstack((D_matrix, emb_dict[paper_list[idx]]))
    return D_matrix


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


def save_embedding(emb_dict, paper_list, num_dimen):
    """
    Save the final embedding result for each document.
    Note: the output path is relative to the current working directory.
    """
    with open('../emb/doc_emb.txt', 'w') as embedding_file:
        embedding_file.write(str(len(paper_list)) + ' ' + str(num_dimen) + os.linesep)
        D_matrix = emb_dict[paper_list[0]]
        for idx in xrange(1, len(paper_list)):
            D_matrix = np.vstack((D_matrix, emb_dict[paper_list[idx]]))
        # prepend a 1-based document id column before writing
        D_matrix = np.hstack((np.array([range(1, len(paper_list) + 1)]).T, D_matrix))
        np.savetxt(embedding_file, D_matrix,
                   fmt=' '.join(['%i'] + ['%1.5f'] * num_dimen))
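A minimal, hypothetical illustration (not part of the repository) of how construct_doc_matrix assembles the per-document embedding matrix from a {paper_index: vector} dictionary:

import numpy as np
from utility import construct_doc_matrix

# toy embeddings for two documents (made-up values)
paper_latent_matrix = {1: np.array([0.1, 0.2]),
                       2: np.array([0.3, 0.4])}
D = construct_doc_matrix(paper_latent_matrix, [1, 2])
# D is a 2 x 2 matrix; row i corresponds to paper_list[i]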
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Name Disambiguation using Network Embedding
This repository provides a reference implementation of name disambiguation using network embedding, as described in the paper:
> Name Disambiguation in Anonymized Graphs using Network Embedding.
> Baichuan Zhang and Mohammad Al Hasan.
> Proceedings of the 2017 ACM on Conference on Information and Knowledge Management (CIKM 2017)

## Prerequisites

* [Python 2.7](https://www.python.org/)
* [Numpy](http://www.numpy.org/)
* [Networkx](https://networkx.github.io/)
* [scikit-learn](http://scikit-learn.org/stable/)

### Basic Usage

#### Example
To run the disambiguation embedding code, execute the following command from the project home directory:
``python embedding_model/main.py sampled_data/data.xml 20 0.02 0.005 100 'uniform'``
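Here the positional arguments are, in order: the input XML file (`file_path`), the embedding dimensionality (`latent_dimen`), the SGD learning rate (`alpha`), the regularization weight (`matrix_reg`), the number of training epochs (`num_epoch`), and the sampling strategy (`sampler_method`, one of `uniform`, `reject`, or `adaptive`); see `parse_args` in `embedding_model/main.py`.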
#### Options
You can check out the hyper-parameter options using:
``python embedding_model/main.py --help``

#### Output
The output is the Macro-F1 score and the ranking (BPR) loss value for each epoch.
We also generate the final embedding file in the emb/ directory, which contains *n+1* lines for a document collection with *n* documents.
The first line has the following format:

    num_of_docs dim_of_representation

The next *n* lines are as follows:

    doc_id dim1 dim2 ... dimd

where dim1, ..., dimd is the *d*-dimensional representation learned by the proposed embedding model.
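The embedding file can then be loaded for downstream analysis, for example with NumPy. A minimal sketch (the path assumes the file was written to emb/doc_emb.txt as described above):

    import numpy as np

    # skip the "num_of_docs dim_of_representation" header line
    data = np.loadtxt('emb/doc_emb.txt', skiprows=1)
    doc_ids = data[:, 0].astype(int)   # first column: doc_id
    doc_vectors = data[:, 1:]          # remaining columns: the d-dimensional embedding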
### Reference
If you find this work useful for your research, please consider citing the following paper:

    @inproceedings{zhang-cikm2017,
      author    = {Zhang, Baichuan and Al Hasan, Mohammad},
      title     = {Name Disambiguation in Anonymized Graphs using Network Embedding},
      booktitle = {Proceedings of the ACM on Conference on Information and Knowledge Management (CIKM)},
      year      = {2017}
    }

--------------------------------------------------------------------------------
/embedding_model/main.py:
--------------------------------------------------------------------------------
import parser
import embedding
import train_helper
import sampler
import eval_metric
import argparse


def parse_args():
    """
    Parse the embedding model arguments.
    """
    parser_arg = argparse.ArgumentParser(description=
                                         "run embedding for name disambiguation")
    parser_arg.add_argument("file_path", help='input file name')
    parser_arg.add_argument("latent_dimen", type=int, default=20,
                            help='number of dimensions in the embedding')
    parser_arg.add_argument("alpha", type=float, default=0.02,
                            help='learning rate')
    parser_arg.add_argument("matrix_reg", type=float, default=0.01,
                            help='matrix regularization parameter')
    parser_arg.add_argument("num_epoch", type=int, default=100,
                            help="number of epochs for SGD inference")
    parser_arg.add_argument("sampler_method", help="sampling approach")
    return parser_arg.parse_args()


def main(args):
    """
    Pipeline for representation learning over all papers of a given name reference.
    """
    dataset = parser.DataSet(args.file_path)
    dataset.reader_arnetminer()
    bpr_optimizer = embedding.BprOptimizer(args.latent_dimen, args.alpha,
                                           args.matrix_reg)
    pp_sampler = sampler.CoauthorGraphSampler()
    pd_sampler = sampler.BipartiteGraphSampler()
    dd_sampler = sampler.LinkedDocGraphSampler()
    eval_f1 = eval_metric.Evaluator()

    run_helper = train_helper.TrainHelper()
    run_helper.helper(args.num_epoch, dataset, bpr_optimizer,
                      pp_sampler, pd_sampler, dd_sampler,
                      eval_f1, args.sampler_method)


if __name__ == "__main__":
    args = parse_args()
    main(args)

--------------------------------------------------------------------------------
/embedding_model/eval_metric.py:
--------------------------------------------------------------------------------
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from utility import construct_doc_matrix


class Evaluator():
    @staticmethod
    def compute_f1(dataset, bpr_optimizer):
        """
        Perform hierarchical (agglomerative) clustering on the document
        embedding matrix for name disambiguation.
        Use cluster-level mean F1 for evaluation.
        """
        D_matrix = construct_doc_matrix(bpr_optimizer.paper_latent_matrix,
                                        dataset.paper_list)
        true_cluster_size = len(set(dataset.label_list))
        y_pred = AgglomerativeClustering(n_clusters=true_cluster_size,
                                         linkage="average",
                                         affinity="cosine").fit_predict(D_matrix)

        true_label_dict = {}
        for idx, true_lbl in enumerate(dataset.label_list):
            if true_lbl not in true_label_dict:
                true_label_dict[true_lbl] = [idx]
            else:
                true_label_dict[true_lbl].append(idx)

        predict_label_dict = {}
        for idx, pred_lbl in enumerate(y_pred):
            if pred_lbl not in predict_label_dict:
                predict_label_dict[pred_lbl] = [idx]
            else:
                predict_label_dict[pred_lbl].append(idx)

        # compute cluster-level F1
        # denote C(r) as the clustering result and T(k) as the ground-truth partition
        # construct an r * k contingency table for the clustering
        r_k_table = []
        for v1 in predict_label_dict.itervalues():
            k_list = []
            for v2 in true_label_dict.itervalues():
                N_ij = len(set(v1).intersection(v2))
                k_list.append(N_ij)
            r_k_table.append(k_list)
        r_k_matrix = np.array(r_k_table)
        r_num = int(r_k_matrix.shape[0])

        # compute F1 for each row C_i
        sum_f1 = 0.0
        for row in xrange(0, r_num):
            row_sum = np.sum(r_k_matrix[row, :])
            if row_sum != 0:
                max_col_index = np.argmax(r_k_matrix[row, :])
                row_max_value = r_k_matrix[row, max_col_index]
                prec = float(row_max_value) / row_sum
                col_sum = np.sum(r_k_matrix[:, max_col_index])
                rec = float(row_max_value) / col_sum
                row_f1 = float(2 * prec * rec) / (prec + rec)
                sum_f1 += row_f1

        average_f1 = float(sum_f1) / r_num
        return average_f1
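For intuition, a small self-contained toy example (hypothetical numbers, not part of the repository) of the cluster-level F1 computed by compute_f1 above:

import numpy as np

# contingency table: rows = predicted clusters C1, C2; columns = true clusters T1, T2
r_k_matrix = np.array([[3, 1],
                       [0, 2]])
sum_f1 = 0.0
for row in range(r_k_matrix.shape[0]):
    best_col = np.argmax(r_k_matrix[row])
    prec = float(r_k_matrix[row, best_col]) / r_k_matrix[row].sum()
    rec = float(r_k_matrix[row, best_col]) / r_k_matrix[:, best_col].sum()
    sum_f1 += 2 * prec * rec / (prec + rec)
print(sum_f1 / r_k_matrix.shape[0])  # (6/7 + 4/5) / 2, roughly 0.829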
--------------------------------------------------------------------------------
/embedding_model/train_helper.py:
--------------------------------------------------------------------------------
from utility import save_embedding


class TrainHelper():
    @staticmethod
    def helper(num_epoch, dataset, bpr_optimizer,
               pp_sampler, pd_sampler, dd_sampler,
               eval_f1, sampler_method):

        bpr_optimizer.init_model(dataset)
        if sampler_method == "uniform":
            for _ in xrange(0, num_epoch):
                bpr_loss = 0.0
                for _ in xrange(0, dataset.num_nnz):
                    """
                    update embedding in person-person network
                    update embedding in person-document network
                    update embedding in doc-doc network
                    """
                    for i, j, t in pp_sampler.generate_triplet_uniform(dataset):
                        bpr_optimizer.update_pp_gradient(i, j, t)
                        bpr_loss += bpr_optimizer.compute_pp_loss(i, j, t)

                    for i, j, t in pd_sampler.generate_triplet_uniform(dataset):
                        bpr_optimizer.update_pd_gradient(i, j, t)
                        bpr_loss += bpr_optimizer.compute_pd_loss(i, j, t)

                    for i, j, t in dd_sampler.generate_triplet_uniform(dataset):
                        bpr_optimizer.update_dd_gradient(i, j, t)
                        bpr_loss += bpr_optimizer.compute_dd_loss(i, j, t)

                average_loss = float(bpr_loss) / dataset.num_nnz
                print "average bpr loss is " + str(average_loss)
                average_f1 = eval_f1.compute_f1(dataset, bpr_optimizer)
                print 'f1 is ' + str(average_f1)
                print

        elif sampler_method == "reject":
            for _ in xrange(0, num_epoch):
                #bpr_loss = 0.0
                for _ in xrange(0, dataset.num_nnz):
                    """
                    update embedding in person-person network
                    update embedding in person-document network
                    update embedding in doc-doc network
                    """
                    for i, j, t in pp_sampler.generate_triplet_reject(dataset, bpr_optimizer):
                        bpr_optimizer.update_pp_gradient(i, j, t)
                        #bpr_loss += bpr_optimizer.compute_pp_loss(i, j, t)

                    for i, j, t in pd_sampler.generate_triplet_reject(dataset, bpr_optimizer):
                        bpr_optimizer.update_pd_gradient(i, j, t)
                        #bpr_loss += bpr_optimizer.compute_pd_loss(i, j, t)

                    for i, j, t in dd_sampler.generate_triplet_reject(dataset, bpr_optimizer):
                        bpr_optimizer.update_dd_gradient(i, j, t)
                        #bpr_loss += bpr_optimizer.compute_dd_loss(i, j, t)

        elif sampler_method == "adaptive":
            for _ in xrange(0, num_epoch):
                #bpr_loss = 0.0
                for _ in xrange(0, dataset.num_nnz):
                    """
                    update embedding in person-person network
                    update embedding in person-document network
                    update embedding in doc-doc network
                    """
                    for i, j, t in pp_sampler.generate_triplet_adaptive(dataset, bpr_optimizer):
                        bpr_optimizer.update_pp_gradient(i, j, t)
                        #bpr_loss += bpr_optimizer.compute_pp_loss(i, j, t)

                    for i, j, t in pd_sampler.generate_triplet_adaptive(dataset, bpr_optimizer):
                        bpr_optimizer.update_pd_gradient(i, j, t)
                        #bpr_loss += bpr_optimizer.compute_pd_loss(i, j, t)

                    for i, j, t in dd_sampler.generate_triplet_adaptive(dataset, bpr_optimizer):
                        bpr_optimizer.update_dd_gradient(i, j, t)
                        #bpr_loss += bpr_optimizer.compute_dd_loss(i, j, t)

        save_embedding(bpr_optimizer.paper_latent_matrix,
                       dataset.paper_list, bpr_optimizer.latent_dimen)

--------------------------------------------------------------------------------
/embedding_model/parser.py:
--------------------------------------------------------------------------------
import networkx as nx


class DataSet():

    def __init__(self, file_path):

        self.file_path = file_path
        self.paper_authorlist_dict = {}
        self.paper_list = []
        self.coauthor_list = []
        self.label_list = []
        self.C_Graph = nx.Graph()
        self.D_Graph = nx.Graph()
        self.num_nnz = 0

    def reader_arnetminer(self):
        paper_index = 0
        coauthor_set = set()

        with open(self.file_path, "r") as filetoread:
            for line in filetoread:
                line = line.strip()
                if "FullName" in line:
                    ego_name = line[line.find('>')+1:line.rfind('<')].strip()
                elif "" in line:
                    paper_index += 1
                    self.paper_list.append(paper_index)
                elif "" in line:
                    author_list = line[line.find('>')+1: line.rfind('<')].strip().split(',')
                    if len(author_list) > 1:
                        if ego_name in author_list:
                            author_list.remove(ego_name)
                            self.paper_authorlist_dict[paper_index] = author_list
                        else:
                            self.paper_authorlist_dict[paper_index] = author_list

                        for co_author in author_list:
                            coauthor_set.add(co_author)

                        # construct the coauthorship graph
                        for pos in xrange(0, len(author_list) - 1):
                            for inpos in xrange(pos+1, len(author_list)):
                                src_node = author_list[pos]
                                dest_node = author_list[inpos]
                                if not self.C_Graph.has_edge(src_node, dest_node):
                                    self.C_Graph.add_edge(src_node, dest_node, weight = 1)
                                else:
                                    edge_weight = self.C_Graph[src_node][dest_node]['weight']
                                    edge_weight += 1
                                    self.C_Graph[src_node][dest_node]['weight'] = edge_weight
                    else:
                        self.paper_authorlist_dict[paper_index] = []
                elif "