├── embedding_model
│   ├── parser.pyc
│   ├── sampler.pyc
│   ├── utility.pyc
│   ├── embedding.pyc
│   ├── eval_metric.pyc
│   ├── train_helper.pyc
│   ├── utility.py
│   ├── main.py
│   ├── eval_metric.py
│   ├── train_helper.py
│   ├── parser.py
│   ├── embedding.py
│   └── sampler.py
├── README.md
├── emb
│   └── doc_emb.txt
└── sampled_data
    └── data.xml
/embedding_model/parser.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/parser.pyc
--------------------------------------------------------------------------------
/embedding_model/sampler.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/sampler.pyc
--------------------------------------------------------------------------------
/embedding_model/utility.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/utility.pyc
--------------------------------------------------------------------------------
/embedding_model/embedding.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/embedding.pyc
--------------------------------------------------------------------------------
/embedding_model/eval_metric.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/eval_metric.pyc
--------------------------------------------------------------------------------
/embedding_model/train_helper.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baichuan/disambiguation_embedding/HEAD/embedding_model/train_helper.pyc
--------------------------------------------------------------------------------
/embedding_model/utility.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import numpy as np
4 |
5 |
6 | def sigmoid(x):
7 | return float(1) / (1 + math.exp(-x))
8 |
9 |
10 | def construct_doc_matrix(dict, paper_list):
11 | """
12 | construct the learned embedding for document clustering
13 |     dict: {paper_index: numpy_array}
14 | """
15 | D_matrix = dict[paper_list[0]]
16 | for idx in xrange(1, len(paper_list)):
17 | D_matrix = np.vstack((D_matrix, dict[paper_list[idx]]))
18 | return D_matrix
19 |
20 |
21 | def softmax(x):
22 | """Compute softmax values for each sets of scores in x."""
23 | e_x = np.exp(x - np.max(x))
24 | return e_x / e_x.sum(axis=0)
25 |
26 |
27 | def save_embedding(dict, paper_list, num_dimen):
28 | """
29 | save the final embedding results for each document
30 | """
31 | embedding_file = open('../emb/doc_emb.txt','w')
32 | embedding_file.write(str(len(paper_list)) + ' ' + str(num_dimen) + os.linesep)
33 | D_matrix = dict[paper_list[0]]
34 | for idx in xrange(1, len(paper_list)):
35 | D_matrix = np.vstack((D_matrix, dict[paper_list[idx]]))
36 | D_matrix = np.hstack((np.array([range(1, len(paper_list)+1)]).T, D_matrix))
37 | np.savetxt(embedding_file, D_matrix,
38 | fmt = ' '.join(['%i'] + ['%1.5f'] * num_dimen))
39 |
--------------------------------------------------------------------------------
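
A note on the helpers above: construct_doc_matrix stacks the per-paper latent vectors into a single matrix with one row per document, in paper_list order; save_embedding then prepends a 1-based document-id column and writes the result to doc_emb.txt in the emb/ directory, after a header line giving the number of documents and the embedding dimension. The following is a minimal illustrative sketch, not part of the repository; the paper ids and 3-dimensional vectors are made up.

    import numpy as np

    # Toy stand-ins for bpr_optimizer.paper_latent_matrix and dataset.paper_list
    paper_latent = {1: np.array([0.1, 0.2, 0.3]),
                    2: np.array([0.4, 0.5, 0.6]),
                    3: np.array([0.7, 0.8, 0.9])}
    paper_list = [1, 2, 3]

    # The same stacking construct_doc_matrix performs: one row per paper, in order
    D_matrix = paper_latent[paper_list[0]]
    for idx in range(1, len(paper_list)):
        D_matrix = np.vstack((D_matrix, paper_latent[paper_list[idx]]))

    print(D_matrix.shape)  # (3, 3): len(paper_list) rows, num_dimen columns
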
/README.md:
--------------------------------------------------------------------------------
1 | # Name Disambiguation using Network Embedding
2 | This repository provides a reference implementation of name disambiguation using network embedding as described in the paper:
3 | > Name Disambiguation in Anonymized Graphs using Network Embedding.
4 | > Baichuan Zhang and Mohammad Al Hasan.
5 | > Proceedings of the 2017 ACM on Conference on Information and Knowledge Management (CIKM 2017)
6 | >
7 | ## Pre-Requisite
8 |
9 | * [Python 2.7](https://www.python.org/)
10 | * [NumPy](http://www.numpy.org/)
11 | * [NetworkX](https://networkx.github.io/)
12 | * [scikit-learn](http://scikit-learn.org/stable/)
13 |
14 | ### Basic Usage
15 |
16 | #### Example
17 | To run the disambiguation embedding code, execute the following command from the project home directory:
18 | ``python embedding_model/main.py sampled_data/data.xml 20 0.02 0.005 100 'uniform'``
19 |
20 | #### Options
21 | You can check out the hyper-parameter options using:
22 | ``python embedding_model/main.py --help``
23 |
24 | #### Output
25 | The output is the Macro-F1 score and the ranking loss value for each epoch.
26 | We also generate the final embedding file in the emb/ directory, which contains *n+1* lines for a document collection with *n* documents.
27 | The first line has the following format:
28 |
29 | num_of_docs dim_of_representation
30 |
31 | The next *n* lines are as follows:
32 |
33 | doc_id dim1 dim2 ... dimd
34 |
35 | where dim1, ..., dimd form the *d*-dimensional representation learned by the proposed embedding model.
36 |
37 | ### Reference
38 | If you find this work useful for your research, please consider citing the following paper:
39 |
40 | @inproceedings{zhang-cikm2017,
41 | author = {Zhang, Baichuan and Al Hasan, Mohammad},
42 | title = {Name Disambiguation in Anonymized Graphs using Network Embedding},
43 | booktitle = {Proceedings of the ACM on Conference on Information and Knowledge Management (CIKM)},
44 | year = {2017}
45 | }
46 |
--------------------------------------------------------------------------------
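
For downstream analysis, the embedding file described in the Output section above can be read back with NumPy. This is a minimal sketch under the stated assumptions (a completed run has written emb/doc_emb.txt with the documented *n+1*-line layout); it is not code from the repository.

    import numpy as np

    with open('emb/doc_emb.txt') as emb_file:
        # header line: num_of_docs dim_of_representation
        num_docs, num_dimen = map(int, emb_file.readline().split())
        # remaining n lines: doc_id dim1 dim2 ... dimd
        data = np.loadtxt(emb_file)

    doc_ids = data[:, 0].astype(int)   # 1-based document ids
    doc_vectors = data[:, 1:]          # shape (num_docs, num_dimen)
    assert doc_vectors.shape == (num_docs, num_dimen)
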
/embedding_model/main.py:
--------------------------------------------------------------------------------
1 | import parser
2 | import embedding
3 | import train_helper
4 | import sampler
5 | import eval_metric
6 | import argparse
7 |
8 |
9 | def parse_args():
10 | """
11 | parse the embedding model arguments
12 | """
13 | parser_arg = argparse.ArgumentParser(description =
14 | "run embedding for name disambiguation")
15 | parser_arg.add_argument("file_path", help = 'input file name')
16 | parser_arg.add_argument("latent_dimen", type = int, default = 20,
17 | help = 'number of dimension in embedding')
18 | parser_arg.add_argument("alpha", type = float, default = 0.02,
19 | help = 'learning rate')
20 | parser_arg.add_argument("matrix_reg", type = float, default = 0.01,
21 | help = 'matrix regularization parameter')
22 | parser_arg.add_argument("num_epoch", type = int, default = 100,
23 | help = "number of epochs for SGD inference")
24 |     parser_arg.add_argument("sampler_method", help = "sampling approach: uniform, reject, or adaptive")
25 | return parser_arg.parse_args()
26 |
27 |
28 | def main(args):
29 | """
30 | pipeline for representation learning for all papers for a given name reference
31 | """
32 | dataset = parser.DataSet(args.file_path)
33 | dataset.reader_arnetminer()
34 | bpr_optimizer = embedding.BprOptimizer(args.latent_dimen, args.alpha,
35 | args.matrix_reg)
36 | pp_sampler = sampler.CoauthorGraphSampler()
37 | pd_sampler = sampler.BipartiteGraphSampler()
38 | dd_sampler = sampler.LinkedDocGraphSampler()
39 | eval_f1 = eval_metric.Evaluator()
40 |
41 | run_helper = train_helper.TrainHelper()
42 | run_helper.helper(args.num_epoch, dataset, bpr_optimizer,
43 | pp_sampler, pd_sampler, dd_sampler,
44 | eval_f1, args.sampler_method)
45 |
46 |
47 | if __name__ == "__main__":
48 | args = parse_args()
49 | main(args)
50 |
--------------------------------------------------------------------------------
/embedding_model/eval_metric.py:
--------------------------------------------------------------------------------
1 | from sklearn.cluster import AgglomerativeClustering
2 | import numpy as np
3 | from utility import construct_doc_matrix
4 |
5 |
6 | class Evaluator():
7 | @staticmethod
8 | def compute_f1(dataset, bpr_optimizer):
9 | """
10 | perform Hierarchy Clustering on doc embedding matrix
11 | for name disambiguation
12 | use cluster-level mean F1 for evaluation
13 | """
14 | D_matrix = construct_doc_matrix(bpr_optimizer.paper_latent_matrix,
15 | dataset.paper_list)
16 | true_cluster_size = len(set(dataset.label_list))
17 | y_pred = AgglomerativeClustering(n_clusters = true_cluster_size,
18 | linkage = "average",
19 | affinity = "cosine").fit_predict(D_matrix)
20 |
21 | true_label_dict = {}
22 | for idx, true_lbl in enumerate(dataset.label_list):
23 | if true_lbl not in true_label_dict:
24 | true_label_dict[true_lbl] = [idx]
25 | else:
26 | true_label_dict[true_lbl].append(idx)
27 |
28 | predict_label_dict = {}
29 | for idx, pred_lbl in enumerate(y_pred):
30 | if pred_lbl not in predict_label_dict:
31 | predict_label_dict[pred_lbl] = [idx]
32 | else:
33 | predict_label_dict[pred_lbl].append(idx)
34 |
35 | # compute cluster-level F1
36 | # let's denote C(r) as clustering result and T(k) as partition (ground-truth)
37 | # construct r * k contingency table for clustering purpose
38 | r_k_table = []
39 | for v1 in predict_label_dict.itervalues():
40 | k_list = []
41 | for v2 in true_label_dict.itervalues():
42 | N_ij = len(set(v1).intersection(v2))
43 | k_list.append(N_ij)
44 | r_k_table.append(k_list)
45 | r_k_matrix = np.array(r_k_table)
46 | r_num = int(r_k_matrix.shape[0])
47 |
48 | # compute F1 for each row C_i
49 | sum_f1 = 0.0
50 | for row in xrange(0, r_num):
51 | row_sum = np.sum(r_k_matrix[row,:])
52 | if row_sum != 0:
53 | max_col_index = np.argmax(r_k_matrix[row,:])
54 | row_max_value = r_k_matrix[row, max_col_index]
55 | prec = float(row_max_value) / row_sum
56 | col_sum = np.sum(r_k_matrix[:, max_col_index])
57 | rec = float(row_max_value) / col_sum
58 | row_f1 = float(2 * prec * rec) / (prec + rec)
59 | sum_f1 += row_f1
60 |
61 | average_f1 = float(sum_f1) / r_num
62 | return average_f1
63 |
--------------------------------------------------------------------------------
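
To make the metric above concrete: compute_f1 builds an r * k contingency table between predicted clusters C(r) and ground-truth classes T(k), matches each predicted cluster to its best-overlapping class, and averages the per-cluster F1. Below is a toy recomputation of that idea on made-up labels; it assumes labels are 0-based and contiguous, and is not code from the repository.

    import numpy as np

    y_true = [0, 0, 1, 1, 1]   # made-up ground-truth classes
    y_pred = [0, 1, 1, 1, 1]   # made-up clustering result

    # r * k contingency table: N_ij = |C_i intersect T_j|
    r_k_matrix = np.zeros((len(set(y_pred)), len(set(y_true))))
    for pred_lbl, true_lbl in zip(y_pred, y_true):
        r_k_matrix[pred_lbl, true_lbl] += 1

    sum_f1 = 0.0
    for row in range(r_k_matrix.shape[0]):
        best = np.argmax(r_k_matrix[row])
        prec = r_k_matrix[row, best] / r_k_matrix[row].sum()
        rec = r_k_matrix[row, best] / r_k_matrix[:, best].sum()
        sum_f1 += 2 * prec * rec / (prec + rec)

    print(sum_f1 / r_k_matrix.shape[0])   # cluster-level mean F1, ~0.76 here
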
/embedding_model/train_helper.py:
--------------------------------------------------------------------------------
1 | from utility import save_embedding
2 |
3 |
4 | class TrainHelper():
5 | @staticmethod
6 | def helper(num_epoch, dataset, bpr_optimizer,
7 | pp_sampler, pd_sampler, dd_sampler,
8 | eval_f1, sampler_method):
9 |
10 | bpr_optimizer.init_model(dataset)
11 | if sampler_method == "uniform":
12 | for _ in xrange(0, num_epoch):
13 | bpr_loss = 0.0
14 | for _ in xrange(0, dataset.num_nnz):
15 | """
16 | update embedding in person-person network
17 | update embedding in person-document network
18 | update embedding in doc-doc network
19 | """
20 | for i, j, t in pp_sampler.generate_triplet_uniform(dataset):
21 | bpr_optimizer.update_pp_gradient(i, j, t)
22 | bpr_loss += bpr_optimizer.compute_pp_loss(i, j, t)
23 |
24 | for i, j, t in pd_sampler.generate_triplet_uniform(dataset):
25 | bpr_optimizer.update_pd_gradient(i, j, t)
26 | bpr_loss += bpr_optimizer.compute_pd_loss(i, j, t)
27 |
28 | for i, j, t in dd_sampler.generate_triplet_uniform(dataset):
29 | bpr_optimizer.update_dd_gradient(i, j, t)
30 | bpr_loss += bpr_optimizer.compute_dd_loss(i, j, t)
31 |
32 | average_loss = float(bpr_loss) / dataset.num_nnz
33 | print "average bpr loss is " + str(average_loss)
34 | average_f1 = eval_f1.compute_f1(dataset, bpr_optimizer)
35 | print 'f1 is ' + str(average_f1)
36 | print
37 |
38 | elif sampler_method == "reject":
39 | for _ in xrange(0, num_epoch):
40 | #bpr_loss = 0.0
41 | for _ in xrange(0, dataset.num_nnz):
42 | """
43 | update embedding in person-person network
44 | update embedding in person-document network
45 | update embedding in doc-doc network
46 | """
47 | for i, j, t in pp_sampler.generate_triplet_reject(dataset, bpr_optimizer):
48 | bpr_optimizer.update_pp_gradient(i, j, t)
49 | #bpr_loss += bpr_optimizer.compute_pp_loss(i, j, t)
50 |
51 | for i, j, t in pd_sampler.generate_triplet_reject(dataset, bpr_optimizer):
52 | bpr_optimizer.update_pd_gradient(i, j, t)
53 | #bpr_loss += bpr_optimizer.compute_pd_loss(i, j, t)
54 |
55 | for i, j, t in dd_sampler.generate_triplet_reject(dataset, bpr_optimizer):
56 | bpr_optimizer.update_dd_gradient(i, j, t)
57 | #bpr_loss += bpr_optimizer.compute_dd_loss(i, j, t)
58 |
59 | elif sampler_method == "adaptive":
60 | for _ in xrange(0, num_epoch):
61 | #bpr_loss = 0.0
62 | for _ in xrange(0, dataset.num_nnz):
63 | """
64 | update embedding in person-person network
65 | update embedding in person-document network
66 | update embedding in doc-doc network
67 | """
68 | for i, j, t in pp_sampler.generate_triplet_adaptive(dataset, bpr_optimizer):
69 | bpr_optimizer.update_pp_gradient(i, j, t)
70 | #bpr_loss += bpr_optimizer.compute_pp_loss(i, j, t)
71 |
72 | for i, j, t in pd_sampler.generate_triplet_adaptive(dataset, bpr_optimizer):
73 | bpr_optimizer.update_pd_gradient(i, j, t)
74 | #bpr_loss += bpr_optimizer.compute_pd_loss(i, j, t)
75 |
76 | for i, j, t in dd_sampler.generate_triplet_adaptive(dataset, bpr_optimizer):
77 | bpr_optimizer.update_dd_gradient(i, j, t)
78 | #bpr_loss += bpr_optimizer.compute_dd_loss(i, j, t)
79 |
80 | save_embedding(bpr_optimizer.paper_latent_matrix,
81 | dataset.paper_list, bpr_optimizer.latent_dimen)
82 |
--------------------------------------------------------------------------------
/embedding_model/parser.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 |
3 |
4 | class DataSet():
5 |
6 | def __init__(self, file_path):
7 |
8 | self.file_path = file_path
9 | self.paper_authorlist_dict = {}
10 | self.paper_list = []
11 | self.coauthor_list = []
12 | self.label_list = []
13 | self.C_Graph = nx.Graph()
14 | self.D_Graph = nx.Graph()
15 | self.num_nnz = 0
16 |
17 | def reader_arnetminer(self):
18 | paper_index = 0
19 | coauthor_set = set()
20 |
21 | with open(self.file_path, "r") as filetoread:
22 | for line in filetoread:
23 | line = line.strip()
24 | if "FullName" in line:
25 | ego_name = line[line.find('>')+1:line.rfind('<')].strip()
26 |                 elif "<title>" in line:  # tag name assumed; marks the start of each publication record
27 | paper_index += 1
28 | self.paper_list.append(paper_index)
29 |                 elif "<authors>" in line:  # tag name assumed; holds the comma-separated author list
30 | author_list = line[line.find('>')+1: line.rfind('<')].strip().split(',')
31 | if len(author_list) > 1:
32 | if ego_name in author_list:
33 | author_list.remove(ego_name)
34 | self.paper_authorlist_dict[paper_index] = author_list
35 | else:
36 | self.paper_authorlist_dict[paper_index] = author_list
37 |
38 | for co_author in author_list:
39 | coauthor_set.add(co_author)
40 |
41 | # construct the coauthorship graph
42 | for pos in xrange(0, len(author_list) - 1):
43 | for inpos in xrange(pos+1, len(author_list)):
44 | src_node = author_list[pos]
45 | dest_node = author_list[inpos]
46 | if not self.C_Graph.has_edge(src_node, dest_node):
47 | self.C_Graph.add_edge(src_node, dest_node, weight = 1)
48 | else:
49 | edge_weight = self.C_Graph[src_node][dest_node]['weight']
50 | edge_weight += 1
51 | self.C_Graph[src_node][dest_node]['weight'] = edge_weight
52 | else:
53 | self.paper_authorlist_dict[paper_index] = []
54 | elif "