├── BioWordVec.py
├── LICENSE.txt
├── data
│   ├── MeSH_dic.pkl.gz
│   ├── MeSH_graph.edgelist
│   └── pubmed_sample
├── node2vec.py
└── readme.md

/BioWordVec.py:
--------------------------------------------------------------------------------
'''
Reference implementation for jointly learning word vectors from biomedical text and MeSH knowledge.
The implementation is based on fastText and node2vec.
'''
#coding=utf-8

import argparse
import networkx as nx
import node2vec
from gensim.models import FastText
import random
import gzip
import pickle as pkl


def parse_args():

    parser = argparse.ArgumentParser()

    parser.add_argument('--input_corpus', nargs='?', default='./data/pubmed_sample',
                        help='Input biomedical corpus')

    parser.add_argument('--input_mesh', nargs='?', default='./data/MeSH_graph.edgelist',
                        help='Input MeSH knowledge graph')

    parser.add_argument('--input_dic', nargs='?', default='./data/MeSH_dic.pkl.gz',
                        help='Input MeSH dictionary')

    parser.add_argument('--output_model', nargs='?', default='./pubmed_mesh_test',
                        help='Output word vector model file')

    parser.add_argument('--output_bin', nargs='?', default='./pubmed_mesh_test.bin',
                        help='Output word vector bin file')

    parser.add_argument('--dimensions', type=int, default=200,
                        help='Number of dimensions. Default is 200.')

    parser.add_argument('--walk-length', type=int, default=50,
                        help='Length of walk per source. Default is 50.')

    parser.add_argument('--num-walks', type=int, default=2,
                        help='Number of walks per source. Default is 2.')

    parser.add_argument('--windows', type=int, default=5,
                        help='Context size for optimization. Default is 5.')

    parser.add_argument('--iter', default=5, type=int,
                        help='Number of epochs in SGD. Default is 5.')

    parser.add_argument('--min_count', default=5, type=int,
                        help='Ignore all words with total frequency lower than this. Default is 5.')

    parser.add_argument('--sg', default=1, type=int,
                        help='If 1, skip-gram is used; otherwise, CBOW. Default is 1.')

    parser.add_argument('--workers', type=int, default=8,
                        help='Number of parallel workers. Default is 8.')

    parser.add_argument('--p', type=float, default=2,
                        help='Return hyperparameter. Default is 2.')

    parser.add_argument('--q', type=float, default=1,
                        help='In-out hyperparameter. Default is 1.')

    parser.add_argument('--directed', dest='directed', action='store_true',
                        help='Graph is (un)directed. Default is undirected.')
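
    # --directed and --undirected are a complementary pair writing to the
    # same 'directed' flag; set_defaults below makes undirected the default.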
    parser.add_argument('--undirected', dest='directed', action='store_false')
    parser.set_defaults(directed=False)

    return parser.parse_args()

class MySentences(object):
    def __init__(self, mesh_list, pubmed_file):
        self.mesh_list = mesh_list
        self.pubmed_file = pubmed_file
    def __iter__(self):
        # First yield the MeSH term sequences derived from the graph walks,
        for instance in self.mesh_list:
            yield instance
        # then stream the pre-tokenized sentences of the PubMed corpus.
        with open(self.pubmed_file, 'r') as f:
            for line in f:
                yield line.split()


def main(args):
    f_pkl = gzip.open(args.input_dic, 'rb')
    mesh_dict = pkl.load(f_pkl)
    f_pkl.close()

    # Build the MeSH main-heading graph and give every edge unit weight.
    G = nx.read_edgelist(args.input_mesh, nodetype=str, create_using=nx.DiGraph())
    for edge in G.edges():
        G[edge[0]][edge[1]]['weight'] = 1

    G = G.to_undirected()

    G = node2vec.Graph(G, args.directed, args.p, args.q)

    G.preprocess_transition_probs()

    walks = G.simulate_walks(args.num_walks, args.walk_length)

    walks = [list(map(str, walk)) for walk in walks]

    # Translate each walk over MeSH heading ids into the corresponding mention words.
    new_walks = []
    node_set = set()  # note: collected but not used downstream

    for instance in walks:
        temp_list = []
        for node in instance:
            node_set.add(node)
            if node in mesh_dict:
                temp_list.append(mesh_dict[node])
        new_walks.append(temp_list)

    # gensim 2.3 API (see readme prerequisites); gensim >= 4 renames size to
    # vector_size and iter to epochs.
    model = FastText(MySentences(new_walks, args.input_corpus), size=args.dimensions,
                     window=args.windows, min_count=args.min_count, workers=args.workers,
                     sg=args.sg, iter=args.iter)

    model.save(args.output_model)

    print(model)

    model.wv.save_word2vec_format(args.output_bin, binary=True)

if __name__ == "__main__":
    args = parse_args()
    main(args)

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
PUBLIC DOMAIN NOTICE
National Center for Biotechnology Information

This software/database is a "United States Government Work" under the terms of
the United States Copyright Act. It was written as part of the author's
official duties as a United States Government employee and thus cannot be
copyrighted. This software/database is freely available to the public for use.
The National Library of Medicine and the U.S. Government have not placed any
restriction on its use or reproduction.

Although all reasonable efforts have been taken to ensure the accuracy and
reliability of the software and data, the NLM and the U.S. Government do not and
cannot warrant the performance or results that may be obtained by using this
software or data. The NLM and the U.S. Government disclaim all warranties,
express or implied, including warranties of performance, merchantability or
fitness for any particular purpose.
--------------------------------------------------------------------------------
/data/MeSH_dic.pkl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/BioWordVec/0826267fdfdf87412baa3e1cac2225311434b0ab/data/MeSH_dic.pkl.gz

--------------------------------------------------------------------------------
/node2vec.py:
--------------------------------------------------------------------------------
'''
Node2vec implementation by Aditya Grover
For more details, refer to the paper:
node2vec: Scalable Feature Learning for Networks
Aditya Grover and Jure Leskovec
Knowledge Discovery and Data Mining (KDD), 2016
'''

import numpy as np
import networkx as nx
import random

class Graph:
    def __init__(self, nx_G, is_directed, p, q):
        self.G = nx_G
        self.is_directed = is_directed
        self.p = p  # return hyperparameter
        self.q = q  # in-out hyperparameter

    def node2vec_walk(self, walk_length, start_node):
        '''
        Simulate a random walk starting from start node.
        '''
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = sorted(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    prev = walk[-2]
                    next_node = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
                                                    alias_edges[(prev, cur)][1])]
                    walk.append(next_node)
            else:
                break

        return walk

    def simulate_walks(self, num_walks, walk_length):
        '''
        Repeatedly simulate random walks from each node.
        '''
        G = self.G
        walks = []
        nodes = list(G.nodes())
        print('Walk iteration:')
        for walk_iter in range(num_walks):
            print(str(walk_iter + 1), '/', str(num_walks))
            random.shuffle(nodes)
            for node in nodes:
                walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))

        return walks

    def get_alias_edge(self, src, dst):
        '''
        Get the alias edge setup lists for a given edge.
        '''
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for dst_nbr in sorted(G.neighbors(dst)):
            if dst_nbr == src:
                unnormalized_probs.append(G[dst][dst_nbr]['weight']/p)
            elif G.has_edge(dst_nbr, src):
                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
            else:
                unnormalized_probs.append(G[dst][dst_nbr]['weight']/q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]

        return alias_setup(normalized_probs)

    def preprocess_transition_probs(self):
        '''
        Preprocessing of transition probabilities for guiding the random walks.
        '''
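        # Per-node alias tables handle the first step of each walk (plain
        # edge-weight sampling); per-edge alias tables handle every later
        # step, where a step from v back to the previous node t is reweighted
        # by 1/p, a step to a common neighbour of t and v by 1, and a step
        # further away by 1/q (see get_alias_edge above).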
        G = self.G
        is_directed = self.is_directed

        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = alias_setup(normalized_probs)

        alias_edges = {}

        if is_directed:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
        else:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
                alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges

        return


def alias_setup(probs):
    '''
    Compute utility lists for non-uniform sampling from discrete distributions.
    Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
    for details.
    '''
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=int)

    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K*prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()

        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)

    return J, q

def alias_draw(J, q):
    '''
    Draw sample from a non-uniform discrete distribution using alias sampling.
    '''
    K = len(J)

    kk = int(np.floor(np.random.rand()*K))
    if np.random.rand() < q[kk]:
        return kk
    else:
        return J[kk]

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# BioWordVec: Improving Biomedical Word Embeddings with Subword Information and MeSH #
This source code is a demo implementation of the method described in the paper "BioWordVec: Improving Biomedical Word Embeddings with Subword Information and MeSH." It is research software, provided as is without express or implied warranties; see LICENSE.txt for details. We have tried to make it reasonably usable and provided help options, but adapting the system to new environments or transforming a corpus to the format used by the system may require significant effort.

## Data files ##
MeSH_graph.edgelist is the MeSH main-heading graph file. MeSH_dic.pkl.gz is used to align the MeSH heading ids with their mention words. The PubMed corpus and MeSH RDF data can be downloaded from NCBI.

## Prerequisites ##
- python 3.5
- networkx 1.11
- gensim 2.3

## Usage ##

Users can run BioWordVec.py to automatically learn biomedical word embeddings from the PubMed text corpus and MeSH data, e.g. `python BioWordVec.py --input_corpus ./data/pubmed_sample --input_mesh ./data/MeSH_graph.edgelist --input_dic ./data/MeSH_dic.pkl.gz` (see also the loading example at the end of the next section).

## Pre-trained word embedding ##

We created two specialized, task-dependent sets of word embeddings, "Bio-embedding-intrinsic" and "Bio-embedding-extrinsic", by setting the context window size to 20 and 5, respectively. The pre-trained BioWordVec data are freely available on [Figshare](https://doi.org/10.6084/m9.figshare.6882647). "Bio-embedding-intrinsic" is for intrinsic tasks and is used to calculate or predict semantic similarity between words, terms or sentences. "Bio_embedding_extrinsic" is for extrinsic tasks and is used as the input for various downstream NLP tasks, such as relation extraction or text classification. Both sets are in binary format and contain 2,324,849 distinct words in total. All words were converted to lowercase and the number of dimensions is 200.
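
As a minimal sketch (the file name below follows the argparse defaults in BioWordVec.py, and the query words are only illustrative), both the script's .bin output and the pre-trained embeddings can be loaded with gensim:

```python
from gensim.models import KeyedVectors

# Both the training output (e.g. pubmed_mesh_test.bin) and the downloaded
# "Bio-embedding-intrinsic" / "Bio_embedding_extrinsic" files are stored in
# word2vec binary format.
vectors = KeyedVectors.load_word2vec_format('pubmed_mesh_test.bin', binary=True)

# All words were lowercased during training.
print(vectors.similarity('diabetes', 'insulin'))
print(vectors.most_similar('diabetes', topn=5))
```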
"Bio-embedding-intrinsic" is for intrinsic tasks and used to calculate or predict semantic similarity between words, terms or sentences. "Bio_embedding_extrinsic" is for extrinsic tasks and used as the input for various downstream NLP tasks, such as relation extraction or text classification. Both sets are in binary format and contain 2,324,849 distinct words in total. All words were converted to lowercase and the number of dimensions is 200. 19 | 20 | We used [UMNSRS](http://rxinformatics.umn.edu/SemanticRelatednessResources.html) datasets to evaluate the pre-trained word embeddings on medical word pair similarity. 21 | 22 | | Word embeddings |UMNSRS-Sim (Pearson score) | UMNSRS-Sim (Spearman score) | UMNSRS-Rel (Pearson score) |UMNSRS-Rel (Pearson score) | 23 | | ------------- |:-------------:| -----:|:-------------:| -----:| 24 | |[Pyysalo et al.](http://http://evexdb.org/pmresources/vec-space-models/) | 0.662 | 0.652 |0.600 | 0.601 | 25 | |[Chiu et al.](http://github.com/cambridgeltl/BioNLP-2016) | 0.665 | 0.654|0.608 | 0.607| 26 | | BioWordVec (win20) | 0.667 | 0.657 |0.619 | 0.617 | 27 | 28 | We also used [BioCreative/OHNLP STS](https://sites.google.com/view/ohnlp2018/home) dataset to evaluate the pre-trained word embeddings on clinical sentence pair similarity. 29 | 30 | | Similarity measures |[Pyysalo et al.](http://http://evexdb.org/pmresources/vec-space-models/) | [Chiu et al.](http://github.com/cambridgeltl/BioNLP-2016) | BioWordVec (win20) | 31 | | ------------- |:-------------:| -----:|:-------------:| 32 | |Cosine | 0.755| 0.757 |0.771 | 33 | |Euclidean | 0.723 | 0.727|0.753 | 34 | | Block | 0.722 |0.727 | 0.752 | 35 | 36 | User can find more usage notes in our paper. 37 | 38 | ## References 39 | When using some of our pre-trained models for your application, please cite the following paper: 40 | 41 | Zhang Y, Chen Q, Yang Z, Lin H, Lu Z. [BioWordVec, improving biomedical word embeddings with subword information and MeSH](https://www.nature.com/articles/s41597-019-0055-0). Scientific Data. 2019. 42 | 43 | ## List of Contributors ## 44 | Yijia Zhang, Qingyu Chen, Zhihao Yang, Hongfei Lin and Zhiyong Lu 45 | 46 | ## Acknowledgments ## 47 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of Medicine. We are grateful to the authors of fastText, Node2vec and UMNSRS for making their software and data publicly available. 48 | > 49 | > 50 | --------------------------------------------------------------------------------