├── BioWordVec.py
├── LICENSE.txt
├── data
│   ├── MeSH_dic.pkl.gz
│   ├── MeSH_graph.edgelist
│   └── pubmed_sample
├── node2vec.py
└── readme.md

/BioWordVec.py:
--------------------------------------------------------------------------------
'''
Reference implementation for jointly learning word vectors from biomedical text and MeSH knowledge.
The implementation is based on fastText and node2vec.
'''
#coding=utf-8

import argparse
import networkx as nx
import node2vec
from gensim.models import FastText
import random
import gzip
import pickle as pkl


def parse_args():

    parser = argparse.ArgumentParser()

    parser.add_argument('--input_corpus', nargs='?', default='./data/pubmed_sample',
                        help='Input biomedical corpus')

    parser.add_argument('--input_mesh', nargs='?', default='./data/MeSH_graph.edgelist',
                        help='Input MeSH knowledge graph')

    parser.add_argument('--input_dic', nargs='?', default='./data/MeSH_dic.pkl.gz',
                        help='Input MeSH dictionary')

    parser.add_argument('--output_model', nargs='?', default='./pubmed_mesh_test',
                        help='Output word vector model file')

    parser.add_argument('--output_bin', nargs='?', default='./pubmed_mesh_test.bin',
                        help='Output word vector bin file')

    parser.add_argument('--dimensions', type=int, default=200,
                        help='Number of dimensions. Default is 200.')

    parser.add_argument('--walk-length', type=int, default=50,
                        help='Length of walk per source. Default is 50.')

    parser.add_argument('--num-walks', type=int, default=2,
                        help='Number of walks per source. Default is 2.')

    parser.add_argument('--windows', type=int, default=5,
                        help='Context size for optimization. Default is 5.')

    parser.add_argument('--iter', default=5, type=int,
                        help='Number of epochs in SGD. Default is 5.')

    parser.add_argument('--min_count', default=5, type=int,
                        help='Ignore all words with total frequency lower than this. Default is 5.')

    parser.add_argument('--sg', default=1, type=int,
                        help='If 1, skip-gram is used; otherwise, CBOW. Default is 1.')

    parser.add_argument('--workers', type=int, default=8,
                        help='Number of parallel workers. Default is 8.')

    parser.add_argument('--p', type=float, default=2,
                        help='Return hyperparameter. Default is 2.')

    parser.add_argument('--q', type=float, default=1,
                        help='In-out hyperparameter. Default is 1.')

    parser.add_argument('--directed', dest='directed', action='store_true',
                        help='Graph is (un)directed. Default is undirected.')
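
    # --directed and --undirected are a complementary pair writing to the
    # same 'directed' flag; set_defaults below makes undirected the default.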
    parser.add_argument('--undirected', dest='directed', action='store_false')
    parser.set_defaults(directed=False)

    return parser.parse_args()

class MySentences(object):
    def __init__(self, mesh_list, pubmed_file):
        self.mesh_list = mesh_list
        self.pubmed_file = pubmed_file
    def __iter__(self):
        # First yield the MeSH term sequences derived from the graph walks,
        for instance in self.mesh_list:
            yield instance
        # then stream the pre-tokenized sentences of the PubMed corpus.
        with open(self.pubmed_file, 'r') as f:
            for line in f:
                yield line.split()


def main(args):
    f_pkl = gzip.open(args.input_dic, 'rb')
    mesh_dict = pkl.load(f_pkl)
    f_pkl.close()

    # Build the MeSH main-heading graph and give every edge unit weight.
    G = nx.read_edgelist(args.input_mesh, nodetype=str, create_using=nx.DiGraph())
    for edge in G.edges():
        G[edge[0]][edge[1]]['weight'] = 1

    G = G.to_undirected()

    G = node2vec.Graph(G, args.directed, args.p, args.q)

    G.preprocess_transition_probs()

    walks = G.simulate_walks(args.num_walks, args.walk_length)

    walks = [list(map(str, walk)) for walk in walks]

    # Translate each walk over MeSH heading ids into the corresponding mention words.
    new_walks = []
    node_set = set()  # note: collected but not used downstream

    for instance in walks:
        temp_list = []
        for node in instance:
            node_set.add(node)
            if node in mesh_dict:
                temp_list.append(mesh_dict[node])
        new_walks.append(temp_list)

    # gensim 2.3 API (see readme prerequisites); gensim >= 4 renames size to
    # vector_size and iter to epochs.
    model = FastText(MySentences(new_walks, args.input_corpus), size=args.dimensions,
                     window=args.windows, min_count=args.min_count, workers=args.workers,
                     sg=args.sg, iter=args.iter)

    model.save(args.output_model)

    print(model)

    model.wv.save_word2vec_format(args.output_bin, binary=True)

if __name__ == "__main__":
    args = parse_args()
    main(args)

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
PUBLIC DOMAIN NOTICE
National Center for Biotechnology Information

This software/database is a "United States Government Work" under the terms of
the United States Copyright Act. It was written as part of the author's
official duties as a United States Government employee and thus cannot be
copyrighted. This software/database is freely available to the public for use.
The National Library of Medicine and the U.S. Government have not placed any
restriction on its use or reproduction.

Although all reasonable efforts have been taken to ensure the accuracy and
reliability of the software and data, the NLM and the U.S. Government do not and
cannot warrant the performance or results that may be obtained by using this
software or data. The NLM and the U.S. Government disclaim all warranties,
express or implied, including warranties of performance, merchantability or
fitness for any particular purpose.
--------------------------------------------------------------------------------
/data/MeSH_dic.pkl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncbi-nlp/BioWordVec/0826267fdfdf87412baa3e1cac2225311434b0ab/data/MeSH_dic.pkl.gz

--------------------------------------------------------------------------------
/node2vec.py:
--------------------------------------------------------------------------------
'''
Node2vec implementation by Aditya Grover
For more details, refer to the paper:
node2vec: Scalable Feature Learning for Networks
Aditya Grover and Jure Leskovec
Knowledge Discovery and Data Mining (KDD), 2016
'''

import numpy as np
import networkx as nx
import random

class Graph:
    def __init__(self, nx_G, is_directed, p, q):
        self.G = nx_G
        self.is_directed = is_directed
        self.p = p  # return hyperparameter
        self.q = q  # in-out hyperparameter

    def node2vec_walk(self, walk_length, start_node):
        '''
        Simulate a random walk starting from start node.
        '''
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = sorted(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    prev = walk[-2]
                    next_node = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
                                                    alias_edges[(prev, cur)][1])]
                    walk.append(next_node)
            else:
                break

        return walk

    def simulate_walks(self, num_walks, walk_length):
        '''
        Repeatedly simulate random walks from each node.
        '''
        G = self.G
        walks = []
        nodes = list(G.nodes())
        print('Walk iteration:')
        for walk_iter in range(num_walks):
            print(str(walk_iter + 1), '/', str(num_walks))
            random.shuffle(nodes)
            for node in nodes:
                walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))

        return walks

    def get_alias_edge(self, src, dst):
        '''
        Get the alias edge setup lists for a given edge.
        '''
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for dst_nbr in sorted(G.neighbors(dst)):
            if dst_nbr == src:
                unnormalized_probs.append(G[dst][dst_nbr]['weight']/p)
            elif G.has_edge(dst_nbr, src):
                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
            else:
                unnormalized_probs.append(G[dst][dst_nbr]['weight']/q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]

        return alias_setup(normalized_probs)

    def preprocess_transition_probs(self):
        '''
        Preprocessing of transition probabilities for guiding the random walks.
        '''
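        # Per-node alias tables handle the first step of each walk (plain
        # edge-weight sampling); per-edge alias tables handle every later
        # step, where a step from v back to the previous node t is reweighted
        # by 1/p, a step to a common neighbour of t and v by 1, and a step
        # further away by 1/q (see get_alias_edge above).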
        G = self.G
        is_directed = self.is_directed

        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = alias_setup(normalized_probs)

        alias_edges = {}

        if is_directed:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
        else:
            for edge in G.edges():
                alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
                alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges

        return


def alias_setup(probs):
    '''
    Compute utility lists for non-uniform sampling from discrete distributions.
    Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
    for details.
    '''
    K = len(probs)
    q = np.zeros(K)
    J = np.zeros(K, dtype=int)

    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K*prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()

        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)

    return J, q

def alias_draw(J, q):
    '''
    Draw sample from a non-uniform discrete distribution using alias sampling.
    '''
    K = len(J)

    kk = int(np.floor(np.random.rand()*K))
    if np.random.rand() < q[kk]:
        return kk
    else:
        return J[kk]

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# BioWordVec: Improving Biomedical Word Embeddings with Subword Information and MeSH #
This source code is a demo implementation of the method described in the paper "BioWordVec: Improving Biomedical Word Embeddings with Subword Information and MeSH." It is research software, provided as is without express or implied warranties; see LICENSE.txt for details. We have tried to make it reasonably usable and provided help options, but adapting the system to new environments or transforming a corpus to the format used by the system may require significant effort.

## Data files ##
MeSH_graph.edgelist is the MeSH main-heading graph file. MeSH_dic.pkl.gz is used to align the MeSH heading ids with their mention words. The PubMed corpus and MeSH RDF data can be downloaded from NCBI.

## Prerequisites ##
- python 3.5
- networkx 1.11
- gensim 2.3

## Usage ##

Users can run BioWordVec.py to automatically learn biomedical word embeddings from the PubMed text corpus and MeSH data, e.g. `python BioWordVec.py --input_corpus ./data/pubmed_sample --input_mesh ./data/MeSH_graph.edgelist --input_dic ./data/MeSH_dic.pkl.gz` (see also the loading example at the end of the next section).

## Pre-trained word embedding ##

We created two specialized, task-dependent sets of word embeddings, "Bio-embedding-intrinsic" and "Bio-embedding-extrinsic", by setting the context window size to 20 and 5, respectively. The pre-trained BioWordVec data are freely available on [Figshare](https://doi.org/10.6084/m9.figshare.6882647). "Bio-embedding-intrinsic" is for intrinsic tasks and is used to calculate or predict semantic similarity between words, terms or sentences. "Bio_embedding_extrinsic" is for extrinsic tasks and is used as the input for various downstream NLP tasks, such as relation extraction or text classification. Both sets are in binary format and contain 2,324,849 distinct words in total. All words were converted to lowercase and the number of dimensions is 200.
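
As a minimal sketch (the file name below follows the argparse defaults in BioWordVec.py, and the query words are only illustrative), both the script's .bin output and the pre-trained embeddings can be loaded with gensim:

```python
from gensim.models import KeyedVectors

# Both the training output (e.g. pubmed_mesh_test.bin) and the downloaded
# "Bio-embedding-intrinsic" / "Bio_embedding_extrinsic" files are stored in
# word2vec binary format.
vectors = KeyedVectors.load_word2vec_format('pubmed_mesh_test.bin', binary=True)

# All words were lowercased during training.
print(vectors.similarity('diabetes', 'insulin'))
print(vectors.most_similar('diabetes', topn=5))
```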
"Bio-embedding-intrinsic" is for intrinsic tasks and used to calculate or predict semantic similarity between words, terms or sentences. "Bio_embedding_extrinsic" is for extrinsic tasks and used as the input for various downstream NLP tasks, such as relation extraction or text classification. Both sets are in binary format and contain 2,324,849 distinct words in total. All words were converted to lowercase and the number of dimensions is 200. 19 | 20 | We used [UMNSRS](http://rxinformatics.umn.edu/SemanticRelatednessResources.html) datasets to evaluate the pre-trained word embeddings on medical word pair similarity. 21 | 22 | | Word embeddings |UMNSRS-Sim (Pearson score) | UMNSRS-Sim (Spearman score) | UMNSRS-Rel (Pearson score) |UMNSRS-Rel (Pearson score) | 23 | | ------------- |:-------------:| -----:|:-------------:| -----:| 24 | |[Pyysalo et al.](http://http://evexdb.org/pmresources/vec-space-models/) | 0.662 | 0.652 |0.600 | 0.601 | 25 | |[Chiu et al.](http://github.com/cambridgeltl/BioNLP-2016) | 0.665 | 0.654|0.608 | 0.607| 26 | | BioWordVec (win20) | 0.667 | 0.657 |0.619 | 0.617 | 27 | 28 | We also used [BioCreative/OHNLP STS](https://sites.google.com/view/ohnlp2018/home) dataset to evaluate the pre-trained word embeddings on clinical sentence pair similarity. 29 | 30 | | Similarity measures |[Pyysalo et al.](http://http://evexdb.org/pmresources/vec-space-models/) | [Chiu et al.](http://github.com/cambridgeltl/BioNLP-2016) | BioWordVec (win20) | 31 | | ------------- |:-------------:| -----:|:-------------:| 32 | |Cosine | 0.755| 0.757 |0.771 | 33 | |Euclidean | 0.723 | 0.727|0.753 | 34 | | Block | 0.722 |0.727 | 0.752 | 35 | 36 | User can find more usage notes in our paper. 37 | 38 | ## References 39 | When using some of our pre-trained models for your application, please cite the following paper: 40 | 41 | Zhang Y, Chen Q, Yang Z, Lin H, Lu Z. [BioWordVec, improving biomedical word embeddings with subword information and MeSH](https://www.nature.com/articles/s41597-019-0055-0). Scientific Data. 2019. 42 | 43 | ## List of Contributors ## 44 | Yijia Zhang, Qingyu Chen, Zhihao Yang, Hongfei Lin and Zhiyong Lu 45 | 46 | ## Acknowledgments ## 47 | This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of Medicine. We are grateful to the authors of fastText, Node2vec and UMNSRS for making their software and data publicly available. 48 | > 49 | > 50 | --------------------------------------------------------------------------------