├── LICENSE
├── QuickUMLS
├── README.md
├── __pycache__
│   ├── config.cpython-37.pyc
│   └── config.cpython-38.pyc
├── augment4class.py
├── augment4ner.py
├── config.py
├── example
│   ├── example4class.txt
│   ├── example4class.txt.augmented
│   ├── example4ner.conll
│   └── example4ner.conll.augmented
└── src
    ├── Authentication.py
    ├── QuickUMLS
    ├── __pycache__
    │   ├── Authentication.cpython-36.pyc
    │   ├── Authentication.cpython-38.pyc
    │   ├── eda.cpython-36.pyc
    │   ├── eda.cpython-37.pyc
    │   ├── eda.cpython-38.pyc
    │   └── eda4ner.cpython-38.pyc
    ├── eda.py
    ├── eda4ner.py
    └── retrieve-cui-or-code.py

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 WengLab-InformaticsResearch
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/QuickUMLS:
--------------------------------------------------------------------------------
1 | /home/tk2624/tools/QuickUMLS-master
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # UMLS-EDA :octocat:
2 | **:tada: A lightweight UMLS-based data augmentation tool for biomedical NLP tasks, including Named Entity Recognition and sentence classification :tada:**
3 | 
4 | * Citation: [Kang, T., Perotte, A., Tang, Y., Ta, C., & Weng, C. (2020). *UMLS-based data augmentation for natural language processing of clinical research literature*. Journal of the American Medical Informatics Association.](https://academic.oup.com/jamia/advance-article/doi/10.1093/jamia/ocaa309/6046153)
5 | * Author: [Tian Kang](http://www.tiankangnlp.com) (tk2624@cumc.columbia.edu)
6 | * Affiliation: Department of Biomedical Informatics, Columbia University ([Dr. Chunhua Weng](http://people.dbmi.columbia.edu/~chw7007/)'s lab)
7 | * Built upon [EDA (Easy Data Augmentation)](https://arxiv.org/abs/1901.11196)
8 | 
9 | 
10 | ## User Guide
11 | 
12 | ### 0. Before you start
13 | 1) Install [UMLS](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) and [QuickUMLS](https://github.com/Georgetown-IR-Lab/QuickUMLS) locally
14 | 2) Get your **UMLS API key** from the UTS ‘My Profile’ area after signing in to the [UMLS Terminology Services](https://uts.nlm.nih.gov/home.html)
15 | 3) Add your API key and QuickUMLS directory to `config.py`
16 | 4) Customize the other variables in `config.py` as needed
17 | 
18 | ### 1. Named Entity Recognition
19 | 
20 | * **Input**: CoNLL-format file
21 | * **Usage**:
22 | ```
23 | python augment4ner.py [-h] --input INPUT [--output OUTPUT] [--num_aug NUM_AUG] [--alpha ALPHA]
24 | ```
25 | 
26 | ### 2. Sentence Classification
27 | 
28 | * **Input**: "|"-separated file (`index|label|sentence text`)
29 | * **Usage**:
30 | ```
31 | python augment4class.py [-h] --input INPUT [--output OUTPUT] [--num_aug NUM_AUG] [--alpha ALPHA]
32 | ```
33 | 
34 | See `example/example4ner.conll` and `example/example4class.txt` for sample inputs.
35 | 
36 | 
37 | ## Reference
38 | * Wei, J. and Zou, K., 2019. [EDA: Easy data augmentation techniques for boosting performance on text classification tasks](https://arxiv.org/abs/1901.11196). arXiv preprint arXiv:1901.11196. ([GitHub repo](https://github.com/jasonwei20/eda_nlp.git))
--------------------------------------------------------------------------------
/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/__pycache__/config.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/config.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/__pycache__/config.cpython-38.pyc
--------------------------------------------------------------------------------
/augment4class.py:
--------------------------------------------------------------------------------
1 | # Easy data augmentation techniques for text classification
2 | # Jason Wei and Kai Zou
3 | 
4 | from src.eda import *
5 | 
6 | # arguments to be parsed from the command line
7 | import argparse
8 | ap = argparse.ArgumentParser()
9 | ap.add_argument("--input", required=True, type=str, help="input file of unaugmented data")
10 | ap.add_argument("--output", required=False, type=str, help="output file of augmented data")
11 | ap.add_argument("--num_aug", required=False, type=int, help="number of augmented sentences per original sentence")
12 | ap.add_argument("--alpha", required=False, type=float, help="percent of words in each sentence to be changed")
13 | args = ap.parse_args()
14 | from config import Config
15 | config = Config()
16 | apikey = config.apikey
17 | # the output file
18 | output = None
19 | if args.output:
20 |     output = args.output
21 | else:
22 |     from os.path import dirname, basename, join
23 |     output = join(dirname(args.input), 'eda_' + basename(args.input))
24 | 
25 | # number of augmented sentences to generate per original sentence
26 | num_aug = 9  # default
27 | if args.num_aug:
28 |     num_aug = args.num_aug
29 | 
30 | # how much to change each sentence
31 | alpha = 0.3  # default
32 | if args.alpha:
33 |     alpha = args.alpha
34 | import re, time
35 | # generate more data with standard augmentation
36 | def gen_eda(train_orig, output_file, alpha, num_aug=9):
37 | 
38 |     writer = open(output_file, 'w')
39 |     lines = open(train_orig, 'r').readlines()
40 |     cur_time = time.time()
41 |     for i, line in enumerate(lines):
42 |         if re.search("^##|^\s+$", line):
43 |             writer.write(line)
44 |             continue
45 |         parts = line.rstrip().split('|')
46 |         label = parts[1]
47 |         sentence = parts[2]
48 |         aug_sentences = eda(sentence, apikey=apikey, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, alpha_umls=alpha, num_aug=num_aug)
49 |         for aug_sentence in
aug_sentences: 50 | writer.write(parts[0]+"|"+label + "|" + aug_sentence + '\n') 51 | if i%10 ==0: 52 | cost = time.time()-cur_time 53 | minute=cost//60 54 | print ("......10 instance cost:",minute,"min, or",cost,"sec.\n") 55 | cur_time = time.time() 56 | writer.close() 57 | print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug)) 58 | 59 | 60 | # generate augmentation for bert relation extraction data 61 | def gen_eda_for_re(train_orig, output_file, alpha, num_aug=9): 62 | writer = open(output_file, 'w') 63 | lines = open(train_orig, 'r').readlines() 64 | cur_time= time.time() 65 | for i, line in enumerate(lines): 66 | 67 | if re.search("^index",line): 68 | writer.write(line) 69 | continue 70 | 71 | parts = line.rstrip().split('\t') 72 | index = parts[0] 73 | sentence = parts[1] 74 | label= parts[2] 75 | aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha,alpha_umls=alpha, num_aug=num_aug,task = "re") 76 | for aug_sentence in aug_sentences: 77 | 78 | writer.write(index+"\t"+aug_sentence + "\t" + label + '\n') 79 | if i%10 ==0: 80 | cost = time.time()-cur_time 81 | minute=cost//60 82 | cur_time = time.time() 83 | writer.close() 84 | print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug)) 85 | 86 | 87 | #main function 88 | if __name__ == "__main__": 89 | 90 | #generate augmented sentences and output into a new file 91 | gen_eda(args.input, output, alpha=alpha, num_aug=num_aug) 92 | -------------------------------------------------------------------------------- /augment4ner.py: -------------------------------------------------------------------------------- 1 | # Easy data augmentation techniques for text classification 2 | # Jason Wei and Kai Zou 3 | 4 | from src.eda4ner import * 5 | 6 | #arguments to be parsed from command line 7 | import argparse,codecs 8 | ap = argparse.ArgumentParser() 9 | ap.add_argument("--input", required=True, type=str, help="input file of unaugmented data") 10 | ap.add_argument("--output", required=False, type=str, help="output file of unaugmented data") 11 | ap.add_argument("--num_aug", required=False, type=int, help="number of augmented sentences per original sentence") 12 | ap.add_argument("--alpha", required=False, type=float, help="percent of words in each sentence to be changed") 13 | args = ap.parse_args() 14 | from config import Config 15 | config = Config() 16 | apikey = config.apikey 17 | 18 | #the output file 19 | output = None 20 | if args.output: 21 | output = args.output 22 | else: 23 | from os.path import dirname, basename, join 24 | output = join(dirname(args.input), 'eda_' + basename(args.input)) 25 | 26 | #number of augmented sentences to generate per original sentence 27 | num_aug = 10 #default 28 | if args.num_aug: 29 | num_aug = args.num_aug 30 | 31 | #how much to change each sentence 32 | alpha = 0.3#default 33 | if args.alpha: 34 | alpha = args.alpha 35 | 36 | # read conll file by sentence 37 | def delimited(file, delimiter = '\n', bufsize = 4096): 38 | buf = '' 39 | while True: 40 | newbuf= file.read(bufsize) 41 | if not newbuf: 42 | yield buf 43 | return 44 | buf +=newbuf 45 | lines =buf.split(delimiter) 46 | for line in lines[:-1]: 47 | yield line 48 | buf = lines[-1] 49 | 50 | #generate more data with standard augmentation 51 | def gen_eda(train_orig, output_file, alpha, num_aug=9): 52 | 53 | writer = open(output_file, 'w') 54 | #lines = open(train_orig, 'r').readlines() 
55 | ann_infile = codecs.open(train_orig,'r') 56 | lines = delimited(ann_infile,"\n\n",bufsize = 1) 57 | 58 | for i, line in enumerate(lines): 59 | #parts = line[:-1].split('\t') 60 | #label = parts[0] 61 | if i% 50 == 0: 62 | print (i,"lines finished...") 63 | info = line.rstrip().split("\n") 64 | 65 | if line.rstrip() =="": 66 | continue 67 | 68 | sent = [a.split("\t")[0] for a in info] 69 | label = [a.split("\t")[1] for a in info] 70 | sentence = " ".join(sent) 71 | #sentence = line.rstrip() 72 | aug_sentences = eda(sentence,label,apikey=apikey, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha,alpha_umls=alpha, num_aug=num_aug) 73 | #aug_sentences = eda(sentence,label,alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug, task ="re") 74 | for aug_sentence in aug_sentences: 75 | 76 | words = aug_sentence[0] 77 | labels = aug_sentence[1] 78 | for w,l in zip(words,labels): 79 | writer.write(w+"\t"+l+"\n") 80 | writer.write("\n") 81 | #writer.write(aug_sentence) 82 | #writer.write('\n') 83 | 84 | writer.close() 85 | 86 | print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug)) 87 | 88 | #main function 89 | if __name__ == "__main__": 90 | import time 91 | before = time.time() 92 | #generate augmented sentences and output into a new file 93 | gen_eda(args.input, output, alpha=alpha, num_aug=num_aug) 94 | cost = time.time()-before 95 | print (cost) 96 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # configuration for UMLS-EDA 2 | import os 3 | class Config(): 4 | 5 | QuickUMLS_git_dir = "tools/QuickUMLS-master" 6 | QuickUMLS_dir = "tools/QuickUMLS" # where your QuickUMLS data is intalled 7 | 8 | apikey = "" # your api key from NLM UMLS API service 9 | threshold = 0.8 10 | 11 | # set soft link to QuickUMLS 12 | if not os.path.exists("QuickUMLS"): 13 | command = "ln -s "+ QuickUMLS_git_dir + " QuickUMLS" 14 | os.system (command) 15 | 16 | 17 | -------------------------------------------------------------------------------- /example/example4class.txt: -------------------------------------------------------------------------------- 1 | ###9793223: 2 | OBJECTIVE|A|to compare the results of recording enamel opacities using the tf and modified dde indices 3 | OBJECTIVE|A|to compare the opaqueness results of recording enamel opacities proponent using the tf and modified change exponent dde indices 4 | OBJECTIVE|A|using compare the results of indices recording opacities tf the to and modified dde enamel 5 | OBJECTIVE|A|to compare the results of recording enamel opacities using the tf and modified dde indices 6 | DESIGN|M|enamel opacities central the maxillary on weeks were recorded two apart incisors 7 | DESIGN|M|record enamel opacities track record on the maxillary central record incisors were recorded two weeks apart 8 | DESIGN|M|enamel opacities on the maxillary central incisors were recorded two 9 | DESIGN|M|enamel opacities on the maxillary central incisors were recorded two weeks apart 10 | DESIGN|M|on the first occasion scoring was the criteria of tf and on the the used 11 | DESIGN|M|personify utilize on the first occasion scoring was according to the criteria of the personify tf index and affair apply on the second occasion the modified dde utilize index along was used 12 | -------------------------------------------------------------------------------- 
/example/example4class.txt.augmented: -------------------------------------------------------------------------------- 1 | ###9793223: 2 | -------------------------------------------------------------------------------- /example/example4ner.conll: -------------------------------------------------------------------------------- 1 | This O 2 | article O 3 | describes O 4 | a O 5 | first O 6 | attempt O 7 | to O 8 | investigate O 9 | the O 10 | reliability B-Outcome 11 | and O 12 | validity B-Outcome 13 | of O 14 | the O 15 | TOM B-Intervention 16 | test I-Intervention 17 | . O 18 | 19 | a O 20 | new O 21 | instrument O 22 | for O 23 | assessing O 24 | theory B-Outcome 25 | of I-Outcome 26 | mind I-Outcome 27 | ability I-Outcome 28 | in O 29 | normal B-Participant 30 | children I-Participant 31 | . O 32 | -------------------------------------------------------------------------------- /example/example4ner.conll.augmented: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/example/example4ner.conll.augmented -------------------------------------------------------------------------------- /src/Authentication.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ## 6/16/2017 - remove PyQuery dependency 3 | ## 5/19/2016 - update to allow for authentication based on api-key, rather than username/pw 4 | ## See https://documentation.uts.nlm.nih.gov/rest/authentication.html for full explanation 5 | 6 | import requests 7 | #from pyquery import PyQuery as pq 8 | import lxml.html as lh 9 | from lxml.html import fromstring 10 | 11 | uri="https://utslogin.nlm.nih.gov" 12 | #option 1 - username/pw authentication at /cas/v1/tickets 13 | #auth_endpoint = "/cas/v1/tickets/" 14 | #option 2 - api key authentication at /cas/v1/api-key 15 | auth_endpoint = "/cas/v1/api-key" 16 | 17 | class Authentication: 18 | 19 | #def __init__(self, username,password): 20 | def __init__(self, apikey): 21 | #self.username=username 22 | #self.password=password 23 | self.apikey=apikey 24 | self.service="http://umlsks.nlm.nih.gov" 25 | 26 | def gettgt(self): 27 | #params = {'username': self.username,'password': self.password} 28 | params = {'apikey': self.apikey} 29 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent":"python" } 30 | r = requests.post(uri+auth_endpoint,data=params,headers=h) 31 | response = fromstring(r.text) 32 | ## extract the entire URL needed from the HTML form (action attribute) returned - looks similar to https://utslogin.nlm.nih.gov/cas/v1/tickets/TGT-36471-aYqNLN2rFIJPXKzxwdTNC5ZT7z3B3cTAKfSc5ndHQcUxeaDOLN-cas 33 | ## we make a POST call to this URL in the getst method 34 | tgt = response.xpath('//form/@action')[0] 35 | return tgt 36 | 37 | def getst(self,tgt): 38 | 39 | params = {'service': self.service} 40 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent":"python" } 41 | r = requests.post(tgt,data=params,headers=h) 42 | st = r.text 43 | return st 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/QuickUMLS: -------------------------------------------------------------------------------- 1 | /home/tk2624/tools/QuickUMLS-master/ -------------------------------------------------------------------------------- /src/__pycache__/Authentication.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/src/__pycache__/Authentication.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/Authentication.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/src/__pycache__/Authentication.cpython-38.pyc -------------------------------------------------------------------------------- /src/__pycache__/eda.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/src/__pycache__/eda.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/eda.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/src/__pycache__/eda.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/eda.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/src/__pycache__/eda.cpython-38.pyc -------------------------------------------------------------------------------- /src/__pycache__/eda4ner.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WengLab-InformaticsResearch/UMLS-EDA/5e368fb2973c0919af038b4722e75556e9341fd7/src/__pycache__/eda4ner.cpython-38.pyc -------------------------------------------------------------------------------- /src/eda.py: -------------------------------------------------------------------------------- 1 | # Easy data augmentation techniques for text classification 2 | # Jason Wei and Kai Zou 3 | 4 | import random 5 | from random import shuffle 6 | random.seed(1) 7 | from config import Config 8 | config = Config() 9 | 10 | 11 | #stop words list 12 | stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 13 | 'ours', 'ourselves', 'you', 'your', 'yours', 14 | 'yourself', 'yourselves', 'he', 'him', 'his', 15 | 'himself', 'she', 'her', 'hers', 'herself', 16 | 'it', 'its', 'itself', 'they', 'them', 'their', 17 | 'theirs', 'themselves', 'what', 'which', 'who', 18 | 'whom', 'this', 'that', 'these', 'those', 'am', 19 | 'is', 'are', 'was', 'were', 'be', 'been', 'being', 20 | 'have', 'has', 'had', 'having', 'do', 'does', 'did', 21 | 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 22 | 'because', 'as', 'until', 'while', 'of', 'at', 23 | 'by', 'for', 'with', 'about', 'against', 'between', 24 | 'into', 'through', 'during', 'before', 'after', 25 | 'above', 'below', 'to', 'from', 'up', 'down', 'in', 26 | 'out', 'on', 'off', 'over', 'under', 'again', 27 | 'further', 'then', 'once', 'here', 'there', 'when', 28 | 'where', 'why', 'how', 'all', 'any', 'both', 'each', 29 | 'few', 'more', 'most', 'other', 'some', 'such', 'no', 30 | 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 31 | 'very', 's', 't', 'can', 'will', 'just', 'don', 32 | 'should', 'now', ''] 33 | 34 
| re_stop_words = [] 35 | #cleaning up text 36 | import re 37 | def get_only_chars(line): 38 | 39 | clean_line = "" 40 | 41 | line = line.replace("’", "") 42 | line = line.replace("'", "") 43 | line = line.replace("-", " ") #replace hyphens with spaces 44 | line = line.replace("\t", " ") 45 | line = line.replace("\n", " ") 46 | line = line.lower() 47 | 48 | for char in line: 49 | if char in 'qwertyuiopasdfghjklzxcvbnm ': 50 | clean_line += char 51 | else: 52 | clean_line += ' ' 53 | 54 | clean_line = re.sub(' +',' ',clean_line) #delete extra spaces 55 | if clean_line == "": 56 | return "" 57 | if clean_line[0] == ' ': 58 | clean_line = clean_line[1:] 59 | return clean_line 60 | 61 | 62 | #***************************** 63 | # UMLS Synonyms replacement # 64 | #***************************** 65 | 66 | 67 | #from QuickUMLS.quickumls import QuickUMLS 68 | matcher = None#QuickUMLS(config.QuickUMLS_dir, threshold=config.threshold, similarity_name ='cosine')#,overlapping_criteria='length') 69 | 70 | def get_umls_tagging(text,matcher): 71 | info = matcher.match(text, best_match=True, ignore_syntax=False) 72 | taggings=[] 73 | if len(info) == 0: 74 | return None 75 | for one_c in info: 76 | 77 | one_c = one_c[0] 78 | 79 | result = {"cui":one_c["cui"],"term":one_c["term"]} 80 | taggings.append(one_c) 81 | return taggings 82 | 83 | from src.Authentication import * 84 | import requests 85 | import json 86 | 87 | def get_atoms(apikey,cui): 88 | AuthClient = Authentication(apikey) 89 | tgt = AuthClient.gettgt() 90 | uri = "https://uts-ws.nlm.nih.gov" 91 | content_endpoint = "/rest/content/2019AB"+"/CUI/"+str(cui)+"/atoms" 92 | query = {'ticket':AuthClient.getst(tgt),'language':'ENG','pageSize':200} 93 | headers = {"Range": "bytes=0-1"} 94 | r = requests.get(uri+content_endpoint,params=query,headers=headers) 95 | r.encoding = 'utf-8' 96 | items = json.loads(r.text) 97 | jsonData = items["result"] 98 | atoms =[i["name"] for i in jsonData if not re.search("[,;\-\(\)\.\/]",i['name']) and not re.search("NOS",i['name']) ]# remove sysnonyms with punctuations, or NOS 99 | atoms = list(set(atoms)) 100 | return atoms 101 | #get_atoms(apikey,'C0006142') 102 | 103 | def umls_replacement(words,n,apikey,task = "sent"): 104 | sent = " ".join(words) 105 | 106 | augumented_sents = [] 107 | taggings = get_umls_tagging(sent, matcher) 108 | num_replaced = 0 109 | total_num = 1 110 | if taggings is None: 111 | return augumented_sents 112 | atoms_set = [] 113 | cui_set = [] 114 | for i in taggings: 115 | cui = i["cui"] 116 | if cui in cui_set: 117 | continue 118 | cui_set.append(cui) 119 | try: 120 | atoms_raw = get_atoms(apikey,cui) 121 | except: 122 | continue 123 | #remove synnonyms with only case differences 124 | atoms = [a for a in atoms_raw if not a.lower() == i['term'].lower() and not a.lower()+"s" ==i['term'].lower() and not a.lower() == i['term'].lower()+"s" ] 125 | if i['term'] in atoms: 126 | atoms.remove(i['term']) 127 | 128 | if len(atoms)>0: 129 | atoms_set.append({i['term']:atoms}) 130 | total_num=total_num*len(atoms) 131 | max_num = min(n, total_num) 132 | for _ in range(max_num): 133 | new_sent = sent 134 | for i in range(len(atoms_set)): 135 | atom = atoms_set[i] 136 | term = list(atom.keys())[0] 137 | atoms = atom[term] 138 | synonym = random.choice(atoms) 139 | new_sent = re.sub("[ |^]"+term+"[ |$]"," "+synonym+" ",new_sent) 140 | #atoms.remove(synonym) 141 | #atoms_set[i] = {term:atoms} 142 | augumented_sents.append(new_sent) 143 | return list(set(augumented_sents)) 144 | 145 | 
######################################################################## 146 | # Synonym replacement 147 | # Replace n words in the sentence with synonyms from wordnet 148 | ######################################################################## 149 | 150 | #for the first time you use wordnet 151 | #import nltk 152 | #nltk.download('wordnet') 153 | from nltk.corpus import wordnet 154 | 155 | def synonym_replacement(words, n,task = "sentence"): 156 | new_words = words.copy() 157 | random_word_list = list(set([word for word in words if word not in stop_words])) 158 | 159 | if task == "re": 160 | random_word_list = list(set([word for word in words if not re.search("^@\w+\$$",word)])) 161 | 162 | random.shuffle(random_word_list) 163 | num_replaced = 0 164 | for random_word in random_word_list: 165 | synonyms = get_synonyms(random_word) 166 | if len(synonyms) >= 1: 167 | synonym = random.choice(list(synonyms)) 168 | new_words = [synonym if word == random_word else word for word in new_words] 169 | num_replaced += 1 170 | if num_replaced >= n: #only replace up to n words 171 | break 172 | 173 | #this is stupid but we need it, trust me 174 | sentence = ' '.join(new_words) 175 | new_words = sentence.split(' ') 176 | 177 | return new_words 178 | 179 | def get_synonyms(word): 180 | synonyms = set() 181 | for syn in wordnet.synsets(word): 182 | for l in syn.lemmas(): 183 | synonym = l.name().replace("_", " ").replace("-", " ").lower() 184 | synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm']) 185 | synonyms.add(synonym) 186 | if word in synonyms: 187 | synonyms.remove(word) 188 | return list(synonyms) 189 | 190 | ######################################################################## 191 | # Random deletion 192 | # Randomly delete words from the sentence with probability p 193 | ######################################################################## 194 | 195 | def random_deletion(words, p,task = "sentence"): 196 | #obviously, if there's only one word, don't delete it 197 | if len(words) == 1: 198 | return words 199 | 200 | #randomly delete words with probability p 201 | new_words = [] 202 | for word in words: 203 | if task == "re" and re.search("^@\w+\$$",word): 204 | new_words.append(word) 205 | continue 206 | 207 | r = random.uniform(0, 1) 208 | if r > p: 209 | new_words.append(word) 210 | 211 | #if you end up deleting all words, just return a random word 212 | if len(new_words) == 0: 213 | if len(words)-1<0: 214 | return 215 | rand_int = random.randint(0, len(words)-1) 216 | return [words[rand_int]] 217 | 218 | return new_words 219 | 220 | ######################################################################## 221 | # Random swap 222 | # Randomly swap two words in the sentence n times 223 | ######################################################################## 224 | 225 | def random_swap(words, n,task = "sent"): 226 | new_words = words.copy() 227 | for _ in range(n): 228 | new_words = swap_word(new_words) 229 | return new_words 230 | 231 | def swap_word(new_words, task = "sent"): 232 | if len(new_words)-1<0: 233 | return 234 | random_idx_1 = random.randint(0, len(new_words)-1) 235 | 236 | if task == "re": 237 | while re.search("^@\w+\$$",new_words[random_idx_1]): 238 | random_idx_1 = random.randint(0, len(new_words)-1) 239 | 240 | random_idx_2 = random_idx_1 241 | counter = 0 242 | while random_idx_2 == random_idx_1 or re.search("^@\w+\$$",new_words[random_idx_2]): 243 | random_idx_2 = random.randint(0, len(new_words)-1) 244 | counter += 1 245 | if counter 
> 3: 246 | return new_words 247 | new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 248 | return new_words 249 | 250 | ######################################################################## 251 | # Random insertion 252 | # Randomly insert n words into the sentence 253 | ######################################################################## 254 | 255 | def random_insertion(words, n,task = "sent"): 256 | new_words = words.copy() 257 | for _ in range(n): 258 | add_word(new_words) 259 | return new_words 260 | 261 | def add_word(new_words): 262 | synonyms = [] 263 | counter = 0 264 | while len(synonyms) < 1: 265 | if len(new_words)-1<0: 266 | return 267 | random_word = new_words[random.randint(0, len(new_words)-1)] 268 | 269 | synonyms = get_synonyms(random_word) 270 | counter += 1 271 | if counter >= 10: 272 | return 273 | random_synonym = synonyms[0] 274 | random_idx = random.randint(0, len(new_words)-1) 275 | new_words.insert(random_idx, random_synonym) 276 | 277 | ######################################################################## 278 | # main data augmentation function 279 | ######################################################################## 280 | 281 | def eda(sentence, apikey, alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2,alpha_umls=0.5,num_aug=9,task = "sent"): 282 | #task = "sent" sentence classification 283 | #task= "re" relation extraction "index sentence label" 284 | 285 | #sentence = get_only_chars(sentence) 286 | words = sentence.split(' ') 287 | words = [word for word in words if word != ''] 288 | num_words = len(words) 289 | 290 | augmented_sentences = [] 291 | num_new_per_technique = int(num_aug/4)+1 292 | n_sr = max(1, int(alpha_sr*num_words)) 293 | n_ri = max(1, int(alpha_ri*num_words)) 294 | n_rs = max(1, int(alpha_rs*num_words)) 295 | n_umls = max(1,int(alpha_umls*num_words)) 296 | 297 | 298 | #umls 299 | if True: 300 | sentences = umls_replacement(words,n_umls,apikey,task) 301 | if len(sentences) > 0: 302 | augmented_sentences.extend(sentences) 303 | #except: 304 | # augmented_sentences=[] 305 | 306 | #sr 307 | for _ in range(num_new_per_technique): 308 | a_words = synonym_replacement(words, n_sr,task) 309 | if a_words is None or len(a_words)<1: 310 | continue 311 | augmented_sentences.append(' '.join(a_words)) 312 | #ri 313 | for _ in range(num_new_per_technique): 314 | a_words = random_insertion(words, n_ri,task) 315 | if a_words is None or len(a_words)<1: 316 | continue 317 | augmented_sentences.append(' '.join(a_words)) 318 | #rs 319 | for _ in range(num_new_per_technique): 320 | a_words = random_swap(words, n_rs,task) 321 | if a_words is None or len(a_words)<1: 322 | continue 323 | augmented_sentences.append(' '.join(a_words)) 324 | #rd 325 | for _ in range(num_new_per_technique): 326 | a_words = random_deletion(words, p_rd,task) 327 | if a_words is None or len(a_words)<1: 328 | continue 329 | augmented_sentences.append(' '.join(a_words)) 330 | 331 | 332 | #augmented_sentences = list(set([get_only_chars(sentence) for sentence in augmented_sentences])) 333 | shuffle(augmented_sentences) 334 | 335 | #trim so that we have the desired number of augmented sentences 336 | if num_aug >= 1: 337 | augmented_sentences = augmented_sentences[:num_aug] 338 | else: 339 | keep_prob = num_aug / len(augmented_sentences) 340 | augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob] 341 | 342 | #append the original sentence 343 | augmented_sentences.append(sentence) 344 | 345 | return 
augmented_sentences 346 | 347 | -------------------------------------------------------------------------------- /src/eda4ner.py: -------------------------------------------------------------------------------- 1 | # Easy data augmentation techniques for text classification 2 | # Jason Wei and Kai Zou 3 | 4 | import random 5 | from random import shuffle 6 | random.seed(1) 7 | 8 | #stop words list 9 | stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 10 | 'ours', 'ourselves', 'you', 'your', 'yours', 11 | 'yourself', 'yourselves', 'he', 'him', 'his', 12 | 'himself', 'she', 'her', 'hers', 'herself', 13 | 'it', 'its', 'itself', 'they', 'them', 'their', 14 | 'theirs', 'themselves', 'what', 'which', 'who', 15 | 'whom', 'this', 'that', 'these', 'those', 'am', 16 | 'is', 'are', 'was', 'were', 'be', 'been', 'being', 17 | 'have', 'has', 'had', 'having', 'do', 'does', 'did', 18 | 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 19 | 'because', 'as', 'until', 'while', 'of', 'at', 20 | 'by', 'for', 'with', 'about', 'against', 'between', 21 | 'into', 'through', 'during', 'before', 'after', 22 | 'above', 'below', 'to', 'from', 'up', 'down', 'in', 23 | 'out', 'on', 'off', 'over', 'under', 'again', 24 | 'further', 'then', 'once', 'here', 'there', 'when', 25 | 'where', 'why', 'how', 'all', 'any', 'both', 'each', 26 | 'few', 'more', 'most', 'other', 'some', 'such', 'no', 27 | 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 28 | 'very', 's', 't', 'can', 'will', 'just', 'don', 29 | 'should', 'now', ''] 30 | 31 | #cleaning up text 32 | import re 33 | def get_only_chars(line): 34 | 35 | clean_line = "" 36 | 37 | line = line.replace("’", "") 38 | line = line.replace("'", "") 39 | line = line.replace("-", " ") #replace hyphens with spaces 40 | line = line.replace("\t", " ") 41 | line = line.replace("\n", " ") 42 | line = line.lower() 43 | 44 | for char in line: 45 | if char in 'qwertyuiopasdfghjklzxcvbnm ': 46 | clean_line += char 47 | else: 48 | clean_line += ' ' 49 | 50 | clean_line = re.sub(' +',' ',clean_line) #delete extra spaces 51 | if clean_line[0] == ' ': 52 | clean_line = clean_line[1:] 53 | return clean_line 54 | 55 | #***************************** 56 | # UMLS Synonyms replacement # 57 | #***************************** 58 | 59 | 60 | #from QuickUMLS.quickumls import QuickUMLS 61 | import nltk 62 | matcher = None#QuickUMLS("/home/tk2624/tools/QuickUMLS",threshold=0.8,similarity_name ='cosine',overlapping_criteria='length') 63 | 64 | def ngram_index(words, ngram): 65 | return list(nltk.ngrams(words, len(ngram))).index(tuple(ngram)) 66 | 67 | def isSubstring(s1, s2): 68 | M = len(s1) 69 | N = len(s2) 70 | 71 | # A loop to slide pat[] one by one 72 | for i in range(N - M + 1): 73 | 74 | # For current index i, 75 | # check for pattern match 76 | for j in range(M): 77 | if (s2[i + j] != s1[j]): 78 | break 79 | 80 | if j + 1 == M : 81 | return i 82 | 83 | return False 84 | 85 | def get_umls_tagging(text,matcher): 86 | info = matcher.match(text, best_match=True, ignore_syntax=False) 87 | taggings=[] 88 | if len(info) == 0: 89 | return None 90 | for one_c in info: 91 | 92 | one_c = one_c[0] 93 | 94 | result = {"cui":one_c["cui"],"term":one_c["term"]} 95 | taggings.append(one_c) 96 | return taggings 97 | 98 | from src.Authentication import * 99 | import requests 100 | import json 101 | 102 | def get_atoms(apikey,cui): 103 | AuthClient = Authentication(apikey) 104 | tgt = AuthClient.gettgt() 105 | uri = "https://uts-ws.nlm.nih.gov" 106 | content_endpoint = 
"/rest/content/2019AB"+"/CUI/"+str(cui)+"/atoms" 107 | query = {'ticket':AuthClient.getst(tgt),'language':'ENG','pageSize':200} 108 | headers = {"Range": "bytes=0-1"} 109 | r = requests.get(uri+content_endpoint,params=query,headers=headers) 110 | r.encoding = 'utf-8' 111 | items = json.loads(r.text) 112 | jsonData = items["result"] 113 | atoms =[i["name"] for i in jsonData if not re.search("[,;\-\(\)\.\/]",i['name']) and not re.search("NOS",i['name']) ]# remove sysnonyms with punctuations, or NOS 114 | atoms = list(set(atoms)) 115 | return atoms 116 | #get_atoms(apikey,'C0006142') 117 | 118 | def umls_replacement(words,labels,n, apikey): 119 | sent = " ".join(words) 120 | #new_labels = labels.copy() 121 | 122 | augumented_sents = [] 123 | taggings = get_umls_tagging(sent, matcher) 124 | num_replaced = 0 125 | total_num = 1 126 | if taggings is None: 127 | return augumented_sents 128 | atoms_set = [] 129 | cui_set = [] 130 | for i in taggings: 131 | cui = i["cui"] 132 | if cui in cui_set: 133 | continue 134 | cui_set.append(cui) 135 | try: 136 | atoms_raw = get_atoms(apikey,cui) 137 | except: 138 | continue 139 | 140 | #remove synnonyms with only case differences 141 | atoms = [a for a in atoms_raw if not a.lower() == i['term'].lower() and not a.lower()+"s" ==i['term'].lower() and not a.lower() == i['term'].lower()+"s" ] 142 | if i['term'] in atoms: 143 | atoms.remove(i['term']) 144 | 145 | #print(len(atoms),i['term'],"-------- atoms:",atoms,"\n") 146 | if len(atoms)>0: 147 | atoms_set.append({i['term']:atoms}) 148 | total_num=total_num*len(atoms) 149 | 150 | max_num = min(n, total_num) 151 | 152 | for _ in range(max_num): 153 | new_sent = sent 154 | new_label = labels.copy() 155 | 156 | for i in range(len(atoms_set)): 157 | 158 | atom = atoms_set[i] 159 | term = list(atom.keys())[0] 160 | atoms = atom[term] 161 | synonym = random.choice(atoms) 162 | 163 | try: 164 | a = re.search(synonym.lower(),term.lower()) 165 | b = re.search(term.lower(),synonym.lower()) 166 | except: 167 | a = True 168 | b = True 169 | if re.search("[\^\[\]\(\)\{\}]",synonym) or set(synonym.lower().split(" ")) == set(term.lower().split(" ")) or a == True or b == True or set(synonym.lower().split(" ")).issubset(set(term.lower().split(" "))) or set(term.lower().split(" ")).issubset(set(synonym.lower().split(" "))): 170 | continue 171 | #print (":",term) 172 | try: 173 | term_index = ngram_index(new_sent.split(" "),term.split(" ") ) 174 | except: 175 | continue 176 | 177 | before_index = term_index 178 | after_index = term_index+len(term.split(" ")) 179 | syn_label = new_label[before_index:after_index] 180 | ''' 181 | print (len(new_label),len(new_sent.split(" "))) 182 | print (new_label) 183 | print (new_sent) 184 | print (term,"==",synonym,"===",len(new_sent.split(" ")),term_index,syn_label) 185 | ''' 186 | if before_index == 0: 187 | before=["TEMP"] 188 | else: 189 | before = new_label[:before_index].copy() 190 | #print(term,synonym,syn_label) 191 | if re.search("O\s+.*\-"," ".join(syn_label)) or re.search("\-.*\s+O"," ".join(syn_label)): 192 | continue # syn span over both O and entities, skip 193 | elif re.search("^O+$","".join(syn_label)): # non entity span 194 | before.extend(["O"]*len(synonym.split(" "))) 195 | before.extend(new_label[after_index:]) 196 | elif len(set(syn_label)) == 1 and re.search("^I\-",syn_label[0]): 197 | before.extend([syn_label[0]]*len(synonym.split(" "))) 198 | before.extend(new_label[after_index:]) 199 | else: # both B- and I- or only B- 200 | if len(syn_label)>1: 201 | 
before.extend([syn_label[0]]) 202 | before.extend([syn_label[1]]*(len(synonym.split(" "))-1)) 203 | before.extend(new_label[after_index:]) 204 | 205 | else: 206 | #print (new_sent) 207 | #print (before) 208 | #print(syn_label[0]) 209 | before.extend([syn_label[0]]) 210 | before.extend([re.sub("^B","I",syn_label[0])]*(len(synonym.split(" "))-1)) 211 | before.extend(new_label[after_index:]) 212 | if before_index == 0: 213 | del before[0] 214 | words= new_sent.split(" ") 215 | new_label = before.copy() 216 | if before_index >0: 217 | new_sent = " ".join(words[:before_index])+" "+synonym+" " + " ".join(words[after_index:]) 218 | else: 219 | new_sent = synonym+" " + " ".join(words[after_index:]) 220 | augumented_sents.append([(new_sent).split(" "),new_label]) 221 | return augumented_sents 222 | 223 | 224 | ########################################################################## 225 | 226 | 227 | 228 | ######################################################################## 229 | # Synonym replacement 230 | # Replace n words in the sentence with synonyms from wordnet 231 | ######################################################################## 232 | 233 | #for the first time you use wordnet 234 | #import nltk 235 | #nltk.download('wordnet') 236 | from nltk.corpus import wordnet 237 | 238 | def synonym_replacement(words,labels,n): 239 | new_words = words.copy() 240 | new_labels = labels.copy() 241 | random_word_list = list(set([word for word in words if word not in stop_words])) 242 | #print (random_word_list,"***") 243 | random.shuffle(random_word_list) 244 | num_replaced = 0 245 | 246 | for random_word in random_word_list: 247 | synonyms = get_synonyms(random_word) 248 | #print (synonyms,"~~~~~~") 249 | if len(synonyms) >= 1: 250 | synonym = random.choice(list(synonyms)) 251 | if len(synonym.split(" "))>1 or re.search(synonym,random_word) or re.search(random_word,synonym): 252 | continue 253 | new_words = [synonym if word.lower() == random_word.lower() else word for word in new_words] 254 | #print("replaced", "==="+random_word+"===", "with", "==="+synonym+"===") 255 | num_replaced += 1 256 | if num_replaced >= n: #only replace up to n words 257 | break 258 | 259 | #this is stupid but we need it, trust me 260 | if new_words == words: 261 | return None,None 262 | 263 | sentence = ' '.join(new_words) 264 | #sentence = "1---"+sentence 265 | new_words =( sentence).split(' ') 266 | 267 | return new_words,new_labels 268 | 269 | def get_synonyms(word): 270 | synonyms = set() 271 | for syn in wordnet.synsets(word): 272 | for l in syn.lemmas(): 273 | synonym = l.name().replace("_", " ").replace("-", " ").lower() 274 | synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm']) 275 | synonyms.add(synonym) 276 | if word in synonyms: 277 | synonyms.remove(word) 278 | #print("\n==",word,"===", synonyms) 279 | 280 | return list(synonyms) 281 | 282 | 283 | ######################################################################## 284 | # Random deletion 285 | # Randomly delete words from the sentence with probability p 286 | ######################################################################## 287 | 288 | def random_deletion(words,labels, p): 289 | 290 | #obviously, if there's only one word, don't delete it 291 | if len(words) == 1: 292 | return words,labels 293 | 294 | #randomly delete words with probability p 295 | new_words = [] 296 | new_labels=[] 297 | # dont delete the words which is the begining of an entity(B-tag) 298 | for i, word in enumerate(words): 299 | r = 
random.uniform(0, 1) 300 | if re.search("^B-",labels[i]): 301 | new_words.append(word) 302 | new_labels.append(labels[i]) 303 | continue 304 | if r > p : 305 | new_words.append(word) 306 | new_labels.append(labels[i]) 307 | #if you end up deleting all words, just return a random word 308 | if len(new_words) == 0: 309 | rand_int = random.randint(0, len(words)-1) 310 | return words[rand_int],labels[rand_int] 311 | sentence = ' '.join(new_words) 312 | #sentence = "2---"+sentence 313 | new_words =( sentence).split(' ') 314 | return new_words,new_labels 315 | 316 | ######################################################################## 317 | # Random swap 318 | # Randomly swap two words in the sentence n times 319 | ######################################################################## 320 | 321 | def random_swap(words,labels, n): 322 | new_words = words.copy() 323 | new_labels = labels.copy() 324 | for _ in range(n): 325 | new_words = swap_word(new_words) 326 | sentence = ' '.join(new_words) 327 | #sentence = "3---"+sentence 328 | new_words = (sentence).split(' ') 329 | return new_words,new_labels 330 | 331 | def swap_word(new_words): 332 | random_idx_1 = random.randint(0, len(new_words)-1) 333 | random_idx_2 = random_idx_1 334 | counter = 0 335 | while random_idx_2 == random_idx_1: 336 | random_idx_2 = random.randint(0, len(new_words)-1) 337 | counter += 1 338 | if counter > 3: 339 | return new_words 340 | new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 341 | 342 | return new_words 343 | 344 | ######################################################################## 345 | # Random insertion 346 | # Randomly insert n words into the sentence 347 | ######################################################################## 348 | 349 | def random_insertion(words,labels, n): 350 | new_words = words.copy() 351 | new_labels = labels.copy() 352 | for _ in range(n): 353 | add_word(new_words,new_labels) 354 | sentence = ' '.join(new_words) 355 | #sentence = "4---"+sentence 356 | new_words = (sentence).split(' ') 357 | return new_words, new_labels 358 | 359 | def add_word(new_words,new_labels): 360 | synonyms = [] 361 | counter = 0 362 | while len(synonyms) < 1: 363 | #generate a random index thats not in the beginning in entities 364 | 365 | random_word = new_words[random.randint(0, len(new_words)-1) ] 366 | synonyms = get_synonyms(random_word) 367 | counter += 1 368 | if counter >= 10: 369 | return 370 | random_synonym = synonyms[0] 371 | seed = 1 372 | random.seed(seed) 373 | while re.search("B\-",new_labels[random.randint(0, len(new_words)-1)]) : 374 | seed += 1 375 | random.seed(seed) 376 | if seed > len(new_words): 377 | break 378 | random_idx = random.randint(0, len(new_words)-1) 379 | new_words.insert(random_idx, random_synonym) 380 | if random_idx==len(new_labels)-1: # insert in the end 381 | new_labels.insert(random_idx, "O") 382 | elif new_labels[random_idx+1] == "O": 383 | new_labels.insert(random_idx, "O") 384 | else: 385 | new_labels.insert(random_idx,new_labels[random_idx-1]) 386 | 387 | 388 | ######################################################################## 389 | # main data augmentation function 390 | ######################################################################## 391 | 392 | #0:ori 393 | #1:sr-wordnet 394 | #2:rd 395 | #3:rw 396 | #4:ri 397 | #5:sr-umls 398 | 399 | def eda(sentence, label, apikey, alpha_umls = 0.5, alpha_sr=0.5, alpha_ri=0.5, alpha_rs=0.5, p_rd=0.5, num_aug=10): 400 | 401 | #sentence = 
get_only_chars(sentence) 402 | words = sentence.split(' ') 403 | 404 | words = [word for word in words if word != ''] 405 | num_words = len(words) 406 | 407 | augmented_sentences = [] 408 | num_new_per_technique = int(num_aug/4)+1 409 | n_umls = max(1, int(alpha_umls*num_words)) 410 | n_sr = max(1, int(alpha_sr*num_words)) 411 | n_ri = max(1, int(alpha_ri*num_words)) 412 | n_rs = max(1, int(alpha_rs*num_words)) 413 | 414 | #umls 415 | new_sents = umls_replacement(words,label,4,apikey) 416 | if len(new_sents) >0: 417 | augmented_sentences.extend(new_sents) 418 | 419 | #sr 420 | for _ in range(num_new_per_technique): 421 | 422 | a_words,new_label = synonym_replacement(words,label, n_sr) 423 | 424 | if a_words is None: 425 | continue 426 | augmented_sentences.append([a_words,new_label]) 427 | 428 | #ri 429 | for _ in range(num_new_per_technique): 430 | a_words,new_label = random_insertion(words,label, n_ri) 431 | 432 | if a_words == words: 433 | continue 434 | #print(" ".join(a_words)) 435 | #print(" ".join(new_label)) 436 | augmented_sentences.append([a_words,new_label]) 437 | #augmented_sentences.append(' '.join(a_words)) 438 | 439 | #rs 440 | for _ in range(num_new_per_technique): 441 | a_words,new_label = random_swap(words,label, n_rs) 442 | if a_words == words: 443 | continue 444 | augmented_sentences.append([a_words,new_label]) 445 | 446 | 447 | #rd 448 | for _ in range(num_new_per_technique): 449 | a_words,new_label = random_deletion(words,label, p_rd) 450 | if a_words == words: 451 | continue 452 | augmented_sentences.append([a_words,new_label]) 453 | 454 | shuffle(augmented_sentences) 455 | 456 | #trim so that we have the desired number of augmented sentences 457 | if num_aug >= 1 :#and len(augmented_sentences)>num_aug: 458 | augmented_sentences = augmented_sentences[:num_aug] 459 | else: 460 | keep_prob = num_aug / len(augmented_sentences) 461 | augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob] 462 | 463 | #append the original sentence 464 | augmented_sentences.append([words,label])#"0---"+sentence) 465 | 466 | return augmented_sentences 467 | 468 | -------------------------------------------------------------------------------- /src/retrieve-cui-or-code.py: -------------------------------------------------------------------------------- 1 | ################################################################################################# 2 | # usage of the script 3 | # usage: python retrieve-cui-or-code.py -k APIKEY -v VERSION -i IDENTIFIER -s SOURCE 4 | # If you do not provide the -s parameter, the script assumes you are retrieving information for a 5 | # known UMLS CUI 6 | ################################################################################################# 7 | 8 | from Authentication import * 9 | import requests 10 | import json 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser(description='process user given parameters') 14 | #parser.add_argument("-u", "--username", required = True, dest="username", help = "enter username") 15 | #parser.add_argument("-p", "--password", required = True, dest="password", help = "enter passowrd") 16 | parser.add_argument("-k", "--apikey", required = True, dest = "apikey", help = "enter api key from your UTS Profile") 17 | parser.add_argument("-v", "--version", required = False, dest="version", default = "current", help = "enter version example-2015AA") 18 | parser.add_argument("-i", "--identifier", required = True, dest="identifier", help = "enter identifier example-C0018787") 19 | 
parser.add_argument("-s", "--source", required = False, dest="source", help = "enter source name if known") 20 | 21 | args = parser.parse_args() 22 | 23 | #username = args.username 24 | #password = args.password 25 | apikey = args.apikey 26 | version = args.version 27 | identifier = args.identifier 28 | source = args.source 29 | AuthClient = Authentication(apikey) 30 | 31 | ################################### 32 | #get TGT for our session 33 | ################################### 34 | 35 | tgt = AuthClient.gettgt() 36 | uri = "https://uts-ws.nlm.nih.gov" 37 | 38 | try: 39 | source 40 | except NameError: 41 | source = None 42 | 43 | ##if we don't specify a source vocabulary, assume we're retrieving UMLS CUIs 44 | if source is None: 45 | content_endpoint = "/rest/content/"+str(version)+"/CUI/"+str(identifier) 46 | 47 | else: 48 | content_endpoint = "/rest/content/"+str(version)+"/source/"+str(source)+"/"+str(identifier) 49 | 50 | ##ticket is the only parameter needed for this call - paging does not come into play because we're only asking for one Json object 51 | query = {'ticket':AuthClient.getst(tgt)} 52 | print ("query 1:",query) 53 | r = requests.get(uri+content_endpoint,params=query) 54 | r.encoding = 'utf-8' 55 | items = json.loads(r.text) 56 | jsonData = items["result"] 57 | print("))))))",uri+content_endpoint) 58 | ##uncomment the print statment if you want the raw json output, or you can just look at the documentation :=) 59 | #https://documentation.uts.nlm.nih.gov/rest/concept/index.html#sample-output 60 | #https://documentation.uts.nlm.nih.gov/rest/source-asserted-identifiers/index.html#sample-output 61 | #print (json.dumps(items, indent = 4)) 62 | 63 | ############################ 64 | ### Print out fields #### 65 | 66 | classType = jsonData["classType"] 67 | name = jsonData["name"] 68 | ui = jsonData["ui"] 69 | AtomCount = jsonData["atomCount"] 70 | Definitions = jsonData["definitions"] 71 | Atoms = jsonData["atoms"] 72 | DefaultPreferredAtom = jsonData["defaultPreferredAtom"] 73 | 74 | ## print out the shared data elements that are common to both the 'Concept' and 'SourceAtomCluster' class 75 | print ("classType: " + classType) 76 | print ("ui: " + ui) 77 | print ("Name: " + name) 78 | print ("AtomCount: " + str(AtomCount)) 79 | print ("Atoms: " + Atoms) 80 | print ("Default Preferred Atom: " + DefaultPreferredAtom) 81 | #new = requests.get(uri+"/content/current/CUI/C0155502/atoms",params=query) 82 | query ={'ticket':AuthClient.getst(tgt)} 83 | #new = requests.get("https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C0155502/atoms?ttys=PT&sabs=SNOMEDCT_US%2CICD9CM",params =query) 84 | #new.encoding = 'utf-8' 85 | #print ("=======",new.text) 86 | new = requests.get(Atoms, params=query) 87 | print ("====\n") 88 | items = json.loads(new.text)['result'] 89 | print( len(items)) 90 | 91 | for i in items: 92 | print (i["language"],i["name"],"\n") 93 | 94 | ''' 95 | #print (new_items) 96 | ## These data elements may or may not exist depending on what class ('Concept' or 'SourceAtomCluster') you're dealing with so we check for each one. 
97 | try: 98 | jsonData["definitions"] 99 | print ("definitions: " + jsonData["definitions"]) 100 | except: 101 | pass 102 | 103 | try: 104 | jsonData["parents"] 105 | print ("parents: " + jsonData["parents"]) 106 | except: 107 | pass 108 | 109 | try: 110 | jsonData["children"] 111 | print ("children: " + jsonData["children"]) 112 | except: 113 | pass 114 | 115 | try: 116 | jsonData["relations"] 117 | print ("relations: " + jsonData["relations"]) 118 | except: 119 | pass 120 | 121 | try: 122 | jsonData["descendants"] 123 | print ("descendants: " + jsonData["descendants"]) 124 | except: 125 | pass 126 | 127 | try: 128 | jsonData["semanticTypes"] 129 | print("Semantic Types:") 130 | for stys in jsonData["semanticTypes"]: 131 | print("uri: "+ stys["uri"]) 132 | print("name: "+ stys["name"]) 133 | 134 | except: 135 | pass 136 | 137 | ''' 138 | --------------------------------------------------------------------------------
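A note on running the augmentation end to end: in the released `src/eda.py` and `src/eda4ner.py`, the QuickUMLS `matcher` is assigned `None` and the constructor call is commented out, so the UMLS synonym-replacement step cannot run as shipped. Below is a minimal sketch of re-enabling it; it assumes QuickUMLS is installed locally and that `Config.QuickUMLS_dir` in `config.py` points at your installed QuickUMLS data directory (both are properties of your local setup, not values shipped with this repository).

```python
# Sketch only: re-enable the QuickUMLS matcher used by umls_replacement() in src/eda.py.
# Assumes QuickUMLS is installed and Config.QuickUMLS_dir points at its data directory.
from QuickUMLS.quickumls import QuickUMLS   # mirrors the commented-out import in src/eda.py
from config import Config

config = Config()
matcher = QuickUMLS(config.QuickUMLS_dir,
                    threshold=config.threshold,
                    similarity_name='cosine')  # same arguments as the commented-out call
```

With `matcher` assigned at module level in place of `matcher = None`, the scripts can be run as described in the README, for example `python augment4class.py --input example/example4class.txt --num_aug 4 --alpha 0.3`.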