├── .gitignore ├── BERT_experiments ├── evaluate.py ├── train.py └── utils.py ├── LSTM_experiments ├── LSTM_no_attention.png ├── attention_lstm.png ├── attention_lstm_old.png ├── new_experiments │ ├── preprocess.py │ ├── train.py │ └── utils.py ├── old_experiments │ ├── classifier.py │ ├── embedding.py │ ├── readme.md │ └── utils.py └── training file.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /BERT_experiments/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is used for evaluating the model. 3 | For this task, the classification report of sklearn was used, which contains 4 | precision/recall/f1. 
5 | It expects the fine-tuned weights (bert_pytorch.bin) and the saved validation arrays produced by train.py. Feel free to add more metrics. 6 | """ 7 | 8 | # first, load the model and the validation data 9 | from __future__ import absolute_import, division, print_function 10 | import numpy as np 11 | from sklearn.metrics import classification_report 12 | from pytorch_pretrained_bert import BertForSequenceClassification 13 | from tqdm import tqdm_notebook 14 | import torch 15 | import os 16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | FILE_PATH = "./bert_pytorch.bin" 18 | VAL_X_PATH = "./val_X.npy" 19 | VAL_Y_PATH = "./val_y.npy" 20 | NUM_LABELS = 6 # number of relation tags; must match len(set(tags)) used in train.py 21 | if __name__ == "__main__": 22 | 23 | # check that training has been completed and the model file is in the right place 24 | if os.path.exists(FILE_PATH): 25 | model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=NUM_LABELS) 26 | model.load_state_dict(torch.load(FILE_PATH, map_location=device)) 27 | model.to(device) 28 | else: 29 | print(f"\nmodel file not found, run train.py first to get the model file, and put it in {FILE_PATH}") 30 | raise FileNotFoundError 31 | 32 | if os.path.exists(VAL_X_PATH): 33 | val_X = np.load(VAL_X_PATH) 34 | else: 35 | print(f"\nfile {VAL_X_PATH} not found, run train.py first to get the validation file, and put it in {VAL_X_PATH}") 36 | raise FileNotFoundError 37 | 38 | if os.path.exists(VAL_Y_PATH): 39 | val_y = np.load(VAL_Y_PATH) 40 | else: 41 | print(f"\nfile {VAL_Y_PATH} not found, run train.py first to get the validation file, and put it in {VAL_Y_PATH}") 42 | raise FileNotFoundError 43 | 44 | 45 | # freeze the model 46 | for param in model.parameters(): 47 | param.requires_grad=False 48 | model.eval() 49 | 50 | valid_preds = [] 51 | valid = torch.utils.data.TensorDataset(torch.tensor(val_X,dtype=torch.long)) 52 | valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False) 53 | 54 | tk0 = tqdm_notebook(valid_loader) 55 | for i,(x_batch,) in enumerate(tk0): 56 | pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None) 57 | valid_preds.append(np.argmax(pred.cpu().numpy(),axis=1)) 58 | 59 | valid_preds = np.concatenate(valid_preds,axis=0) 60 | # the saved val_y holds Keras tag indices starting at 1, while the predictions are 0-based class indices 61 | print(classification_report(val_y - 1, valid_preds)) -------------------------------------------------------------------------------- /BERT_experiments/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import time 4 | import gc 5 | import re 6 | import sys 7 | import os 8 | import warnings 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.utils.data 17 | import torch.nn.functional as F 18 | from keras.utils import to_categorical 19 | from keras.preprocessing.text import Tokenizer 20 | from sklearn.metrics import f1_score, classification_report 21 | from sklearn import model_selection 22 | 23 | from tqdm import tqdm, tqdm_notebook 24 | from IPython.core.interactiveshell import InteractiveShell 25 | from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam, BertModel 26 | from utils import get_train, clean_str, convert_lines 27 | from apex import amp 28 | 29 | InteractiveShell.ast_node_interactivity = "all" 30 | warnings.filterwarnings(action='once') 31 | device=torch.device('cuda') 32 | os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3" 33 | 34 | # hyperparameters 35 | FILE_PATH = "training file.txt" 36 | MAX_SEQUENCE_LENGTH = 75 37 | TRAIN_SIZE = 6500 38 | SEED = 666 39 | EPOCHS = 5 40 |
LR=2e-5 41 | BATCH_SIZE = 32 42 | ACCUMULATION_STEPS=2 # how many steps it should backward propagate before optimization 43 | OUTPUT_FILE_NAME = "bert_pytorch.bin" 44 | 45 | # convert the origin data into a formatted pandas dataframe 46 | train_df = get_train(FILE_PATH) 47 | train_df['text'] = train_df['text'].apply(clean_str) 48 | 49 | #convert tag to sequence, maybe there are more elegant way to do this 50 | tags = train_df['tag'].to_list() 51 | tokenizer_tag = Tokenizer() 52 | tokenizer_tag.fit_on_texts(tags) 53 | tags = tokenizer_tag.texts_to_sequences(tags) 54 | tags = np.array(list((map(lambda x: x[0],tags)))) 55 | 56 | 57 | # convert text to bert format sequence 58 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 59 | sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer) 60 | 61 | 62 | #shuffle the data 63 | np.random.seed(2019) 64 | index = np.random.permutation(len(sequences)) 65 | sequences = sequences[index] 66 | tags = tags[index] 67 | 68 | 69 | # split the data into train/test 70 | X = sequences[:TRAIN_SIZE] 71 | y = tags[:TRAIN_SIZE] 72 | val_X = sequences[TRAIN_SIZE:] 73 | val_y = tags[TRAIN_SIZE:] 74 | y = to_categorical(y-1) 75 | val_y = to_categorical(val_y-1) 76 | 77 | #due to the GPU memory limitation, just use 64 samples to validate 78 | #the complete validation process would be done after the training process is over, see the evaluate.py 79 | val_y = val_y[:64] 80 | val_X = val_X[:64] 81 | train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), torch.tensor(y,dtype=torch.float)) 82 | 83 | # Initialize the model 84 | np.random.seed(SEED) 85 | torch.manual_seed(SEED) 86 | torch.cuda.manual_seed(SEED) 87 | torch.backends.cudnn.deterministic = True 88 | output_model_file = OUTPUT_FILE_NAME 89 | 90 | model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=len(set(tags))) 91 | model.zero_grad() 92 | model = model.to(device) 93 | 94 | param_optimizer = list(model.named_parameters()) 95 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 96 | optimizer_grouped_parameters = [ 97 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 98 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 99 | ] 100 | train = train_dataset 101 | 102 | num_train_optimization_steps = int(EPOCHS*len(train)/BATCH_SIZE/ACCUMULATION_STEPS) 103 | 104 | optimizer = BertAdam(optimizer_grouped_parameters, 105 | lr=LR, 106 | warmup=0.05, 107 | t_total=num_train_optimization_steps) 108 | 109 | model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0) 110 | model=model.train() 111 | 112 | # train the model 113 | tq = tqdm_notebook(range(EPOCHS)) 114 | 115 | for epoch in tq: 116 | train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True) 117 | avg_loss = 0. 118 | avg_accuracy = 0. 
119 | lossf=None 120 | tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False) 121 | optimizer.zero_grad() 122 | 123 | for i,(x_batch, y_batch) in tk0: 124 | torch.cuda.empty_cache() 125 | y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None) 126 | loss = F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device)) 127 | with amp.scale_loss(loss, optimizer) as scaled_loss: 128 | scaled_loss.backward() 129 | if (i+1) % ACCUMULATION_STEPS == 0: # Wait for several backward steps 130 | optimizer.step() # Now we can do an optimizer step 131 | optimizer.zero_grad() 132 | if lossf: 133 | lossf = 0.98*lossf+0.02*loss.item() 134 | else: 135 | lossf = loss.item() 136 | tk0.set_postfix(loss = lossf) 137 | 138 | if i % 5 == 0: 139 | 140 | val_output = model(torch.tensor(val_X).to(device),attention_mask=(torch.tensor(val_X)>0).to(device), labels=None) 141 | val_pred = np.argmax(val_output.data.cpu(),axis=1) 142 | 143 | val_loss = F.binary_cross_entropy_with_logits(val_output,torch.tensor(val_y).to(device)) 144 | accuracy = torch.sum(torch.tensor(val_pred) == \ 145 | torch.tensor(np.argmax(val_y,axis=1))).type(torch.FloatTensor) / torch.tensor(val_y).size(0) 146 | 147 | print('Step: ', i, '| train loss: %.4f' % lossf, '| test accuracy: %.2f' % accuracy,'| val loss: %2f' % val_loss.item()) 148 | print(classification_report(np.argmax(val_y,axis=1),val_pred.numpy())) 149 | tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy) 150 | 151 | # save the model and validation data for evaluate 152 | torch.save(model.state_dict(), output_model_file) 153 | np.save("val_y.npy",tags[TRAIN_SIZE:]) 154 | np.save("val_X.npy",sequences[TRAIN_SIZE:]) -------------------------------------------------------------------------------- /BERT_experiments/utils.py: -------------------------------------------------------------------------------- 1 | def get_train(file_path, no_dup = True, no_other=False): 2 | """ 3 | format the i2b2 data into a pandas dataframe. 4 | notice that there are many duplicates texts in the dataset, so adjust the parameters according 5 | to your interests. 
6 | 7 | parameters: 8 | file_path: the file's path, a string format 9 | no_dup: if true, the duplicate text would be removed 10 | no_other: if true, the samples of tag "other" should be removed 11 | 12 | sample usage: train_df = get_train("./training file.txt") 13 | return : a pd dataframe with columns: text, tag, test_info, problem_info, treatment_info 14 | """ 15 | 16 | file = open(file_path) 17 | file = [line.strip('\n').strip('\ufeff') for line in file.readlines()] 18 | 19 | def format_input(df): 20 | targets = ['test','problem','treatment'] 21 | for target in targets: 22 | df.loc[df['t1'].str.contains('\|'+target),target+'_info'] = df['t1'] 23 | df.loc[(df['t2'].str.contains('\|'+target)) & \ 24 | (df[target+'_info'].isnull()),target+'_info'] = df['t2'] 25 | df.drop(['t1','t2'],axis=1,inplace=True) 26 | if no_dup: 27 | df.drop_duplicates(['text'],inplace=True) 28 | if no_other: 29 | df = df.loc[df.tag!='other'] #delete tag "other" 30 | df.index = np.arange(df.shape[0]) 31 | return df 32 | 33 | 34 | train_df = pd.DataFrame(np.array([file[i::5] for i in range(4)]).T,columns=['text','t1','t2','tag']) 35 | train_df = format_input(train_df) 36 | return train_df 37 | 38 | 39 | 40 | def clean_str(text,lower=True): 41 | """ 42 | clean and format the text 43 | 44 | parameters: 45 | text: a string format text 46 | lower: if true, the text would be convert to lower format 47 | 48 | return: processed text 49 | """ 50 | 51 | text = text.lower() 52 | 53 | replace_pair = [(r"[^A-Za-z0-9^,!.\/'+-=]"," "),(r"what's","what is "),(r"that's","that is "),(r"there's","there is "), 54 | (r"it's","it is "),(r"\'s", " "),(r"\'ve", " have "),(r"can't", "can not "),(r"n't", " not "),(r"i'm", "i am "), 55 | (r"\'re", " are "),(r"\'d", " would "),(r"\'ll", " will "),(r",", " "),(r"\.", " "),(r"!", " ! 
"),(r"\/", " "), 56 | (r"\^", " ^ "),(r"\+", " + "),(r"\-", " - "),(r"\=", " = "),(r"'", " "),(r"(\d+)(k)", r"\g<1>000"),(r":", " : "), 57 | (r" e g ", " eg "),(r" b g ", " bg "),(r" u s ", " american "),(r"\0s", "0"),(r" 9 11 ", "911"),(r"e - mail", "email"), 58 | (r"j k", "jk"),(r"\s{2,}", " ")] 59 | 60 | for before, after in replace_pair: 61 | text = re.sub(before,after,text) 62 | 63 | return text.strip() 64 | 65 | 66 | 67 | # Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming 68 | def convert_lines(example, max_seq_length,tokenizer): 69 | """convert the given texts to BERT format sequences 70 | 71 | parameters: 72 | example: a list of text string of a pandas series 73 | max_seq_length: pad the text to max_seq_length 74 | tokenizer: bert tokenizer 75 | 76 | sample usage: 77 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 78 | sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"), 100 ,tokenizer) 79 | 80 | return: formatted sequence 81 | """ 82 | max_seq_length -=2 83 | all_tokens = [] 84 | longer = 0 85 | for text in tqdm(example): 86 | tokens_a = tokenizer.tokenize(text) 87 | if len(tokens_a)>max_seq_length: 88 | tokens_a = tokens_a[:max_seq_length] 89 | longer += 1 90 | one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a)) 91 | all_tokens.append(one_token) 92 | print(longer) 93 | return np.array(all_tokens) -------------------------------------------------------------------------------- /LSTM_experiments/LSTM_no_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ledzy/I2B2-Entity-Relation-Extraction/abea447826a570ba6d3d9c316ca72357a3824bea/LSTM_experiments/LSTM_no_attention.png -------------------------------------------------------------------------------- /LSTM_experiments/attention_lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ledzy/I2B2-Entity-Relation-Extraction/abea447826a570ba6d3d9c316ca72357a3824bea/LSTM_experiments/attention_lstm.png -------------------------------------------------------------------------------- /LSTM_experiments/attention_lstm_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ledzy/I2B2-Entity-Relation-Extraction/abea447826a570ba6d3d9c316ca72357a3824bea/LSTM_experiments/attention_lstm_old.png -------------------------------------------------------------------------------- /LSTM_experiments/new_experiments/preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file get preprocessed data 3 | """ 4 | 5 | from keras.preprocessing.text import Tokenizer 6 | from keras.preprocessing.sequence import pad_sequences 7 | from keras.utils import to_categorical 8 | from sklearn.model_selection import train_test_split 9 | from tqdm import tqdm 10 | from utils import load_glove, clean_str, get_train 11 | import pandas as pd 12 | import numpy as np 13 | import re 14 | 15 | 16 | # get input 17 | train_df = get_train() 18 | texts = train_df['text'].to_list() 19 | tags = train_df['tag'].to_list() 20 | 21 | # clean the text 22 | train_df['text'] = train_df['text'].apply(clean_str) 23 | 24 | # text2sequence 25 | emb_size = 300 26 | max_features = 6000 27 | maxlen = 50 28 | 29 | tokenizer = Tokenizer(num_words=max_features) 30 | tokenizer.fit_on_texts(texts) 31 | word_index = 
tokenizer.word_index 32 | sequences = tokenizer.texts_to_sequences(texts) 33 | sequences = pad_sequences(sequences,maxlen=maxlen) 34 | 35 | tokenizer_tag = Tokenizer() 36 | tokenizer_tag.fit_on_texts(tags) 37 | tags = tokenizer_tag.texts_to_sequences(tags) 38 | tags = np.array(list((map(lambda x: x[0],tags)))) 39 | tags = to_categorical(tags) 40 | 41 | 42 | # load embedding 43 | emb_matrix = load_glove(word_index) 44 | 45 | # Get test/problem/treatment matrix: info_matrix 46 | # info_matrix: (m,3,maxlen), which uses one-hot to indicate the entity property of the token 47 | targets = ['test_info','problem_info','treatment_info'] 48 | info_matrix = np.zeros((sequences.shape[0],3,maxlen)) 49 | 50 | for i,target in enumerate(targets): 51 | for k,j in train_df[target].str.extract('(\d+)\|(\d+)').iterrows(): 52 | if not pd.isnull(j[0]): 53 | info_matrix[k,i,int(j[0])-1:int(j[1])] = 1 54 | 55 | 56 | # Shuffle the data 57 | np.random.seed(2019) 58 | index = np.random.permutation(len(sequences)) 59 | 60 | sequences = sequences[index] 61 | tags = tags[index] 62 | info_matrix = info_matrix[index].swapaxes(1,2) -------------------------------------------------------------------------------- /LSTM_experiments/new_experiments/train.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Dense, LSTM, CuDNNLSTM, Dropout, Embedding, Softmax, CuDNNGRU 2 | from keras.layers import Bidirectional, concatenate, RepeatVector, Dot, Activation, merge, Reshape, Add 3 | from keras.models import Model, Sequential 4 | from keras.optimizers import Adam 5 | from keras import backend as K 6 | from keras.callbacks import TensorBoard 7 | from keras.utils import to_categorical 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.metrics import classification_report 10 | from utils import f1, get_train, clean_str, load_glove 11 | import tensorflow as tf 12 | import os #for setting GPU 13 | import time #for formatting log name 14 | 15 | from preprocess import * #run preprocess file 16 | 17 | os.environ['CUDA_VISIBLE_DEVICES'] = "0" 18 | 19 | # hyperparameters 20 | LR = 1e-4 21 | EPOCHS = 50 22 | 23 | # define layer obejcts for achieving attention 24 | repeat_vec = RepeatVector(maxlen) 25 | densor = Dense(1,activation='relu') # repeat_vec & densor are used for one_step_attention 26 | activator = Activation('softmax',name='attention_weights') 27 | dotor = Dot(axes=1,name='context') 28 | densor2 = Dense(1,activation='relu') 29 | 30 | def one_step_attention(s_prev,a): 31 | """ 32 | Note: This attention method is not applicable for the task, since the dataset is small. Instead, 33 | using one_step_attention_v2. 
34 | 35 | calculate the weight of each word for the given input, using second LSTM's previous 36 | hidden state and the output of the first LSTM (see attention_lstm_old.png) 37 | 38 | parameters: 39 | s_prev: the hidden state of the second LSTM 40 | a: the output of the first LSTM 41 | """ 42 | s_prev = repeat_vec(s_prev) 43 | concat = concatenate([s_prev,a],axis=-1) 44 | concat = Activation(activation='tanh')(concat) 45 | e = densor(concat) 46 | alphas = activator(e) 47 | context = dotor([alphas,a]) 48 | 49 | return context 50 | 51 | 52 | def one_step_attention_v2(a): 53 | """ 54 | use only the previous state to form attention weights, see attention_lstm.png 55 | 56 | parameters: 57 | a: the output of the first LSTM 58 | """ 59 | e = densor2(a) 60 | alphas = activator(e) 61 | context = dotor([alphas,a]) 62 | 63 | return context 64 | 65 | 66 | def model(): 67 | 68 | sequences = Input(shape=(maxlen,),name='sequences') 69 | info_matrix = Input(shape=(maxlen,3),name='info_matrix') 70 | 71 | embedding = Embedding(max_features,emb_size,weights=[emb_matrix],trainable=False,name='embedding')(sequences) 72 | X = concatenate([embedding,info_matrix],axis=2,name='concat') 73 | a = Bidirectional(CuDNNLSTM(64,return_sequences=True))(X) 74 | 75 | context = one_step_attention_v2(a) 76 | context = Activation(activation='tanh')(context) 77 | 78 | output = Dense(tags.shape[1],activation='softmax')(context) 79 | output = Reshape((tags.shape[1],))(output) 80 | 81 | model = Model(inputs=[sequences,info_matrix],outputs=output) 82 | return model 83 | 84 | 85 | # run the model 86 | def run_model(record=True,validation_split=0.15,epochs=50,lr=1e-3): 87 | deep_model = model() 88 | 89 | opt = Adam(lr=1e-3) 90 | deep_model.compile(opt,loss='categorical_crossentropy',metrics=[f1,'accuracy']) 91 | 92 | if record == True: 93 | deep_model.fit([sequences,info_matrix],tags,epochs=epochs,validation_split=validation_split,callbacks=[tensorboard]) 94 | else: deep_model.fit([sequences,info_matrix],tags,epochs=epochs,validation_split=validation_split) 95 | 96 | return deep_model 97 | 98 | # adjust the NAME according to your needs 99 | NAME = "Attention_v2-Simplify-Para-BiLSTM-Freeze-Embedding-Add-Pos-Preprocessing{}".format(int(time.time())) 100 | tensorboard = TensorBoard(log_dir='logs/{}'.format(NAME)) 101 | 102 | deep_model = run_model(lr=LR,epochs=EPOCHS) -------------------------------------------------------------------------------- /LSTM_experiments/new_experiments/utils.py: -------------------------------------------------------------------------------- 1 | def get_train(file_path, no_dup = True, no_other=False): 2 | """ 3 | format the i2b2 data into a pandas dataframe. 4 | notice that there are many duplicates texts in the dataset, so adjust the parameters according 5 | to your interests. 
6 | 7 | parameters: 8 | file_path: the file's path, a string format 9 | no_dup: if true, the duplicate text would be removed 10 | no_other: if true, the samples of tag "other" should be removed 11 | 12 | sample usage: train_df = get_train("./training file.txt") 13 | return : a pd dataframe with columns: text, tag, test_info, problem_info, treatment_info 14 | """ 15 | 16 | file = open(file_path) 17 | file = [line.strip('\n').strip('\ufeff') for line in file.readlines()] 18 | 19 | def format_input(df): 20 | targets = ['test','problem','treatment'] 21 | for target in targets: 22 | df.loc[df['t1'].str.contains('\|'+target),target+'_info'] = df['t1'] 23 | df.loc[(df['t2'].str.contains('\|'+target)) & \ 24 | (df[target+'_info'].isnull()),target+'_info'] = df['t2'] 25 | df.drop(['t1','t2'],axis=1,inplace=True) 26 | if no_dup: 27 | df.drop_duplicates(['text'],inplace=True) 28 | if no_other: 29 | df = df.loc[df.tag!='other'] #delete tag "other" 30 | df.index = np.arange(df.shape[0]) 31 | return df 32 | 33 | 34 | train_df = pd.DataFrame(np.array([file[i::5] for i in range(4)]).T,columns=['text','t1','t2','tag']) 35 | train_df = format_input(train_df) 36 | return train_df 37 | 38 | 39 | 40 | def clean_str(text,lower=True): 41 | """ 42 | clean and format the text 43 | 44 | parameters: 45 | text: a string format text 46 | lower: if true, the text would be convert to lower format 47 | 48 | return: processed text 49 | """ 50 | 51 | text = text.lower() 52 | 53 | replace_pair = [(r"[^A-Za-z0-9^,!.\/'+-=]"," "),(r"what's","what is "),(r"that's","that is "),(r"there's","there is "), 54 | (r"it's","it is "),(r"\'s", " "),(r"\'ve", " have "),(r"can't", "can not "),(r"n't", " not "),(r"i'm", "i am "), 55 | (r"\'re", " are "),(r"\'d", " would "),(r"\'ll", " will "),(r",", " "),(r"\.", " "),(r"!", " ! 
"),(r"\/", " "), 56 | (r"\^", " ^ "),(r"\+", " + "),(r"\-", " - "),(r"\=", " = "),(r"'", " "),(r"(\d+)(k)", r"\g<1>000"),(r":", " : "), 57 | (r" e g ", " eg "),(r" b g ", " bg "),(r" u s ", " american "),(r"\0s", "0"),(r" 9 11 ", "911"),(r"e - mail", "email"), 58 | (r"j k", "jk"),(r"\s{2,}", " ")] 59 | 60 | for before, after in replace_pair: 61 | text = re.sub(before,after,text) 62 | 63 | return text.strip() 64 | 65 | def load_glove(word_index): 66 | def get_coefs(word,*emb): return word, np.asarray(emb,dtype='float32') 67 | embedding = dict(get_coefs(*o.split(' ')) for o in tqdm(open('glove.840B.300d.txt'))) 68 | 69 | emb_mean, emb_std = -0.005838459, 0.48782179 70 | embed_matrix = np.random.normal(emb_mean,emb_std,(max_features,emb_size)) 71 | 72 | for word, i in word_index.items(): 73 | if i >= max_features: continue 74 | if embedding.get(word) is not None: 75 | embed_matrix[i] = embedding.get(word) 76 | 77 | return embed_matrix 78 | 79 | def f1(y_true, y_pred): 80 | def recall(y_true, y_pred): 81 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 82 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 83 | recall = true_positives / (possible_positives + K.epsilon()) 84 | return recall 85 | 86 | def precision(y_true, y_pred): 87 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 88 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 89 | precision = true_positives / (predicted_positives + K.epsilon()) 90 | return precision 91 | 92 | precision = precision(y_true, y_pred) 93 | recall = recall(y_true, y_pred) 94 | return 2*((precision*recall)/(precision+recall+K.epsilon())) -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | These code were written when i just started using keras, so 3 | i'm not familiar with keras preprocessing tools and thereby did 4 | a lot of repetitive work. 
I keep it here to be a record for myself, 5 | but i suggest you to read new_experiments which is more readable:) 6 | """ 7 | 8 | # coding=utf-8 9 | 10 | from keras.layers import Dense, Input, CuDNNLSTM ,Dropout, Activation, Bidirectional 11 | from keras.models import Model 12 | from keras.layers.embeddings import Embedding 13 | from keras.layers import concatenate 14 | from keras.preprocessing import sequence 15 | from keras.initializers import glorot_uniform 16 | from keras.utils import to_categorical 17 | from utils import * 18 | from embedding import pretrained_embedding_layer 19 | from gensim.models.keyedvectors import KeyedVectors 20 | from imblearn.over_sampling import RandomOverSampler 21 | from imblearn.under_sampling import RandomUnderSampler 22 | import numpy as np 23 | import random 24 | import os 25 | 26 | 27 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" #specify the GPU 28 | 29 | 30 | 31 | #Define the model 32 | def classifier(input_shape, input_shape2, word_to_vec_map, word_to_index): 33 | 34 | sentence_indices = Input(input_shape, dtype = 'int32') 35 | 36 | prob_test_oht = Input(input_shape2, dtype = 'float32') 37 | 38 | embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index) 39 | embeddings = embedding_layer(sentence_indices) 40 | 41 | #concatenate the embedding with prob_test_oht 42 | embeddings = concatenate([embeddings,prob_test_oht], axis=-1) 43 | 44 | #propogate the data through layers 45 | X = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embeddings) 46 | X = Dropout(0.4)(X) 47 | X = Bidirectional(CuDNNLSTM(128, return_sequences = False))(X) 48 | X = Dropout(0.4)(X) 49 | X = Dense(6)(X) 50 | X = Activation('softmax')(X) 51 | 52 | model = Model(inputs=[sentence_indices,prob_test_oht], outputs = X) 53 | 54 | return model 55 | 56 | 57 | #load the file and embedding 58 | print('Reading the file...') 59 | with open('training file.txt', 'r') as f: 60 | data = f.readlines() 61 | print('File closed.\n') 62 | 63 | print('Loading the word-embedding...') 64 | word_to_vec_map = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True) 65 | print('Word-embedding loaded.\n') 66 | 67 | 68 | word_to_index, index_to_word = load_word_map(data) 69 | tag_to_index_map, index_to_tag_map = load_tag(data) 70 | 71 | Y_tags = data[3::5] 72 | Y_indices = [tag_to_index_map[tag] for tag in Y_tags] 73 | 74 | # Y_indices = [index for index in tag_to_index_map[Y_tags]] 75 | 76 | X = np.array(data[0::5]) 77 | # X = np.random.shuffle(X) 78 | 79 | max_len = get_max_length(X) 80 | 81 | 82 | prob_test_matrix = prob_test_matrix(data,max_len) 83 | 84 | 85 | #format the input of the model 86 | X_train_indices = sentences_to_indices(X,word_to_index,max_len) 87 | Y_train = to_categorical(Y_indices) 88 | 89 | 90 | #balance the training set 91 | ros = RandomOverSampler(random_state=0) #repeat all tags to the same #of the largest tags 92 | # ros = RandomUnderSampler(replacement=True, random_state=0) #Reduce the size of largest tags 93 | 94 | #shuflle 95 | index = [i for i in range(len(X_train_indices))] 96 | random.shuffle(index) 97 | prob_test_matrix = np.array([prob_test_matrix[i] for i in index]) 98 | X_train_indices = np.array([X_train_indices[i] for i in index]) 99 | Y_train = np.array([Y_train[i] for i in index]) 100 | 101 | #split into train and test 102 | X_train = X_train_indices[:int(0.8*len(X_train_indices))] 103 | X_test = X_train_indices[int(0.8*len(X_train_indices)):] 104 | Y_test = Y_train[int(0.8*len(X_train_indices)):] 105 | Y_train = 
Y_train[:int(0.8*len(X_train_indices))] 106 | prob_test_matrix_train = prob_test_matrix[:int(0.8*len(X_train_indices))] 107 | prob_test_matrix_test = prob_test_matrix[int(0.8*len(X_train_indices)):] 108 | 109 | X_resampled_train, Y_resampled_train = ros.fit_sample(X_train, Y_train) 110 | X_resampled_test, Y_resampled_test = ros.fit_sample(X_test, Y_test) 111 | prob_test_matrix_train_resampled,_ = ros.fit_sample(prob_test_matrix_train, Y_train) 112 | prob_test_matrix_test_resampled,_ = ros.fit_sample(prob_test_matrix_test, Y_test) 113 | 114 | 115 | #expand the dimension 116 | pt_matrix_train = [] 117 | pt_matrix_test = [] 118 | prob_test_dim = 32 119 | for i in range(prob_test_dim): 120 | pt_matrix_train.append(prob_test_matrix_train_resampled) 121 | pt_matrix_test.append(prob_test_matrix_test_resampled) 122 | 123 | #format the order of dimension 124 | pt_matrix_train = np.transpose(pt_matrix_train,(1,2,0)) 125 | pt_matrix_test = np.transpose(pt_matrix_test,(1,2,0)) 126 | 127 | 128 | 129 | #split validation of the model, can be omitted 130 | data_split = 5 131 | train_log = [] 132 | for i in range(data_split): 133 | start = int(i/5*len(X_resampled)) 134 | end = int((i+1)/5*len(X_resampled)) 135 | 136 | X_resampled_test = X_resampled[start:end] 137 | Y_resampled_test = Y_resampled[start:end] 138 | pt_matrix_test = pt_matrix[start:end] 139 | 140 | X_resampled_train = np.append(X_resampled[0:start],X_resampled[end:],axis=0) 141 | Y_resampled_train = np.append(Y_resampled[0:start],Y_resampled[end:],axis=0) 142 | pt_matrix_train = np.append(pt_matrix[0:start],pt_matrix[end:],axis=0) 143 | 144 | print('Constructing the model, split ',i) 145 | model = classifier((max_len,),(max_len,32),word_to_vec_map,word_to_index) 146 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 147 | History = model.fit([X_resampled_train,pt_matrix_train], Y_resampled_train, epochs=1, batch_size=50,validation_data=([X_resampled_test,pt_matrix_test],Y_resampled_test), shuffle=True) 148 | 149 | history = History.history 150 | acc,loss,val_acc,val_loss = history['acc'][-1], history['loss'][-1], history['val_acc'][-1], history['val_loss'][-1] 151 | train_log.append([acc,loss,val_acc,val_loss]) 152 | 153 | 154 | #print the performance of the model 155 | model = classifier((max_len,),(max_len,32),word_to_vec_map,word_to_index) 156 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 157 | model.fit([X_resampled_train,pt_matrix_train],Y_resampled_train,epochs=100,validation_data=([X_resampled_test,pt_matrix_test],Y_resampled_test),batch_size=100,shuffle=True) 158 | 159 | 160 | #save the model 161 | model.save('add_bidirect.h5') -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.layers.embeddings import Embedding 3 | 4 | 5 | #define a embedding layer which is initialized with pre-trained word-embedding 6 | def pretrained_embedding_layer(word_to_vec_map, word_to_index): 7 | 8 | vocab_length = len(word_to_index) + 1 #adding 1 to fit the Embedding layer (keras requirement) 9 | emb_dim = word_to_vec_map['at'].shape[0] 10 | 11 | emb_matrix = np.zeros((vocab_length, emb_dim)) 12 | 13 | success = 0 14 | fail = [] 15 | for word, index in word_to_index.items(): 16 | try: 17 | emb_matrix[index,:] = word_to_vec_map[word] 18 | success += 1 19 | 20 | except Exception as e: 21 
| fail.append(word) 22 | emb_matrix[index,:] = -np.random.randn(emb_dim)/20 # out-of-vocabulary words get a small random vector 23 | 24 | 25 | embedding_layer = Embedding(vocab_length,emb_dim,trainable=False) 26 | 27 | embedding_layer.build((None,)) 28 | 29 | embedding_layer.set_weights([emb_matrix]) 30 | 31 | return embedding_layer 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/readme.md: -------------------------------------------------------------------------------- 1 | This code was written when I had just started using Keras. 2 | I was not familiar with the Keras preprocessing tools and thereby did 3 | a lot of repetitive work.
I keep it here as a record for myself, 4 | but I suggest reading new_experiments instead, which is more readable :) -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.utils import to_categorical 3 | import re 4 | 5 | #build the word_to_index and index_to_word mappings 6 | def load_word_map(data): 7 | #temp solution: re-read the training file instead of using the passed-in data 8 | with open('training file.txt', 'r') as f: 9 | data = f.read() 10 | 11 | word_to_index = {} 12 | index_to_word = {} 13 | index = 0 14 | for word in set(data.split()): 15 | word_to_index[word] = index 16 | index += 1 17 | 18 | for word, index in word_to_index.items(): 19 | index_to_word[index] = word 20 | 21 | return word_to_index, index_to_word 22 | 23 | 24 | #get the length (in words) of the longest sentence in the input 25 | def get_max_length(input_sequence): 26 | max_len = 0 27 | 28 | for sentence in input_sequence: 29 | length = len(sentence.split()) 30 | if length > max_len: 31 | max_len = length 32 | 33 | return max_len 34 | 35 | 36 | #convert the sentences into index sequences to feed the embedding layer 37 | def sentences_to_indices(X, word_to_index, max_len): 38 | 39 | m = X.shape[0] 40 | X_indices = np.zeros((m, max_len)) 41 | 42 | for i in range(m): 43 | sentence_words = [w for w in X[i].split()] 44 | 45 | j = 0 46 | for word in sentence_words: 47 | try: 48 | X_indices[i,j] = word_to_index[word] 49 | j += 1 50 | except Exception as e: 51 | print(e) 52 | print(word, end='') 53 | 54 | return X_indices 55 | 56 | 57 | #get the 6 tags from the data 58 | def load_tag(data): 59 | tag = set(data[3::5]) 60 | tag_to_index = {} 61 | index_to_tag = {} 62 | index = 0 63 | for i in tag: 64 | tag_to_index[i] = index 65 | index += 1 66 | 67 | for index, tag in tag_to_index.items(): 68 | index_to_tag[index] = tag 69 | 70 | return tag_to_index,index_to_tag 71 | 72 | 73 | '''get the problem/test information from the data 74 | and build a matrix such that the cells of words 75 | tagged as problem are set to 1 and the cells of 76 | test words are set to -1 77 | ''' 78 | def prob_test_matrix(data,max_len): 79 | test_info = data[1::5] 80 | prob_info = data[2::5] 81 | test_pos = [] 82 | prob_pos = [] 83 | 84 | for key in test_info: 85 | pos = [int(i) for i in re.findall('\d+',key)] 86 | test_pos.append(pos) 87 | 88 | for key in prob_info: 89 | pos = [int(i) for i in re.findall('\d+',key)] 90 | prob_pos.append(pos) 91 | 92 | # test_pos_oht = to_categorical(test_pos) 93 | # prob_pos_oht = -to_categorical(prob_pos) 94 | 95 | prob_test_matrix = np.zeros([len(test_info),max_len]) 96 | 97 | for i in range(len(test_info)): 98 | 99 | prob_test_matrix[i,prob_pos[i][0]:prob_pos[i][1]+1] = 1 100 | prob_test_matrix[i,test_pos[i][0]:test_pos[i][1]+1] = -1 101 | 102 | return prob_test_matrix
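For reference, here is a small worked example (not part of the original code) of the ±1 position encoding that `prob_test_matrix` builds for each sentence; the entity spans below are made-up word indices rather than values read from the real training file:

```python
import numpy as np

# Toy illustration of the encoding produced by utils.prob_test_matrix for one sentence.
# The spans are hypothetical inclusive word indices, not taken from the real data.
max_len = 12
test_span = (2, 3)     # words 2..3 belong to a "test" entity
problem_span = (7, 9)  # words 7..9 belong to a "problem" entity

row = np.zeros(max_len)
row[problem_span[0]:problem_span[1] + 1] = 1   # problem words -> +1
row[test_span[0]:test_span[1] + 1] = -1        # test words    -> -1
print(row)  # [ 0.  0. -1. -1.  0.  0.  0.  1.  1.  1.  0.  0.]
```

The newer pipeline in new_experiments/preprocess.py replaces this single ±1 channel with a (3, maxlen) one-hot info_matrix, so that test, problem and treatment entities each get their own channel.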
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # I2B2-Entity-Relation-Extraction 2 | This repository contains several approaches to entity relation extraction on the i2b2 dataset, which I have uploaded in the repository (training file.txt). 3 | 4 | Two main approaches are implemented and compared: 5 | * LSTM + attention mechanism 6 | * BERT, a state-of-the-art pretrained language model 7 | 8 | ## LSTM 9 | Several LSTM architectures were tried. Since the dataset is relatively small and contains many repeated texts, the model should not be too complex. 10 | Currently the best LSTM model for the task is the following: 11 | ![Image text](https://github.com/Ledzy/I2B2-Entity-Relation-Extraction/blob/master/LSTM_experiments/attention_lstm.png) 12 | 13 | which is based on: 14 | [Zhou, Peng, et al. "Attention-based bidirectional long short-term memory networks for relation classification." Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). Vol. 2. 2016.](https://www.aclweb.org/anthology/P16-2034) 15 | 16 | Note: the embedding file GoogleNews-vectors-negative300.bin can be downloaded from https://code.google.com/archive/p/word2vec/. 17 | 18 | ## BERT 19 | The BERT experiments fine-tune the pretrained bert-base-uncased model with BertAdam (warmup and weight decay), mixed-precision training via apex, and gradient accumulation; see BERT_experiments/train.py. After training, BERT_experiments/evaluate.py reloads the saved weights (bert_pytorch.bin) together with the held-out validation arrays and reports precision/recall/F1 via sklearn's classification_report. 20 | --------------------------------------------------------------------------------
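As a rough usage illustration for the BERT pipeline (not part of the original repository), the sketch below loads the weights saved by BERT_experiments/train.py and classifies a single new sentence. The example sentence is invented, and NUM_LABELS, MAX_SEQUENCE_LENGTH and the weight path are assumptions that must match your own training run:

```python
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

NUM_LABELS = 6            # assumption: must equal len(set(tags)) from train.py
MAX_SEQUENCE_LENGTH = 75  # must match the value used in train.py

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
model.load_state_dict(torch.load("bert_pytorch.bin", map_location=device))
model.to(device)
model.eval()

text = "the chest x-ray revealed a small pleural effusion"  # invented example sentence

# Same conversion as utils.convert_lines: [CLS] + tokens + [SEP], then zero-pad.
tokens = tokenizer.tokenize(text)[:MAX_SEQUENCE_LENGTH - 2]
ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
ids = ids + [0] * (MAX_SEQUENCE_LENGTH - len(ids))

x = torch.tensor([ids], dtype=torch.long).to(device)
with torch.no_grad():
    logits = model(x, attention_mask=(x > 0).to(device), labels=None)

pred = int(np.argmax(logits.cpu().numpy(), axis=1)[0])
# train.py encodes the Keras tag indices with to_categorical(y - 1), so class index
# `pred` corresponds to tag index `pred + 1` in tokenizer_tag.index_word.
print("predicted class index:", pred)
```

Cleaning the text with utils.clean_str before tokenization, as train.py does, should keep inference consistent with how the model was trained.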