├── .gitignore ├── BERT_experiments ├── evaluate.py ├── train.py └── utils.py ├── LSTM_experiments ├── LSTM_no_attention.png ├── attention_lstm.png ├── attention_lstm_old.png ├── new_experiments │ ├── preprocess.py │ ├── train.py │ └── utils.py ├── old_experiments │ ├── classifier.py │ ├── embedding.py │ ├── readme.md │ └── utils.py └── training file.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /BERT_experiments/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is used for evaluating the model. 3 | For this task, the classification report of sklearn was used, which contains 4 | precision/recall/f1. 
5 | It expects the fine-tuned weights (bert_pytorch.bin) and the saved validation arrays produced by train.py. Feel free to add more metrics. 6 | """ 7 | 8 | # first, load the model and the validation data 9 | from __future__ import absolute_import, division, print_function 10 | import numpy as np 11 | from sklearn.metrics import classification_report 12 | from pytorch_pretrained_bert import BertForSequenceClassification 13 | from tqdm import tqdm_notebook 14 | import torch 15 | import os 16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | FILE_PATH = "./bert_pytorch.bin" 18 | VAL_X_PATH = "./val_X.npy" 19 | VAL_Y_PATH = "./val_y.npy" 20 | NUM_LABELS = 6 # number of relation tags; must match len(set(tags)) used in train.py 21 | if __name__ == "__main__": 22 | 23 | # check that training has been completed and the model file is in the right place 24 | if os.path.exists(FILE_PATH): 25 | model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=NUM_LABELS) 26 | model.load_state_dict(torch.load(FILE_PATH, map_location=device)) 27 | model.to(device) 28 | else: 29 | print(f"\nmodel file not found, run train.py first to get the model file, and put it in {FILE_PATH}") 30 | raise FileNotFoundError 31 | 32 | if os.path.exists(VAL_X_PATH): 33 | val_X = np.load(VAL_X_PATH) 34 | else: 35 | print(f"\nfile {VAL_X_PATH} not found, run train.py first to get the validation file, and put it in {VAL_X_PATH}") 36 | raise FileNotFoundError 37 | 38 | if os.path.exists(VAL_Y_PATH): 39 | val_y = np.load(VAL_Y_PATH) 40 | else: 41 | print(f"\nfile {VAL_Y_PATH} not found, run train.py first to get the validation file, and put it in {VAL_Y_PATH}") 42 | raise FileNotFoundError 43 | 44 | 45 | # freeze the model 46 | for param in model.parameters(): 47 | param.requires_grad=False 48 | model.eval() 49 | 50 | valid_preds = [] 51 | valid = torch.utils.data.TensorDataset(torch.tensor(val_X,dtype=torch.long)) 52 | valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False) 53 | 54 | tk0 = tqdm_notebook(valid_loader) 55 | for i,(x_batch,) in enumerate(tk0): 56 | pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None) 57 | valid_preds.append(np.argmax(pred.cpu().numpy(),axis=1)) 58 | 59 | valid_preds = np.concatenate(valid_preds,axis=0) 60 | # the saved val_y holds Keras tag indices starting at 1, while the predictions are 0-based class indices 61 | print(classification_report(val_y - 1, valid_preds)) -------------------------------------------------------------------------------- /BERT_experiments/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import time 4 | import gc 5 | import re 6 | import sys 7 | import os 8 | import warnings 9 | import pandas as pd 10 | import numpy as np 11 | 12 | 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.utils.data 17 | import torch.nn.functional as F 18 | from keras.utils import to_categorical 19 | from keras.preprocessing.text import Tokenizer 20 | from sklearn.metrics import f1_score, classification_report 21 | from sklearn import model_selection 22 | 23 | from tqdm import tqdm, tqdm_notebook 24 | from IPython.core.interactiveshell import InteractiveShell 25 | from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam, BertModel 26 | from utils import get_train, clean_str, convert_lines 27 | from apex import amp 28 | 29 | InteractiveShell.ast_node_interactivity = "all" 30 | warnings.filterwarnings(action='once') 31 | device=torch.device('cuda') 32 | os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3" 33 | 34 | # hyperparameters 35 | FILE_PATH = "training file.txt" 36 | MAX_SEQUENCE_LENGTH = 75 37 | TRAIN_SIZE = 6500 38 | SEED = 666 39 | EPOCHS = 5 40 |
LR=2e-5 41 | BATCH_SIZE = 32 42 | ACCUMULATION_STEPS=2 # how many steps it should backward propagate before optimization 43 | OUTPUT_FILE_NAME = "bert_pytorch.bin" 44 | 45 | # convert the origin data into a formatted pandas dataframe 46 | train_df = get_train(FILE_PATH) 47 | train_df['text'] = train_df['text'].apply(clean_str) 48 | 49 | #convert tag to sequence, maybe there are more elegant way to do this 50 | tags = train_df['tag'].to_list() 51 | tokenizer_tag = Tokenizer() 52 | tokenizer_tag.fit_on_texts(tags) 53 | tags = tokenizer_tag.texts_to_sequences(tags) 54 | tags = np.array(list((map(lambda x: x[0],tags)))) 55 | 56 | 57 | # convert text to bert format sequence 58 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 59 | sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"),MAX_SEQUENCE_LENGTH,tokenizer) 60 | 61 | 62 | #shuffle the data 63 | np.random.seed(2019) 64 | index = np.random.permutation(len(sequences)) 65 | sequences = sequences[index] 66 | tags = tags[index] 67 | 68 | 69 | # split the data into train/test 70 | X = sequences[:TRAIN_SIZE] 71 | y = tags[:TRAIN_SIZE] 72 | val_X = sequences[TRAIN_SIZE:] 73 | val_y = tags[TRAIN_SIZE:] 74 | y = to_categorical(y-1) 75 | val_y = to_categorical(val_y-1) 76 | 77 | #due to the GPU memory limitation, just use 64 samples to validate 78 | #the complete validation process would be done after the training process is over, see the evaluate.py 79 | val_y = val_y[:64] 80 | val_X = val_X[:64] 81 | train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), torch.tensor(y,dtype=torch.float)) 82 | 83 | # Initialize the model 84 | np.random.seed(SEED) 85 | torch.manual_seed(SEED) 86 | torch.cuda.manual_seed(SEED) 87 | torch.backends.cudnn.deterministic = True 88 | output_model_file = OUTPUT_FILE_NAME 89 | 90 | model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=len(set(tags))) 91 | model.zero_grad() 92 | model = model.to(device) 93 | 94 | param_optimizer = list(model.named_parameters()) 95 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 96 | optimizer_grouped_parameters = [ 97 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 98 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 99 | ] 100 | train = train_dataset 101 | 102 | num_train_optimization_steps = int(EPOCHS*len(train)/BATCH_SIZE/ACCUMULATION_STEPS) 103 | 104 | optimizer = BertAdam(optimizer_grouped_parameters, 105 | lr=LR, 106 | warmup=0.05, 107 | t_total=num_train_optimization_steps) 108 | 109 | model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0) 110 | model=model.train() 111 | 112 | # train the model 113 | tq = tqdm_notebook(range(EPOCHS)) 114 | 115 | for epoch in tq: 116 | train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True) 117 | avg_loss = 0. 118 | avg_accuracy = 0. 
119 | lossf=None 120 | tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False) 121 | optimizer.zero_grad() 122 | 123 | for i,(x_batch, y_batch) in tk0: 124 | torch.cuda.empty_cache() 125 | y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None) 126 | loss = F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device)) 127 | with amp.scale_loss(loss, optimizer) as scaled_loss: 128 | scaled_loss.backward() 129 | if (i+1) % ACCUMULATION_STEPS == 0: # Wait for several backward steps 130 | optimizer.step() # Now we can do an optimizer step 131 | optimizer.zero_grad() 132 | if lossf: 133 | lossf = 0.98*lossf+0.02*loss.item() 134 | else: 135 | lossf = loss.item() 136 | tk0.set_postfix(loss = lossf) 137 | 138 | if i % 5 == 0: 139 | 140 | val_output = model(torch.tensor(val_X).to(device),attention_mask=(torch.tensor(val_X)>0).to(device), labels=None) 141 | val_pred = np.argmax(val_output.data.cpu(),axis=1) 142 | 143 | val_loss = F.binary_cross_entropy_with_logits(val_output,torch.tensor(val_y).to(device)) 144 | accuracy = torch.sum(torch.tensor(val_pred) == \ 145 | torch.tensor(np.argmax(val_y,axis=1))).type(torch.FloatTensor) / torch.tensor(val_y).size(0) 146 | 147 | print('Step: ', i, '| train loss: %.4f' % lossf, '| test accuracy: %.2f' % accuracy,'| val loss: %2f' % val_loss.item()) 148 | print(classification_report(np.argmax(val_y,axis=1),val_pred.numpy())) 149 | tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy) 150 | 151 | # save the model and validation data for evaluate 152 | torch.save(model.state_dict(), output_model_file) 153 | np.save("val_y.npy",tags[TRAIN_SIZE:]) 154 | np.save("val_X.npy",sequences[TRAIN_SIZE:]) -------------------------------------------------------------------------------- /BERT_experiments/utils.py: -------------------------------------------------------------------------------- 1 | def get_train(file_path, no_dup = True, no_other=False): 2 | """ 3 | format the i2b2 data into a pandas dataframe. 4 | notice that there are many duplicates texts in the dataset, so adjust the parameters according 5 | to your interests. 
6 | 7 | parameters: 8 | file_path: the file's path, a string format 9 | no_dup: if true, the duplicate text would be removed 10 | no_other: if true, the samples of tag "other" should be removed 11 | 12 | sample usage: train_df = get_train("./training file.txt") 13 | return : a pd dataframe with columns: text, tag, test_info, problem_info, treatment_info 14 | """ 15 | 16 | file = open(file_path) 17 | file = [line.strip('\n').strip('\ufeff') for line in file.readlines()] 18 | 19 | def format_input(df): 20 | targets = ['test','problem','treatment'] 21 | for target in targets: 22 | df.loc[df['t1'].str.contains('\|'+target),target+'_info'] = df['t1'] 23 | df.loc[(df['t2'].str.contains('\|'+target)) & \ 24 | (df[target+'_info'].isnull()),target+'_info'] = df['t2'] 25 | df.drop(['t1','t2'],axis=1,inplace=True) 26 | if no_dup: 27 | df.drop_duplicates(['text'],inplace=True) 28 | if no_other: 29 | df = df.loc[df.tag!='other'] #delete tag "other" 30 | df.index = np.arange(df.shape[0]) 31 | return df 32 | 33 | 34 | train_df = pd.DataFrame(np.array([file[i::5] for i in range(4)]).T,columns=['text','t1','t2','tag']) 35 | train_df = format_input(train_df) 36 | return train_df 37 | 38 | 39 | 40 | def clean_str(text,lower=True): 41 | """ 42 | clean and format the text 43 | 44 | parameters: 45 | text: a string format text 46 | lower: if true, the text would be convert to lower format 47 | 48 | return: processed text 49 | """ 50 | 51 | text = text.lower() 52 | 53 | replace_pair = [(r"[^A-Za-z0-9^,!.\/'+-=]"," "),(r"what's","what is "),(r"that's","that is "),(r"there's","there is "), 54 | (r"it's","it is "),(r"\'s", " "),(r"\'ve", " have "),(r"can't", "can not "),(r"n't", " not "),(r"i'm", "i am "), 55 | (r"\'re", " are "),(r"\'d", " would "),(r"\'ll", " will "),(r",", " "),(r"\.", " "),(r"!", " ! 
"),(r"\/", " "), 56 | (r"\^", " ^ "),(r"\+", " + "),(r"\-", " - "),(r"\=", " = "),(r"'", " "),(r"(\d+)(k)", r"\g<1>000"),(r":", " : "), 57 | (r" e g ", " eg "),(r" b g ", " bg "),(r" u s ", " american "),(r"\0s", "0"),(r" 9 11 ", "911"),(r"e - mail", "email"), 58 | (r"j k", "jk"),(r"\s{2,}", " ")] 59 | 60 | for before, after in replace_pair: 61 | text = re.sub(before,after,text) 62 | 63 | return text.strip() 64 | 65 | 66 | 67 | # Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming 68 | def convert_lines(example, max_seq_length,tokenizer): 69 | """convert the given texts to BERT format sequences 70 | 71 | parameters: 72 | example: a list of text string of a pandas series 73 | max_seq_length: pad the text to max_seq_length 74 | tokenizer: bert tokenizer 75 | 76 | sample usage: 77 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 78 | sequences = convert_lines(train_df["text"].fillna("DUMMY_VALUE"), 100 ,tokenizer) 79 | 80 | return: formatted sequence 81 | """ 82 | max_seq_length -=2 83 | all_tokens = [] 84 | longer = 0 85 | for text in tqdm(example): 86 | tokens_a = tokenizer.tokenize(text) 87 | if len(tokens_a)>max_seq_length: 88 | tokens_a = tokens_a[:max_seq_length] 89 | longer += 1 90 | one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a)) 91 | all_tokens.append(one_token) 92 | print(longer) 93 | return np.array(all_tokens) -------------------------------------------------------------------------------- /LSTM_experiments/LSTM_no_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ledzy/I2B2-Entity-Relation-Extraction/abea447826a570ba6d3d9c316ca72357a3824bea/LSTM_experiments/LSTM_no_attention.png -------------------------------------------------------------------------------- /LSTM_experiments/attention_lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ledzy/I2B2-Entity-Relation-Extraction/abea447826a570ba6d3d9c316ca72357a3824bea/LSTM_experiments/attention_lstm.png -------------------------------------------------------------------------------- /LSTM_experiments/attention_lstm_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ledzy/I2B2-Entity-Relation-Extraction/abea447826a570ba6d3d9c316ca72357a3824bea/LSTM_experiments/attention_lstm_old.png -------------------------------------------------------------------------------- /LSTM_experiments/new_experiments/preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file get preprocessed data 3 | """ 4 | 5 | from keras.preprocessing.text import Tokenizer 6 | from keras.preprocessing.sequence import pad_sequences 7 | from keras.utils import to_categorical 8 | from sklearn.model_selection import train_test_split 9 | from tqdm import tqdm 10 | from utils import load_glove, clean_str, get_train 11 | import pandas as pd 12 | import numpy as np 13 | import re 14 | 15 | 16 | # get input 17 | train_df = get_train() 18 | texts = train_df['text'].to_list() 19 | tags = train_df['tag'].to_list() 20 | 21 | # clean the text 22 | train_df['text'] = train_df['text'].apply(clean_str) 23 | 24 | # text2sequence 25 | emb_size = 300 26 | max_features = 6000 27 | maxlen = 50 28 | 29 | tokenizer = Tokenizer(num_words=max_features) 30 | tokenizer.fit_on_texts(texts) 31 | word_index = 
tokenizer.word_index 32 | sequences = tokenizer.texts_to_sequences(texts) 33 | sequences = pad_sequences(sequences,maxlen=maxlen) 34 | 35 | tokenizer_tag = Tokenizer() 36 | tokenizer_tag.fit_on_texts(tags) 37 | tags = tokenizer_tag.texts_to_sequences(tags) 38 | tags = np.array(list((map(lambda x: x[0],tags)))) 39 | tags = to_categorical(tags) 40 | 41 | 42 | # load embedding 43 | emb_matrix = load_glove(word_index) 44 | 45 | # Get test/problem/treatment matrix: info_matrix 46 | # info_matrix: (m,3,maxlen), which uses one-hot to indicate the entity property of the token 47 | targets = ['test_info','problem_info','treatment_info'] 48 | info_matrix = np.zeros((sequences.shape[0],3,maxlen)) 49 | 50 | for i,target in enumerate(targets): 51 | for k,j in train_df[target].str.extract('(\d+)\|(\d+)').iterrows(): 52 | if not pd.isnull(j[0]): 53 | info_matrix[k,i,int(j[0])-1:int(j[1])] = 1 54 | 55 | 56 | # Shuffle the data 57 | np.random.seed(2019) 58 | index = np.random.permutation(len(sequences)) 59 | 60 | sequences = sequences[index] 61 | tags = tags[index] 62 | info_matrix = info_matrix[index].swapaxes(1,2) -------------------------------------------------------------------------------- /LSTM_experiments/new_experiments/train.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Dense, LSTM, CuDNNLSTM, Dropout, Embedding, Softmax, CuDNNGRU 2 | from keras.layers import Bidirectional, concatenate, RepeatVector, Dot, Activation, merge, Reshape, Add 3 | from keras.models import Model, Sequential 4 | from keras.optimizers import Adam 5 | from keras import backend as K 6 | from keras.callbacks import TensorBoard 7 | from keras.utils import to_categorical 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.metrics import classification_report 10 | from utils import f1, get_train, clean_str, load_glove 11 | import tensorflow as tf 12 | import os #for setting GPU 13 | import time #for formatting log name 14 | 15 | from preprocess import * #run preprocess file 16 | 17 | os.environ['CUDA_VISIBLE_DEVICES'] = "0" 18 | 19 | # hyperparameters 20 | LR = 1e-4 21 | EPOCHS = 50 22 | 23 | # define layer obejcts for achieving attention 24 | repeat_vec = RepeatVector(maxlen) 25 | densor = Dense(1,activation='relu') # repeat_vec & densor are used for one_step_attention 26 | activator = Activation('softmax',name='attention_weights') 27 | dotor = Dot(axes=1,name='context') 28 | densor2 = Dense(1,activation='relu') 29 | 30 | def one_step_attention(s_prev,a): 31 | """ 32 | Note: This attention method is not applicable for the task, since the dataset is small. Instead, 33 | using one_step_attention_v2. 
34 | 35 | calculate the weight of each word for the given input, using second LSTM's previous 36 | hidden state and the output of the first LSTM (see attention_lstm_old.png) 37 | 38 | parameters: 39 | s_prev: the hidden state of the second LSTM 40 | a: the output of the first LSTM 41 | """ 42 | s_prev = repeat_vec(s_prev) 43 | concat = concatenate([s_prev,a],axis=-1) 44 | concat = Activation(activation='tanh')(concat) 45 | e = densor(concat) 46 | alphas = activator(e) 47 | context = dotor([alphas,a]) 48 | 49 | return context 50 | 51 | 52 | def one_step_attention_v2(a): 53 | """ 54 | use only the previous state to form attention weights, see attention_lstm.png 55 | 56 | parameters: 57 | a: the output of the first LSTM 58 | """ 59 | e = densor2(a) 60 | alphas = activator(e) 61 | context = dotor([alphas,a]) 62 | 63 | return context 64 | 65 | 66 | def model(): 67 | 68 | sequences = Input(shape=(maxlen,),name='sequences') 69 | info_matrix = Input(shape=(maxlen,3),name='info_matrix') 70 | 71 | embedding = Embedding(max_features,emb_size,weights=[emb_matrix],trainable=False,name='embedding')(sequences) 72 | X = concatenate([embedding,info_matrix],axis=2,name='concat') 73 | a = Bidirectional(CuDNNLSTM(64,return_sequences=True))(X) 74 | 75 | context = one_step_attention_v2(a) 76 | context = Activation(activation='tanh')(context) 77 | 78 | output = Dense(tags.shape[1],activation='softmax')(context) 79 | output = Reshape((tags.shape[1],))(output) 80 | 81 | model = Model(inputs=[sequences,info_matrix],outputs=output) 82 | return model 83 | 84 | 85 | # run the model 86 | def run_model(record=True,validation_split=0.15,epochs=50,lr=1e-3): 87 | deep_model = model() 88 | 89 | opt = Adam(lr=1e-3) 90 | deep_model.compile(opt,loss='categorical_crossentropy',metrics=[f1,'accuracy']) 91 | 92 | if record == True: 93 | deep_model.fit([sequences,info_matrix],tags,epochs=epochs,validation_split=validation_split,callbacks=[tensorboard]) 94 | else: deep_model.fit([sequences,info_matrix],tags,epochs=epochs,validation_split=validation_split) 95 | 96 | return deep_model 97 | 98 | # adjust the NAME according to your needs 99 | NAME = "Attention_v2-Simplify-Para-BiLSTM-Freeze-Embedding-Add-Pos-Preprocessing{}".format(int(time.time())) 100 | tensorboard = TensorBoard(log_dir='logs/{}'.format(NAME)) 101 | 102 | deep_model = run_model(lr=LR,epochs=EPOCHS) -------------------------------------------------------------------------------- /LSTM_experiments/new_experiments/utils.py: -------------------------------------------------------------------------------- 1 | def get_train(file_path, no_dup = True, no_other=False): 2 | """ 3 | format the i2b2 data into a pandas dataframe. 4 | notice that there are many duplicates texts in the dataset, so adjust the parameters according 5 | to your interests. 
6 | 7 | parameters: 8 | file_path: the file's path, a string format 9 | no_dup: if true, the duplicate text would be removed 10 | no_other: if true, the samples of tag "other" should be removed 11 | 12 | sample usage: train_df = get_train("./training file.txt") 13 | return : a pd dataframe with columns: text, tag, test_info, problem_info, treatment_info 14 | """ 15 | 16 | file = open(file_path) 17 | file = [line.strip('\n').strip('\ufeff') for line in file.readlines()] 18 | 19 | def format_input(df): 20 | targets = ['test','problem','treatment'] 21 | for target in targets: 22 | df.loc[df['t1'].str.contains('\|'+target),target+'_info'] = df['t1'] 23 | df.loc[(df['t2'].str.contains('\|'+target)) & \ 24 | (df[target+'_info'].isnull()),target+'_info'] = df['t2'] 25 | df.drop(['t1','t2'],axis=1,inplace=True) 26 | if no_dup: 27 | df.drop_duplicates(['text'],inplace=True) 28 | if no_other: 29 | df = df.loc[df.tag!='other'] #delete tag "other" 30 | df.index = np.arange(df.shape[0]) 31 | return df 32 | 33 | 34 | train_df = pd.DataFrame(np.array([file[i::5] for i in range(4)]).T,columns=['text','t1','t2','tag']) 35 | train_df = format_input(train_df) 36 | return train_df 37 | 38 | 39 | 40 | def clean_str(text,lower=True): 41 | """ 42 | clean and format the text 43 | 44 | parameters: 45 | text: a string format text 46 | lower: if true, the text would be convert to lower format 47 | 48 | return: processed text 49 | """ 50 | 51 | text = text.lower() 52 | 53 | replace_pair = [(r"[^A-Za-z0-9^,!.\/'+-=]"," "),(r"what's","what is "),(r"that's","that is "),(r"there's","there is "), 54 | (r"it's","it is "),(r"\'s", " "),(r"\'ve", " have "),(r"can't", "can not "),(r"n't", " not "),(r"i'm", "i am "), 55 | (r"\'re", " are "),(r"\'d", " would "),(r"\'ll", " will "),(r",", " "),(r"\.", " "),(r"!", " ! 
"),(r"\/", " "), 56 | (r"\^", " ^ "),(r"\+", " + "),(r"\-", " - "),(r"\=", " = "),(r"'", " "),(r"(\d+)(k)", r"\g<1>000"),(r":", " : "), 57 | (r" e g ", " eg "),(r" b g ", " bg "),(r" u s ", " american "),(r"\0s", "0"),(r" 9 11 ", "911"),(r"e - mail", "email"), 58 | (r"j k", "jk"),(r"\s{2,}", " ")] 59 | 60 | for before, after in replace_pair: 61 | text = re.sub(before,after,text) 62 | 63 | return text.strip() 64 | 65 | def load_glove(word_index): 66 | def get_coefs(word,*emb): return word, np.asarray(emb,dtype='float32') 67 | embedding = dict(get_coefs(*o.split(' ')) for o in tqdm(open('glove.840B.300d.txt'))) 68 | 69 | emb_mean, emb_std = -0.005838459, 0.48782179 70 | embed_matrix = np.random.normal(emb_mean,emb_std,(max_features,emb_size)) 71 | 72 | for word, i in word_index.items(): 73 | if i >= max_features: continue 74 | if embedding.get(word) is not None: 75 | embed_matrix[i] = embedding.get(word) 76 | 77 | return embed_matrix 78 | 79 | def f1(y_true, y_pred): 80 | def recall(y_true, y_pred): 81 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 82 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 83 | recall = true_positives / (possible_positives + K.epsilon()) 84 | return recall 85 | 86 | def precision(y_true, y_pred): 87 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 88 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 89 | precision = true_positives / (predicted_positives + K.epsilon()) 90 | return precision 91 | 92 | precision = precision(y_true, y_pred) 93 | recall = recall(y_true, y_pred) 94 | return 2*((precision*recall)/(precision+recall+K.epsilon())) -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | These code were written when i just started using keras, so 3 | i'm not familiar with keras preprocessing tools and thereby did 4 | a lot of repetitive work. 
I keep it here to be a record for myself, 5 | but i suggest you to read new_experiments which is more readable:) 6 | """ 7 | 8 | # coding=utf-8 9 | 10 | from keras.layers import Dense, Input, CuDNNLSTM ,Dropout, Activation, Bidirectional 11 | from keras.models import Model 12 | from keras.layers.embeddings import Embedding 13 | from keras.layers import concatenate 14 | from keras.preprocessing import sequence 15 | from keras.initializers import glorot_uniform 16 | from keras.utils import to_categorical 17 | from utils import * 18 | from embedding import pretrained_embedding_layer 19 | from gensim.models.keyedvectors import KeyedVectors 20 | from imblearn.over_sampling import RandomOverSampler 21 | from imblearn.under_sampling import RandomUnderSampler 22 | import numpy as np 23 | import random 24 | import os 25 | 26 | 27 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" #specify the GPU 28 | 29 | 30 | 31 | #Define the model 32 | def classifier(input_shape, input_shape2, word_to_vec_map, word_to_index): 33 | 34 | sentence_indices = Input(input_shape, dtype = 'int32') 35 | 36 | prob_test_oht = Input(input_shape2, dtype = 'float32') 37 | 38 | embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index) 39 | embeddings = embedding_layer(sentence_indices) 40 | 41 | #concatenate the embedding with prob_test_oht 42 | embeddings = concatenate([embeddings,prob_test_oht], axis=-1) 43 | 44 | #propogate the data through layers 45 | X = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embeddings) 46 | X = Dropout(0.4)(X) 47 | X = Bidirectional(CuDNNLSTM(128, return_sequences = False))(X) 48 | X = Dropout(0.4)(X) 49 | X = Dense(6)(X) 50 | X = Activation('softmax')(X) 51 | 52 | model = Model(inputs=[sentence_indices,prob_test_oht], outputs = X) 53 | 54 | return model 55 | 56 | 57 | #load the file and embedding 58 | print('Reading the file...') 59 | with open('training file.txt', 'r') as f: 60 | data = f.readlines() 61 | print('File closed.\n') 62 | 63 | print('Loading the word-embedding...') 64 | word_to_vec_map = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True) 65 | print('Word-embedding loaded.\n') 66 | 67 | 68 | word_to_index, index_to_word = load_word_map(data) 69 | tag_to_index_map, index_to_tag_map = load_tag(data) 70 | 71 | Y_tags = data[3::5] 72 | Y_indices = [tag_to_index_map[tag] for tag in Y_tags] 73 | 74 | # Y_indices = [index for index in tag_to_index_map[Y_tags]] 75 | 76 | X = np.array(data[0::5]) 77 | # X = np.random.shuffle(X) 78 | 79 | max_len = get_max_length(X) 80 | 81 | 82 | prob_test_matrix = prob_test_matrix(data,max_len) 83 | 84 | 85 | #format the input of the model 86 | X_train_indices = sentences_to_indices(X,word_to_index,max_len) 87 | Y_train = to_categorical(Y_indices) 88 | 89 | 90 | #balance the training set 91 | ros = RandomOverSampler(random_state=0) #repeat all tags to the same #of the largest tags 92 | # ros = RandomUnderSampler(replacement=True, random_state=0) #Reduce the size of largest tags 93 | 94 | #shuflle 95 | index = [i for i in range(len(X_train_indices))] 96 | random.shuffle(index) 97 | prob_test_matrix = np.array([prob_test_matrix[i] for i in index]) 98 | X_train_indices = np.array([X_train_indices[i] for i in index]) 99 | Y_train = np.array([Y_train[i] for i in index]) 100 | 101 | #split into train and test 102 | X_train = X_train_indices[:int(0.8*len(X_train_indices))] 103 | X_test = X_train_indices[int(0.8*len(X_train_indices)):] 104 | Y_test = Y_train[int(0.8*len(X_train_indices)):] 105 | Y_train = 
Y_train[:int(0.8*len(X_train_indices))] 106 | prob_test_matrix_train = prob_test_matrix[:int(0.8*len(X_train_indices))] 107 | prob_test_matrix_test = prob_test_matrix[int(0.8*len(X_train_indices)):] 108 | 109 | X_resampled_train, Y_resampled_train = ros.fit_sample(X_train, Y_train) 110 | X_resampled_test, Y_resampled_test = ros.fit_sample(X_test, Y_test) 111 | prob_test_matrix_train_resampled,_ = ros.fit_sample(prob_test_matrix_train, Y_train) 112 | prob_test_matrix_test_resampled,_ = ros.fit_sample(prob_test_matrix_test, Y_test) 113 | 114 | 115 | #expand the dimension 116 | pt_matrix_train = [] 117 | pt_matrix_test = [] 118 | prob_test_dim = 32 119 | for i in range(prob_test_dim): 120 | pt_matrix_train.append(prob_test_matrix_train_resampled) 121 | pt_matrix_test.append(prob_test_matrix_test_resampled) 122 | 123 | #format the order of dimension 124 | pt_matrix_train = np.transpose(pt_matrix_train,(1,2,0)) 125 | pt_matrix_test = np.transpose(pt_matrix_test,(1,2,0)) 126 | 127 | 128 | 129 | #split validation of the model, can be omitted 130 | data_split = 5 131 | train_log = [] 132 | for i in range(data_split): 133 | start = int(i/5*len(X_resampled)) 134 | end = int((i+1)/5*len(X_resampled)) 135 | 136 | X_resampled_test = X_resampled[start:end] 137 | Y_resampled_test = Y_resampled[start:end] 138 | pt_matrix_test = pt_matrix[start:end] 139 | 140 | X_resampled_train = np.append(X_resampled[0:start],X_resampled[end:],axis=0) 141 | Y_resampled_train = np.append(Y_resampled[0:start],Y_resampled[end:],axis=0) 142 | pt_matrix_train = np.append(pt_matrix[0:start],pt_matrix[end:],axis=0) 143 | 144 | print('Constructing the model, split ',i) 145 | model = classifier((max_len,),(max_len,32),word_to_vec_map,word_to_index) 146 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 147 | History = model.fit([X_resampled_train,pt_matrix_train], Y_resampled_train, epochs=1, batch_size=50,validation_data=([X_resampled_test,pt_matrix_test],Y_resampled_test), shuffle=True) 148 | 149 | history = History.history 150 | acc,loss,val_acc,val_loss = history['acc'][-1], history['loss'][-1], history['val_acc'][-1], history['val_loss'][-1] 151 | train_log.append([acc,loss,val_acc,val_loss]) 152 | 153 | 154 | #print the performance of the model 155 | model = classifier((max_len,),(max_len,32),word_to_vec_map,word_to_index) 156 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 157 | model.fit([X_resampled_train,pt_matrix_train],Y_resampled_train,epochs=100,validation_data=([X_resampled_test,pt_matrix_test],Y_resampled_test),batch_size=100,shuffle=True) 158 | 159 | 160 | #save the model 161 | model.save('add_bidirect.h5') -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.layers.embeddings import Embedding 3 | 4 | 5 | #define a embedding layer which is initialized with pre-trained word-embedding 6 | def pretrained_embedding_layer(word_to_vec_map, word_to_index): 7 | 8 | vocab_length = len(word_to_index) + 1 #adding 1 to fit the Embedding layer (keras requirement) 9 | emb_dim = word_to_vec_map['at'].shape[0] 10 | 11 | emb_matrix = np.zeros((vocab_length, emb_dim)) 12 | 13 | success = 0 14 | fail = [] 15 | for word, index in word_to_index.items(): 16 | try: 17 | emb_matrix[index,:] = word_to_vec_map[word] 18 | success += 1 19 | 20 | except Exception as e: 21 
| fail.append(word) 22 | emb_matrix[index,:] = -np.random.randn(emb_dim)/20 # out-of-vocabulary words get a small random vector 23 | 24 | 25 | embedding_layer = Embedding(vocab_length,emb_dim,trainable=False) 26 | 27 | embedding_layer.build((None,)) 28 | 29 | embedding_layer.set_weights([emb_matrix]) 30 | 31 | return embedding_layer 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/readme.md: -------------------------------------------------------------------------------- 1 | This code was written when I had just started using Keras. 2 | I was not familiar with the Keras preprocessing tools and thereby did 3 | a lot of repetitive work.
I keep it here as a record for myself, 4 | but I suggest reading new_experiments instead, which is more readable :) -------------------------------------------------------------------------------- /LSTM_experiments/old_experiments/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.utils import to_categorical 3 | import re 4 | 5 | #build the word_to_index and index_to_word mappings 6 | def load_word_map(data): 7 | #temp solution: re-read the training file instead of using the passed-in data 8 | with open('training file.txt', 'r') as f: 9 | data = f.read() 10 | 11 | word_to_index = {} 12 | index_to_word = {} 13 | index = 0 14 | for word in set(data.split()): 15 | word_to_index[word] = index 16 | index += 1 17 | 18 | for word, index in word_to_index.items(): 19 | index_to_word[index] = word 20 | 21 | return word_to_index, index_to_word 22 | 23 | 24 | #get the length (in words) of the longest sentence in the input 25 | def get_max_length(input_sequence): 26 | max_len = 0 27 | 28 | for sentence in input_sequence: 29 | length = len(sentence.split()) 30 | if length > max_len: 31 | max_len = length 32 | 33 | return max_len 34 | 35 | 36 | #convert the sentences into index sequences to feed the embedding layer 37 | def sentences_to_indices(X, word_to_index, max_len): 38 | 39 | m = X.shape[0] 40 | X_indices = np.zeros((m, max_len)) 41 | 42 | for i in range(m): 43 | sentence_words = [w for w in X[i].split()] 44 | 45 | j = 0 46 | for word in sentence_words: 47 | try: 48 | X_indices[i,j] = word_to_index[word] 49 | j += 1 50 | except Exception as e: 51 | print(e) 52 | print(word, end='') 53 | 54 | return X_indices 55 | 56 | 57 | #get the 6 tags from the data 58 | def load_tag(data): 59 | tag = set(data[3::5]) 60 | tag_to_index = {} 61 | index_to_tag = {} 62 | index = 0 63 | for i in tag: 64 | tag_to_index[i] = index 65 | index += 1 66 | 67 | for index, tag in tag_to_index.items(): 68 | index_to_tag[index] = tag 69 | 70 | return tag_to_index,index_to_tag 71 | 72 | 73 | '''get the problem/test information from the data 74 | and build a matrix such that the cells of words 75 | tagged as problem are set to 1 and the cells of 76 | test words are set to -1 77 | ''' 78 | def prob_test_matrix(data,max_len): 79 | test_info = data[1::5] 80 | prob_info = data[2::5] 81 | test_pos = [] 82 | prob_pos = [] 83 | 84 | for key in test_info: 85 | pos = [int(i) for i in re.findall('\d+',key)] 86 | test_pos.append(pos) 87 | 88 | for key in prob_info: 89 | pos = [int(i) for i in re.findall('\d+',key)] 90 | prob_pos.append(pos) 91 | 92 | # test_pos_oht = to_categorical(test_pos) 93 | # prob_pos_oht = -to_categorical(prob_pos) 94 | 95 | prob_test_matrix = np.zeros([len(test_info),max_len]) 96 | 97 | for i in range(len(test_info)): 98 | 99 | prob_test_matrix[i,prob_pos[i][0]:prob_pos[i][1]+1] = 1 100 | prob_test_matrix[i,test_pos[i][0]:test_pos[i][1]+1] = -1 101 | 102 | return prob_test_matrix
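For reference, here is a small worked example (not part of the original code) of the ±1 position encoding that `prob_test_matrix` builds for each sentence; the entity spans below are made-up word indices rather than values read from the real training file:

```python
import numpy as np

# Toy illustration of the encoding produced by utils.prob_test_matrix for one sentence.
# The spans are hypothetical inclusive word indices, not taken from the real data.
max_len = 12
test_span = (2, 3)     # words 2..3 belong to a "test" entity
problem_span = (7, 9)  # words 7..9 belong to a "problem" entity

row = np.zeros(max_len)
row[problem_span[0]:problem_span[1] + 1] = 1   # problem words -> +1
row[test_span[0]:test_span[1] + 1] = -1        # test words    -> -1
print(row)  # [ 0.  0. -1. -1.  0.  0.  0.  1.  1.  1.  0.  0.]
```

The newer pipeline in new_experiments/preprocess.py replaces this single ±1 channel with a (3, maxlen) one-hot info_matrix, so that test, problem and treatment entities each get their own channel.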
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # I2B2-Entity-Relation-Extraction 2 | This repository contains several approaches to entity relation extraction on the i2b2 dataset, which I have uploaded in the repository (training file.txt). 3 | 4 | Two main approaches are implemented and compared: 5 | * LSTM + attention mechanism 6 | * BERT, a state-of-the-art pretrained language model 7 | 8 | ## LSTM 9 | Several LSTM architectures were tried. Since the dataset is relatively small and contains many repeated texts, the model should not be too complex. 10 | Currently the best LSTM model for the task is the following: 11 | ![Image text](https://github.com/Ledzy/I2B2-Entity-Relation-Extraction/blob/master/LSTM_experiments/attention_lstm.png) 12 | 13 | which is based on: 14 | [Zhou, Peng, et al. "Attention-based bidirectional long short-term memory networks for relation classification." Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). Vol. 2. 2016.](https://www.aclweb.org/anthology/P16-2034) 15 | 16 | Note: the embedding file GoogleNews-vectors-negative300.bin can be downloaded from https://code.google.com/archive/p/word2vec/. 17 | 18 | ## BERT 19 | The BERT experiments fine-tune the pretrained bert-base-uncased model with BertAdam (warmup and weight decay), mixed-precision training via apex, and gradient accumulation; see BERT_experiments/train.py. After training, BERT_experiments/evaluate.py reloads the saved weights (bert_pytorch.bin) together with the held-out validation arrays and reports precision/recall/F1 via sklearn's classification_report. 20 | --------------------------------------------------------------------------------
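As a rough usage illustration for the BERT pipeline (not part of the original repository), the sketch below loads the weights saved by BERT_experiments/train.py and classifies a single new sentence. The example sentence is invented, and NUM_LABELS, MAX_SEQUENCE_LENGTH and the weight path are assumptions that must match your own training run:

```python
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

NUM_LABELS = 6            # assumption: must equal len(set(tags)) from train.py
MAX_SEQUENCE_LENGTH = 75  # must match the value used in train.py

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS)
model.load_state_dict(torch.load("bert_pytorch.bin", map_location=device))
model.to(device)
model.eval()

text = "the chest x-ray revealed a small pleural effusion"  # invented example sentence

# Same conversion as utils.convert_lines: [CLS] + tokens + [SEP], then zero-pad.
tokens = tokenizer.tokenize(text)[:MAX_SEQUENCE_LENGTH - 2]
ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
ids = ids + [0] * (MAX_SEQUENCE_LENGTH - len(ids))

x = torch.tensor([ids], dtype=torch.long).to(device)
with torch.no_grad():
    logits = model(x, attention_mask=(x > 0).to(device), labels=None)

pred = int(np.argmax(logits.cpu().numpy(), axis=1)[0])
# train.py encodes the Keras tag indices with to_categorical(y - 1), so class index
# `pred` corresponds to tag index `pred + 1` in tokenizer_tag.index_word.
print("predicted class index:", pred)
```

Cleaning the text with utils.clean_str before tokenization, as train.py does, should keep inference consistent with how the model was trained.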