├── LICENSE ├── process_mimic.py ├── README.md ├── testDoctorAI.py └── doctorAI.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, mp2893 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of Doctor AI nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /process_mimic.py: -------------------------------------------------------------------------------- 1 | # This script processes MIMIC-III dataset and builds longitudinal diagnosis records for patients with at least two visits. 2 | # The output data are cPickled, and suitable for training Doctor AI or RETAIN 3 | # Written by Edward Choi (mp2893@gatech.edu) 4 | # Usage: Put this script to the foler where MIMIC-III CSV files are located. Then execute the below command. 5 | # python process_mimic.py ADMISSIONS.csv DIAGNOSES_ICD.csv 6 | 7 | # Output files 8 | # .pids: List of unique Patient IDs. Used for intermediate processing 9 | # .dates: List of List of Python datetime objects. The outer List is for each patient. The inner List is for each visit made by each patient 10 | # .seqs: List of List of List of integer diagnosis codes. The outer List is for each patient. The middle List contains visits made by each patient. The inner List contains the integer diagnosis codes that occurred in each visit 11 | # .types: Python dictionary that maps string diagnosis codes to integer diagnosis codes. 12 | 13 | import sys 14 | import cPickle as pickle 15 | from datetime import datetime 16 | 17 | def convert_to_icd9(dxStr): 18 | if dxStr.startswith('E'): 19 | if len(dxStr) > 4: return dxStr[:4] + '.' + dxStr[4:] 20 | else: return dxStr 21 | else: 22 | if len(dxStr) > 3: return dxStr[:3] + '.' + dxStr[3:] 23 | else: return dxStr 24 | 25 | def convert_to_3digit_icd9(dxStr): 26 | if dxStr.startswith('E'): 27 | if len(dxStr) > 4: return dxStr[:4] 28 | else: return dxStr 29 | else: 30 | if len(dxStr) > 3: return dxStr[:3] 31 | else: return dxStr 32 | 33 | if __name__ == '__main__': 34 | admissionFile = sys.argv[1] 35 | diagnosisFile = sys.argv[2] 36 | outFile = sys.argv[3] 37 | 38 | print 'Building pid-admission mapping, admission-date mapping' 39 | pidAdmMap = {} 40 | admDateMap = {} 41 | infd = open(admissionFile, 'r') 42 | infd.readline() 43 | for line in infd: 44 | tokens = line.strip().split(',') 45 | pid = int(tokens[1]) 46 | admId = int(tokens[2]) 47 | admTime = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S') 48 | admDateMap[admId] = admTime 49 | if pid in pidAdmMap: pidAdmMap[pid].append(admId) 50 | else: pidAdmMap[pid] = [admId] 51 | infd.close() 52 | 53 | print 'Building admission-dxList mapping' 54 | admDxMap = {} 55 | infd = open(diagnosisFile, 'r') 56 | infd.readline() 57 | for line in infd: 58 | tokens = line.strip().split(',') 59 | admId = int(tokens[2]) 60 | dxStr = 'D_' + convert_to_icd9(tokens[4][1:-1]) ############## Uncomment this line and comment the line below, if you want to use the entire ICD9 digits. 61 | #dxStr = 'D_' + convert_to_3digit_icd9(tokens[4][1:-1]) 62 | if admId in admDxMap: admDxMap[admId].append(dxStr) 63 | else: admDxMap[admId] = [dxStr] 64 | infd.close() 65 | 66 | print 'Building pid-sortedVisits mapping' 67 | pidSeqMap = {} 68 | for pid, admIdList in pidAdmMap.iteritems(): 69 | if len(admIdList) < 2: continue 70 | sortedList = sorted([(admDateMap[admId], admDxMap[admId]) for admId in admIdList]) 71 | pidSeqMap[pid] = sortedList 72 | 73 | print 'Building pids, dates, strSeqs' 74 | pids = [] 75 | dates = [] 76 | seqs = [] 77 | for pid, visits in pidSeqMap.iteritems(): 78 | pids.append(pid) 79 | seq = [] 80 | date = [] 81 | for visit in visits: 82 | date.append(visit[0]) 83 | seq.append(visit[1]) 84 | dates.append(date) 85 | seqs.append(seq) 86 | 87 | print 'Converting strSeqs to intSeqs, and making types' 88 | types = {} 89 | newSeqs = [] 90 | for patient in seqs: 91 | newPatient = [] 92 | for visit in patient: 93 | newVisit = [] 94 | for code in visit: 95 | if code in types: 96 | newVisit.append(types[code]) 97 | else: 98 | types[code] = len(types) 99 | newVisit.append(types[code]) 100 | newPatient.append(newVisit) 101 | newSeqs.append(newPatient) 102 | 103 | pickle.dump(pids, open(outFile+'.pids', 'wb'), -1) 104 | pickle.dump(dates, open(outFile+'.dates', 'wb'), -1) 105 | pickle.dump(newSeqs, open(outFile+'.seqs', 'wb'), -1) 106 | pickle.dump(types, open(outFile+'.types', 'wb'), -1) 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Doctor AI 2 | ========================================= 3 | 4 | Doctor AI is a automatic diagnosis machine that predicts medical codes that occur in the next visit, while also predicting the time duration until the next visit. 5 | 6 | #### Relevant Publications 7 | 8 | Doctor AI implements an algorithm introduced in the following: 9 | 10 | Doctor AI: Predicting Clinical Events via Recurrent Neural Networks 11 | Edward Choi, Mohammad Taha Bahadori, Andy Schuetz, Walter F. Stewart, Jimeng Sun 12 | arXiv preprint arXiv:1511.05942 13 | 14 | Medical Concept Representation Learning from Electronic Health Records and its Application on Heart Failure Prediction 15 | Edward Choi, Andy Schuetz, Walter F. Stewart, Jimeng Sun 16 | arXiv preprint arXiv:1602.03686 17 | 18 | #### Running Doctor AI 19 | 20 | **STEP 1: Installation** 21 | 22 | 1. Install [python](https://www.python.org/), [Theano](http://deeplearning.net/software/theano/index.html). We use Python 2.7, Theano 0.7. Theano can be easily installed in Ubuntu as suggested [here](http://deeplearning.net/software/theano/install_ubuntu.html#install-ubuntu) 23 | 24 | 2. If you plan to use GPU computation, install [CUDA](https://developer.nvidia.com/cuda-downloads) 25 | 26 | 3. Download/clone the Doctor AI code 27 | 28 | **STEP 2: Preparing training data** 29 | 30 | 0. You can use "process_mimic.py" to process MIMIC-III dataset and generate a suitable training dataset for Doctor AI. Place the script to the same location where the MIMIC-III CSV files are located, and run the script. Instructions are described inside the script. However, I recommend the readers to read the following steps to understand the structure of the training data and learn how to prepare their own dataset. 31 | 32 | 1. Doctor AI's training dataset needs to be a Python Pickled list of list of list. Each list corresponds to patients, visits, and medical codes (e.g. diagnosis codes, medication codes, procedure codes, etc.) 33 | First, medical codes need to be converted to an integer. Then a single visit can be seen as a list of integers. Then a patient can be seen as a list of visits. 34 | For example, [5,8,15] means the patient was assigned with code 5, 8, and 15 at a certain visit. 35 | If a patient made two visits [1,2,3] and [4,5,6,7], it can be converted to a list of list [[1,2,3], [4,5,6,7]]. 36 | Multiple patients can be represented as [[[1,2,3], [4,5,6,7]], [[2,4], [8,3,1], [3]]], which means there are two patients where the first patient made two visits and the second patient made three visits. 37 | This list of list of list needs to be pickled using cPickle. We will refer to this file as the "visit file". 38 | 39 | 2. The total number of unique medical codes is required to run Doctor AI. 40 | For example, if the dataset is using 14,000 diagnosis codes and 11,000 procedure codes, the total number is 25,000. 41 | 42 | 3. The label dataset (let us call this "label file") needs to have the same format as the "visit file". 43 | The important thing is, time steps of both "label file" and "visit file" need to match. DO NOT train Doctor AI with labels that is one time step ahead of the visits. It is tempting since Doctor AI predicts the labels of the next visit. But it is internally taken care of. 44 | You can use the "visit file" as the "label file" if you want Doctor AI to predict the exact codes. 45 | Or you can use a grouped codes as the "label file" if you are okay with reasonable predictions and want to save time. 46 | For example, ICD9 diagnosis codes can be grouped into 283 categories by using [CCS](https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp) groupers. 47 | We STRONGLY recommend you do this, because the number of medical codes can be as high as tens of thousands, 48 | which can cause not only low predictive performance but also memory issues. (The high-end GPUs typically have only 12GB of VRAM) 49 | 50 | 4. Same as step 2, you will need to remember the total number of unique codes in the "label file". 51 | If you are using "visit file" as the "label file", than the number of unique codes will be the same, of course. 52 | 53 | 5. The "visit file" and "label file" need to have 3 sets respectively: training set, validation set, and test set. 54 | The file extension must be ".train", ".valid", and ".test" respectivley. 55 | For example, if you want to use a file named "my_visit_sequences" as the "visit file", then Doctor AI will try to load "my_visit_sequences.train", "my_visit_sequences.valid", and "my_visit_sequences.test". 56 | This is also true for the "label file" 57 | 58 | 5. You can use the time duration between visits as an additional source of information. Let us call this "time file". 59 | "time file" needs to be prepared as a Python Pickled List of List. Each list corresponds to patients and the duration between each visit. 60 | For example, given a "visit file" [[[1,2,3], [4,5,6,7]], [[2,4], [8,3,1], [3]]], its corresponding "time file" should look like [[0, 15], [0, 45, 23]]. 61 | Of course, the numbers are fake, but the important thing is that the duration for the first visit needs to be zero. 62 | Use "--time\_file" option to use "time file" 63 | Remember that the ".train", ".valid", ".test" rule also applies to the "time file" as well. 64 | 65 | **Additional: Predicting time duration until next visit** 66 | In addtion to predicting the codes of the next visit, you can make Doctor AI predict the time duration until next visit. 67 | Use "--predict\_time" option to do this. And obviously, predicting time requires the "time file". 68 | Time prediction also comes with many hyperparameters such as "--tradeoff", "--L2\_time", "--use\_log\_time". 69 | Refer to "--help" for more detailed information 70 | 71 | **Additional: Using your own medical code representations** 72 | Doctor AI internally learns vector representation of medical codes while training. These vectors are initialized with random values of course. 73 | You can, however, also provide medical code representations, if you have one. (They can be easily trained by using Skip-gram like algorithms.) 74 | If you want to provide the medical code representations, it has to be a list of list (basically a matrix) of N rows and M columns where N is the number of unique codes in your "visit file" and M is the size of the code representations. 75 | Specify the path to your code representation file using "--embed\_file". 76 | For more details regarding the training of medical code representations and using them for predictive tasks, please refer to the second paper of the "Related Publication" section. 77 | Additionally even if you provided your own medical code representations, you can re-train (a.k.a fine-tune) them as you train Doctor AI. 78 | Use "--embed\_finetune" option to do this. If you are not providing your own medical code representations, Doctor AI will use randomly initialized one, which obviously requires this fine-tuning process. Since the default is to use the fine-tuning, you do not need to worry about this. 79 | 80 | **STEP 3: Running Doctor AI** 81 | 82 | 1. The minimum input you need to run Doctor AI is the "visit file", the number of unique medical codes in the "visit file", 83 | the "label file", the number of unique medical codes in the "label file", and the output path. The output path is where the learned weights will be saved. 84 | `python doctorAI.py <# codes in the visit file> <# codes in the label file> ` 85 | 86 | 2. Specifying `--verbose` option will print training process after each 10 mini-batches. 87 | 88 | 3. You can specify how many GRU layers you want to use by using "--hidden\_dim\_size" option. 89 | For example "--hidden\_dim\_size \[400,200\]" will give you a two layer GRU where the lower layer uses a 400-dimensional hidden layer 90 | and the upper layer uses a 200-dimensional hidden layer. 91 | 92 | 4. Additional options can be specified such as the size of the embedding layer, batch size, the number of epochs, dropout rate, etc. Detailed information can be accessed by `python doctorAI.py --help` 93 | 94 | **STEP 4: Getting your results** 95 | 96 | Doctor AI checks the validation cross entropy after each epoch, and if it is lower than all previous value, it will save the current model. The model file is generated by [numpy.savez_compressed](http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.savez_compressed.html). 97 | 98 | **Step 5: Testing your model** 99 | 100 | 1. Using the file "testDoctorAI.py", you can calculate the recall@10,20,30 for the code prediction and R^2 for the time prediction. First you need to have a trained model that was saved by numpy.savez\_compressed. Note that you need to know the configuration with which you trained Doctor AI (use of "time file", use of "--use\_log\_time", value of "--hidden\_dim\_size", etc.) 101 | 102 | 2. Again, you need the "visit file" and "label file" prepared in the same way. This time, however, you do not need to follow the ".train", ".valid", ".test" rule. The testing script will try to load the file name as given. 103 | 104 | 3. Using additional options such as "--hidden\_dim\_size" and "--use\_log\_time", you should use exactly the same configuration with which you trained the model. For more detailed information, use "--help" option. 105 | 106 | 4. To evaluate the time prediction performance, we provide R^2 error. In order to calculate this, you need to provide the mean value of all durations in the "time file" you used to train Doctor AI. (You must ignore the 0 duration of the first visits or course) Use "--mean\_duration" option to do this. 107 | 108 | 5. The minimum input to run the testing script is the "model file", "visit file", "label file", and "hidden dim size". 109 | `python testDoctorAI.py ` 110 | -------------------------------------------------------------------------------- /testDoctorAI.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # Code written by Edward Choi (mp2893@gatech.edu) 3 | # For bug report, please contact author using the email address 4 | ################################################################# 5 | 6 | import sys 7 | import numpy as np 8 | import cPickle as pickle 9 | from collections import OrderedDict 10 | import argparse 11 | 12 | import theano 13 | import theano.tensor as T 14 | from theano import config 15 | 16 | from Queue import heapq 17 | import operator 18 | import time 19 | import warnings 20 | 21 | def recallTop(y_true, y_pred, rank=[10, 20, 30]): 22 | recall = list() 23 | for i in range(len(y_pred)): 24 | thisOne = list() 25 | codes = y_true[i] 26 | tops = y_pred[i] 27 | for rk in rank: 28 | thisOne.append(len(set(codes).intersection(set(tops[:rk])))*1.0/len(set(codes))) 29 | recall.append( thisOne ) 30 | return (np.array(recall)).mean(axis=0).tolist() 31 | 32 | def calculate_r_squared(trueVec, predVec, options): 33 | if options['useLogTime']: 34 | trueVec = np.log(np.array(trueVec) + options['logEps']) 35 | mean_duration = np.log(options['mean_duration']) 36 | else: 37 | trueVec = np.array(trueVec) 38 | mean_duration = options['mean_duration'] 39 | predVec = np.array(predVec) 40 | 41 | 42 | numerator = ((trueVec - predVec) ** 2).sum() 43 | denominator = ((trueVec - mean_duration) ** 2).sum() 44 | 45 | return 1.0 - (numerator / denominator) 46 | 47 | def numpy_floatX(data): 48 | return np.asarray(data, dtype=config.floatX) 49 | 50 | def init_tparams(params): 51 | tparams = OrderedDict() 52 | for key, value in params.iteritems(): 53 | tparams[key] = theano.shared(value, name=key) 54 | return tparams 55 | 56 | def gru_layer(tparams, emb, layerIndex, hiddenDimSize, mask=None): 57 | timesteps = emb.shape[0] 58 | if emb.ndim == 3: n_samples = emb.shape[1] 59 | else: n_samples = 1 60 | 61 | W_rx = T.dot(emb, tparams['W_r_'+layerIndex]) 62 | W_zx = T.dot(emb, tparams['W_z_'+layerIndex]) 63 | Wx = T.dot(emb, tparams['W_'+layerIndex]) 64 | 65 | def stepFn(stepMask, wrx, wzx, wx, h): 66 | r = T.nnet.sigmoid(wrx + T.dot(h, tparams['U_r_'+layerIndex]) + tparams['b_r_'+layerIndex]) 67 | z = T.nnet.sigmoid(wzx + T.dot(h, tparams['U_z_'+layerIndex]) + tparams['b_z_'+layerIndex]) 68 | h_tilde = T.tanh(wx + T.dot(r*h, tparams['U_'+layerIndex]) + tparams['b_'+layerIndex]) 69 | h_new = z * h + ((1. - z) * h_tilde) 70 | h_new = stepMask[:, None] * h_new + (1. - stepMask)[:, None] * h 71 | return h_new#, output, time 72 | 73 | results, updates = theano.scan(fn=stepFn, sequences=[mask,W_rx,W_zx,Wx], outputs_info=T.alloc(numpy_floatX(0.0), n_samples, hiddenDimSize), name='gru_layer'+layerIndex, n_steps=timesteps) 74 | 75 | return results 76 | 77 | def build_model(tparams, options): 78 | x = T.tensor3('x', dtype=config.floatX) 79 | t = T.matrix('t', dtype=config.floatX) 80 | mask = T.matrix('mask', dtype=config.floatX) 81 | 82 | n_timesteps = x.shape[0] 83 | n_samples = x.shape[1] 84 | 85 | emb = T.dot(x, tparams['W_emb']) 86 | if options['useTime']: 87 | emb = T.concatenate([t.reshape([n_timesteps,n_samples,1]), emb], axis=2) #Adding the time element to the embedding 88 | 89 | inputVector = emb 90 | for i, hiddenDimSize in enumerate(options['hiddenDimSize']): 91 | memories = gru_layer(tparams, inputVector, str(i), hiddenDimSize, mask=mask) 92 | inputVector = memories * 0.5 93 | 94 | def softmaxStep(memory2d): 95 | return T.nnet.softmax(T.dot(memory2d, tparams['W_output']) + tparams['b_output']) 96 | 97 | results, updates = theano.scan(fn=softmaxStep, sequences=[inputVector], outputs_info=None, name='softmax_layer', n_steps=n_timesteps) 98 | results = results * mask[:,:,None] 99 | 100 | duration = 0.0 101 | if options['predictTime']: 102 | duration = T.maximum(T.dot(inputVector, tparams['W_time']) + tparams['b_time'], 0) 103 | duration = duration.reshape([n_timesteps,n_samples]) * mask 104 | return x, t, mask, results, duration 105 | elif options['useTime']: 106 | return x, t, mask, results 107 | else: 108 | return x, mask, results 109 | 110 | def load_data(dataFile, labelFile, timeFile): 111 | test_set_x = np.array(pickle.load(open(dataFile, 'rb'))) 112 | test_set_y = np.array(pickle.load(open(labelFile, 'rb'))) 113 | test_set_t = None 114 | if len(timeFile) > 0: 115 | test_set_t = np.array(pickle.load(open(timeFile, 'rb'))) 116 | 117 | def len_argsort(seq): 118 | return sorted(range(len(seq)), key=lambda x: len(seq[x])) 119 | 120 | sorted_index = len_argsort(test_set_x) 121 | test_set_x = [test_set_x[i] for i in sorted_index] 122 | test_set_y = [test_set_y[i] for i in sorted_index] 123 | if len(timeFile) > 0: 124 | test_set_t = [test_set_t[i] for i in sorted_index] 125 | 126 | test_set = (test_set_x, test_set_y, test_set_t) 127 | 128 | return test_set 129 | 130 | def padMatrixWithTime(seqs, times, options): 131 | lengths = np.array([len(seq) for seq in seqs]) - 1 132 | n_samples = len(seqs) 133 | maxlen = np.max(lengths) 134 | inputDimSize = options['inputDimSize'] 135 | numClass = options['numClass'] 136 | 137 | x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX) 138 | t = np.zeros((maxlen, n_samples)).astype(config.floatX) 139 | mask = np.zeros((maxlen, n_samples)).astype(config.floatX) 140 | for idx, (seq,time) in enumerate(zip(seqs,times)): 141 | for xvec, subseq in zip(x[:,idx,:], seq[:-1]): 142 | xvec[subseq] = 1. 143 | mask[:lengths[idx], idx] = 1. 144 | t[:lengths[idx], idx] = time[:-1] 145 | 146 | if options['useLogTime']: 147 | t = np.log(t + options['logEps']) 148 | 149 | return x, t, mask, lengths 150 | 151 | def padMatrixWithoutTime(seqs, options): 152 | lengths = np.array([len(seq) for seq in seqs]) - 1 153 | n_samples = len(seqs) 154 | maxlen = np.max(lengths) 155 | inputDimSize = options['inputDimSize'] 156 | numClass = options['numClass'] 157 | 158 | x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX) 159 | mask = np.zeros((maxlen, n_samples)).astype(config.floatX) 160 | for idx, seq in enumerate(seqs): 161 | for xvec, subseq in zip(x[:,idx,:], seq[:-1]): 162 | xvec[subseq] = 1. 163 | mask[:lengths[idx], idx] = 1. 164 | 165 | return x, mask, lengths 166 | 167 | def test_doctorAI( 168 | modelFile='model.txt', 169 | seqFile='seq.txt', 170 | inputDimSize=20000, 171 | labelFile='label.txt', 172 | numClass=500, 173 | timeFile='', 174 | predictTime=False, 175 | useLogTime=True, 176 | hiddenDimSize=[200,200], 177 | batchSize=100, 178 | logEps=1e-8, 179 | mean_duration=20.0, 180 | verbose=False 181 | ): 182 | options = locals().copy() 183 | 184 | if len(timeFile) > 0: useTime = True 185 | else: useTime = False 186 | options['useTime'] = useTime 187 | 188 | models = np.load(modelFile) 189 | tparams = init_tparams(models) 190 | 191 | print 'build model ... ', 192 | if predictTime: 193 | x, t, mask, codePred, timePred = build_model(tparams, options) 194 | predict_code = theano.function(inputs=[x,t,mask], outputs=codePred, name='predict_code') 195 | predict_time = theano.function(inputs=[x,t,mask], outputs=timePred, name='predict_time') 196 | elif useTime: 197 | x, t, mask, codePred = build_model(tparams, options) 198 | predict_code = theano.function(inputs=[x,t,mask], outputs=codePred, name='predict_code') 199 | else: 200 | x, mask, codePred = build_model(tparams, options) 201 | predict_code = theano.function(inputs=[x,mask], outputs=codePred, name='predict_code') 202 | 203 | options['inputDimSize']=models['W_emb'].shape[0] 204 | options['numClass']=models['b_output'].shape[0] 205 | print 'load data ... ', 206 | testSet = load_data(seqFile, labelFile, timeFile) 207 | n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize))) 208 | print 'done' 209 | 210 | predVec = [] 211 | trueVec = [] 212 | predTimeVec = [] 213 | trueTimeVec = [] 214 | iteration = 0 215 | for batchIndex in range(n_batches): 216 | tempX = testSet[0][batchIndex*batchSize: (batchIndex+1)*batchSize] 217 | tempY = testSet[1][batchIndex*batchSize: (batchIndex+1)*batchSize] 218 | if predictTime: 219 | tempT = testSet[2][batchIndex*batchSize: (batchIndex+1)*batchSize] 220 | x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options) 221 | codeResults = predict_code(x, t, mask) 222 | timeResults = predict_time(x, t, mask) 223 | elif useTime: 224 | tempT = testSet[2][batchIndex*batchSize: (batchIndex+1)*batchSize] 225 | x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options) 226 | codeResults = predict_code(x, t, mask) 227 | else: 228 | x, mask, lengths = padMatrixWithoutTime(tempX, options) 229 | codeResults = predict_code(x, mask) 230 | 231 | for i in range(codeResults.shape[1]): 232 | tensorMatrix = codeResults[:,i,:] 233 | thisY = tempY[i][1:] 234 | for timeIndex in range(lengths[i]): 235 | if len(thisY[timeIndex]) == 0: continue 236 | trueVec.append(thisY[timeIndex]) 237 | output = tensorMatrix[timeIndex] 238 | predVec.append(zip(*heapq.nlargest(30, enumerate(output), key=operator.itemgetter(1)))[0]) 239 | 240 | if predictTime: 241 | for i in range(timeResults.shape[1]): 242 | timeVec = timeResults[:,i] 243 | trueTimeVec.extend(tempT[i][1:]) 244 | for timeIndex in range(lengths[i]): 245 | predTimeVec.append(timeVec[timeIndex]) 246 | 247 | if (iteration % 10 == 0) and verbose: print 'iteration:%d/%d' % (iteration, n_batches) 248 | iteration += 1 249 | if iteration == 10: break 250 | 251 | recall = recallTop(trueVec, predVec) 252 | print 'recall@10:%f, recall@20:%f, recall@30:%f' % (recall[0], recall[1], recall[2]) 253 | 254 | if predictTime: 255 | r_squared = calculate_r_squared(trueTimeVec, predTimeVec, options) 256 | print 'R2:%f' % r_squared 257 | 258 | def parse_arguments(parser): 259 | parser.add_argument('model_file', type=str, metavar='', help='The path to the model file saved by Doctor AI') 260 | parser.add_argument('seq_file', type=str, metavar='', help='The path to the Pickled file containing visit information of patients') 261 | parser.add_argument('label_file', type=str, metavar='', help='The path to the Pickled file containing label information of patients') 262 | parser.add_argument('hidden_dim_size', type=str, metavar='', help='The size of the hidden layers of the Doctor AI. This is a string argument. For example, [500,400] means you are using a two-layer GRU where the lower layer uses a 500-dimensional hidden layer, and the upper layer uses a 400-dimensional hidden layer. (default value: [200,200])') 263 | parser.add_argument('--time_file', type=str, default='', help='The path to the Pickled file containing durations between visits of patients. If you are not using duration information, do not use this option') 264 | parser.add_argument('--predict_time', type=int, default=0, choices=[0,1], help='Use this option if you want Doctor AI to also predict the time duration until the next visit (0 for false, 1 for true) (default value: 0)') 265 | parser.add_argument('--use_log_time', type=int, default=1, choices=[0,1], help='Use logarithm of time duration to dampen the impact of the outliers (0 for false, 1 for true) (default value: 1)') 266 | parser.add_argument('--batch_size', type=int, default=100, help='The size of a single mini-batch (default value: 100)') 267 | parser.add_argument('--mean_duration', type=float, default=20.0, help='The mean value of the durations between visits of the training data. This will be used to calculate the R^2 error (default value: 20.0)') 268 | parser.add_argument('--verbose', action='store_true', help='Print output after every 10 mini-batches (default false)') 269 | args = parser.parse_args() 270 | return args 271 | 272 | if __name__ == '__main__': 273 | parser = argparse.ArgumentParser() 274 | args = parse_arguments(parser) 275 | hiddenDimSize = [int(strDim) for strDim in args.hidden_dim_size[1:-1].split(',')] 276 | 277 | if args.predict_time and args.time_file == '': 278 | print 'Cannot predict time duration without time file' 279 | sys.exit() 280 | 281 | test_doctorAI( 282 | modelFile=args.model_file, 283 | seqFile=args.seq_file, 284 | labelFile=args.label_file, 285 | timeFile=args.time_file, 286 | predictTime=args.predict_time, 287 | useLogTime=args.use_log_time, 288 | hiddenDimSize=hiddenDimSize, 289 | batchSize=args.batch_size, 290 | mean_duration=args.mean_duration, 291 | verbose=args.verbose 292 | ) 293 | -------------------------------------------------------------------------------- /doctorAI.py: -------------------------------------------------------------------------------- 1 | ################################################################# 2 | # Code written by Edward Choi (mp2893@gatech.edu) 3 | # For bug report, please contact author using the email address 4 | ################################################################# 5 | 6 | import sys, random 7 | import numpy as np 8 | import cPickle as pickle 9 | from collections import OrderedDict 10 | import argparse 11 | 12 | import theano 13 | import theano.tensor as T 14 | from theano import config 15 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 16 | 17 | def unzip(zipped): 18 | new_params = OrderedDict() 19 | for key, value in zipped.iteritems(): 20 | new_params[key] = value.get_value() 21 | return new_params 22 | 23 | def numpy_floatX(data): 24 | return np.asarray(data, dtype=config.floatX) 25 | 26 | def load_embedding(infile): 27 | Wemb = np.array(pickle.load(open(infile, 'rb'))).astype(config.floatX) 28 | return Wemb 29 | 30 | def init_params(options): 31 | params = OrderedDict() 32 | timeFile = options['timeFile'] 33 | embFile = options['embFile'] 34 | embSize = options['embSize'] 35 | inputDimSize = options['inputDimSize'] 36 | numClass = options['numClass'] 37 | 38 | if len(embFile) > 0: 39 | print 'using external code embedding' 40 | params['W_emb'] = load_embedding(embFile) 41 | embSize = params['W_emb'].shape[1] 42 | else: 43 | print 'using randomly initialized code embedding' 44 | params['W_emb'] = np.random.uniform(-0.01, 0.01, (inputDimSize, embSize)).astype(config.floatX) 45 | params['b_emb'] = np.zeros(embSize).astype(config.floatX) 46 | 47 | prevDimSize = embSize 48 | if len(timeFile) > 0: prevDimSize += 1 #We need to consider an extra dimension for the duration information 49 | for count, hiddenDimSize in enumerate(options['hiddenDimSize']): 50 | params['W_'+str(count)] = np.random.uniform(-0.01, 0.01, (prevDimSize, hiddenDimSize)).astype(config.floatX) 51 | params['W_r_'+str(count)] = np.random.uniform(-0.01, 0.01, (prevDimSize, hiddenDimSize)).astype(config.floatX) 52 | params['W_z_'+str(count)] = np.random.uniform(-0.01, 0.01, (prevDimSize, hiddenDimSize)).astype(config.floatX) 53 | params['U_'+str(count)] = np.random.uniform(-0.01, 0.01, (hiddenDimSize, hiddenDimSize)).astype(config.floatX) 54 | params['U_r_'+str(count)] = np.random.uniform(-0.01, 0.01, (hiddenDimSize, hiddenDimSize)).astype(config.floatX) 55 | params['U_z_'+str(count)] = np.random.uniform(-0.01, 0.01, (hiddenDimSize, hiddenDimSize)).astype(config.floatX) 56 | params['b_'+str(count)] = np.zeros(hiddenDimSize).astype(config.floatX) 57 | params['b_r_'+str(count)] = np.zeros(hiddenDimSize).astype(config.floatX) 58 | params['b_z_'+str(count)] = np.zeros(hiddenDimSize).astype(config.floatX) 59 | prevDimSize = hiddenDimSize 60 | 61 | params['W_output'] = np.random.uniform(-0.01, 0.01, (prevDimSize, numClass)).astype(config.floatX) 62 | params['b_output'] = np.zeros(numClass).astype(config.floatX) 63 | 64 | if options['predictTime']: 65 | params['W_time'] = np.random.uniform(-0.01, 0.01, (prevDimSize, 1)).astype(config.floatX) 66 | params['b_time'] = np.zeros(1).astype(config.floatX) 67 | 68 | return params 69 | 70 | def init_tparams(params, options): 71 | tparams = OrderedDict() 72 | for key, value in params.iteritems(): 73 | if not options['embFineTune'] and key == 'W_emb': continue 74 | tparams[key] = theano.shared(value, name=key) 75 | return tparams 76 | 77 | def dropout_layer(state_before, use_noise, trng, dropout_rate): 78 | proj = T.switch(use_noise, (state_before * trng.binomial(state_before.shape, p=dropout_rate, n=1, dtype=state_before.dtype)), state_before * 0.5) 79 | return proj 80 | 81 | def gru_layer(tparams, emb, layerIndex, hiddenDimSize, mask=None): 82 | timesteps = emb.shape[0] 83 | if emb.ndim == 3: n_samples = emb.shape[1] 84 | else: n_samples = 1 85 | 86 | W_rx = T.dot(emb, tparams['W_r_'+layerIndex]) 87 | W_zx = T.dot(emb, tparams['W_z_'+layerIndex]) 88 | Wx = T.dot(emb, tparams['W_'+layerIndex]) 89 | 90 | def stepFn(stepMask, wrx, wzx, wx, h): 91 | r = T.nnet.sigmoid(wrx + T.dot(h, tparams['U_r_'+layerIndex]) + tparams['b_r_'+layerIndex]) 92 | z = T.nnet.sigmoid(wzx + T.dot(h, tparams['U_z_'+layerIndex]) + tparams['b_z_'+layerIndex]) 93 | h_tilde = T.tanh(wx + T.dot(r*h, tparams['U_'+layerIndex]) + tparams['b_'+layerIndex]) 94 | h_new = z * h + ((1. - z) * h_tilde) 95 | h_new = stepMask[:, None] * h_new + (1. - stepMask)[:, None] * h 96 | return h_new 97 | 98 | results, updates = theano.scan(fn=stepFn, sequences=[mask,W_rx,W_zx,Wx], outputs_info=T.alloc(numpy_floatX(0.0), n_samples, hiddenDimSize), name='gru_layer'+layerIndex, n_steps=timesteps) 99 | 100 | return results 101 | 102 | def build_model(tparams, options, W_emb=None): 103 | trng = RandomStreams(123) 104 | use_noise = theano.shared(numpy_floatX(0.)) 105 | if len(options['timeFile']) > 0: useTime = True 106 | else: useTime = False 107 | 108 | x = T.tensor3('x', dtype=config.floatX) 109 | t = T.matrix('t', dtype=config.floatX) 110 | y = T.tensor3('y', dtype=config.floatX) 111 | t_label = T.matrix('t_label', dtype=config.floatX) 112 | mask = T.matrix('mask', dtype=config.floatX) 113 | lengths = T.vector('lengths', dtype=config.floatX) 114 | 115 | n_timesteps = x.shape[0] 116 | n_samples = x.shape[1] 117 | 118 | if options['embFineTune']: emb = T.tanh(T.dot(x, tparams['W_emb']) + tparams['b_emb']) 119 | else: emb = T.tanh(T.dot(x, W_emb) + tparams['b_emb']) 120 | if useTime: 121 | emb = T.concatenate([t.reshape([n_timesteps,n_samples,1]), emb], axis=2) #Adding the time element to the embedding 122 | 123 | inputVector = emb 124 | for i, hiddenDimSize in enumerate(options['hiddenDimSize']): 125 | memories = gru_layer(tparams, inputVector, str(i), hiddenDimSize, mask=mask) 126 | memories = dropout_layer(memories, use_noise, trng, options['dropout_rate']) 127 | inputVector = memories 128 | 129 | def softmaxStep(memory2d): 130 | return T.nnet.softmax(T.dot(memory2d, tparams['W_output']) + tparams['b_output']) 131 | 132 | logEps = options['logEps'] 133 | results, updates = theano.scan(fn=softmaxStep, sequences=[inputVector], outputs_info=None, name='softmax_layer', n_steps=n_timesteps) 134 | results = results * mask[:,:,None] 135 | cross_entropy = -(y * T.log(results + logEps) + (1. - y) * T.log(1. - results + logEps)) 136 | prediction_loss = cross_entropy.sum(axis=2).sum(axis=0) / lengths 137 | 138 | if options['predictTime']: 139 | duration = T.maximum(T.dot(inputVector, tparams['W_time']) + tparams['b_time'], 0) #ReLU 140 | duration = duration.reshape([n_timesteps,n_samples]) * mask 141 | duration_loss = 0.5 * ((duration - t_label) ** 2).sum(axis=0) / lengths 142 | cost = T.mean(prediction_loss) + options['tradeoff'] * T.mean(duration_loss) + options['L2_output'] * (tparams['W_output'] ** 2).sum() + options['L2_time'] * (tparams['W_time'] ** 2).sum() 143 | else: 144 | cost = T.mean(prediction_loss) + options['L2_output'] * (tparams['W_output'] ** 2).sum() 145 | 146 | if options['predictTime']: return use_noise, x, y, t, t_label, mask, lengths, cost 147 | elif useTime: return use_noise, x, y, t, mask, lengths, cost 148 | else: return use_noise, x, y, mask, lengths, cost 149 | 150 | def adadelta(tparams, grads, x, y, mask, lengths, cost, options, t=None, t_label=None): 151 | zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k) for k, p in tparams.iteritems()] 152 | running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k) for k, p in tparams.iteritems()] 153 | running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k) for k, p in tparams.iteritems()] 154 | 155 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 156 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)] 157 | 158 | if options['predictTime']: 159 | f_grad_shared = theano.function([x, y, t, t_label, mask, lengths], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') 160 | elif len(options['timeFile']) > 0: 161 | f_grad_shared = theano.function([x, y, t, mask, lengths], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') 162 | else: 163 | f_grad_shared = theano.function([x, y, mask, lengths], cost, updates=zgup + rg2up, name='adadelta_f_grad_shared') 164 | 165 | updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)] 166 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)] 167 | param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)] 168 | 169 | f_update = theano.function([], [], updates=ru2up + param_up, on_unused_input='ignore', name='adadelta_f_update') 170 | 171 | return f_grad_shared, f_update 172 | 173 | def padMatrixWithTimePrediction(seqs, labels, times, options): 174 | lengths = np.array([len(seq) for seq in seqs]) - 1 175 | n_samples = len(seqs) 176 | maxlen = np.max(lengths) 177 | inputDimSize = options['inputDimSize'] 178 | numClass = options['numClass'] 179 | 180 | x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX) 181 | y = np.zeros((maxlen, n_samples, numClass)).astype(config.floatX) 182 | t = np.zeros((maxlen, n_samples)).astype(config.floatX) 183 | t_label = np.zeros((maxlen, n_samples)).astype(config.floatX) 184 | mask = np.zeros((maxlen, n_samples)).astype(config.floatX) 185 | for idx, (seq,time,label) in enumerate(zip(seqs,times,labels)): 186 | for xvec, subseq in zip(x[:,idx,:], seq[:-1]): 187 | xvec[subseq] = 1. 188 | for yvec, subseq in zip(y[:,idx,:], label[1:]): 189 | yvec[subseq] = 1. 190 | mask[:lengths[idx], idx] = 1. 191 | t[:lengths[idx], idx] = time[:-1] 192 | t_label[:lengths[idx], idx] = time[1:] 193 | 194 | lengths = np.array(lengths, dtype=config.floatX) 195 | if options['useLogTime']: 196 | t = np.log(t + options['logEps']) 197 | t_label = np.log(t_label + options['logEps']) 198 | 199 | return x, y, t, t_label, mask, lengths 200 | 201 | def padMatrixWithTime(seqs, labels, times, options): 202 | lengths = np.array([len(seq) for seq in seqs]) - 1 203 | n_samples = len(seqs) 204 | maxlen = np.max(lengths) 205 | inputDimSize = options['inputDimSize'] 206 | numClass = options['numClass'] 207 | 208 | x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX) 209 | y = np.zeros((maxlen, n_samples, numClass)).astype(config.floatX) 210 | t = np.zeros((maxlen, n_samples)).astype(config.floatX) 211 | mask = np.zeros((maxlen, n_samples)).astype(config.floatX) 212 | for idx, (seq,time,label) in enumerate(zip(seqs,times,labels)): 213 | for xvec, subseq in zip(x[:,idx,:], seq[:-1]): 214 | xvec[subseq] = 1. 215 | for yvec, subseq in zip(y[:,idx,:], label[1:]): 216 | yvec[subseq] = 1. 217 | mask[:lengths[idx], idx] = 1. 218 | t[:lengths[idx], idx] = time[:-1] 219 | 220 | lengths = np.array(lengths, dtype=config.floatX) 221 | if options['useLogTime']: 222 | t = np.log(t + options['logEps']) 223 | 224 | return x, y, t, mask, lengths 225 | 226 | def padMatrixWithoutTime(seqs, labels, options): 227 | lengths = np.array([len(seq) for seq in seqs]) - 1 228 | n_samples = len(seqs) 229 | maxlen = np.max(lengths) 230 | inputDimSize = options['inputDimSize'] 231 | numClass = options['numClass'] 232 | 233 | x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX) 234 | y = np.zeros((maxlen, n_samples, numClass)).astype(config.floatX) 235 | mask = np.zeros((maxlen, n_samples)).astype(config.floatX) 236 | for idx, (seq,label) in enumerate(zip(seqs,labels)): 237 | for xvec, subseq in zip(x[:,idx,:], seq[:-1]): 238 | xvec[subseq] = 1. 239 | for yvec, subseq in zip(y[:,idx,:], label[1:]): 240 | yvec[subseq] = 1. 241 | mask[:lengths[idx], idx] = 1. 242 | 243 | lengths = np.array(lengths, dtype=config.floatX) 244 | 245 | return x, y, mask, lengths 246 | 247 | def load_data(seqFile, labelFile, timeFile): 248 | train_set_x = pickle.load(open(seqFile+'.train', 'rb')) 249 | valid_set_x = pickle.load(open(seqFile+'.valid', 'rb')) 250 | test_set_x = pickle.load(open(seqFile+'.test', 'rb')) 251 | train_set_y = pickle.load(open(labelFile+'.train', 'rb')) 252 | valid_set_y = pickle.load(open(labelFile+'.valid', 'rb')) 253 | test_set_y = pickle.load(open(labelFile+'.test', 'rb')) 254 | train_set_t = None 255 | valid_set_t = None 256 | test_set_t = None 257 | 258 | if len(timeFile) > 0: 259 | train_set_t = pickle.load(open(timeFile+'.train', 'rb')) 260 | valid_set_t = pickle.load(open(timeFile+'.valid', 'rb')) 261 | test_set_t = pickle.load(open(timeFile+'.test', 'rb')) 262 | 263 | '''For debugging purposes 264 | sequences = np.array(pickle.load(open(seqFile, 'rb'))) 265 | labels = np.array(pickle.load(open(labelFile, 'rb'))) 266 | if len(timeFile) > 0: 267 | times = np.array(pickle.load(open(timeFile, 'rb'))) 268 | 269 | dataSize = len(labels) 270 | np.random.seed(0) 271 | ind = np.random.permutation(dataSize) 272 | nTest = int(0.15 * dataSize) 273 | nValid = int(0.10 * dataSize) 274 | 275 | test_indices = ind[:nTest] 276 | valid_indices = ind[nTest:nTest+nValid] 277 | train_indices = ind[nTest+nValid:] 278 | 279 | train_set_x = sequences[train_indices] 280 | train_set_y = labels[train_indices] 281 | test_set_x = sequences[test_indices] 282 | test_set_y = labels[test_indices] 283 | valid_set_x = sequences[valid_indices] 284 | valid_set_y = labels[valid_indices] 285 | train_set_t = None 286 | test_set_t = None 287 | valid_set_t = None 288 | 289 | if len(timeFile) > 0: 290 | train_set_t = times[train_indices] 291 | test_set_t = times[test_indices] 292 | valid_set_t = times[valid_indices] 293 | ''' 294 | 295 | def len_argsort(seq): 296 | return sorted(range(len(seq)), key=lambda x: len(seq[x])) 297 | 298 | train_sorted_index = len_argsort(train_set_x) 299 | train_set_x = [train_set_x[i] for i in train_sorted_index] 300 | train_set_y = [train_set_y[i] for i in train_sorted_index] 301 | 302 | valid_sorted_index = len_argsort(valid_set_x) 303 | valid_set_x = [valid_set_x[i] for i in valid_sorted_index] 304 | valid_set_y = [valid_set_y[i] for i in valid_sorted_index] 305 | 306 | test_sorted_index = len_argsort(test_set_x) 307 | test_set_x = [test_set_x[i] for i in test_sorted_index] 308 | test_set_y = [test_set_y[i] for i in test_sorted_index] 309 | 310 | if len(timeFile) > 0: 311 | train_set_t = [train_set_t[i] for i in train_sorted_index] 312 | valid_set_t = [valid_set_t[i] for i in valid_sorted_index] 313 | test_set_t = [test_set_t[i] for i in test_sorted_index] 314 | 315 | train_set = (train_set_x, train_set_y, train_set_t) 316 | valid_set = (valid_set_x, valid_set_y, valid_set_t) 317 | test_set = (test_set_x, test_set_y, test_set_t) 318 | 319 | return train_set, valid_set, test_set 320 | 321 | def calculate_auc(test_model, dataset, options): 322 | inputDimSize = options['inputDimSize'] 323 | numClass = options['numClass'] 324 | batchSize = options['batchSize'] 325 | useTime = options['useTime'] 326 | predictTime = options['predictTime'] 327 | 328 | n_batches = int(np.ceil(float(len(dataset[0])) / float(batchSize))) 329 | aucSum = 0.0 330 | dataCount = 0.0 331 | for index in xrange(n_batches): 332 | batchX = dataset[0][index*batchSize:(index+1)*batchSize] 333 | batchY = dataset[1][index*batchSize:(index+1)*batchSize] 334 | if predictTime: 335 | batchT = dataset[2][index*batchSize:(index+1)*batchSize] 336 | x, y, t, t_label, mask, lengths = padMatrixWithTimePrediction(batchX, batchY, batchT, options) 337 | auc = test_model(x, y, t, t_label, mask, lengths) 338 | elif useTime: 339 | batchT = dataset[2][index*batchSize:(index+1)*batchSize] 340 | x, y, t, mask, lengths = padMatrixWithTime(batchX, batchY, batchT, options) 341 | auc = test_model(x, y, t, mask, lengths) 342 | else: 343 | x, y, mask, lengths = padMatrixWithoutTime(batchX, batchY, options) 344 | auc = test_model(x, y, mask, lengths) 345 | aucSum += auc * len(batchX) 346 | dataCount += float(len(batchX)) 347 | return aucSum / dataCount 348 | 349 | def train_doctorAI( 350 | seqFile='seqFile.txt', 351 | inputDimSize=20000, 352 | labelFile='labelFile.txt', 353 | numClass=500, 354 | outFile='outFile.txt', 355 | timeFile='timeFile.txt', 356 | predictTime=False, 357 | tradeoff=1.0, 358 | useLogTime=True, 359 | embFile='embFile.txt', 360 | embSize=200, 361 | embFineTune=True, 362 | hiddenDimSize=[200,200], 363 | batchSize=100, 364 | max_epochs=10, 365 | L2_output=0.001, 366 | L2_time=0.001, 367 | dropout_rate=0.5, 368 | logEps=1e-8, 369 | verbose=False 370 | ): 371 | options = locals().copy() 372 | 373 | if len(timeFile) > 0: useTime = True 374 | else: useTime = False 375 | options['useTime'] = useTime 376 | 377 | print 'Initializing the parameters ... ', 378 | params = init_params(options) 379 | tparams = init_tparams(params, options) 380 | 381 | print 'Building the model ... ', 382 | f_grad_shared = None 383 | f_update = None 384 | if predictTime and embFineTune: 385 | print 'predicting duration, fine-tuning code representations' 386 | use_noise, x, y, t, t_label, mask, lengths, cost = build_model(tparams, options) 387 | grads = T.grad(cost, wrt=tparams.values()) 388 | f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t, t_label) 389 | elif predictTime and not embFineTune: 390 | print 'predicting duration, not fine-tuning code representations' 391 | W_emb = theano.shared(params['W_emb'], name='W_emb') 392 | use_noise, x, y, t, t_label, mask, lengths, cost = build_model(tparams, options, W_emb) 393 | grads = T.grad(cost, wrt=tparams.values()) 394 | f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t, t_label) 395 | elif useTime and embFineTune: 396 | print 'using duration information, fine-tuning code representations' 397 | use_noise, x, y, t, mask, lengths, cost = build_model(tparams, options) 398 | grads = T.grad(cost, wrt=tparams.values()) 399 | f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t) 400 | elif useTime and not embFineTune: 401 | print 'using duration information, not fine-tuning code representations' 402 | W_emb = theano.shared(params['W_emb'], name='W_emb') 403 | use_noise, x, y, t, mask, lengths, cost = build_model(tparams, options, W_emb) 404 | grads = T.grad(cost, wrt=tparams.values()) 405 | f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t) 406 | elif not useTime and embFineTune: 407 | print 'not using duration information, fine-tuning code representations' 408 | use_noise, x, y, mask, lengths, cost = build_model(tparams, options) 409 | grads = T.grad(cost, wrt=tparams.values()) 410 | f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options) 411 | elif not useTime and not embFineTune: 412 | print 'not using duration information, not fine-tuning code representations' 413 | W_emb = theano.shared(params['W_emb'], name='W_emb') 414 | use_noise, x, y, mask, lengths, cost = build_model(tparams, options, W_emb) 415 | grads = T.grad(cost, wrt=tparams.values()) 416 | f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options) 417 | 418 | print 'Loading data ... ', 419 | trainSet, validSet, testSet = load_data(seqFile, labelFile, timeFile) 420 | n_batches = int(np.ceil(float(len(trainSet[0])) / float(batchSize))) 421 | print 'done' 422 | 423 | if predictTime: test_model = theano.function(inputs=[x, y, t, t_label, mask, lengths], outputs=cost, name='test_model') 424 | elif useTime: test_model = theano.function(inputs=[x, y, t, mask, lengths], outputs=cost, name='test_model') 425 | else: test_model = theano.function(inputs=[x, y, mask, lengths], outputs=cost, name='test_model') 426 | 427 | bestValidCrossEntropy = 1e20 428 | bestValidEpoch = 0 429 | testCrossEntropy = 0.0 430 | print 'Optimization start !!' 431 | for epoch in xrange(max_epochs): 432 | iteration = 0 433 | costVector = [] 434 | for index in random.sample(range(n_batches), n_batches): 435 | use_noise.set_value(1.) 436 | batchX = trainSet[0][index*batchSize:(index+1)*batchSize] 437 | batchY = trainSet[1][index*batchSize:(index+1)*batchSize] 438 | if predictTime: 439 | batchT = trainSet[2][index*batchSize:(index+1)*batchSize] 440 | x, y, t, t_label, mask, lengths = padMatrixWithTimePrediction(batchX, batchY, batchT, options) 441 | cost = f_grad_shared(x, y, t, t_label, mask, lengths) 442 | elif useTime: 443 | batchT = trainSet[2][index*batchSize:(index+1)*batchSize] 444 | x, y, t, mask, lengths = padMatrixWithTime(batchX, batchY, batchT, options) 445 | cost = f_grad_shared(x, y, t, mask, lengths) 446 | else: 447 | x, y, mask, lengths = padMatrixWithoutTime(batchX, batchY, options) 448 | cost = f_grad_shared(x, y, mask, lengths) 449 | costVector.append(cost) 450 | f_update() 451 | if (iteration % 10 == 0) and verbose: print 'epoch:%d, iteration:%d/%d, cost:%f' % (epoch, iteration, n_batches, cost) 452 | iteration += 1 453 | 454 | print 'epoch:%d, mean_cost:%f' % (epoch, np.mean(costVector)) 455 | use_noise.set_value(0.) 456 | validAuc = calculate_auc(test_model, validSet, options) 457 | print 'Validation cross entropy:%f at epoch:%d' % (validAuc, epoch) 458 | if validAuc < bestValidCrossEntropy: 459 | bestValidCrossEntropy = validAuc 460 | bestValidEpoch = epoch 461 | bestParams = unzip(tparams) 462 | testCrossEntropy = calculate_auc(test_model, testSet, options) 463 | print 'Test cross entropy:%f at epoch:%d' % (testCrossEntropy, epoch) 464 | tempParams = unzip(tparams) 465 | np.savez_compressed(outFile + '.' + str(epoch), **tempParams) 466 | print 'The best valid cross entropy:%f at epoch:%d' % (bestValidCrossEntropy, bestValidEpoch) 467 | print 'The test cross entropy: %f' % testCrossEntropy 468 | 469 | def parse_arguments(parser): 470 | parser.add_argument('seq_file', type=str, metavar='', help='The path to the Pickled file containing visit information of patients') 471 | parser.add_argument('n_input_codes', type=int, metavar='', help='The number of unique input medical codes') 472 | parser.add_argument('label_file', type=str, metavar='', help='The path to the Pickled file containing label information of patients') 473 | parser.add_argument('n_output_codes', type=int, metavar='', help='The number of unique label medical codes') 474 | parser.add_argument('out_file', metavar='out_file', help='The path to the output models. The models will be saved after every epoch') 475 | parser.add_argument('--time_file', type=str, default='', help='The path to the Pickled file containing durations between visits of patients. If you are not using duration information, do not use this option') 476 | parser.add_argument('--predict_time', type=int, default=0, choices=[0,1], help='Use this option if you want the GRU to also predict the time duration until the next visit (0 for false, 1 for true) (default value: 0)') 477 | parser.add_argument('--tradeoff', type=float, default=1.0, help='Tradeoff variable for balancing the two loss functions: code prediction function and duration prediction function (default value: 1.0)') 478 | parser.add_argument('--use_log_time', type=int, default=1, choices=[0,1], help='Use logarithm of time duration to dampen the impact of the outliers (0 for false, 1 for true) (default value: 1)') 479 | parser.add_argument('--embed_file', type=str, default='', help='The path to the Pickled file containing the representation vectors of medical codes. If you are not using medical code representations, do not use this option') 480 | parser.add_argument('--embed_size', type=int, default=200, help='The size of the visit embedding before passing it to the GRU layers. If you are not providing your own medical code vectors, you must specify this value (default value: 200)') 481 | parser.add_argument('--embed_finetune', type=int, default=1, choices=[0,1], help='If you are using randomly initialized code representations, always use this option. If you are using an external medical code representations, and you want to fine-tune them as you train the GRU, use this option as well. (0 for false, 1 for true) (default value: 1)') 482 | parser.add_argument('--hidden_dim_size', type=str, default='[200,200]', help='The size of the hidden layers of the GRU. This is a string argument. For example, [500,400] means you are using a two-layer GRU where the lower layer uses a 500-dimensional hidden layer, and the upper layer uses a 400-dimensional hidden layer. (default value: [200,200])') 483 | parser.add_argument('--batch_size', type=int, default=100, help='The size of a single mini-batch (default value: 100)') 484 | parser.add_argument('--n_epochs', type=int, default=10, help='The number of training epochs (default value: 10)') 485 | parser.add_argument('--L2_softmax', type=float, default=0.001, help='L2 regularization for the softmax function (default value: 0.001)') 486 | parser.add_argument('--L2_time', type=float, default=0.001, help='L2 regularization for the linear regression (default value: 0.001)') 487 | parser.add_argument('--dropout_rate', type=float, default=0.5, help='Dropout rate between GRU hidden layers, and between the final hidden layer and the softmax layer (default value: 0.5)') 488 | parser.add_argument('--log_eps', type=float, default=1e-8, help='A small value to prevent log(0) (default value: 1e-8)') 489 | parser.add_argument('--verbose', action='store_true', help='Print output after every 10 mini-batches (default false)') 490 | args = parser.parse_args() 491 | return args 492 | 493 | if __name__ == '__main__': 494 | parser = argparse.ArgumentParser() 495 | args = parse_arguments(parser) 496 | hiddenDimSize = [int(strDim) for strDim in args.hidden_dim_size[1:-1].split(',')] 497 | 498 | if args.predict_time and args.time_file == '': 499 | print 'Cannot predict time duration without time file' 500 | sys.exit() 501 | 502 | train_doctorAI( 503 | seqFile=args.seq_file, 504 | inputDimSize=args.n_input_codes, 505 | labelFile=args.label_file, 506 | numClass=args.n_output_codes, 507 | outFile=args.out_file, 508 | timeFile=args.time_file, 509 | predictTime=args.predict_time, 510 | tradeoff=args.tradeoff, 511 | useLogTime=args.use_log_time, 512 | embFile=args.embed_file, 513 | embSize=args.embed_size, 514 | embFineTune=args.embed_finetune, 515 | hiddenDimSize=hiddenDimSize, 516 | batchSize=args.batch_size, 517 | max_epochs=args.n_epochs, 518 | L2_output=args.L2_softmax, 519 | L2_time=args.L2_time, 520 | dropout_rate=args.dropout_rate, 521 | logEps=args.log_eps, 522 | verbose=args.verbose 523 | ) 524 | --------------------------------------------------------------------------------