├── ODIR ├── M3Care.ipynb └── utils │ ├── __pycache__ │ ├── common_utils.cpython-36.pyc │ ├── common_utils.cpython-37.pyc │ ├── feature_extractor.cpython-36.pyc │ ├── feature_extractor.cpython-37.pyc │ ├── logging.cpython-37.pyc │ ├── metrics.cpython-36.pyc │ ├── metrics.cpython-37.pyc │ ├── preprocessing.cpython-36.pyc │ ├── preprocessing.cpython-37.pyc │ ├── readers.cpython-36.pyc │ ├── readers.cpython-37.pyc │ ├── utils.cpython-36.pyc │ └── utils.cpython-37.pyc │ ├── common_utils.py │ ├── decomp_normalizer │ ├── feature_extractor.py │ ├── logging.py │ ├── metrics.py │ ├── preprocessing.py │ ├── readers.py │ ├── resources │ ├── channel_info.json │ ├── discretizer_config.json │ └── valset.csv │ └── utils.py ├── OV ├── M3Care.ipynb └── utils │ ├── __pycache__ │ ├── common_utils.cpython-36.pyc │ ├── common_utils.cpython-37.pyc │ ├── feature_extractor.cpython-36.pyc │ ├── feature_extractor.cpython-37.pyc │ ├── logging.cpython-37.pyc │ ├── metrics.cpython-36.pyc │ ├── metrics.cpython-37.pyc │ ├── preprocessing.cpython-36.pyc │ ├── preprocessing.cpython-37.pyc │ ├── readers.cpython-36.pyc │ ├── readers.cpython-37.pyc │ ├── utils.cpython-36.pyc │ └── utils.cpython-37.pyc │ ├── common_utils.py │ ├── decomp_normalizer │ ├── feature_extractor.py │ ├── logging.py │ ├── metrics.py │ ├── preprocessing.py │ ├── readers.py │ ├── resources │ ├── channel_info.json │ ├── discretizer_config.json │ └── valset.csv │ └── utils.py └── README.md /ODIR/utils/__pycache__/common_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/common_utils.cpython-36.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/common_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/common_utils.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/feature_extractor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/feature_extractor.cpython-36.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/feature_extractor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/feature_extractor.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/logging.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/logging.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/metrics.cpython-36.pyc 
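--------------------------------------------------------------------------------
A minimal usage sketch (not one of the repository's files) of how the ODIR/utils modules dumped below fit together, assuming MIMIC-III-benchmark-style data and Python 3 run from inside ODIR/: a reader streams raw per-stay CSV events, Discretizer bins them into fixed timesteps and one-hot encodes categorical channels using resources/discretizer_config.json, Normalizer rescales columns from a pickled means/stds state, utils.load_data ties the steps together, and metrics.print_metrics_binary scores predictions. The dataset directory, the normalizer state path, and the model producing `pred` are assumptions; only classes and functions that appear below are used.

# Hypothetical wiring of the utilities below for the in-hospital mortality task.
from utils.readers import InHospitalMortalityReader
from utils.preprocessing import Discretizer, Normalizer
from utils.utils import load_data

# The reader yields dicts with "X" (raw events), "t", "y", "header" and "name";
# its listfile defaults to <dataset_dir>/listfile.csv.
reader = InHospitalMortalityReader(dataset_dir='data/in-hospital-mortality/train',  # assumed path
                                   period_length=48.0)

# The discretizer bins irregular events into fixed timesteps, one-hot encodes categorical
# channels and (with store_masks=True) appends per-channel observation masks.
discretizer = Discretizer(timestep=1.0, store_masks=True,
                          impute_strategy='previous', start_time='zero')

# The normalizer rescales columns with means/stds loaded from a pickled state file
# (see Normalizer._save_params for the expected format); the path here is hypothetical.
normalizer = Normalizer(fields=None)
normalizer.load_params('ihm_normalizer')

# load_data pushes every example through discretizer.transform and normalizer.transform.
data, labels = load_data(reader, discretizer, normalizer, small_part=True)

# Once a model produces probabilities `pred` for `data`, results are reported with
# utils.metrics.print_metrics_binary(labels, pred).

The same wiring applies to the identical copy of these modules under OV/utils further down.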
-------------------------------------------------------------------------------- /ODIR/utils/__pycache__/metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/metrics.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/preprocessing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/preprocessing.cpython-36.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/preprocessing.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/preprocessing.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/readers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/readers.cpython-36.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/readers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/readers.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /ODIR/utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /ODIR/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import os 6 | import json 7 | import random 8 | 9 | from .feature_extractor import extract_features 10 | 11 | 12 | def convert_to_dict(data, header, channel_info): 13 | """ convert data from readers output in to array of arrays format """ 14 | ret = [[] for i in range(data.shape[1] - 1)] 15 | for i in range(1, data.shape[1]): 16 | ret[i-1] = [(t, x) for (t, x) in zip(data[:, 0], data[:, i]) if x != ""] 17 | channel = header[i] 18 | if len(channel_info[channel]['possible_values']) != 0: 19 | ret[i-1] = list(map(lambda x: (x[0], channel_info[channel]['values'][x[1]]), ret[i-1])) 20 | ret[i-1] = list(map(lambda x: (float(x[0]), float(x[1])), ret[i-1])) 21 | return ret 22 | 23 | 24 | def extract_features_from_rawdata(chunk, header, period, features): 25 | with 
open(os.path.join(os.path.dirname(__file__), "resources/channel_info.json")) as channel_info_file: 26 | channel_info = json.loads(channel_info_file.read()) 27 | data = [convert_to_dict(X, header, channel_info) for X in chunk] 28 | return extract_features(data, period, features) 29 | 30 | 31 | def read_chunk(reader, chunk_size): 32 | data = {} 33 | for i in range(chunk_size): 34 | ret = reader.read_next() 35 | for k, v in ret.items(): 36 | if k not in data: 37 | data[k] = [] 38 | data[k].append(v) 39 | data["header"] = data["header"][0] 40 | return data 41 | 42 | 43 | def sort_and_shuffle(data, batch_size): 44 | """ Sort data by the length and then make batches and shuffle them. 45 | data is tuple (X1, X2, ..., Xn) all of them have the same length. 46 | Usually data = (X, y). 47 | """ 48 | assert len(data) >= 2 49 | data = list(zip(*data)) 50 | 51 | random.shuffle(data) 52 | 53 | old_size = len(data) 54 | rem = old_size % batch_size 55 | head = data[:old_size - rem] 56 | tail = data[old_size - rem:] 57 | data = [] 58 | 59 | head.sort(key=(lambda x: x[0].shape[0])) 60 | 61 | mas = [head[i: i+batch_size] for i in range(0, len(head), batch_size)] 62 | random.shuffle(mas) 63 | 64 | for x in mas: 65 | data += x 66 | data += tail 67 | 68 | data = list(zip(*data)) 69 | return data 70 | 71 | 72 | def add_common_arguments(parser): 73 | """ Add all the parameters which are common across the tasks 74 | """ 75 | parser.add_argument('--network', type=str, required=True) 76 | parser.add_argument('--dim', type=int, default=256, 77 | help='number of hidden units') 78 | parser.add_argument('--depth', type=int, default=1, 79 | help='number of bi-LSTMs') 80 | parser.add_argument('--epochs', type=int, default=100, 81 | help='number of chunks to train') 82 | parser.add_argument('--load_state', type=str, default="", 83 | help='state file path') 84 | parser.add_argument('--mode', type=str, default="train", 85 | help='mode: train or test') 86 | parser.add_argument('--batch_size', type=int, default=64) 87 | parser.add_argument('--l2', type=float, default=0, help='L2 regularization') 88 | parser.add_argument('--l1', type=float, default=0, help='L1 regularization') 89 | parser.add_argument('--save_every', type=int, default=1, 90 | help='save state every x epoch') 91 | parser.add_argument('--prefix', type=str, default="", 92 | help='optional prefix of network name') 93 | parser.add_argument('--dropout', type=float, default=0.0) 94 | parser.add_argument('--rec_dropout', type=float, default=0.0, 95 | help="dropout rate for recurrent connections") 96 | parser.add_argument('--batch_norm', type=bool, default=False, 97 | help='batch normalization') 98 | parser.add_argument('--timestep', type=float, default=1.0, 99 | help="fixed timestep used in the dataset") 100 | parser.add_argument('--imputation', type=str, default='previous') 101 | parser.add_argument('--small_part', dest='small_part', action='store_true') 102 | parser.add_argument('--whole_data', dest='small_part', action='store_false') 103 | parser.add_argument('--optimizer', type=str, default='adam') 104 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 105 | parser.add_argument('--beta_1', type=float, default=0.9, 106 | help='beta_1 param for Adam optimizer') 107 | parser.add_argument('--verbose', type=int, default=2) 108 | parser.add_argument('--size_coef', type=float, default=4.0) 109 | parser.add_argument('--normalizer_state', type=str, default=None, 110 | help='Path to a state file of a normalizer. 
Leave none if you want to ' 111 | 'use one of the provided ones.') 112 | parser.set_defaults(small_part=False) 113 | 114 | 115 | class DeepSupervisionDataLoader: 116 | r""" 117 | Data loader for decompensation and length of stay task. 118 | Reads all the data for one patient at once. 119 | 120 | Parameters 121 | ---------- 122 | dataset_dir : str 123 | Directory where timeseries files are stored. 124 | listfile : str 125 | Path to a listfile. If this parameter is left `None` then 126 | `dataset_dir/listfile.csv` will be used. 127 | """ 128 | def __init__(self, dataset_dir, listfile=None, small_part=False): 129 | 130 | self._dataset_dir = dataset_dir 131 | if listfile is None: 132 | listfile_path = os.path.join(dataset_dir, "listfile.csv") 133 | else: 134 | listfile_path = listfile 135 | with open(listfile_path, "r") as lfile: 136 | self._data = lfile.readlines()[1:] # skip the header 137 | 138 | self._data = [line.split(',') for line in self._data] 139 | self._data = [(x, float(t), y) for (x, t, y) in self._data] 140 | self._data = sorted(self._data) 141 | 142 | mas = {"X": [], 143 | "ts": [], 144 | "ys": [], 145 | "name": []} 146 | i = 0 147 | while i < len(self._data): 148 | j = i 149 | cur_stay = self._data[i][0] 150 | cur_ts = [] 151 | cur_labels = [] 152 | while j < len(self._data) and self._data[j][0] == cur_stay: 153 | cur_ts.append(self._data[j][1]) 154 | cur_labels.append(self._data[j][2]) 155 | j += 1 156 | 157 | cur_X, header = self._read_timeseries(cur_stay) 158 | mas["X"].append(cur_X) 159 | mas["ts"].append(cur_ts) 160 | mas["ys"].append(cur_labels) 161 | mas["name"].append(cur_stay) 162 | 163 | i = j 164 | if small_part and len(mas["name"]) == 256: 165 | break 166 | 167 | self._data = mas 168 | 169 | def _read_timeseries(self, ts_filename): 170 | ret = [] 171 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 172 | header = tsfile.readline().strip().split(',') 173 | assert header[0] == "Hours" 174 | for line in tsfile: 175 | mas = line.strip().split(',') 176 | ret.append(np.array(mas)) 177 | return (np.stack(ret), header) 178 | 179 | 180 | def create_directory(directory): 181 | if not os.path.exists(directory): 182 | os.makedirs(directory) 183 | 184 | 185 | def pad_zeros(arr, min_length=None): 186 | """ 187 | `arr` is an array of `np.array`s 188 | 189 | The function appends zeros to every `np.array` in `arr` 190 | to equalize their first axis lenghts. 
191 | """ 192 | dtype = arr[0].dtype 193 | max_len = max([x.shape[0] for x in arr]) 194 | ret = [np.concatenate([x, np.zeros((max_len - x.shape[0],) + x.shape[1:], dtype=dtype)], axis=0) 195 | for x in arr] 196 | if (min_length is not None) and ret[0].shape[0] < min_length: 197 | ret = [np.concatenate([x, np.zeros((min_length - x.shape[0],) + x.shape[1:], dtype=dtype)], axis=0) 198 | for x in ret] 199 | return np.array(ret) 200 | -------------------------------------------------------------------------------- /ODIR/utils/decomp_normalizer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/ODIR/utils/decomp_normalizer -------------------------------------------------------------------------------- /ODIR/utils/feature_extractor.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | from scipy.stats import skew 6 | 7 | all_functions = [min, max, np.mean, np.std, skew, len] 8 | 9 | functions_map = { 10 | "all": all_functions, 11 | "len": [len], 12 | "all_but_len": all_functions[:-1] 13 | } 14 | 15 | periods_map = { 16 | "all": (0, 0, 1, 0), 17 | "first4days": (0, 0, 0, 4 * 24), 18 | "first8days": (0, 0, 0, 8 * 24), 19 | "last12hours": (1, -12, 1, 0), 20 | "first25percent": (2, 25), 21 | "first50percent": (2, 50) 22 | } 23 | 24 | sub_periods = [(2, 100), (2, 10), (2, 25), (2, 50), 25 | (3, 10), (3, 25), (3, 50)] 26 | 27 | 28 | def get_range(begin, end, period): 29 | # first p % 30 | if period[0] == 2: 31 | return (begin, begin + (end - begin) * period[1] / 100.0) 32 | # last p % 33 | if period[0] == 3: 34 | return (end - (end - begin) * period[1] / 100.0, end) 35 | 36 | if period[0] == 0: 37 | L = begin + period[1] 38 | else: 39 | L = end + period[1] 40 | 41 | if period[2] == 0: 42 | R = begin + period[3] 43 | else: 44 | R = end + period[3] 45 | 46 | return (L, R) 47 | 48 | 49 | def calculate(channel_data, period, sub_period, functions): 50 | if len(channel_data) == 0: 51 | return np.full((len(functions, )), np.nan) 52 | 53 | L = channel_data[0][0] 54 | R = channel_data[-1][0] 55 | L, R = get_range(L, R, period) 56 | L, R = get_range(L, R, sub_period) 57 | 58 | data = [x for (t, x) in channel_data 59 | if L - 1e-6 < t < R + 1e-6] 60 | 61 | if len(data) == 0: 62 | return np.full((len(functions, )), np.nan) 63 | return np.array([fn(data) for fn in functions], dtype=np.float32) 64 | 65 | 66 | def extract_features_single_episode(data_raw, period, functions): 67 | global sub_periods 68 | extracted_features = [np.concatenate([calculate(data_raw[i], period, sub_period, functions) 69 | for sub_period in sub_periods], 70 | axis=0) 71 | for i in range(len(data_raw))] 72 | return np.concatenate(extracted_features, axis=0) 73 | 74 | 75 | def extract_features(data_raw, period, features): 76 | period = periods_map[period] 77 | functions = functions_map[features] 78 | return np.array([extract_features_single_episode(x, period, functions) 79 | for x in data_raw]) 80 | -------------------------------------------------------------------------------- /ODIR/utils/logging.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import logging 4 | 5 | 6 | def init_log(output_dir): 7 | logging.basicConfig(level=logging.DEBUG, 8 | format='%(asctime)s %(message)s', 9 | datefmt='%Y%m%d-%H:%M:%S', 10 | 
filename=os.path.join(output_dir, 'log.log'), 11 | filemode='w') 12 | console = logging.StreamHandler() 13 | console.setLevel(logging.INFO) 14 | logging.getLogger('').addHandler(console) 15 | return logging 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ODIR/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | from sklearn import metrics 6 | 7 | 8 | # for decompensation, in-hospital mortality 9 | 10 | def print_metrics_binary(y_true, predictions, verbose=1): 11 | predictions = np.array(predictions) 12 | if len(predictions.shape) == 1: 13 | predictions = np.stack([1 - predictions, predictions]).transpose((1, 0)) 14 | 15 | cf = metrics.confusion_matrix(y_true, predictions.argmax(axis=1)) 16 | if verbose: 17 | print("confusion matrix:") 18 | print(cf) 19 | cf = cf.astype(np.float32) 20 | 21 | acc = (cf[0][0] + cf[1][1]) / np.sum(cf) 22 | prec0 = cf[0][0] / (cf[0][0] + cf[1][0]) 23 | prec1 = cf[1][1] / (cf[1][1] + cf[0][1]) 24 | rec0 = cf[0][0] / (cf[0][0] + cf[0][1]) 25 | rec1 = cf[1][1] / (cf[1][1] + cf[1][0]) 26 | auroc = metrics.roc_auc_score(y_true, predictions[:, 1]) 27 | 28 | (precisions, recalls, thresholds) = metrics.precision_recall_curve(y_true, predictions[:, 1]) 29 | auprc = metrics.auc(recalls, precisions) 30 | minpse = np.max([min(x, y) for (x, y) in zip(precisions, recalls)]) 31 | f1_score=2*prec1*rec1/(prec1+rec1) 32 | if verbose: 33 | print("accuracy = {}".format(acc)) 34 | print("precision class 0 = {}".format(prec0)) 35 | print("precision class 1 = {}".format(prec1)) 36 | print("recall class 0 = {}".format(rec0)) 37 | print("recall class 1 = {}".format(rec1)) 38 | print("AUC of ROC = {}".format(auroc)) 39 | print("AUC of PRC = {}".format(auprc)) 40 | print("min(+P, Se) = {}".format(minpse)) 41 | print("f1_score = {}".format(f1_score)) 42 | 43 | return {"acc": acc, 44 | "prec0": prec0, 45 | "prec1": prec1, 46 | "rec0": rec0, 47 | "rec1": rec1, 48 | "auroc": auroc, 49 | "auprc": auprc, 50 | "minpse": minpse, 51 | "f1_score":f1_score} 52 | 53 | 54 | # for phenotyping 55 | 56 | def print_metrics_multilabel(y_true, predictions, verbose=1): 57 | y_true = np.array(y_true) 58 | predictions = np.array(predictions) 59 | 60 | auc_scores = metrics.roc_auc_score(y_true, predictions, average=None) 61 | ave_auc_micro = metrics.roc_auc_score(y_true, predictions, 62 | average="micro") 63 | ave_auc_macro = metrics.roc_auc_score(y_true, predictions, 64 | average="macro") 65 | ave_auc_weighted = metrics.roc_auc_score(y_true, predictions, 66 | average="weighted") 67 | 68 | predictions2 = np.zeros_like(predictions) 69 | for i in range(len(predictions2)): 70 | for j in range(len(predictions2[i])): 71 | if predictions[i][j]>=0.5: 72 | predictions2[i][j] = 1 73 | # print(predictions[:10]) 74 | # print(predictions2[:10]) 75 | 76 | # print(y_true[:,0][:10]) 77 | # print(predictions2[:,0][:10]) 78 | f1_0 = metrics.f1_score(y_true[:,0], predictions2[:,0]) 79 | # print(f1_0) 80 | f1_1 = metrics.f1_score(y_true[:,1], predictions2[:,1]) 81 | f1_2 = metrics.f1_score(y_true[:,2], predictions2[:,2]) 82 | 83 | total_labels = np.array(list(y_true[:,0])+list(y_true[:,1])+list(y_true[:,2])) 84 | total_preds = np.array(list(predictions2[:,0])+list(predictions2[:,1])+list(predictions2[:,2])) 85 | 86 | ave_f1_micro = metrics.f1_score(total_labels, total_preds) 87 | ave_f1_macro = (f1_0+f1_1+f1_2)/3 88 
| 89 | # ave_f1_micro = metrics.f1_score(y_true, predictions2, 90 | # average="micro") 91 | # ave_f1_macro = metrics.f1_score(y_true, predictions2, 92 | # average="macro") 93 | 94 | coverage_error = metrics.coverage_error(y_true, predictions) 95 | label_ranking_loss = metrics.label_ranking_loss(y_true, predictions) 96 | 97 | if verbose: 98 | print("ROC AUC scores for labels:", auc_scores) 99 | print("ave_auc_micro = {}".format(ave_auc_micro)) 100 | print("ave_auc_macro = {}".format(ave_auc_macro)) 101 | print("ave_auc_weighted = {}".format(ave_auc_weighted)) 102 | 103 | return {"auc_scores": auc_scores, 104 | "ave_auc_micro": ave_auc_micro, 105 | "ave_auc_macro": ave_auc_macro, 106 | "ave_auc_weighted": ave_auc_weighted, 107 | "ave_f1_micro": ave_f1_micro, 108 | "ave_f1_macro": ave_f1_macro, 109 | "coverage_error": coverage_error, 110 | "label_ranking_loss": label_ranking_loss,} 111 | 112 | 113 | # for length of stay 114 | 115 | def mean_absolute_percentage_error(y_true, y_pred): 116 | return np.mean(np.abs((y_true - y_pred) / (y_true + 0.1))) * 100 117 | 118 | 119 | def print_metrics_regression(y_true, predictions, verbose=1): 120 | predictions = np.array(predictions) 121 | predictions = np.maximum(predictions, 0).flatten() 122 | y_true = np.array(y_true) 123 | 124 | y_true_bins = [get_bin_custom(x, CustomBins.nbins) for x in y_true] 125 | prediction_bins = [get_bin_custom(x, CustomBins.nbins) for x in predictions] 126 | cf = metrics.confusion_matrix(y_true_bins, prediction_bins) 127 | if verbose: 128 | print("Custom bins confusion matrix:") 129 | print(cf) 130 | 131 | kappa = metrics.cohen_kappa_score(y_true_bins, prediction_bins, 132 | weights='linear') 133 | mad = metrics.mean_absolute_error(y_true, predictions) 134 | mse = metrics.mean_squared_error(y_true, predictions) 135 | mape = mean_absolute_percentage_error(y_true, predictions) 136 | 137 | if verbose: 138 | print("Mean absolute deviation (MAD) = {}".format(mad)) 139 | print("Mean squared error (MSE) = {}".format(mse)) 140 | print("Mean absolute percentage error (MAPE) = {}".format(mape)) 141 | print("Cohen kappa score = {}".format(kappa)) 142 | 143 | return {"mad": mad, 144 | "mse": mse, 145 | "mape": mape, 146 | "kappa": kappa} 147 | 148 | 149 | class LogBins: 150 | nbins = 10 151 | means = [0.611848, 2.587614, 6.977417, 16.465430, 37.053745, 152 | 81.816438, 182.303159, 393.334856, 810.964040, 1715.702848] 153 | 154 | 155 | def get_bin_log(x, nbins, one_hot=False): 156 | binid = int(np.log(x + 1) / 8.0 * nbins) 157 | if binid < 0: 158 | binid = 0 159 | if binid >= nbins: 160 | binid = nbins - 1 161 | 162 | if one_hot: 163 | ret = np.zeros((LogBins.nbins,)) 164 | ret[binid] = 1 165 | return ret 166 | return binid 167 | 168 | 169 | def get_estimate_log(prediction, nbins): 170 | bin_id = np.argmax(prediction) 171 | return LogBins.means[bin_id] 172 | 173 | 174 | def print_metrics_log_bins(y_true, predictions, verbose=1): 175 | y_true_bins = [get_bin_log(x, LogBins.nbins) for x in y_true] 176 | prediction_bins = [get_bin_log(x, LogBins.nbins) for x in predictions] 177 | cf = metrics.confusion_matrix(y_true_bins, prediction_bins) 178 | if verbose: 179 | print("LogBins confusion matrix:") 180 | print(cf) 181 | return print_metrics_regression(y_true, predictions, verbose) 182 | 183 | 184 | class CustomBins: 185 | inf = 1e18 186 | bins = [(-inf, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 14), (14, +inf)] 187 | nbins = len(bins) 188 | means = [11.450379, 35.070846, 59.206531, 83.382723, 107.487817, 189 | 131.579534, 
155.643957, 179.660558, 254.306624, 585.325890] 190 | 191 | 192 | def get_bin_custom(x, nbins, one_hot=False): 193 | for i in range(nbins): 194 | a = CustomBins.bins[i][0] * 24.0 195 | b = CustomBins.bins[i][1] * 24.0 196 | if a <= x < b: 197 | if one_hot: 198 | ret = np.zeros((CustomBins.nbins,)) 199 | ret[i] = 1 200 | return ret 201 | return i 202 | return None 203 | 204 | 205 | def get_estimate_custom(prediction, nbins): 206 | bin_id = np.argmax(prediction) 207 | assert 0 <= bin_id < nbins 208 | return CustomBins.means[bin_id] 209 | 210 | 211 | def print_metrics_custom_bins(y_true, predictions, verbose=1): 212 | return print_metrics_regression(y_true, predictions, verbose) 213 | -------------------------------------------------------------------------------- /ODIR/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import platform 6 | import pickle 7 | import json 8 | import os 9 | 10 | 11 | class Discretizer: 12 | def __init__(self, timestep=0.8, store_masks=True, impute_strategy='zero', start_time='zero', 13 | config_path=os.path.join(os.path.dirname(__file__), 'resources/discretizer_config.json')): 14 | 15 | with open(config_path) as f: 16 | config = json.load(f) 17 | self._id_to_channel = config['id_to_channel'] 18 | self._channel_to_id = dict(zip(self._id_to_channel, range(len(self._id_to_channel)))) 19 | self._is_categorical_channel = config['is_categorical_channel'] 20 | self._possible_values = config['possible_values'] 21 | self._normal_values = config['normal_values'] 22 | 23 | self._header = ["Hours"] + self._id_to_channel 24 | self._timestep = timestep 25 | self._store_masks = store_masks 26 | self._start_time = start_time 27 | self._impute_strategy = impute_strategy 28 | 29 | # for statistics 30 | self._done_count = 0 31 | self._empty_bins_sum = 0 32 | self._unused_data_sum = 0 33 | 34 | def transform(self, X, header=None, end=None): 35 | if header is None: 36 | header = self._header 37 | assert header[0] == "Hours" 38 | eps = 1e-6 39 | 40 | N_channels = len(self._id_to_channel) 41 | ts = [float(row[0]) for row in X] 42 | for i in range(len(ts) - 1): 43 | assert ts[i] < ts[i+1] + eps 44 | 45 | if self._start_time == 'relative': 46 | first_time = ts[0] 47 | elif self._start_time == 'zero': 48 | first_time = 0 49 | else: 50 | raise ValueError("start_time is invalid") 51 | 52 | if end is None: 53 | max_hours = max(ts) - first_time 54 | else: 55 | max_hours = end - first_time 56 | 57 | N_bins = int(max_hours / self._timestep + 1.0 - eps) 58 | 59 | cur_len = 0 60 | begin_pos = [0 for i in range(N_channels)] 61 | end_pos = [0 for i in range(N_channels)] 62 | for i in range(N_channels): 63 | channel = self._id_to_channel[i] 64 | begin_pos[i] = cur_len 65 | if self._is_categorical_channel[channel]: 66 | end_pos[i] = begin_pos[i] + len(self._possible_values[channel]) 67 | else: 68 | end_pos[i] = begin_pos[i] + 1 69 | cur_len = end_pos[i] 70 | 71 | data = np.zeros(shape=(N_bins, cur_len), dtype=float) 72 | mask = np.zeros(shape=(N_bins, N_channels), dtype=int) 73 | original_value = [["" for j in range(N_channels)] for i in range(N_bins)] 74 | total_data = 0 75 | unused_data = 0 76 | 77 | def write(data, bin_id, channel, value, begin_pos): 78 | channel_id = self._channel_to_id[channel] 79 | if self._is_categorical_channel[channel]: 80 | category_id = self._possible_values[channel].index(value) 81 | N_values = 
len(self._possible_values[channel]) 82 | one_hot = np.zeros((N_values,)) 83 | one_hot[category_id] = 1 84 | for pos in range(N_values): 85 | data[bin_id, begin_pos[channel_id] + pos] = one_hot[pos] 86 | else: 87 | data[bin_id, begin_pos[channel_id]] = float(value) 88 | 89 | for row in X: 90 | t = float(row[0]) - first_time 91 | if t > max_hours + eps: 92 | continue 93 | bin_id = int(t / self._timestep - eps) 94 | assert 0 <= bin_id < N_bins 95 | 96 | for j in range(1, len(row)): 97 | if row[j] == "": 98 | continue 99 | channel = header[j] 100 | channel_id = self._channel_to_id[channel] 101 | 102 | total_data += 1 103 | if mask[bin_id][channel_id] == 1: 104 | unused_data += 1 105 | mask[bin_id][channel_id] = 1 106 | 107 | write(data, bin_id, channel, row[j], begin_pos) 108 | original_value[bin_id][channel_id] = row[j] 109 | 110 | # impute missing values 111 | 112 | if self._impute_strategy not in ['zero', 'normal_value', 'previous', 'next']: 113 | raise ValueError("impute strategy is invalid") 114 | 115 | if self._impute_strategy in ['normal_value', 'previous']: 116 | prev_values = [[] for i in range(len(self._id_to_channel))] 117 | for bin_id in range(N_bins): 118 | for channel in self._id_to_channel: 119 | channel_id = self._channel_to_id[channel] 120 | if mask[bin_id][channel_id] == 1: 121 | prev_values[channel_id].append(original_value[bin_id][channel_id]) 122 | continue 123 | if self._impute_strategy == 'normal_value': 124 | imputed_value = self._normal_values[channel] 125 | if self._impute_strategy == 'previous': 126 | if len(prev_values[channel_id]) == 0: 127 | imputed_value = self._normal_values[channel] 128 | else: 129 | imputed_value = prev_values[channel_id][-1] 130 | write(data, bin_id, channel, imputed_value, begin_pos) 131 | 132 | if self._impute_strategy == 'next': 133 | prev_values = [[] for i in range(len(self._id_to_channel))] 134 | for bin_id in range(N_bins-1, -1, -1): 135 | for channel in self._id_to_channel: 136 | channel_id = self._channel_to_id[channel] 137 | if mask[bin_id][channel_id] == 1: 138 | prev_values[channel_id].append(original_value[bin_id][channel_id]) 139 | continue 140 | if len(prev_values[channel_id]) == 0: 141 | imputed_value = self._normal_values[channel] 142 | else: 143 | imputed_value = prev_values[channel_id][-1] 144 | write(data, bin_id, channel, imputed_value, begin_pos) 145 | 146 | empty_bins = np.sum([1 - min(1, np.sum(mask[i, :])) for i in range(N_bins)]) 147 | self._done_count += 1 148 | self._empty_bins_sum += empty_bins / (N_bins + eps) 149 | self._unused_data_sum += unused_data / (total_data + eps) 150 | 151 | if self._store_masks: 152 | data = np.hstack([data, mask.astype(np.float32)]) 153 | 154 | # create new header 155 | new_header = [] 156 | for channel in self._id_to_channel: 157 | if self._is_categorical_channel[channel]: 158 | values = self._possible_values[channel] 159 | for value in values: 160 | new_header.append(channel + "->" + value) 161 | else: 162 | new_header.append(channel) 163 | 164 | if self._store_masks: 165 | for i in range(len(self._id_to_channel)): 166 | channel = self._id_to_channel[i] 167 | new_header.append("mask->" + channel) 168 | 169 | new_header = ",".join(new_header) 170 | 171 | return (data, new_header) 172 | 173 | def print_statistics(self): 174 | print("statistics of discretizer:") 175 | print("\tconverted {} examples".format(self._done_count)) 176 | print("\taverage unused data = {:.2f} percent".format(100.0 * self._unused_data_sum / self._done_count)) 177 | print("\taverage empty bins = {:.2f} 
percent".format(100.0 * self._empty_bins_sum / self._done_count)) 178 | 179 | 180 | class Normalizer: 181 | def __init__(self, fields=None): 182 | self._means = None 183 | self._stds = None 184 | self._fields = None 185 | if fields is not None: 186 | self._fields = [col for col in fields] 187 | 188 | self._sum_x = None 189 | self._sum_sq_x = None 190 | self._count = 0 191 | 192 | def _feed_data(self, x): 193 | x = np.array(x) 194 | self._count += x.shape[0] 195 | if self._sum_x is None: 196 | self._sum_x = np.sum(x, axis=0) 197 | self._sum_sq_x = np.sum(x**2, axis=0) 198 | else: 199 | self._sum_x += np.sum(x, axis=0) 200 | self._sum_sq_x += np.sum(x**2, axis=0) 201 | 202 | def _save_params(self, save_file_path): 203 | eps = 1e-7 204 | with open(save_file_path, "wb") as save_file: 205 | N = self._count 206 | self._means = 1.0 / N * self._sum_x 207 | self._stds = np.sqrt(1.0/(N - 1) * (self._sum_sq_x - 2.0 * self._sum_x * self._means + N * self._means**2)) 208 | self._stds[self._stds < eps] = eps 209 | pickle.dump(obj={'means': self._means, 210 | 'stds': self._stds}, 211 | file=save_file, 212 | protocol=2) 213 | 214 | def load_params(self, load_file_path): 215 | with open(load_file_path, "rb") as load_file: 216 | if platform.python_version()[0] == '2': 217 | dct = pickle.load(load_file) 218 | else: 219 | dct = pickle.load(load_file, encoding='latin1') 220 | self._means = dct['means'] 221 | self._stds = dct['stds'] 222 | 223 | def transform(self, X): 224 | if self._fields is None: 225 | fields = range(X.shape[1]) 226 | else: 227 | fields = self._fields 228 | ret = 1.0 * X 229 | for col in fields: 230 | ret[:, col] = (X[:, col] - self._means[col]) / self._stds[col] 231 | return ret 232 | -------------------------------------------------------------------------------- /ODIR/utils/readers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import os 5 | import numpy as np 6 | import random 7 | 8 | 9 | class Reader(object): 10 | def __init__(self, dataset_dir, listfile=None): 11 | self._dataset_dir = dataset_dir 12 | self._current_index = 0 13 | if listfile is None: 14 | listfile_path = os.path.join(dataset_dir, "listfile.csv") 15 | else: 16 | listfile_path = listfile 17 | with open(listfile_path, "r") as lfile: 18 | self._data = lfile.readlines() 19 | self._listfile_header = self._data[0] 20 | self._data = self._data[1:] 21 | 22 | def get_number_of_examples(self): 23 | return len(self._data) 24 | 25 | def random_shuffle(self, seed=None): 26 | if seed is not None: 27 | random.seed(seed) 28 | random.shuffle(self._data) 29 | 30 | def read_example(self, index): 31 | raise NotImplementedError() 32 | 33 | def read_next(self): 34 | to_read_index = self._current_index 35 | self._current_index += 1 36 | if self._current_index == self.get_number_of_examples(): 37 | self._current_index = 0 38 | return self.read_example(to_read_index) 39 | 40 | 41 | class DecompensationReader(Reader): 42 | def __init__(self, dataset_dir, listfile=None): 43 | """ Reader for decompensation prediction task. 44 | :param dataset_dir: Directory where timeseries files are stored. 45 | :param listfile: Path to a listfile. If this parameter is left `None` then 46 | `dataset_dir/listfile.csv` will be used. 
47 | """ 48 | Reader.__init__(self, dataset_dir, listfile) 49 | self._data = [line.split(',') for line in self._data] 50 | self._data = [(x, float(t), int(y)) for (x, t, y) in self._data] 51 | 52 | def _read_timeseries(self, ts_filename, time_bound): 53 | ret = [] 54 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 55 | header = tsfile.readline().strip().split(',') 56 | assert header[0] == "Hours" 57 | for line in tsfile: 58 | mas = line.strip().split(',') 59 | t = float(mas[0]) 60 | if t > time_bound + 1e-6: 61 | break 62 | ret.append(np.array(mas)) 63 | return (np.stack(ret), header) 64 | 65 | def read_example(self, index): 66 | """ Read the example with given index. 67 | 68 | :param index: Index of the line of the listfile to read (counting starts from 0). 69 | :return: Directory with the following keys: 70 | X : np.array 71 | 2D array containing all events. Each row corresponds to a moment. 72 | First column is the time and other columns correspond to different 73 | variables. 74 | t : float 75 | Length of the data in hours. Note, in general, it is not equal to the 76 | timestamp of last event. 77 | y : int (0 or 1) 78 | Mortality within next 24 hours. 79 | header : array of strings 80 | Names of the columns. The ordering of the columns is always the same. 81 | name: Name of the sample. 82 | """ 83 | if index < 0 or index >= len(self._data): 84 | raise ValueError("Index must be from 0 (inclusive) to number of examples (exclusive).") 85 | 86 | name = self._data[index][0] 87 | t = self._data[index][1] 88 | y = self._data[index][2] 89 | (X, header) = self._read_timeseries(name, t) 90 | 91 | return {"X": X, 92 | "t": t, 93 | "y": y, 94 | "header": header, 95 | "name": name} 96 | 97 | 98 | class InHospitalMortalityReader(Reader): 99 | def __init__(self, dataset_dir, listfile=None, period_length=48.0): 100 | """ Reader for in-hospital moratality prediction task. 101 | 102 | :param dataset_dir: Directory where timeseries files are stored. 103 | :param listfile: Path to a listfile. If this parameter is left `None` then 104 | `dataset_dir/listfile.csv` will be used. 105 | :param period_length: Length of the period (in hours) from which the prediction is done. 106 | """ 107 | Reader.__init__(self, dataset_dir, listfile) 108 | self._data = [line.split(',') for line in self._data] 109 | self._data = [(x, int(y)) for (x, y) in self._data] 110 | self._period_length = period_length 111 | 112 | def _read_timeseries(self, ts_filename): 113 | ret = [] 114 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 115 | header = tsfile.readline().strip().split(',') 116 | assert header[0] == "Hours" 117 | for line in tsfile: 118 | mas = line.strip().split(',') 119 | ret.append(np.array(mas)) 120 | return (np.stack(ret), header) 121 | 122 | def read_example(self, index): 123 | """ Reads the example with given index. 124 | 125 | :param index: Index of the line of the listfile to read (counting starts from 0). 126 | :return: Dictionary with the following keys: 127 | X : np.array 128 | 2D array containing all events. Each row corresponds to a moment. 129 | First column is the time and other columns correspond to different 130 | variables. 131 | t : float 132 | Length of the data in hours. Note, in general, it is not equal to the 133 | timestamp of last event. 134 | y : int (0 or 1) 135 | In-hospital mortality. 136 | header : array of strings 137 | Names of the columns. The ordering of the columns is always the same. 138 | name: Name of the sample. 
139 | """ 140 | if index < 0 or index >= len(self._data): 141 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 142 | 143 | name = self._data[index][0] 144 | t = self._period_length 145 | y = self._data[index][1] 146 | (X, header) = self._read_timeseries(name) 147 | 148 | return {"X": X, 149 | "t": t, 150 | "y": y, 151 | "header": header, 152 | "name": name} 153 | 154 | 155 | class LengthOfStayReader(Reader): 156 | def __init__(self, dataset_dir, listfile=None): 157 | """ Reader for length of stay prediction task. 158 | 159 | :param dataset_dir: Directory where timeseries files are stored. 160 | :param listfile: Path to a listfile. If this parameter is left `None` then 161 | `dataset_dir/listfile.csv` will be used. 162 | """ 163 | Reader.__init__(self, dataset_dir, listfile) 164 | self._data = [line.split(',') for line in self._data] 165 | self._data = [(x, float(t), float(y)) for (x, t, y) in self._data] 166 | 167 | def _read_timeseries(self, ts_filename, time_bound): 168 | ret = [] 169 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 170 | header = tsfile.readline().strip().split(',') 171 | assert header[0] == "Hours" 172 | for line in tsfile: 173 | mas = line.strip().split(',') 174 | t = float(mas[0]) 175 | if t > time_bound + 1e-6: 176 | break 177 | ret.append(np.array(mas)) 178 | return (np.stack(ret), header) 179 | 180 | def read_example(self, index): 181 | """ Reads the example with given index. 182 | 183 | :param index: Index of the line of the listfile to read (counting starts from 0). 184 | :return: Dictionary with the following keys: 185 | X : np.array 186 | 2D array containing all events. Each row corresponds to a moment. 187 | First column is the time and other columns correspond to different 188 | variables. 189 | t : float 190 | Length of the data in hours. Note, in general, it is not equal to the 191 | timestamp of last event. 192 | y : float 193 | Remaining time in ICU. 194 | header : array of strings 195 | Names of the columns. The ordering of the columns is always the same. 196 | name: Name of the sample. 197 | """ 198 | if index < 0 or index >= len(self._data): 199 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 200 | 201 | name = self._data[index][0] 202 | t = self._data[index][1] 203 | y = self._data[index][2] 204 | (X, header) = self._read_timeseries(name, t) 205 | 206 | return {"X": X, 207 | "t": t, 208 | "y": y, 209 | "header": header, 210 | "name": name} 211 | 212 | 213 | class PhenotypingReader(Reader): 214 | def __init__(self, dataset_dir, listfile=None): 215 | """ Reader for phenotype classification task. 216 | 217 | :param dataset_dir: Directory where timeseries files are stored. 218 | :param listfile: Path to a listfile. If this parameter is left `None` then 219 | `dataset_dir/listfile.csv` will be used. 220 | """ 221 | Reader.__init__(self, dataset_dir, listfile) 222 | self._data = [line.split(',') for line in self._data] 223 | self._data = [(mas[0], float(mas[1]), list(map(int, mas[2:]))) for mas in self._data] 224 | 225 | def _read_timeseries(self, ts_filename): 226 | ret = [] 227 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 228 | header = tsfile.readline().strip().split(',') 229 | assert header[0] == "Hours" 230 | for line in tsfile: 231 | mas = line.strip().split(',') 232 | ret.append(np.array(mas)) 233 | return (np.stack(ret), header) 234 | 235 | def read_example(self, index): 236 | """ Reads the example with given index. 
237 | 238 | :param index: Index of the line of the listfile to read (counting starts from 0). 239 | :return: Dictionary with the following keys: 240 | X : np.array 241 | 2D array containing all events. Each row corresponds to a moment. 242 | First column is the time and other columns correspond to different 243 | variables. 244 | t : float 245 | Length of the data in hours. Note, in general, it is not equal to the 246 | timestamp of last event. 247 | y : array of ints 248 | Phenotype labels. 249 | header : array of strings 250 | Names of the columns. The ordering of the columns is always the same. 251 | name: Name of the sample. 252 | """ 253 | if index < 0 or index >= len(self._data): 254 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 255 | 256 | name = self._data[index][0] 257 | t = self._data[index][1] 258 | y = self._data[index][2] 259 | (X, header) = self._read_timeseries(name) 260 | 261 | return {"X": X, 262 | "t": t, 263 | "y": y, 264 | "header": header, 265 | "name": name} 266 | 267 | 268 | class MultitaskReader(Reader): 269 | def __init__(self, dataset_dir, listfile=None): 270 | """ Reader for multitask learning. 271 | 272 | :param dataset_dir: Directory where timeseries files are stored. 273 | :param listfile: Path to a listfile. If this parameter is left `None` then 274 | `dataset_dir/listfile.csv` will be used. 275 | """ 276 | Reader.__init__(self, dataset_dir, listfile) 277 | self._data = [line.split(',') for line in self._data] 278 | 279 | def process_ihm(x): 280 | return list(map(int, x.split(';'))) 281 | 282 | def process_los(x): 283 | x = x.split(';') 284 | if x[0] == '': 285 | return ([], []) 286 | return (list(map(int, x[:len(x)//2])), list(map(float, x[len(x)//2:]))) 287 | 288 | def process_ph(x): 289 | return list(map(int, x.split(';'))) 290 | 291 | def process_decomp(x): 292 | x = x.split(';') 293 | if x[0] == '': 294 | return ([], []) 295 | return (list(map(int, x[:len(x)//2])), list(map(int, x[len(x)//2:]))) 296 | 297 | self._data = [(fname, float(t), process_ihm(ihm), process_los(los), 298 | process_ph(pheno), process_decomp(decomp)) 299 | for fname, t, ihm, los, pheno, decomp in self._data] 300 | 301 | def _read_timeseries(self, ts_filename): 302 | ret = [] 303 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 304 | header = tsfile.readline().strip().split(',') 305 | assert header[0] == "Hours" 306 | for line in tsfile: 307 | mas = line.strip().split(',') 308 | ret.append(np.array(mas)) 309 | return (np.stack(ret), header) 310 | 311 | def read_example(self, index): 312 | """ Reads the example with given index. 313 | 314 | :param index: Index of the line of the listfile to read (counting starts from 0). 315 | :return: Return dictionary with the following keys: 316 | X : np.array 317 | 2D array containing all events. Each row corresponds to a moment. 318 | First column is the time and other columns correspond to different 319 | variables. 320 | t : float 321 | Length of the data in hours. Note, in general, it is not equal to the 322 | timestamp of last event. 323 | ihm : array 324 | Array of 3 integers: [pos, mask, label]. 325 | los : array 326 | Array of 2 arrays: [masks, labels]. 327 | pheno : array 328 | Array of 25 binary integers (phenotype labels). 329 | decomp : array 330 | Array of 2 arrays: [masks, labels]. 331 | header : array of strings 332 | Names of the columns. The ordering of the columns is always the same. 333 | name: Name of the sample. 
334 | """ 335 | if index < 0 or index >= len(self._data): 336 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 337 | 338 | name = self._data[index][0] 339 | (X, header) = self._read_timeseries(name) 340 | 341 | return {"X": X, 342 | "t": self._data[index][1], 343 | "ihm": self._data[index][2], 344 | "los": self._data[index][3], 345 | "pheno": self._data[index][4], 346 | "decomp": self._data[index][5], 347 | "header": header, 348 | "name": name} 349 | -------------------------------------------------------------------------------- /ODIR/utils/resources/channel_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "Glucose": { 3 | "possible_values": [] 4 | }, 5 | "Systolic blood pressure": { 6 | "possible_values": [] 7 | }, 8 | "Glascow coma scale verbal response": { 9 | "possible_values": ["1 No Response", "1.0 ET/Trach", "2 Incomp sounds", "3 Inapprop words", "4 Confused", "5 Oriented", "Confused", "Inappropriate Words", "Incomprehensible sounds", "No Response", "No Response-ETT", "Oriented"], 10 | "values": { 11 | "No Response-ETT": 1, 12 | "No Response": 1, 13 | "1 No Response": 1, 14 | "1.0 ET/Trach": 1, 15 | "2 Incomp sounds": 2, 16 | "Incomprehensible sounds": 2, 17 | "3 Inapprop words": 3, 18 | "Inappropriate Words": 3, 19 | "4 Confused": 4, 20 | "Confused": 4, 21 | "5 Oriented": 5, 22 | "Oriented": 5 23 | } 24 | }, 25 | "Temperature": { 26 | "possible_values": [] 27 | }, 28 | "Weight": { 29 | "possible_values": [] 30 | }, 31 | "Diastolic blood pressure": { 32 | "possible_values": [] 33 | }, 34 | "Fraction inspired oxygen": { 35 | "possible_values": [] 36 | }, 37 | "Glascow coma scale total": { 38 | "possible_values": ["10", "11", "12", "13", "14", "15", "3", "4", "5", "6", "7", "8", "9"], 39 | "values": { 40 | "3": 3, 41 | "4": 4, 42 | "5": 5, 43 | "6": 6, 44 | "7": 7, 45 | "8": 8, 46 | "9": 9, 47 | "10": 10, 48 | "11": 11, 49 | "12": 12, 50 | "13": 13, 51 | "14": 14, 52 | "15": 15 53 | } 54 | }, 55 | "Capillary refill rate": { 56 | "possible_values": ["0.0", "1.0"], 57 | "values": { 58 | "0.0": 0, 59 | "1.0": 1 60 | } 61 | }, 62 | "Mean blood pressure": { 63 | "possible_values": [] 64 | }, 65 | "Heart Rate": { 66 | "possible_values": [] 67 | }, 68 | "Oxygen saturation": { 69 | "possible_values": [] 70 | }, 71 | "pH": { 72 | "possible_values": [] 73 | }, 74 | "Height": { 75 | "possible_values": [] 76 | }, 77 | "Glascow coma scale eye opening": { 78 | "possible_values": ["1 No Response", "2 To pain", "3 To speech", "4 Spontaneously", "None", "Spontaneously", "To Pain", "To Speech"], 79 | "values": { 80 | "None": 0, 81 | "1 No Response": 1, 82 | "2 To pain": 2, 83 | "To Pain": 2, 84 | "3 To speech": 3, 85 | "To Speech": 3, 86 | "4 Spontaneously": 4, 87 | "Spontaneously": 4 88 | } 89 | }, 90 | "Respiratory rate": { 91 | "possible_values": [] 92 | }, 93 | "Glascow coma scale motor response": { 94 | "possible_values": ["1 No Response", "2 Abnorm extensn", "3 Abnorm flexion", "4 Flex-withdraws", "5 Localizes Pain", "6 Obeys Commands", "Abnormal Flexion", "Abnormal extension", "Flex-withdraws", "Localizes Pain", "No response", "Obeys Commands"], 95 | "values": { 96 | "1 No Response": 1, 97 | "No response": 1, 98 | "2 Abnorm extensn": 2, 99 | "Abnormal extension": 2, 100 | "3 Abnorm flexion": 3, 101 | "Abnormal Flexion": 3, 102 | "4 Flex-withdraws": 4, 103 | "Flex-withdraws": 4, 104 | "5 Localizes Pain": 5, 105 | "Localizes Pain": 5, 106 | "6 Obeys Commands": 6, 107 | "Obeys Commands": 6 108 | } 109 | } 
110 | } 111 | -------------------------------------------------------------------------------- /ODIR/utils/resources/discretizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "id_to_channel": [ 3 | "Capillary refill rate", 4 | "Diastolic blood pressure", 5 | "Fraction inspired oxygen", 6 | "Glascow coma scale eye opening", 7 | "Glascow coma scale motor response", 8 | "Glascow coma scale total", 9 | "Glascow coma scale verbal response", 10 | "Glucose", 11 | "Heart Rate", 12 | "Height", 13 | "Mean blood pressure", 14 | "Oxygen saturation", 15 | "Respiratory rate", 16 | "Systolic blood pressure", 17 | "Temperature", 18 | "Weight", 19 | "pH" 20 | ], 21 | "is_categorical_channel": { 22 | "Capillary refill rate": true, 23 | "Diastolic blood pressure": false, 24 | "Fraction inspired oxygen": false, 25 | "Glascow coma scale eye opening": true, 26 | "Glascow coma scale motor response": true, 27 | "Glascow coma scale total": true, 28 | "Glascow coma scale verbal response": true, 29 | "Glucose": false, 30 | "Heart Rate": false, 31 | "Height": false, 32 | "Mean blood pressure": false, 33 | "Oxygen saturation": false, 34 | "Respiratory rate": false, 35 | "Systolic blood pressure": false, 36 | "Temperature": false, 37 | "Weight": false, 38 | "pH": false 39 | }, 40 | "possible_values": { 41 | "Capillary refill rate": [ 42 | "0.0", 43 | "1.0" 44 | ], 45 | "Diastolic blood pressure": [ 46 | 47 | ], 48 | "Fraction inspired oxygen": [ 49 | 50 | ], 51 | "Glascow coma scale eye opening": [ 52 | "To Pain", 53 | "3 To speech", 54 | "1 No Response", 55 | "4 Spontaneously", 56 | "None", 57 | "To Speech", 58 | "Spontaneously", 59 | "2 To pain" 60 | ], 61 | "Glascow coma scale motor response": [ 62 | "1 No Response", 63 | "3 Abnorm flexion", 64 | "Abnormal extension", 65 | "No response", 66 | "4 Flex-withdraws", 67 | "Localizes Pain", 68 | "Flex-withdraws", 69 | "Obeys Commands", 70 | "Abnormal Flexion", 71 | "6 Obeys Commands", 72 | "5 Localizes Pain", 73 | "2 Abnorm extensn" 74 | ], 75 | "Glascow coma scale total": [ 76 | "11", 77 | "10", 78 | "13", 79 | "12", 80 | "15", 81 | "14", 82 | "3", 83 | "5", 84 | "4", 85 | "7", 86 | "6", 87 | "9", 88 | "8" 89 | ], 90 | "Glascow coma scale verbal response": [ 91 | "1 No Response", 92 | "No Response", 93 | "Confused", 94 | "Inappropriate Words", 95 | "Oriented", 96 | "No Response-ETT", 97 | "5 Oriented", 98 | "Incomprehensible sounds", 99 | "1.0 ET/Trach", 100 | "4 Confused", 101 | "2 Incomp sounds", 102 | "3 Inapprop words" 103 | ], 104 | "Glucose": [ 105 | 106 | ], 107 | "Heart Rate": [ 108 | 109 | ], 110 | "Height": [ 111 | 112 | ], 113 | "Mean blood pressure": [ 114 | 115 | ], 116 | "Oxygen saturation": [ 117 | 118 | ], 119 | "Respiratory rate": [ 120 | 121 | ], 122 | "Systolic blood pressure": [ 123 | 124 | ], 125 | "Temperature": [ 126 | 127 | ], 128 | "Weight": [ 129 | 130 | ], 131 | "pH": [ 132 | 133 | ] 134 | }, 135 | "normal_values": { 136 | "Capillary refill rate": "0.0", 137 | "Diastolic blood pressure": "59.0", 138 | "Fraction inspired oxygen": "0.21", 139 | "Glascow coma scale eye opening": "4 Spontaneously", 140 | "Glascow coma scale motor response": "6 Obeys Commands", 141 | "Glascow coma scale total": "15", 142 | "Glascow coma scale verbal response": "5 Oriented", 143 | "Glucose": "128.0", 144 | "Heart Rate": "86", 145 | "Height": "170.0", 146 | "Mean blood pressure": "77.0", 147 | "Oxygen saturation": "98.0", 148 | "Respiratory rate": "19", 149 | "Systolic blood pressure": "118.0", 150 | 
"Temperature": "36.6", 151 | "Weight": "81.0", 152 | "pH": "7.4" 153 | } 154 | } -------------------------------------------------------------------------------- /ODIR/utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | from . import common_utils 5 | import numpy as np 6 | import os 7 | 8 | 9 | def load_data(reader, discretizer, normalizer, small_part=False, return_names=False): 10 | N = reader.get_number_of_examples() 11 | if small_part: 12 | N = 1000 13 | ret = common_utils.read_chunk(reader, N) 14 | data = ret["X"] 15 | ts = ret["t"] 16 | labels = ret["y"] 17 | names = ret["name"] 18 | data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)] 19 | if normalizer is not None: 20 | data = [normalizer.transform(X) for X in data] 21 | whole_data = (np.array(data), labels) 22 | if not return_names: 23 | return whole_data 24 | return {"data": whole_data, "names": names} 25 | 26 | 27 | def save_results(names, pred, y_true, path): 28 | common_utils.create_directory(os.path.dirname(path)) 29 | with open(path, 'w') as f: 30 | f.write("stay,prediction,y_true\n") 31 | for (name, x, y) in zip(names, pred, y_true): 32 | f.write("{},{:.6f},{}\n".format(name, x, y)) 33 | -------------------------------------------------------------------------------- /OV/utils/__pycache__/common_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/common_utils.cpython-36.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/common_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/common_utils.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/feature_extractor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/feature_extractor.cpython-36.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/feature_extractor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/feature_extractor.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/logging.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/logging.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/metrics.cpython-36.pyc -------------------------------------------------------------------------------- 
/OV/utils/__pycache__/metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/metrics.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/preprocessing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/preprocessing.cpython-36.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/preprocessing.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/preprocessing.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/readers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/readers.cpython-36.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/readers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/readers.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /OV/utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /OV/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import os 6 | import json 7 | import random 8 | 9 | from .feature_extractor import extract_features 10 | 11 | 12 | def convert_to_dict(data, header, channel_info): 13 | """ convert data from readers output in to array of arrays format """ 14 | ret = [[] for i in range(data.shape[1] - 1)] 15 | for i in range(1, data.shape[1]): 16 | ret[i-1] = [(t, x) for (t, x) in zip(data[:, 0], data[:, i]) if x != ""] 17 | channel = header[i] 18 | if len(channel_info[channel]['possible_values']) != 0: 19 | ret[i-1] = list(map(lambda x: (x[0], channel_info[channel]['values'][x[1]]), ret[i-1])) 20 | ret[i-1] = list(map(lambda x: (float(x[0]), float(x[1])), ret[i-1])) 21 | return ret 22 | 23 | 24 | def extract_features_from_rawdata(chunk, header, period, features): 25 | with open(os.path.join(os.path.dirname(__file__), "resources/channel_info.json")) as channel_info_file: 26 | channel_info = 
json.loads(channel_info_file.read()) 27 | data = [convert_to_dict(X, header, channel_info) for X in chunk] 28 | return extract_features(data, period, features) 29 | 30 | 31 | def read_chunk(reader, chunk_size): 32 | data = {} 33 | for i in range(chunk_size): 34 | ret = reader.read_next() 35 | for k, v in ret.items(): 36 | if k not in data: 37 | data[k] = [] 38 | data[k].append(v) 39 | data["header"] = data["header"][0] 40 | return data 41 | 42 | 43 | def sort_and_shuffle(data, batch_size): 44 | """ Sort data by the length and then make batches and shuffle them. 45 | data is tuple (X1, X2, ..., Xn) all of them have the same length. 46 | Usually data = (X, y). 47 | """ 48 | assert len(data) >= 2 49 | data = list(zip(*data)) 50 | 51 | random.shuffle(data) 52 | 53 | old_size = len(data) 54 | rem = old_size % batch_size 55 | head = data[:old_size - rem] 56 | tail = data[old_size - rem:] 57 | data = [] 58 | 59 | head.sort(key=(lambda x: x[0].shape[0])) 60 | 61 | mas = [head[i: i+batch_size] for i in range(0, len(head), batch_size)] 62 | random.shuffle(mas) 63 | 64 | for x in mas: 65 | data += x 66 | data += tail 67 | 68 | data = list(zip(*data)) 69 | return data 70 | 71 | 72 | def add_common_arguments(parser): 73 | """ Add all the parameters which are common across the tasks 74 | """ 75 | parser.add_argument('--network', type=str, required=True) 76 | parser.add_argument('--dim', type=int, default=256, 77 | help='number of hidden units') 78 | parser.add_argument('--depth', type=int, default=1, 79 | help='number of bi-LSTMs') 80 | parser.add_argument('--epochs', type=int, default=100, 81 | help='number of chunks to train') 82 | parser.add_argument('--load_state', type=str, default="", 83 | help='state file path') 84 | parser.add_argument('--mode', type=str, default="train", 85 | help='mode: train or test') 86 | parser.add_argument('--batch_size', type=int, default=64) 87 | parser.add_argument('--l2', type=float, default=0, help='L2 regularization') 88 | parser.add_argument('--l1', type=float, default=0, help='L1 regularization') 89 | parser.add_argument('--save_every', type=int, default=1, 90 | help='save state every x epoch') 91 | parser.add_argument('--prefix', type=str, default="", 92 | help='optional prefix of network name') 93 | parser.add_argument('--dropout', type=float, default=0.0) 94 | parser.add_argument('--rec_dropout', type=float, default=0.0, 95 | help="dropout rate for recurrent connections") 96 | parser.add_argument('--batch_norm', type=bool, default=False, 97 | help='batch normalization') 98 | parser.add_argument('--timestep', type=float, default=1.0, 99 | help="fixed timestep used in the dataset") 100 | parser.add_argument('--imputation', type=str, default='previous') 101 | parser.add_argument('--small_part', dest='small_part', action='store_true') 102 | parser.add_argument('--whole_data', dest='small_part', action='store_false') 103 | parser.add_argument('--optimizer', type=str, default='adam') 104 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 105 | parser.add_argument('--beta_1', type=float, default=0.9, 106 | help='beta_1 param for Adam optimizer') 107 | parser.add_argument('--verbose', type=int, default=2) 108 | parser.add_argument('--size_coef', type=float, default=4.0) 109 | parser.add_argument('--normalizer_state', type=str, default=None, 110 | help='Path to a state file of a normalizer. 
Leave none if you want to ' 111 | 'use one of the provided ones.') 112 | parser.set_defaults(small_part=False) 113 | 114 | 115 | class DeepSupervisionDataLoader: 116 | r""" 117 | Data loader for decompensation and length of stay task. 118 | Reads all the data for one patient at once. 119 | 120 | Parameters 121 | ---------- 122 | dataset_dir : str 123 | Directory where timeseries files are stored. 124 | listfile : str 125 | Path to a listfile. If this parameter is left `None` then 126 | `dataset_dir/listfile.csv` will be used. 127 | """ 128 | def __init__(self, dataset_dir, listfile=None, small_part=False): 129 | 130 | self._dataset_dir = dataset_dir 131 | if listfile is None: 132 | listfile_path = os.path.join(dataset_dir, "listfile.csv") 133 | else: 134 | listfile_path = listfile 135 | with open(listfile_path, "r") as lfile: 136 | self._data = lfile.readlines()[1:] # skip the header 137 | 138 | self._data = [line.split(',') for line in self._data] 139 | self._data = [(x, float(t), y) for (x, t, y) in self._data] 140 | self._data = sorted(self._data) 141 | 142 | mas = {"X": [], 143 | "ts": [], 144 | "ys": [], 145 | "name": []} 146 | i = 0 147 | while i < len(self._data): 148 | j = i 149 | cur_stay = self._data[i][0] 150 | cur_ts = [] 151 | cur_labels = [] 152 | while j < len(self._data) and self._data[j][0] == cur_stay: 153 | cur_ts.append(self._data[j][1]) 154 | cur_labels.append(self._data[j][2]) 155 | j += 1 156 | 157 | cur_X, header = self._read_timeseries(cur_stay) 158 | mas["X"].append(cur_X) 159 | mas["ts"].append(cur_ts) 160 | mas["ys"].append(cur_labels) 161 | mas["name"].append(cur_stay) 162 | 163 | i = j 164 | if small_part and len(mas["name"]) == 256: 165 | break 166 | 167 | self._data = mas 168 | 169 | def _read_timeseries(self, ts_filename): 170 | ret = [] 171 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 172 | header = tsfile.readline().strip().split(',') 173 | assert header[0] == "Hours" 174 | for line in tsfile: 175 | mas = line.strip().split(',') 176 | ret.append(np.array(mas)) 177 | return (np.stack(ret), header) 178 | 179 | 180 | def create_directory(directory): 181 | if not os.path.exists(directory): 182 | os.makedirs(directory) 183 | 184 | 185 | def pad_zeros(arr, min_length=None): 186 | """ 187 | `arr` is an array of `np.array`s 188 | 189 | The function appends zeros to every `np.array` in `arr` 190 | to equalize their first axis lenghts. 
191 | """ 192 | dtype = arr[0].dtype 193 | max_len = max([x.shape[0] for x in arr]) 194 | ret = [np.concatenate([x, np.zeros((max_len - x.shape[0],) + x.shape[1:], dtype=dtype)], axis=0) 195 | for x in arr] 196 | if (min_length is not None) and ret[0].shape[0] < min_length: 197 | ret = [np.concatenate([x, np.zeros((min_length - x.shape[0],) + x.shape[1:], dtype=dtype)], axis=0) 198 | for x in ret] 199 | return np.array(ret) 200 | -------------------------------------------------------------------------------- /OV/utils/decomp_normalizer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choczhang/M3Care/3036c425ec25665c6fe19175b6f37a6dc4fe3900/OV/utils/decomp_normalizer -------------------------------------------------------------------------------- /OV/utils/feature_extractor.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | from scipy.stats import skew 6 | 7 | all_functions = [min, max, np.mean, np.std, skew, len] 8 | 9 | functions_map = { 10 | "all": all_functions, 11 | "len": [len], 12 | "all_but_len": all_functions[:-1] 13 | } 14 | 15 | periods_map = { 16 | "all": (0, 0, 1, 0), 17 | "first4days": (0, 0, 0, 4 * 24), 18 | "first8days": (0, 0, 0, 8 * 24), 19 | "last12hours": (1, -12, 1, 0), 20 | "first25percent": (2, 25), 21 | "first50percent": (2, 50) 22 | } 23 | 24 | sub_periods = [(2, 100), (2, 10), (2, 25), (2, 50), 25 | (3, 10), (3, 25), (3, 50)] 26 | 27 | 28 | def get_range(begin, end, period): 29 | # first p % 30 | if period[0] == 2: 31 | return (begin, begin + (end - begin) * period[1] / 100.0) 32 | # last p % 33 | if period[0] == 3: 34 | return (end - (end - begin) * period[1] / 100.0, end) 35 | 36 | if period[0] == 0: 37 | L = begin + period[1] 38 | else: 39 | L = end + period[1] 40 | 41 | if period[2] == 0: 42 | R = begin + period[3] 43 | else: 44 | R = end + period[3] 45 | 46 | return (L, R) 47 | 48 | 49 | def calculate(channel_data, period, sub_period, functions): 50 | if len(channel_data) == 0: 51 | return np.full((len(functions, )), np.nan) 52 | 53 | L = channel_data[0][0] 54 | R = channel_data[-1][0] 55 | L, R = get_range(L, R, period) 56 | L, R = get_range(L, R, sub_period) 57 | 58 | data = [x for (t, x) in channel_data 59 | if L - 1e-6 < t < R + 1e-6] 60 | 61 | if len(data) == 0: 62 | return np.full((len(functions, )), np.nan) 63 | return np.array([fn(data) for fn in functions], dtype=np.float32) 64 | 65 | 66 | def extract_features_single_episode(data_raw, period, functions): 67 | global sub_periods 68 | extracted_features = [np.concatenate([calculate(data_raw[i], period, sub_period, functions) 69 | for sub_period in sub_periods], 70 | axis=0) 71 | for i in range(len(data_raw))] 72 | return np.concatenate(extracted_features, axis=0) 73 | 74 | 75 | def extract_features(data_raw, period, features): 76 | period = periods_map[period] 77 | functions = functions_map[features] 78 | return np.array([extract_features_single_episode(x, period, functions) 79 | for x in data_raw]) 80 | -------------------------------------------------------------------------------- /OV/utils/logging.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import logging 4 | 5 | 6 | def init_log(output_dir): 7 | logging.basicConfig(level=logging.DEBUG, 8 | format='%(asctime)s %(message)s', 9 | datefmt='%Y%m%d-%H:%M:%S', 10 | 
filename=os.path.join(output_dir, 'log.log'), 11 | filemode='w') 12 | console = logging.StreamHandler() 13 | console.setLevel(logging.INFO) 14 | logging.getLogger('').addHandler(console) 15 | return logging 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /OV/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | from sklearn import metrics 6 | 7 | 8 | # for decompensation, in-hospital mortality 9 | 10 | def print_metrics_binary(y_true, predictions, verbose=1): 11 | predictions = np.array(predictions) 12 | if len(predictions.shape) == 1: 13 | predictions = np.stack([1 - predictions, predictions]).transpose((1, 0)) 14 | 15 | cf = metrics.confusion_matrix(y_true, predictions.argmax(axis=1)) 16 | if verbose: 17 | print("confusion matrix:") 18 | print(cf) 19 | cf = cf.astype(np.float32) 20 | 21 | acc = (cf[0][0] + cf[1][1]) / np.sum(cf) 22 | prec0 = cf[0][0] / (cf[0][0] + cf[1][0]) 23 | prec1 = cf[1][1] / (cf[1][1] + cf[0][1]) 24 | rec0 = cf[0][0] / (cf[0][0] + cf[0][1]) 25 | rec1 = cf[1][1] / (cf[1][1] + cf[1][0]) 26 | auroc = metrics.roc_auc_score(y_true, predictions[:, 1]) 27 | 28 | (precisions, recalls, thresholds) = metrics.precision_recall_curve(y_true, predictions[:, 1]) 29 | auprc = metrics.auc(recalls, precisions) 30 | minpse = np.max([min(x, y) for (x, y) in zip(precisions, recalls)]) 31 | f1_score=2*prec1*rec1/(prec1+rec1) 32 | if verbose: 33 | print("accuracy = {}".format(acc)) 34 | print("precision class 0 = {}".format(prec0)) 35 | print("precision class 1 = {}".format(prec1)) 36 | print("recall class 0 = {}".format(rec0)) 37 | print("recall class 1 = {}".format(rec1)) 38 | print("AUC of ROC = {}".format(auroc)) 39 | print("AUC of PRC = {}".format(auprc)) 40 | print("min(+P, Se) = {}".format(minpse)) 41 | print("f1_score = {}".format(f1_score)) 42 | 43 | return {"acc": acc, 44 | "prec0": prec0, 45 | "prec1": prec1, 46 | "rec0": rec0, 47 | "rec1": rec1, 48 | "auroc": auroc, 49 | "auprc": auprc, 50 | "minpse": minpse, 51 | "f1_score":f1_score} 52 | 53 | 54 | # for phenotyping 55 | 56 | def print_metrics_multilabel(y_true, predictions, verbose=1): 57 | y_true = np.array(y_true) 58 | predictions = np.array(predictions) 59 | 60 | auc_scores = metrics.roc_auc_score(y_true, predictions, average=None) 61 | ave_auc_micro = metrics.roc_auc_score(y_true, predictions, 62 | average="micro") 63 | ave_auc_macro = metrics.roc_auc_score(y_true, predictions, 64 | average="macro") 65 | ave_auc_weighted = metrics.roc_auc_score(y_true, predictions, 66 | average="weighted") 67 | 68 | coverage_error = metrics.coverage_error(y_true, predictions) 69 | label_ranking_loss = metrics.label_ranking_loss(y_true, predictions) 70 | 71 | if verbose: 72 | print("ROC AUC scores for labels:", auc_scores) 73 | print("ave_auc_micro = {}".format(ave_auc_micro)) 74 | print("ave_auc_macro = {}".format(ave_auc_macro)) 75 | print("ave_auc_weighted = {}".format(ave_auc_weighted)) 76 | 77 | return {"auc_scores": auc_scores, 78 | "ave_auc_micro": ave_auc_micro, 79 | "ave_auc_macro": ave_auc_macro, 80 | "ave_auc_weighted": ave_auc_weighted, 81 | "coverage_error": coverage_error, 82 | "label_ranking_loss": label_ranking_loss} 83 | 84 | 85 | # for length of stay 86 | 87 | def mean_absolute_percentage_error(y_true, y_pred): 88 | return np.mean(np.abs((y_true - y_pred) / (y_true + 0.1))) * 100 89 | 90 | 91 | def 
print_metrics_regression(y_true, predictions, verbose=1): 92 | predictions = np.array(predictions) 93 | predictions = np.maximum(predictions, 0).flatten() 94 | y_true = np.array(y_true) 95 | 96 | y_true_bins = [get_bin_custom(x, CustomBins.nbins) for x in y_true] 97 | prediction_bins = [get_bin_custom(x, CustomBins.nbins) for x in predictions] 98 | cf = metrics.confusion_matrix(y_true_bins, prediction_bins) 99 | if verbose: 100 | print("Custom bins confusion matrix:") 101 | print(cf) 102 | 103 | kappa = metrics.cohen_kappa_score(y_true_bins, prediction_bins, 104 | weights='linear') 105 | mad = metrics.mean_absolute_error(y_true, predictions) 106 | mse = metrics.mean_squared_error(y_true, predictions) 107 | mape = mean_absolute_percentage_error(y_true, predictions) 108 | 109 | if verbose: 110 | print("Mean absolute deviation (MAD) = {}".format(mad)) 111 | print("Mean squared error (MSE) = {}".format(mse)) 112 | print("Mean absolute percentage error (MAPE) = {}".format(mape)) 113 | print("Cohen kappa score = {}".format(kappa)) 114 | 115 | return {"mad": mad, 116 | "mse": mse, 117 | "mape": mape, 118 | "kappa": kappa} 119 | 120 | 121 | class LogBins: 122 | nbins = 10 123 | means = [0.611848, 2.587614, 6.977417, 16.465430, 37.053745, 124 | 81.816438, 182.303159, 393.334856, 810.964040, 1715.702848] 125 | 126 | 127 | def get_bin_log(x, nbins, one_hot=False): 128 | binid = int(np.log(x + 1) / 8.0 * nbins) 129 | if binid < 0: 130 | binid = 0 131 | if binid >= nbins: 132 | binid = nbins - 1 133 | 134 | if one_hot: 135 | ret = np.zeros((LogBins.nbins,)) 136 | ret[binid] = 1 137 | return ret 138 | return binid 139 | 140 | 141 | def get_estimate_log(prediction, nbins): 142 | bin_id = np.argmax(prediction) 143 | return LogBins.means[bin_id] 144 | 145 | 146 | def print_metrics_log_bins(y_true, predictions, verbose=1): 147 | y_true_bins = [get_bin_log(x, LogBins.nbins) for x in y_true] 148 | prediction_bins = [get_bin_log(x, LogBins.nbins) for x in predictions] 149 | cf = metrics.confusion_matrix(y_true_bins, prediction_bins) 150 | if verbose: 151 | print("LogBins confusion matrix:") 152 | print(cf) 153 | return print_metrics_regression(y_true, predictions, verbose) 154 | 155 | 156 | class CustomBins: 157 | inf = 1e18 158 | bins = [(-inf, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 14), (14, +inf)] 159 | nbins = len(bins) 160 | means = [11.450379, 35.070846, 59.206531, 83.382723, 107.487817, 161 | 131.579534, 155.643957, 179.660558, 254.306624, 585.325890] 162 | 163 | 164 | def get_bin_custom(x, nbins, one_hot=False): 165 | for i in range(nbins): 166 | a = CustomBins.bins[i][0] * 24.0 167 | b = CustomBins.bins[i][1] * 24.0 168 | if a <= x < b: 169 | if one_hot: 170 | ret = np.zeros((CustomBins.nbins,)) 171 | ret[i] = 1 172 | return ret 173 | return i 174 | return None 175 | 176 | 177 | def get_estimate_custom(prediction, nbins): 178 | bin_id = np.argmax(prediction) 179 | assert 0 <= bin_id < nbins 180 | return CustomBins.means[bin_id] 181 | 182 | 183 | def print_metrics_custom_bins(y_true, predictions, verbose=1): 184 | return print_metrics_regression(y_true, predictions, verbose) 185 | -------------------------------------------------------------------------------- /OV/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import platform 6 | import pickle 7 | import json 8 | import os 9 | 10 | 11 | class Discretizer: 
12 | def __init__(self, timestep=0.8, store_masks=True, impute_strategy='zero', start_time='zero', 13 | config_path=os.path.join(os.path.dirname(__file__), 'resources/discretizer_config.json')): 14 | 15 | with open(config_path) as f: 16 | config = json.load(f) 17 | self._id_to_channel = config['id_to_channel'] 18 | self._channel_to_id = dict(zip(self._id_to_channel, range(len(self._id_to_channel)))) 19 | self._is_categorical_channel = config['is_categorical_channel'] 20 | self._possible_values = config['possible_values'] 21 | self._normal_values = config['normal_values'] 22 | 23 | self._header = ["Hours"] + self._id_to_channel 24 | self._timestep = timestep 25 | self._store_masks = store_masks 26 | self._start_time = start_time 27 | self._impute_strategy = impute_strategy 28 | 29 | # for statistics 30 | self._done_count = 0 31 | self._empty_bins_sum = 0 32 | self._unused_data_sum = 0 33 | 34 | def transform(self, X, header=None, end=None): 35 | if header is None: 36 | header = self._header 37 | assert header[0] == "Hours" 38 | eps = 1e-6 39 | 40 | N_channels = len(self._id_to_channel) 41 | ts = [float(row[0]) for row in X] 42 | for i in range(len(ts) - 1): 43 | assert ts[i] < ts[i+1] + eps 44 | 45 | if self._start_time == 'relative': 46 | first_time = ts[0] 47 | elif self._start_time == 'zero': 48 | first_time = 0 49 | else: 50 | raise ValueError("start_time is invalid") 51 | 52 | if end is None: 53 | max_hours = max(ts) - first_time 54 | else: 55 | max_hours = end - first_time 56 | 57 | N_bins = int(max_hours / self._timestep + 1.0 - eps) 58 | 59 | cur_len = 0 60 | begin_pos = [0 for i in range(N_channels)] 61 | end_pos = [0 for i in range(N_channels)] 62 | for i in range(N_channels): 63 | channel = self._id_to_channel[i] 64 | begin_pos[i] = cur_len 65 | if self._is_categorical_channel[channel]: 66 | end_pos[i] = begin_pos[i] + len(self._possible_values[channel]) 67 | else: 68 | end_pos[i] = begin_pos[i] + 1 69 | cur_len = end_pos[i] 70 | 71 | data = np.zeros(shape=(N_bins, cur_len), dtype=float) 72 | mask = np.zeros(shape=(N_bins, N_channels), dtype=int) 73 | original_value = [["" for j in range(N_channels)] for i in range(N_bins)] 74 | total_data = 0 75 | unused_data = 0 76 | 77 | def write(data, bin_id, channel, value, begin_pos): 78 | channel_id = self._channel_to_id[channel] 79 | if self._is_categorical_channel[channel]: 80 | category_id = self._possible_values[channel].index(value) 81 | N_values = len(self._possible_values[channel]) 82 | one_hot = np.zeros((N_values,)) 83 | one_hot[category_id] = 1 84 | for pos in range(N_values): 85 | data[bin_id, begin_pos[channel_id] + pos] = one_hot[pos] 86 | else: 87 | data[bin_id, begin_pos[channel_id]] = float(value) 88 | 89 | for row in X: 90 | t = float(row[0]) - first_time 91 | if t > max_hours + eps: 92 | continue 93 | bin_id = int(t / self._timestep - eps) 94 | assert 0 <= bin_id < N_bins 95 | 96 | for j in range(1, len(row)): 97 | if row[j] == "": 98 | continue 99 | channel = header[j] 100 | channel_id = self._channel_to_id[channel] 101 | 102 | total_data += 1 103 | if mask[bin_id][channel_id] == 1: 104 | unused_data += 1 105 | mask[bin_id][channel_id] = 1 106 | 107 | write(data, bin_id, channel, row[j], begin_pos) 108 | original_value[bin_id][channel_id] = row[j] 109 | 110 | # impute missing values 111 | 112 | if self._impute_strategy not in ['zero', 'normal_value', 'previous', 'next']: 113 | raise ValueError("impute strategy is invalid") 114 | 115 | if self._impute_strategy in ['normal_value', 'previous']: 116 | prev_values = [[] for 
i in range(len(self._id_to_channel))] 117 | for bin_id in range(N_bins): 118 | for channel in self._id_to_channel: 119 | channel_id = self._channel_to_id[channel] 120 | if mask[bin_id][channel_id] == 1: 121 | prev_values[channel_id].append(original_value[bin_id][channel_id]) 122 | continue 123 | if self._impute_strategy == 'normal_value': 124 | imputed_value = self._normal_values[channel] 125 | if self._impute_strategy == 'previous': 126 | if len(prev_values[channel_id]) == 0: 127 | imputed_value = self._normal_values[channel] 128 | else: 129 | imputed_value = prev_values[channel_id][-1] 130 | write(data, bin_id, channel, imputed_value, begin_pos) 131 | 132 | if self._impute_strategy == 'next': 133 | prev_values = [[] for i in range(len(self._id_to_channel))] 134 | for bin_id in range(N_bins-1, -1, -1): 135 | for channel in self._id_to_channel: 136 | channel_id = self._channel_to_id[channel] 137 | if mask[bin_id][channel_id] == 1: 138 | prev_values[channel_id].append(original_value[bin_id][channel_id]) 139 | continue 140 | if len(prev_values[channel_id]) == 0: 141 | imputed_value = self._normal_values[channel] 142 | else: 143 | imputed_value = prev_values[channel_id][-1] 144 | write(data, bin_id, channel, imputed_value, begin_pos) 145 | 146 | empty_bins = np.sum([1 - min(1, np.sum(mask[i, :])) for i in range(N_bins)]) 147 | self._done_count += 1 148 | self._empty_bins_sum += empty_bins / (N_bins + eps) 149 | self._unused_data_sum += unused_data / (total_data + eps) 150 | 151 | if self._store_masks: 152 | data = np.hstack([data, mask.astype(np.float32)]) 153 | 154 | # create new header 155 | new_header = [] 156 | for channel in self._id_to_channel: 157 | if self._is_categorical_channel[channel]: 158 | values = self._possible_values[channel] 159 | for value in values: 160 | new_header.append(channel + "->" + value) 161 | else: 162 | new_header.append(channel) 163 | 164 | if self._store_masks: 165 | for i in range(len(self._id_to_channel)): 166 | channel = self._id_to_channel[i] 167 | new_header.append("mask->" + channel) 168 | 169 | new_header = ",".join(new_header) 170 | 171 | return (data, new_header) 172 | 173 | def print_statistics(self): 174 | print("statistics of discretizer:") 175 | print("\tconverted {} examples".format(self._done_count)) 176 | print("\taverage unused data = {:.2f} percent".format(100.0 * self._unused_data_sum / self._done_count)) 177 | print("\taverage empty bins = {:.2f} percent".format(100.0 * self._empty_bins_sum / self._done_count)) 178 | 179 | 180 | class Normalizer: 181 | def __init__(self, fields=None): 182 | self._means = None 183 | self._stds = None 184 | self._fields = None 185 | if fields is not None: 186 | self._fields = [col for col in fields] 187 | 188 | self._sum_x = None 189 | self._sum_sq_x = None 190 | self._count = 0 191 | 192 | def _feed_data(self, x): 193 | x = np.array(x) 194 | self._count += x.shape[0] 195 | if self._sum_x is None: 196 | self._sum_x = np.sum(x, axis=0) 197 | self._sum_sq_x = np.sum(x**2, axis=0) 198 | else: 199 | self._sum_x += np.sum(x, axis=0) 200 | self._sum_sq_x += np.sum(x**2, axis=0) 201 | 202 | def _save_params(self, save_file_path): 203 | eps = 1e-7 204 | with open(save_file_path, "wb") as save_file: 205 | N = self._count 206 | self._means = 1.0 / N * self._sum_x 207 | self._stds = np.sqrt(1.0/(N - 1) * (self._sum_sq_x - 2.0 * self._sum_x * self._means + N * self._means**2)) 208 | self._stds[self._stds < eps] = eps 209 | pickle.dump(obj={'means': self._means, 210 | 'stds': self._stds}, 211 | file=save_file, 212 | 
protocol=2) 213 | 214 | def load_params(self, load_file_path): 215 | with open(load_file_path, "rb") as load_file: 216 | if platform.python_version()[0] == '2': 217 | dct = pickle.load(load_file) 218 | else: 219 | dct = pickle.load(load_file, encoding='latin1') 220 | self._means = dct['means'] 221 | self._stds = dct['stds'] 222 | 223 | def transform(self, X): 224 | if self._fields is None: 225 | fields = range(X.shape[1]) 226 | else: 227 | fields = self._fields 228 | ret = 1.0 * X 229 | for col in fields: 230 | ret[:, col] = (X[:, col] - self._means[col]) / self._stds[col] 231 | return ret 232 | -------------------------------------------------------------------------------- /OV/utils/readers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import os 5 | import numpy as np 6 | import random 7 | 8 | 9 | class Reader(object): 10 | def __init__(self, dataset_dir, listfile=None): 11 | self._dataset_dir = dataset_dir 12 | self._current_index = 0 13 | if listfile is None: 14 | listfile_path = os.path.join(dataset_dir, "listfile.csv") 15 | else: 16 | listfile_path = listfile 17 | with open(listfile_path, "r") as lfile: 18 | self._data = lfile.readlines() 19 | self._listfile_header = self._data[0] 20 | self._data = self._data[1:] 21 | 22 | def get_number_of_examples(self): 23 | return len(self._data) 24 | 25 | def random_shuffle(self, seed=None): 26 | if seed is not None: 27 | random.seed(seed) 28 | random.shuffle(self._data) 29 | 30 | def read_example(self, index): 31 | raise NotImplementedError() 32 | 33 | def read_next(self): 34 | to_read_index = self._current_index 35 | self._current_index += 1 36 | if self._current_index == self.get_number_of_examples(): 37 | self._current_index = 0 38 | return self.read_example(to_read_index) 39 | 40 | 41 | class DecompensationReader(Reader): 42 | def __init__(self, dataset_dir, listfile=None): 43 | """ Reader for decompensation prediction task. 44 | :param dataset_dir: Directory where timeseries files are stored. 45 | :param listfile: Path to a listfile. If this parameter is left `None` then 46 | `dataset_dir/listfile.csv` will be used. 47 | """ 48 | Reader.__init__(self, dataset_dir, listfile) 49 | self._data = [line.split(',') for line in self._data] 50 | self._data = [(x, float(t), int(y)) for (x, t, y) in self._data] 51 | 52 | def _read_timeseries(self, ts_filename, time_bound): 53 | ret = [] 54 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 55 | header = tsfile.readline().strip().split(',') 56 | assert header[0] == "Hours" 57 | for line in tsfile: 58 | mas = line.strip().split(',') 59 | t = float(mas[0]) 60 | if t > time_bound + 1e-6: 61 | break 62 | ret.append(np.array(mas)) 63 | return (np.stack(ret), header) 64 | 65 | def read_example(self, index): 66 | """ Read the example with given index. 67 | 68 | :param index: Index of the line of the listfile to read (counting starts from 0). 69 | :return: Directory with the following keys: 70 | X : np.array 71 | 2D array containing all events. Each row corresponds to a moment. 72 | First column is the time and other columns correspond to different 73 | variables. 74 | t : float 75 | Length of the data in hours. Note, in general, it is not equal to the 76 | timestamp of last event. 77 | y : int (0 or 1) 78 | Mortality within next 24 hours. 79 | header : array of strings 80 | Names of the columns. The ordering of the columns is always the same. 
81 | name: Name of the sample. 82 | """ 83 | if index < 0 or index >= len(self._data): 84 | raise ValueError("Index must be from 0 (inclusive) to number of examples (exclusive).") 85 | 86 | name = self._data[index][0] 87 | t = self._data[index][1] 88 | y = self._data[index][2] 89 | (X, header) = self._read_timeseries(name, t) 90 | 91 | return {"X": X, 92 | "t": t, 93 | "y": y, 94 | "header": header, 95 | "name": name} 96 | 97 | 98 | class InHospitalMortalityReader(Reader): 99 | def __init__(self, dataset_dir, listfile=None, period_length=48.0): 100 | """ Reader for in-hospital moratality prediction task. 101 | 102 | :param dataset_dir: Directory where timeseries files are stored. 103 | :param listfile: Path to a listfile. If this parameter is left `None` then 104 | `dataset_dir/listfile.csv` will be used. 105 | :param period_length: Length of the period (in hours) from which the prediction is done. 106 | """ 107 | Reader.__init__(self, dataset_dir, listfile) 108 | self._data = [line.split(',') for line in self._data] 109 | self._data = [(x, int(y)) for (x, y) in self._data] 110 | self._period_length = period_length 111 | 112 | def _read_timeseries(self, ts_filename): 113 | ret = [] 114 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 115 | header = tsfile.readline().strip().split(',') 116 | assert header[0] == "Hours" 117 | for line in tsfile: 118 | mas = line.strip().split(',') 119 | ret.append(np.array(mas)) 120 | return (np.stack(ret), header) 121 | 122 | def read_example(self, index): 123 | """ Reads the example with given index. 124 | 125 | :param index: Index of the line of the listfile to read (counting starts from 0). 126 | :return: Dictionary with the following keys: 127 | X : np.array 128 | 2D array containing all events. Each row corresponds to a moment. 129 | First column is the time and other columns correspond to different 130 | variables. 131 | t : float 132 | Length of the data in hours. Note, in general, it is not equal to the 133 | timestamp of last event. 134 | y : int (0 or 1) 135 | In-hospital mortality. 136 | header : array of strings 137 | Names of the columns. The ordering of the columns is always the same. 138 | name: Name of the sample. 139 | """ 140 | if index < 0 or index >= len(self._data): 141 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 142 | 143 | name = self._data[index][0] 144 | t = self._period_length 145 | y = self._data[index][1] 146 | (X, header) = self._read_timeseries(name) 147 | 148 | return {"X": X, 149 | "t": t, 150 | "y": y, 151 | "header": header, 152 | "name": name} 153 | 154 | 155 | class LengthOfStayReader(Reader): 156 | def __init__(self, dataset_dir, listfile=None): 157 | """ Reader for length of stay prediction task. 158 | 159 | :param dataset_dir: Directory where timeseries files are stored. 160 | :param listfile: Path to a listfile. If this parameter is left `None` then 161 | `dataset_dir/listfile.csv` will be used. 
162 | """ 163 | Reader.__init__(self, dataset_dir, listfile) 164 | self._data = [line.split(',') for line in self._data] 165 | self._data = [(x, float(t), float(y)) for (x, t, y) in self._data] 166 | 167 | def _read_timeseries(self, ts_filename, time_bound): 168 | ret = [] 169 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 170 | header = tsfile.readline().strip().split(',') 171 | assert header[0] == "Hours" 172 | for line in tsfile: 173 | mas = line.strip().split(',') 174 | t = float(mas[0]) 175 | if t > time_bound + 1e-6: 176 | break 177 | ret.append(np.array(mas)) 178 | return (np.stack(ret), header) 179 | 180 | def read_example(self, index): 181 | """ Reads the example with given index. 182 | 183 | :param index: Index of the line of the listfile to read (counting starts from 0). 184 | :return: Dictionary with the following keys: 185 | X : np.array 186 | 2D array containing all events. Each row corresponds to a moment. 187 | First column is the time and other columns correspond to different 188 | variables. 189 | t : float 190 | Length of the data in hours. Note, in general, it is not equal to the 191 | timestamp of last event. 192 | y : float 193 | Remaining time in ICU. 194 | header : array of strings 195 | Names of the columns. The ordering of the columns is always the same. 196 | name: Name of the sample. 197 | """ 198 | if index < 0 or index >= len(self._data): 199 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 200 | 201 | name = self._data[index][0] 202 | t = self._data[index][1] 203 | y = self._data[index][2] 204 | (X, header) = self._read_timeseries(name, t) 205 | 206 | return {"X": X, 207 | "t": t, 208 | "y": y, 209 | "header": header, 210 | "name": name} 211 | 212 | 213 | class PhenotypingReader(Reader): 214 | def __init__(self, dataset_dir, listfile=None): 215 | """ Reader for phenotype classification task. 216 | 217 | :param dataset_dir: Directory where timeseries files are stored. 218 | :param listfile: Path to a listfile. If this parameter is left `None` then 219 | `dataset_dir/listfile.csv` will be used. 220 | """ 221 | Reader.__init__(self, dataset_dir, listfile) 222 | self._data = [line.split(',') for line in self._data] 223 | self._data = [(mas[0], float(mas[1]), list(map(int, mas[2:]))) for mas in self._data] 224 | 225 | def _read_timeseries(self, ts_filename): 226 | ret = [] 227 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 228 | header = tsfile.readline().strip().split(',') 229 | assert header[0] == "Hours" 230 | for line in tsfile: 231 | mas = line.strip().split(',') 232 | ret.append(np.array(mas)) 233 | return (np.stack(ret), header) 234 | 235 | def read_example(self, index): 236 | """ Reads the example with given index. 237 | 238 | :param index: Index of the line of the listfile to read (counting starts from 0). 239 | :return: Dictionary with the following keys: 240 | X : np.array 241 | 2D array containing all events. Each row corresponds to a moment. 242 | First column is the time and other columns correspond to different 243 | variables. 244 | t : float 245 | Length of the data in hours. Note, in general, it is not equal to the 246 | timestamp of last event. 247 | y : array of ints 248 | Phenotype labels. 249 | header : array of strings 250 | Names of the columns. The ordering of the columns is always the same. 251 | name: Name of the sample. 
252 | """ 253 | if index < 0 or index >= len(self._data): 254 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 255 | 256 | name = self._data[index][0] 257 | t = self._data[index][1] 258 | y = self._data[index][2] 259 | (X, header) = self._read_timeseries(name) 260 | 261 | return {"X": X, 262 | "t": t, 263 | "y": y, 264 | "header": header, 265 | "name": name} 266 | 267 | 268 | class MultitaskReader(Reader): 269 | def __init__(self, dataset_dir, listfile=None): 270 | """ Reader for multitask learning. 271 | 272 | :param dataset_dir: Directory where timeseries files are stored. 273 | :param listfile: Path to a listfile. If this parameter is left `None` then 274 | `dataset_dir/listfile.csv` will be used. 275 | """ 276 | Reader.__init__(self, dataset_dir, listfile) 277 | self._data = [line.split(',') for line in self._data] 278 | 279 | def process_ihm(x): 280 | return list(map(int, x.split(';'))) 281 | 282 | def process_los(x): 283 | x = x.split(';') 284 | if x[0] == '': 285 | return ([], []) 286 | return (list(map(int, x[:len(x)//2])), list(map(float, x[len(x)//2:]))) 287 | 288 | def process_ph(x): 289 | return list(map(int, x.split(';'))) 290 | 291 | def process_decomp(x): 292 | x = x.split(';') 293 | if x[0] == '': 294 | return ([], []) 295 | return (list(map(int, x[:len(x)//2])), list(map(int, x[len(x)//2:]))) 296 | 297 | self._data = [(fname, float(t), process_ihm(ihm), process_los(los), 298 | process_ph(pheno), process_decomp(decomp)) 299 | for fname, t, ihm, los, pheno, decomp in self._data] 300 | 301 | def _read_timeseries(self, ts_filename): 302 | ret = [] 303 | with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile: 304 | header = tsfile.readline().strip().split(',') 305 | assert header[0] == "Hours" 306 | for line in tsfile: 307 | mas = line.strip().split(',') 308 | ret.append(np.array(mas)) 309 | return (np.stack(ret), header) 310 | 311 | def read_example(self, index): 312 | """ Reads the example with given index. 313 | 314 | :param index: Index of the line of the listfile to read (counting starts from 0). 315 | :return: Return dictionary with the following keys: 316 | X : np.array 317 | 2D array containing all events. Each row corresponds to a moment. 318 | First column is the time and other columns correspond to different 319 | variables. 320 | t : float 321 | Length of the data in hours. Note, in general, it is not equal to the 322 | timestamp of last event. 323 | ihm : array 324 | Array of 3 integers: [pos, mask, label]. 325 | los : array 326 | Array of 2 arrays: [masks, labels]. 327 | pheno : array 328 | Array of 25 binary integers (phenotype labels). 329 | decomp : array 330 | Array of 2 arrays: [masks, labels]. 331 | header : array of strings 332 | Names of the columns. The ordering of the columns is always the same. 333 | name: Name of the sample. 
334 | """ 335 | if index < 0 or index >= len(self._data): 336 | raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).") 337 | 338 | name = self._data[index][0] 339 | (X, header) = self._read_timeseries(name) 340 | 341 | return {"X": X, 342 | "t": self._data[index][1], 343 | "ihm": self._data[index][2], 344 | "los": self._data[index][3], 345 | "pheno": self._data[index][4], 346 | "decomp": self._data[index][5], 347 | "header": header, 348 | "name": name} 349 | -------------------------------------------------------------------------------- /OV/utils/resources/channel_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "Glucose": { 3 | "possible_values": [] 4 | }, 5 | "Systolic blood pressure": { 6 | "possible_values": [] 7 | }, 8 | "Glascow coma scale verbal response": { 9 | "possible_values": ["1 No Response", "1.0 ET/Trach", "2 Incomp sounds", "3 Inapprop words", "4 Confused", "5 Oriented", "Confused", "Inappropriate Words", "Incomprehensible sounds", "No Response", "No Response-ETT", "Oriented"], 10 | "values": { 11 | "No Response-ETT": 1, 12 | "No Response": 1, 13 | "1 No Response": 1, 14 | "1.0 ET/Trach": 1, 15 | "2 Incomp sounds": 2, 16 | "Incomprehensible sounds": 2, 17 | "3 Inapprop words": 3, 18 | "Inappropriate Words": 3, 19 | "4 Confused": 4, 20 | "Confused": 4, 21 | "5 Oriented": 5, 22 | "Oriented": 5 23 | } 24 | }, 25 | "Temperature": { 26 | "possible_values": [] 27 | }, 28 | "Weight": { 29 | "possible_values": [] 30 | }, 31 | "Diastolic blood pressure": { 32 | "possible_values": [] 33 | }, 34 | "Fraction inspired oxygen": { 35 | "possible_values": [] 36 | }, 37 | "Glascow coma scale total": { 38 | "possible_values": ["10", "11", "12", "13", "14", "15", "3", "4", "5", "6", "7", "8", "9"], 39 | "values": { 40 | "3": 3, 41 | "4": 4, 42 | "5": 5, 43 | "6": 6, 44 | "7": 7, 45 | "8": 8, 46 | "9": 9, 47 | "10": 10, 48 | "11": 11, 49 | "12": 12, 50 | "13": 13, 51 | "14": 14, 52 | "15": 15 53 | } 54 | }, 55 | "Capillary refill rate": { 56 | "possible_values": ["0.0", "1.0"], 57 | "values": { 58 | "0.0": 0, 59 | "1.0": 1 60 | } 61 | }, 62 | "Mean blood pressure": { 63 | "possible_values": [] 64 | }, 65 | "Heart Rate": { 66 | "possible_values": [] 67 | }, 68 | "Oxygen saturation": { 69 | "possible_values": [] 70 | }, 71 | "pH": { 72 | "possible_values": [] 73 | }, 74 | "Height": { 75 | "possible_values": [] 76 | }, 77 | "Glascow coma scale eye opening": { 78 | "possible_values": ["1 No Response", "2 To pain", "3 To speech", "4 Spontaneously", "None", "Spontaneously", "To Pain", "To Speech"], 79 | "values": { 80 | "None": 0, 81 | "1 No Response": 1, 82 | "2 To pain": 2, 83 | "To Pain": 2, 84 | "3 To speech": 3, 85 | "To Speech": 3, 86 | "4 Spontaneously": 4, 87 | "Spontaneously": 4 88 | } 89 | }, 90 | "Respiratory rate": { 91 | "possible_values": [] 92 | }, 93 | "Glascow coma scale motor response": { 94 | "possible_values": ["1 No Response", "2 Abnorm extensn", "3 Abnorm flexion", "4 Flex-withdraws", "5 Localizes Pain", "6 Obeys Commands", "Abnormal Flexion", "Abnormal extension", "Flex-withdraws", "Localizes Pain", "No response", "Obeys Commands"], 95 | "values": { 96 | "1 No Response": 1, 97 | "No response": 1, 98 | "2 Abnorm extensn": 2, 99 | "Abnormal extension": 2, 100 | "3 Abnorm flexion": 3, 101 | "Abnormal Flexion": 3, 102 | "4 Flex-withdraws": 4, 103 | "Flex-withdraws": 4, 104 | "5 Localizes Pain": 5, 105 | "Localizes Pain": 5, 106 | "6 Obeys Commands": 6, 107 | "Obeys Commands": 6 108 | } 109 | } 110 
| } 111 | -------------------------------------------------------------------------------- /OV/utils/resources/discretizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "id_to_channel": [ 3 | "Capillary refill rate", 4 | "Diastolic blood pressure", 5 | "Fraction inspired oxygen", 6 | "Glascow coma scale eye opening", 7 | "Glascow coma scale motor response", 8 | "Glascow coma scale total", 9 | "Glascow coma scale verbal response", 10 | "Glucose", 11 | "Heart Rate", 12 | "Height", 13 | "Mean blood pressure", 14 | "Oxygen saturation", 15 | "Respiratory rate", 16 | "Systolic blood pressure", 17 | "Temperature", 18 | "Weight", 19 | "pH" 20 | ], 21 | "is_categorical_channel": { 22 | "Capillary refill rate": true, 23 | "Diastolic blood pressure": false, 24 | "Fraction inspired oxygen": false, 25 | "Glascow coma scale eye opening": true, 26 | "Glascow coma scale motor response": true, 27 | "Glascow coma scale total": true, 28 | "Glascow coma scale verbal response": true, 29 | "Glucose": false, 30 | "Heart Rate": false, 31 | "Height": false, 32 | "Mean blood pressure": false, 33 | "Oxygen saturation": false, 34 | "Respiratory rate": false, 35 | "Systolic blood pressure": false, 36 | "Temperature": false, 37 | "Weight": false, 38 | "pH": false 39 | }, 40 | "possible_values": { 41 | "Capillary refill rate": [ 42 | "0.0", 43 | "1.0" 44 | ], 45 | "Diastolic blood pressure": [ 46 | 47 | ], 48 | "Fraction inspired oxygen": [ 49 | 50 | ], 51 | "Glascow coma scale eye opening": [ 52 | "To Pain", 53 | "3 To speech", 54 | "1 No Response", 55 | "4 Spontaneously", 56 | "None", 57 | "To Speech", 58 | "Spontaneously", 59 | "2 To pain" 60 | ], 61 | "Glascow coma scale motor response": [ 62 | "1 No Response", 63 | "3 Abnorm flexion", 64 | "Abnormal extension", 65 | "No response", 66 | "4 Flex-withdraws", 67 | "Localizes Pain", 68 | "Flex-withdraws", 69 | "Obeys Commands", 70 | "Abnormal Flexion", 71 | "6 Obeys Commands", 72 | "5 Localizes Pain", 73 | "2 Abnorm extensn" 74 | ], 75 | "Glascow coma scale total": [ 76 | "11", 77 | "10", 78 | "13", 79 | "12", 80 | "15", 81 | "14", 82 | "3", 83 | "5", 84 | "4", 85 | "7", 86 | "6", 87 | "9", 88 | "8" 89 | ], 90 | "Glascow coma scale verbal response": [ 91 | "1 No Response", 92 | "No Response", 93 | "Confused", 94 | "Inappropriate Words", 95 | "Oriented", 96 | "No Response-ETT", 97 | "5 Oriented", 98 | "Incomprehensible sounds", 99 | "1.0 ET/Trach", 100 | "4 Confused", 101 | "2 Incomp sounds", 102 | "3 Inapprop words" 103 | ], 104 | "Glucose": [ 105 | 106 | ], 107 | "Heart Rate": [ 108 | 109 | ], 110 | "Height": [ 111 | 112 | ], 113 | "Mean blood pressure": [ 114 | 115 | ], 116 | "Oxygen saturation": [ 117 | 118 | ], 119 | "Respiratory rate": [ 120 | 121 | ], 122 | "Systolic blood pressure": [ 123 | 124 | ], 125 | "Temperature": [ 126 | 127 | ], 128 | "Weight": [ 129 | 130 | ], 131 | "pH": [ 132 | 133 | ] 134 | }, 135 | "normal_values": { 136 | "Capillary refill rate": "0.0", 137 | "Diastolic blood pressure": "59.0", 138 | "Fraction inspired oxygen": "0.21", 139 | "Glascow coma scale eye opening": "4 Spontaneously", 140 | "Glascow coma scale motor response": "6 Obeys Commands", 141 | "Glascow coma scale total": "15", 142 | "Glascow coma scale verbal response": "5 Oriented", 143 | "Glucose": "128.0", 144 | "Heart Rate": "86", 145 | "Height": "170.0", 146 | "Mean blood pressure": "77.0", 147 | "Oxygen saturation": "98.0", 148 | "Respiratory rate": "19", 149 | "Systolic blood pressure": "118.0", 150 | "Temperature": 
"36.6", 151 | "Weight": "81.0", 152 | "pH": "7.4" 153 | } 154 | } -------------------------------------------------------------------------------- /OV/utils/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | from . import common_utils 5 | import numpy as np 6 | import os 7 | 8 | 9 | def load_data(reader, discretizer, normalizer, small_part=False, return_names=False): 10 | N = reader.get_number_of_examples() 11 | if small_part: 12 | N = 1000 13 | ret = common_utils.read_chunk(reader, N) 14 | data = ret["X"] 15 | ts = ret["t"] 16 | labels = ret["y"] 17 | names = ret["name"] 18 | data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(data, ts)] 19 | if normalizer is not None: 20 | data = [normalizer.transform(X) for X in data] 21 | whole_data = (np.array(data), labels) 22 | if not return_names: 23 | return whole_data 24 | return {"data": whole_data, "names": names} 25 | 26 | 27 | def save_results(names, pred, y_true, path): 28 | common_utils.create_directory(os.path.dirname(path)) 29 | with open(path, 'w') as f: 30 | f.write("stay,prediction,y_true\n") 31 | for (name, x, y) in zip(names, pred, y_true): 32 | f.write("{},{:.6f},{}\n".format(name, x, y)) 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # M$^3$Care: Learning with Missing Modalities in Multimodal Healthcare Data 2 | 3 | The source code for *M$^3$Care: Learning with Missing Modalities in Multimodal Healthcare Data* 4 | 5 | Thanks for your interest in our work. 6 | 7 | Due to the limitation of upload file size, **a more detailed version** including the trained model saved file, code can be obtained through the Github repo [here](https://github.com/choczhang/M3Care) and Google drive [here](https://drive.google.com/drive/folders/1C95YymB3fOXsZ78Uk0iVQYXyo8Hu_Iqz?usp=sharing) 8 | 9 | ## Requirements 10 | 11 | * Install python, pytorch. We use Python 3.7.3, Pytorch 1.5.1. 12 | * If you plan to use GPU computation, install CUDA 13 | 14 | 15 | ## Run the model 16 | 17 | All the hyper-parameters and steps are included in the `.ipynb` file. 18 | --------------------------------------------------------------------------------