├── .gitignore ├── .gitmodules ├── 90-day-post-discharge-mortality ├── FIDDLE_icd │ ├── config.py │ ├── config.yaml │ ├── helpers.py │ ├── run.py │ └── steps.py ├── cohort.ipynb ├── helper.py ├── log │ ├── model_clinical.txt │ ├── model_clinical_icd[0,1,2].txt │ ├── model_clinical_icd[0,1].txt │ └── model_clinical_icd[0].txt ├── main.ipynb ├── model.ipynb ├── model_clinical.py ├── model_clinical_icd[0,1,2].py ├── model_clinical_icd[0,1].py └── model_clinical_icd[0].py ├── README.md ├── eicu_experiments ├── 1_data_extraction │ ├── PopulationSummary.ipynb │ ├── extract_medication.py │ ├── extract_nurseCharting.py │ ├── extract_other_tables.ipynb │ ├── extract_pivoted.py │ ├── extract_resp_IO.ipynb │ ├── generate_labels.ipynb │ ├── population_ARF.ipynb │ ├── population_Shock.ipynb │ └── population_mortality.ipynb ├── 2_apply_FIDDLE │ ├── prepare_data.py │ ├── prepare_data.sh │ ├── prepare_data_mortality.py │ └── run_make_all.sh └── 3_ML_models │ ├── DataSummary.ipynb │ ├── Test.ipynb │ ├── config.yaml │ ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py │ ├── run_deep.py │ ├── run_deep_all.sh │ ├── run_deep_eval.py │ ├── run_shallow.py │ └── run_shallow_all.sh ├── environment.yml ├── mimic3_ablations ├── 2_apply_FIDDLE │ ├── FIDDLE_mask+Dt │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_maskonly │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_medianimpute │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_noimpute │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_ordinal │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── log │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.out │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ 
├── noimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.001,medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.001,medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.001,noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ └── theta=0.001,noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── run_mortality_impute.sh │ ├── run_mortality_mask+Dt.sh │ ├── run_mortality_maskonly.sh │ ├── run_mortality_nofreq.sh │ ├── run_mortality_noimpute.sh │ └── run_mortality_ordinal.sh └── 3_ML_models │ ├── config.yaml │ ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py │ ├── run_shallow_ablations.sh │ ├── run_shallow_impute.py │ ├── run_shallow_maskonly.py │ ├── run_shallow_medianimpute.py │ ├── run_shallow_nofreq.py │ ├── run_shallow_noimpute.py │ └── run_shallow_ordinal.py ├── mimic3_comparisons ├── 2_apply_FIDDLE │ ├── log │ │ ├── dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ └── theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── run_mortality_dt.sh │ └── run_mortality_theta.sh └── 3_ML_models │ ├── config.yaml │ ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py │ ├── run_shallow_dt.py │ ├── run_shallow_dt.sh │ ├── run_shallow_theta.py │ └── run_shallow_theta.sh └── mimic3_experiments ├── 1_data_extraction ├── FIDDLE_input_lengths.ipynb ├── InclusionExclusion.ipynb ├── LabelDistributions.ipynb ├── PopulationSummary.ipynb ├── config.py ├── extract_data.py ├── generate_labels.py ├── grouped_variables.yaml ├── prepare_input.py ├── resources │ ├── IHM_benchmark.ipynb │ ├── all_stays.csv │ ├── test_listfile.csv │ ├── train_listfile.csv │ └── val_listfile.csv └── run_prepare_all.sh ├── 2_apply_FIDDLE ├── log,discretize=no │ ├── benchmark,outcome=mortality,T=48.0,dt=1.0.err │ ├── benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── outcome=ARF,T=12.0,dt=1.0.err │ ├── outcome=ARF,T=12.0,dt=1.0.out │ ├── outcome=ARF,T=4.0,dt=1.0.err │ ├── outcome=ARF,T=4.0,dt=1.0.out │ ├── outcome=Shock,T=12.0,dt=1.0.err │ ├── outcome=Shock,T=12.0,dt=1.0.out │ ├── outcome=Shock,T=4.0,dt=1.0.err │ └── outcome=Shock,T=4.0,dt=1.0.out ├── log │ ├── benchmark,outcome=mortality,T=48.0,dt=1.0.err │ ├── 
benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── outcome=ARF,T=12.0,dt=1.0.err │ ├── outcome=ARF,T=12.0,dt=1.0.out │ ├── outcome=ARF,T=4.0,dt=1.0.err │ ├── outcome=ARF,T=4.0,dt=1.0.out │ ├── outcome=Shock,T=12.0,dt=1.0.err │ ├── outcome=Shock,T=12.0,dt=1.0.out │ ├── outcome=Shock,T=4.0,dt=1.0.err │ ├── outcome=Shock,T=4.0,dt=1.0.out │ ├── outcome=mortality,T=48.0,dt=1.0.err │ └── outcome=mortality,T=48.0,dt=1.0.out ├── run_make_all,discretize=no.sh └── run_make_all.sh ├── 3_ML_models ├── DataSummary.ipynb ├── README.md ├── config.yaml ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py ├── run_deep.py ├── run_deep_all.sh ├── run_deep_eval.py ├── run_shallow.py └── run_shallow_all.sh ├── 4_evaluation ├── CombineFigures.ipynb ├── Evaluation.ipynb ├── PredictionGap.ipynb ├── d_items.csv └── d_labitems.csv ├── 5_baseline_NEWS ├── CalculateNEWS.ipynb └── NEWS_table.jpg ├── README.md └── config.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | .ipynb_checkpoints 3 | *.png 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | 9 | 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "FIDDLE"] 2 | path = FIDDLE 3 | url = https://github.com/MLD3/FIDDLE 4 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/FIDDLE_icd/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | hierarchical_sep = config['hierarchical_sep'] 10 | hierarchical_levels = config['hierarchical_levels'] 11 | 12 | value_type_override = config['value_types'] 13 | 14 | parallel = True 15 | n_jobs = 64 16 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/FIDDLE_icd/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | hierarchical_sep: ":" 9 | hierarchical_levels: [0,1,1] 10 | 11 | value_types: 12 | # enter the feature type that you would like to override in the following format: 13 | FIRST_WARDID: Categorical 14 | MedA_AMOUNT: Numeric 15 | MedA_ROUTE: Categorical 16 | ICD9_CODE: hierarchical_ICD9 17 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/FIDDLE_icd/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | 
parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | else: 73 | raise NotImplementedError 74 | 75 | 76 | from .steps import * 77 | 78 | print('Input data file:', input_fname) 79 | print() 80 | print('Input arguments:') 81 | print(' {:<6} = {}'.format('T', T)) 82 | print(' {:<6} = {}'.format('dt', dt)) 83 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 84 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 85 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 86 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 87 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 88 | print() 89 | print('N = {}'.format(N)) 90 | print('L = {}'.format(L)) 91 | print('', flush=True) 92 | 93 | 94 | ###### 95 | # Main 96 | ###### 97 | if args.prefilter: 98 | print_header('1) Pre-filter') 99 | df_data = pre_filter(df_data, theta_1, df_population, args) 100 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 101 | 102 | print_header('2) Transform; 3) Post-filter') 103 | df_data, df_types = parse_variable_data_type(df_data, value_type_override, args) 104 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 105 | 106 | # Process time-invariant data 107 | if len(df_time_invariant) > 0: 108 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 109 | 110 | # Process time-dependent 
data 111 | if len(df_time_series) > 0: 112 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 113 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 | clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 
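# NOTE: X.npz below is the output of FIDDLE for the clinical features; FIDDLE saves it
# as a sparse COO tensor of shape (N, L, D). For this post-discharge task the time axis
# appears to be a singleton, so .todense().squeeze() yields an (N, D) array whose rows
# are assumed to align with input/label.csv -- that alignment is what makes the boolean
# indexing by df.partition below valid.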
106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | 109 | Xtr = X[df.partition=="train"] 110 | ytr = df[df.partition=="train"]['label'] 111 | Xte = X[df.partition=="test"] 112 | yte = df[df.partition=="test"]['label'] 113 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 114 | 115 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical') 116 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical') 117 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical_icd[0,1,2].py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 | clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, 
save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | s = sparse.load_npz('output.icd[0,1,2]/s.npz').todense() 109 | X = np.concatenate((s, X), axis=1) 110 | 111 | Xtr = X[df.partition=="train"] 112 | ytr = df[df.partition=="train"]['label'] 113 | Xte = X[df.partition=="test"] 114 | yte = df[df.partition=="test"]['label'] 115 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 116 | 117 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical+ICD[0,1,2]') 118 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical+ICD[0,1,2]') 119 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical_icd[0,1].py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 | clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": 
scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | s = sparse.load_npz('output.icd[0,1]/s.npz').todense() 109 | X = np.concatenate((s, X), axis=1) 110 | 111 | Xtr = X[df.partition=="train"] 112 | ytr = df[df.partition=="train"]['label'] 113 | Xte = X[df.partition=="test"] 114 | yte = df[df.partition=="test"]['label'] 115 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 116 | 117 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical+ICD[0,1]') 118 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical+ICD[0,1]') 119 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical_icd[0].py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 
| clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | s = sparse.load_npz('output.icd[0]/s.npz').todense() 109 | X = np.concatenate((s, X), axis=1) 110 | 111 | Xtr = X[df.partition=="train"] 112 | ytr = df[df.partition=="train"]['label'] 113 | Xte = X[df.partition=="test"] 114 | yte = df[df.partition=="test"]['label'] 115 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 116 | 117 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical+ICD[0]') 118 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical+ICD[0]') 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FIDDLE experiments 2 | 3 | This repository contains code for the experiments in the JAMIA paper, [**Democratizing EHR analyses with FIDDLE: a flexible data-driven preprocessing pipeline for structured clinical data**](https://doi.org/10.1093/jamia/ocaa139) by Tang et al. (2020). Please also refer to the main [FIDDLE repository](https://github.com/MLD3/FIDDLE). 4 | 5 | **IMPORTANT NOTE:** Due to updated versions of Python and related packages (pandas etc.), it might be impossible to replicate the exact numerical results in the paper. Moreover, due to the sheer size of the datasets, full processing requires a machine with many CPU cores and a large amount of RAM (at least 500GB for MIMIC-III, ~3TB for eICU). Therefore, we recommend the following options: 6 | - To reproduce MIMIC-III results similar to those in the paper, we recommend running the latest version of FIDDLE on the data and updating the feature dimensions in the metadata files to match the extracted feature sets. 7 | - To replicate the MIMIC-III results reported in the paper, consider using the [jamia-replication](https://github.com/MLD3/FIDDLE-experiments/tree/jamia-replication) branch. We have made every attempt to derive the same set of features (~0.00001% difference) from the MIMIC-III data. To make this experiment suite more accessible, we have released preprocessed MIMIC-III and eICU features via PhysioNet (for use with code in the [jamia-replication](https://github.com/MLD3/FIDDLE-experiments/tree/jamia-replication) branch). Please download the datasets here: https://physionet.org/content/mimic-eicu-fiddle-feature/1.0.0/ (see the download sketch below).
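For the replication route, a minimal sketch of the setup steps (assumptions: you have a credentialed PhysioNet account with access to this dataset; the username and download location are placeholders):

```bash
# Switch this repository to the replication branch.
git checkout jamia-replication

# Fetch the preprocessed FIDDLE features from PhysioNet (prompts for your password).
wget -r -N -c -np --user YOUR_PHYSIONET_USERNAME --ask-password \
    https://physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/
```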
8 | 9 | ## Usage 10 | Clone the repository and initialize the FIDDLE submodule: 11 | ```bash 12 | git clone https://github.com/MLD3/FIDDLE-experiments.git 13 | cd FIDDLE-experiments && git submodule update --init --recursive 14 | ``` 15 | 16 | To reproduce the experiments on MIMIC-III, use `conda env create -f environment.yml` to create a conda environment named `FIDDLE-env` that uses Python 3.7, and then follow the steps in the README. 17 | -------------------------------------------------------------------------------- /eicu_experiments/1_data_extraction/extract_medication.py: -------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' 2 | save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/' 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | config = { 10 | 'n_rows': { 11 | 'medication': 7_301_853, 12 | } 13 | } 14 | 15 | def _read_events(fname, t_cols, chunksize): 16 | """ 17 | A helper function to read csv in chunks 18 | Arguments: 19 | - fname is the file name (e.g. INPUTEVENTS) 20 | - t_cols is a list that contains the names of the time columns that should be parsed 21 | - chunksize is the size of each chunk 22 | """ 23 | n_rows = config['n_rows'][fname] 24 | with tqdm(desc=fname, total=(n_rows//chunksize+1)) as pbar: 25 | for df in pd.read_csv(eicu_path + '{}.csv'.format(fname), parse_dates=t_cols, chunksize=chunksize): 26 | pbar.update() 27 | yield df 28 | 29 | 30 | fname = 'medication' 31 | df_M = [] 32 | for i, df in enumerate(_read_events(fname, [], chunksize=100000)): 33 | # Remove unknown drug name or drug seqnum 34 | df['drughiclseqno'] = df['drughiclseqno'].astype('Int64') 35 | df = df.dropna(subset=['drugname', 'drughiclseqno'], how='all') 36 | 37 | # Combine drug name and ID 38 | df.loc[:, 'drugnameid'] = df[['drugname', 'drughiclseqno']].apply( 39 | lambda x: '{}|{}'.format(x[0], x[1]), axis=1) 40 | 41 | df = df.rename(columns={'patientunitstayid': 'ID', 'drugstartoffset': 't'}) 42 | df = df.set_index([ 43 | 'ID', 't', 'drugnameid' 44 | ])[['dosage', 'routeadmin', 'frequency']] 45 | 46 | df.columns.name = 'property' 47 | df = df.stack() 48 | df.name = 'variable_value' 49 | df = df.reset_index() 50 | 51 | df['variable_name'] = df[['drugnameid', 'property']].apply(lambda x: '|'.join(x), axis=1) 52 | df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore') 53 | df = df[['ID', 't', 'variable_name', 'variable_value']] 54 | 55 | df = df.reset_index(drop=True) 56 | df_M.append(df) 57 | 58 | df_out = pd.concat(df_M, ignore_index=True) 59 | try: 60 | df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False) 61 | except: 62 | df_out.to_pickle(save_path + '{}.pickle'.format(fname)) 63 |
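The extraction scripts in this directory all emit the same long event format that FIDDLE takes as `input_data`: one row per (`ID`, `t`, `variable_name`, `variable_value`), with `t` being the eICU offset in minutes from ICU admission. A purely illustrative sketch of such rows (drug names, IDs, and values are made up, not taken from the data):

```python
import pandas as pd

# Illustrative rows only -- layout, not real eICU content.
example = pd.DataFrame(
    [
        (141765,  35, 'HEPARIN|5042|dosage',               '25000 UNITS'),
        (141765,  35, 'HEPARIN|5042|routeadmin',           'IV'),
        (141765, 120, 'Vital Signs|Heart Rate|Heart Rate', 88),
    ],
    columns=['ID', 't', 'variable_name', 'variable_value'],
)
print(example)
```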
-------------------------------------------------------------------------------- /eicu_experiments/1_data_extraction/extract_nurseCharting.py: -------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' 2 | save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/' 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | config = { 9 | 'n_rows': { 10 | 'nurseCharting': 151_604_232, 11 | } 12 | } 13 | 14 | def _read_events(fname, t_cols, chunksize): 15 | """ 16 | A helper function to read csv in chunks 17 | Arguments: 18 | - fname is the file name (e.g. INPUTEVENTS) 19 | - t_cols is a list that contains the names of the time columns that should be parsed 20 | - chunksize is the size of each chunk 21 | """ 22 | n_rows = config['n_rows'][fname] 23 | with tqdm(desc=fname, total=(n_rows//chunksize+1)) as pbar: 24 | for df in pd.read_csv(eicu_path + '{}.csv'.format(fname), parse_dates=t_cols, chunksize=chunksize): 25 | pbar.update() 26 | yield df 27 | 28 | 29 | fname = 'nurseCharting' 30 | df_NC = [] 31 | for i, df in enumerate(_read_events(fname, [], chunksize=1000000)): 32 | df = df.drop(columns=[ 33 | 'nursingchartid', 34 | 'nursingchartentryoffset', 35 | ]) 36 | df = df.rename(columns={ 37 | 'patientunitstayid': 'ID', 38 | 'nursingchartoffset': 't', 39 | }) 40 | df['variable_name'] = df[[ 41 | 'nursingchartcelltypecat', 'nursingchartcelltypevallabel', 42 | 'nursingchartcelltypevalname' 43 | ]].apply(lambda x: '|'.join(x), axis=1) 44 | 45 | df['variable_value'] = pd.to_numeric(df['nursingchartvalue'], errors='ignore') 46 | 47 | df = df[['ID', 't', 'variable_name', 'variable_value']] 48 | df = df.reset_index(drop=True) 49 | df_NC.append(df) 50 | if i % 40 == 39: 51 | df_out = pd.concat(df_NC, ignore_index=True) 52 | try: 53 | df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False) 54 | except: 55 | df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40))) 56 | df_NC = [] 57 | 58 | df_out = pd.concat(df_NC, ignore_index=True) 59 | try: 60 | df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False) 61 | except: 62 | df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40))) -------------------------------------------------------------------------------- /eicu_experiments/1_data_extraction/extract_pivoted.py: -------------------------------------------------------------------------------- 1 | # python extract_pivoted.py vitalPeriodic 2 | # python extract_pivoted.py vitalAperiodic 3 | 4 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' 5 | save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/' 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('filename') 14 | args = parser.parse_args() 15 | fname = args.filename 16 | 17 | config = { 18 | 'n_rows': { 19 | 'vitalPeriodic': 146_671_642, 20 | 'vitalAperiodic': 25_075_074, 21 | } 22 | } 23 | 24 | def _read_events(fname, t_cols, chunksize): 25 | """ 26 | A helper function to read csv in chunks 27 | Arguments: 28 | - fname is the file name (e.g. INPUTEVENTS) 29 | - t_cols is a list that contains the names of the time columns that should be parsed 30 | - chunksize is the size of each chunk 31 | """ 32 | n_rows = config['n_rows'][fname] 33 | with tqdm(desc=fname, total=(n_rows//chunksize+1)) as pbar: 34 | for df in pd.read_csv(eicu_path + '{}.csv'.format(fname), parse_dates=t_cols, chunksize=chunksize): 35 | pbar.update() 36 | yield df 37 | 38 | 39 | 40 | 41 | df_V = [] 42 | for i, df in enumerate(_read_events(fname, [], chunksize=1000000)): 43 | df = df.iloc[:,1:].set_index(['patientunitstayid', 'observationoffset']) 44 | df.columns.name = 'variable_name' 45 | df = df.stack() 46 | df.name = 'variable_value' 47 | df = df.reset_index() 48 | df_V.append(df) 49 | if i % 20 == 0: 50 | df_out = pd.concat(df_V, ignore_index=True) 51 | df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False) 52
| 53 | df_out = pd.concat(df_V, ignore_index=True) 54 | df_out.columns = ['ID', 't', 'variable_name', 'variable_value'] 55 | df_out = df_out.groupby(['ID', 't', 'variable_name']).median().reset_index() # Drop duplicates and keep the median value 56 | df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False) 57 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/prepare_data.py: -------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' #read from here 2 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' #save here 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | import pandas as pd 9 | import pickle 10 | import os 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--outcome', type=str, required=True) 15 | parser.add_argument('--T', type=float, required=True) 16 | args = parser.parse_args() 17 | outcome, T = args.outcome, args.T 18 | 19 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' 20 | pop_path = 'population/{}_{}h.csv'.format(outcome, T) 21 | 22 | pop = pd.read_csv(data_path + pop_path) 23 | data = [] 24 | 25 | for i, filename in enumerate(reversed(sorted(os.listdir(data_path + 'extracted/')))): 26 | if filename.endswith(".parquet") or filename.endswith(".pickle"): 27 | print('___', filename, '___', flush=True) 28 | 29 | if filename.endswith(".parquet"): df = pd.read_parquet(data_path + 'extracted/' + filename) 30 | else: df = pickle.load(open(data_path + 'extracted/' + filename, 'rb')) 31 | 32 | #subsetting population 33 | print('rows: ', df.shape[0]) 34 | df = df[df.ID.isin(pop.ID)] 35 | print('rows after subsetting population: ', df.shape[0]) 36 | 37 | #subsetting time 38 | df = df[((df.t >= 0) & (df.t < T*60)) | np.isnan(df.t)] 39 | print('rows after subsetting time: ', df.shape[0]) 40 | 41 | df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore') 42 | data.append(df) 43 | del df 44 | 45 | data = pd.concat(data) 46 | print(data.shape) 47 | print(data.head()) 48 | 49 | print('Number of unique variable_names:', data['variable_name'].nunique()) 50 | print('Number of rows:', len(data)) 51 | 52 | # Remove duplicate rows and recording any duplicates and inconsistencies 53 | data = data.drop_duplicates(subset=['ID', 't', 'variable_name'], keep='first') 54 | data = data.sort_values(by=['ID', 't', 'variable_name']) 55 | print('Number of rows after removing duplicate rows:', len(data)) 56 | 57 | data.to_csv(data_path + 'features/{}_{}h/input_data.csv'.format(outcome, T), index=False) 58 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/prepare_data.sh: -------------------------------------------------------------------------------- 1 | # python prepare_data_mortality.py 2>&1 | tee log/prepare_data_mortality.log 2 | # python prepare_data.py --outcome='ARF' --T='4.0' | tee log/prepare_data_ARF_4h.log 3 | # python prepare_data.py --outcome='Shock' --T='4.0' | tee log/prepare_data_Shock_4h.log 4 | python prepare_data.py --outcome='ARF' --T='12.0' | tee log/prepare_data_ARF_12h.log 5 | python prepare_data.py --outcome='Shock' --T='12.0' | tee log/prepare_data_Shock_12h.log 6 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/prepare_data_mortality.py: 
-------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' #read from here 2 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' #save here 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | import pandas as pd 9 | import pickle 10 | import os 11 | 12 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' 13 | pop_path = 'population/mortality_48h.csv' 14 | 15 | pop = pd.read_csv(data_path + pop_path) 16 | T = 48.0 17 | data = [] 18 | 19 | for i, filename in enumerate(reversed(sorted(os.listdir(data_path + 'extracted/')))): 20 | if filename.endswith(".parquet") or filename.endswith(".pickle"): 21 | print('___', filename, '___', flush=True) 22 | 23 | if filename.endswith(".parquet"): df = pd.read_parquet(data_path + 'extracted/' + filename) 24 | else: df = pickle.load(open(data_path + 'extracted/' + filename, 'rb')) 25 | 26 | #subsetting population 27 | print('rows: ', df.shape[0]) 28 | df = df[df.ID.isin(pop.ID)] 29 | print('rows after subsetting population: ', df.shape[0]) 30 | 31 | #subsetting time 32 | df = df[((df.t >= 0) & (df.t < T*60)) | np.isnan(df.t)] 33 | print('rows after subsetting time: ', df.shape[0]) 34 | 35 | df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore') 36 | data.append(df) 37 | del df 38 | 39 | data = pd.concat(data) 40 | print(data.shape) 41 | print(data.head()) 42 | 43 | print('Number of unique variable_names:', data['variable_name'].nunique()) 44 | print('Number of rows:', len(data)) 45 | 46 | # Remove duplicate rows and recording any duplicates and inconsistencies 47 | data = data.drop_duplicates(subset=['ID', 't', 'variable_name'], keep='first') 48 | data = data.sort_values(by=['ID', 't', 'variable_name']) 49 | print('Number of rows after removing duplicate rows:', len(data)) 50 | 51 | data.to_csv(data_path + 'features/mortality/input_data.csv', index=False) 52 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/run_make_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | DATAPATH="/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/" 5 | export PYTHONPATH="/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/" 6 | mkdir -p log 7 | 8 | OUTCOME=ARF 9 | T=240.0 10 | dt=60.0 11 | Th=4.0 12 | python -m FIDDLE.run \ 13 | --data_path="$DATAPATH/features/${OUTCOME}_${Th}h/" \ 14 | --population="$DATAPATH/population/${OUTCOME}_${Th}h.csv" \ 15 | --T=$T \ 16 | --dt=$dt \ 17 | --theta_1=0.001 \ 18 | --theta_2=0.001 \ 19 | --theta_freq=1 \ 20 | --stats_functions 'min' 'max' 'mean' \ 21 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 22 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 23 | 24 | 25 | # OUTCOME=mortality 26 | # T=48 27 | # dt=1.0 28 | # python -m FIDDLE.run \ 29 | # --data_path="$DATAPATH/features/mortality/" \ 30 | # --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 31 | # --T=$T \ 32 | # --dt=$dt \ 33 | # --theta_1=0.001 \ 34 | # --theta_2=0.001 \ 35 | # --theta_freq=1 \ 36 | # --stats_functions 'min' 'max' 'mean' \ 37 | # > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 38 | # 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 39 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/DataSummary.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from lib.data import _eICUReader\n", 12 | "import pandas as pd\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "timestep = 1.0" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Outcome ARF \t T 4\n", 35 | "Finish reading data \t 793.68 s\n", 36 | "s (138840, 717)\n", 37 | "X (138840, 4, 5854)\n", 38 | "\n", 39 | "Outcome Shock \t T 4\n", 40 | "Finish reading data \t 1193.06 s\n", 41 | "s (164333, 770)\n", 42 | "X (164333, 4, 6314)\n", 43 | "\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "for task in ['ARF', 'Shock']:\n", 49 | " for duration in [4]:\n", 50 | " print('Outcome', task, '\\t', 'T', duration)\n", 51 | " reader = _eICUReader(task, duration, timestep)\n", 52 | " print('s', reader.s.shape)\n", 53 | " print('X', reader.X.shape)\n", 54 | " print()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "scrolled": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "for task in ['ARF', 'Shock']:\n", 66 | " for duration in [12]:\n", 67 | " print('Outcome', task, 'T', duration)\n", 68 | " reader = _eICUReader(task, duration, timestep)\n", 69 | " print('s', reader.s.shape)\n", 70 | " print('X', reader.X.shape)\n", 71 | " print()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "Finish reading data \t 35.72 s\n", 84 | "s (11695, 97)\n", 85 | "X (11695, 48, 7411)\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "for task in ['mortality']:\n", 91 | " for duration in [48]:\n", 92 | " reader = _Mimic3Reader(task, duration, timestep)\n", 93 | " print('s', reader.s.shape)\n", 94 | " print('X', reader.X.shape)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.4" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4143 18 | 12.0: 4912 19 | 20 | Shock: 21 | 4.0 : 4620 22 | 12.0: 5597 23 | 24 | mortality: 25 | 48.0: 7508 26 | -------------------------------------------------------------------------------- 
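The `feature_dimension` entries above must match the width of the feature tensors actually produced by FIDDLE for each cohort and prediction horizon (as the top-level README notes, these numbers may need updating after re-extraction). A quick way to check the extracted dimensions; the paths below are placeholders for wherever the FIDDLE output lives:

```python
import sparse

# Placeholder paths: point these at the FIDDLE output for one cohort.
s = sparse.load_npz('../data/features/ARF_4.0h/s.npz')  # time-invariant features, shape (N, d)
X = sparse.load_npz('../data/features/ARF_4.0h/X.npz')  # time-dependent features, shape (N, L, D)
print(s.shape, X.shape)
```

Whether a `feature_dimension` entry should equal D alone or d + D depends on how `lib/data.py` fuses `s` with `X`; if the numbers disagree, update the config rather than the data.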
/eicu_experiments/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 | # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, y_score, task, T, dt, model_name): 54 | import pathlib 55 | pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(task, T, dt)).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = './output/outcome={}.T={}.dt={}/{}.test.npz'.format(task, T, dt, model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = 
self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, '{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/lib/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CNN_V3(nn.Module): 7 | """ 8 | Multilayer CNN with 1D convolutions 9 | """ 10 | def __init__( 11 | self, 12 | in_channels, 13 | L_in, 14 | output_size, 15 | depth=2, 16 | filter_size=3, 17 | n_filters=64, 18 | n_neurons=64, 19 | dropout=0.2, 20 | activation='relu', 21 | ): 22 | super().__init__() 23 | self.depth = depth 24 | if activation == 'relu': 25 | self.activation = F.relu 26 | elif activation == 'elu': 27 | self.activation = F.elu 28 | padding = int(np.floor(filter_size / 2)) 29 | 30 | if depth == 1: 31 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 32 | self.pool1 = nn.MaxPool1d(2, 2) 33 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 2), n_neurons) 34 | self.fc1_drop = nn.Dropout(dropout) 35 | self.fc2 = nn.Linear(n_neurons, 1) 36 | 37 | elif depth == 2: 38 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 39 | self.pool1 = nn.MaxPool1d(2, 2) 40 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 41 | self.pool2 = nn.MaxPool1d(2, 2) 42 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 4), n_neurons) 43 | self.fc1_drop = nn.Dropout(dropout) 44 | self.fc2 = nn.Linear(n_neurons, 1) 45 | 46 | elif depth == 3: 47 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 48 | self.pool1 = nn.MaxPool1d(2, 2) 49 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 50 | self.pool2 = nn.MaxPool1d(2, 2) 51 | self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 52 | self.pool3 = nn.MaxPool1d(2, 2) 53 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 8), n_neurons) 54 | self.fc1_drop = nn.Dropout(dropout) 55 | self.fc2 = nn.Linear(n_neurons, 1) 56 | 57 | def forward(self, x): 58 | # x: tensor (batch_size, L_in, in_channels) 59 | x = x.transpose(1,2) # swap time and feature axes 60 | 61 | x = self.pool1(self.activation(self.conv1(x))) 62 | if self.depth == 2 or self.depth == 3: 63 | x = 
self.pool2(self.activation(self.conv2(x))) 64 | if self.depth == 3: 65 | x = self.pool3(self.activation(self.conv3(x))) 66 | 67 | x = x.view(x.size(0), -1) # flatten 68 | x = self.activation(self.fc1_drop(self.fc1(x))) 69 | x = torch.sigmoid(self.fc2(x)) 70 | return x 71 | 72 | class RNN_V2(nn.Module): 73 | """ 74 | Multi-layer LSTM network 75 | """ 76 | def __init__( 77 | self, 78 | input_size, 79 | input_length, 80 | output_size, 81 | hidden_size=64, 82 | num_layers=1, 83 | dropout=0.0, 84 | n_neurons=64, 85 | activation='relu', 86 | ): 87 | super().__init__() 88 | if activation == 'relu': 89 | self.activation = F.relu 90 | elif activation == 'elu': 91 | self.activation = F.elu 92 | 93 | self.hidden_size = int(hidden_size) 94 | self.num_layers = int(num_layers) 95 | 96 | self.lstm = nn.LSTM(int(input_size), int(hidden_size), int(num_layers), batch_first=True) 97 | self.fc1 = nn.Linear(hidden_size, n_neurons) 98 | self.fc1_drop = nn.Dropout(dropout) 99 | self.fc2 = nn.Linear(n_neurons, output_size) 100 | 101 | def forward(self, x): 102 | # x: tensor (batch_size, T, input_size) 103 | # h_all: (batch_size, T, hidden_size) 104 | h_0, c_0 = self.init_hidden(x) 105 | h_all, (h_T, c_T) = self.lstm(x, (h_0, c_0)) 106 | output = h_T[-1] 107 | output = self.activation(self.fc1_drop(self.fc1(output))) 108 | output = torch.sigmoid(self.fc2(output)) 109 | return output 110 | 111 | def init_hidden(self, x): 112 | batch_size = x.size(0) 113 | return (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device), 114 | torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)) -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_deep.py: -------------------------------------------------------------------------------- 1 | # python run_deep.py --outcome=ARF --T=4 --dt=0.5 --model_type=CNN --cuda=7 2 | 3 | import sys, os, time, pickle, random 4 | import pandas as pd 5 | import numpy as np 6 | import pathlib 7 | pathlib.Path('log').mkdir(parents=True, exist_ok=True) 8 | 9 | import yaml 10 | with open('config.yaml') as f: 11 | config = yaml.load(f) 12 | 13 | ######## 14 | ## Constants 15 | data_path = config['data_path'] 16 | model_names = config['model_names'] 17 | 18 | budget = config['train']['budget'] # Number of randomized hyperparameter settings to try 19 | repeat = config['train']['repeat'] # 1 # number of restarts (with different seeds) for each setting 20 | epochs = config['train']['epochs'] # 15 # Max epochs for each setting 21 | 22 | # Feature dimensions 23 | dimensions = config['feature_dimension'] 24 | 25 | # Hyperparameter search space 26 | train_param_grid = { 27 | 'batch_size': [16, 32, 64, 128], 28 | 'lr': [1e-2, 1e-3, 1e-4], 29 | } 30 | CNN_param_grid = { 31 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 32 | 'depth': [1, 2],#, 3], 33 | 'filter_size': [1, 2, 3, 4], 34 | 'n_filters': [16, 32, 64, 128], 35 | 'n_neurons': [16, 32, 64, 128], 36 | 'activation': ['relu', 'elu'], 37 | } 38 | RNN_param_grid = { 39 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 40 | 'num_layers': [1, 2, 3], 41 | 'hidden_size': [16, 32, 64, 128], 42 | 'n_neurons': [16, 32, 64, 128], 43 | 'activation': ['relu', 'elu'], 44 | } 45 | 46 | training_params = {'batch_size', 'lr'} 47 | 48 | ######## 49 | 50 | import argparse 51 | 52 | parser = argparse.ArgumentParser(description='') 53 | 54 | parser.add_argument('--outcome', type=str, required=True) 55 | parser.add_argument('--T', type=float, required=True) 56 | parser.add_argument('--dt', type=float, 
required=True) 57 | parser.add_argument('--model_type', type=str, required=True) 58 | parser.add_argument('--cuda', type=int, default=7) 59 | parser.add_argument('--seed', type=int, default=42) 60 | 61 | args = parser.parse_args() 62 | 63 | task = args.outcome 64 | model_type = args.model_type 65 | 66 | T = float(args.T) 67 | dt = float(args.dt) 68 | L_in = int(np.floor(T / dt)) 69 | in_channels = dimensions[task][float(T)] 70 | 71 | import lib.models as models 72 | model_name = model_names[model_type] 73 | ModelClass = getattr(models, model_name) 74 | 75 | if model_type == 'CNN': 76 | param_grid = {**train_param_grid, **CNN_param_grid} 77 | elif model_type == 'RNN': 78 | param_grid = {**train_param_grid, **RNN_param_grid} 79 | else: 80 | assert False 81 | 82 | # Create checkpoint directories 83 | import pathlib 84 | pathlib.Path("./checkpoint/model={}.outcome={}.T={}.dt={}/".format(model_name, task, T, dt)).mkdir(parents=True, exist_ok=True) 85 | 86 | ## Data 87 | import lib.data as data 88 | if task == 'mortality': 89 | tr_loader, va_loader, te_loader = data.get_benchmark_splits(fuse=True) 90 | else: 91 | tr_loader, va_loader, te_loader = data.get_train_val_test(task, duration=T, timestep=dt, fuse=True) 92 | 93 | import torch 94 | from torch.utils.data import Dataset, DataLoader 95 | from sklearn.model_selection import StratifiedShuffleSplit 96 | 97 | # Set CUDA 98 | if args.cuda: 99 | torch.cuda.set_device(args.cuda) 100 | print('cuda', torch.cuda.current_device()) 101 | 102 | if args.seed: 103 | torch.manual_seed(args.seed) 104 | np.random.seed(args.seed) 105 | random.seed(args.seed) 106 | 107 | 108 | from lib.experiment import Experiment 109 | 110 | class MIMICExperiment(Experiment): 111 | def get_model_params(self, params): 112 | model = ModelClass( 113 | in_channels, L_in, 1, 114 | **{k:params[k] for k in params.keys() if k not in training_params} 115 | ) 116 | criterion = torch.nn.BCELoss() 117 | optimizer = torch.optim.Adam(model.parameters(), lr=params['lr']) 118 | return model, criterion, optimizer 119 | 120 | def get_data(self): 121 | return tr_loader, va_loader 122 | 123 | exp = MIMICExperiment( 124 | param_grid, name='model={}.outcome={}.T={}.dt={}'.format(model_name, task, T, dt), 125 | budget=budget, n_epochs=epochs, repeat=repeat, 126 | ) 127 | 128 | print('EXPERIMENT:', exp.name) 129 | 130 | df_search = exp.run() 131 | df_search.to_csv('./log/df_search.{}.csv'.format(exp.name), index=False) 132 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_deep_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | cuda=0 5 | 6 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,CNN.log' 7 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,RNN.log' 8 | 9 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,CNN.log' 10 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,RNN.log' 11 | 12 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,CNN.log' 13 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,RNN.log' 14 | 15 | python run_deep.py 
--outcome=Shock --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,CNN.log' 16 | python run_deep.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,RNN.log' 17 | 18 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,CNN.log' 19 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,RNN.log' 20 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_deep_eval.py: -------------------------------------------------------------------------------- 1 | import sys, os, time, pickle, random 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import yaml 6 | with open('config.yaml') as f: 7 | config = yaml.load(f) 8 | 9 | ######## 10 | ## Constants 11 | model_names = config['model_names'] 12 | training_params = {'batch_size', 'lr'} 13 | 14 | # Feature dimensions 15 | dimensions = config['feature_dimension'] 16 | 17 | ######## 18 | 19 | def main(task, T, dt, model_type): 20 | L_in = int(np.floor(T / dt)) 21 | in_channels = dimensions[task][T] 22 | 23 | import lib.models as models 24 | model_name = model_names[model_type] 25 | ModelClass = getattr(models, model_name) 26 | df_search = pd.read_csv('./log/df_search.model={}.outcome={}.T={}.dt={}.csv'.format(model_name, task, T, dt)) 27 | import lib.evaluate as evaluate 28 | best_model_info = evaluate.get_best_model_info(df_search) 29 | checkpoint, model = evaluate.load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params) 30 | 31 | 32 | import lib.data as data 33 | if task == 'mortality': 34 | te_loader = data.get_benchmark_test(fuse=True) 35 | else: 36 | te_loader = data.get_test(task, duration=T, timestep=dt, fuse=True) 37 | 38 | y_true, y_score = evaluate.get_test_predictions(model, te_loader, '{}_T={}_dt={}'.format(task, T, dt), model_name) 39 | evaluate.save_test_predictions(y_true, y_score, task, T, dt, model_name) 40 | 41 | from sklearn import metrics, utils 42 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 43 | fig = plt.figure(figsize=(5,5)) 44 | plt.xlabel('False Positive Rate') 45 | plt.ylabel('True Positive Rate') 46 | plt.xlim(0,1) 47 | plt.ylim(0,1) 48 | plt.plot([0,1], [0,1], ':') 49 | plt.plot(fpr, tpr, color='darkorange') 50 | plt.show() 51 | 52 | ## Bootstrapped 95% Confidence Interval 53 | # try: 54 | # yte_pred = clf.decision_function(Xte) 55 | # except AttributeError: 56 | # yte_pred = clf.predict_proba(Xte)[:,1] 57 | from sklearn.externals.joblib import Parallel, delayed 58 | from tqdm import tqdm_notebook as tqdm 59 | def func(i): 60 | yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i) 61 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 62 | 63 | test_scores = Parallel(n_jobs=16)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 64 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 65 | 66 | # idx = (np.abs(tpr - 0.5)).argmin() 67 | # y_pred = (y_score > thresholds[idx]) 68 | # metrics.roc_auc_score(y_true, y_score) 69 | 70 | precision, recall, thresholds_ = metrics.precision_recall_curve(y_true, y_score) 71 | fig = plt.figure(figsize=(5,5)) 72 | plt.xlabel('Recall') 73 | plt.ylabel('Precision') 74 | plt.xlim(0,1) 75 | plt.ylim(0,1) 76 | plt.plot(recall, 
precision, color='darkorange') 77 | plt.show() 78 | 79 | # target TPR = 50% 80 | idx = (np.abs(tpr - 0.5)).argmin() 81 | y_pred = (y_score > thresholds[idx]) 82 | metrics.roc_auc_score(y_true, y_score) 83 | 84 | pd.DataFrame([{ 85 | 'tpr': tpr[idx], 86 | 'fpr': fpr[idx], 87 | 'ppv': metrics.precision_score(y_true, y_pred), 88 | }]) 89 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_shallow_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | python3 run_shallow.py --outcome=ARF --T=4.0 --model_type=LR \ 7 | > >(tee 'log/outcome=ARF,T=4.0,dt=1.0,LR.out') \ 8 | 2> >(tee 'log/outcome=ARF,T=4.0,dt=1.0,LR.err' >&2) 9 | 10 | python3 run_shallow.py --outcome=Shock --T=4.0 --model_type=LR \ 11 | > >(tee 'log/outcome=Shock,T=4.0,dt=1.0,LR.out') \ 12 | 2> >(tee 'log/outcome=Shock,T=4.0,dt=1.0,LR.err' >&2) 13 | 14 | python3 run_shallow.py --outcome=ARF --T=4.0 --model_type=RF \ 15 | > >(tee 'log/outcome=ARF,T=4.0,dt=1.0,RF.out') \ 16 | 2> >(tee 'log/outcome=ARF,T=4.0,dt=1.0,RF.err' >&2) 17 | 18 | python3 run_shallow.py --outcome=Shock --T=4.0 --model_type=RF \ 19 | > >(tee 'log/outcome=Shock,T=4.0,dt=1.0,RF.out') \ 20 | 2> >(tee 'log/outcome=Shock,T=4.0,dt=1.0,RF.err' >&2) 21 | 22 | 23 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=ARF,T=12.0,dt=1.0,LR.log' 24 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=Shock,T=12.0,dt=1.0,LR.log' 25 | 26 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=ARF,T=12.0,dt=1.0,RF.log' 27 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=Shock,T=12.0,dt=1.0,RF.log' 28 | 29 | 30 | python run_shallow.py --outcome=mortality --T=48.0 --model_type=LR \ 31 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.out') \ 32 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.err' >&2) 33 | python run_shallow.py --outcome=mortality --T=48.0 --model_type=RF \ 34 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.out') \ 35 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.err' >&2) 36 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: FIDDLE-env 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - sparse 8 | - pandas 9 | - tqdm 10 | - pyyaml 11 | - scikit-learn 12 | - numpy 13 | - joblib 14 | - ipykernel 15 | - matplotlib 16 | prefix: /home/tangsp/miniconda3/envs/FIDDLE-env 17 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_mask+Dt/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_mask+Dt/config.yaml: 
-------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_mask+Dt/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = 
{}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_maskonly/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_maskonly/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_maskonly/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | 
parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_medianimpute/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = 
config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_medianimpute/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_medianimpute/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 
70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_noimpute/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_noimpute/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_noimpute/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', 
type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_ordinal/config.py: 
-------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | use_ordinal_encoding = config['use_ordinal_encoding'] 11 | value_type_override = config['value_types'] 12 | 13 | parallel = True 14 | n_jobs = 72 15 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_ordinal/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | use_ordinal_encoding: yes 9 | 10 | value_types: 11 | # enter the feature type that you would like to override in the following format: 12 | FIRST_WARDID: Categorical 13 | MedA: 14 | AMOUNT: Numeric 15 | ROUTE: Categorical 16 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_ordinal/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = 
pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/impute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.1 7 | θ₂ = 0.1 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.1) 22 | Total variables : 5405 23 | Rare variables : 4400 24 | Remaining variables : 1005 25 | # rows (original) : 33684409 26 | # rows (filtered) : 30906331 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/impute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate 
time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 993 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 30803407 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.103816 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.513918 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 115 61 | Correlated : 3 62 | Time elapsed: 1.529421 seconds 63 | 64 | Output 65 | s: shape=(8577, 34), density=0.267 66 | Total time: 1.560092 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 993 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 987 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 379890514 out of 8577×48×987=406343952 total 84 | 85 | (N × L × ^D) table : (8577, 48, 1023) 86 | Time elapsed: 282.528516 seconds 87 | Discretizing features... 88 | 89 | Processing 1017 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
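The log line above reports FIDDLE's discretization step: each non-boolean numeric column is split into up to five equal-frequency (quintile) bins before being turned into binary indicators. A minimal sketch of that idea, assuming pandas and made-up values rather than the repo's steps.py implementation:

import pandas as pd
# hypothetical numeric measurements for one variable (not taken from the dataset)
values = pd.Series([36.5, 37.0, 37.2, 38.1, 36.8, 39.0, 37.5, 36.9, 38.4, 37.8])
bins = pd.qcut(values, q=5, duplicates='drop')   # five roughly equal-frequency bins
print(bins.value_counts().sort_index())          # two observations per quintile here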
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 3071), density=0.032 96 | Time elapsed: 508.261614 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 3071) 0.03224463116576165 102 | Original : 3071 103 | Nearly-constant: 1598 104 | *** time: 649.6199653148651 105 | Correlated : 75 106 | *** time: 1411.193752527237 107 | 108 | Output 109 | X: shape=(8577, 48, 1398), density=0.060 110 | (8577, 48, 1398) 0.059674307789588654 111 | Time elapsed: 1919.462684 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 1398), density=0.060 115 | Total time: 1943.168096 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.138054 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.914684 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.935639 
seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.982121 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3881) 86 | Time elapsed: 2208.715729 seconds 87 | Discretizing features... 88 | 89 | Processing 3875 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9641), density=0.009 96 | Time elapsed: 6289.449163 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9641) 0.009134210670151034 102 | Original : 9641 103 | Nearly-constant: 2072 104 | *** time: 4570.7659068107605 105 | Correlated : 334 106 | *** time: 9981.303814411163 107 | 108 | Output 109 | X: shape=(8577, 48, 7235), density=0.011 110 | (8577, 48, 7235) 0.010731039202925532 111 | Time elapsed: 16270.757611 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7235), density=0.011 115 | Total time: 16299.821350 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | 
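For the maskonly ablation whose log this is, the time-dependent transform keeps only whether a variable was recorded in each bin (the imputed-entry counts below are zero) rather than any carried-forward value. A minimal sketch, assuming pandas/numpy and a hypothetical hourly series rather than the repo's own code, of deriving such a presence mask:

import numpy as np
import pandas as pd
# hypothetical hourly heart-rate bins; NaN means no measurement in that hour
hr = pd.Series([80.0, np.nan, np.nan, 92.0, np.nan, 88.0])
mask = hr.notna().astype(int)   # 1 if any value was recorded in the bin, else 0
print(mask.tolist())            # [1, 0, 0, 1, 0, 1]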
-------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.144644 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.576981 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.595947 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.633576 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 0 82 | (freq) number of not imputed entries : 0 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3875) 86 | Time elapsed: 1895.090169 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
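The next step in the log, converting variables to binary features, one-hot encodes each binned or categorical column. A minimal sketch of that conversion, assuming pandas and a hypothetical binned column (not the repo's own code):

import pandas as pd
binned = pd.Series(['Q1', 'Q3', 'Q5', 'Q3', 'Q1'], name='HR_bin')  # hypothetical quintile labels
onehot = pd.get_dummies(binned, prefix='HR')   # one 0/1 indicator column per observed bin
print(onehot)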
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9635), density=0.009 96 | Time elapsed: 6725.114749 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9635) 0.008517169182244537 102 | Original : 9635 103 | Nearly-constant: 2066 104 | *** time: 3142.791482448578 105 | Correlated : 334 106 | *** time: 7060.699452161789 107 | 108 | Output 109 | X: shape=(8577, 48, 7235), density=0.011 110 | (8577, 48, 7235) 0.010731039202925532 111 | Time elapsed: 13785.849776 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7235), density=0.011 115 | Total time: 13810.326319 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.1 7 | θ₂ = 0.1 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.1) 22 | Total variables : 5405 23 | Rare variables : 4400 24 | Remaining variables : 1005 25 | # rows (original) : 33684409 26 | # rows (filtered) : 30906331 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 993 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 30803407 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.111821 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.643674 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 115 61 | Correlated : 3 62 | Time elapsed: 
1.659079 seconds 63 | 64 | Output 65 | s: shape=(8577, 34), density=0.267 66 | Total time: 1.689984 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 993 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 987 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 379890514 out of 8577×48×987=406343952 total 84 | 85 | (N × L × ^D) table : (8577, 48, 1023) 86 | Time elapsed: 599.909334 seconds 87 | Discretizing features... 88 | 89 | Processing 1017 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 3071), density=0.032 96 | Time elapsed: 882.033273 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 3071) 0.03243142271843646 102 | Original : 3071 103 | Nearly-constant: 1602 104 | *** time: 656.6613774299622 105 | Correlated : 72 106 | *** time: 1418.9878075122833 107 | 108 | Output 109 | X: shape=(8577, 48, 1397), density=0.059 110 | (8577, 48, 1397) 0.05935146891854264 111 | Time elapsed: 2301.027611 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 1397), density=0.059 115 | Total time: 2324.529186 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 10000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | 
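In this nofreq variant, θ_freq is raised (10000.0 here) so that no variable qualifies as frequent, and the per-bin summary statistics are skipped entirely (the transform below reports an empty frequent-variable list). For contrast, a minimal sketch, assuming pandas and hypothetical data, of the k = 3 summaries (min, max, mean) that would otherwise be computed for a frequent variable in each dt-hour bin:

import pandas as pd
# hypothetical heart-rate measurements with timestamps in hours, dt = 1.0
df = pd.DataFrame({'t': [0.2, 0.7, 1.5, 2.1, 2.9], 'HR': [80, 84, 90, 88, 86]})
df['bin'] = (df['t'] // 1.0).astype(int)                    # hourly bin index
print(df.groupby('bin')['HR'].agg(['min', 'max', 'mean']))  # the k = 3 statistics per bin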
-------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.134358 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.595397 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.615305 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.661497 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×48×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 1561455085 out of 8577×48×3869=1592851824 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3869) 86 | Time elapsed: 1273.130074 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9629), density=0.008 96 | Time elapsed: 7504.357979 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9629) 0.007920027846763844 102 | Original : 9629 103 | Nearly-constant: 2066 104 | *** time: 3209.2778713703156 105 | Correlated : 334 106 | *** time: 7071.120953321457 107 | 108 | Output 109 | X: shape=(8577, 48, 7229), density=0.010 110 | (8577, 48, 7229) 0.009937486747645477 111 | Time elapsed: 14575.489017 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7229), density=0.010 115 | Total time: 14597.368192 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 12.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1000000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 4 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.234322 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.939724 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time 
elapsed: 2.005332 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 2.075384 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×4×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 122456605 out of 8577×4×3869=132737652 total 84 | 85 | (N × L × ^D) table : (8577, 4, 3869) 86 | Time elapsed: 442.025943 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 4, 9625), density=0.031 96 | Time elapsed: 593.201395 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 4, 9625) 0.031134450485971996 102 | Original : 9625 103 | Nearly-constant: 2171 104 | *** time: 423.86314630508423 105 | Correlated : 333 106 | *** time: 831.847085237503 107 | 108 | Output 109 | X: shape=(8577, 4, 7121), density=0.040 110 | (8577, 4, 7121) 0.04025364075537859 111 | Time elapsed: 1425.054848 seconds 112 | 113 | Output 114 | X: shape=(8577, 4, 7121), density=0.040 115 | Total time: 1430.915702 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 4.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 100000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 12 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant 
and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.161524 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.528967 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.584104 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.633917 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×12×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 377580723 out of 8577×12×3869=398212956 total 84 | 85 | (N × L × ^D) table : (8577, 12, 3869) 86 | Time elapsed: 533.026957 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 12, 9661), density=0.021 96 | Time elapsed: 1533.176079 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 12, 9661) 0.020749493283425048 102 | Original : 9661 103 | Nearly-constant: 2121 104 | *** time: 1299.0333437919617 105 | Correlated : 332 106 | *** time: 2987.5951042175293 107 | 108 | Output 109 | X: shape=(8577, 12, 7208), density=0.027 110 | (8577, 12, 7208) 0.02676353172417211 111 | Time elapsed: 4520.922278 seconds 112 | 113 | Output 114 | X: shape=(8577, 12, 7208), density=0.027 115 | Total time: 4537.501810 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 48.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1000000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 1 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.115656 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.586363 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time 
elapsed: 1.605498 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.643156 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×1×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 29214964 out of 8577×1×3869=33184413 total 84 | 85 | (N × L × ^D) table : (8577, 1, 3869) 86 | Time elapsed: 421.370643 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 1, 9433), density=0.049 96 | Time elapsed: 502.070196 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 1, 9433) 0.04906196992662215 102 | Original : 9433 103 | Nearly-constant: 2211 104 | *** time: 139.90465545654297 105 | Correlated : 347 106 | *** time: 263.2418613433838 107 | 108 | Output 109 | X: shape=(8577, 1, 6875), density=0.064 110 | (8577, 1, 6875) 0.0636226525485707 111 | Time elapsed: 765.346615 seconds 112 | 113 | Output 114 | X: shape=(8577, 1, 6875), density=0.064 115 | Total time: 767.538363 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.1 7 | θ₂ = 0.1 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.1) 22 | Total variables : 5405 23 | Rare variables : 4400 24 | Remaining variables : 1005 25 | # rows (original) : 33684409 26 | # rows (filtered) : 30906331 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and 
time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 993 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 30803407 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.103230 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.321447 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 115 61 | Correlated : 3 62 | Time elapsed: 1.335476 seconds 63 | 64 | Output 65 | s: shape=(8577, 34), density=0.267 66 | Total time: 1.366069 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 993 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 987 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 379890514 out of 8577×48×987=406343952 total 84 | 85 | (N × L × ^D) table : (8577, 48, 1023) 86 | Time elapsed: 459.223514 seconds 87 | Discretizing features... 88 | 89 | Processing 1017 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 3071), density=0.032 96 | Time elapsed: 697.539552 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 3071) 0.03193670873492995 102 | Original : 3071 103 | Nearly-constant: 1598 104 | *** time: 651.8644843101501 105 | Correlated : 76 106 | *** time: 1414.6409630775452 107 | 108 | Output 109 | X: shape=(8577, 48, 1397), density=0.058 110 | (8577, 48, 1397) 0.05843776333619845 111 | Time elapsed: 2112.187592 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 1397), density=0.058 115 | Total time: 2135.861422 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.256884 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 2.509025 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 55 61 | Correlated : 3 62 | Time elapsed: 2.586198 
seconds 63 | 64 | Output 65 | s: shape=(8577, 94), density=0.140 66 | Total time: 2.659438 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3899) 86 | Time elapsed: 1602.799133 seconds 87 | Discretizing features... 88 | 89 | Processing 3893 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9950), density=0.016 96 | Time elapsed: 5116.279016 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9950) 0.015691314360071314 102 | Original : 9950 103 | Nearly-constant: 1642 104 | *** time: 5924.276056051254 105 | Correlated : 359 106 | *** time: 12414.71133685112 107 | 108 | Output 109 | X: shape=(8577, 48, 7949), density=0.018 110 | (8577, 48, 7949) 0.017977339510562455 111 | Time elapsed: 17531.153291 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7949), density=0.018 115 | Total time: 17576.561756 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/theta=0.001,medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | 
-------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.617579 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 3.354901 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 3.446176 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 3.512502 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3899) 86 | Time elapsed: 1575.769876 seconds 87 | Discretizing features... 88 | 89 | Processing 3893 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9719), density=0.011 96 | Time elapsed: 6665.714383 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9719) 0.010910535315619061 102 | Original : 9719 103 | Nearly-constant: 2072 104 | *** time: 3580.4881682395935 105 | Correlated : 340 106 | *** time: 8112.426522254944 107 | 108 | Output 109 | X: shape=(8577, 48, 7307), density=0.013 110 | (8577, 48, 7307) 0.012678978253797912 111 | Time elapsed: 14778.144839 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7307), density=0.013 115 | Total time: 14806.507060 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/theta=0.001,noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.112251 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.584486 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | 
Time elapsed: 1.613765 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.665902 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3899) 86 | Time elapsed: 1101.718930 seconds 87 | Discretizing features... 88 | 89 | Processing 3893 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9719), density=0.011 96 | Time elapsed: 3553.558418 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9719) 0.010754216080785386 102 | Original : 9719 103 | Nearly-constant: 2072 104 | *** time: 3614.3632407188416 105 | Correlated : 341 106 | *** time: 7846.658187866211 107 | 108 | Output 109 | X: shape=(8577, 48, 7306), density=0.012 110 | (8577, 48, 7306) 0.012370484208457008 111 | Time elapsed: 11400.220244 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7306), density=0.012 115 | Total time: 11427.416120 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_mask+Dt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_mask+Dt.run \ 9 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_maskonly.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_maskonly.run \ 9 | 
--input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_nofreq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../../FIDDLE/" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | mkdir -p "../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0/" 8 | mkdir -p "../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0/" 9 | 10 | # python -m FIDDLE.run \ 11 | # --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 12 | # --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 13 | # --data_path="../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 14 | # --T=48.0 \ 15 | # --dt=1.0 \ 16 | # --theta_1=0.001 \ 17 | # --theta_2=0.001 \ 18 | # --theta_freq=1000000 \ 19 | # --stats_functions 'min' 'max' 'mean' \ 20 | # > >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 21 | # 2> >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 22 | 23 | python -m FIDDLE.run \ 24 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 25 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 26 | --data_path="../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0/" \ 27 | --T=48.0 \ 28 | --dt=12.0 \ 29 | --theta_1=0.001 \ 30 | --theta_2=0.001 \ 31 | --theta_freq=1000000 \ 32 | --stats_functions 'min' 'max' 'mean' \ 33 | > >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.out') \ 34 | 2> >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.err' >&2) & 35 | 36 | python -m FIDDLE.run \ 37 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 38 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 39 | --data_path="../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0/" \ 40 | --T=48.0 \ 41 | --dt=48.0 \ 42 | --theta_1=0.001 \ 43 | --theta_2=0.001 \ 44 | --theta_freq=1000000 \ 45 | --stats_functions 'min' 'max' 'mean' \ 46 | > >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.out') \ 47 | 2> >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.err' >&2) & 48 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_noimpute.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_noimpute.run \ 9 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_ordinal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_ordinal.run \ 9 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/processed/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4143 18 | 12.0: 4912 19 | 20 | Shock: 21 | 4.0 : 4620 22 | 12.0: 5597 23 | 24 | mortality: 25 | 48.0: 7508 26 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 
| # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, y_score, model_name, save_dir): 54 | import pathlib 55 | pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = save_dir + '/{}.test.npz'.format(model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, 
'{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/lib/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CNN_V3(nn.Module): 7 | """ 8 | Multilayer CNN with 1D convolutions 9 | """ 10 | def __init__( 11 | self, 12 | in_channels, 13 | L_in, 14 | output_size, 15 | depth=2, 16 | filter_size=3, 17 | n_filters=64, 18 | n_neurons=64, 19 | dropout=0.2, 20 | activation='relu', 21 | ): 22 | super().__init__() 23 | self.depth = depth 24 | if activation == 'relu': 25 | self.activation = F.relu 26 | elif activation == 'elu': 27 | self.activation = F.elu 28 | padding = int(np.floor(filter_size / 2)) 29 | 30 | if depth == 1: 31 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 32 | self.pool1 = nn.MaxPool1d(2, 2) 33 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 2), n_neurons) 34 | self.fc1_drop = nn.Dropout(dropout) 35 | self.fc2 = nn.Linear(n_neurons, 1) 36 | 37 | elif depth == 2: 38 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 39 | self.pool1 = nn.MaxPool1d(2, 2) 40 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 41 | self.pool2 = nn.MaxPool1d(2, 2) 42 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 4), n_neurons) 43 | self.fc1_drop = nn.Dropout(dropout) 44 | self.fc2 = nn.Linear(n_neurons, 1) 45 | 46 | elif depth == 3: 47 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 48 | self.pool1 = nn.MaxPool1d(2, 2) 49 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 50 | self.pool2 = nn.MaxPool1d(2, 2) 51 | self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 52 | self.pool3 = nn.MaxPool1d(2, 2) 53 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 8), n_neurons) 54 | self.fc1_drop = nn.Dropout(dropout) 55 | self.fc2 = nn.Linear(n_neurons, 1) 56 | 57 | def forward(self, x): 58 | # x: tensor (batch_size, L_in, in_channels) 59 | x = x.transpose(1,2) # swap time and feature axes 60 | 61 | x = self.pool1(self.activation(self.conv1(x))) 62 | if self.depth == 2 or self.depth == 3: 63 | x = self.pool2(self.activation(self.conv2(x))) 64 | if self.depth == 3: 65 | x = self.pool3(self.activation(self.conv3(x))) 66 | 67 | x = x.view(x.size(0), -1) # flatten 68 | x = self.activation(self.fc1_drop(self.fc1(x))) 69 | x = torch.sigmoid(self.fc2(x)) 70 | return x 71 | 72 | class RNN_V2(nn.Module): 73 | """ 74 | Multi-layer LSTM network 75 | """ 76 | def __init__( 77 | self, 78 | input_size, 79 | input_length, 80 | output_size, 81 | hidden_size=64, 82 | num_layers=1, 83 | dropout=0.0, 84 | n_neurons=64, 85 | activation='relu', 86 | ): 87 | super().__init__() 88 | if activation == 'relu': 89 | self.activation = F.relu 90 | elif activation == 'elu': 91 | self.activation = F.elu 92 | 93 | self.hidden_size = int(hidden_size) 94 | self.num_layers = int(num_layers) 95 | 96 | self.lstm = nn.LSTM(int(input_size), int(hidden_size), int(num_layers), batch_first=True) 97 | self.fc1 = nn.Linear(hidden_size, n_neurons) 98 | 
self.fc1_drop = nn.Dropout(dropout) 99 | self.fc2 = nn.Linear(n_neurons, output_size) 100 | 101 | def forward(self, x): 102 | # x: tensor (batch_size, T, input_size) 103 | # h_all: (batch_size, T, hidden_size) 104 | h_0, c_0 = self.init_hidden(x) 105 | h_all, (h_T, c_T) = self.lstm(x, (h_0, c_0)) 106 | output = h_T[-1] 107 | output = self.activation(self.fc1_drop(self.fc1(output))) 108 | output = torch.sigmoid(self.fc2(output)) 109 | return output 110 | 111 | def init_hidden(self, x): 112 | batch_size = x.size(0) 113 | return (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device), 114 | torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)) -------------------------------------------------------------------------------- /mimic3_comparisons/2_apply_FIDDLE/log/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: ../data/processed//features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.4 7 | θ₂ = 0.4 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.4) 22 | Total variables : 5405 23 | Rare variables : 4965 24 | Remaining variables : 440 25 | # rows (original) : 33684409 26 | # rows (filtered) : 24750654 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/processed//features,comparison/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 428 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 24647730 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.120688 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.519710 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 147 61 | Correlated : 1 62 | Time elapsed: 1.528469 seconds 63 | 64 | Output 65 | s: shape=(8577, 4), density=0.487 66 | Total time: 1.539292 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 
70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 428 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 422 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 153244621 out of 8577×48×422=173735712 total 84 | 85 | (N × L × ^D) table : (8577, 48, 458) 86 | Time elapsed: 334.029715 seconds 87 | Discretizing features... 88 | 89 | Processing 452 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 1536), density=0.055 96 | Time elapsed: 546.195059 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 1536) 0.055039622556247 102 | Original : 1536 103 | Nearly-constant: 998 104 | *** time: 282.423082113266 105 | Correlated : 16 106 | *** time: 628.0092172622681 107 | 108 | Output 109 | X: shape=(8577, 48, 522), density=0.128 110 | (8577, 48, 522) 0.1278138950795223 111 | Time elapsed: 1174.211754 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 522), density=0.128 115 | Total time: 1197.517308 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_comparisons/2_apply_FIDDLE/run_mortality_dt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../FIDDLE/" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p log 7 | mkdir -p "$DATAPATH/features,comparison/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 8 | mkdir -p "$DATAPATH/features,comparison/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 9 | mkdir -p "$DATAPATH/features,comparison/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 10 | mkdir -p "$DATAPATH/features,comparison/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 11 | 12 | python -m FIDDLE.run \ 13 | --output_dir="$DATAPATH/features,comparison/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 14 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 15 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 16 | --T=48.0 \ 17 | --dt=48.0 \ 18 | --theta_1=0.001 \ 19 | --theta_2=0.001 \ 20 | --theta_freq=1 \ 21 | --stats_functions 'min' 'max' 'mean' \ 22 | > >(tee 'log/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 23 | 2> >(tee 'log/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 24 | 25 | python -m FIDDLE.run \ 26 | --output_dir="$DATAPATH/features,comparison/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 27 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 28 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 29 | --T=48.0 \ 30 | --dt=24.0 \ 31 | --theta_1=0.001 \ 32 | --theta_2=0.001 \ 33 | --theta_freq=1 \ 34 | --stats_functions 'min' 'max' 'mean' \ 35 | > 
>(tee 'log/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 36 | 2> >(tee 'log/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 37 | 38 | python -m FIDDLE.run \ 39 | --output_dir="$DATAPATH/features,comparison/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 40 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 41 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 42 | --T=48.0 \ 43 | --dt=12.0 \ 44 | --theta_1=0.001 \ 45 | --theta_2=0.001 \ 46 | --theta_freq=1 \ 47 | --stats_functions 'min' 'max' 'mean' \ 48 | > >(tee 'log/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 49 | 2> >(tee 'log/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 50 | 51 | python -m FIDDLE.run \ 52 | --output_dir="$DATAPATH/features,comparison/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 53 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 54 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 55 | --T=48.0 \ 56 | --dt=4.0 \ 57 | --theta_1=0.001 \ 58 | --theta_2=0.001 \ 59 | --theta_freq=1 \ 60 | --stats_functions 'min' 'max' 'mean' \ 61 | > >(tee 'log/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 62 | 2> >(tee 'log/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 63 | -------------------------------------------------------------------------------- /mimic3_comparisons/2_apply_FIDDLE/run_mortality_theta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../FIDDLE/" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p log 7 | mkdir -p "$DATAPATH/features,comparison/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0/" 8 | mkdir -p "$DATAPATH/features,comparison/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0/" 9 | mkdir -p "$DATAPATH/features,comparison/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0/" 10 | mkdir -p "$DATAPATH/features,comparison/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0/" 11 | mkdir -p "$DATAPATH/features,comparison/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0/" 12 | 13 | python -m FIDDLE.run \ 14 | --output_dir="$DATAPATH/features,comparison/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 15 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 16 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 17 | --T=48.0 \ 18 | --dt=1.0 \ 19 | --theta_1=0.4 \ 20 | --theta_2=0.4 \ 21 | --theta_freq=1 \ 22 | --stats_functions 'min' 'max' 'mean' \ 23 | > >(tee 'log/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 24 | 2> >(tee 'log/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 25 | 26 | python -m FIDDLE.run \ 27 | --output_dir="$DATAPATH/features,comparison/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 28 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 29 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 30 | --T=48.0 \ 31 | --dt=1.0 \ 32 | --theta_1=0.2 \ 33 | --theta_2=0.2 \ 34 | --theta_freq=1 \ 35 | --stats_functions 'min' 'max' 'mean' \ 36 | > >(tee 'log/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 37 | 2> >(tee 'log/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 38 | 39 | python -m FIDDLE.run \ 40 | 
--output_dir="$DATAPATH/features,comparison/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 41 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 42 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 43 | --T=48.0 \ 44 | --dt=1.0 \ 45 | --theta_1=0.1 \ 46 | --theta_2=0.1 \ 47 | --theta_freq=1 \ 48 | --stats_functions 'min' 'max' 'mean' \ 49 | > >(tee 'log/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 50 | 2> >(tee 'log/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 51 | 52 | python -m FIDDLE.run \ 53 | --output_dir="$DATAPATH/features,comparison/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 54 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 55 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 56 | --T=48.0 \ 57 | --dt=1.0 \ 58 | --theta_1=0.05 \ 59 | --theta_2=0.05 \ 60 | --theta_freq=1 \ 61 | --stats_functions 'min' 'max' 'mean' \ 62 | > >(tee 'log/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 63 | 2> >(tee 'log/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 64 | 65 | python -m FIDDLE.run \ 66 | --output_dir="$DATAPATH/features,comparison/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 67 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 68 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 69 | --T=48.0 \ 70 | --dt=1.0 \ 71 | --theta_1=0.01 \ 72 | --theta_2=0.01 \ 73 | --theta_freq=1 \ 74 | --stats_functions 'min' 'max' 'mean' \ 75 | > >(tee 'log/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 76 | 2> >(tee 'log/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 77 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/processed/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4143 18 | 12.0: 4912 19 | 20 | Shock: 21 | 4.0 : 4620 22 | 12.0: 5597 23 | 24 | mortality: 25 | 48.0: 7508 26 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 | # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | 
model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, y_score, model_name, save_dir): 54 | import pathlib 55 | pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = save_dir + '/{}.test.npz'.format(model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, '{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 
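    # Experiment is an abstract random-search runner: ParameterSampler draws `budget`
    # hyperparameter settings, each setting is trained `repeat` times with a different
    # random seed, and one row per trial (best validation score, best iteration, seed,
    # checkpoint path) is appended to df_search. Concrete experiments subclass this and
    # implement get_model_params(params) and get_data(), e.g. MIMICExperiment in
    # mimic3_experiments/3_ML_models/run_deep.py.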
61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/lib/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CNN_V3(nn.Module): 7 | """ 8 | Multilayer CNN with 1D convolutions 9 | """ 10 | def __init__( 11 | self, 12 | in_channels, 13 | L_in, 14 | output_size, 15 | depth=2, 16 | filter_size=3, 17 | n_filters=64, 18 | n_neurons=64, 19 | dropout=0.2, 20 | activation='relu', 21 | ): 22 | super().__init__() 23 | self.depth = depth 24 | if activation == 'relu': 25 | self.activation = F.relu 26 | elif activation == 'elu': 27 | self.activation = F.elu 28 | padding = int(np.floor(filter_size / 2)) 29 | 30 | if depth == 1: 31 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 32 | self.pool1 = nn.MaxPool1d(2, 2) 33 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 2), n_neurons) 34 | self.fc1_drop = nn.Dropout(dropout) 35 | self.fc2 = nn.Linear(n_neurons, 1) 36 | 37 | elif depth == 2: 38 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 39 | self.pool1 = nn.MaxPool1d(2, 2) 40 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 41 | self.pool2 = nn.MaxPool1d(2, 2) 42 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 4), n_neurons) 43 | self.fc1_drop = nn.Dropout(dropout) 44 | self.fc2 = nn.Linear(n_neurons, 1) 45 | 46 | elif depth == 3: 47 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 48 | self.pool1 = nn.MaxPool1d(2, 2) 49 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 50 | self.pool2 = nn.MaxPool1d(2, 2) 51 | self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 52 | self.pool3 = nn.MaxPool1d(2, 2) 53 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 8), n_neurons) 54 | self.fc1_drop = nn.Dropout(dropout) 55 | self.fc2 = nn.Linear(n_neurons, 1) 56 | 57 | def forward(self, x): 58 | # x: tensor (batch_size, L_in, in_channels) 59 | x = x.transpose(1,2) # swap time and feature axes 60 | 61 | x = self.pool1(self.activation(self.conv1(x))) 62 | if self.depth == 2 or self.depth == 3: 63 | x = self.pool2(self.activation(self.conv2(x))) 64 | if self.depth == 3: 65 | x = self.pool3(self.activation(self.conv3(x))) 66 | 67 | x = x.view(x.size(0), -1) # flatten 68 | x = self.activation(self.fc1_drop(self.fc1(x))) 69 | x = torch.sigmoid(self.fc2(x)) 70 | return x 71 | 72 | class RNN_V2(nn.Module): 73 | """ 74 | Multi-layer LSTM network 75 | """ 76 | def __init__( 77 | self, 78 | input_size, 79 | input_length, 80 | output_size, 81 | hidden_size=64, 82 | num_layers=1, 83 | dropout=0.0, 84 | n_neurons=64, 85 | activation='relu', 86 | ): 87 | super().__init__() 88 | if activation == 'relu': 89 | self.activation = F.relu 90 | elif activation == 'elu': 91 | self.activation = F.elu 92 | 93 | self.hidden_size = int(hidden_size) 94 | self.num_layers = int(num_layers) 95 | 96 | self.lstm = nn.LSTM(int(input_size), int(hidden_size), int(num_layers), batch_first=True) 97 | self.fc1 = nn.Linear(hidden_size, n_neurons) 98 | self.fc1_drop = nn.Dropout(dropout) 99 | self.fc2 = nn.Linear(n_neurons, output_size) 100 | 101 | def forward(self, x): 102 | # x: tensor (batch_size, T, input_size) 103 | # h_all: (batch_size, T, hidden_size) 104 | h_0, c_0 = self.init_hidden(x) 105 | h_all, (h_T, c_T) = 
self.lstm(x, (h_0, c_0)) 106 | output = h_T[-1] 107 | output = self.activation(self.fc1_drop(self.fc1(output))) 108 | output = torch.sigmoid(self.fc2(output)) 109 | return output 110 | 111 | def init_hidden(self, x): 112 | batch_size = x.size(0) 113 | return (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device), 114 | torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)) -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/run_shallow_dt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | export DT=48.0 7 | python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 8 | > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 9 | 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 10 | python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 11 | > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 12 | 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 13 | 14 | # export DT=24.0 15 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 16 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 17 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 18 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 19 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 20 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 21 | 22 | # export DT=12.0 23 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 24 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 25 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 26 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 27 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 28 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 29 | 30 | # export DT=4.0 31 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 32 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 33 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 34 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 35 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 36 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 37 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/run_shallow_theta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | export THETA=0.4 7 | python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 8 | > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 9 | 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 10 | python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 11 | > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 12 | 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 13 | 14 | # export THETA=0.2 15 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 
--model_type=LR \ 16 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 17 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 18 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 19 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 20 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 21 | 22 | # export THETA=0.1 23 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 24 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 25 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) 26 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 27 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 28 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 29 | 30 | # export THETA=0.05 31 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 32 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 33 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) 34 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 35 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 36 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 37 | 38 | # export THETA=0.01 39 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 40 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 41 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) 42 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 43 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 44 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 45 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), '../config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | data_path = os.path.join(os.path.dirname(__file__), config['data_path']) 6 | mimic3_path = os.path.join(os.path.dirname(__file__), config['mimic3_path']) 7 | 8 | ID_col = config['column_names']['ID'] 9 | t_col = config['column_names']['t'] 10 | var_col = config['column_names']['var_name'] 11 | val_col = config['column_names']['var_value'] 12 | 13 | parallel = True 14 | n_jobs = 72 15 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/generate_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | generate_labels.py 3 | Author: Shengpu Tang 4 | 5 | Generate labels for two adverse outcomes: ARF and shock. 
6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import scipy.stats 11 | import itertools 12 | from collections import OrderedDict, Counter 13 | from joblib import Parallel, delayed 14 | from tqdm import tqdm as tqdm 15 | import yaml 16 | data_path = yaml.full_load(open('../config.yaml'))['data_path'] 17 | 18 | import pathlib 19 | pathlib.Path(data_path, 'labels').mkdir(parents=True, exist_ok=True) 20 | 21 | examples = pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME']).sort_values(by='ICUSTAY_ID') 22 | chartevents = pd.read_pickle(data_path + 'prep/chartevents.p') 23 | procedures = pd.read_pickle(data_path + 'prep/procedureevents_mv.p') 24 | inputevents = pd.read_pickle(data_path + 'prep/inputevents_mv.p') 25 | 26 | ventilation = [ 27 | '225792', # Invasive Ventilation 28 | '225794', # Non-invasive Ventilation 29 | ] 30 | 31 | PEEP = [ 32 | '220339', # PEEP set 33 | ] 34 | 35 | vasopressors = [ 36 | '221906', # Norepinephrine 37 | '221289', # Epinephrine 38 | '221662', # Dopamine 39 | '222315', # Vasopressin 40 | '221749', # Phenylephrine 41 | ] 42 | 43 | ## ARF: (PEEP) OR (mechanical ventilation) 44 | df_PEEP = chartevents[chartevents.ITEMID.isin(PEEP)].copy() 45 | df_vent = procedures[procedures.ITEMID.isin(ventilation)].rename(columns={'t_start': 't'}).copy() 46 | df_ARF = pd.concat([df_PEEP[['ICUSTAY_ID', 't']], df_vent[['ICUSTAY_ID', 't']]], axis=0) 47 | df_ARF['ICUSTAY_ID'] = df_ARF['ICUSTAY_ID'].astype(int) 48 | df_ARF = df_ARF.sort_values(by=['ICUSTAY_ID', 't']).drop_duplicates(['ICUSTAY_ID'], keep='first').reset_index(drop=True) 49 | df_ARF = df_ARF.rename(columns={'t': 'ARF_ONSET_HOUR'}) 50 | df_ARF = pd.merge(examples[['ICUSTAY_ID']], df_ARF, on='ICUSTAY_ID', how='left') 51 | df_ARF['ARF_LABEL'] = df_ARF['ARF_ONSET_HOUR'].notnull().astype(int) 52 | print('ARF: ', dict(Counter(df_ARF['ARF_LABEL'])), 'N = {}'.format(len(df_ARF)), sep='\t') 53 | df_ARF.to_csv(data_path + 'labels/ARF.csv', index=False) 54 | 55 | ## Shock: (one of vasopressors) 56 | df_vaso = inputevents[inputevents.ITEMID.isin(vasopressors)].rename(columns={'t_start': 't'}).copy() 57 | df_shock = df_vaso.copy() 58 | df_shock['ICUSTAY_ID'] = df_shock['ICUSTAY_ID'].astype(int) 59 | df_shock = df_shock.sort_values(by=['ICUSTAY_ID', 't']).drop_duplicates(['ICUSTAY_ID'], keep='first').reset_index(drop=True) 60 | df_shock = df_shock.rename(columns={'t': 'Shock_ONSET_HOUR'}) 61 | df_shock = pd.merge(examples[['ICUSTAY_ID']], df_shock, on='ICUSTAY_ID', how='left') 62 | df_shock['Shock_LABEL'] = df_shock['Shock_ONSET_HOUR'].notnull().astype(int) 63 | print('Shock: ', dict(Counter(df_shock['Shock_LABEL'])), 'N = {}'.format(len(df_shock)), sep='\t') 64 | df_shock.to_csv(data_path + 'labels/Shock.csv', index=False) 65 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/grouped_variables.yaml: -------------------------------------------------------------------------------- 1 | HR: 2 | - 220045 # Heart Rate 3 | 4 | SysBP: 5 | - 224167 # Manual Blood Pressure Systolic Left 6 | - 227243 # Manual Blood Pressure Systolic Right 7 | - 220050 # Arterial Blood Pressure systolic 8 | - 220179 # Non Invasive Blood Pressure systolic 9 | - 225309 # ART BP Systolic 10 | 11 | DiaBP: 12 | - 224643 # Manual Blood Pressure Diastolic Left 13 | - 227242 # Manual Blood Pressure Diastolic Right 14 | - 220051 # Arterial Blood Pressure diastolic 15 | - 220180 # Non Invasive Blood Pressure diastolic 16 | - 225310 # ART BP Diastolic 17 | 18 
| RR: 19 | - 220210 # Respiratory Rate 20 | - 224690 # Respiratory Rate (Total) 21 | 22 | Temperature: 23 | - 223761 # Temperature Fahrenheit 24 | - 223762 # Temperature Celsius 25 | 26 | SpO2: 27 | - 220277 # O2 saturation pulseoxymetry 28 | 29 | Height: 30 | - 226707 # Height 31 | - 226730 # Height (cm) 32 | 33 | Weight: 34 | - 224639 # Daily Weight 35 | - 226512 # Admission Weight (Kg) 36 | - 226531 # Admission Weight (lbs.) 37 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/run_prepare_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | python prepare_input.py --outcome=ARF --T=4 --dt=1 5 | python prepare_input.py --outcome=ARF --T=12 --dt=1 6 | python prepare_input.py --outcome=Shock --T=4 --dt=1 7 | python prepare_input.py --outcome=Shock --T=12 --dt=1 8 | 9 | python prepare_input.py --outcome=mortality --T=48 --dt=1 10 | cp -r ../data/processed/features/outcome=mortality,T=48.0,dt=1.0 ../data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0 11 | -------------------------------------------------------------------------------- /mimic3_experiments/2_apply_FIDDLE/run_make_all,discretize=no.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p 'log,discretize=no' 7 | 8 | OUTCOME=ARF 9 | T=4.0 10 | dt=1.0 11 | python -m FIDDLE.run \ 12 | --output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 13 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 14 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 15 | --T=$T \ 16 | --dt=$dt \ 17 | --theta_1=0.001 \ 18 | --theta_2=0.001 \ 19 | --theta_freq=1 \ 20 | --stats_functions 'min' 'max' 'mean' \ 21 | --discretize=no \ 22 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 23 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 24 | 25 | OUTCOME=ARF 26 | T=12.0 27 | dt=1.0 28 | python -m FIDDLE.run \ 29 | --output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 30 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 31 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 32 | --T=$T \ 33 | --dt=$dt \ 34 | --theta_1=0.001 \ 35 | --theta_2=0.001 \ 36 | --theta_freq=1 \ 37 | --stats_functions 'min' 'max' 'mean' \ 38 | --discretize=no \ 39 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 40 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 41 | 42 | OUTCOME=Shock 43 | T=4.0 44 | dt=1.0 45 | python -m FIDDLE.run \ 46 | --output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 47 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 48 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 49 | --T=$T \ 50 | --dt=$dt \ 51 | --theta_1=0.001 \ 52 | --theta_2=0.001 \ 53 | --theta_freq=1 \ 54 | --stats_functions 'min' 'max' 'mean' \ 55 | --discretize=no \ 56 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 57 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 58 | 59 | OUTCOME=Shock 60 | T=12.0 61 | dt=1.0 62 | python -m FIDDLE.run \ 63 | 
--output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 64 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 65 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 66 | --T=$T \ 67 | --dt=$dt \ 68 | --theta_1=0.001 \ 69 | --theta_2=0.001 \ 70 | --theta_freq=1 \ 71 | --stats_functions 'min' 'max' 'mean' \ 72 | --discretize=no \ 73 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 74 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 75 | 76 | 77 | 78 | python -m FIDDLE.run \ 79 | --output_dir="$DATAPATH/features,discretize=no/benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 80 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 81 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 82 | --T=48.0 \ 83 | --dt=1.0 \ 84 | --theta_1=0.001 \ 85 | --theta_2=0.001 \ 86 | --theta_freq=1 \ 87 | --stats_functions 'min' 'max' 'mean' \ 88 | --discretize=no \ 89 | > >(tee 'log,discretize=no/benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 90 | 2> >(tee 'log,discretize=no/benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 91 | -------------------------------------------------------------------------------- /mimic3_experiments/2_apply_FIDDLE/run_make_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../FIDDLE/" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p log 7 | 8 | OUTCOME=ARF 9 | T=4.0 10 | dt=1.0 11 | python -m FIDDLE.run \ 12 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 13 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 14 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 15 | --T=$T \ 16 | --dt=$dt \ 17 | --theta_1=0.001 \ 18 | --theta_2=0.001 \ 19 | --theta_freq=1 \ 20 | --stats_functions 'min' 'max' 'mean' \ 21 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 22 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 23 | 24 | 25 | OUTCOME=ARF 26 | T=12.0 27 | dt=1.0 28 | python -m FIDDLE.run \ 29 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 30 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 31 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 32 | --T=$T \ 33 | --dt=$dt \ 34 | --theta_1=0.001 \ 35 | --theta_2=0.001 \ 36 | --theta_freq=1 \ 37 | --stats_functions 'min' 'max' 'mean' \ 38 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 39 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 40 | 41 | OUTCOME=Shock 42 | T=4.0 43 | dt=1.0 44 | python -m FIDDLE.run \ 45 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 46 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 47 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 48 | --T=$T \ 49 | --dt=$dt \ 50 | --theta_1=0.001 \ 51 | --theta_2=0.001 \ 52 | --theta_freq=1 \ 53 | --stats_functions 'min' 'max' 'mean' \ 54 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 55 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 56 | 57 | OUTCOME=Shock 58 | T=12.0 59 | dt=1.0 60 | python -m FIDDLE.run \ 61 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 62 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 63 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 64 | --T=$T \ 65 | 
--dt=$dt \ 66 | --theta_1=0.001 \ 67 | --theta_2=0.001 \ 68 | --theta_freq=1 \ 69 | --stats_functions 'min' 'max' 'mean' \ 70 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 71 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 72 | 73 | T=48.0 74 | dt=1.0 75 | python -m FIDDLE.run \ 76 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 77 | --output_dir="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 78 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 79 | --T=$T \ 80 | --dt=$dt \ 81 | --theta_1=0.001 \ 82 | --theta_2=0.001 \ 83 | --theta_freq=1 \ 84 | --stats_functions 'min' 'max' 'mean' \ 85 | > >(tee 'log/benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 86 | 2> >(tee 'log/benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 87 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/DataSummary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/tangsp/mimic3_experiments/lib/data.py:14: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", 13 | " config = yaml.load(f)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from lib.data import _Mimic3Reader\n", 19 | "import pandas as pd\n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "timestep = 1.0" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Finish reading data \t 2.28 s\n", 42 | "s (15873, 98)\n", 43 | "X (15873, 4, 4045)\n", 44 | "Finish reading data \t 11.16 s\n", 45 | "s (14174, 96)\n", 46 | "X (14174, 12, 4816)\n", 47 | "Finish reading data \t 3.49 s\n", 48 | "s (19342, 98)\n", 49 | "X (19342, 4, 4522)\n", 50 | "Finish reading data \t 10.31 s\n", 51 | "s (17588, 97)\n", 52 | "X (17588, 12, 5500)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "for task in ['ARF', 'Shock']:\n", 58 | " for duration in [4, 12]:\n", 59 | " reader = _Mimic3Reader(task, duration, timestep)\n", 60 | " print('s', reader.s.shape)\n", 61 | " print('X', reader.X.shape)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Finish reading data \t 35.72 s\n", 74 | "s (11695, 97)\n", 75 | "X (11695, 48, 7411)\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "for task in ['mortality']:\n", 81 | " for duration in [48]:\n", 82 | " reader = _Mimic3Reader(task, duration, timestep)\n", 83 | " print('s', reader.s.shape)\n", 84 | " print('X', reader.X.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": 
"text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.5.2" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/README.md: -------------------------------------------------------------------------------- 1 | Config file: 2 | - `config.yaml`. Change data_path to point to the directory where the features/labels/population are stored 3 | 4 | Library files: 5 | - `data.py` 6 | - `models.py` 7 | - `trainer.py` 8 | - `experiment.py` 9 | - `eval_deep.py` 10 | - `evaluate.py` 11 | 12 | Executable files: 13 | - `run_deep.py` 14 | - `run_shallow.py` 15 | 16 | Notebooks: 17 | - `RunShallow.ipynb` 18 | - `NewEval_Deep.ipynb` 19 | - `Evaluation.ipynb` 20 | - `PredictionGap.ipynb` 21 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/processed/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4380 18 | 12.0: 5226 19 | 20 | Shock: 21 | 4.0 : 4857 22 | 12.0: 5892 23 | 24 | mortality: 25 | 48.0: 7822 26 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 | # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, 
y_score, task, T, dt, model_name): 54 | import pathlib 55 | pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(task, T, dt)).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = './output/outcome={}.T={}.dt={}/{}.test.npz'.format(task, T, dt, model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, '{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_deep.py: -------------------------------------------------------------------------------- 1 | # python run_deep.py --outcome=ARF --T=4 --dt=0.5 --model_type=CNN --cuda=7 2 | 3 | import sys, os, time, pickle, random 4 | import pandas as pd 5 | import numpy as np 6 | import pathlib 7 | pathlib.Path('log').mkdir(parents=True, exist_ok=True) 8 | 9 | import yaml 10 | with open('config.yaml') as f: 11 | config = yaml.load(f) 12 | 13 | ######## 14 | ## Constants 15 | data_path = config['data_path'] 16 | model_names = config['model_names'] 17 | 18 | budget = config['train']['budget'] # Number of randomized hyperparameter settings to try 19 | repeat 
= config['train']['repeat'] # 1 # number of restarts (with different seeds) for each setting 20 | epochs = config['train']['epochs'] # 15 # Max epochs for each setting 21 | 22 | # Feature dimensions 23 | dimensions = config['feature_dimension'] 24 | 25 | # Hyperparameter search space 26 | train_param_grid = { 27 | 'batch_size': [16, 32, 64, 128], 28 | 'lr': [1e-2, 1e-3, 1e-4], 29 | } 30 | CNN_param_grid = { 31 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 32 | 'depth': [1, 2],#, 3], 33 | 'filter_size': [1, 2, 3, 4], 34 | 'n_filters': [16, 32, 64, 128], 35 | 'n_neurons': [16, 32, 64, 128], 36 | 'activation': ['relu', 'elu'], 37 | } 38 | RNN_param_grid = { 39 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 40 | 'num_layers': [1, 2, 3], 41 | 'hidden_size': [16, 32, 64, 128], 42 | 'n_neurons': [16, 32, 64, 128], 43 | 'activation': ['relu', 'elu'], 44 | } 45 | 46 | training_params = {'batch_size', 'lr'} 47 | 48 | ######## 49 | 50 | import argparse 51 | 52 | parser = argparse.ArgumentParser(description='') 53 | 54 | parser.add_argument('--outcome', type=str, required=True) 55 | parser.add_argument('--T', type=float, required=True) 56 | parser.add_argument('--dt', type=float, required=True) 57 | parser.add_argument('--model_type', type=str, required=True) 58 | parser.add_argument('--cuda', type=int, default=7) 59 | parser.add_argument('--seed', type=int, default=42) 60 | 61 | args = parser.parse_args() 62 | 63 | task = args.outcome 64 | model_type = args.model_type 65 | 66 | T = float(args.T) 67 | dt = float(args.dt) 68 | L_in = int(np.floor(T / dt)) 69 | in_channels = dimensions[task][float(T)] 70 | 71 | import lib.models as models 72 | model_name = model_names[model_type] 73 | ModelClass = getattr(models, model_name) 74 | 75 | if model_type == 'CNN': 76 | param_grid = {**train_param_grid, **CNN_param_grid} 77 | elif model_type == 'RNN': 78 | param_grid = {**train_param_grid, **RNN_param_grid} 79 | else: 80 | assert False 81 | 82 | # Create checkpoint directories 83 | import pathlib 84 | pathlib.Path("./checkpoint/model={}.outcome={}.T={}.dt={}/".format(model_name, task, T, dt)).mkdir(parents=True, exist_ok=True) 85 | 86 | ## Data 87 | import lib.data as data 88 | if task == 'mortality': 89 | tr_loader, va_loader, te_loader = data.get_benchmark_splits(fuse=True) 90 | else: 91 | tr_loader, va_loader, te_loader = data.get_train_val_test(task, duration=T, timestep=dt, fuse=True) 92 | 93 | import torch 94 | from torch.utils.data import Dataset, DataLoader 95 | from sklearn.model_selection import StratifiedShuffleSplit 96 | 97 | # Set CUDA 98 | if args.cuda: 99 | torch.cuda.set_device(args.cuda) 100 | print('cuda', torch.cuda.current_device()) 101 | 102 | if args.seed: 103 | torch.manual_seed(args.seed) 104 | np.random.seed(args.seed) 105 | random.seed(args.seed) 106 | 107 | 108 | from lib.experiment import Experiment 109 | 110 | class MIMICExperiment(Experiment): 111 | def get_model_params(self, params): 112 | model = ModelClass( 113 | in_channels, L_in, 1, 114 | **{k:params[k] for k in params.keys() if k not in training_params} 115 | ) 116 | criterion = torch.nn.BCELoss() 117 | optimizer = torch.optim.Adam(model.parameters(), lr=params['lr']) 118 | return model, criterion, optimizer 119 | 120 | def get_data(self): 121 | return tr_loader, va_loader 122 | 123 | exp = MIMICExperiment( 124 | param_grid, name='model={}.outcome={}.T={}.dt={}'.format(model_name, task, T, dt), 125 | budget=budget, n_epochs=epochs, repeat=repeat, 126 | ) 127 | 128 | print('EXPERIMENT:', exp.name) 129 | 130 | df_search = exp.run() 131 | 
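# df_search holds one row per sampled hyperparameter setting and seed (best validation score,
# best iteration, checkpoint path). lib/evaluate.get_best_model_info() later sorts this CSV by
# best_score to pick the checkpoint that run_deep_eval.py evaluates on the test set.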
df_search.to_csv('./log/df_search.{}.csv'.format(exp.name), index=False) 132 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_deep_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | cuda=0 5 | 6 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,CNN.log' 7 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,RNN.log' 8 | 9 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,CNN.log' 10 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,RNN.log' 11 | 12 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,CNN.log' 13 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,RNN.log' 14 | 15 | python run_deep.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,CNN.log' 16 | python run_deep.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,RNN.log' 17 | 18 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,CNN.log' 19 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,RNN.log' 20 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_deep_eval.py: -------------------------------------------------------------------------------- 1 | import sys, os, time, pickle, random 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import yaml 6 | with open('config.yaml') as f: 7 | config = yaml.load(f) 8 | 9 | ######## 10 | ## Constants 11 | model_names = config['model_names'] 12 | training_params = {'batch_size', 'lr'} 13 | 14 | # Feature dimensions 15 | dimensions = config['feature_dimension'] 16 | 17 | ######## 18 | 19 | def main(task, T, dt, model_type): 20 | L_in = int(np.floor(T / dt)) 21 | in_channels = dimensions[task][T] 22 | 23 | import lib.models as models 24 | model_name = model_names[model_type] 25 | ModelClass = getattr(models, model_name) 26 | df_search = pd.read_csv('./log/df_search.model={}.outcome={}.T={}.dt={}.csv'.format(model_name, task, T, dt)) 27 | import lib.evaluate as evaluate 28 | best_model_info = evaluate.get_best_model_info(df_search) 29 | checkpoint, model = evaluate.load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params) 30 | 31 | 32 | import lib.data as data 33 | if task == 'mortality': 34 | te_loader = data.get_benchmark_test(fuse=True) 35 | else: 36 | te_loader = data.get_test(task, duration=T, timestep=dt, fuse=True) 37 | 38 | y_true, y_score = evaluate.get_test_predictions(model, te_loader, '{}_T={}_dt={}'.format(task, T, dt), model_name) 39 | evaluate.save_test_predictions(y_true, y_score, task, T, dt, model_name) 40 | 41 | from sklearn import metrics, utils 42 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 43 | fig = plt.figure(figsize=(5,5)) 44 | plt.xlabel('False Positive Rate') 45 | plt.ylabel('True Positive Rate') 46 | plt.xlim(0,1) 47 | plt.ylim(0,1) 
48 | plt.plot([0,1], [0,1], ':') 49 | plt.plot(fpr, tpr, color='darkorange') 50 | plt.show() 51 | 52 | ## Bootstrapped 95% Confidence Interval 53 | # try: 54 | # yte_pred = clf.decision_function(Xte) 55 | # except AttributeError: 56 | # yte_pred = clf.predict_proba(Xte)[:,1] 57 | from joblib import Parallel, delayed 58 | from tqdm import tqdm_notebook as tqdm 59 | def func(i): 60 | yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i) 61 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 62 | 63 | test_scores = Parallel(n_jobs=16)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 64 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 65 | 66 | # idx = (np.abs(tpr - 0.5)).argmin() 67 | # y_pred = (y_score > thresholds[idx]) 68 | # metrics.roc_auc_score(y_true, y_score) 69 | 70 | precision, recall, thresholds_ = metrics.precision_recall_curve(y_true, y_score) 71 | fig = plt.figure(figsize=(5,5)) 72 | plt.xlabel('Recall') 73 | plt.ylabel('Precision') 74 | plt.xlim(0,1) 75 | plt.ylim(0,1) 76 | plt.plot(recall, precision, color='darkorange') 77 | plt.show() 78 | 79 | # target TPR = 50% 80 | idx = (np.abs(tpr - 0.5)).argmin() 81 | y_pred = (y_score > thresholds[idx]) 82 | metrics.roc_auc_score(y_true, y_score) 83 | 84 | pd.DataFrame([{ 85 | 'tpr': tpr[idx], 86 | 'fpr': fpr[idx], 87 | 'ppv': metrics.precision_score(y_true, y_pred), 88 | }]) 89 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_shallow_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | python run_shallow.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 7 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.out') \ 8 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.err' >&2) 9 | python run_shallow.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 10 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.out') \ 11 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.err' >&2) 12 | 13 | python run_shallow.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=LR &> 'log/outcome=ARF,T=4.0,dt=1.0,LR.log' 14 | python run_shallow.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=LR &> 'log/outcome=Shock,T=4.0,dt=1.0,LR.log' 15 | 16 | python run_shallow.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=RF &> 'log/outcome=ARF,T=4.0,dt=1.0,RF.log' 17 | python run_shallow.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=RF &> 'log/outcome=Shock,T=4.0,dt=1.0,RF.log' 18 | 19 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=ARF,T=12.0,dt=1.0,LR.log' 20 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=Shock,T=12.0,dt=1.0,LR.log' 21 | 22 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=ARF,T=12.0,dt=1.0,RF.log' 23 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=Shock,T=12.0,dt=1.0,RF.log' 24 | -------------------------------------------------------------------------------- /mimic3_experiments/5_baseline_NEWS/NEWS_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLD3/FIDDLE-experiments/77483adf4327e87cbea4963252db873829cad813/mimic3_experiments/5_baseline_NEWS/NEWS_table.jpg 
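Note: the evaluation utilities in `3_ML_models` above store raw test predictions with `save_test_predictions()` as `.npz` files containing `y_true` and `y_score`. The following is a minimal sketch of how such a file could be reloaded and scored offline with a bootstrapped 95% confidence interval, mirroring `run_deep_eval.py`; the file path and model name (`CNN_V3`) are illustrative assumptions, not fixed outputs of the pipeline.

```python
import numpy as np
from sklearn import metrics, utils

# Illustrative path: save_test_predictions() writes output/outcome={task}.T={T}.dt={dt}/{model}.test.npz
preds = np.load('output/outcome=mortality.T=48.0.dt=1.0/CNN_V3.test.npz')
y_true = preds['y_true'].ravel()
y_score = preds['y_score'].ravel()

print('Test AUROC: {:.3f}'.format(metrics.roc_auc_score(y_true, y_score)))

# 95% bootstrap confidence interval over 1000 resamples, mirroring run_deep_eval.py
scores = []
for i in range(1000):
    yt_b, ys_b = utils.resample(y_true, y_score, replace=True, random_state=i)
    scores.append(metrics.roc_auc_score(yt_b, ys_b))
print('95% CI: ({:.3f}, {:.3f})'.format(np.percentile(scores, 2.5), np.percentile(scores, 97.5)))
```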
-------------------------------------------------------------------------------- /mimic3_experiments/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | - Requires: the raw csv files of the [MIMIC-III database](https://mimic.physionet.org/about/mimic/) 3 | - Extract and format data from structured tables in MIMIC-III as input to FIDDLE 4 | - Goal: using data from all structured tables, generate Time-Invariant features **s** and Time-Series features **X**. 5 | 6 | 7 | We considered five prediction tasks involving three adverse outcomes: 8 | 9 | - in-hospital mortality (48h) 10 | - ARF (4h) 11 | - ARF (12h) 12 | - shock (4h) 13 | - shock (12h) 14 | 15 | 16 | ## Steps to reproduce results 17 | 18 | 0. Modify `config.yaml` to specify `mimic3_path` and `data_path`. 19 | 20 | ### 1) Data Extraction 21 | 22 | 1. Data Extraction 23 | - Run `python -c "from extract_data import *; check_nrows();"` to verify the integrity of raw csv files. 24 | - Run `python extract_data.py`. 25 | 26 | 2. Labels & Cohort definitions 27 | - Run `python generate_labels.py` to generate the event onset time and labels for three outcomes: in-hospital mortality, ARF, and shock. The output should be 28 | ``` 29 | ARF: {0: 13125, 1: 10495} N = 23620 30 | Shock: {0: 16629, 1: 6991} N = 23620 31 | ``` 32 | - Run the following notebooks in order: `LabelDistributions.ipynb` and `InclusionExclusion.ipynb`. 33 | > The above also generates the cohort for 48h in-hospital mortality in `mortality_48.0h.csv`. However, we found some inconsistencies compared to the [mimic3-benchmark](https://github.com/YerevaNN/mimic3-benchmarks) (see also: [multitask benchmarking paper](https://doi.org/10.1038/s41597-019-0103-9)). To ensure a fair comparison with the benchmark feature set (and use the same train/val/test splits), we used their label definitions, but only considered the subset of their cohort recorded using MetaVision (i.e., also in our mortality cohort). Run the notebook `resources/IHM_benchmark.ipynb` to generate the final cohort for 48h in-hospital mortality prediction in `pop.mortality_benchmark.csv`. 34 | - Run `PopulationSummary.ipynb`. 35 | 36 | 3. Prepare input tables for each cohort 37 | 38 | - Run `python prepare_input.py --outcome={outcome} --T={T} --dt={dt}` 39 | 40 | Note: a bash script is provided to prepare input tables for all cohorts: `./run_prepare_all.sh` 41 | 42 | Since `pop.mortality_benchmark.csv` is a subset of `mortality_48.0h.csv`, we only create one `input_data.p` for the larger `mortality_48.0h.csv` and copy it into two output folders. 43 | 44 | 4. Run the notebook `FIDDLE_input_lengths.ipynb` to check the file size and the number of rows in each `input_data.p` table. 45 | 46 | ### 2) Apply FIDDLE 47 | 48 | 1. Apply FIDDLE on each cohort to generate features.
49 | 50 | - A bash script is provided for generating features: `./run_make_all.sh` 51 | 52 | - The generated features and associated metadata are located in `{data_path}/features/outcome={outcome},T={T},dt={dt}/`: 53 | 54 | - `s.npz`: a sparse array of shape (N, d) 55 | - `X.npz`: a sparse tensor of shape (N, L, D) 56 | - `s.feature_names.txt`: names of _d_ time-invariant features 57 | - `X.feature_names.txt`: names of _D_ time-series features 58 | 59 | ### 3) ML Models 60 | 61 | We used four commonly used ML algorithms to train models using the generated features: 62 | 63 | - LR: L2-regularized logistic regression 64 | - RF: random forest 65 | - CNN: 1D convolutional neural networks 66 | - LSTM: recurrent neural networks with long short-term memory cells 67 | 68 | To establish a fair comparison, all models are tuned over hyperparameter settings using a random search with a budget of 50, maximizing the area under the receiver operating characteristic curve (AUROC). 69 | 70 | To train the shallow models (LR and RF), run the following bash script. This part uses the scikit-learn implementations of the models. 71 | 72 | ```bash 73 | > ./run_shallow_all.sh 74 | ``` 75 | 76 | To train the deep models (CNN and LSTM), run the following bash script. This part uses the PyTorch implementations of the layers and custom architectures defined in `lib/models.py`; it will use GPUs if available. 77 | 78 | ```bash 79 | > ./run_deep_all.sh 80 | ``` 81 | 82 | ### 4) Evaluation 83 | 84 | See instructions in `Evaluation.ipynb` to generate the following plots using the held-out test set: ROC curves with AUROC, PR curves with AUPR, and calibration plots with Brier scores. Plots include 95% confidence intervals calculated on 1000 bootstraps of the held-out test set. 85 | -------------------------------------------------------------------------------- /mimic3_experiments/config.yaml: -------------------------------------------------------------------------------- 1 | # Location of input files 2 | mimic3_path: ../data/mimic3_csv/ 3 | data_path: ../data/processed/ 4 | 5 | # Customize table headers 6 | column_names: 7 | ID: ID 8 | t: t 9 | var_name: variable_name 10 | var_value: variable_value 11 | --------------------------------------------------------------------------------
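For reference, the `s.npz` / `X.npz` feature files and feature-name lists described in step 2) above can be inspected directly. The sketch below assumes the arrays were saved as COO tensors with the pydata `sparse` package and that the `.feature_names.txt` files list one name per line; the directory name is only an example.

```python
import sparse  # pydata/sparse; s.npz / X.npz are assumed to be COO arrays saved with this package

feat_dir = '../data/processed/features/outcome=ARF,T=4.0,dt=1.0/'  # example directory
s = sparse.load_npz(feat_dir + 's.npz')   # time-invariant features, shape (N, d)
X = sparse.load_npz(feat_dir + 'X.npz')   # time-dependent features, shape (N, L, D)
print('s:', s.shape, 'X:', X.shape, 'X density: {:.3f}'.format(X.density))

# Feature names (assumed to be one name per line in the .txt files)
with open(feat_dir + 'X.feature_names.txt') as f:
    X_names = [line.strip() for line in f if line.strip()]
print(len(X_names), 'time-dependent feature names')

# Densify and flatten, e.g. to build the 2-D design matrix that shallow models (LR, RF) expect
X_flat = X.todense().reshape(X.shape[0], -1)
```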