├── .gitignore ├── .gitmodules ├── 90-day-post-discharge-mortality ├── FIDDLE_icd │ ├── config.py │ ├── config.yaml │ ├── helpers.py │ ├── run.py │ └── steps.py ├── cohort.ipynb ├── helper.py ├── log │ ├── model_clinical.txt │ ├── model_clinical_icd[0,1,2].txt │ ├── model_clinical_icd[0,1].txt │ └── model_clinical_icd[0].txt ├── main.ipynb ├── model.ipynb ├── model_clinical.py ├── model_clinical_icd[0,1,2].py ├── model_clinical_icd[0,1].py └── model_clinical_icd[0].py ├── README.md ├── eicu_experiments ├── 1_data_extraction │ ├── PopulationSummary.ipynb │ ├── extract_medication.py │ ├── extract_nurseCharting.py │ ├── extract_other_tables.ipynb │ ├── extract_pivoted.py │ ├── extract_resp_IO.ipynb │ ├── generate_labels.ipynb │ ├── population_ARF.ipynb │ ├── population_Shock.ipynb │ └── population_mortality.ipynb ├── 2_apply_FIDDLE │ ├── prepare_data.py │ ├── prepare_data.sh │ ├── prepare_data_mortality.py │ └── run_make_all.sh └── 3_ML_models │ ├── DataSummary.ipynb │ ├── Test.ipynb │ ├── config.yaml │ ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py │ ├── run_deep.py │ ├── run_deep_all.sh │ ├── run_deep_eval.py │ ├── run_shallow.py │ └── run_shallow_all.sh ├── environment.yml ├── mimic3_ablations ├── 2_apply_FIDDLE │ ├── FIDDLE_mask+Dt │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_maskonly │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_medianimpute │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_noimpute │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── FIDDLE_ordinal │ │ ├── config.py │ │ ├── config.yaml │ │ ├── helpers.py │ │ ├── run.py │ │ └── steps.py │ ├── log │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── impute,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── medianimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.err │ │ ├── nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.out │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.err │ │ 
├── noimpute,benchmark,outcome=mortality,T=48.0,dt=12.0.out │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.err │ │ ├── noimpute,benchmark,outcome=mortality,T=48.0,dt=4.0.out │ │ ├── ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.001,medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.001,medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.001,noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ └── theta=0.001,noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── run_mortality_impute.sh │ ├── run_mortality_mask+Dt.sh │ ├── run_mortality_maskonly.sh │ ├── run_mortality_nofreq.sh │ ├── run_mortality_noimpute.sh │ └── run_mortality_ordinal.sh └── 3_ML_models │ ├── config.yaml │ ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py │ ├── run_shallow_ablations.sh │ ├── run_shallow_impute.py │ ├── run_shallow_maskonly.py │ ├── run_shallow_medianimpute.py │ ├── run_shallow_nofreq.py │ ├── run_shallow_noimpute.py │ └── run_shallow_ordinal.py ├── mimic3_comparisons ├── 2_apply_FIDDLE │ ├── log │ │ ├── dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ ├── theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ │ ├── theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.err │ │ └── theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── run_mortality_dt.sh │ └── run_mortality_theta.sh └── 3_ML_models │ ├── config.yaml │ ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py │ ├── run_shallow_dt.py │ ├── run_shallow_dt.sh │ ├── run_shallow_theta.py │ └── run_shallow_theta.sh └── mimic3_experiments ├── 1_data_extraction ├── FIDDLE_input_lengths.ipynb ├── InclusionExclusion.ipynb ├── LabelDistributions.ipynb ├── PopulationSummary.ipynb ├── config.py ├── extract_data.py ├── generate_labels.py ├── grouped_variables.yaml ├── prepare_input.py ├── resources │ ├── IHM_benchmark.ipynb │ ├── all_stays.csv │ ├── test_listfile.csv │ ├── train_listfile.csv │ └── val_listfile.csv └── run_prepare_all.sh ├── 2_apply_FIDDLE ├── log,discretize=no │ ├── benchmark,outcome=mortality,T=48.0,dt=1.0.err │ ├── benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── outcome=ARF,T=12.0,dt=1.0.err │ ├── outcome=ARF,T=12.0,dt=1.0.out │ ├── outcome=ARF,T=4.0,dt=1.0.err │ ├── outcome=ARF,T=4.0,dt=1.0.out │ ├── outcome=Shock,T=12.0,dt=1.0.err │ ├── outcome=Shock,T=12.0,dt=1.0.out │ ├── outcome=Shock,T=4.0,dt=1.0.err │ └── outcome=Shock,T=4.0,dt=1.0.out ├── log │ ├── benchmark,outcome=mortality,T=48.0,dt=1.0.err │ ├── 
benchmark,outcome=mortality,T=48.0,dt=1.0.out │ ├── outcome=ARF,T=12.0,dt=1.0.err │ ├── outcome=ARF,T=12.0,dt=1.0.out │ ├── outcome=ARF,T=4.0,dt=1.0.err │ ├── outcome=ARF,T=4.0,dt=1.0.out │ ├── outcome=Shock,T=12.0,dt=1.0.err │ ├── outcome=Shock,T=12.0,dt=1.0.out │ ├── outcome=Shock,T=4.0,dt=1.0.err │ ├── outcome=Shock,T=4.0,dt=1.0.out │ ├── outcome=mortality,T=48.0,dt=1.0.err │ └── outcome=mortality,T=48.0,dt=1.0.out ├── run_make_all,discretize=no.sh └── run_make_all.sh ├── 3_ML_models ├── DataSummary.ipynb ├── README.md ├── config.yaml ├── lib │ ├── data.py │ ├── evaluate.py │ ├── experiment.py │ ├── models.py │ └── trainer.py ├── run_deep.py ├── run_deep_all.sh ├── run_deep_eval.py ├── run_shallow.py └── run_shallow_all.sh ├── 4_evaluation ├── CombineFigures.ipynb ├── Evaluation.ipynb ├── PredictionGap.ipynb ├── d_items.csv └── d_labitems.csv ├── 5_baseline_NEWS ├── CalculateNEWS.ipynb └── NEWS_table.jpg ├── README.md └── config.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | .ipynb_checkpoints 3 | *.png 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | 9 | 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "FIDDLE"] 2 | path = FIDDLE 3 | url = https://github.com/MLD3/FIDDLE 4 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/FIDDLE_icd/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | hierarchical_sep = config['hierarchical_sep'] 10 | hierarchical_levels = config['hierarchical_levels'] 11 | 12 | value_type_override = config['value_types'] 13 | 14 | parallel = True 15 | n_jobs = 64 16 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/FIDDLE_icd/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | hierarchical_sep: ":" 9 | hierarchical_levels: [0,1,1] 10 | 11 | value_types: 12 | # enter the feature type that you would like to override in the following format: 13 | FIRST_WARDID: Categorical 14 | MedA_AMOUNT: Numeric 15 | MedA_ROUTE: Categorical 16 | ICD9_CODE: hierarchical_ICD9 17 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/FIDDLE_icd/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | 
parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | else: 73 | raise NotImplementedError 74 | 75 | 76 | from .steps import * 77 | 78 | print('Input data file:', input_fname) 79 | print() 80 | print('Input arguments:') 81 | print(' {:<6} = {}'.format('T', T)) 82 | print(' {:<6} = {}'.format('dt', dt)) 83 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 84 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 85 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 86 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 87 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 88 | print() 89 | print('N = {}'.format(N)) 90 | print('L = {}'.format(L)) 91 | print('', flush=True) 92 | 93 | 94 | ###### 95 | # Main 96 | ###### 97 | if args.prefilter: 98 | print_header('1) Pre-filter') 99 | df_data = pre_filter(df_data, theta_1, df_population, args) 100 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 101 | 102 | print_header('2) Transform; 3) Post-filter') 103 | df_data, df_types = parse_variable_data_type(df_data, value_type_override, args) 104 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 105 | 106 | # Process time-invariant data 107 | if len(df_time_invariant) > 0: 108 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 109 | 110 | # Process time-dependent 
data 111 | if len(df_time_series) > 0: 112 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 113 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 | clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 
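# NOTE: X.npz below is the output of FIDDLE for the clinical features; FIDDLE saves it
# as a sparse COO tensor of shape (N, L, D). For this post-discharge task the time axis
# appears to be a singleton, so .todense().squeeze() yields an (N, D) array whose rows
# are assumed to align with input/label.csv -- that alignment is what makes the boolean
# indexing by df.partition below valid.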
106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | 109 | Xtr = X[df.partition=="train"] 110 | ytr = df[df.partition=="train"]['label'] 111 | Xte = X[df.partition=="test"] 112 | yte = df[df.partition=="test"]['label'] 113 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 114 | 115 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical') 116 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical') 117 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical_icd[0,1,2].py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 | clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, 
save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | s = sparse.load_npz('output.icd[0,1,2]/s.npz').todense() 109 | X = np.concatenate((s, X), axis=1) 110 | 111 | Xtr = X[df.partition=="train"] 112 | ytr = df[df.partition=="train"]['label'] 113 | Xte = X[df.partition=="test"] 114 | yte = df[df.partition=="test"]['label'] 115 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 116 | 117 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical+ICD[0,1,2]') 118 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical+ICD[0,1,2]') 119 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical_icd[0,1].py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 | clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": 
scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | s = sparse.load_npz('output.icd[0,1]/s.npz').todense() 109 | X = np.concatenate((s, X), axis=1) 110 | 111 | Xtr = X[df.partition=="train"] 112 | ytr = df[df.partition=="train"]['label'] 113 | Xte = X[df.partition=="test"] 114 | yte = df[df.partition=="test"]['label'] 115 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 116 | 117 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical+ICD[0,1]') 118 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical+ICD[0,1]') 119 | -------------------------------------------------------------------------------- /90-day-post-discharge-mortality/model_clinical_icd[0].py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.model_selection import ShuffleSplit 7 | 8 | df = pd.read_csv('input/label.csv').rename(columns={'HADM_ID': 'ID'}) 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV, RandomizedSearchCV 13 | from sklearn import metrics, feature_selection, utils 14 | import scipy.stats 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm#_notebook as tqdm 17 | import random 18 | 19 | def train_model(Xtr, ytr, Xte, yte, model_name, exp_name): 20 | np.random.seed(0) 21 | random.seed(0) 22 | 23 | clf = helper(model_name) 24 | 25 | clf.fit(Xtr, ytr) 26 | print('best_params_', clf.best_params_) 27 | print('best_score_ ', clf.best_score_) 28 | try: 29 | np.savetxt( 30 | 'output/{}.{},coef.txt'.format(exp_name, model_name), 31 | clf.best_estimator_.coef_, 32 | delimiter=',', 33 | ) 34 | except: 35 | print('Coefficients not saved') 36 | pass 37 | 38 | ###### 39 | # Eval 40 | # Bootstrapped 95% Confidence Interval 41 | try: 42 | yte_pred = clf.predict_proba(Xte)[:,1] 43 | except AttributeError: 44 | print('Cannot produce probabilistic estimates') 45 | raise 46 | 47 | def func(i): 48 | yte_true_b, yte_pred_b = utils.resample(yte, yte_pred, replace=True, random_state=i) 49 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 50 | 51 | test_scores = Parallel(n_jobs=2)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 52 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 53 | 54 | save_test_predictions(yte, yte_pred, model_name, 'output/'+exp_name+'.') 55 | 56 | 57 | n_jobs=12 58 | search_budget=50 59 | 60 | def helper(model_type): 61 | if model_type == 'LR': 62 
| clf = RandomizedSearchCV( 63 | LogisticRegression(solver='lbfgs'), 64 | {'C': scipy.stats.reciprocal(1e-5, 1e5)}, 65 | n_iter=search_budget, 66 | cv=StratifiedKFold(5), 67 | scoring='roc_auc', 68 | n_jobs=n_jobs, verbose=2, 69 | ) 70 | elif model_type == 'RF': 71 | clf = RandomizedSearchCV( 72 | RandomForestClassifier(), 73 | { 74 | "criterion": ["gini", "entropy"], 75 | "max_depth": [4, 8, 16, 32, None], 76 | "max_features": scipy.stats.randint(1, 100), 77 | "min_samples_split": scipy.stats.randint(2, 11), 78 | "min_samples_leaf": scipy.stats.randint(1, 11), 79 | "n_estimators": scipy.stats.randint(50,500), 80 | "bootstrap": [True], 81 | }, 82 | n_iter=search_budget, 83 | cv=StratifiedKFold(5), 84 | scoring='roc_auc', 85 | n_jobs=n_jobs, verbose=2, 86 | ) 87 | else: 88 | assert False 89 | 90 | return clf 91 | 92 | def save_test_predictions(y_true, y_score, model_name, save_dir): 93 | # import pathlib 94 | # pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 95 | 96 | fname = save_dir + '{}.test.npz'.format(model_name) 97 | np.savez( 98 | open(fname, 'wb'), 99 | y_score = y_score, 100 | y_true = y_true, 101 | ) 102 | print('Test predictions saved to', fname) 103 | 104 | 105 | 106 | import sparse 107 | X = sparse.load_npz('output.clinical/X.npz').todense().squeeze() 108 | s = sparse.load_npz('output.icd[0]/s.npz').todense() 109 | X = np.concatenate((s, X), axis=1) 110 | 111 | Xtr = X[df.partition=="train"] 112 | ytr = df[df.partition=="train"]['label'] 113 | Xte = X[df.partition=="test"] 114 | yte = df[df.partition=="test"]['label'] 115 | print(Xtr.shape, ytr.shape, Xte.shape, yte.shape) 116 | 117 | train_model(Xtr, ytr, Xte, yte, 'LR', 'clinical+ICD[0]') 118 | train_model(Xtr, ytr, Xte, yte, 'RF', 'clinical+ICD[0]') 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FIDDLE experiments 2 | 3 | This repository contains code for the experiments in the JAMIA paper, [**Democratizing EHR analyses with FIDDLE: a flexible data-driven preprocessing pipeline for structured clinical data**](https://doi.org/10.1093/jamia/ocaa139) by Tang et al. (2020). Please also refer to the main [FIDDLE repository](https://github.com/MLD3/FIDDLE). 4 | 5 | **IMPORTANT NOTE:** Due to updated versions of Python and related packages (pandas etc.), it might be impossible to replicate the exact numerical results in the paper. Moreover, due to the sheer size of the datasets, full processing requires a machine with many CPU cores and a large amount of RAM (at least 500GB for MIMIC-III, ~3TB for eICU). Therefore, we recommend the following options: 6 | - To reproduce MIMIC-III results similar to those in the paper, we recommend running the latest version of FIDDLE on the data and updating the feature dimensions in the metadata files to match the extracted feature sets. 7 | - To replicate the MIMIC-III results reported in the paper, consider using the [jamia-replication](https://github.com/MLD3/FIDDLE-experiments/tree/jamia-replication) branch. We have made every attempt to derive the same set of features (~0.00001% difference) from the MIMIC-III data. To make this experiment suite more accessible, we have released preprocessed MIMIC-III and eICU features via PhysioNet (for use with code in the [jamia-replication](https://github.com/MLD3/FIDDLE-experiments/tree/jamia-replication) branch). Please download the datasets here: https://physionet.org/content/mimic-eicu-fiddle-feature/1.0.0/ (see the download sketch below).
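For the replication route, a minimal sketch of the setup steps (assumptions: you have a credentialed PhysioNet account with access to this dataset; the username and download location are placeholders):

```bash
# Switch this repository to the replication branch.
git checkout jamia-replication

# Fetch the preprocessed FIDDLE features from PhysioNet (prompts for your password).
wget -r -N -c -np --user YOUR_PHYSIONET_USERNAME --ask-password \
    https://physionet.org/files/mimic-eicu-fiddle-feature/1.0.0/
```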
8 | 9 | ## Usage 10 | Clone the repository and initialize the FIDDLE submodule: 11 | ```bash 12 | git clone https://github.com/MLD3/FIDDLE-experiments.git 13 | cd FIDDLE-experiments && git submodule update --init --recursive 14 | ``` 15 | 16 | To reproduce the experiments on MIMIC-III, use `conda env create -f environment.yml` to create a conda environment named `FIDDLE-env` that uses Python 3.7, and then follow the steps in the README. 17 | -------------------------------------------------------------------------------- /eicu_experiments/1_data_extraction/extract_medication.py: -------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' 2 | save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/' 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | config = { 10 | 'n_rows': { 11 | 'medication': 7_301_853, 12 | } 13 | } 14 | 15 | def _read_events(fname, t_cols, chunksize): 16 | """ 17 | A helper function to read csv in chunks 18 | Arguments: 19 | - fname is the file name (e.g. INPUTEVENTS) 20 | - t_cols is a list that contains the names of the time columns that should be parsed 21 | - chunksize is the size of each chunk 22 | """ 23 | n_rows = config['n_rows'][fname] 24 | with tqdm(desc=fname, total=(n_rows//chunksize+1)) as pbar: 25 | for df in pd.read_csv(eicu_path + '{}.csv'.format(fname), parse_dates=t_cols, chunksize=chunksize): 26 | pbar.update() 27 | yield df 28 | 29 | 30 | fname = 'medication' 31 | df_M = [] 32 | for i, df in enumerate(_read_events(fname, [], chunksize=100000)): 33 | # Remove unknown drug name or drug seqnum 34 | df['drughiclseqno'] = df['drughiclseqno'].astype('Int64') 35 | df = df.dropna(subset=['drugname', 'drughiclseqno'], how='all') 36 | 37 | # Combine drug name and ID 38 | df.loc[:, 'drugnameid'] = df[['drugname', 'drughiclseqno']].apply( 39 | lambda x: '{}|{}'.format(x[0], x[1]), axis=1) 40 | 41 | df = df.rename(columns={'patientunitstayid': 'ID', 'drugstartoffset': 't'}) 42 | df = df.set_index([ 43 | 'ID', 't', 'drugnameid' 44 | ])[['dosage', 'routeadmin', 'frequency']] 45 | 46 | df.columns.name = 'property' 47 | df = df.stack() 48 | df.name = 'variable_value' 49 | df = df.reset_index() 50 | 51 | df['variable_name'] = df[['drugnameid', 'property']].apply(lambda x: '|'.join(x), axis=1) 52 | df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore') 53 | df = df[['ID', 't', 'variable_name', 'variable_value']] 54 | 55 | df = df.reset_index(drop=True) 56 | df_M.append(df) 57 | 58 | df_out = pd.concat(df_M, ignore_index=True) 59 | try: 60 | df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False) 61 | except: 62 | df_out.to_pickle(save_path + '{}.pickle'.format(fname)) 63 |
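The extraction scripts in this directory all emit the same long event format that FIDDLE takes as `input_data`: one row per (`ID`, `t`, `variable_name`, `variable_value`), with `t` being the eICU offset in minutes from ICU admission. A purely illustrative sketch of such rows (drug names, IDs, and values are made up, not taken from the data):

```python
import pandas as pd

# Illustrative rows only -- layout, not real eICU content.
example = pd.DataFrame(
    [
        (141765,  35, 'HEPARIN|5042|dosage',               '25000 UNITS'),
        (141765,  35, 'HEPARIN|5042|routeadmin',           'IV'),
        (141765, 120, 'Vital Signs|Heart Rate|Heart Rate', 88),
    ],
    columns=['ID', 't', 'variable_name', 'variable_value'],
)
print(example)
```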
-------------------------------------------------------------------------------- /eicu_experiments/1_data_extraction/extract_nurseCharting.py: -------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' 2 | save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/' 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | config = { 9 | 'n_rows': { 10 | 'nurseCharting': 151_604_232, 11 | } 12 | } 13 | 14 | def _read_events(fname, t_cols, chunksize): 15 | """ 16 | A helper function to read csv in chunks 17 | Arguments: 18 | - fname is the file name (e.g. INPUTEVENTS) 19 | - t_cols is a list that contains the names of the time columns that should be parsed 20 | - chunksize is the size of each chunk 21 | """ 22 | n_rows = config['n_rows'][fname] 23 | with tqdm(desc=fname, total=(n_rows//chunksize+1)) as pbar: 24 | for df in pd.read_csv(eicu_path + '{}.csv'.format(fname), parse_dates=t_cols, chunksize=chunksize): 25 | pbar.update() 26 | yield df 27 | 28 | 29 | fname = 'nurseCharting' 30 | df_NC = [] 31 | for i, df in enumerate(_read_events(fname, [], chunksize=1000000)): 32 | df = df.drop(columns=[ 33 | 'nursingchartid', 34 | 'nursingchartentryoffset', 35 | ]) 36 | df = df.rename(columns={ 37 | 'patientunitstayid': 'ID', 38 | 'nursingchartoffset': 't', 39 | }) 40 | df['variable_name'] = df[[ 41 | 'nursingchartcelltypecat', 'nursingchartcelltypevallabel', 42 | 'nursingchartcelltypevalname' 43 | ]].apply(lambda x: '|'.join(x), axis=1) 44 | 45 | df['variable_value'] = pd.to_numeric(df['nursingchartvalue'], errors='ignore') 46 | 47 | df = df[['ID', 't', 'variable_name', 'variable_value']] 48 | df = df.reset_index(drop=True) 49 | df_NC.append(df) 50 | if i % 40 == 39: 51 | df_out = pd.concat(df_NC, ignore_index=True) 52 | try: 53 | df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False) 54 | except: 55 | df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40))) 56 | df_NC = [] 57 | 58 | df_out = pd.concat(df_NC, ignore_index=True) 59 | try: 60 | df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False) 61 | except: 62 | df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40))) -------------------------------------------------------------------------------- /eicu_experiments/1_data_extraction/extract_pivoted.py: -------------------------------------------------------------------------------- 1 | # python extract_pivoted.py vitalPeriodic 2 | # python extract_pivoted.py vitalAperiodic 3 | 4 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' 5 | save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/' 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('filename') 14 | args = parser.parse_args() 15 | fname = args.filename 16 | 17 | config = { 18 | 'n_rows': { 19 | 'vitalPeriodic': 146_671_642, 20 | 'vitalAperiodic': 25_075_074, 21 | } 22 | } 23 | 24 | def _read_events(fname, t_cols, chunksize): 25 | """ 26 | A helper function to read csv in chunks 27 | Arguments: 28 | - fname is the file name (e.g. INPUTEVENTS) 29 | - t_cols is a list that contains the names of the time columns that should be parsed 30 | - chunksize is the size of each chunk 31 | """ 32 | n_rows = config['n_rows'][fname] 33 | with tqdm(desc=fname, total=(n_rows//chunksize+1)) as pbar: 34 | for df in pd.read_csv(eicu_path + '{}.csv'.format(fname), parse_dates=t_cols, chunksize=chunksize): 35 | pbar.update() 36 | yield df 37 | 38 | 39 | 40 | 41 | df_V = [] 42 | for i, df in enumerate(_read_events(fname, [], chunksize=1000000)): 43 | df = df.iloc[:,1:].set_index(['patientunitstayid', 'observationoffset']) 44 | df.columns.name = 'variable_name' 45 | df = df.stack() 46 | df.name = 'variable_value' 47 | df = df.reset_index() 48 | df_V.append(df) 49 | if i % 20 == 0: 50 | df_out = pd.concat(df_V, ignore_index=True) 51 | df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False) 52
| 53 | df_out = pd.concat(df_V, ignore_index=True) 54 | df_out.columns = ['ID', 't', 'variable_name', 'variable_value'] 55 | df_out = df_out.groupby(['ID', 't', 'variable_name']).median().reset_index() # Drop duplicates and keep the median value 56 | df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False) 57 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/prepare_data.py: -------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' #read from here 2 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' #save here 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | import pandas as pd 9 | import pickle 10 | import os 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--outcome', type=str, required=True) 15 | parser.add_argument('--T', type=float, required=True) 16 | args = parser.parse_args() 17 | outcome, T = args.outcome, args.T 18 | 19 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' 20 | pop_path = 'population/{}_{}h.csv'.format(outcome, T) 21 | 22 | pop = pd.read_csv(data_path + pop_path) 23 | data = [] 24 | 25 | for i, filename in enumerate(reversed(sorted(os.listdir(data_path + 'extracted/')))): 26 | if filename.endswith(".parquet") or filename.endswith(".pickle"): 27 | print('___', filename, '___', flush=True) 28 | 29 | if filename.endswith(".parquet"): df = pd.read_parquet(data_path + 'extracted/' + filename) 30 | else: df = pickle.load(open(data_path + 'extracted/' + filename, 'rb')) 31 | 32 | #subsetting population 33 | print('rows: ', df.shape[0]) 34 | df = df[df.ID.isin(pop.ID)] 35 | print('rows after subsetting population: ', df.shape[0]) 36 | 37 | #subsetting time 38 | df = df[((df.t >= 0) & (df.t < T*60)) | np.isnan(df.t)] 39 | print('rows after subsetting time: ', df.shape[0]) 40 | 41 | df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore') 42 | data.append(df) 43 | del df 44 | 45 | data = pd.concat(data) 46 | print(data.shape) 47 | print(data.head()) 48 | 49 | print('Number of unique variable_names:', data['variable_name'].nunique()) 50 | print('Number of rows:', len(data)) 51 | 52 | # Remove duplicate rows and recording any duplicates and inconsistencies 53 | data = data.drop_duplicates(subset=['ID', 't', 'variable_name'], keep='first') 54 | data = data.sort_values(by=['ID', 't', 'variable_name']) 55 | print('Number of rows after removing duplicate rows:', len(data)) 56 | 57 | data.to_csv(data_path + 'features/{}_{}h/input_data.csv'.format(outcome, T), index=False) 58 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/prepare_data.sh: -------------------------------------------------------------------------------- 1 | # python prepare_data_mortality.py 2>&1 | tee log/prepare_data_mortality.log 2 | # python prepare_data.py --outcome='ARF' --T='4.0' | tee log/prepare_data_ARF_4h.log 3 | # python prepare_data.py --outcome='Shock' --T='4.0' | tee log/prepare_data_Shock_4h.log 4 | python prepare_data.py --outcome='ARF' --T='12.0' | tee log/prepare_data_ARF_12h.log 5 | python prepare_data.py --outcome='Shock' --T='12.0' | tee log/prepare_data_Shock_12h.log 6 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/prepare_data_mortality.py: 
-------------------------------------------------------------------------------- 1 | eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/' #read from here 2 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' #save here 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | import pandas as pd 9 | import pickle 10 | import os 11 | 12 | data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/' 13 | pop_path = 'population/mortality_48h.csv' 14 | 15 | pop = pd.read_csv(data_path + pop_path) 16 | T = 48.0 17 | data = [] 18 | 19 | for i, filename in enumerate(reversed(sorted(os.listdir(data_path + 'extracted/')))): 20 | if filename.endswith(".parquet") or filename.endswith(".pickle"): 21 | print('___', filename, '___', flush=True) 22 | 23 | if filename.endswith(".parquet"): df = pd.read_parquet(data_path + 'extracted/' + filename) 24 | else: df = pickle.load(open(data_path + 'extracted/' + filename, 'rb')) 25 | 26 | #subsetting population 27 | print('rows: ', df.shape[0]) 28 | df = df[df.ID.isin(pop.ID)] 29 | print('rows after subsetting population: ', df.shape[0]) 30 | 31 | #subsetting time 32 | df = df[((df.t >= 0) & (df.t < T*60)) | np.isnan(df.t)] 33 | print('rows after subsetting time: ', df.shape[0]) 34 | 35 | df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore') 36 | data.append(df) 37 | del df 38 | 39 | data = pd.concat(data) 40 | print(data.shape) 41 | print(data.head()) 42 | 43 | print('Number of unique variable_names:', data['variable_name'].nunique()) 44 | print('Number of rows:', len(data)) 45 | 46 | # Remove duplicate rows and recording any duplicates and inconsistencies 47 | data = data.drop_duplicates(subset=['ID', 't', 'variable_name'], keep='first') 48 | data = data.sort_values(by=['ID', 't', 'variable_name']) 49 | print('Number of rows after removing duplicate rows:', len(data)) 50 | 51 | data.to_csv(data_path + 'features/mortality/input_data.csv', index=False) 52 | -------------------------------------------------------------------------------- /eicu_experiments/2_apply_FIDDLE/run_make_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | DATAPATH="/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/" 5 | export PYTHONPATH="/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/" 6 | mkdir -p log 7 | 8 | OUTCOME=ARF 9 | T=240.0 10 | dt=60.0 11 | Th=4.0 12 | python -m FIDDLE.run \ 13 | --data_path="$DATAPATH/features/${OUTCOME}_${Th}h/" \ 14 | --population="$DATAPATH/population/${OUTCOME}_${Th}h.csv" \ 15 | --T=$T \ 16 | --dt=$dt \ 17 | --theta_1=0.001 \ 18 | --theta_2=0.001 \ 19 | --theta_freq=1 \ 20 | --stats_functions 'min' 'max' 'mean' \ 21 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 22 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 23 | 24 | 25 | # OUTCOME=mortality 26 | # T=48 27 | # dt=1.0 28 | # python -m FIDDLE.run \ 29 | # --data_path="$DATAPATH/features/mortality/" \ 30 | # --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 31 | # --T=$T \ 32 | # --dt=$dt \ 33 | # --theta_1=0.001 \ 34 | # --theta_2=0.001 \ 35 | # --theta_freq=1 \ 36 | # --stats_functions 'min' 'max' 'mean' \ 37 | # > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 38 | # 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 39 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/DataSummary.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from lib.data import _eICUReader\n", 12 | "import pandas as pd\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "timestep = 1.0" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Outcome ARF \t T 4\n", 35 | "Finish reading data \t 793.68 s\n", 36 | "s (138840, 717)\n", 37 | "X (138840, 4, 5854)\n", 38 | "\n", 39 | "Outcome Shock \t T 4\n", 40 | "Finish reading data \t 1193.06 s\n", 41 | "s (164333, 770)\n", 42 | "X (164333, 4, 6314)\n", 43 | "\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "for task in ['ARF', 'Shock']:\n", 49 | " for duration in [4]:\n", 50 | " print('Outcome', task, '\\t', 'T', duration)\n", 51 | " reader = _eICUReader(task, duration, timestep)\n", 52 | " print('s', reader.s.shape)\n", 53 | " print('X', reader.X.shape)\n", 54 | " print()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "scrolled": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "for task in ['ARF', 'Shock']:\n", 66 | " for duration in [12]:\n", 67 | " print('Outcome', task, 'T', duration)\n", 68 | " reader = _eICUReader(task, duration, timestep)\n", 69 | " print('s', reader.s.shape)\n", 70 | " print('X', reader.X.shape)\n", 71 | " print()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "Finish reading data \t 35.72 s\n", 84 | "s (11695, 97)\n", 85 | "X (11695, 48, 7411)\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "for task in ['mortality']:\n", 91 | " for duration in [48]:\n", 92 | " reader = _Mimic3Reader(task, duration, timestep)\n", 93 | " print('s', reader.s.shape)\n", 94 | " print('X', reader.X.shape)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.4" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4143 18 | 12.0: 4912 19 | 20 | Shock: 21 | 4.0 : 4620 22 | 12.0: 5597 23 | 24 | mortality: 25 | 48.0: 7508 26 | -------------------------------------------------------------------------------- 
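The `feature_dimension` entries above must match the width of the feature tensors actually produced by FIDDLE for each cohort and prediction horizon (as the top-level README notes, these numbers may need updating after re-extraction). A quick way to check the extracted dimensions; the paths below are placeholders for wherever the FIDDLE output lives:

```python
import sparse

# Placeholder paths: point these at the FIDDLE output for one cohort.
s = sparse.load_npz('../data/features/ARF_4.0h/s.npz')  # time-invariant features, shape (N, d)
X = sparse.load_npz('../data/features/ARF_4.0h/X.npz')  # time-dependent features, shape (N, L, D)
print(s.shape, X.shape)
```

Whether a `feature_dimension` entry should equal D alone or d + D depends on how `lib/data.py` fuses `s` with `X`; if the numbers disagree, update the config rather than the data.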
/eicu_experiments/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 | # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, y_score, task, T, dt, model_name): 54 | import pathlib 55 | pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(task, T, dt)).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = './output/outcome={}.T={}.dt={}/{}.test.npz'.format(task, T, dt, model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = 
self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, '{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/lib/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CNN_V3(nn.Module): 7 | """ 8 | Multilayer CNN with 1D convolutions 9 | """ 10 | def __init__( 11 | self, 12 | in_channels, 13 | L_in, 14 | output_size, 15 | depth=2, 16 | filter_size=3, 17 | n_filters=64, 18 | n_neurons=64, 19 | dropout=0.2, 20 | activation='relu', 21 | ): 22 | super().__init__() 23 | self.depth = depth 24 | if activation == 'relu': 25 | self.activation = F.relu 26 | elif activation == 'elu': 27 | self.activation = F.elu 28 | padding = int(np.floor(filter_size / 2)) 29 | 30 | if depth == 1: 31 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 32 | self.pool1 = nn.MaxPool1d(2, 2) 33 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 2), n_neurons) 34 | self.fc1_drop = nn.Dropout(dropout) 35 | self.fc2 = nn.Linear(n_neurons, 1) 36 | 37 | elif depth == 2: 38 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 39 | self.pool1 = nn.MaxPool1d(2, 2) 40 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 41 | self.pool2 = nn.MaxPool1d(2, 2) 42 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 4), n_neurons) 43 | self.fc1_drop = nn.Dropout(dropout) 44 | self.fc2 = nn.Linear(n_neurons, 1) 45 | 46 | elif depth == 3: 47 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 48 | self.pool1 = nn.MaxPool1d(2, 2) 49 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 50 | self.pool2 = nn.MaxPool1d(2, 2) 51 | self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 52 | self.pool3 = nn.MaxPool1d(2, 2) 53 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 8), n_neurons) 54 | self.fc1_drop = nn.Dropout(dropout) 55 | self.fc2 = nn.Linear(n_neurons, 1) 56 | 57 | def forward(self, x): 58 | # x: tensor (batch_size, L_in, in_channels) 59 | x = x.transpose(1,2) # swap time and feature axes 60 | 61 | x = self.pool1(self.activation(self.conv1(x))) 62 | if self.depth == 2 or self.depth == 3: 63 | x = 
self.pool2(self.activation(self.conv2(x))) 64 | if self.depth == 3: 65 | x = self.pool3(self.activation(self.conv3(x))) 66 | 67 | x = x.view(x.size(0), -1) # flatten 68 | x = self.activation(self.fc1_drop(self.fc1(x))) 69 | x = torch.sigmoid(self.fc2(x)) 70 | return x 71 | 72 | class RNN_V2(nn.Module): 73 | """ 74 | Multi-layer LSTM network 75 | """ 76 | def __init__( 77 | self, 78 | input_size, 79 | input_length, 80 | output_size, 81 | hidden_size=64, 82 | num_layers=1, 83 | dropout=0.0, 84 | n_neurons=64, 85 | activation='relu', 86 | ): 87 | super().__init__() 88 | if activation == 'relu': 89 | self.activation = F.relu 90 | elif activation == 'elu': 91 | self.activation = F.elu 92 | 93 | self.hidden_size = int(hidden_size) 94 | self.num_layers = int(num_layers) 95 | 96 | self.lstm = nn.LSTM(int(input_size), int(hidden_size), int(num_layers), batch_first=True) 97 | self.fc1 = nn.Linear(hidden_size, n_neurons) 98 | self.fc1_drop = nn.Dropout(dropout) 99 | self.fc2 = nn.Linear(n_neurons, output_size) 100 | 101 | def forward(self, x): 102 | # x: tensor (batch_size, T, input_size) 103 | # h_all: (batch_size, T, hidden_size) 104 | h_0, c_0 = self.init_hidden(x) 105 | h_all, (h_T, c_T) = self.lstm(x, (h_0, c_0)) 106 | output = h_T[-1] 107 | output = self.activation(self.fc1_drop(self.fc1(output))) 108 | output = torch.sigmoid(self.fc2(output)) 109 | return output 110 | 111 | def init_hidden(self, x): 112 | batch_size = x.size(0) 113 | return (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device), 114 | torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)) -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_deep.py: -------------------------------------------------------------------------------- 1 | # python run_deep.py --outcome=ARF --T=4 --dt=0.5 --model_type=CNN --cuda=7 2 | 3 | import sys, os, time, pickle, random 4 | import pandas as pd 5 | import numpy as np 6 | import pathlib 7 | pathlib.Path('log').mkdir(parents=True, exist_ok=True) 8 | 9 | import yaml 10 | with open('config.yaml') as f: 11 | config = yaml.load(f) 12 | 13 | ######## 14 | ## Constants 15 | data_path = config['data_path'] 16 | model_names = config['model_names'] 17 | 18 | budget = config['train']['budget'] # Number of randomized hyperparameter settings to try 19 | repeat = config['train']['repeat'] # 1 # number of restarts (with different seeds) for each setting 20 | epochs = config['train']['epochs'] # 15 # Max epochs for each setting 21 | 22 | # Feature dimensions 23 | dimensions = config['feature_dimension'] 24 | 25 | # Hyperparameter search space 26 | train_param_grid = { 27 | 'batch_size': [16, 32, 64, 128], 28 | 'lr': [1e-2, 1e-3, 1e-4], 29 | } 30 | CNN_param_grid = { 31 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 32 | 'depth': [1, 2],#, 3], 33 | 'filter_size': [1, 2, 3, 4], 34 | 'n_filters': [16, 32, 64, 128], 35 | 'n_neurons': [16, 32, 64, 128], 36 | 'activation': ['relu', 'elu'], 37 | } 38 | RNN_param_grid = { 39 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 40 | 'num_layers': [1, 2, 3], 41 | 'hidden_size': [16, 32, 64, 128], 42 | 'n_neurons': [16, 32, 64, 128], 43 | 'activation': ['relu', 'elu'], 44 | } 45 | 46 | training_params = {'batch_size', 'lr'} 47 | 48 | ######## 49 | 50 | import argparse 51 | 52 | parser = argparse.ArgumentParser(description='') 53 | 54 | parser.add_argument('--outcome', type=str, required=True) 55 | parser.add_argument('--T', type=float, required=True) 56 | parser.add_argument('--dt', type=float, 
required=True) 57 | parser.add_argument('--model_type', type=str, required=True) 58 | parser.add_argument('--cuda', type=int, default=7) 59 | parser.add_argument('--seed', type=int, default=42) 60 | 61 | args = parser.parse_args() 62 | 63 | task = args.outcome 64 | model_type = args.model_type 65 | 66 | T = float(args.T) 67 | dt = float(args.dt) 68 | L_in = int(np.floor(T / dt)) 69 | in_channels = dimensions[task][float(T)] 70 | 71 | import lib.models as models 72 | model_name = model_names[model_type] 73 | ModelClass = getattr(models, model_name) 74 | 75 | if model_type == 'CNN': 76 | param_grid = {**train_param_grid, **CNN_param_grid} 77 | elif model_type == 'RNN': 78 | param_grid = {**train_param_grid, **RNN_param_grid} 79 | else: 80 | assert False 81 | 82 | # Create checkpoint directories 83 | import pathlib 84 | pathlib.Path("./checkpoint/model={}.outcome={}.T={}.dt={}/".format(model_name, task, T, dt)).mkdir(parents=True, exist_ok=True) 85 | 86 | ## Data 87 | import lib.data as data 88 | if task == 'mortality': 89 | tr_loader, va_loader, te_loader = data.get_benchmark_splits(fuse=True) 90 | else: 91 | tr_loader, va_loader, te_loader = data.get_train_val_test(task, duration=T, timestep=dt, fuse=True) 92 | 93 | import torch 94 | from torch.utils.data import Dataset, DataLoader 95 | from sklearn.model_selection import StratifiedShuffleSplit 96 | 97 | # Set CUDA 98 | if args.cuda: 99 | torch.cuda.set_device(args.cuda) 100 | print('cuda', torch.cuda.current_device()) 101 | 102 | if args.seed: 103 | torch.manual_seed(args.seed) 104 | np.random.seed(args.seed) 105 | random.seed(args.seed) 106 | 107 | 108 | from lib.experiment import Experiment 109 | 110 | class MIMICExperiment(Experiment): 111 | def get_model_params(self, params): 112 | model = ModelClass( 113 | in_channels, L_in, 1, 114 | **{k:params[k] for k in params.keys() if k not in training_params} 115 | ) 116 | criterion = torch.nn.BCELoss() 117 | optimizer = torch.optim.Adam(model.parameters(), lr=params['lr']) 118 | return model, criterion, optimizer 119 | 120 | def get_data(self): 121 | return tr_loader, va_loader 122 | 123 | exp = MIMICExperiment( 124 | param_grid, name='model={}.outcome={}.T={}.dt={}'.format(model_name, task, T, dt), 125 | budget=budget, n_epochs=epochs, repeat=repeat, 126 | ) 127 | 128 | print('EXPERIMENT:', exp.name) 129 | 130 | df_search = exp.run() 131 | df_search.to_csv('./log/df_search.{}.csv'.format(exp.name), index=False) 132 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_deep_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | cuda=0 5 | 6 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,CNN.log' 7 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,RNN.log' 8 | 9 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,CNN.log' 10 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,RNN.log' 11 | 12 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,CNN.log' 13 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,RNN.log' 14 | 15 | python run_deep.py 
--outcome=Shock --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,CNN.log' 16 | python run_deep.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,RNN.log' 17 | 18 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,CNN.log' 19 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,RNN.log' 20 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_deep_eval.py: -------------------------------------------------------------------------------- 1 | import sys, os, time, pickle, random 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import yaml 6 | with open('config.yaml') as f: 7 | config = yaml.load(f) 8 | 9 | ######## 10 | ## Constants 11 | model_names = config['model_names'] 12 | training_params = {'batch_size', 'lr'} 13 | 14 | # Feature dimensions 15 | dimensions = config['feature_dimension'] 16 | 17 | ######## 18 | 19 | def main(task, T, dt, model_type): 20 | L_in = int(np.floor(T / dt)) 21 | in_channels = dimensions[task][T] 22 | 23 | import lib.models as models 24 | model_name = model_names[model_type] 25 | ModelClass = getattr(models, model_name) 26 | df_search = pd.read_csv('./log/df_search.model={}.outcome={}.T={}.dt={}.csv'.format(model_name, task, T, dt)) 27 | import lib.evaluate as evaluate 28 | best_model_info = evaluate.get_best_model_info(df_search) 29 | checkpoint, model = evaluate.load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params) 30 | 31 | 32 | import lib.data as data 33 | if task == 'mortality': 34 | te_loader = data.get_benchmark_test(fuse=True) 35 | else: 36 | te_loader = data.get_test(task, duration=T, timestep=dt, fuse=True) 37 | 38 | y_true, y_score = evaluate.get_test_predictions(model, te_loader, '{}_T={}_dt={}'.format(task, T, dt), model_name) 39 | evaluate.save_test_predictions(y_true, y_score, task, T, dt, model_name) 40 | 41 | from sklearn import metrics, utils 42 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 43 | fig = plt.figure(figsize=(5,5)) 44 | plt.xlabel('False Positive Rate') 45 | plt.ylabel('True Positive Rate') 46 | plt.xlim(0,1) 47 | plt.ylim(0,1) 48 | plt.plot([0,1], [0,1], ':') 49 | plt.plot(fpr, tpr, color='darkorange') 50 | plt.show() 51 | 52 | ## Bootstrapped 95% Confidence Interval 53 | # try: 54 | # yte_pred = clf.decision_function(Xte) 55 | # except AttributeError: 56 | # yte_pred = clf.predict_proba(Xte)[:,1] 57 | from sklearn.externals.joblib import Parallel, delayed 58 | from tqdm import tqdm_notebook as tqdm 59 | def func(i): 60 | yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i) 61 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 62 | 63 | test_scores = Parallel(n_jobs=16)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 64 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 65 | 66 | # idx = (np.abs(tpr - 0.5)).argmin() 67 | # y_pred = (y_score > thresholds[idx]) 68 | # metrics.roc_auc_score(y_true, y_score) 69 | 70 | precision, recall, thresholds_ = metrics.precision_recall_curve(y_true, y_score) 71 | fig = plt.figure(figsize=(5,5)) 72 | plt.xlabel('Recall') 73 | plt.ylabel('Precision') 74 | plt.xlim(0,1) 75 | plt.ylim(0,1) 76 | plt.plot(recall, 
precision, color='darkorange') 77 | plt.show() 78 | 79 | # target TPR = 50% 80 | idx = (np.abs(tpr - 0.5)).argmin() 81 | y_pred = (y_score > thresholds[idx]) 82 | metrics.roc_auc_score(y_true, y_score) 83 | 84 | pd.DataFrame([{ 85 | 'tpr': tpr[idx], 86 | 'fpr': fpr[idx], 87 | 'ppv': metrics.precision_score(y_true, y_pred), 88 | }]) 89 | -------------------------------------------------------------------------------- /eicu_experiments/3_ML_models/run_shallow_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | python3 run_shallow.py --outcome=ARF --T=4.0 --model_type=LR \ 7 | > >(tee 'log/outcome=ARF,T=4.0,dt=1.0,LR.out') \ 8 | 2> >(tee 'log/outcome=ARF,T=4.0,dt=1.0,LR.err' >&2) 9 | 10 | python3 run_shallow.py --outcome=Shock --T=4.0 --model_type=LR \ 11 | > >(tee 'log/outcome=Shock,T=4.0,dt=1.0,LR.out') \ 12 | 2> >(tee 'log/outcome=Shock,T=4.0,dt=1.0,LR.err' >&2) 13 | 14 | python3 run_shallow.py --outcome=ARF --T=4.0 --model_type=RF \ 15 | > >(tee 'log/outcome=ARF,T=4.0,dt=1.0,RF.out') \ 16 | 2> >(tee 'log/outcome=ARF,T=4.0,dt=1.0,RF.err' >&2) 17 | 18 | python3 run_shallow.py --outcome=Shock --T=4.0 --model_type=RF \ 19 | > >(tee 'log/outcome=Shock,T=4.0,dt=1.0,RF.out') \ 20 | 2> >(tee 'log/outcome=Shock,T=4.0,dt=1.0,RF.err' >&2) 21 | 22 | 23 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=ARF,T=12.0,dt=1.0,LR.log' 24 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=Shock,T=12.0,dt=1.0,LR.log' 25 | 26 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=ARF,T=12.0,dt=1.0,RF.log' 27 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=Shock,T=12.0,dt=1.0,RF.log' 28 | 29 | 30 | python run_shallow.py --outcome=mortality --T=48.0 --model_type=LR \ 31 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.out') \ 32 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.err' >&2) 33 | python run_shallow.py --outcome=mortality --T=48.0 --model_type=RF \ 34 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.out') \ 35 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.err' >&2) 36 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: FIDDLE-env 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - sparse 8 | - pandas 9 | - tqdm 10 | - pyyaml 11 | - scikit-learn 12 | - numpy 13 | - joblib 14 | - ipykernel 15 | - matplotlib 16 | prefix: /home/tangsp/miniconda3/envs/FIDDLE-env 17 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_mask+Dt/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_mask+Dt/config.yaml: 
-------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_mask+Dt/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = 
{}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_maskonly/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_maskonly/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_maskonly/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | 
parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_medianimpute/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = 
config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_medianimpute/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_medianimpute/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 
70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_noimpute/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | value_type_override = config['value_types'] 11 | 12 | parallel = True 13 | n_jobs = 72 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_noimpute/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | value_types: 9 | # enter the feature type that you would like to override in the following format: 10 | FIRST_WARDID: Categorical 11 | MedA: 12 | AMOUNT: Numeric 13 | ROUTE: Categorical 14 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_noimpute/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', 
type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_ordinal/config.py: 
-------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | ID_col = config['column_names']['ID'] 6 | var_col = config['column_names']['var_name'] 7 | val_col = config['column_names']['var_value'] 8 | t_col = config['column_names']['t'] 9 | 10 | use_ordinal_encoding = config['use_ordinal_encoding'] 11 | value_type_override = config['value_types'] 12 | 13 | parallel = True 14 | n_jobs = 72 15 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_ordinal/config.yaml: -------------------------------------------------------------------------------- 1 | # Customize table headers 2 | column_names: 3 | ID: ID 4 | t: t 5 | var_name: variable_name 6 | var_value: variable_value 7 | 8 | use_ordinal_encoding: yes 9 | 10 | value_types: 11 | # enter the feature type that you would like to override in the following format: 12 | FIRST_WARDID: Categorical 13 | MedA: 14 | AMOUNT: Numeric 15 | ROUTE: Categorical 16 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/FIDDLE_ordinal/run.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | import time 6 | import os 7 | 8 | import argparse 9 | from .helpers import str2bool 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('--T', type=float, required=True) 13 | parser.add_argument('--dt', type=float, required=True) 14 | parser.add_argument('--theta_1', type=float, default=0.001) 15 | parser.add_argument('--theta_2', type=float, default=0.001) 16 | parser.add_argument('--theta_freq', type=float, default=1.0) 17 | parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) 18 | parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) 19 | 20 | parser.add_argument('--data_path', type=str, required=True) 21 | parser.add_argument('--input_fname', type=str, required=False) 22 | parser.add_argument('--population', type=str, required=True) 23 | parser.add_argument('--N', type=int, required=False) 24 | parser.add_argument('--Ds', nargs='+', type=int) 25 | 26 | parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') 27 | parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') 28 | parser.set_defaults(prefilter=True, postfilter=True) 29 | 30 | args = parser.parse_args() 31 | 32 | data_path = args.data_path 33 | if not data_path.endswith('/'): 34 | data_path += '/' 35 | 36 | population = args.population 37 | T = int(args.T) 38 | dt = args.dt 39 | theta_1 = args.theta_1 40 | theta_2 = args.theta_2 41 | theta_freq = args.theta_freq 42 | stats_functions = args.stats_functions 43 | binarize = args.binarize 44 | 45 | df_population = pd.read_csv(population).set_index('ID') 46 | N = args.N or len(df_population) 47 | df_population = df_population.iloc[:args.N] 48 | L = int(np.floor(T/dt)) 49 | 50 | args.df_population = df_population 51 | args.N = N 52 | args.L = L 53 | args.parallel = parallel 54 | 55 | if args.input_fname and os.path.isfile(args.input_fname): 56 | input_fname = args.input_fname 57 | if input_fname.endswith('.p' or '.pickle'): 58 | df_data = pd.read_pickle(input_fname) 59 | elif input_fname.endswith('.csv'): 60 | df_data = 
pd.read_csv(input_fname) 61 | else: 62 | assert False 63 | elif os.path.isfile(data_path + 'input_data.p'): 64 | input_fname = data_path + 'input_data.p' 65 | df_data = pd.read_pickle(input_fname) 66 | elif os.path.isfile(data_path + 'input_data.pickle'): 67 | input_fname = data_path + 'input_data.pickle' 68 | df_data = pd.read_pickle(input_fname) 69 | elif os.path.isfile(data_path + 'input_data.csv'): 70 | input_fname = data_path + 'input_data.csv' 71 | df_data = pd.read_csv(input_fname) 72 | 73 | 74 | from .steps import * 75 | 76 | print('Input data file:', input_fname) 77 | print() 78 | print('Input arguments:') 79 | print(' {:<6} = {}'.format('T', T)) 80 | print(' {:<6} = {}'.format('dt', dt)) 81 | print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) 82 | print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) 83 | print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) 84 | print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) 85 | print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize])) 86 | print() 87 | print('N = {}'.format(N)) 88 | print('L = {}'.format(L)) 89 | print('', flush=True) 90 | 91 | 92 | ###### 93 | # Main 94 | ###### 95 | if args.prefilter: 96 | print_header('1) Pre-filter') 97 | df_data = pre_filter(df_data, theta_1, df_population, args) 98 | df_data.to_csv(data_path + 'pre-filtered.csv', index=False) 99 | 100 | print_header('2) Transform; 3) Post-filter') 101 | df_data, df_types = detect_variable_data_type(df_data, value_type_override, args) 102 | df_time_invariant, df_time_series = split_by_timestamp_type(df_data) 103 | 104 | # Process time-invariant data 105 | s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args) 106 | 107 | # Process time-dependent data 108 | X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args) 109 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/impute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.1 7 | θ₂ = 0.1 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.1) 22 | Total variables : 5405 23 | Rare variables : 4400 24 | Remaining variables : 1005 25 | # rows (original) : 33684409 26 | # rows (filtered) : 30906331 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/impute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate 
time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 993 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 30803407 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.103816 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.513918 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 115 61 | Correlated : 3 62 | Time elapsed: 1.529421 seconds 63 | 64 | Output 65 | s: shape=(8577, 34), density=0.267 66 | Total time: 1.560092 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 993 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 987 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 379890514 out of 8577×48×987=406343952 total 84 | 85 | (N × L × ^D) table : (8577, 48, 1023) 86 | Time elapsed: 282.528516 seconds 87 | Discretizing features... 88 | 89 | Processing 1017 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
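The log line above reports FIDDLE's discretization step: each non-boolean numeric column is split into up to five equal-frequency (quintile) bins before being turned into binary indicators. A minimal sketch of that idea, assuming pandas and made-up values rather than the repo's steps.py implementation:

import pandas as pd
# hypothetical numeric measurements for one variable (not taken from the dataset)
values = pd.Series([36.5, 37.0, 37.2, 38.1, 36.8, 39.0, 37.5, 36.9, 38.4, 37.8])
bins = pd.qcut(values, q=5, duplicates='drop')   # five roughly equal-frequency bins
print(bins.value_counts().sort_index())          # two observations per quintile here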
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 3071), density=0.032 96 | Time elapsed: 508.261614 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 3071) 0.03224463116576165 102 | Original : 3071 103 | Nearly-constant: 1598 104 | *** time: 649.6199653148651 105 | Correlated : 75 106 | *** time: 1411.193752527237 107 | 108 | Output 109 | X: shape=(8577, 48, 1398), density=0.060 110 | (8577, 48, 1398) 0.059674307789588654 111 | Time elapsed: 1919.462684 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 1398), density=0.060 115 | Total time: 1943.168096 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.138054 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.914684 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.935639 
seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.982121 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3881) 86 | Time elapsed: 2208.715729 seconds 87 | Discretizing features... 88 | 89 | Processing 3875 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9641), density=0.009 96 | Time elapsed: 6289.449163 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9641) 0.009134210670151034 102 | Original : 9641 103 | Nearly-constant: 2072 104 | *** time: 4570.7659068107605 105 | Correlated : 334 106 | *** time: 9981.303814411163 107 | 108 | Output 109 | X: shape=(8577, 48, 7235), density=0.011 110 | (8577, 48, 7235) 0.010731039202925532 111 | Time elapsed: 16270.757611 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7235), density=0.011 115 | Total time: 16299.821350 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | 
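For the maskonly ablation whose log this is, the time-dependent transform keeps only whether a variable was recorded in each bin (the imputed-entry counts below are zero) rather than any carried-forward value. A minimal sketch, assuming pandas/numpy and a hypothetical hourly series rather than the repo's own code, of deriving such a presence mask:

import numpy as np
import pandas as pd
# hypothetical hourly heart-rate bins; NaN means no measurement in that hour
hr = pd.Series([80.0, np.nan, np.nan, 92.0, np.nan, 88.0])
mask = hr.notna().astype(int)   # 1 if any value was recorded in the bin, else 0
print(mask.tolist())            # [1, 0, 0, 1, 0, 1]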
-------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.144644 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.576981 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.595947 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.633576 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 0 82 | (freq) number of not imputed entries : 0 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3875) 86 | Time elapsed: 1895.090169 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
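The next step in the log, converting variables to binary features, one-hot encodes each binned or categorical column. A minimal sketch of that conversion, assuming pandas and a hypothetical binned column (not the repo's own code):

import pandas as pd
binned = pd.Series(['Q1', 'Q3', 'Q5', 'Q3', 'Q1'], name='HR_bin')  # hypothetical quintile labels
onehot = pd.get_dummies(binned, prefix='HR')   # one 0/1 indicator column per observed bin
print(onehot)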
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9635), density=0.009 96 | Time elapsed: 6725.114749 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9635) 0.008517169182244537 102 | Original : 9635 103 | Nearly-constant: 2066 104 | *** time: 3142.791482448578 105 | Correlated : 334 106 | *** time: 7060.699452161789 107 | 108 | Output 109 | X: shape=(8577, 48, 7235), density=0.011 110 | (8577, 48, 7235) 0.010731039202925532 111 | Time elapsed: 13785.849776 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7235), density=0.011 115 | Total time: 13810.326319 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.1 7 | θ₂ = 0.1 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.1) 22 | Total variables : 5405 23 | Rare variables : 4400 24 | Remaining variables : 1005 25 | # rows (original) : 33684409 26 | # rows (filtered) : 30906331 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 993 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 30803407 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.111821 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.643674 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 115 61 | Correlated : 3 62 | Time elapsed: 
1.659079 seconds 63 | 64 | Output 65 | s: shape=(8577, 34), density=0.267 66 | Total time: 1.689984 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 993 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 987 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 379890514 out of 8577×48×987=406343952 total 84 | 85 | (N × L × ^D) table : (8577, 48, 1023) 86 | Time elapsed: 599.909334 seconds 87 | Discretizing features... 88 | 89 | Processing 1017 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 3071), density=0.032 96 | Time elapsed: 882.033273 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 3071) 0.03243142271843646 102 | Original : 3071 103 | Nearly-constant: 1602 104 | *** time: 656.6613774299622 105 | Correlated : 72 106 | *** time: 1418.9878075122833 107 | 108 | Output 109 | X: shape=(8577, 48, 1397), density=0.059 110 | (8577, 48, 1397) 0.05935146891854264 111 | Time elapsed: 2301.027611 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 1397), density=0.059 115 | Total time: 2324.529186 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 10000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | 
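In this nofreq variant, θ_freq is raised (10000.0 here) so that no variable qualifies as frequent, and the per-bin summary statistics are skipped entirely (the transform below reports an empty frequent-variable list). For contrast, a minimal sketch, assuming pandas and hypothetical data, of the k = 3 summaries (min, max, mean) that would otherwise be computed for a frequent variable in each dt-hour bin:

import pandas as pd
# hypothetical heart-rate measurements with timestamps in hours, dt = 1.0
df = pd.DataFrame({'t': [0.2, 0.7, 1.5, 2.1, 2.9], 'HR': [80, 84, 90, 88, 86]})
df['bin'] = (df['t'] // 1.0).astype(int)                    # hourly bin index
print(df.groupby('bin')['HR'].agg(['min', 'max', 'mean']))  # the k = 3 statistics per bin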
-------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.134358 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.595397 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.615305 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.661497 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×48×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 1561455085 out of 8577×48×3869=1592851824 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3869) 86 | Time elapsed: 1273.130074 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9629), density=0.008 96 | Time elapsed: 7504.357979 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9629) 0.007920027846763844 102 | Original : 9629 103 | Nearly-constant: 2066 104 | *** time: 3209.2778713703156 105 | Correlated : 334 106 | *** time: 7071.120953321457 107 | 108 | Output 109 | X: shape=(8577, 48, 7229), density=0.010 110 | (8577, 48, 7229) 0.009937486747645477 111 | Time elapsed: 14575.489017 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7229), density=0.010 115 | Total time: 14597.368192 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 12.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1000000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 4 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.234322 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.939724 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time 
elapsed: 2.005332 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 2.075384 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×4×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 122456605 out of 8577×4×3869=132737652 total 84 | 85 | (N × L × ^D) table : (8577, 4, 3869) 86 | Time elapsed: 442.025943 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 4, 9625), density=0.031 96 | Time elapsed: 593.201395 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 4, 9625) 0.031134450485971996 102 | Original : 9625 103 | Nearly-constant: 2171 104 | *** time: 423.86314630508423 105 | Correlated : 333 106 | *** time: 831.847085237503 107 | 108 | Output 109 | X: shape=(8577, 4, 7121), density=0.040 110 | (8577, 4, 7121) 0.04025364075537859 111 | Time elapsed: 1425.054848 seconds 112 | 113 | Output 114 | X: shape=(8577, 4, 7121), density=0.040 115 | Total time: 1430.915702 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 4.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 100000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 12 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=4.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant 
and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.161524 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.528967 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 1.584104 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.633917 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×12×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 377580723 out of 8577×12×3869=398212956 total 84 | 85 | (N × L × ^D) table : (8577, 12, 3869) 86 | Time elapsed: 533.026957 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 12, 9661), density=0.021 96 | Time elapsed: 1533.176079 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 12, 9661) 0.020749493283425048 102 | Original : 9661 103 | Nearly-constant: 2121 104 | *** time: 1299.0333437919617 105 | Correlated : 332 106 | *** time: 2987.5951042175293 107 | 108 | Output 109 | X: shape=(8577, 12, 7208), density=0.027 110 | (8577, 12, 7208) 0.02676353172417211 111 | Time elapsed: 4520.922278 seconds 112 | 113 | Output 114 | X: shape=(8577, 12, 7208), density=0.027 115 | Total time: 4537.501810 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 48.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1000000.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 1 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.115656 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.586363 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time 
elapsed: 1.605498 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.643156 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : [] 74 | M₁ = 0 75 | M₂ = 3869 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 0.0 out of 8577×1×0=0 total 81 | (freq) number of imputed entries : 0.0 82 | (freq) number of not imputed entries : 0.0 83 | (non-freq) number of missing entries : 29214964 out of 8577×1×3869=33184413 total 84 | 85 | (N × L × ^D) table : (8577, 1, 3869) 86 | Time elapsed: 421.370643 seconds 87 | Discretizing features... 88 | 89 | Processing 3869 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 1, 9433), density=0.049 96 | Time elapsed: 502.070196 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 1, 9433) 0.04906196992662215 102 | Original : 9433 103 | Nearly-constant: 2211 104 | *** time: 139.90465545654297 105 | Correlated : 347 106 | *** time: 263.2418613433838 107 | 108 | Output 109 | X: shape=(8577, 1, 6875), density=0.064 110 | (8577, 1, 6875) 0.0636226525485707 111 | Time elapsed: 765.346615 seconds 112 | 113 | Output 114 | X: shape=(8577, 1, 6875), density=0.064 115 | Total time: 767.538363 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.1 7 | θ₂ = 0.1 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.1) 22 | Total variables : 5405 23 | Rare variables : 4400 24 | Remaining variables : 1005 25 | # rows (original) : 33684409 26 | # rows (filtered) : 30906331 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and 
time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 993 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 30803407 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.103230 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.321447 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 115 61 | Correlated : 3 62 | Time elapsed: 1.335476 seconds 63 | 64 | Output 65 | s: shape=(8577, 34), density=0.267 66 | Total time: 1.366069 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 993 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 987 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 379890514 out of 8577×48×987=406343952 total 84 | 85 | (N × L × ^D) table : (8577, 48, 1023) 86 | Time elapsed: 459.223514 seconds 87 | Discretizing features... 88 | 89 | Processing 1017 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 3071), density=0.032 96 | Time elapsed: 697.539552 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 3071) 0.03193670873492995 102 | Original : 3071 103 | Nearly-constant: 1598 104 | *** time: 651.8644843101501 105 | Correlated : 76 106 | *** time: 1414.6409630775452 107 | 108 | Output 109 | X: shape=(8577, 48, 1397), density=0.058 110 | (8577, 48, 1397) 0.05843776333619845 111 | Time elapsed: 2112.187592 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 1397), density=0.058 115 | Total time: 2135.861422 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.256884 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 2.509025 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 55 61 | Correlated : 3 62 | Time elapsed: 2.586198 
seconds 63 | 64 | Output 65 | s: shape=(8577, 94), density=0.140 66 | Total time: 2.659438 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3899) 86 | Time elapsed: 1602.799133 seconds 87 | Discretizing features... 88 | 89 | Processing 3893 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9950), density=0.016 96 | Time elapsed: 5116.279016 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9950) 0.015691314360071314 102 | Original : 9950 103 | Nearly-constant: 1642 104 | *** time: 5924.276056051254 105 | Correlated : 359 106 | *** time: 12414.71133685112 107 | 108 | Output 109 | X: shape=(8577, 48, 7949), density=0.018 110 | (8577, 48, 7949) 0.017977339510562455 111 | Time elapsed: 17531.153291 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7949), density=0.018 115 | Total time: 17576.561756 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/theta=0.001,medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/medianimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | 
-------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.617579 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 3.354901 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | Time elapsed: 3.446176 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 3.512502 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3899) 86 | Time elapsed: 1575.769876 seconds 87 | Discretizing features... 88 | 89 | Processing 3893 non-boolean variable columns... 90 | Binning numeric variables by quintile... 
91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9719), density=0.011 96 | Time elapsed: 6665.714383 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9719) 0.010910535315619061 102 | Original : 9719 103 | Nearly-constant: 2072 104 | *** time: 3580.4881682395935 105 | Correlated : 340 106 | *** time: 8112.426522254944 107 | 108 | Output 109 | X: shape=(8577, 48, 7307), density=0.013 110 | (8577, 48, 7307) 0.012678978253797912 111 | Time elapsed: 14778.144839 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7307), density=0.013 115 | Total time: 14806.507060 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/log/theta=0.001,noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: /data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.001 7 | θ₂ = 0.001 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.001) 22 | Total variables : 5405 23 | Rare variables : 1524 24 | Remaining variables : 3881 25 | # rows (original) : 33684409 26 | # rows (filtered) : 33661000 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 3869 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 33558076 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.112251 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.584486 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 53 61 | Correlated : 3 62 | 
Time elapsed: 1.613765 seconds 63 | 64 | Output 65 | s: shape=(8577, 96), density=0.116 66 | Total time: 1.665902 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 3869 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 3863 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 1561275820 out of 8577×48×3863=1590381648 total 84 | 85 | (N × L × ^D) table : (8577, 48, 3899) 86 | Time elapsed: 1101.718930 seconds 87 | Discretizing features... 88 | 89 | Processing 3893 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 9719), density=0.011 96 | Time elapsed: 3553.558418 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 9719) 0.010754216080785386 102 | Original : 9719 103 | Nearly-constant: 2072 104 | *** time: 3614.3632407188416 105 | Correlated : 341 106 | *** time: 7846.658187866211 107 | 108 | Output 109 | X: shape=(8577, 48, 7306), density=0.012 110 | (8577, 48, 7306) 0.012370484208457008 111 | Time elapsed: 11400.220244 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 7306), density=0.012 115 | Total time: 11427.416120 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_mask+Dt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_mask+Dt.run \ 9 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/mask+Dt,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_maskonly.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_maskonly.run \ 9 | 
--input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/maskonly,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_nofreq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../../FIDDLE/" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | mkdir -p "../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0/" 8 | mkdir -p "../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0/" 9 | 10 | # python -m FIDDLE.run \ 11 | # --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 12 | # --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 13 | # --data_path="../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 14 | # --T=48.0 \ 15 | # --dt=1.0 \ 16 | # --theta_1=0.001 \ 17 | # --theta_2=0.001 \ 18 | # --theta_freq=1000000 \ 19 | # --stats_functions 'min' 'max' 'mean' \ 20 | # > >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 21 | # 2> >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 22 | 23 | python -m FIDDLE.run \ 24 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 25 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 26 | --data_path="../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0/" \ 27 | --T=48.0 \ 28 | --dt=12.0 \ 29 | --theta_1=0.001 \ 30 | --theta_2=0.001 \ 31 | --theta_freq=1000000 \ 32 | --stats_functions 'min' 'max' 'mean' \ 33 | > >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.out') \ 34 | 2> >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=12.0.err' >&2) & 35 | 36 | python -m FIDDLE.run \ 37 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 38 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 39 | --data_path="../data/features,ablations/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0/" \ 40 | --T=48.0 \ 41 | --dt=48.0 \ 42 | --theta_1=0.001 \ 43 | --theta_2=0.001 \ 44 | --theta_freq=1000000 \ 45 | --stats_functions 'min' 'max' 'mean' \ 46 | > >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.out') \ 47 | 2> >(tee 'log/nofreq,benchmark,outcome=mortality,T=48.0,dt=48.0.err' >&2) & 48 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_noimpute.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_noimpute.run \ 9 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/noimpute,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/2_apply_FIDDLE/run_mortality_ordinal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="./" 5 | mkdir -p log 6 | mkdir -p "../data/features,ablations/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0/" 7 | 8 | python -m FIDDLE_ordinal.run \ 9 | --input_fname="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 10 | --population="/data4/tangsp/FIDDLE/mimic3_experiments/data/processed/population/pop.mortality_benchmark.csv" \ 11 | --data_path="../data/features,ablations/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 12 | --T=48.0 \ 13 | --dt=1.0 \ 14 | --theta_1=0.001 \ 15 | --theta_2=0.001 \ 16 | --theta_freq=1 \ 17 | --stats_functions 'min' 'max' 'mean' \ 18 | > >(tee 'log/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 19 | 2> >(tee 'log/ordinal,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 20 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/processed/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4143 18 | 12.0: 4912 19 | 20 | Shock: 21 | 4.0 : 4620 22 | 12.0: 5597 23 | 24 | mortality: 25 | 48.0: 7508 26 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 
| # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, y_score, model_name, save_dir): 54 | import pathlib 55 | pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = save_dir + '/{}.test.npz'.format(model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, 
'{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /mimic3_ablations/3_ML_models/lib/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CNN_V3(nn.Module): 7 | """ 8 | Multilayer CNN with 1D convolutions 9 | """ 10 | def __init__( 11 | self, 12 | in_channels, 13 | L_in, 14 | output_size, 15 | depth=2, 16 | filter_size=3, 17 | n_filters=64, 18 | n_neurons=64, 19 | dropout=0.2, 20 | activation='relu', 21 | ): 22 | super().__init__() 23 | self.depth = depth 24 | if activation == 'relu': 25 | self.activation = F.relu 26 | elif activation == 'elu': 27 | self.activation = F.elu 28 | padding = int(np.floor(filter_size / 2)) 29 | 30 | if depth == 1: 31 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 32 | self.pool1 = nn.MaxPool1d(2, 2) 33 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 2), n_neurons) 34 | self.fc1_drop = nn.Dropout(dropout) 35 | self.fc2 = nn.Linear(n_neurons, 1) 36 | 37 | elif depth == 2: 38 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 39 | self.pool1 = nn.MaxPool1d(2, 2) 40 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 41 | self.pool2 = nn.MaxPool1d(2, 2) 42 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 4), n_neurons) 43 | self.fc1_drop = nn.Dropout(dropout) 44 | self.fc2 = nn.Linear(n_neurons, 1) 45 | 46 | elif depth == 3: 47 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 48 | self.pool1 = nn.MaxPool1d(2, 2) 49 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 50 | self.pool2 = nn.MaxPool1d(2, 2) 51 | self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 52 | self.pool3 = nn.MaxPool1d(2, 2) 53 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 8), n_neurons) 54 | self.fc1_drop = nn.Dropout(dropout) 55 | self.fc2 = nn.Linear(n_neurons, 1) 56 | 57 | def forward(self, x): 58 | # x: tensor (batch_size, L_in, in_channels) 59 | x = x.transpose(1,2) # swap time and feature axes 60 | 61 | x = self.pool1(self.activation(self.conv1(x))) 62 | if self.depth == 2 or self.depth == 3: 63 | x = self.pool2(self.activation(self.conv2(x))) 64 | if self.depth == 3: 65 | x = self.pool3(self.activation(self.conv3(x))) 66 | 67 | x = x.view(x.size(0), -1) # flatten 68 | x = self.activation(self.fc1_drop(self.fc1(x))) 69 | x = torch.sigmoid(self.fc2(x)) 70 | return x 71 | 72 | class RNN_V2(nn.Module): 73 | """ 74 | Multi-layer LSTM network 75 | """ 76 | def __init__( 77 | self, 78 | input_size, 79 | input_length, 80 | output_size, 81 | hidden_size=64, 82 | num_layers=1, 83 | dropout=0.0, 84 | n_neurons=64, 85 | activation='relu', 86 | ): 87 | super().__init__() 88 | if activation == 'relu': 89 | self.activation = F.relu 90 | elif activation == 'elu': 91 | self.activation = F.elu 92 | 93 | self.hidden_size = int(hidden_size) 94 | self.num_layers = int(num_layers) 95 | 96 | self.lstm = nn.LSTM(int(input_size), int(hidden_size), int(num_layers), batch_first=True) 97 | self.fc1 = nn.Linear(hidden_size, n_neurons) 98 | 
self.fc1_drop = nn.Dropout(dropout) 99 | self.fc2 = nn.Linear(n_neurons, output_size) 100 | 101 | def forward(self, x): 102 | # x: tensor (batch_size, T, input_size) 103 | # h_all: (batch_size, T, hidden_size) 104 | h_0, c_0 = self.init_hidden(x) 105 | h_all, (h_T, c_T) = self.lstm(x, (h_0, c_0)) 106 | output = h_T[-1] 107 | output = self.activation(self.fc1_drop(self.fc1(output))) 108 | output = torch.sigmoid(self.fc2(output)) 109 | return output 110 | 111 | def init_hidden(self, x): 112 | batch_size = x.size(0) 113 | return (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device), 114 | torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)) -------------------------------------------------------------------------------- /mimic3_comparisons/2_apply_FIDDLE/log/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.out: -------------------------------------------------------------------------------- 1 | Input data file: ../data/processed//features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p 2 | 3 | Input arguments: 4 | T = 48 5 | dt = 1.0 6 | θ₁ = 0.4 7 | θ₂ = 0.4 8 | θ_freq = 1.0 9 | k = 3 ['min', 'max', 'mean'] 10 | binarize = yes 11 | 12 | N = 8577 13 | L = 48 14 | 15 | 16 | ================================================================================ 17 | 1) Pre-filter 18 | ================================================================================ 19 | Remove rows not in population 20 | Remove rows with t outside of [0, 48] 21 | Remove rare variables (<= 0.4) 22 | Total variables : 5405 23 | Rare variables : 4965 24 | Remaining variables : 440 25 | # rows (original) : 33684409 26 | # rows (filtered) : 24750654 27 | 28 | ================================================================================ 29 | 2) Transform; 3) Post-filter 30 | ================================================================================ 31 | 32 | -------------------------------------------------------------------------------- 33 | *) Detecting value types 34 | -------------------------------------------------------------------------------- 35 | Saved as: ../data/processed//features,comparison/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0/value_types.csv 36 | 37 | -------------------------------------------------------------------------------- 38 | *) Separate time-invariant and time-dependent 39 | -------------------------------------------------------------------------------- 40 | Variables (time-invariant): 12 41 | Variables (time-dependent): 428 42 | # rows (time-invariant): 102924 43 | # rows (time-dependent): 24647730 44 | 45 | -------------------------------------------------------------------------------- 46 | 2-A) Transform time-invariant data 47 | -------------------------------------------------------------------------------- 48 | (N × ^d) table : (8577, 12) 49 | number of missing entries : 374 out of 102924 total 50 | Time elapsed: 0.120688 seconds 51 | 52 | Output 53 | s_all, binary features : (8577, 152) 54 | Time elapsed: 1.519710 seconds 55 | 56 | -------------------------------------------------------------------------------- 57 | 3-A) Post-filter time-invariant data 58 | -------------------------------------------------------------------------------- 59 | Original : 152 60 | Nearly-constant: 147 61 | Correlated : 1 62 | Time elapsed: 1.528469 seconds 63 | 64 | Output 65 | s: shape=(8577, 4), density=0.487 66 | Total time: 1.539292 seconds 67 | 68 | 69 | -------------------------------------------------------------------------------- 
70 | 2-B) Transform time-dependent data 71 | -------------------------------------------------------------------------------- 72 | Total variables : 428 73 | Frequent variables : ['220048', 'DiaBP', 'HR', 'RR', 'SpO2', 'SysBP'] 74 | M₁ = 6 75 | M₂ = 422 76 | k = 3 ['min', 'max', 'mean'] 77 | 78 | Transforming each example... 79 | DONE: Transforming each example... 80 | (freq) number of missing entries : 179265 out of 8577×48×6=2470176 total 81 | (freq) number of imputed entries : 97328 82 | (freq) number of not imputed entries : 81937 83 | (non-freq) number of missing entries : 153244621 out of 8577×48×422=173735712 total 84 | 85 | (N × L × ^D) table : (8577, 48, 458) 86 | Time elapsed: 334.029715 seconds 87 | Discretizing features... 88 | 89 | Processing 452 non-boolean variable columns... 90 | Binning numeric variables by quintile... 91 | Converting variables to binary features 92 | Finished discretizing features 93 | 94 | Output 95 | X_all: shape=(8577, 48, 1536), density=0.055 96 | Time elapsed: 546.195059 seconds 97 | 98 | -------------------------------------------------------------------------------- 99 | 3-B) Post-filter time-dependent data 100 | -------------------------------------------------------------------------------- 101 | (8577, 48, 1536) 0.055039622556247 102 | Original : 1536 103 | Nearly-constant: 998 104 | *** time: 282.423082113266 105 | Correlated : 16 106 | *** time: 628.0092172622681 107 | 108 | Output 109 | X: shape=(8577, 48, 522), density=0.128 110 | (8577, 48, 522) 0.1278138950795223 111 | Time elapsed: 1174.211754 seconds 112 | 113 | Output 114 | X: shape=(8577, 48, 522), density=0.128 115 | Total time: 1197.517308 seconds 116 | 117 | -------------------------------------------------------------------------------- /mimic3_comparisons/2_apply_FIDDLE/run_mortality_dt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../FIDDLE/" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p log 7 | mkdir -p "$DATAPATH/features,comparison/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 8 | mkdir -p "$DATAPATH/features,comparison/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 9 | mkdir -p "$DATAPATH/features,comparison/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 10 | mkdir -p "$DATAPATH/features,comparison/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" 11 | 12 | python -m FIDDLE.run \ 13 | --output_dir="$DATAPATH/features,comparison/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 14 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 15 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 16 | --T=48.0 \ 17 | --dt=48.0 \ 18 | --theta_1=0.001 \ 19 | --theta_2=0.001 \ 20 | --theta_freq=1 \ 21 | --stats_functions 'min' 'max' 'mean' \ 22 | > >(tee 'log/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 23 | 2> >(tee 'log/dt=48.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 24 | 25 | python -m FIDDLE.run \ 26 | --output_dir="$DATAPATH/features,comparison/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 27 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 28 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 29 | --T=48.0 \ 30 | --dt=24.0 \ 31 | --theta_1=0.001 \ 32 | --theta_2=0.001 \ 33 | --theta_freq=1 \ 34 | --stats_functions 'min' 'max' 'mean' \ 35 | > 
>(tee 'log/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 36 | 2> >(tee 'log/dt=24.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 37 | 38 | python -m FIDDLE.run \ 39 | --output_dir="$DATAPATH/features,comparison/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 40 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 41 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 42 | --T=48.0 \ 43 | --dt=12.0 \ 44 | --theta_1=0.001 \ 45 | --theta_2=0.001 \ 46 | --theta_freq=1 \ 47 | --stats_functions 'min' 'max' 'mean' \ 48 | > >(tee 'log/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 49 | 2> >(tee 'log/dt=12.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 50 | 51 | python -m FIDDLE.run \ 52 | --output_dir="$DATAPATH/features,comparison/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 53 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 54 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 55 | --T=48.0 \ 56 | --dt=4.0 \ 57 | --theta_1=0.001 \ 58 | --theta_2=0.001 \ 59 | --theta_freq=1 \ 60 | --stats_functions 'min' 'max' 'mean' \ 61 | > >(tee 'log/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 62 | 2> >(tee 'log/dt=4.0,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) & 63 | -------------------------------------------------------------------------------- /mimic3_comparisons/2_apply_FIDDLE/run_mortality_theta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../FIDDLE/" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p log 7 | mkdir -p "$DATAPATH/features,comparison/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0/" 8 | mkdir -p "$DATAPATH/features,comparison/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0/" 9 | mkdir -p "$DATAPATH/features,comparison/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0/" 10 | mkdir -p "$DATAPATH/features,comparison/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0/" 11 | mkdir -p "$DATAPATH/features,comparison/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0/" 12 | 13 | python -m FIDDLE.run \ 14 | --output_dir="$DATAPATH/features,comparison/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 15 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 16 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 17 | --T=48.0 \ 18 | --dt=1.0 \ 19 | --theta_1=0.4 \ 20 | --theta_2=0.4 \ 21 | --theta_freq=1 \ 22 | --stats_functions 'min' 'max' 'mean' \ 23 | > >(tee 'log/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 24 | 2> >(tee 'log/theta=0.4,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 25 | 26 | python -m FIDDLE.run \ 27 | --output_dir="$DATAPATH/features,comparison/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 28 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 29 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 30 | --T=48.0 \ 31 | --dt=1.0 \ 32 | --theta_1=0.2 \ 33 | --theta_2=0.2 \ 34 | --theta_freq=1 \ 35 | --stats_functions 'min' 'max' 'mean' \ 36 | > >(tee 'log/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 37 | 2> >(tee 'log/theta=0.2,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 38 | 39 | python -m FIDDLE.run \ 40 | 
--output_dir="$DATAPATH/features,comparison/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 41 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 42 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 43 | --T=48.0 \ 44 | --dt=1.0 \ 45 | --theta_1=0.1 \ 46 | --theta_2=0.1 \ 47 | --theta_freq=1 \ 48 | --stats_functions 'min' 'max' 'mean' \ 49 | > >(tee 'log/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 50 | 2> >(tee 'log/theta=0.1,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 51 | 52 | python -m FIDDLE.run \ 53 | --output_dir="$DATAPATH/features,comparison/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 54 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 55 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 56 | --T=48.0 \ 57 | --dt=1.0 \ 58 | --theta_1=0.05 \ 59 | --theta_2=0.05 \ 60 | --theta_freq=1 \ 61 | --stats_functions 'min' 'max' 'mean' \ 62 | > >(tee 'log/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 63 | 2> >(tee 'log/theta=0.05,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 64 | 65 | python -m FIDDLE.run \ 66 | --output_dir="$DATAPATH/features,comparison/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 67 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 68 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 69 | --T=48.0 \ 70 | --dt=1.0 \ 71 | --theta_1=0.01 \ 72 | --theta_2=0.01 \ 73 | --theta_freq=1 \ 74 | --stats_functions 'min' 'max' 'mean' \ 75 | > >(tee 'log/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 76 | 2> >(tee 'log/theta=0.01,benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 77 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/processed/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4143 18 | 12.0: 4912 19 | 20 | Shock: 21 | 4.0 : 4620 22 | 12.0: 5597 23 | 24 | mortality: 25 | 48.0: 7508 26 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 | # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | 
model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, y_score, model_name, save_dir): 54 | import pathlib 55 | pathlib.Path(save_dir).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = save_dir + '/{}.test.npz'.format(model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, '{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 
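    # Experiment is an abstract random-search runner: ParameterSampler draws `budget`
    # hyperparameter settings, each setting is trained `repeat` times with a different
    # random seed, and one row per trial (best validation score, best iteration, seed,
    # checkpoint path) is appended to df_search. Concrete experiments subclass this and
    # implement get_model_params(params) and get_data(), e.g. MIMICExperiment in
    # mimic3_experiments/3_ML_models/run_deep.py.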
61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/lib/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CNN_V3(nn.Module): 7 | """ 8 | Multilayer CNN with 1D convolutions 9 | """ 10 | def __init__( 11 | self, 12 | in_channels, 13 | L_in, 14 | output_size, 15 | depth=2, 16 | filter_size=3, 17 | n_filters=64, 18 | n_neurons=64, 19 | dropout=0.2, 20 | activation='relu', 21 | ): 22 | super().__init__() 23 | self.depth = depth 24 | if activation == 'relu': 25 | self.activation = F.relu 26 | elif activation == 'elu': 27 | self.activation = F.elu 28 | padding = int(np.floor(filter_size / 2)) 29 | 30 | if depth == 1: 31 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 32 | self.pool1 = nn.MaxPool1d(2, 2) 33 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 2), n_neurons) 34 | self.fc1_drop = nn.Dropout(dropout) 35 | self.fc2 = nn.Linear(n_neurons, 1) 36 | 37 | elif depth == 2: 38 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 39 | self.pool1 = nn.MaxPool1d(2, 2) 40 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 41 | self.pool2 = nn.MaxPool1d(2, 2) 42 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 4), n_neurons) 43 | self.fc1_drop = nn.Dropout(dropout) 44 | self.fc2 = nn.Linear(n_neurons, 1) 45 | 46 | elif depth == 3: 47 | self.conv1 = nn.Conv1d(in_channels, n_filters, filter_size, padding=padding) 48 | self.pool1 = nn.MaxPool1d(2, 2) 49 | self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 50 | self.pool2 = nn.MaxPool1d(2, 2) 51 | self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size, padding=padding) 52 | self.pool3 = nn.MaxPool1d(2, 2) 53 | self.fc1 = nn.Linear(int(L_in * (n_filters) / 8), n_neurons) 54 | self.fc1_drop = nn.Dropout(dropout) 55 | self.fc2 = nn.Linear(n_neurons, 1) 56 | 57 | def forward(self, x): 58 | # x: tensor (batch_size, L_in, in_channels) 59 | x = x.transpose(1,2) # swap time and feature axes 60 | 61 | x = self.pool1(self.activation(self.conv1(x))) 62 | if self.depth == 2 or self.depth == 3: 63 | x = self.pool2(self.activation(self.conv2(x))) 64 | if self.depth == 3: 65 | x = self.pool3(self.activation(self.conv3(x))) 66 | 67 | x = x.view(x.size(0), -1) # flatten 68 | x = self.activation(self.fc1_drop(self.fc1(x))) 69 | x = torch.sigmoid(self.fc2(x)) 70 | return x 71 | 72 | class RNN_V2(nn.Module): 73 | """ 74 | Multi-layer LSTM network 75 | """ 76 | def __init__( 77 | self, 78 | input_size, 79 | input_length, 80 | output_size, 81 | hidden_size=64, 82 | num_layers=1, 83 | dropout=0.0, 84 | n_neurons=64, 85 | activation='relu', 86 | ): 87 | super().__init__() 88 | if activation == 'relu': 89 | self.activation = F.relu 90 | elif activation == 'elu': 91 | self.activation = F.elu 92 | 93 | self.hidden_size = int(hidden_size) 94 | self.num_layers = int(num_layers) 95 | 96 | self.lstm = nn.LSTM(int(input_size), int(hidden_size), int(num_layers), batch_first=True) 97 | self.fc1 = nn.Linear(hidden_size, n_neurons) 98 | self.fc1_drop = nn.Dropout(dropout) 99 | self.fc2 = nn.Linear(n_neurons, output_size) 100 | 101 | def forward(self, x): 102 | # x: tensor (batch_size, T, input_size) 103 | # h_all: (batch_size, T, hidden_size) 104 | h_0, c_0 = self.init_hidden(x) 105 | h_all, (h_T, c_T) = 
self.lstm(x, (h_0, c_0)) 106 | output = h_T[-1] 107 | output = self.activation(self.fc1_drop(self.fc1(output))) 108 | output = torch.sigmoid(self.fc2(output)) 109 | return output 110 | 111 | def init_hidden(self, x): 112 | batch_size = x.size(0) 113 | return (torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device), 114 | torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)) -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/run_shallow_dt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | export DT=48.0 7 | python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 8 | > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 9 | 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 10 | python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 11 | > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 12 | 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 13 | 14 | # export DT=24.0 15 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 16 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 17 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 18 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 19 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 20 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 21 | 22 | # export DT=12.0 23 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 24 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 25 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 26 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 27 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 28 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 29 | 30 | # export DT=4.0 31 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 32 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 33 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 34 | # python run_shallow_dt.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 35 | # > >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 36 | # 2> >(tee "log/dt=$DT,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 37 | -------------------------------------------------------------------------------- /mimic3_comparisons/3_ML_models/run_shallow_theta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | export THETA=0.4 7 | python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 8 | > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 9 | 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 10 | python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 11 | > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 12 | 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 13 | 14 | # export THETA=0.2 15 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 
--model_type=LR \ 16 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 17 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) & 18 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 19 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 20 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 21 | 22 | # export THETA=0.1 23 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 24 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 25 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) 26 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 27 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 28 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 29 | 30 | # export THETA=0.05 31 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 32 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 33 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) 34 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 35 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 36 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 37 | 38 | # export THETA=0.01 39 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 40 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.out") \ 41 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,LR.err" >&2) 42 | # python run_shallow_theta.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 43 | # > >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.out") \ 44 | # 2> >(tee "log/theta=$THETA,outcome=mortality,T=48.0,dt=1.0,RF.err" >&2) 45 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/config.py: -------------------------------------------------------------------------------- 1 | import os, yaml 2 | with open(os.path.join(os.path.dirname(__file__), '../config.yaml')) as f: 3 | config = yaml.full_load(f) 4 | 5 | data_path = os.path.join(os.path.dirname(__file__), config['data_path']) 6 | mimic3_path = os.path.join(os.path.dirname(__file__), config['mimic3_path']) 7 | 8 | ID_col = config['column_names']['ID'] 9 | t_col = config['column_names']['t'] 10 | var_col = config['column_names']['var_name'] 11 | val_col = config['column_names']['var_value'] 12 | 13 | parallel = True 14 | n_jobs = 72 15 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/generate_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | generate_labels.py 3 | Author: Shengpu Tang 4 | 5 | Generate labels for two adverse outcomes: ARF and shock. 
6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import scipy.stats 11 | import itertools 12 | from collections import OrderedDict, Counter 13 | from joblib import Parallel, delayed 14 | from tqdm import tqdm as tqdm 15 | import yaml 16 | data_path = yaml.full_load(open('../config.yaml'))['data_path'] 17 | 18 | import pathlib 19 | pathlib.Path(data_path, 'labels').mkdir(parents=True, exist_ok=True) 20 | 21 | examples = pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME']).sort_values(by='ICUSTAY_ID') 22 | chartevents = pd.read_pickle(data_path + 'prep/chartevents.p') 23 | procedures = pd.read_pickle(data_path + 'prep/procedureevents_mv.p') 24 | inputevents = pd.read_pickle(data_path + 'prep/inputevents_mv.p') 25 | 26 | ventilation = [ 27 | '225792', # Invasive Ventilation 28 | '225794', # Non-invasive Ventilation 29 | ] 30 | 31 | PEEP = [ 32 | '220339', # PEEP set 33 | ] 34 | 35 | vasopressors = [ 36 | '221906', # Norepinephrine 37 | '221289', # Epinephrine 38 | '221662', # Dopamine 39 | '222315', # Vasopressin 40 | '221749', # Phenylephrine 41 | ] 42 | 43 | ## ARF: (PEEP) OR (mechanical ventilation) 44 | df_PEEP = chartevents[chartevents.ITEMID.isin(PEEP)].copy() 45 | df_vent = procedures[procedures.ITEMID.isin(ventilation)].rename(columns={'t_start': 't'}).copy() 46 | df_ARF = pd.concat([df_PEEP[['ICUSTAY_ID', 't']], df_vent[['ICUSTAY_ID', 't']]], axis=0) 47 | df_ARF['ICUSTAY_ID'] = df_ARF['ICUSTAY_ID'].astype(int) 48 | df_ARF = df_ARF.sort_values(by=['ICUSTAY_ID', 't']).drop_duplicates(['ICUSTAY_ID'], keep='first').reset_index(drop=True) 49 | df_ARF = df_ARF.rename(columns={'t': 'ARF_ONSET_HOUR'}) 50 | df_ARF = pd.merge(examples[['ICUSTAY_ID']], df_ARF, on='ICUSTAY_ID', how='left') 51 | df_ARF['ARF_LABEL'] = df_ARF['ARF_ONSET_HOUR'].notnull().astype(int) 52 | print('ARF: ', dict(Counter(df_ARF['ARF_LABEL'])), 'N = {}'.format(len(df_ARF)), sep='\t') 53 | df_ARF.to_csv(data_path + 'labels/ARF.csv', index=False) 54 | 55 | ## Shock: (one of vasopressors) 56 | df_vaso = inputevents[inputevents.ITEMID.isin(vasopressors)].rename(columns={'t_start': 't'}).copy() 57 | df_shock = df_vaso.copy() 58 | df_shock['ICUSTAY_ID'] = df_shock['ICUSTAY_ID'].astype(int) 59 | df_shock = df_shock.sort_values(by=['ICUSTAY_ID', 't']).drop_duplicates(['ICUSTAY_ID'], keep='first').reset_index(drop=True) 60 | df_shock = df_shock.rename(columns={'t': 'Shock_ONSET_HOUR'}) 61 | df_shock = pd.merge(examples[['ICUSTAY_ID']], df_shock, on='ICUSTAY_ID', how='left') 62 | df_shock['Shock_LABEL'] = df_shock['Shock_ONSET_HOUR'].notnull().astype(int) 63 | print('Shock: ', dict(Counter(df_shock['Shock_LABEL'])), 'N = {}'.format(len(df_shock)), sep='\t') 64 | df_shock.to_csv(data_path + 'labels/Shock.csv', index=False) 65 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/grouped_variables.yaml: -------------------------------------------------------------------------------- 1 | HR: 2 | - 220045 # Heart Rate 3 | 4 | SysBP: 5 | - 224167 # Manual Blood Pressure Systolic Left 6 | - 227243 # Manual Blood Pressure Systolic Right 7 | - 220050 # Arterial Blood Pressure systolic 8 | - 220179 # Non Invasive Blood Pressure systolic 9 | - 225309 # ART BP Systolic 10 | 11 | DiaBP: 12 | - 224643 # Manual Blood Pressure Diastolic Left 13 | - 227242 # Manual Blood Pressure Diastolic Right 14 | - 220051 # Arterial Blood Pressure diastolic 15 | - 220180 # Non Invasive Blood Pressure diastolic 16 | - 225310 # ART BP Diastolic 17 | 18 
| RR: 19 | - 220210 # Respiratory Rate 20 | - 224690 # Respiratory Rate (Total) 21 | 22 | Temperature: 23 | - 223761 # Temperature Fahrenheit 24 | - 223762 # Temperature Celsius 25 | 26 | SpO2: 27 | - 220277 # O2 saturation pulseoxymetry 28 | 29 | Height: 30 | - 226707 # Height 31 | - 226730 # Height (cm) 32 | 33 | Weight: 34 | - 224639 # Daily Weight 35 | - 226512 # Admission Weight (Kg) 36 | - 226531 # Admission Weight (lbs.) 37 | -------------------------------------------------------------------------------- /mimic3_experiments/1_data_extraction/run_prepare_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | python prepare_input.py --outcome=ARF --T=4 --dt=1 5 | python prepare_input.py --outcome=ARF --T=12 --dt=1 6 | python prepare_input.py --outcome=Shock --T=4 --dt=1 7 | python prepare_input.py --outcome=Shock --T=12 --dt=1 8 | 9 | python prepare_input.py --outcome=mortality --T=48 --dt=1 10 | cp -r ../data/processed/features/outcome=mortality,T=48.0,dt=1.0 ../data/processed/features/benchmark,outcome=mortality,T=48.0,dt=1.0 11 | -------------------------------------------------------------------------------- /mimic3_experiments/2_apply_FIDDLE/run_make_all,discretize=no.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p 'log,discretize=no' 7 | 8 | OUTCOME=ARF 9 | T=4.0 10 | dt=1.0 11 | python -m FIDDLE.run \ 12 | --output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 13 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 14 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 15 | --T=$T \ 16 | --dt=$dt \ 17 | --theta_1=0.001 \ 18 | --theta_2=0.001 \ 19 | --theta_freq=1 \ 20 | --stats_functions 'min' 'max' 'mean' \ 21 | --discretize=no \ 22 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 23 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 24 | 25 | OUTCOME=ARF 26 | T=12.0 27 | dt=1.0 28 | python -m FIDDLE.run \ 29 | --output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 30 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 31 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 32 | --T=$T \ 33 | --dt=$dt \ 34 | --theta_1=0.001 \ 35 | --theta_2=0.001 \ 36 | --theta_freq=1 \ 37 | --stats_functions 'min' 'max' 'mean' \ 38 | --discretize=no \ 39 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 40 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 41 | 42 | OUTCOME=Shock 43 | T=4.0 44 | dt=1.0 45 | python -m FIDDLE.run \ 46 | --output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 47 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 48 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 49 | --T=$T \ 50 | --dt=$dt \ 51 | --theta_1=0.001 \ 52 | --theta_2=0.001 \ 53 | --theta_freq=1 \ 54 | --stats_functions 'min' 'max' 'mean' \ 55 | --discretize=no \ 56 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 57 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 58 | 59 | OUTCOME=Shock 60 | T=12.0 61 | dt=1.0 62 | python -m FIDDLE.run \ 63 | 
--output_dir="$DATAPATH/features,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt/" \ 64 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 65 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 66 | --T=$T \ 67 | --dt=$dt \ 68 | --theta_1=0.001 \ 69 | --theta_2=0.001 \ 70 | --theta_freq=1 \ 71 | --stats_functions 'min' 'max' 'mean' \ 72 | --discretize=no \ 73 | > >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 74 | 2> >(tee "log,discretize=no/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 75 | 76 | 77 | 78 | python -m FIDDLE.run \ 79 | --output_dir="$DATAPATH/features,discretize=no/benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 80 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 81 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 82 | --T=48.0 \ 83 | --dt=1.0 \ 84 | --theta_1=0.001 \ 85 | --theta_2=0.001 \ 86 | --theta_freq=1 \ 87 | --stats_functions 'min' 'max' 'mean' \ 88 | --discretize=no \ 89 | > >(tee 'log,discretize=no/benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 90 | 2> >(tee 'log,discretize=no/benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 91 | -------------------------------------------------------------------------------- /mimic3_experiments/2_apply_FIDDLE/run_make_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | export PYTHONPATH="../../FIDDLE/" 5 | DATAPATH=$(python -c "import yaml;print(yaml.full_load(open('../config.yaml'))['data_path']);") 6 | mkdir -p log 7 | 8 | OUTCOME=ARF 9 | T=4.0 10 | dt=1.0 11 | python -m FIDDLE.run \ 12 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 13 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 14 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 15 | --T=$T \ 16 | --dt=$dt \ 17 | --theta_1=0.001 \ 18 | --theta_2=0.001 \ 19 | --theta_freq=1 \ 20 | --stats_functions 'min' 'max' 'mean' \ 21 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 22 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 23 | 24 | 25 | OUTCOME=ARF 26 | T=12.0 27 | dt=1.0 28 | python -m FIDDLE.run \ 29 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 30 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 31 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 32 | --T=$T \ 33 | --dt=$dt \ 34 | --theta_1=0.001 \ 35 | --theta_2=0.001 \ 36 | --theta_freq=1 \ 37 | --stats_functions 'min' 'max' 'mean' \ 38 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 39 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 40 | 41 | OUTCOME=Shock 42 | T=4.0 43 | dt=1.0 44 | python -m FIDDLE.run \ 45 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 46 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 47 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 48 | --T=$T \ 49 | --dt=$dt \ 50 | --theta_1=0.001 \ 51 | --theta_2=0.001 \ 52 | --theta_freq=1 \ 53 | --stats_functions 'min' 'max' 'mean' \ 54 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 55 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 56 | 57 | OUTCOME=Shock 58 | T=12.0 59 | dt=1.0 60 | python -m FIDDLE.run \ 61 | --data_fname="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/input_data.p" \ 62 | --output_dir="$DATAPATH/features/outcome=$OUTCOME,T=$T,dt=$dt/" \ 63 | --population="$DATAPATH/population/${OUTCOME}_${T}h.csv" \ 64 | --T=$T \ 65 | 
--dt=$dt \ 66 | --theta_1=0.001 \ 67 | --theta_2=0.001 \ 68 | --theta_freq=1 \ 69 | --stats_functions 'min' 'max' 'mean' \ 70 | > >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.out") \ 71 | 2> >(tee "log/outcome=$OUTCOME,T=$T,dt=$dt.err" >&2) 72 | 73 | T=48.0 74 | dt=1.0 75 | python -m FIDDLE.run \ 76 | --data_fname="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/input_data.p" \ 77 | --output_dir="$DATAPATH/features/benchmark,outcome=mortality,T=48.0,dt=1.0/" \ 78 | --population="$DATAPATH/population/pop.mortality_benchmark.csv" \ 79 | --T=$T \ 80 | --dt=$dt \ 81 | --theta_1=0.001 \ 82 | --theta_2=0.001 \ 83 | --theta_freq=1 \ 84 | --stats_functions 'min' 'max' 'mean' \ 85 | > >(tee 'log/benchmark,outcome=mortality,T=48.0,dt=1.0.out') \ 86 | 2> >(tee 'log/benchmark,outcome=mortality,T=48.0,dt=1.0.err' >&2) 87 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/DataSummary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/tangsp/mimic3_experiments/lib/data.py:14: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", 13 | " config = yaml.load(f)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from lib.data import _Mimic3Reader\n", 19 | "import pandas as pd\n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "timestep = 1.0" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Finish reading data \t 2.28 s\n", 42 | "s (15873, 98)\n", 43 | "X (15873, 4, 4045)\n", 44 | "Finish reading data \t 11.16 s\n", 45 | "s (14174, 96)\n", 46 | "X (14174, 12, 4816)\n", 47 | "Finish reading data \t 3.49 s\n", 48 | "s (19342, 98)\n", 49 | "X (19342, 4, 4522)\n", 50 | "Finish reading data \t 10.31 s\n", 51 | "s (17588, 97)\n", 52 | "X (17588, 12, 5500)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "for task in ['ARF', 'Shock']:\n", 58 | " for duration in [4, 12]:\n", 59 | " reader = _Mimic3Reader(task, duration, timestep)\n", 60 | " print('s', reader.s.shape)\n", 61 | " print('X', reader.X.shape)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Finish reading data \t 35.72 s\n", 74 | "s (11695, 97)\n", 75 | "X (11695, 48, 7411)\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "for task in ['mortality']:\n", 81 | " for duration in [48]:\n", 82 | " reader = _Mimic3Reader(task, duration, timestep)\n", 83 | " print('s', reader.s.shape)\n", 84 | " print('X', reader.X.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": 
"text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.5.2" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/README.md: -------------------------------------------------------------------------------- 1 | Config file: 2 | - `config.yaml`. Change data_path to point to the directory where the features/labels/population are stored 3 | 4 | Library files: 5 | - `data.py` 6 | - `models.py` 7 | - `trainer.py` 8 | - `experiment.py` 9 | - `eval_deep.py` 10 | - `evaluate.py` 11 | 12 | Executable files: 13 | - `run_deep.py` 14 | - `run_shallow.py` 15 | 16 | Notebooks: 17 | - `RunShallow.ipynb` 18 | - `NewEval_Deep.ipynb` 19 | - `Evaluation.ipynb` 20 | - `PredictionGap.ipynb` 21 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: ../data/processed/ 2 | 3 | model_names: { 4 | 'CNN': 'CNN_V3', 5 | 'RNN': 'RNN_V2', 6 | 'LR': 'LR', 7 | 'RF': 'RF', 8 | } 9 | 10 | train: 11 | budget: 50 12 | repeat: 1 13 | epochs: 15 14 | 15 | feature_dimension: 16 | ARF: 17 | 4.0 : 4380 18 | 12.0: 5226 19 | 20 | Shock: 21 | 4.0 : 4857 22 | 12.0: 5892 23 | 24 | mortality: 25 | 48.0: 7822 26 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/lib/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def get_best_model_info(df_search): 5 | df_search_sorted = df_search.sort_values('best_score', ascending=False).head() 6 | best_model_info = df_search_sorted.iloc[0, 1:] 7 | return best_model_info 8 | 9 | def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None): 10 | if load_filename is None: 11 | savename = best_model_info['savename'] 12 | split = savename.split('/') 13 | split[-1] = 'best_' + split[-1] 14 | load_filename = '/'.join(split) 15 | 16 | checkpoint = torch.load(load_filename) 17 | _iter = checkpoint['_iter'] 18 | print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter'])) 19 | # print(load_filename) 20 | 21 | best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict() 22 | model = ModelClass( 23 | in_channels, L_in, 1, 24 | **{k:best_HP[k] for k in best_HP.keys() if k not in training_params} 25 | ) 26 | model.load_state_dict(checkpoint['state_dict']) 27 | model.cuda() 28 | print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters()))) 29 | 30 | return checkpoint, model 31 | 32 | def get_test_predictions(model, te_loader, task=None, model_name=None): 33 | model.eval() 34 | running_pred = [] 35 | 36 | cuda = True 37 | for i, (X, y) in enumerate(te_loader): 38 | if cuda: 39 | X = X.contiguous().cuda() 40 | y = y.contiguous().cuda(non_blocking=True) 41 | 42 | with torch.set_grad_enabled(False): 43 | output = model(X) 44 | running_pred.append((output.data.detach().cpu(), y.data.detach().cpu())) 45 | 46 | y_score, y_true = zip(*running_pred) 47 | y_score = torch.cat(y_score).numpy() 48 | y_true = torch.cat(y_true).numpy() 49 | 50 | assert (np.stack(te_loader.dataset.y) == y_true).all() 51 | return y_true, y_score 52 | 53 | def save_test_predictions(y_true, 
y_score, task, T, dt, model_name): 54 | import pathlib 55 | pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(task, T, dt)).mkdir(parents=True, exist_ok=True) 56 | 57 | fname = './output/outcome={}.T={}.dt={}/{}.test.npz'.format(task, T, dt, model_name) 58 | np.savez( 59 | open(fname, 'wb'), 60 | y_score = y_score, 61 | y_true = y_true, 62 | ) 63 | print('Test predictions saved to', fname) 64 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from .trainer import Trainer 2 | import time 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from sklearn.model_selection import ParameterSampler 8 | 9 | class Experiment(object): 10 | def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'): 11 | self.name = name 12 | self.budget = budget 13 | self.repeat = repeat # number of restarts with different random seeds 14 | self.n_epochs = n_epochs 15 | self.param_grid = param_grid 16 | self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0) 17 | 18 | def run(self): 19 | df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys())) 20 | start_time = time.time() 21 | for run, params in enumerate(self.param_sampler): 22 | print(self.name, '\t', 'Run:', run, '/', self.budget) 23 | print(params) 24 | for i in range(self.repeat): 25 | results = self._run_trial(i, params) 26 | df_search = df_search.append(results, ignore_index=True) 27 | df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False) 28 | 29 | print('Took:', time.time() - start_time) 30 | return df_search 31 | 32 | def _run_trial(self, seed, params): 33 | savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed) 34 | 35 | random.seed(seed) 36 | np.random.seed(seed) 37 | torch.manual_seed(seed) 38 | torch.cuda.manual_seed_all(seed) 39 | 40 | tr_loader, va_loader = self.get_data() 41 | model, criterion, optimizer = self.get_model_params(params) 42 | trainer = Trainer(model, criterion, optimizer, tr_loader, va_loader, 43 | n_epochs=self.n_epochs, batch_size=params['batch_size'], 44 | savename=savename, 45 | save_every=100, plot_every=50, cuda=True) 46 | # print(trainer) 47 | trainer.fit() 48 | 49 | print(trainer._best_iter, '{:.5f}'.format(trainer.best_score)) 50 | 51 | del model 52 | return { 53 | 'best_score': trainer.best_score, 'best_iter': trainer._best_iter, 54 | 'savename': savename, 'seed': seed, 55 | **params, 56 | } 57 | 58 | def get_model_params(self): 59 | raise NotImplementedError 60 | 61 | def get_data(self): 62 | raise NotImplementedError 63 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_deep.py: -------------------------------------------------------------------------------- 1 | # python run_deep.py --outcome=ARF --T=4 --dt=0.5 --model_type=CNN --cuda=7 2 | 3 | import sys, os, time, pickle, random 4 | import pandas as pd 5 | import numpy as np 6 | import pathlib 7 | pathlib.Path('log').mkdir(parents=True, exist_ok=True) 8 | 9 | import yaml 10 | with open('config.yaml') as f: 11 | config = yaml.load(f) 12 | 13 | ######## 14 | ## Constants 15 | data_path = config['data_path'] 16 | model_names = config['model_names'] 17 | 18 | budget = config['train']['budget'] # Number of randomized hyperparameter settings to try 19 | repeat 
= config['train']['repeat'] # 1 # number of restarts (with different seeds) for each setting 20 | epochs = config['train']['epochs'] # 15 # Max epochs for each setting 21 | 22 | # Feature dimensions 23 | dimensions = config['feature_dimension'] 24 | 25 | # Hyperparameter search space 26 | train_param_grid = { 27 | 'batch_size': [16, 32, 64, 128], 28 | 'lr': [1e-2, 1e-3, 1e-4], 29 | } 30 | CNN_param_grid = { 31 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 32 | 'depth': [1, 2],#, 3], 33 | 'filter_size': [1, 2, 3, 4], 34 | 'n_filters': [16, 32, 64, 128], 35 | 'n_neurons': [16, 32, 64, 128], 36 | 'activation': ['relu', 'elu'], 37 | } 38 | RNN_param_grid = { 39 | 'dropout': [0.0, 0.1, 0.2, 0.4, 0.8], 40 | 'num_layers': [1, 2, 3], 41 | 'hidden_size': [16, 32, 64, 128], 42 | 'n_neurons': [16, 32, 64, 128], 43 | 'activation': ['relu', 'elu'], 44 | } 45 | 46 | training_params = {'batch_size', 'lr'} 47 | 48 | ######## 49 | 50 | import argparse 51 | 52 | parser = argparse.ArgumentParser(description='') 53 | 54 | parser.add_argument('--outcome', type=str, required=True) 55 | parser.add_argument('--T', type=float, required=True) 56 | parser.add_argument('--dt', type=float, required=True) 57 | parser.add_argument('--model_type', type=str, required=True) 58 | parser.add_argument('--cuda', type=int, default=7) 59 | parser.add_argument('--seed', type=int, default=42) 60 | 61 | args = parser.parse_args() 62 | 63 | task = args.outcome 64 | model_type = args.model_type 65 | 66 | T = float(args.T) 67 | dt = float(args.dt) 68 | L_in = int(np.floor(T / dt)) 69 | in_channels = dimensions[task][float(T)] 70 | 71 | import lib.models as models 72 | model_name = model_names[model_type] 73 | ModelClass = getattr(models, model_name) 74 | 75 | if model_type == 'CNN': 76 | param_grid = {**train_param_grid, **CNN_param_grid} 77 | elif model_type == 'RNN': 78 | param_grid = {**train_param_grid, **RNN_param_grid} 79 | else: 80 | assert False 81 | 82 | # Create checkpoint directories 83 | import pathlib 84 | pathlib.Path("./checkpoint/model={}.outcome={}.T={}.dt={}/".format(model_name, task, T, dt)).mkdir(parents=True, exist_ok=True) 85 | 86 | ## Data 87 | import lib.data as data 88 | if task == 'mortality': 89 | tr_loader, va_loader, te_loader = data.get_benchmark_splits(fuse=True) 90 | else: 91 | tr_loader, va_loader, te_loader = data.get_train_val_test(task, duration=T, timestep=dt, fuse=True) 92 | 93 | import torch 94 | from torch.utils.data import Dataset, DataLoader 95 | from sklearn.model_selection import StratifiedShuffleSplit 96 | 97 | # Set CUDA 98 | if args.cuda: 99 | torch.cuda.set_device(args.cuda) 100 | print('cuda', torch.cuda.current_device()) 101 | 102 | if args.seed: 103 | torch.manual_seed(args.seed) 104 | np.random.seed(args.seed) 105 | random.seed(args.seed) 106 | 107 | 108 | from lib.experiment import Experiment 109 | 110 | class MIMICExperiment(Experiment): 111 | def get_model_params(self, params): 112 | model = ModelClass( 113 | in_channels, L_in, 1, 114 | **{k:params[k] for k in params.keys() if k not in training_params} 115 | ) 116 | criterion = torch.nn.BCELoss() 117 | optimizer = torch.optim.Adam(model.parameters(), lr=params['lr']) 118 | return model, criterion, optimizer 119 | 120 | def get_data(self): 121 | return tr_loader, va_loader 122 | 123 | exp = MIMICExperiment( 124 | param_grid, name='model={}.outcome={}.T={}.dt={}'.format(model_name, task, T, dt), 125 | budget=budget, n_epochs=epochs, repeat=repeat, 126 | ) 127 | 128 | print('EXPERIMENT:', exp.name) 129 | 130 | df_search = exp.run() 131 | 
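# df_search holds one row per sampled hyperparameter setting and seed (best validation score,
# best iteration, checkpoint path). lib/evaluate.get_best_model_info() later sorts this CSV by
# best_score to pick the checkpoint that run_deep_eval.py evaluates on the test set.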
df_search.to_csv('./log/df_search.{}.csv'.format(exp.name), index=False) 132 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_deep_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | cuda=0 5 | 6 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,CNN.log' 7 | python run_deep.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=mortality,T=48,dt=1.0,RNN.log' 8 | 9 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,CNN.log' 10 | python run_deep.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=4,dt=1.0,RNN.log' 11 | 12 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,CNN.log' 13 | python run_deep.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=ARF,T=12,dt=1.0,RNN.log' 14 | 15 | python run_deep.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,CNN.log' 16 | python run_deep.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=4,dt=1.0,RNN.log' 17 | 18 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=CNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,CNN.log' 19 | python run_deep.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RNN --cuda=$cuda &> 'log/outcome=Shock,T=12,dt=1.0,RNN.log' 20 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_deep_eval.py: -------------------------------------------------------------------------------- 1 | import sys, os, time, pickle, random 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import yaml 6 | with open('config.yaml') as f: 7 | config = yaml.load(f) 8 | 9 | ######## 10 | ## Constants 11 | model_names = config['model_names'] 12 | training_params = {'batch_size', 'lr'} 13 | 14 | # Feature dimensions 15 | dimensions = config['feature_dimension'] 16 | 17 | ######## 18 | 19 | def main(task, T, dt, model_type): 20 | L_in = int(np.floor(T / dt)) 21 | in_channels = dimensions[task][T] 22 | 23 | import lib.models as models 24 | model_name = model_names[model_type] 25 | ModelClass = getattr(models, model_name) 26 | df_search = pd.read_csv('./log/df_search.model={}.outcome={}.T={}.dt={}.csv'.format(model_name, task, T, dt)) 27 | import lib.evaluate as evaluate 28 | best_model_info = evaluate.get_best_model_info(df_search) 29 | checkpoint, model = evaluate.load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params) 30 | 31 | 32 | import lib.data as data 33 | if task == 'mortality': 34 | te_loader = data.get_benchmark_test(fuse=True) 35 | else: 36 | te_loader = data.get_test(task, duration=T, timestep=dt, fuse=True) 37 | 38 | y_true, y_score = evaluate.get_test_predictions(model, te_loader, '{}_T={}_dt={}'.format(task, T, dt), model_name) 39 | evaluate.save_test_predictions(y_true, y_score, task, T, dt, model_name) 40 | 41 | from sklearn import metrics, utils 42 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 43 | fig = plt.figure(figsize=(5,5)) 44 | plt.xlabel('False Positive Rate') 45 | plt.ylabel('True Positive Rate') 46 | plt.xlim(0,1) 47 | plt.ylim(0,1) 
48 | plt.plot([0,1], [0,1], ':') 49 | plt.plot(fpr, tpr, color='darkorange') 50 | plt.show() 51 | 52 | ## Bootstrapped 95% Confidence Interval 53 | # try: 54 | # yte_pred = clf.decision_function(Xte) 55 | # except AttributeError: 56 | # yte_pred = clf.predict_proba(Xte)[:,1] 57 | from joblib import Parallel, delayed 58 | from tqdm import tqdm_notebook as tqdm 59 | def func(i): 60 | yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i) 61 | return metrics.roc_auc_score(yte_true_b, yte_pred_b) 62 | 63 | test_scores = Parallel(n_jobs=16)(delayed(func)(i) for i in tqdm(range(1000), leave=False)) 64 | print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(test_scores), np.percentile(test_scores, 2.5), np.percentile(test_scores, 97.5))) 65 | 66 | # idx = (np.abs(tpr - 0.5)).argmin() 67 | # y_pred = (y_score > thresholds[idx]) 68 | # metrics.roc_auc_score(y_true, y_score) 69 | 70 | precision, recall, thresholds_ = metrics.precision_recall_curve(y_true, y_score) 71 | fig = plt.figure(figsize=(5,5)) 72 | plt.xlabel('Recall') 73 | plt.ylabel('Precision') 74 | plt.xlim(0,1) 75 | plt.ylim(0,1) 76 | plt.plot(recall, precision, color='darkorange') 77 | plt.show() 78 | 79 | # target TPR = 50% 80 | idx = (np.abs(tpr - 0.5)).argmin() 81 | y_pred = (y_score > thresholds[idx]) 82 | metrics.roc_auc_score(y_true, y_score) 83 | 84 | pd.DataFrame([{ 85 | 'tpr': tpr[idx], 86 | 'fpr': fpr[idx], 87 | 'ppv': metrics.precision_score(y_true, y_pred), 88 | }]) 89 | -------------------------------------------------------------------------------- /mimic3_experiments/3_ML_models/run_shallow_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | mkdir -p log 4 | mkdir -p output 5 | 6 | python run_shallow.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=LR \ 7 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.out') \ 8 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,LR.err' >&2) 9 | python run_shallow.py --outcome=mortality --T=48.0 --dt=1.0 --model_type=RF \ 10 | > >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.out') \ 11 | 2> >(tee 'log/outcome=mortality,T=48.0,dt=1.0,RF.err' >&2) 12 | 13 | python run_shallow.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=LR &> 'log/outcome=ARF,T=4.0,dt=1.0,LR.log' 14 | python run_shallow.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=LR &> 'log/outcome=Shock,T=4.0,dt=1.0,LR.log' 15 | 16 | python run_shallow.py --outcome=ARF --T=4.0 --dt=1.0 --model_type=RF &> 'log/outcome=ARF,T=4.0,dt=1.0,RF.log' 17 | python run_shallow.py --outcome=Shock --T=4.0 --dt=1.0 --model_type=RF &> 'log/outcome=Shock,T=4.0,dt=1.0,RF.log' 18 | 19 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=ARF,T=12.0,dt=1.0,LR.log' 20 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=LR &> 'log/outcome=Shock,T=12.0,dt=1.0,LR.log' 21 | 22 | python run_shallow.py --outcome=ARF --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=ARF,T=12.0,dt=1.0,RF.log' 23 | python run_shallow.py --outcome=Shock --T=12.0 --dt=1.0 --model_type=RF &> 'log/outcome=Shock,T=12.0,dt=1.0,RF.log' 24 | -------------------------------------------------------------------------------- /mimic3_experiments/5_baseline_NEWS/NEWS_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLD3/FIDDLE-experiments/77483adf4327e87cbea4963252db873829cad813/mimic3_experiments/5_baseline_NEWS/NEWS_table.jpg 
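Note: the evaluation utilities in `3_ML_models` above store raw test predictions with `save_test_predictions()` as `.npz` files containing `y_true` and `y_score`. The following is a minimal sketch of how such a file could be reloaded and scored offline with a bootstrapped 95% confidence interval, mirroring `run_deep_eval.py`; the file path and model name (`CNN_V3`) are illustrative assumptions, not fixed outputs of the pipeline.

```python
import numpy as np
from sklearn import metrics, utils

# Illustrative path: save_test_predictions() writes output/outcome={task}.T={T}.dt={dt}/{model}.test.npz
preds = np.load('output/outcome=mortality.T=48.0.dt=1.0/CNN_V3.test.npz')
y_true = preds['y_true'].ravel()
y_score = preds['y_score'].ravel()

print('Test AUROC: {:.3f}'.format(metrics.roc_auc_score(y_true, y_score)))

# 95% bootstrap confidence interval over 1000 resamples, mirroring run_deep_eval.py
scores = []
for i in range(1000):
    yt_b, ys_b = utils.resample(y_true, y_score, replace=True, random_state=i)
    scores.append(metrics.roc_auc_score(yt_b, ys_b))
print('95% CI: ({:.3f}, {:.3f})'.format(np.percentile(scores, 2.5), np.percentile(scores, 97.5)))
```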
-------------------------------------------------------------------------------- /mimic3_experiments/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | - Requires: the raw csv files of the [MIMIC-III database](https://mimic.physionet.org/about/mimic/) 3 | - Extract and format data from structured tables in MIMIC-III as input to FIDDLE 4 | - Goal: using data from all structured tables, generate Time-Invariant features **s** and Time-Series features **X**. 5 | 6 | 7 | We considered five prediction tasks involving three adverse outcomes: 8 | 9 | - in-hospital mortality (48h) 10 | - ARF (4h) 11 | - ARF (12h) 12 | - shock (4h) 13 | - shock (12h) 14 | 15 | 16 | ## Steps to reproduce results 17 | 18 | 0. Modify `config.yaml` to specify `mimic3_path` and `data_path`. 19 | 20 | ### 1) Data Extraction 21 | 22 | 1. Data Extraction 23 | - Run `python -c "from extract_data import *; check_nrows();"` to verify the integrity of raw csv files. 24 | - Run `python extract_data.py`. 25 | 26 | 2. Labels & Cohort definitions 27 | - Run `python generate_labels.py` to generate the event onset time and labels for three outcomes: in-hospital mortality, ARF, and shock. The output should be 28 | ``` 29 | ARF: {0: 13125, 1: 10495} N = 23620 30 | Shock: {0: 16629, 1: 6991} N = 23620 31 | ``` 32 | - Run the following notebooks in order: `LabelDistributions.ipynb` and `InclusionExclusion.ipynb`. 33 | > The above also generates the cohort for 48h in-hospital mortality in `mortality_48.0h.csv`. However, we found some inconsistencies compared to the [mimic3-benchmark](https://github.com/YerevaNN/mimic3-benchmarks) (see also: [multitask benchmarking paper](https://doi.org/10.1038/s41597-019-0103-9)). To ensure a fair comparison with the benchmark feature set (and use the same train/val/test splits), we used their label definitions, but only considered the subset of their cohort recorded using MetaVision (i.e., also in our mortality cohort). Run the notebook `resources/IHM_benchmark.ipynb` to generate the final cohort for 48h in-hospital mortality prediction in `pop.mortality_benchmark.csv`. 34 | - Run `PopulationSummary.ipynb`. 35 | 36 | 3. Prepare input tables for each cohort 37 | 38 | - Run `python prepare_input.py --outcome={outcome} --T={T} --dt={dt}` 39 | 40 | Note: a bash script is provided to prepare input tables for all cohorts: `./run_prepare_all.sh` 41 | 42 | Since `pop.mortality_benchmark.csv` is a subset of `mortality_48.0h.csv`, we only create one `input_data.p` for the larger `mortality_48.0h.csv` and copy it into two output folders. 43 | 44 | 4. Run the notebook `FIDDLE_input_lengths.ipynb` to check the file size and the number of rows in each `input_data.p` table. 45 | 46 | ### 2) Apply FIDDLE 47 | 48 | 1. Apply FIDDLE on each cohort to generate features.
49 | 50 | - A bash script is provided for generating features: `./run_make_all.sh` 51 | 52 | - The generated features and associated metadata are located in `{data_path}/features/outcome={outcome},T={T},dt={dt}/`: 53 | 54 | - `s.npz`: a sparse array of shape (N, d) 55 | - `X.npz`: a sparse tensor of shape (N, L, D) 56 | - `s.feature_names.txt`: names of _d_ time-invariant features 57 | - `X.feature_names.txt`: names of _D_ time-series features 58 | 59 | ### 3) ML Models 60 | 61 | We used four commonly used ML algorithms to train models using the generated features: 62 | 63 | - LR: L2-regularized logistic regression 64 | - RF: random forest 65 | - CNN: 1D convolutional neural networks 66 | - LSTM: recurrent neural networks with long short-term memory cells 67 | 68 | To establish a fair comparison, all models are tuned over hyperparameter settings using a random search with a budget of 50, maximizing the area under the receiver operating characteristic curve (AUROC). 69 | 70 | To train the shallow models (LR and RF), run the following bash script. This part uses the scikit-learn implementations of the models. 71 | 72 | ```bash 73 | > ./run_shallow_all.sh 74 | ``` 75 | 76 | To train the deep models (CNN and LSTM), run the following bash script. This part uses the PyTorch implementations of the layers and custom architectures defined in `lib/models.py`; it will use GPUs if available. 77 | 78 | ```bash 79 | > ./run_deep_all.sh 80 | ``` 81 | 82 | ### 4) Evaluation 83 | 84 | See instructions in `Evaluation.ipynb` to generate the following plots using the held-out test set: ROC curves with AUROC, PR curves with AUPR, and calibration plots with Brier scores. Plots include 95% confidence intervals calculated on 1000 bootstraps of the held-out test set. 85 | -------------------------------------------------------------------------------- /mimic3_experiments/config.yaml: -------------------------------------------------------------------------------- 1 | # Location of input files 2 | mimic3_path: ../data/mimic3_csv/ 3 | data_path: ../data/processed/ 4 | 5 | # Customize table headers 6 | column_names: 7 | ID: ID 8 | t: t 9 | var_name: variable_name 10 | var_value: variable_value 11 | --------------------------------------------------------------------------------
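For reference, the `s.npz` / `X.npz` feature files and feature-name lists described in step 2) above can be inspected directly. The sketch below assumes the arrays were saved as COO tensors with the pydata `sparse` package and that the `.feature_names.txt` files list one name per line; the directory name is only an example.

```python
import sparse  # pydata/sparse; s.npz / X.npz are assumed to be COO arrays saved with this package

feat_dir = '../data/processed/features/outcome=ARF,T=4.0,dt=1.0/'  # example directory
s = sparse.load_npz(feat_dir + 's.npz')   # time-invariant features, shape (N, d)
X = sparse.load_npz(feat_dir + 'X.npz')   # time-dependent features, shape (N, L, D)
print('s:', s.shape, 'X:', X.shape, 'X density: {:.3f}'.format(X.density))

# Feature names (assumed to be one name per line in the .txt files)
with open(feat_dir + 'X.feature_names.txt') as f:
    X_names = [line.strip() for line in f if line.strip()]
print(len(X_names), 'time-dependent feature names')

# Densify and flatten, e.g. to build the 2-D design matrix that shallow models (LR, RF) expect
X_flat = X.todense().reshape(X.shape[0], -1)
```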