├── metabolomicstatemodel ├── source │ ├── __init__.py │ ├── config │ │ ├── features │ │ │ ├── PANELmetabolitesOverlap.yaml │ │ │ ├── AgeSex.yaml │ │ │ ├── PANELmetabolites.yaml │ │ │ ├── PANEL.yaml │ │ │ └── Metabolomics.yaml │ │ └── config.yaml │ ├── logging.py │ ├── evaluation.py │ ├── losses.py │ ├── utils.py │ ├── callbacks.py │ ├── modules.py │ ├── datasets.py │ └── datamodules.py ├── train.sh └── train.py ├── src ├── fig1.png ├── fig2.png └── msm_logo.png ├── analysis ├── preprocessing │ ├── 0_decode_ukbb.ipynb │ ├── pipeline_metabolomics.py │ ├── 2_preprocessing_clinical_endpoints.ipynb │ └── 1_preprocessing_dataportal.ipynb └── examples │ ├── assets.yaml │ └── sample.csv ├── README.md └── LICENSE /metabolomicstatemodel/source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thbuerg/MetabolomicsCommonDiseases/HEAD/src/fig1.png -------------------------------------------------------------------------------- /src/fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thbuerg/MetabolomicsCommonDiseases/HEAD/src/fig2.png -------------------------------------------------------------------------------- /src/msm_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thbuerg/MetabolomicsCommonDiseases/HEAD/src/msm_logo.png -------------------------------------------------------------------------------- /metabolomicstatemodel/train.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | echo $(hostname) 3 | echo $(which python) 4 | echo $(python -c 'import torch; print(f"found {torch.cuda.device_count()} gpus.")') 5 | echo $CUDA_VISIBLE_DEVICES 6 | 7 | python train.py --config-dir source/config/ --config-name config 8 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/PANELmetabolitesOverlap.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.PANEL 2 | categorical: 3 | basics: [] 4 | questionnaire: [] 5 | one_hot_enc: 6 | basics: [] 7 | questionnaire: [] 8 | general: 9 | pgs: [] 10 | measurements: [] 11 | labs: [ 12 | "albumin", 13 | "cholesterol", 14 | "hdl_cholesterol", 15 | "ldl_direct", 16 | "triglycerides", 17 | "glucose", 18 | "creatinine", 19 | ] 20 | family_history: [] 21 | diagnoses: [] 22 | medications: [] 23 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/AgeSex.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.CVDCoreVariablesWithPGS 2 | categorical: 3 | basics: [ 4 | 'age_at_recruitment', 5 | 'sex' 6 | ] 7 | questionnaire: [ 8 | ] 9 | metabolomics: [] 10 | one_hot_enc: 11 | basics: [ 12 | 'age_at_recruitment', 13 | 'sex' 14 | ] 15 | questionnaire: [ 16 | ] 17 | metabolomics: [] 18 | general: 19 | metabolomics: [] 20 | measurements: [ 21 | ] 22 | labs: [ 23 | ] 24 | family_history: [ 25 | ] 26 | diagnoses: [] 27 | medications: [] -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/PANELmetabolites.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.PANEL 2 | categorical: 3 | basics: [] 4 | questionnaire: [] 5 | one_hot_enc: 6 | basics: [] 7 | questionnaire: [] 8 | general: 9 | pgs: [] 10 | measurements: [] 11 | labs: [ 12 | "albumin", 13 | "cholesterol", 14 | "hdl_cholesterol", 15 | "ldl_direct", 16 | "triglycerides", 17 | "glucose", 18 | "creatinine", 19 | "urea", 20 | "urate", 21 | ] 22 | family_history: [] 23 | diagnoses: [] 24 | medications: [] 25 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/logging.py: -------------------------------------------------------------------------------- 1 | from neptune.new.integrations.pytorch_lightning import NeptuneLogger 2 | 3 | from typing import Any, Dict, Iterable, Optional, Union 4 | from argparse import Namespace 5 | 6 | 7 | class FoolProofNeptuneLogger(NeptuneLogger): 8 | """ 9 | Logger that does only log params if they do not exceed the str len limit. 
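Concretely (see `log_hyperparams` below): any parameter whose string representation is 16384 characters or longer is dropped before the remaining parameters are written to the run, so oversized entries do not trip Neptune's string-length limit.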
10 | """ 11 | def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: 12 | params = self._convert_params(params) 13 | 14 | parameters_key = self.PARAMETERS_KEY 15 | if self._base_namespace: 16 | parameters_key = f'{self._base_namespace}/{parameters_key}' 17 | 18 | keys_to_pop = [] 19 | for k, v in params.items(): 20 | if len(str(v)) >= 16384: 21 | keys_to_pop.append(k) 22 | for k in keys_to_pop: 23 | params.pop(k) 24 | 25 | self.run[parameters_key] = params 26 | 27 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from lifelines import CRCSplineFitter 4 | 5 | 6 | def get_observed_probability(F_t, events, durations, t0: float): 7 | def ccl(p): return np.log(-np.log(1 - p)) 8 | 9 | T = "time" 10 | E = "event" 11 | 12 | predictions_at_t0 = np.clip(F_t, 1e-10, 1 - 1e-10) 13 | prediction_df = pd.DataFrame({f"ccl_at_{t0}": ccl(predictions_at_t0), T: durations, E: events}) 14 | 15 | if any(x <= 1 for x in events): 16 | pass 17 | else: 18 | prediction_df["event"] = [0 if v > 1 else v for v in prediction_df["event"].to_list()] 19 | 20 | index_old = prediction_df.index 21 | prediction_df = prediction_df.dropna() 22 | index_new = prediction_df.index 23 | diff = index_old.difference(index_new) 24 | 25 | knots = 3 26 | regressors = {"beta_": [f"ccl_at_{t0}"], **{f"gamma{i}_": "1" for i in range(knots)}} 27 | 28 | crc = CRCSplineFitter(knots, penalizer=0.001).fit(prediction_df, T, E, regressors=regressors, show_progress=False) 29 | 30 | risk_obs = (1 - crc.predict_survival_function(prediction_df, times=[t0])).T.squeeze() 31 | 32 | return risk_obs, diff.to_list() 33 | -------------------------------------------------------------------------------- /analysis/preprocessing/0_decode_ukbb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "library(ukbtools)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "outputs": [], 16 | "source": [ 17 | "in_dir = \"dir/where/decoded/file/is\"\n", 18 | "out_path = \"dir/where/to/write/feather\"" 19 | ], 20 | "metadata": { 21 | "collapsed": false, 22 | "pycharm": { 23 | "name": "#%%\n" 24 | } 25 | } 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "my_ukb_data <- ukb_df(\"decoded\", in_dir)\n", 34 | "df_field <- ukb_df_field(\"decoded\", in_dir)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "pycharm": { 42 | "name": "#%%\n" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "arrow::write_feather(my_ukb_data, out_path)\n", 48 | "arrow::write_feather(df_field, out_path)" 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "R [conda env:python]", 55 | "language": "R", 56 | "name": "conda-env-python-r" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": "r", 60 | "file_extension": ".r", 61 | "mimetype": "text/x-r-source", 62 | "name": "R", 63 | "pygments_lexer": "r", 64 | "version": "4.0.3" 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 4 69 | } -------------------------------------------------------------------------------- 
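A minimal sketch (not part of the repository) of reading the decoded table back on the Python side for the downstream preprocessing notebooks; the path is a placeholder for whatever `out_path` was set to in the notebook above:

```python
# Sketch only: load the feather file written by 0_decode_ukbb.ipynb.
# Requires pandas with pyarrow installed; the path below is a placeholder.
import pandas as pd

ukb_df = pd.read_feather("dir/where/to/write/feather")  # same path as `out_path` above
print(ukb_df.shape)              # one row per participant, one column per decoded UKB field
print(list(ukb_df.columns[:5]))  # column names follow the ukbtools naming scheme
```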
/metabolomicstatemodel/source/config/features/PANEL.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.PANEL 2 | categorical: 3 | basics: [ 4 | 'age_at_recruitment', 5 | 'sex', 6 | ] 7 | questionnaire: [ 8 | ] 9 | one_hot_enc: 10 | basics: [ 11 | 'sex', 12 | 'age_at_recruitment' 13 | ] 14 | questionnaire: [ 15 | 'smoking_status_2.0', 16 | 'alcohol_intake_frequency_2.0', 17 | ] 18 | general: 19 | pgs: [] 20 | measurements: [ 21 | 'daily_physical_activity', 22 | 'daily_healthy_food', 23 | 'education_years', 24 | 'body_mass_index_bmi', 25 | 'waist_hip_ratio', 26 | 'weight', 27 | "standing_height", 28 | 'systolic_blood_pressure', 29 | ] 30 | labs: [ 31 | "cholesterol", 32 | "hdl_cholesterol", 33 | "ldl_direct", 34 | "triglycerides", 35 | "glucose", 36 | "glycated_haemoglobin_hba1c", 37 | "creatinine", 38 | "cystatin_c", 39 | "urea", 40 | "urate", 41 | 'aspartate_aminotransferase', 42 | 'alanine_aminotransferase', 43 | 'alkaline_phosphatase', 44 | 'albumin', 45 | "creactive_protein", 46 | 'red_blood_cell_erythrocyte_count', 47 | 'white_blood_cell_leukocyte_count', 48 | 'platelet_count', 49 | 'haemoglobin_concentration', 50 | 'haematocrit_percentage', 51 | 'mean_corpuscular_haemoglobin', 52 | 'mean_corpuscular_volume', 53 | 'mean_corpuscular_haemoglobin_concentration' 54 | ] 55 | family_history: [ 56 | 'fh_diabetes', 57 | ] 58 | diagnoses: [] 59 | medications: [ 60 | "antihypertensives", 61 | ] 62 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal, LogNormal, Weibull, transform_to 5 | 6 | import numpy as np 7 | 8 | 9 | def cox_ph_loss(logh, durations, events, eps=1e-7): 10 | """ 11 | Simple approximation of the COX-ph. Log hazard is not computed on risk-sets, but on ranked list instead. 12 | This approximation is valid for datamodules w/ low percentage of ties. 13 | Credit to Haavard Kamme/PyCox 14 | :param logh: 15 | :param durations: 16 | :param events: 17 | :param eps: 18 | :return: 19 | """ 20 | # sort: 21 | idx = durations.sort(descending=True, dim=0)[1] 22 | events = events[idx].squeeze(-1) 23 | logh = logh[idx].squeeze(-1) 24 | # calculate loss: 25 | gamma = logh.max() 26 | log_cumsum_h = logh.sub(gamma).exp().cumsum(0).add(eps).log().add(gamma) 27 | if events.sum() > 0: 28 | loss = - logh.sub(log_cumsum_h).mul(events).sum().div(events.sum()) 29 | else: 30 | loss = - logh.sub(log_cumsum_h).mul(events).sum() 31 | return loss 32 | 33 | def DSM_uncensored_loss(logf_ts, ks, events, e=1): 34 | """ 35 | We minimize the ELBO of log P(DATASET_uncensored) 36 | equalling the negative sum over all log hazards. 
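Per the implementation below, each uncensored sample (events == e) contributes log(sum_k softmax(ks)_k * f_k(t)), computed as logsumexp(log_softmax(ks) + logf_ts), and the returned loss is the negative mean of this term over the masked samples.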
37 | inputs are expected to be 2D Tensors of shape [B, k_dim] 38 | :param logf_t: 39 | :param durations: 40 | :param events: 41 | :return: 42 | """ 43 | 44 | e_ = torch.Tensor([e]) 45 | e_ = e_.type_as(logf_ts) 46 | zero_ = torch.Tensor([0]) 47 | zero_ = zero_.type_as(logf_ts) 48 | 49 | elbo = torch.logsumexp(F.log_softmax(ks, dim=1)+logf_ts, dim=1, keepdim=True) 50 | mask = torch.eq(events, e_) 51 | elbo = elbo[mask] 52 | 53 | if torch.eq(mask.sum(), zero_): 54 | return torch.Tensor([1.0]).squeeze().type_as(logf_ts) 55 | else: 56 | return -elbo.sum() / (mask.sum()) 57 | 58 | 59 | def DSM_censored_loss(logS_ts, ks, events, e=1): 60 | """ 61 | NLL on log hazards. 62 | 63 | For competing risks, all other events are treated as administrative censoring. 64 | 65 | :param logh: 66 | :param durations: 67 | :param events: 68 | :return: 69 | """ 70 | e_ = torch.Tensor([e]) 71 | e_ = e_.type_as(logS_ts) 72 | 73 | elbo = torch.logsumexp(F.log_softmax(ks, dim=1)+logS_ts, dim=1, keepdim=True) 74 | mask = torch.ne(events, e_) 75 | elbo = elbo[mask] 76 | 77 | return -elbo.sum()/mask.sum() 78 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytorch_lightning as pl 6 | 7 | from .logging import FoolProofNeptuneLogger 8 | 9 | 10 | #################################################################################################### 11 | # neptune # 12 | #################################################################################################### 13 | 14 | def set_up_neptune(FLAGS={}, close_after_fit=False, **kwargs): 15 | """ 16 | Set up a neptune logger from file. 17 | :param keyfile: 18 | :param project: 19 | :param name: 20 | :param params: 21 | :param tags: 22 | :param close_after_fit: 23 | :param kwargs: 24 | :return: 25 | """ 26 | if not "NEPTUNE_API_TOKEN" in os.environ: 27 | raise EnvironmentError('Please set environment variable `NEPTUNE_API_TOKEN`.') 28 | 29 | neptune_logger = FoolProofNeptuneLogger(api_key=os.environ["NEPTUNE_API_TOKEN"], 30 | close_after_fit=close_after_fit, 31 | **FLAGS.setup) 32 | return neptune_logger 33 | 34 | 35 | def get_default_callbacks(monitor='Ctd_0.9', mode='max', early_stop=True): 36 | """ 37 | Instantate the default callbacks: EarlyStopping and Checkpointing. 
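The returned list holds a ModelCheckpoint (top 3 checkpoints plus the last, tracking `monitor` with `mode`), an EarlyStopping callback (patience 15) when `early_stop` is True, and a LearningRateMonitor.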
38 | 39 | :param monitor: 40 | :param mode: 41 | :return: 42 | """ 43 | checkpoint_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(monitor=monitor, verbose=True, 44 | save_last=True, save_top_k=3, 45 | save_weights_only=False, mode=mode, 46 | period=1) 47 | lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step', log_momentum=False) 48 | if early_stop: 49 | early_stop = pl.callbacks.early_stopping.EarlyStopping(monitor=monitor, min_delta=1e-5, patience=15, 50 | verbose=True, mode=mode, strict=False) 51 | return [checkpoint_callback, early_stop, lr_monitor] 52 | else: 53 | return [checkpoint_callback, lr_monitor] 54 | 55 | 56 | def attribution2df(attributions, feature_names, loader): 57 | attribution_sum = attributions.detach().numpy().sum(0) 58 | attribution_norm_sum = attribution_sum / np.linalg.norm(attribution_sum, ord=1) 59 | axis_data = np.arange(loader.shape[1]) 60 | data_labels = list(map(lambda idx: feature_names[idx], axis_data)) 61 | df = pd.DataFrame({'feature': data_labels, 62 | 'importance': attribution_norm_sum}) 63 | sorted_df = df.reindex(df.importance.abs().sort_values(ascending=False).index) 64 | return sorted_df 65 | 66 | -------------------------------------------------------------------------------- /metabolomicstatemodel/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import warnings 4 | 5 | import hydra 6 | import torch 7 | import torch.nn as nn 8 | import pandas as pd 9 | import pytorch_lightning as pl 10 | import neptune.new as neptune 11 | 12 | from omegaconf import DictConfig, ListConfig, OmegaConf 13 | from torch.nn import Sigmoid, SELU, ReLU 14 | from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau, MultiStepLR 15 | from pytorch_lightning.callbacks import LearningRateMonitor, StochasticWeightAveraging 16 | 17 | from source.datamodules import * 18 | from source.tasks import * 19 | from source.modules import * 20 | from source.utils import set_up_neptune, get_default_callbacks 21 | from source.callbacks import WriteCheckpointLogs, WritePredictionsDataFrame 22 | 23 | 24 | # globals: 25 | warnings.filterwarnings("ignore", category=RuntimeWarning) 26 | warnings.filterwarnings("ignore", category=UserWarning) 27 | pd.options.mode.use_inf_as_na = True 28 | pl.seed_everything(23) #the number of doom 29 | 30 | 31 | assert os.environ['NEPTUNE_API_TOKEN'], 'No Neptune API Token found. Please do `export NEPTUNE_API_TOKEN=`.' 
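# Note: the class names given in the Hydra config (experiment.task, experiment.module,
# experiment.datamodule, experiment.latent_module) are resolved with eval() inside
# train() below, so they must be importable via the star-imports above
# (source.datamodules / source.tasks / source.modules).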
32 | config_path = "source/config/" 33 | 34 | 35 | @hydra.main(config_path, config_name="config") 36 | def train(FLAGS: DictConfig): 37 | OmegaConf.set_struct(FLAGS, False) 38 | FLAGS.config_path = config_path 39 | 40 | # get classes 41 | Task = eval(FLAGS.experiment.task) 42 | Module = eval(FLAGS.experiment.module) 43 | DataModule = eval(FLAGS.experiment.datamodule) 44 | if FLAGS.experiment.latent_module is not None: 45 | LatentModule = eval(FLAGS.experiment.latent_module) 46 | else: 47 | LatentModule = None 48 | 49 | # initialize datamodule 50 | # load features.yaml if necessary: 51 | if FLAGS.experiment.feature_set is not None: 52 | FLAGS.experiment.features = OmegaConf.load(os.path.join(FLAGS.config_path, FLAGS.experiment.features_yaml)) 53 | datamodule = DataModule(**FLAGS.experiment) 54 | datamodule.prepare_data() 55 | datamodule.setup("fit") 56 | FLAGS["data"] = {"feature_names": datamodule.features} 57 | 58 | # get network: 59 | ft_extractor = Module(input_dim=len(datamodule.features), **FLAGS.experiment.module_kwargs) 60 | if LatentModule is not None: 61 | if LatentModule == ResidualHeadMLP: 62 | FLAGS.experiment.latent_module_kwargs.skip_connection_input_dim = len(datamodule.features) 63 | cause_specific = LatentModule(**FLAGS.experiment.latent_module_kwargs) 64 | else: 65 | cause_specific = nn.Identity() 66 | 67 | # initialize Task 68 | task = Task(feature_extractor=ft_extractor, 69 | latent_mlp=cause_specific, 70 | feature_dim=len(datamodule.features), 71 | **FLAGS.experiment) 72 | 73 | # initialize trainer 74 | callbacks = get_default_callbacks(monitor=FLAGS.experiment.monitor) 75 | callbacks.extend([WriteCheckpointLogs(), 76 | WritePredictionsDataFrame( 77 | write_calibrated_predictions=FLAGS.experiment.write_calibrated_predictions) 78 | ] 79 | ) 80 | 81 | trainer = pl.Trainer(**FLAGS.trainer, 82 | callbacks=callbacks, 83 | logger=set_up_neptune(FLAGS)) 84 | 85 | FLAGS["parameters/callbacks"] = [c.__class__.__name__ for c in callbacks] 86 | trainer.logger.run["FLAGS"] = FLAGS 87 | 88 | if FLAGS.trainer.auto_lr_find: 89 | trainer.tune(model=task, datamodule=datamodule) 90 | 91 | # run 92 | trainer.fit(task, datamodule) 93 | trainer.logger.run.stop() 94 | 95 | 96 | if __name__ == '__main__': 97 | train() 98 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pandas as pd 4 | 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | from pytorch_lightning.callbacks.base import Callback 8 | 9 | 10 | class WriteCheckpointLogs(Callback): 11 | """ 12 | Write final logs to neptune. 13 | """ 14 | def on_keyboard_interrupt(self, trainer, pl_module, device='cuda:0'): 15 | self.on_epoch_end(trainer, pl_module) 16 | 17 | def on_epoch_end(self, trainer, pl_module): 18 | if isinstance(trainer.logger, list): 19 | logger = trainer.logger[0] 20 | else: 21 | logger = trainer.logger 22 | if torch.is_tensor(trainer.checkpoint_callback.best_model_score): 23 | logger.run["checkpoint_metric"] = trainer.checkpoint_callback.monitor 24 | logger.run["checkpoint_value"] = str(trainer.checkpoint_callback.best_model_score.item()) 25 | logger.run["checkpoint_path"] = trainer.checkpoint_callback.best_model_path 26 | 27 | 28 | class WritePredictionsDataFrame(Callback): 29 | """ 30 | Write Predictions generated by `predict_dataset` or `predict_dataset_with_uncertainty` that return pd.DataFrames. 
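On fit end (or keyboard interrupt) the best checkpoint is reloaded, predictions for the train/valid/test splits are concatenated together with bookkeeping columns (eid, split, partition, module, net, datamodule, event_names, feature_names), and the result is written to predictions.feather / predictions.csv in a predictions/ folder next to the checkpoints and registered in the Neptune run.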
31 | """ 32 | def __init__(self, write_calibrated_predictions=True, **kwargs): 33 | super().__init__() 34 | self.write_calibrated_predictions = write_calibrated_predictions 35 | 36 | def on_keyboard_interrupt(self, trainer, module, device='cuda:0'): 37 | self.on_fit_end(trainer, module, device) 38 | 39 | def on_fit_end(self, trainer, module, device='cuda:0'): # how to set inference device better? adaptive to train device? 40 | ckpt = torch.load(trainer.checkpoint_callback.best_model_path) 41 | module.load_state_dict(ckpt['state_dict']) 42 | module.eval() 43 | module.to(device) 44 | 45 | time_max = 26 # effective real time max is time_max-2 -> 25 years 46 | times = [e for e in range(1, time_max, 1)] 47 | if self.write_calibrated_predictions: 48 | module.fit_isotonic_regressor(trainer.datamodule.train_ds, times, 100000) 49 | 50 | # write the predictions.csv 51 | predictions = {} 52 | for ds_idx, (ds, ds_name) in enumerate(tqdm([(trainer.datamodule.train_ds, 'train'), 53 | (trainer.datamodule.valid_ds, 'valid'), 54 | (trainer.datamodule.test_ds, 'test')])): 55 | if self.write_calibrated_predictions: 56 | predictions[ds_name] = module.predict_dataset_calibrated(ds, times) 57 | else: 58 | predictions[ds_name] = module.predict_dataset(ds, times) 59 | predictions[ds_name]['eid'] = ds.datasets[0].eid_map.index.values 60 | predictions[ds_name]["split"] = ds_name 61 | predictions_df = pd.concat([*predictions.values()]).reset_index(drop=True) 62 | predictions_df["partition"] = trainer.datamodule.cv_partition 63 | predictions_df["module"] = type(module).__name__ 64 | try: 65 | predictions_df["net"] = type(module.net).__name__ 66 | except AttributeError: 67 | pass 68 | predictions_df["datamodule"] = type(trainer.datamodule).__name__ 69 | predictions_df["event_names"] = str(trainer.datamodule.event) 70 | predictions_df["feature_names"] = str(trainer.datamodule.features) 71 | 72 | self.write_and_log(trainer, predictions_df) 73 | 74 | def write_and_log(self, trainer, predictions_df): 75 | # write the predictions.csv 76 | outdir = os.path.join(Path(trainer.checkpoint_callback.dirpath).parent, "predictions") 77 | if not os.path.exists(outdir): 78 | os.mkdir(outdir) 79 | predictions_df.to_feather(os.path.join(outdir, "predictions.feather")) 80 | predictions_df.to_csv(os.path.join(outdir, "predictions.csv")) 81 | 82 | if isinstance(trainer.logger, list): 83 | trainer.logger[0].run["prediction_available"] = "TRUE" 84 | trainer.logger[0].run["prediction_path"] = os.path.join(outdir, "predictions.feather") 85 | else: 86 | trainer.logger.run["prediction_available"] = "TRUE" 87 | trainer.logger.run["prediction_path"] = os.path.join(outdir, "predictions.feather") 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ![Logo](./src/msm_logo.png?raw=true "Logo") 4 | 5 | ⛑ **Metabolomic profiles predict individual multi-disease outcomes** ⛑ 6 | 7 | [comment]: <> () 15 | 16 |
17 | 18 | ## Description 19 | Code related to the paper "Metabolomic profiles predict individual multi-disease outcomes in the UK Biobank cohort". 20 | This repo provides a Python package for preprocessing UK Biobank data and for training and evaluating the proposed MetabolomicStateModel. 21 | 22 | ![Workflow](./src/fig1.png?raw=true "Workflow") 23 | 24 | ## Methods 25 | The **MetabolomicStateModel** is based on [DeepSurv](https://arxiv.org/abs/1606.00931) (the original implementation can be found [here](https://github.com/jaredleekatzman/DeepSurv)). Using a residual neural network, it learns a shared representation of the NMR metabolomics data to predict log partial hazards for common disease endpoints. 26 | 27 | ![Architecture](./src/fig2.png?raw=true "Architecture") 28 | 29 | ## Assets 30 | This repo contains code to preprocess [UK Biobank](https://www.ukbiobank.ac.uk/) data, train the MetabolomicStateModel and analyze/evaluate its performance. 31 | 32 | - Preprocessing involves parsing primary care records for the desired diagnoses. 33 | - Training involves model specification via PyTorch Lightning and Hydra. 34 | - Evaluation involves extensive benchmarks against linear models and the calculation of bootstrapped metrics. 35 | - Visualization contains the code to generate the figures displayed in the paper. 36 | 37 | ## Use the MetabolomicStateModel on your data 38 | We provide a ready-to-use [Google colab notebook](https://colab.research.google.com/github/thbuerg/MetabolomicsCommonDiseases/blob/main/analysis/examples/MetabolomicsInference.ipynb) with a trained version of our MetabolomicStateModel. Upload your dataset of Nightingale NMR metabolomics and run the model! 39 | 40 | **NOTE**: Data must be provided in [this format](https://github.com/thbuerg/MetabolomicsCommonDiseases/blob/main/analysis/examples/sample.csv). 41 | 42 | **DISCLAIMER**: This model is intended for research use only. We provide the NMR normalization pipeline as fitted on UK Biobank. Cohort-specific rescaling might be advisable. 43 | 44 | ## How to train the MetabolomicStateModel 45 | 1. First, install the dependencies: 46 | ```bash 47 | # clone project 48 | git clone https://github.com/thbuerg/MetabolomicsCommonDiseases 49 | 50 | # install project 51 | cd MetabolomicsCommonDiseases 52 | pip install -e . 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | 2. Download UK Biobank data and execute the preprocessing notebooks on the downloaded data. 57 | 58 | 3. Set up [Neptune.ai](https://www.neptune.ai). 59 | 60 | 4. Edit the `config.yaml` in `metabolomicstatemodel/source/config/`: 61 | ```yaml 62 | data_dir: /path/to/data 63 | code_dir: /path/to/repo_base 64 | setup: 65 | project: / 66 | experiment: 67 | tabular_filepath: /path/to/processed/data 68 | ``` 69 | 70 | 5. Train the MetabolomicStateModel (make sure you are on a machine with a GPU): 71 | ```bash 72 | # module folder 73 | cd metabolomicstatemodel 74 | 75 | # run training 76 | bash train.sh 77 | ``` 78 | 79 | ## License 80 | This work is licensed under a 81 | [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License][cc-by-nc-sa].
82 | 83 | [![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa] 84 | 85 | [cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/ 86 | [cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png 87 | [cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg 88 | 89 | ## Citation 90 | ``` 91 | @article{buergel2022metabolomic, 92 | title={Metabolomic profiles predict individual multidisease outcomes}, 93 | author={Buergel, Thore and Steinfeldt, Jakob and Ruyoga, Greg and Pietzner, Maik and Bizzarri, Daniele and Vojinovic, Dina and Upmeier zu Belzen, Julius and Loock, Lukas and Kittner, Paul and Christmann, Lara and others}, 94 | journal={Nature Medicine}, 95 | pages={1--12}, 96 | year={2022}, 97 | publisher={Nature Publishing Group} 98 | } 99 | ``` 100 | -------------------------------------------------------------------------------- /analysis/examples/assets.yaml: -------------------------------------------------------------------------------- 1 | endpoints: 2 | ['M_MACE', 3 | 'M_all_cause_dementia', 4 | 'M_type_2_diabetes', 5 | 'M_liver_disease', 6 | 'M_renal_disease', 7 | 'M_atrial_fibrillation', 8 | 'M_heart_failure', 9 | 'M_coronary_heart_disease', 10 | 'M_venous_thrombosis', 11 | 'M_cerebral_stroke', 12 | 'M_abdominal_aortic_aneurysm', 13 | 'M_peripheral_arterial_disease', 14 | 'M_asthma', 15 | 'M_chronic_obstructuve_pulmonary_disease', 16 | 'M_lung_cancer', 17 | 'M_non_melanoma_skin_cancer', 18 | 'M_colon_cancer', 19 | 'M_rectal_cancer', 20 | 'M_prostate_cancer', 21 | 'M_breast_cancer', 22 | 'M_parkinsons_disease', 23 | 'M_fractures', 24 | 'M_cataracts', 25 | 'M_glaucoma' 26 | ] 27 | metabolite_labels: 28 | ['NMR_3hydroxybutyrate', 29 | 'NMR_acetate', 30 | 'NMR_acetoacetate', 31 | 'NMR_acetone', 32 | 'NMR_alanine', 33 | 'NMR_albumin', 34 | 'NMR_apolipoprotein_a1', 35 | 'NMR_apolipoprotein_b', 36 | 'NMR_average_diameter_for_hdl_particles', 37 | 'NMR_average_diameter_for_ldl_particles', 38 | 'NMR_average_diameter_for_vldl_particles', 39 | 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 40 | 'NMR_cholesterol_in_idl', 41 | 'NMR_cholesterol_in_large_hdl', 42 | 'NMR_cholesterol_in_large_ldl', 43 | 'NMR_cholesterol_in_large_vldl', 44 | 'NMR_cholesterol_in_medium_hdl', 45 | 'NMR_cholesterol_in_medium_ldl', 46 | 'NMR_cholesterol_in_medium_vldl', 47 | 'NMR_cholesterol_in_small_hdl', 48 | 'NMR_cholesterol_in_small_ldl', 49 | 'NMR_cholesterol_in_small_vldl', 50 | 'NMR_cholesterol_in_very_large_hdl', 51 | 'NMR_cholesterol_in_very_large_vldl', 52 | 'NMR_cholesterol_in_very_small_vldl', 53 | 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 54 | 'NMR_cholesteryl_esters_in_hdl', 55 | 'NMR_cholesteryl_esters_in_idl', 56 | 'NMR_cholesteryl_esters_in_ldl', 57 | 'NMR_cholesteryl_esters_in_large_hdl', 58 | 'NMR_cholesteryl_esters_in_large_ldl', 59 | 'NMR_cholesteryl_esters_in_large_vldl', 60 | 'NMR_cholesteryl_esters_in_medium_hdl', 61 | 'NMR_cholesteryl_esters_in_medium_ldl', 62 | 'NMR_cholesteryl_esters_in_medium_vldl', 63 | 'NMR_cholesteryl_esters_in_small_hdl', 64 | 'NMR_cholesteryl_esters_in_small_ldl', 65 | 'NMR_cholesteryl_esters_in_small_vldl', 66 | 'NMR_cholesteryl_esters_in_vldl', 67 | 'NMR_cholesteryl_esters_in_very_large_hdl', 68 | 'NMR_cholesteryl_esters_in_very_large_vldl', 69 | 'NMR_cholesteryl_esters_in_very_small_vldl', 70 | 'NMR_citrate', 71 | 'NMR_clinical_ldl_cholesterol', 72 | 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 73 | 'NMR_concentration_of_hdl_particles', 74 | 
'NMR_concentration_of_idl_particles', 75 | 'NMR_concentration_of_ldl_particles', 76 | 'NMR_concentration_of_large_hdl_particles', 77 | 'NMR_concentration_of_large_ldl_particles', 78 | 'NMR_concentration_of_large_vldl_particles', 79 | 'NMR_concentration_of_medium_hdl_particles', 80 | 'NMR_concentration_of_medium_ldl_particles', 81 | 'NMR_concentration_of_medium_vldl_particles', 82 | 'NMR_concentration_of_small_hdl_particles', 83 | 'NMR_concentration_of_small_ldl_particles', 84 | 'NMR_concentration_of_small_vldl_particles', 85 | 'NMR_concentration_of_vldl_particles', 86 | 'NMR_concentration_of_very_large_hdl_particles', 87 | 'NMR_concentration_of_very_large_vldl_particles', 88 | 'NMR_concentration_of_very_small_vldl_particles', 89 | 'NMR_creatinine', 90 | 'NMR_degree_of_unsaturation', 91 | 'NMR_docosahexaenoic_acid', 92 | 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 93 | 'NMR_free_cholesterol_in_hdl', 94 | 'NMR_free_cholesterol_in_idl', 95 | 'NMR_free_cholesterol_in_ldl', 96 | 'NMR_free_cholesterol_in_large_hdl', 97 | 'NMR_free_cholesterol_in_large_ldl', 98 | 'NMR_free_cholesterol_in_large_vldl', 99 | 'NMR_free_cholesterol_in_medium_hdl', 100 | 'NMR_free_cholesterol_in_medium_ldl', 101 | 'NMR_free_cholesterol_in_medium_vldl', 102 | 'NMR_free_cholesterol_in_small_hdl', 103 | 'NMR_free_cholesterol_in_small_ldl', 104 | 'NMR_free_cholesterol_in_small_vldl', 105 | 'NMR_free_cholesterol_in_vldl', 106 | 'NMR_free_cholesterol_in_very_large_hdl', 107 | 'NMR_free_cholesterol_in_very_large_vldl', 108 | 'NMR_free_cholesterol_in_very_small_vldl', 109 | 'NMR_glucose', 110 | 'NMR_glutamine', 111 | 'NMR_glycine', 112 | 'NMR_glycoprotein_acetyls', 113 | 'NMR_hdl_cholesterol', 114 | 'NMR_histidine', 115 | 'NMR_isoleucine', 116 | 'NMR_ldl_cholesterol', 117 | 'NMR_lactate', 118 | 'NMR_leucine', 119 | 'NMR_linoleic_acid', 120 | 'NMR_monounsaturated_fatty_acids', 121 | 'NMR_omega3_fatty_acids', 122 | 'NMR_omega6_fatty_acids', 123 | 'NMR_phenylalanine', 124 | 'NMR_phosphatidylcholines', 125 | 'NMR_phosphoglycerides', 126 | 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 127 | 'NMR_phospholipids_in_hdl', 128 | 'NMR_phospholipids_in_idl', 129 | 'NMR_phospholipids_in_ldl', 130 | 'NMR_phospholipids_in_large_hdl', 131 | 'NMR_phospholipids_in_large_ldl', 132 | 'NMR_phospholipids_in_large_vldl', 133 | 'NMR_phospholipids_in_medium_hdl', 134 | 'NMR_phospholipids_in_medium_ldl', 135 | 'NMR_phospholipids_in_medium_vldl', 136 | 'NMR_phospholipids_in_small_hdl', 137 | 'NMR_phospholipids_in_small_ldl', 138 | 'NMR_phospholipids_in_small_vldl', 139 | 'NMR_phospholipids_in_vldl', 140 | 'NMR_phospholipids_in_very_large_hdl', 141 | 'NMR_phospholipids_in_very_large_vldl', 142 | 'NMR_phospholipids_in_very_small_vldl', 143 | 'NMR_polyunsaturated_fatty_acids', 144 | 'NMR_pyruvate', 145 | 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 146 | 'NMR_saturated_fatty_acids', 147 | 'NMR_sphingomyelins', 148 | 'NMR_total_cholesterol', 149 | 'NMR_total_cholesterol_minus_hdlc', 150 | 'NMR_total_cholines', 151 | 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 152 | 'NMR_total_concentration_of_lipoprotein_particles', 153 | 'NMR_total_esterified_cholesterol', 154 | 'NMR_total_fatty_acids', 155 | 'NMR_total_free_cholesterol', 156 | 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 157 | 'NMR_total_lipids_in_hdl', 158 | 'NMR_total_lipids_in_idl', 159 | 'NMR_total_lipids_in_ldl', 160 | 'NMR_total_lipids_in_large_hdl', 161 | 'NMR_total_lipids_in_large_ldl', 162 | 
'NMR_total_lipids_in_large_vldl', 163 | 'NMR_total_lipids_in_lipoprotein_particles', 164 | 'NMR_total_lipids_in_medium_hdl', 165 | 'NMR_total_lipids_in_medium_ldl', 166 | 'NMR_total_lipids_in_medium_vldl', 167 | 'NMR_total_lipids_in_small_hdl', 168 | 'NMR_total_lipids_in_small_ldl', 169 | 'NMR_total_lipids_in_small_vldl', 170 | 'NMR_total_lipids_in_vldl', 171 | 'NMR_total_lipids_in_very_large_hdl', 172 | 'NMR_total_lipids_in_very_large_vldl', 173 | 'NMR_total_lipids_in_very_small_vldl', 174 | 'NMR_total_phospholipids_in_lipoprotein_particles', 175 | 'NMR_total_triglycerides', 176 | 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 177 | 'NMR_triglycerides_in_hdl', 178 | 'NMR_triglycerides_in_idl', 179 | 'NMR_triglycerides_in_ldl', 180 | 'NMR_triglycerides_in_large_hdl', 181 | 'NMR_triglycerides_in_large_ldl', 182 | 'NMR_triglycerides_in_large_vldl', 183 | 'NMR_triglycerides_in_medium_hdl', 184 | 'NMR_triglycerides_in_medium_ldl', 185 | 'NMR_triglycerides_in_medium_vldl', 186 | 'NMR_triglycerides_in_small_hdl', 187 | 'NMR_triglycerides_in_small_ldl', 188 | 'NMR_triglycerides_in_small_vldl', 189 | 'NMR_triglycerides_in_vldl', 190 | 'NMR_triglycerides_in_very_large_hdl', 191 | 'NMR_triglycerides_in_very_large_vldl', 192 | 'NMR_triglycerides_in_very_small_vldl', 193 | 'NMR_tyrosine', 194 | 'NMR_vldl_cholesterol', 195 | 'NMR_valine' 196 | ] 197 | 198 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/Metabolomics.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.complex_feature_set.Metabolomics 2 | categorical: 3 | basics: [] 4 | questionnaire: [] 5 | one_hot_enc: 6 | basics: [] 7 | questionnaire: [] 8 | general: 9 | metabolomics: [ 10 | 'NMR_3hydroxybutyrate', 11 | 'NMR_acetate', 12 | 'NMR_acetoacetate', 13 | 'NMR_acetone', 14 | 'NMR_alanine', 15 | 'NMR_albumin', 16 | 'NMR_apolipoprotein_a1', 17 | 'NMR_apolipoprotein_b', 18 | 'NMR_average_diameter_for_hdl_particles', 19 | 'NMR_average_diameter_for_ldl_particles', 20 | 'NMR_average_diameter_for_vldl_particles', 21 | 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 22 | 'NMR_cholesterol_in_idl', 23 | 'NMR_cholesterol_in_large_hdl', 24 | 'NMR_cholesterol_in_large_ldl', 25 | 'NMR_cholesterol_in_large_vldl', 26 | 'NMR_cholesterol_in_medium_hdl', 27 | 'NMR_cholesterol_in_medium_ldl', 28 | 'NMR_cholesterol_in_medium_vldl', 29 | 'NMR_cholesterol_in_small_hdl', 30 | 'NMR_cholesterol_in_small_ldl', 31 | 'NMR_cholesterol_in_small_vldl', 32 | 'NMR_cholesterol_in_very_large_hdl', 33 | 'NMR_cholesterol_in_very_large_vldl', 34 | 'NMR_cholesterol_in_very_small_vldl', 35 | 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 36 | 'NMR_cholesteryl_esters_in_hdl', 37 | 'NMR_cholesteryl_esters_in_idl', 38 | 'NMR_cholesteryl_esters_in_ldl', 39 | 'NMR_cholesteryl_esters_in_large_hdl', 40 | 'NMR_cholesteryl_esters_in_large_ldl', 41 | 'NMR_cholesteryl_esters_in_large_vldl', 42 | 'NMR_cholesteryl_esters_in_medium_hdl', 43 | 'NMR_cholesteryl_esters_in_medium_ldl', 44 | 'NMR_cholesteryl_esters_in_medium_vldl', 45 | 'NMR_cholesteryl_esters_in_small_hdl', 46 | 'NMR_cholesteryl_esters_in_small_ldl', 47 | 'NMR_cholesteryl_esters_in_small_vldl', 48 | 'NMR_cholesteryl_esters_in_vldl', 49 | 'NMR_cholesteryl_esters_in_very_large_hdl', 50 | 'NMR_cholesteryl_esters_in_very_large_vldl', 51 | 'NMR_cholesteryl_esters_in_very_small_vldl', 52 | 'NMR_citrate', 53 | 'NMR_clinical_ldl_cholesterol', 54 | 
'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 55 | 'NMR_concentration_of_hdl_particles', 56 | 'NMR_concentration_of_idl_particles', 57 | 'NMR_concentration_of_ldl_particles', 58 | 'NMR_concentration_of_large_hdl_particles', 59 | 'NMR_concentration_of_large_ldl_particles', 60 | 'NMR_concentration_of_large_vldl_particles', 61 | 'NMR_concentration_of_medium_hdl_particles', 62 | 'NMR_concentration_of_medium_ldl_particles', 63 | 'NMR_concentration_of_medium_vldl_particles', 64 | 'NMR_concentration_of_small_hdl_particles', 65 | 'NMR_concentration_of_small_ldl_particles', 66 | 'NMR_concentration_of_small_vldl_particles', 67 | 'NMR_concentration_of_vldl_particles', 68 | 'NMR_concentration_of_very_large_hdl_particles', 69 | 'NMR_concentration_of_very_large_vldl_particles', 70 | 'NMR_concentration_of_very_small_vldl_particles', 71 | 'NMR_creatinine', 72 | 'NMR_degree_of_unsaturation', 73 | 'NMR_docosahexaenoic_acid', 74 | 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 75 | 'NMR_free_cholesterol_in_hdl', 76 | 'NMR_free_cholesterol_in_idl', 77 | 'NMR_free_cholesterol_in_ldl', 78 | 'NMR_free_cholesterol_in_large_hdl', 79 | 'NMR_free_cholesterol_in_large_ldl', 80 | 'NMR_free_cholesterol_in_large_vldl', 81 | 'NMR_free_cholesterol_in_medium_hdl', 82 | 'NMR_free_cholesterol_in_medium_ldl', 83 | 'NMR_free_cholesterol_in_medium_vldl', 84 | 'NMR_free_cholesterol_in_small_hdl', 85 | 'NMR_free_cholesterol_in_small_ldl', 86 | 'NMR_free_cholesterol_in_small_vldl', 87 | 'NMR_free_cholesterol_in_vldl', 88 | 'NMR_free_cholesterol_in_very_large_hdl', 89 | 'NMR_free_cholesterol_in_very_large_vldl', 90 | 'NMR_free_cholesterol_in_very_small_vldl', 91 | 'NMR_glucose', 92 | 'NMR_glutamine', 93 | 'NMR_glycine', 94 | 'NMR_glycoprotein_acetyls', 95 | 'NMR_hdl_cholesterol', 96 | 'NMR_histidine', 97 | 'NMR_isoleucine', 98 | 'NMR_ldl_cholesterol', 99 | 'NMR_lactate', 100 | 'NMR_leucine', 101 | 'NMR_linoleic_acid', 102 | 'NMR_monounsaturated_fatty_acids', 103 | 'NMR_omega3_fatty_acids', 104 | 'NMR_omega6_fatty_acids', 105 | 'NMR_phenylalanine', 106 | 'NMR_phosphatidylcholines', 107 | 'NMR_phosphoglycerides', 108 | 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 109 | 'NMR_phospholipids_in_hdl', 110 | 'NMR_phospholipids_in_idl', 111 | 'NMR_phospholipids_in_ldl', 112 | 'NMR_phospholipids_in_large_hdl', 113 | 'NMR_phospholipids_in_large_ldl', 114 | 'NMR_phospholipids_in_large_vldl', 115 | 'NMR_phospholipids_in_medium_hdl', 116 | 'NMR_phospholipids_in_medium_ldl', 117 | 'NMR_phospholipids_in_medium_vldl', 118 | 'NMR_phospholipids_in_small_hdl', 119 | 'NMR_phospholipids_in_small_ldl', 120 | 'NMR_phospholipids_in_small_vldl', 121 | 'NMR_phospholipids_in_vldl', 122 | 'NMR_phospholipids_in_very_large_hdl', 123 | 'NMR_phospholipids_in_very_large_vldl', 124 | 'NMR_phospholipids_in_very_small_vldl', 125 | 'NMR_polyunsaturated_fatty_acids', 126 | 'NMR_pyruvate', 127 | 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 128 | 'NMR_saturated_fatty_acids', 129 | 'NMR_sphingomyelins', 130 | 'NMR_total_cholesterol', 131 | 'NMR_total_cholesterol_minus_hdlc', 132 | 'NMR_total_cholines', 133 | 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 134 | 'NMR_total_concentration_of_lipoprotein_particles', 135 | 'NMR_total_esterified_cholesterol', 136 | 'NMR_total_fatty_acids', 137 | 'NMR_total_free_cholesterol', 138 | 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 139 | 'NMR_total_lipids_in_hdl', 140 | 'NMR_total_lipids_in_idl', 141 | 
'NMR_total_lipids_in_ldl', 142 | 'NMR_total_lipids_in_large_hdl', 143 | 'NMR_total_lipids_in_large_ldl', 144 | 'NMR_total_lipids_in_large_vldl', 145 | 'NMR_total_lipids_in_lipoprotein_particles', 146 | 'NMR_total_lipids_in_medium_hdl', 147 | 'NMR_total_lipids_in_medium_ldl', 148 | 'NMR_total_lipids_in_medium_vldl', 149 | 'NMR_total_lipids_in_small_hdl', 150 | 'NMR_total_lipids_in_small_ldl', 151 | 'NMR_total_lipids_in_small_vldl', 152 | 'NMR_total_lipids_in_vldl', 153 | 'NMR_total_lipids_in_very_large_hdl', 154 | 'NMR_total_lipids_in_very_large_vldl', 155 | 'NMR_total_lipids_in_very_small_vldl', 156 | 'NMR_total_phospholipids_in_lipoprotein_particles', 157 | 'NMR_total_triglycerides', 158 | 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 159 | 'NMR_triglycerides_in_hdl', 160 | 'NMR_triglycerides_in_idl', 161 | 'NMR_triglycerides_in_ldl', 162 | 'NMR_triglycerides_in_large_hdl', 163 | 'NMR_triglycerides_in_large_ldl', 164 | 'NMR_triglycerides_in_large_vldl', 165 | 'NMR_triglycerides_in_medium_hdl', 166 | 'NMR_triglycerides_in_medium_ldl', 167 | 'NMR_triglycerides_in_medium_vldl', 168 | 'NMR_triglycerides_in_small_hdl', 169 | 'NMR_triglycerides_in_small_ldl', 170 | 'NMR_triglycerides_in_small_vldl', 171 | 'NMR_triglycerides_in_vldl', 172 | 'NMR_triglycerides_in_very_large_hdl', 173 | 'NMR_triglycerides_in_very_large_vldl', 174 | 'NMR_triglycerides_in_very_small_vldl', 175 | 'NMR_tyrosine', 176 | 'NMR_vldl_cholesterol', 177 | 'NMR_valine', 178 | ] 179 | basics: [] 180 | questionnaire: [] 181 | measurements: [] 182 | labs: [] 183 | family_history: [] 184 | diagnoses: [] 185 | medications: [] 186 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/config.yaml: -------------------------------------------------------------------------------- 1 | #data_dir: /path/to/data 2 | #code_dir: /path/to/repo_base 3 | data_dir: /sc-projects/sc-proj-ukb-cvd 4 | code_dir: /home/buergelt/projects/cardiors/code/MetabolomicsCommonDiseases 5 | setup: 6 | # project: YourNeptune/Project 7 | project: CardioRS/metabolomics 8 | name: MSM 9 | tags: MSM_train 10 | trainer: 11 | default_root_dir: ${data_dir}/results/models 12 | gpus: 1 13 | precision: 16 14 | val_check_interval: 1.0 15 | overfit_batches: 0.0 16 | fast_dev_run: False 17 | track_grad_norm: 0 18 | max_epochs: 100 19 | stochastic_weight_avg: True 20 | auto_lr_find: False 21 | experiment: 22 | seed: 23 23 | num_workers: 4 24 | monitor: "Avg__C_10" 25 | report_train_metrics: False 26 | evaluation_time_points: [10] 27 | evaluation_quantile_bins: None 28 | write_calibrated_predictions: False 29 | task_names: [ 30 | "M_MACE", 31 | "M_all_cause_dementia", 32 | "M_type_2_diabetes", 33 | "M_liver_disease", 34 | "M_renal_disease", 35 | "M_atrial_fibrillation", 36 | "M_heart_failure", 37 | "M_coronary_heart_disease", 38 | "M_venous_thrombosis", 39 | "M_cerebral_stroke", 40 | "M_abdominal_aortic_aneurysm", 41 | "M_peripheral_arterial_disease", 42 | "M_asthma", 43 | "M_chronic_obstructuve_pulmonary_disease", 44 | "M_lung_cancer", 45 | "M_non_melanoma_skin_cancer", 46 | "M_colon_cancer", 47 | "M_rectal_cancer", 48 | "M_prostate_cancer", 49 | "M_breast_cancer", 50 | "M_parkinsons_disease", 51 | "M_fractures", 52 | "M_cataracts", 53 | "M_glaucoma" 54 | ] 55 | task_weights: 56 | M_MACE: 1, 57 | M_all_cause_dementia: 1, 58 | M_type_2_diabetes: 1, 59 | M_liver_disease: 1, 60 | M_renal_disease: 1, 61 | M_atrial_fibrillation: 1, 62 | M_heart_failure: 1, 63 | M_coronary_heart_disease: 1, 64 | 
M_venous_thrombosis: 1, 65 | M_cerebral_stroke: 1, 66 | M_haemorrhagic_stroke: 1, 67 | M_abdominal_aortic_aneurysm: 1, 68 | M_peripheral_arterial_disease: 1, 69 | M_asthma: 1, 70 | M_chronic_obstructuve_pulmonary_disease: 1, 71 | M_lung_cancer: 1, 72 | M_non_melanoma_skin_cancer: 1, 73 | M_colon_cancer: 1, 74 | M_rectal_cancer: 1, 75 | M_prostate_cancer: 1, 76 | M_breast_cancer: 1, 77 | M_parkinsons_disease: 1, 78 | M_fractures: 1, 79 | M_cataracts: 1, 80 | M_glaucoma: 1 81 | event: [ 82 | "M_MACE_event", 83 | "M_all_cause_dementia_event", 84 | "M_type_2_diabetes_event", 85 | "M_liver_disease_event", 86 | "M_renal_disease_event", 87 | "M_atrial_fibrillation_event", 88 | "M_heart_failure_event", 89 | "M_coronary_heart_disease_event", 90 | "M_venous_thrombosis_event", 91 | "M_cerebral_stroke_event", 92 | "M_abdominal_aortic_aneurysm_event", 93 | "M_peripheral_arterial_disease_event", 94 | "M_asthma_event", 95 | "M_chronic_obstructuve_pulmonary_disease_event", 96 | "M_lung_cancer_event", 97 | "M_non_melanoma_skin_cancer_event", 98 | "M_colon_cancer_event", 99 | "M_rectal_cancer_event", 100 | "M_prostate_cancer_event", 101 | "M_breast_cancer_event", 102 | "M_parkinsons_disease_event", 103 | "M_fractures_event", 104 | "M_cataracts_event", 105 | "M_glaucoma_event" 106 | ] 107 | duration: [ 108 | "M_MACE_event_time", 109 | "M_all_cause_dementia_event_time", 110 | "M_type_2_diabetes_event_time", 111 | "M_liver_disease_event_time", 112 | "M_renal_disease_event_time", 113 | "M_atrial_fibrillation_event_time", 114 | "M_heart_failure_event_time", 115 | "M_coronary_heart_disease_event_time", 116 | "M_venous_thrombosis_event_time", 117 | "M_cerebral_stroke_event_time", 118 | "M_abdominal_aortic_aneurysm_event_time", 119 | "M_peripheral_arterial_disease_event_time", 120 | "M_asthma_event_time", 121 | "M_chronic_obstructuve_pulmonary_disease_event_time", 122 | "M_lung_cancer_event_time", 123 | "M_non_melanoma_skin_cancer_event_time", 124 | "M_colon_cancer_event_time", 125 | "M_rectal_cancer_event_time", 126 | "M_prostate_cancer_event_time", 127 | "M_breast_cancer_event_time", 128 | "M_parkinsons_disease_event_time", 129 | "M_fractures_event_time", 130 | "M_cataracts_event_time", 131 | "M_glaucoma_event_time" 132 | ] 133 | cohort_definition: 134 | general: 135 | train: "NMR_FLAG==True" 136 | valid: "NMR_FLAG==True" 137 | test: "NMR_FLAG==True" 138 | task_specific: 139 | M_MACE: "M_MACE==False&statins==False" 140 | M_all_cause_dementia: "M_all_cause_dementia==False" 141 | M_type_2_diabetes: "M_type_2_diabetes==False" 142 | M_liver_disease: "M_liver_disease==False" 143 | M_renal_disease: "M_renal_disease==False" 144 | M_atrial_fibrillation: "M_atrial_fibrillation==False" 145 | M_heart_failure: "M_heart_failure==False" 146 | M_coronary_heart_disease: "M_coronary_heart_disease==False" 147 | M_venous_thrombosis: "M_venous_thrombosis==False" 148 | M_cerebral_stroke: "M_cerebral_stroke==False" 149 | M_abdominal_aortic_aneurysm: "M_abdominal_aortic_aneurysm==False" 150 | M_peripheral_arterial_disease: "M_peripheral_arterial_disease==False" 151 | M_asthma: "M_asthma==False" 152 | M_chronic_obstructuve_pulmonary_disease: "M_chronic_obstructuve_pulmonary_disease==False" 153 | M_lung_cancer: "M_lung_cancer==False" 154 | M_non_melanoma_skin_cancer: "M_non_melanoma_skin_cancer==False" 155 | M_colon_cancer: "M_colon_cancer==False" 156 | M_rectal_cancer: "M_rectal_cancer==False" 157 | M_prostate_cancer: "M_prostate_cancer==False&sex=='Male'" 158 | M_breast_cancer: "M_breast_cancer==False&sex=='Female'" 159 | 
M_parkinsons_disease: "M_parkinsons_disease==False" 160 | M_fractures: "M_fractures==False" 161 | M_cataracts: "M_cataracts==False" 162 | M_glaucoma: "M_glaucoma==False" 163 | task_specific_exclusions: True 164 | datamodule: UKBBSurvivalDatamoduleWithExclusions 165 | task: ResidualMultiTaskSurvivalTraining 166 | cv_partition: 0 167 | feature_set: Metabolomics 168 | features_yaml: ${code_dir}/metabolomicstatemodel/source/config/features/${experiment.feature_set}.yaml 169 | tabular_filepath: ${data_dir}/data/tabular/210714_metabolomics 170 | latent_dim: 512 171 | module: MLP 172 | module_kwargs: 173 | snn_init: False 174 | hidden_dim: [256, 256, 256] 175 | output_dim: ${experiment.latent_dim} 176 | norm_fn: 'nn.BatchNorm1d' 177 | norm_layer: [0] 178 | input_norm: False 179 | final_norm: False 180 | dropout_fn: "nn.Dropout" 181 | dropout: 0.3 182 | dropout_after_norm: False 183 | activation: "nn.SiLU" 184 | final_activation: "nn.SiLU" 185 | latent_module: ResidualHeadMLP 186 | latent_module_kwargs: 187 | latent_dim: 32 188 | mlp: MLP 189 | mlp_kwargs: 190 | snn_init: False 191 | input_dim: ${experiment.latent_dim} 192 | hidden_dim: [256, 128] 193 | output_dim: ${experiment.latent_module_kwargs.latent_dim} 194 | activation_fn: "nn.SiLU" 195 | dropout_fn: "nn.Dropout" 196 | norm_fn: 'nn.BatchNorm1d' 197 | norm_layer: "all" 198 | input_norm: False 199 | final_norm: True 200 | dropout: 0.6 201 | dropout_after_norm: True 202 | activation: "nn.SiLU" 203 | final_activation: "nn.SiLU" 204 | skip_connection_mlp: MLP 205 | skip_connection_mlp_kwargs: 206 | snn_init: False 207 | hidden_dim: [128, 128] 208 | output_dim: ${experiment.latent_module_kwargs.latent_dim} 209 | activation_fn: "nn.SiLU" 210 | dropout_fn: "nn.Dropout" 211 | norm_fn: 'nn.BatchNorm1d' 212 | norm_layer: "all" 213 | input_norm: False 214 | final_norm: True 215 | dropout: 0.6 216 | dropout_after_norm: True 217 | activation: "nn.SiLU" 218 | final_activation: "nn.SiLU" 219 | predictor_mlp: MLP 220 | predictor_mlp_kwargs: 221 | snn_init: False 222 | input_dim: ${experiment.latent_module_kwargs.latent_dim} 223 | hidden_dim: [128, 128] 224 | output_dim: 1 225 | activation_fn: "nn.SiLU" 226 | dropout_fn: "nn.Dropout" 227 | norm_fn: 'nn.BatchNorm1d' 228 | norm_layer: "all" 229 | input_norm: False 230 | final_norm: True 231 | dropout: 0.6 232 | dropout_after_norm: True 233 | activation: "nn.SiLU" 234 | final_activation: "nn.Identity" 235 | optimizer: "torch.optim.Adam" 236 | optimizer_kwargs: {weight_decay: 1e-8} 237 | schedule: MultiStepLR 238 | schedule_kwargs: {milestones:[20, 30, 40], gamma: 0.1, verbose: True} 239 | n_events: 1 240 | batch_size: 1024 241 | lr: 0.001 242 | survival_task: DeepSurv 243 | survival_task_kwargs: 244 | batch_size: ${experiment.batch_size} 245 | num_workers: ${experiment.num_workers} -------------------------------------------------------------------------------- /analysis/examples/sample.csv: -------------------------------------------------------------------------------- 1 | 
,NMR_3hydroxybutyrate,NMR_acetate,NMR_acetoacetate,NMR_acetone,NMR_alanine,NMR_albumin,NMR_apolipoprotein_a1,NMR_apolipoprotein_b,NMR_average_diameter_for_hdl_particles,NMR_average_diameter_for_ldl_particles,NMR_average_diameter_for_vldl_particles,NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl,NMR_cholesterol_in_idl,NMR_cholesterol_in_large_hdl,NMR_cholesterol_in_large_ldl,NMR_cholesterol_in_large_vldl,NMR_cholesterol_in_medium_hdl,NMR_cholesterol_in_medium_ldl,NMR_cholesterol_in_medium_vldl,NMR_cholesterol_in_small_hdl,NMR_cholesterol_in_small_ldl,NMR_cholesterol_in_small_vldl,NMR_cholesterol_in_very_large_hdl,NMR_cholesterol_in_very_large_vldl,NMR_cholesterol_in_very_small_vldl,NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl,NMR_cholesteryl_esters_in_hdl,NMR_cholesteryl_esters_in_idl,NMR_cholesteryl_esters_in_ldl,NMR_cholesteryl_esters_in_large_hdl,NMR_cholesteryl_esters_in_large_ldl,NMR_cholesteryl_esters_in_large_vldl,NMR_cholesteryl_esters_in_medium_hdl,NMR_cholesteryl_esters_in_medium_ldl,NMR_cholesteryl_esters_in_medium_vldl,NMR_cholesteryl_esters_in_small_hdl,NMR_cholesteryl_esters_in_small_ldl,NMR_cholesteryl_esters_in_small_vldl,NMR_cholesteryl_esters_in_vldl,NMR_cholesteryl_esters_in_very_large_hdl,NMR_cholesteryl_esters_in_very_large_vldl,NMR_cholesteryl_esters_in_very_small_vldl,NMR_citrate,NMR_clinical_ldl_cholesterol,NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles,NMR_concentration_of_hdl_particles,NMR_concentration_of_idl_particles,NMR_concentration_of_ldl_particles,NMR_concentration_of_large_hdl_particles,NMR_concentration_of_large_ldl_particles,NMR_concentration_of_large_vldl_particles,NMR_concentration_of_medium_hdl_particles,NMR_concentration_of_medium_ldl_particles,NMR_concentration_of_medium_vldl_particles,NMR_concentration_of_small_hdl_particles,NMR_concentration_of_small_ldl_particles,NMR_concentration_of_small_vldl_particles,NMR_concentration_of_vldl_particles,NMR_concentration_of_very_large_hdl_particles,NMR_concentration_of_very_large_vldl_particles,NMR_concentration_of_very_small_vldl_particles,NMR_creatinine,NMR_degree_of_unsaturation,NMR_docosahexaenoic_acid,NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl,NMR_free_cholesterol_in_hdl,NMR_free_cholesterol_in_idl,NMR_free_cholesterol_in_ldl,NMR_free_cholesterol_in_large_hdl,NMR_free_cholesterol_in_large_ldl,NMR_free_cholesterol_in_large_vldl,NMR_free_cholesterol_in_medium_hdl,NMR_free_cholesterol_in_medium_ldl,NMR_free_cholesterol_in_medium_vldl,NMR_free_cholesterol_in_small_hdl,NMR_free_cholesterol_in_small_ldl,NMR_free_cholesterol_in_small_vldl,NMR_free_cholesterol_in_vldl,NMR_free_cholesterol_in_very_large_hdl,NMR_free_cholesterol_in_very_large_vldl,NMR_free_cholesterol_in_very_small_vldl,NMR_glucose,NMR_glutamine,NMR_glycine,NMR_glycoprotein_acetyls,NMR_hdl_cholesterol,NMR_histidine,NMR_isoleucine,NMR_ldl_cholesterol,NMR_lactate,NMR_leucine,NMR_linoleic_acid,NMR_monounsaturated_fatty_acids,NMR_omega3_fatty_acids,NMR_omega6_fatty_acids,NMR_phenylalanine,NMR_phosphatidylcholines,NMR_phosphoglycerides,NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl,NMR_phospholipids_in_hdl,NMR_phospholipids_in_idl,NMR_phospholipids_in_ldl,NMR_phospholipids_in_large_hdl,NMR_phospholipids_in_large_ldl,NMR_phospholipids_in_large_vldl,NMR_phospholipids_in_medium_hdl,NMR_phospholipids_in_medium_ldl,NMR_phospholipids_in_medium_vldl,NMR_phospholipids_in_small_hdl,NMR_phospholipids_in_small_ldl,NMR_phospholipids_in_small_vldl,NMR_phospholipids_in_vldl,NMR_phospholipi
ds_in_very_large_hdl,NMR_phospholipids_in_very_large_vldl,NMR_phospholipids_in_very_small_vldl,NMR_polyunsaturated_fatty_acids,NMR_pyruvate,NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol,NMR_saturated_fatty_acids,NMR_sphingomyelins,NMR_total_cholesterol,NMR_total_cholesterol_minus_hdlc,NMR_total_cholines,NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine,NMR_total_concentration_of_lipoprotein_particles,NMR_total_esterified_cholesterol,NMR_total_fatty_acids,NMR_total_free_cholesterol,NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl,NMR_total_lipids_in_hdl,NMR_total_lipids_in_idl,NMR_total_lipids_in_ldl,NMR_total_lipids_in_large_hdl,NMR_total_lipids_in_large_ldl,NMR_total_lipids_in_large_vldl,NMR_total_lipids_in_lipoprotein_particles,NMR_total_lipids_in_medium_hdl,NMR_total_lipids_in_medium_ldl,NMR_total_lipids_in_medium_vldl,NMR_total_lipids_in_small_hdl,NMR_total_lipids_in_small_ldl,NMR_total_lipids_in_small_vldl,NMR_total_lipids_in_vldl,NMR_total_lipids_in_very_large_hdl,NMR_total_lipids_in_very_large_vldl,NMR_total_lipids_in_very_small_vldl,NMR_total_phospholipids_in_lipoprotein_particles,NMR_total_triglycerides,NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl,NMR_triglycerides_in_hdl,NMR_triglycerides_in_idl,NMR_triglycerides_in_ldl,NMR_triglycerides_in_large_hdl,NMR_triglycerides_in_large_ldl,NMR_triglycerides_in_large_vldl,NMR_triglycerides_in_medium_hdl,NMR_triglycerides_in_medium_ldl,NMR_triglycerides_in_medium_vldl,NMR_triglycerides_in_small_hdl,NMR_triglycerides_in_small_ldl,NMR_triglycerides_in_small_vldl,NMR_triglycerides_in_vldl,NMR_triglycerides_in_very_large_hdl,NMR_triglycerides_in_very_large_vldl,NMR_triglycerides_in_very_small_vldl,NMR_tyrosine,NMR_vldl_cholesterol,NMR_valine 2 | sample_0,1.7267629,0.5097766,0.29732332,0.46884453,1.4918994,0.6533935,0.18285494,1.1864842,0.64613235,0.9915879,0.6898927,1.6417483,1.088562,0.18311316,1.0280348,1.7085389,1.3308727,1.3725197,0.453924,0.14175132,0.5277481,0.7697196,0.65141964,0.048153635,0.6881,0.6210777,0.981835,0.14764947,1.4333347,0.5098504,1.5350345,0.3321557,1.7906251,0.4931004,1.3807096,0.52706426,0.30808988,1.2255802,0.9953301,1.7295392,1.0182742,1.324438,1.3735615,0.22325872,0.62359023,1.7462277,1.2364578,1.1424968,0.6403464,1.494473,0.43395197,1.0911344,0.99456245,0.70620626,1.2572117,0.39763924,0.84456956,0.34995952,1.7056545,0.13534041,1.6538748,1.2421536,0.275961,1.61189,0.43145138,0.69299865,1.3780788,0.2530471,0.73021734,1.5449325,1.4830003,0.55509186,0.11249849,0.0051142704,0.4280246,1.4834918,0.90811676,0.1878527,0.20966077,0.43405378,0.620166,0.25039583,0.859951,0.506151,0.49615797,0.9883161,1.315808,0.027282929,1.3034036,0.41346607,1.228288,0.51193565,0.26067233,0.8067817,0.90494514,0.06724546,1.5765679,0.5881223,1.2508178,0.7745326,0.7279988,0.9269167,0.07510055,0.37857637,1.7506536,0.105617344,0.13456018,0.23826805,0.2921966,0.6284125,0.74927133,0.7842055,0.6030325,1.2905078,0.4911587,0.57319283,0.13305517,0.80514073,1.0980389,0.39218083,0.1623933,0.13884185,0.17421694,0.15859717,0.41635695,0.6279078,0.62771,0.8249054,1.5470794,0.7805787,1.6720229,0.5789668,0.98674554,0.15837964,0.07383999,0.7915055,0.28337508,1.1657172,0.71572715,1.6519899,1.3657914,0.4889391,1.1835232,1.6657807,0.8121769,0.40309033,0.6373719,1.0736355,0.25264534,1.0069423,0.86787814,1.3369577,0.50055605,1.3423343,1.3437071,0.22329777,0.1401766,0.22787079,1.5827061,1.6035206,1.7987934,0.124354236,1.4847823,1.3791466,0.7313719,1.4134979,0.5265129,0.204291 3 | 
sample_1,1.1247766,1.4505948,1.5361449,1.4904437,1.5670565,1.2864381,0.077527456,0.48182374,1.4347273,0.70310014,1.1604679,1.4766713,0.82198054,0.69924873,0.47259116,0.019940903,1.2150654,0.14550368,1.1506934,0.42108682,0.72633934,1.4222,1.6842602,0.6014966,1.6811364,0.81836396,0.81108344,1.0707843,0.86859024,0.11722562,0.8970124,1.4451725,1.7431155,0.9812217,1.3683889,1.1414407,0.9885561,0.5018747,1.0990268,0.14177717,1.1253144,0.0690509,1.5061882,0.23769274,0.5989025,1.4955494,0.49969435,0.2531869,1.4209754,1.4839709,1.1655788,1.137409,1.7623557,1.1982107,0.16304106,0.6498071,1.247407,0.007712378,1.4728979,0.97307813,0.9165665,0.3204594,0.24263348,1.2905837,1.4658877,1.6306684,0.00226037,0.039650865,1.7878088,1.5556992,1.191061,0.075704694,1.5558871,1.0816427,1.3992503,0.07463606,0.47904897,1.4666929,0.5326069,0.95497483,0.8767187,1.1244663,1.655131,1.4107636,1.093523,0.29427055,0.24261752,0.33046967,1.2696238,1.3498253,1.6691794,1.4430594,0.29075617,1.5200106,1.0818076,1.1958362,0.035788987,0.7284869,0.34679976,1.4599568,0.3480672,1.3999459,1.3708526,0.68422735,1.1539485,0.35347924,1.0999752,0.6629832,1.677415,1.7063679,0.20748337,0.4156185,1.4872332,0.42577755,0.63565433,1.7240045,0.30661812,1.1443036,0.36064732,1.0750041,1.3607403,1.2786652,0.9076651,0.6570294,0.38894868,0.48623,0.19309354,1.6910648,1.5545182,1.6601219,1.6360918,1.1610199,0.9035126,0.55135554,0.49802023,0.8577952,1.5569063,1.6308388,1.7859721,1.107143,1.5525775,0.7256029,0.056678522,1.4784337,0.506086,0.47641203,1.1579549,0.74990684,1.5955726,1.2277075,0.8925245,1.7089723,0.81651545,1.1531171,1.2951981,0.22069862,0.7634763,0.63415253,1.3337371,1.5506673,0.8664601,0.87574947,0.41530603,0.025238335,0.92178035,1.6352707,0.37116644,1.192257 4 | sample_2,0.6394572,0.048151683,0.0041278093,1.3019447,1.5163862,1.7615204,1.3996884,0.88473135,0.027976334,1.6112314,1.0511037,1.3119624,0.10937966,0.8065622,0.2579499,0.40155205,1.158636,0.100113384,1.0507463,1.4433336,0.8073181,1.4442376,0.66797423,0.53165096,0.9798346,1.5489019,1.334381,0.13057685,1.7061685,0.73331654,1.067463,0.7395286,0.44745353,0.5638262,0.343716,0.26125124,0.042463213,0.68579,1.5634893,1.095418,1.7047849,1.5969012,0.28933054,0.43715376,1.3674892,1.1461415,0.040662605,0.27717274,0.46089447,0.31720972,1.5146904,0.31740576,1.164278,1.2174286,1.0174254,1.7494576,0.1993007,0.28359258,1.6533439,0.4360933,0.7976693,0.2955996,1.4103165,1.1625634,1.166639,1.5002036,1.4426631,1.0293818,1.7243257,1.6771489,1.1876979,0.46074176,1.3435627,0.057508104,1.1589948,0.48598015,1.1272345,1.6984007,0.7455857,0.9537147,1.6460967,1.2778734,0.65043944,0.8658931,1.59622,1.2816808,0.18826872,1.0616916,0.11173987,0.9738986,1.4398922,1.370888,1.0251368,0.68071675,0.8055198,0.4604453,0.05194658,1.2749503,1.6958934,1.2984284,0.47111607,0.835225,0.11417559,1.7460171,0.32695165,0.25663644,0.111282885,1.6801063,1.0840608,0.45405158,1.6737995,0.29428774,0.673308,1.589201,0.21577558,0.02494686,1.2485098,1.0764318,0.38511005,0.1998027,0.54615897,1.6868843,0.62696517,1.7561861,1.4750563,1.45119,1.0248525,1.3861558,1.1287161,1.5323092,0.5878731,1.096504,1.4778537,0.46314618,0.36474022,1.0683476,1.4301068,1.4973894,0.827812,0.6916306,1.7210993,1.0710878,1.6875877,0.085866526,1.0035372,1.7737324,1.0767089,0.084368296,0.1562901,1.0578055,0.27928537,0.16458681,0.8512347,1.240649,1.5773059,1.172978,1.4460613,0.8570547,0.23283778,0.960511,0.8304184,1.5568347,0.18804511,1.2705362,1.7228076,1.7837844,1.7385787,1.7892717 5 | 
sample_3,0.4487589,0.37038893,0.06925146,1.0901536,1.042064,0.32663187,1.4012345,0.49263698,1.6871933,1.1226852,1.6242129,1.7099121,0.18495268,0.87873393,0.7100061,1.525226,1.4485743,0.039899103,0.8911774,1.4572191,1.0084244,0.11996893,0.036249947,1.0465795,1.5553432,0.30086443,0.78687495,0.030546222,1.5204672,0.0752192,0.23986651,1.707968,0.6906252,0.06051089,0.32766625,1.5041764,1.4767783,1.2251022,1.307869,1.2565494,1.7772435,0.31088248,0.8995749,0.26549795,0.15266678,1.2756772,1.6006684,0.4677581,0.23416235,0.6896485,0.4579907,1.3835437,0.554329,1.2575718,1.2434229,0.054920208,0.5825295,0.20535323,0.8275796,0.60950464,0.7588442,1.1936173,1.3331549,1.5392226,0.16763358,0.10467987,1.741956,0.5891317,1.3638477,0.67581654,0.31941622,1.1855899,1.66403,0.6271029,1.1893716,1.0977352,1.534193,0.22538827,0.64260757,0.2408825,1.7362573,1.2060944,0.969551,0.9658206,0.23782036,1.7787619,1.5680085,1.0271211,0.80165523,1.2293843,1.2669885,1.7391648,1.2056175,0.84038806,0.61846817,0.15132122,0.51495504,0.77662504,1.31058,0.79846156,1.2412288,1.4939855,0.1171603,0.74272704,1.7567694,1.7224572,1.3670967,1.3684192,0.66651887,0.88285154,0.73113704,1.71152,0.1984383,1.6891428,1.7598286,0.74715847,0.8938639,1.594826,0.15118295,1.1975237,0.31053978,0.019128645,1.499253,0.88487166,1.5354916,1.2054054,1.4602237,0.47618222,0.011032284,0.18765059,1.4797944,0.5439001,0.35328564,0.0931981,1.7674357,0.06046798,0.95566815,1.6162721,0.35987407,0.52456313,1.2912939,0.025181143,0.6913808,1.3839384,1.311321,1.7093761,0.013740754,0.7665038,0.7897924,1.1775188,1.2336917,1.0273027,0.6498797,0.7412339,0.6385234,1.7824961,0.5776181,0.10206302,0.20797145,1.5722026,0.7791373,0.7737667,1.2712736,0.07551479,1.0713022,1.1955271,0.33608505,1.7409687 6 | sample_4,0.39158726,1.4290415,1.0409038,0.32171452,1.6907369,1.6188732,0.29006386,1.0105348,0.6140596,0.1370683,0.033939946,1.6528398,1.121336,0.70619434,0.13311104,0.795395,1.1870075,0.9588572,0.69247836,0.6088535,1.5259367,1.3447601,0.835985,0.41114345,1.2541314,0.33457318,0.31144038,0.21643135,1.5477998,0.5301468,0.24439298,0.5454397,0.218857,0.7117476,0.77929556,1.7813807,1.6003835,1.7953465,1.4607075,1.2049109,1.48384,0.85762167,1.1429639,0.9870765,0.6465661,0.7455147,0.43265375,1.7432247,1.3255073,1.4863342,1.499107,1.637055,1.4781187,1.7494757,0.38001835,0.4903395,1.426103,0.24009445,0.94217247,1.5151939,1.2099993,0.46371266,0.49783766,1.4946584,0.804823,1.7760794,0.3688554,1.3472563,0.14981924,1.0910748,1.2662728,1.5365591,1.5832489,0.39390194,1.1747075,1.2293701,1.6966463,0.07995371,1.5968765,0.35381973,1.795621,1.3334799,1.6623522,0.114330016,0.9883059,0.42806634,0.42088893,0.33974266,0.8436938,0.5935543,1.4546558,0.93583494,0.39511275,1.756246,0.33147886,1.5322094,1.0483937,1.3000615,1.5142493,1.1211617,0.21773115,0.48247924,1.3405449,0.75711596,0.43254915,1.108704,0.21249163,0.5359951,1.2703377,0.54352325,1.678972,1.2367089,0.20781432,1.7676998,0.2796192,0.5369049,1.552004,0.42126063,1.3163086,0.84839207,0.4736774,0.9796766,0.7934538,1.218545,1.3820137,0.5128056,1.1789824,1.2839047,0.5724214,0.19048418,0.96299917,0.6771659,0.19370678,0.96163434,0.7227618,0.7204046,1.1550516,0.53782845,1.7956933,0.18117389,0.0303778,0.42181036,0.32062125,1.4959301,0.6862654,1.2135113,1.3248559,0.31858745,1.7488276,0.6386698,0.758894,1.2883968,0.17838693,0.20247132,1.6250259,0.8688114,1.6884774,0.46924558,0.2778965,0.5041763,1.3240411,1.5307034,0.07698704,0.08331609,0.14573115,0.9030207,1.0258918,0.30767688 7 | 
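The example matrix above can be loaded directly with pandas for a quick look at the NMR feature space; a minimal sketch, assuming only the relative path from the repository tree and that the unnamed first column holds the sample identifiers:

import pandas as pd

# Load the bundled example feature matrix (sample IDs sit in the unnamed first column).
sample = pd.read_csv("analysis/examples/sample.csv", index_col=0)
print(sample.shape)                       # (n_samples, n_NMR_features)
print(sample.filter(like="NMR_").head())  # every feature column carries the NMR_ prefix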
-------------------------------------------------------------------------------- /metabolomicstatemodel/source/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from omegaconf.listconfig import ListConfig 5 | 6 | 7 | class SingleLayerNet(nn.Module): 8 | def __init__(self, input_dim=32, output_dim=2, final_activation=None, final_batchnorm=False, **kwargs): 9 | super(SingleLayerNet, self).__init__() 10 | self.input_dim = input_dim 11 | self.output_dim = output_dim 12 | 13 | if final_activation is not None and isinstance(final_activation, str): 14 | m = final_activation.split('.') 15 | final_activation = getattr(nn, m[1]) 16 | print(final_activation) 17 | 18 | predictor_specs = [nn.Linear(self.input_dim, self.output_dim), ] 19 | if final_batchnorm: 20 | predictor_specs.append(nn.BatchNorm1d(self.output_dim)) 21 | if final_activation is not None: 22 | predictor_specs.append(final_activation()) 23 | self.predictor = nn.Sequential(*predictor_specs) 24 | 25 | def forward(self, input): 26 | fts = self.predictor(input) 27 | return fts 28 | 29 | 30 | class MLP(nn.Module): 31 | def __init__(self, 32 | input_dim=32, 33 | output_dim=2, 34 | hidden_dim=256, 35 | n_hidden_layers=None, 36 | activation="nn.SELU", 37 | dropout_fn='nn.Dropout', 38 | norm_fn='nn.BatchNorm1d', 39 | norm_layer="all", 40 | dropout_after_norm=True, 41 | input_norm=False, 42 | final_activation=None, 43 | final_norm=False, 44 | snn_init=True, 45 | dropout=0.5, **kwargs): 46 | """ 47 | A simple feed-forward neural network. 48 | :param input_dim: `int`, dimension ot the input features 49 | :param output_dim: `int`, dimension of the outlayer 50 | :param activation: `nn.Module`, NOT initialized. that is the activation of the last layer, if `None` no activation will be performed. 
51 | :param dropout: `float`, [<1], that specifies the dropout probability 52 | :param kwargs: 53 | """ 54 | super().__init__() 55 | self.input_dim = input_dim 56 | self.output_dim = output_dim 57 | norm_layer = norm_layer if isinstance(norm_layer, (list, tuple, ListConfig)) else [l for l in range(100)] 58 | self.hidden_dim = hidden_dim 59 | self.dropout = dropout 60 | if norm_fn is not None and isinstance(norm_fn, str): 61 | m = norm_fn.split('.') 62 | norm_fn = getattr(nn, m[1]) 63 | self.norm_fn = norm_fn 64 | if dropout_fn is not None and isinstance(dropout_fn, str): 65 | m = dropout_fn.split('.') 66 | dropout_fn = getattr(nn, m[1]) 67 | if activation is not None and isinstance(activation, str): 68 | m = activation.split('.') 69 | activation = getattr(nn, m[1]) 70 | print(activation) 71 | if final_activation is not None and isinstance(final_activation, str): 72 | m = final_activation.split('.') 73 | final_activation = getattr(nn, m[1]) 74 | print(final_activation) 75 | print(self.output_dim) 76 | 77 | if input_norm: 78 | self.input_norm = nn.LayerNorm(self.input_dim) 79 | else: 80 | self.input_norm = None 81 | 82 | if isinstance(hidden_dim, int): 83 | if isinstance(norm_layer, (list, tuple, ListConfig)): norm_fn = self.norm_fn if 0 in norm_layer else None 84 | else: norm_fn = None 85 | mlp_specs = [nn.Linear(input_dim, hidden_dim),] 86 | if dropout_after_norm == True: 87 | mlp_specs.extend([ 88 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 89 | dropout_fn(self.dropout),]) 90 | else: 91 | mlp_specs.extend([ 92 | dropout_fn(self.dropout), 93 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 94 | ]) 95 | mlp_specs.extend([activation(),]) 96 | 97 | for i in range(n_hidden_layers): 98 | if isinstance(norm_layer, (list, tuple, ListConfig)): norm_fn = self.norm_fn if i+1 in norm_layer else None 99 | else: norm_fn = None 100 | mlp_specs.extend([nn.Linear(hidden_dim, hidden_dim),]) 101 | if dropout_after_norm == True: 102 | mlp_specs.extend([ 103 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 104 | dropout_fn(self.dropout), ]) 105 | else: 106 | mlp_specs.extend([ 107 | dropout_fn(self.dropout), 108 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 109 | ]) 110 | mlp_specs.extend([activation(),]) 111 | self.mlp = nn.Sequential(*mlp_specs) 112 | predictor_specs = [ 113 | nn.Linear(hidden_dim, self.output_dim), 114 | ] 115 | elif isinstance(hidden_dim, (list, tuple, ListConfig)): 116 | assert n_hidden_layers is None, 'Either pass list of hidden_dims, or n_hidden_layers with single hidden_dim' 117 | mlp_specs = [] 118 | for i, h in enumerate(hidden_dim): 119 | if isinstance(norm_layer, (list, tuple, ListConfig)): norm_fn = self.norm_fn if i in norm_layer else None 120 | else: norm_fn = None 121 | mlp_specs.extend([nn.Linear(input_dim if i==0 else hidden_dim[i-1], h),]) 122 | if dropout_after_norm == True: 123 | mlp_specs.extend([ 124 | norm_fn(h) if norm_fn is not None else nn.Identity(), 125 | dropout_fn(self.dropout)]) 126 | else: 127 | mlp_specs.extend([ 128 | dropout_fn(self.dropout), 129 | norm_fn(h) if norm_fn is not None else nn.Identity(), 130 | ]) 131 | mlp_specs.extend([activation(),]) 132 | self.mlp = nn.Sequential(*mlp_specs) 133 | predictor_specs = [ 134 | nn.Linear(hidden_dim[-1], self.output_dim), 135 | ] 136 | else: 137 | raise ValueError('hidden_dim is either int or list of ints') 138 | 139 | if final_norm: 140 | predictor_specs.append(self.norm_fn(self.output_dim)) 141 | if final_activation is not None: 142 | 
predictor_specs.append(final_activation()) 143 | 144 | self.predictor = nn.Sequential(*predictor_specs) 145 | 146 | if snn_init: 147 | self.reset_parameters('predictor') 148 | self.reset_parameters('mlp') 149 | 150 | def forward(self, input): 151 | if self.input_norm is not None: 152 | input = self.input_norm(input) 153 | fts = self.mlp(input) 154 | output = self.predictor(fts) 155 | return output 156 | 157 | def reset_parameters(self, name): 158 | for layer in getattr(self, name): 159 | if not isinstance(layer, nn.Linear): 160 | continue 161 | nn.init.normal_(layer.weight, std=1 / math.sqrt(layer.out_features)) 162 | if layer.bias is not None: 163 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(layer.weight) 164 | bound = 1 / math.sqrt(fan_in) 165 | nn.init.uniform_(layer.bias, -bound, bound) 166 | 167 | 168 | class ResidualHeadMLP(nn.Module): 169 | def __init__(self, 170 | predictor_mlp=MLP, 171 | predictor_mlp_kwargs=dict(input_dim=None, 172 | output_dim=None, 173 | hidden_dim=None, 174 | activation="nn.SiLU", 175 | dropout_fn='nn.Dropout', 176 | dropout=0.2, 177 | final_activation="nn.SiLU", 178 | final_batchnorm=False), 179 | skip_connection_mlp=MLP, 180 | skip_connection_input_dim=32, 181 | skip_connection_mlp_kwargs=dict(input_dim=None, 182 | output_dim=None, 183 | hidden_dim=None, 184 | activation="nn.SiLU", 185 | dropout_fn='nn.Dropout', 186 | dropout=0.2, 187 | final_activation="nn.SiLU", 188 | final_batchnorm=False), 189 | mlp=MLP, 190 | mlp_kwargs=dict(input_dim=None, 191 | output_dim=None, 192 | hidden_dim=None, 193 | activation="nn.SiLU", 194 | dropout_fn='nn.Dropout', 195 | dropout=0.2, 196 | final_activation="nn.SiLU", 197 | final_batchnorm=False), 198 | **kwargs): 199 | super().__init__() 200 | self.skip_connection_input_dim = skip_connection_input_dim 201 | 202 | if predictor_mlp is not None and isinstance(predictor_mlp, str): 203 | self.predictor_mlp = eval(predictor_mlp) 204 | if skip_connection_mlp is not None and isinstance(skip_connection_mlp, str): 205 | self.skip_connection_mlp = eval(skip_connection_mlp) 206 | if mlp is not None and isinstance(mlp, str): 207 | self.mlp = eval(mlp) 208 | 209 | skip_connection_mlp_kwargs['input_dim'] = self.skip_connection_input_dim 210 | 211 | self.predictor = self.predictor_mlp(**predictor_mlp_kwargs) 212 | self.skip_connection = self.skip_connection_mlp(**skip_connection_mlp_kwargs) 213 | self.mlp = self.mlp(**mlp_kwargs) 214 | 215 | def forward(self, input): 216 | features, covariates = input 217 | fts = self.mlp(features) 218 | skip_fts = self.skip_connection(covariates) 219 | h = fts + skip_fts 220 | out = self.predictor(h) 221 | return out 222 | 223 | 224 | class MLPResNetBlock(nn.Module): 225 | """ 226 | MLP version of the ResBlock wrapped by TemporalBlock from: 227 | https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/unet.py#L143 228 | 229 | with less complexity and fts. 
230 | """ 231 | def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, dropout=0.3, 232 | embedding_dim=16, 233 | use_scale_shift_norm=False, 234 | temporal_embedding=False): 235 | super().__init__() 236 | self.input_dim = input_dim 237 | self.hidden_dim = hidden_dim 238 | self.output_dim = output_dim 239 | self.dropout = dropout 240 | self.use_scale_shift_norm = use_scale_shift_norm 241 | self.temporal_embedding=temporal_embedding 242 | 243 | if temporal_embedding: 244 | self.emb_layers = nn.Sequential( 245 | nn.SiLU(), 246 | nn.Linear(embedding_dim, 247 | 2 * self.output_dim if use_scale_shift_norm else self.output_dim), 248 | ) 249 | 250 | self.in_layers = nn.Sequential( 251 | nn.Linear(self.input_dim, self.hidden_dim), 252 | nn.BatchNorm1d(self.hidden_dim), 253 | nn.SiLU(), 254 | nn.Dropout(self.dropout), 255 | nn.Linear(self.hidden_dim, self.output_dim), 256 | nn.BatchNorm1d(self.output_dim), 257 | nn.SiLU(), 258 | nn.Dropout(self.dropout), 259 | ) 260 | 261 | self.skip_connection = nn.Identity() if self.input_dim==self.output_dim else \ 262 | nn.Sequential( 263 | nn.Linear(self.input_dim, self.output_dim), 264 | nn.BatchNorm1d(self.output_dim), 265 | nn.SiLU() 266 | ) 267 | 268 | self.out_layers = nn.Sequential( 269 | nn.SiLU(), 270 | nn.Dropout(p=self.dropout), 271 | nn.Linear(self.output_dim, self.output_dim) 272 | ) 273 | 274 | def forward(self, x, emb): 275 | h = self.in_layers(x) 276 | emb_out = self.emb_layers(emb).type(h.dtype) 277 | while len(emb_out.shape) < len(h.shape): 278 | emb_out = emb_out[..., None] 279 | if self.use_scale_shift_norm: 280 | out_norm, out_rest = self.out_layers[0], self.out_layers[1:] 281 | scale, shift = torch.chunk(emb_out, 2, dim=1) 282 | h = out_norm(h) * (1 + scale) + shift 283 | h = out_rest(h) 284 | else: 285 | h = h + emb_out 286 | h = self.out_layers(h) 287 | return self.skip_connection(x) + h 288 | 289 | def reset_parameters(self, name): 290 | for layer in getattr(self, name): 291 | if not isinstance(layer, nn.Linear): 292 | continue 293 | nn.init.normal_(layer.weight, std=1 / math.sqrt(layer.out_features)) 294 | if layer.bias is not None: 295 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(layer.weight) 296 | bound = 1 / math.sqrt(fan_in) 297 | nn.init.uniform_(layer.bias, -bound, bound) 298 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | 5 | import pathlib 6 | 7 | import torch 8 | 9 | from collections import OrderedDict, abc as container_abcs 10 | from torch._six import string_classes 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.utils.data.sampler import Sampler 13 | 14 | 15 | class RepeatIterator(object): 16 | """ 17 | creates an iterable which returns each integer in range(length) n_times times. 
18 | example: next(RepeatIterator(2,3)) would return: 0,0,1,1,2,2,3,3 19 | """ 20 | def __init__(self, n_times, length): 21 | self.n_times = n_times 22 | self.length = length 23 | self.idx = 0 24 | self.reps = 0 25 | 26 | def __iter__(self): 27 | return self 28 | 29 | def __next__(self): 30 | #print(f"Number of TTA views Iterator: {self.n_times}") 31 | if self.reps < self.n_times: 32 | self.reps += 1 33 | else: 34 | self.idx += 1 35 | self.reps = 1 36 | if self.idx < self.length-1: 37 | return self.idx 38 | else: 39 | raise StopIteration 40 | 41 | 42 | class RepeatedSampler(Sampler): 43 | """ 44 | Sampler class that wraps the RepeatIterator. Can be used in dataloaders. 45 | """ 46 | def __init__(self, n_times, data_source): 47 | self.n_times = n_times 48 | self.ds_length = len(data_source) 49 | super().__init__(data_source=data_source) 50 | #print(f"Number of TTA views Sampler: {self.n_times}") 51 | #print(f"ds length Sampler: {self.ds_length}") 52 | self.iterator = RepeatIterator(self.n_times, self.ds_length) 53 | 54 | def __iter__(self): 55 | return self.iterator 56 | 57 | def __len__(self): 58 | return int(self.ds_length)*self.n_times 59 | 60 | 61 | class TabularDataset(Dataset): 62 | """ 63 | Dataset wrapper to sit ontop of a feather file, and read specific columns 64 | """ 65 | 66 | def __init__(self, data_fp, features, normalization_dict=None, eid_selection_mask=None, oversampling=None): 67 | super().__init__() 68 | """ 69 | Create a dataset to read h5ad files. 70 | Currently a bit ugly as we create a pd.DataFrame holding the entire dataset. We need this to perform efficient eid selection using df.loc. 71 | df.loc is the perfect method to do that since it sorts the datamodules to the passed argument as well. We can thus make sure that multiple h5adDatasets are in the same order. 72 | :param h5ad_fp: `str`, the filepath to the h5ad that should be read. 73 | :param features: `list` or list-like, contains the strings to select the features to be returned from the h5ad. 74 | :param eid_selection_mask: `list` or list-like, optional (default `None`), contains the eids to select. 75 | """ 76 | # determine wheter file to read is .csv or .feather: 77 | ext = os.path.splitext(data_fp)[1] 78 | assert ext in ['.csv', '.feather'], 'TabularDataset only supports .csv and .feather files' 79 | print(data_fp) 80 | base = pathlib.Path(data_fp).parents[2] 81 | description_fp = os.path.join(base, f'description{ext}') 82 | assert os.path.exists(description_fp), f'Description file not found in {description_fp}' 83 | 84 | # read datamodules: 85 | read_method = pd.read_feather if ext == '.feather' else pd.read_csv 86 | data = read_method(data_fp) 87 | description = read_method(description_fp) 88 | 89 | for f in features: 90 | if f not in data.columns.values: 91 | print(f) 92 | assert all([c in data.columns.values for c in features]), \ 93 | 'Not all passed features were found in datamodules file columns' 94 | 95 | self.features = features 96 | description = description.query("covariate==@self.features") 97 | self.eid_map = data[["eid"]+self.features].copy().astype({'eid': 'int32'}).set_index('eid') 98 | 99 | if eid_selection_mask is not None: #self.eid_map = self.eid_map.reset_index().query("eid == @eid_selection_map").set_index("eid") 100 | ## find intersection of mask and eids: 101 | eid_selection_mask = [int(i) for i in eid_selection_mask] # make sure its int! 
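            # Editorial sketch, not repository code: the intersection below keeps only
            # eids present in both the data file and the caller-supplied mask, e.g.
            #   eid_map.index = [11, 12, 13, 14]  and  mask = [12, 14, 99]  ->  kept [12, 14].
            # The following print then reports the excluded rows, and .loc re-indexes the
            # frame to the intersected index so that multiple TabularDatasets built from
            # the same files and mask stay in the same row order (the assumption
            # DatasetWrapper documents further below).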
102 | #faulty_ids = [i for i in eid_selection_mask if i not in self.eid_map.index.values] 103 | eids_intersection = self.eid_map.index.intersection(eid_selection_mask) 104 | print(f"{len(self.eid_map)-len(eids_intersection)} eids excluded") 105 | self.eid_map = self.eid_map.loc[eids_intersection,:] # make sure this is sorted. 106 | print(len(self.eid_map)) 107 | # normalize values 108 | if normalization_dict is not None: 109 | self.eid_map = self.normalize_df_fixed_params(self.eid_map, normalization_dict) 110 | 111 | # get the idxs of categorical vars: 112 | self.categoricals = description.query("covariate ==@self.features").\ 113 | query("dtype in ['category', 'bool']").covariate.values 114 | self.continuous = description.query("covariate ==@self.features").\ 115 | query("dtype in ['int', 'float']").covariate.values 116 | self.categorical_idxs = [self.eid_map.columns.tolist().index(v) for v in self.categoricals] 117 | self.continuous_idxs = [self.eid_map.columns.tolist().index(v)for v in self.continuous] 118 | 119 | for f in self.features: 120 | self.eid_map[f] = self.eid_map[f].astype(float) 121 | del data 122 | 123 | def normalize_df_fixed_params(self, df, param_dict): 124 | """ 125 | Normalize pd.DF column-wise. 126 | :param df: 127 | :param param_dict: 'dictionary', contains columns of the df as key and tuple (min, max) as scaling factors for spec column 128 | :return: 129 | """ 130 | print('normalizing datamodules...') 131 | for key in param_dict.keys(): 132 | assert key in df.columns 133 | for col in df.columns: 134 | if col in list(param_dict.keys()): 135 | min = param_dict[col][0] 136 | max = param_dict[col][1] 137 | df[col] = (df[col] - min) / (max - min + 0.00001) 138 | return df 139 | 140 | def __getitem__(self, idx): 141 | fts = self.eid_map.values[idx, :] 142 | return torch.Tensor(fts) 143 | 144 | def __len__(self): 145 | return self.eid_map.shape[0] 146 | 147 | 148 | class ExclusionMaskDataset(Dataset): 149 | def __init__(self, data_fp, exclusion_criteria_dict, eid_selection_mask=None): 150 | super().__init__() 151 | # determine wheter file to read is .csv or .feather: 152 | ext = os.path.splitext(data_fp)[1] 153 | assert ext in ['.csv', '.feather'], 'TabularDataset only supports .csv and .feather files' 154 | print(data_fp) 155 | base = pathlib.Path(data_fp).parents[2] 156 | description_fp = os.path.join(base, f'description{ext}') 157 | assert os.path.exists(description_fp), f'Description file not found in {description_fp}' 158 | 159 | # read data: 160 | read_method = pd.read_feather if ext == '.feather' else pd.read_csv 161 | data = read_method(data_fp) 162 | 163 | # store 164 | self.eid_map = data.copy().astype({'eid': 'int32'}).set_index('eid') 165 | 166 | # apply general exclusion criteria: 167 | if eid_selection_mask is not None: 168 | ## find intersection of mask and eids: 169 | eid_selection_mask = [int(i) for i in eid_selection_mask] # make sure its int! 170 | #faulty_ids = [i for i in eid_selection_mask if i not in self.eid_map.index.values] 171 | eids_intersection = self.eid_map.index.intersection(eid_selection_mask) 172 | print(f"{len(self.eid_map)-len(eids_intersection)} eids excluded") 173 | self.eid_map = self.eid_map.loc[eids_intersection,:] # make sure this is sorted. 
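            # Editorial sketch with hypothetical task/column names, not repository code:
            # the exclusion_criteria_dict consumed a few lines below maps one task name to
            # one pandas query string, e.g.
            #   {"myocardial_infarction": "myocardial_infarction_prevalent == 1"}
            # generate_exclusion_masks() sets <task>_exclusion_mask to 0 for every eid
            # matching the query and to 1 for everyone else.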
174 | print(len(self.eid_map)) 175 | 176 | # excl: 177 | self.exclusion_criteria_dict = exclusion_criteria_dict 178 | self.generate_exclusion_masks() 179 | self.eid_map = self.eid_map[[v for v in self.eid_map.columns if v.endswith('exclusion_mask')]].astype(float) 180 | 181 | del data 182 | 183 | def generate_exclusion_masks(self): 184 | for task in self.exclusion_criteria_dict.keys(): 185 | eids = self.eid_map.query(self.exclusion_criteria_dict[task]).index.to_list() 186 | 187 | self.eid_map[f'{task}_exclusion_mask'] = 1 188 | self.eid_map.loc[eids, f'{task}_exclusion_mask'] = 0 189 | 190 | print(task, self.eid_map.shape[0], self.eid_map[f'{task}_exclusion_mask'].sum()) 191 | 192 | def __getitem__(self, idx): 193 | fts = self.eid_map.values[idx, :] 194 | return torch.Tensor(fts) 195 | 196 | def __len__(self): 197 | return self.eid_map.shape[0] 198 | 199 | 200 | class BatchedDS(Dataset): 201 | def __init__(self, dataset, batch_size, attrs=None): 202 | attrs = ['durations', 'events', ] if attrs is None else attrs 203 | for attr in attrs: 204 | try: 205 | setattr(self, attr, getattr(dataset, attr)) 206 | except: 207 | print('Dataset has not attribute %s' % attr) 208 | 209 | self.len = len(dataset) 210 | self.dataset = dataset 211 | self.batch_size = batch_size 212 | 213 | def __len__(self): 214 | return self.len // self.batch_size 215 | 216 | def __getitem__(self, idx): 217 | return self.dataset[idx*self.batch_size:idx*self.batch_size+self.batch_size] 218 | 219 | @staticmethod 220 | def default_collate(batch): 221 | r"""Puts each datamodules field into a tensor with outer dimension batch size""" 222 | elem = batch[0] 223 | elem_type = type(elem) 224 | if isinstance(elem, torch.Tensor): 225 | out = None 226 | if torch.utils.data.get_worker_info() is not None: 227 | # If we're in a background process, concatenate directly into a 228 | # shared memory tensor to avoid an extra copy 229 | numel = sum([x.numel() for x in batch]) 230 | storage = elem.storage()._new_shared(numel) 231 | out = elem.new(storage) 232 | return torch.cat(batch, 0, out=out) 233 | elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ 234 | and elem_type.__name__ != 'string_': 235 | elem = batch[0] 236 | if elem_type.__name__ == 'ndarray': 237 | 238 | return BatchedDS.default_collate([torch.as_tensor(b) for b in batch]) 239 | elif elem.shape == (): # scalars 240 | return torch.as_tensor(batch) 241 | elif isinstance(elem, float): 242 | return torch.tensor(batch, dtype=torch.float64) 243 | elif isinstance(elem, int): 244 | return torch.tensor(batch) 245 | elif isinstance(elem, string_classes): 246 | return batch 247 | elif isinstance(elem, container_abcs.Mapping): 248 | return {key: BatchedDS.default_collate([d[key] for d in batch]) for key in elem} 249 | elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple 250 | return elem_type(*(BatchedDS.default_collate(samples) for samples in zip(*batch))) 251 | elif isinstance(elem, container_abcs.Sequence): 252 | # check to make sure that the elements in batch have consistent size 253 | it = iter(batch) 254 | elem_size = len(next(it)) 255 | if not all(len(elem) == elem_size for elem in it): 256 | raise RuntimeError('each element in list of batch should be of equal size') 257 | transposed = zip(*batch) 258 | return [BatchedDS.default_collate(samples) for samples in transposed] 259 | 260 | 261 | class DatasetWrapper(Dataset): 262 | """ 263 | Wrap multiple datasets (datamodules) with labels (labels). 
264 | Assumes all passed datasets have the same order. 265 | """ 266 | def __init__(self, 267 | covariate_datasets, 268 | label_datasets): 269 | """ 270 | Wrap multiple datasets (datamodules) with labels (labels). 271 | Assumes all passed datasets have the same order (eid-wise). 272 | 273 | :param covariate_datasets: `list-like`, should contain datasets, samples all in the same order 274 | :param label_dataset: `list-like`, shoudl contain datsets 275 | """ 276 | assert all(len(ds) == len(label_datasets[0]) 277 | for ds in covariate_datasets + label_datasets), 'datasets need to be same length' 278 | self.datasets = covariate_datasets 279 | self.label_datasets = label_datasets 280 | 281 | @property 282 | def eid_map(self): 283 | return self.datasets[0].eid_map.values 284 | 285 | @property 286 | def durations(self): 287 | return self.label_datasets[0].eid_map.values 288 | 289 | @property 290 | def events(self): 291 | return self.label_datasets[1].eid_map.values 292 | 293 | def __len__(self): 294 | return len(self.label_datasets[0]) 295 | 296 | def __getitem__(self, idx): 297 | # return a tuple for datasets and a tuple for whatever is in labels 298 | # ((dataset1, dataset2, dataset3, ..)(duration, labels)) 299 | covariates = tuple([ds[idx] for ds in self.datasets]) if len(self.datasets) > 1 else self.datasets[0][idx] 300 | labels = tuple([ds[idx] for ds in self.label_datasets]) if len(self.label_datasets) > 1 else self.label_datasets[0][idx] 301 | 302 | return covariates, labels 303 | 304 | 305 | class LabelPlaceHolderDataset(Dataset): 306 | """ 307 | This dataset is to be used as a Mockup for the labels dataset in the datasetwrapper if no lablels are needed. 308 | """ 309 | def __init__(self, eids, feature_dim=10): 310 | super().__init__() 311 | # construc mockup: 312 | self.feature_dim = feature_dim 313 | mockup_labels = np.zeros((len(eids), )) 314 | self.eid_map = pd.DataFrame(np.stack([np.asarray(eids), mockup_labels], axis=-1), 315 | columns=['eid', 'MockUpCol']).set_index('eid') 316 | def __getitem__(self, idx): 317 | fts = np.zeros(self.feature_dim) 318 | return torch.Tensor(fts) 319 | 320 | def __len__(self): 321 | return self.eid_map.shape[0] 322 | -------------------------------------------------------------------------------- /analysis/preprocessing/pipeline_metabolomics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import pandas as pd 4 | import numpy as np 5 | import prefect as pf 6 | import miceforest as mf 7 | from prefect.engine.results import LocalResult 8 | from prefect.engine.flow_runner import FlowRunner 9 | from prefect.engine.serializers import JSONSerializer 10 | from prefect.executors import LocalDaskExecutor 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.preprocessing import StandardScaler 13 | from collections import OrderedDict 14 | from category_encoders.ordinal import OrdinalEncoder 15 | import pickle 16 | 17 | 18 | output_directory = '/your/output/dir/' 19 | output_name = 'your_dataset_name' 20 | 21 | json_serializer = JSONSerializer() 22 | 23 | 24 | class ApplyImputer(pf.Task): 25 | """ 26 | Takes a list of tuples, where the first pos is the eids_dict, the second is the kernel, the third is the split. 27 | Then applies imputer and saves to file. 28 | """ 29 | def __init__(self, *args, **kwargs): 30 | super().__init__(*args, **kwargs) 31 | 32 | def _update_target(self, cv_partition, split): 33 | """ 34 | Update Target string at runtime. 
35 | :return: 36 | """ 37 | self.target = f"partition_{cv_partition}/{split}_baseline_imputed.csv" 38 | 39 | def run(self, partition_split_dict): 40 | """ 41 | split tuple is a tuple in the form of 42 | ( (partition_idx, eids_dict, (data_merged, data_merged_description) ), imputer, split) 43 | :param partition_split_dict: 44 | :return: 45 | """ 46 | split = partition_split_dict['split'] 47 | partition = partition_split_dict["cv_partition"] 48 | eids = partition_split_dict['eids_dict'][split] 49 | 50 | assert split in ['test', 'train', 'valid'] 51 | self._update_target(partition_split_dict['cv_partition'], split) 52 | data = partition_split_dict['data'].loc[eids] 53 | 54 | # Save partitions 55 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}/{split}" 56 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 57 | data.reset_index().to_feather(f"{data_output_path}/data.feather") 58 | 59 | # Impute data 60 | with open(partition_split_dict['imputer_path'], "rb") as input_file: imputer = pickle.load(input_file) 61 | data_imputed = imputer.impute_new_data(new_data=data).complete_data() 62 | partition_split_dict['data'] = data_imputed 63 | 64 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}/{split}" 65 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 66 | data_imputed.reset_index().to_feather(f"{data_output_path}/data_imputed.feather") 67 | 68 | return partition_split_dict 69 | 70 | 71 | class ApplyNorm(pf.Task): 72 | """ 73 | Takes a list of tuples, where the first pos is the eid_dict, the second is the kernel, the third is the split. 74 | Then applies imputer and saves to file. 75 | """ 76 | def __init__(self, *args, **kwargs): 77 | super().__init__(*args, **kwargs) 78 | 79 | def _update_target(self, cv_partition, split): 80 | """ 81 | Update Target string at runtime. 
82 | :return: 83 | """ 84 | self.target = f"partition_{cv_partition}/{split}_baseline_imputed_normalized.csv" 85 | 86 | def run(self, partition_split_dict): 87 | """ 88 | DICT 89 | :param partition_split_dict: 90 | :return: 91 | """ 92 | split = partition_split_dict['split'] 93 | partition = partition_split_dict['cv_partition'] 94 | self._update_target(partition, split) 95 | 96 | description = partition_split_dict['description'] 97 | 98 | noncategorical_covariates = description.reset_index() \ 99 | .set_index('dtype').loc[['int', "float"]] \ 100 | .query("(isTarget == False) & (based_on != 'diagnoses_emb') & (based_on != 'eid')")['covariate'].values 101 | 102 | 103 | noncat_data = partition_split_dict['data'][noncategorical_covariates].copy() 104 | 105 | # log 1p transform!: 106 | for c in noncat_data.columns: 107 | if c.startswith('NMR'): 108 | noncat_data[c] = np.log1p(noncat_data[c].values) 109 | 110 | noncat_data = noncat_data.values 111 | 112 | noncat_data = pd.DataFrame(partition_split_dict['normalizer'].transform(noncat_data), 113 | columns=noncategorical_covariates) 114 | 115 | for v in noncategorical_covariates: 116 | partition_split_dict['data'][v] = noncat_data[v].values 117 | 118 | # save preprocessed data 119 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}/{split}" 120 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 121 | partition_split_dict['data'].reset_index().to_feather(f"{data_output_path}/data_imputed_normalized.feather") 122 | 123 | # return partition_split_dict 124 | return partition_split_dict['data'] 125 | 126 | @pf.task(target="data_merged_dict.p", 127 | checkpoint=True, 128 | log_stdout=True, 129 | result=LocalResult(dir=f"{output_directory}/{output_name}") 130 | ) 131 | def read_and_merge_data(covariate_paths, input_data_dir): 132 | logger = pf.context.get("logger") 133 | logger.info("Data") 134 | data_dfs = [pd.read_feather(f"{input_data_dir}/{covariate_paths[covariate][0]}").set_index("eid") for covariate in covariate_paths] 135 | data_merged = pd.concat(data_dfs, axis=1).copy() 136 | output_path = f"{output_directory}/{output_name}" 137 | 138 | data_merged.reset_index().to_feather(f"{output_path}/data_merged.feather") 139 | 140 | logger.info("Descriptions") 141 | description_dfs = [pd.read_feather(f"{input_data_dir}/{covariate_paths[covariate][1]}") for covariate in covariate_paths] 142 | description_merged = pd.concat([df if i == 0 else df.tail(-1) for i, df in enumerate(description_dfs)], axis=0).reset_index() 143 | description_merged.reset_index(drop=True).to_feather(f"{output_path}/description_merged.feather") 144 | 145 | return {"data": data_merged.query('NMR_FLAG==True'), "description": description_merged} 146 | 147 | @pf.task(name="encode_categoricals", 148 | target="data_encoded.p", 149 | checkpoint=True, 150 | result=LocalResult(dir=f"{output_directory}/{output_name}") 151 | ) 152 | def encode_categoricals(data_dict): 153 | logger = pf.context.get("logger") 154 | data = data_dict["data"] 155 | description = data_dict["description"] 156 | 157 | cat_cols = [c for c in description.set_index("dtype").loc[["category"]].covariate.to_list() if "date" not in c] 158 | 159 | mapping = [{"col": c, "mapping": {e: i for i, e in enumerate([v for v in data[c].unique().tolist() if v==v])}} for c in cat_cols] 160 | for i, c in enumerate(cat_cols): mapping[i]["mapping"].update({np.nan: -2}) 161 | 162 | enc = OrdinalEncoder(cols=cat_cols, mapping=mapping, handle_missing="return_nan") 163 | data = enc.fit_transform(data) 
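    # Editorial sketch with a hypothetical column, not repository code: each entry of
    # the `mapping` list built above enumerates the observed categories in order of
    # appearance and routes NaN to -2, e.g. for a binary column "sex":
    #   {"col": "sex", "mapping": {"Female": 0, "Male": 1, nan: -2}}
    # OrdinalEncoder(handle_missing="return_nan") rewrites the column with these integer
    # codes; columns with more than two categories are additionally one-hot encoded in
    # the loop below.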
164 | 165 | description["mapping"] = np.nan 166 | for i, c in enumerate(cat_cols): 167 | description.loc[description.covariate == c, 'mapping'] = str(enc.mapping[i]["mapping"]) 168 | if data[c].nunique() > 2: 169 | ohe_encoded = pd.get_dummies(data[c], prefix=c) 170 | data[ohe_encoded.columns] = ohe_encoded 171 | for col in ohe_encoded.columns: 172 | description = description.append( 173 | {"covariate": col, "dtype": "bool", "isTarget": False, 174 | "based_on": description.loc[description.covariate == c, "based_on"].iloc[0], 175 | "aggr_fn": np.nan, "mapping": str(enc.mapping[i]["mapping"])}, ignore_index=True) 176 | description["based_on"] = description["based_on"].astype(str) 177 | 178 | description.reset_index(drop=True).to_feather(f"{output_directory}/{output_name}/description.feather") 179 | 180 | logger.info(f"{len(cat_cols)} columns one-hot-encoded") 181 | return {"data": data, "description": description} 182 | 183 | @pf.task(name="apply_exclusion_criteria", 184 | target="data_merged_excluded_dict.p", 185 | checkpoint=True, 186 | result=LocalResult(dir=f"{output_directory}/{output_name}") 187 | ) 188 | def apply_exclusion_criteria(data_dict, exclusion_criteria): 189 | logger = pf.context.get("logger") 190 | data = data_dict["data"] 191 | data_excl = data.copy().query(exclusion_criteria).reset_index(drop=False).set_index("eid") 192 | output_path = f"{output_directory}/{output_name}" 193 | data_excl.reset_index().to_feather(f"{output_path}/data_excl.feather") 194 | logger.info(f"{len(data)-len(data_excl)} eids excluded") 195 | return {"data": data, "description": data_dict["description"]} 196 | 197 | @pf.task(name="get_eids_for_partitions", 198 | target=f"eids.json", 199 | checkpoint=True, 200 | result=LocalResult(dir=f"{output_directory}/{output_name}", serializer=json_serializer) 201 | ) 202 | 203 | def get_eids_for_partitions(data_dict, partition_column, valid_size=0.1): 204 | logger = pf.context.get("logger") 205 | 206 | data_all = data_dict["data"] 207 | eids_all = data_all.index.values 208 | groups = data_all.reset_index().set_index(partition_column).index.value_counts().index.to_list() 209 | splits = {i: data_all.query(f"{partition_column}==@group").index.tolist() for i, group in enumerate(groups)} 210 | 211 | eids_dict = OrderedDict() 212 | for partition in range(len(groups)): 213 | eids_dict[partition] = {} 214 | eids_test = splits[partition] 215 | eids_notest = sorted(list(set(eids_all) - set(eids_test))) 216 | eids_train, eids_valid = train_test_split(eids_notest, test_size=valid_size, shuffle=False) 217 | 218 | if bool(set(eids_train) & set(eids_valid) & set(eids_test)) == True: 219 | logger.warning(f"Overlap of eids in partition {partition}") 220 | else: 221 | logger.info(f"No overlap of eids in partition {partition}") 222 | 223 | eids_dict[partition]["train"] = eids_train 224 | eids_dict[partition]["valid"] = eids_valid 225 | eids_dict[partition]["test"] = eids_test 226 | 227 | return eids_dict 228 | 229 | @pf.task 230 | def get_partitions(data_dict, eids_dict): 231 | partition_dicts = [{**data_dict, 'cv_partition': partition_idx, 'eids_dict': eids_dict[partition_idx]} for partition_idx in eids_dict.keys()] 232 | return partition_dicts 233 | 234 | 235 | @pf.task(name="fit_imputer", 236 | target="{task_name}/{task_full_name}_kernel.p", 237 | checkpoint=True, 238 | result=LocalResult(dir=os.path.join(output_directory, output_name, "pipeline/")) 239 | ) 240 | def fit_imputer(partition_dict): 241 | """ 242 | Fit an imputer to train set and pickle it 243 | (partition_idx, 
eids_dict, (data, data_descr) ) 244 | """ 245 | eids_train = partition_dict['eids_dict']['train'] 246 | data = partition_dict['data'].loc[eids_train] 247 | partition = partition_dict["cv_partition"] 248 | 249 | missing = data.columns[data.isna().any()].to_list() 250 | missing = [col for col in missing if not "NMR_measurement_quality_flagged" in col] 251 | 252 | events = [col for col in data.columns if "_event" in col] 253 | 254 | variable_schema = {} 255 | for m in missing: 256 | variable_schema[m] = [x for x in missing if x != m]+events 257 | kernel = mf.KernelDataSet(data, 258 | variable_schema=variable_schema, 259 | save_all_iterations=True, 260 | random_state=42) 261 | 262 | # Run the MICE algorithm for 3 iterations 263 | kernel.mice(3, n_jobs=1, n_estimators=8, 264 | max_features="sqrt", bootstrap=True, max_depth=8, verbose=True) 265 | 266 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}" 267 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 268 | 269 | imputer_path = f"{data_output_path}/imputer.p" 270 | with open(imputer_path, "wb") as output_file: pickle.dump(kernel, output_file) 271 | del kernel 272 | return imputer_path 273 | 274 | @pf.task 275 | def get_splits_per_partition(partition_dict, imputer_path, splits): 276 | partition_split_dicts = [{**partition_dict, 'imputer_path': imputer_path, 'split': s} for s in splits] 277 | return partition_split_dicts 278 | 279 | @pf.task(name="fit_normalization", 280 | target="{task_name}/{task_full_name}_norm.p", 281 | checkpoint=True, 282 | result=LocalResult(dir=os.path.join(output_directory, output_name, "pipeline/")) 283 | ) 284 | def fit_normalization(partition_split_dicts): 285 | """ 286 | Fit an imputer to train set and pickle it. 287 | 288 | imputed_tuples should be a list of dicts of the form: 289 | data_imputed is the imputed data for a split in the partition for partition idx 290 | 291 | """ 292 | # first get vars: 293 | description = partition_split_dicts[0]['description'] 294 | noncategorical_covariates = description.reset_index() \ 295 | .set_index('dtype').loc[['int', "float"]] \ 296 | .query("(isTarget == False) & (based_on != 'diagnoses_emb') & (based_on != 'eid')")['covariate'].values 297 | 298 | # fit normalizer for each train split: 299 | fitted_normalizers = {} 300 | for d in partition_split_dicts: 301 | if d['split'] == 'train': 302 | if 'eid' in d['data'].columns: 303 | data = d['data'].set_index('eid') 304 | else: 305 | data = d['data'] 306 | noncategorical_data = data[noncategorical_covariates] 307 | 308 | # log 1p transform!: 309 | for c in noncategorical_data.columns: 310 | if c.startswith('NMR'): 311 | noncategorical_data[c] = np.log1p(noncategorical_data[c].values) 312 | 313 | noncategorical_data = noncategorical_data.values 314 | 315 | norm = StandardScaler(with_mean=True, with_std=True, copy=True).fit(noncategorical_data) 316 | fitted_normalizers[d['cv_partition']] = norm 317 | 318 | partition_split_dicts = [{**d, 'normalizer': fitted_normalizers[d['cv_partition']]} for d in partition_split_dicts] 319 | return partition_split_dicts 320 | 321 | 322 | Impute = ApplyImputer(name="apply_imputer", 323 | target=f"partition_23/baseline_imputed.csv", 324 | checkpoint=True, 325 | result=LocalResult(dir=f"{output_directory}/{output_name}/cv_partitions/"), 326 | # serializer=pd_serializer) 327 | ) 328 | 329 | Normalize = ApplyNorm(name="apply_norm", 330 | target=f"partition_23/baseline_imputed_normalized.csv", 331 | checkpoint=True, 332 | 
result=LocalResult(dir=f"{output_directory}/{output_name}/cv_partitions/"), 333 | # serializer=pd_serializer) 334 | ) 335 | 336 | with pf.Flow("ukb_pipeline") as flow: 337 | input_data_dir = pf.Parameter('input_data', 338 | default=f'{output_name}/2_datasets_pre/210709_metabolomics/') 339 | partition_column = pf.Parameter('partition_column', default="uk_biobank_assessment_centre") 340 | valid_size = pf.Parameter('valid_size', default=0.1) 341 | data_filenames = { 342 | "covariates": ("baseline_covariates.feather", "baseline_covariates_description.feather"), 343 | "pgs": ("baseline_pgs.feather", "baseline_pgs_description.feather"), 344 | "endpoints": ("baseline_endpoints.feather", "baseline_endpoints_description.feather"), 345 | } 346 | 347 | 348 | data_dict = read_and_merge_data(data_filenames, input_data_dir) 349 | data_dict = encode_categoricals(data_dict) 350 | eids_dict = get_eids_for_partitions(data_dict, partition_column=partition_column, valid_size=valid_size) 351 | partition_dicts = get_partitions(data_dict, eids_dict) 352 | 353 | # fit imputer per partition 354 | imputer_paths = fit_imputer.map(partition_dict=partition_dicts) 355 | 356 | partition_split_dicts = get_splits_per_partition.map(partition_dicts, 357 | imputer_paths, 358 | splits=pf.unmapped(['train', 'test', 'valid']) 359 | ) 360 | 361 | partition_split_dicts = Impute.map(partition_split_dict=pf.flatten(partition_split_dicts)) 362 | partition_split_dicts = fit_normalization(partition_split_dicts=partition_split_dicts) 363 | 364 | normalized = Normalize.map(partition_split_dict=partition_split_dicts) 365 | 366 | if __name__ == "__main__": 367 | flow.executor = LocalDaskExecutor(scheduler="threads", num_workers=60) 368 | 369 | # run locally 370 | runner = FlowRunner(flow=flow) 371 | flow_state = runner.run(return_tasks=flow.tasks) 372 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/datamodules.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pandas as pd 3 | import numpy as np 4 | import pytorch_lightning as pl 5 | from torch.utils.data import DataLoader 6 | from omegaconf import OmegaConf, ListConfig, DictConfig 7 | 8 | from .datasets import TabularDataset, DatasetWrapper, BatchedDS, ExclusionMaskDataset 9 | 10 | 11 | class RiskianoDataModule(pl.LightningDataModule): 12 | def __init__(self, batch_size=128, num_workers=8, tabular_filepath='', use_batched_ds=False, 13 | return_rank_mat=None, output_dim=None, fast_dev_run=None, cv_partition=None, **kwargs): 14 | """ 15 | Abstract DataModule Class for Riskiano. 16 | 17 | The __init__ of this calss should be called in every inherited class. 18 | 19 | A few points to consider: 20 | - hardcode filepaths for versioning 21 | - make durations, events and other labels explicit attributes 22 | - define transformations etc in the `__init__()` 23 | - The logic of how exactly the individual datasets are instantiated should be defined in the 24 | `get_dataset()` method. This method NEEDS TO BE DEFINED PER USECASE, and will be called in `setup()`. 25 | 26 | :param batch_size: `int`, batchsize to use, needs to be passed for the BatchedDS 27 | :param num_workers: `int`, number of workers for the DataLoaders 28 | :param use_batched_ds: `bool`, whether to use the batchedDS (`True`) or not (`False`). Defaults to `False`. 29 | :param output_categorical: `bool`, whether to output categorical columns (`True`) vs. 
1-hot/binary columns (`False`) 30 | :param return_rank_mat: `bool`, whether to return the rank_mat for DeepHitTraining 31 | :param output_dim: `int`, output-dimension of the network, needed for cuts and rank_mat calculation, can be ommitted of rank_mat equals False 32 | :param fast_dev_run: `bool`, similar to pl.Trainer FLAG. in this case limits the eid_map to 100 eids. 33 | :param kwargs: 34 | """ 35 | super().__init__() 36 | self.cv_partition = cv_partition 37 | self.batch_size = batch_size 38 | self.num_workers = num_workers 39 | self.tabular_filepath = tabular_filepath 40 | self.use_batched_ds = use_batched_ds 41 | self.fast_dev_run = fast_dev_run 42 | 43 | self.return_rank_mat = return_rank_mat 44 | if self.return_rank_mat: 45 | assert output_dim is not None, 'Rank mat computation needs out_dim!' 46 | self.output_dim = output_dim 47 | self.cuts = None 48 | 49 | def get_batched_ds(self, ds): 50 | if self.return_rank_mat: 51 | raise NotImplementedError() 52 | else: 53 | return BatchedDS(ds, batch_size=self.batch_size) 54 | 55 | def get_dataset(self, split): 56 | raise NotImplementedError('Implement according to usecase.') 57 | 58 | def setup(self, stage=None): 59 | self.train_ds = self.get_dataset('train') 60 | self.valid_ds = self.get_dataset('valid') 61 | try: 62 | self.test_ds = self.get_dataset('test') 63 | except AssertionError: 64 | print('No test split defined to this data.') 65 | 66 | if self.return_rank_mat: 67 | self.cuts = self.get_time_cuts() 68 | 69 | def get_time_cuts(self, max_time=None): 70 | """ 71 | Get the interval borders for the discrete times. 72 | :param n_durations: 73 | :param ds: 74 | :return: 75 | """ 76 | if self.cuts is not None: 77 | return self.cuts 78 | 79 | loader = DataLoader(self.train_ds, batch_size=1024, num_workers=self.num_workers, shuffle=False, drop_last=False) 80 | 81 | if max_time is None: 82 | max_time = -np.inf 83 | for data in loader: 84 | _, (durations, _) = data 85 | max_duration = float(durations.max()) 86 | if max_time < max_duration: 87 | max_time = max_duration 88 | return np.linspace(0, max_time, self.output_dim + 1) 89 | 90 | def train_dataloader(self): 91 | if self.use_batched_ds: 92 | return DataLoader(self.get_batched_ds(self.train_ds), 93 | num_workers=self.num_workers, pin_memory=True, collate_fn=BatchedDS.default_collate, 94 | shuffle=True) 95 | else: 96 | return DataLoader(self.train_ds, batch_size=self.batch_size, 97 | num_workers=self.num_workers, shuffle=True) 98 | 99 | def val_dataloader(self): 100 | if self.use_batched_ds: 101 | return DataLoader(self.get_batched_ds(self.valid_ds), 102 | num_workers=self.num_workers, pin_memory=True, collate_fn=BatchedDS.default_collate, 103 | shuffle=False) 104 | else: 105 | return DataLoader(self.valid_ds, batch_size=self.batch_size, 106 | num_workers=self.num_workers, shuffle=False) 107 | 108 | def test_dataloader(self): 109 | if not self.use_batched_ds: 110 | return DataLoader(self.test_ds, batch_size=self.batch_size, 111 | num_workers=self.num_workers, shuffle=False) 112 | else: 113 | return DataLoader(self.get_batched_ds(self.test_ds), 114 | num_workers=self.num_workers, pin_memory=True, collate_fn=BatchedDS.default_collate, 115 | shuffle=False) 116 | 117 | 118 | class UKBBSurvivalDatamodule(RiskianoDataModule): 119 | """ 120 | Datamodule for survival training on UKBB data. 121 | 122 | :param batch_size: `int`, batchsize needed for outputting the rankmat + loaders 123 | :param num_workers: `int`, num_workers 124 | :param tabular_filepath: `str`, path to the ukbb data file. 
125 | :param use_batched_ds: `bool`, whether to use the batched_dataset. 126 | :param features: `Union([dict, list]), features/covariates to use. 127 | :param duration: `str`, the duration col in the datset file 128 | :param event: `str`, the event col in the datset file 129 | :param return_rank_mat: `bool`, whether to return rank_mat (required to DeepHit Model) or not, default=False 130 | :param output_dim: `int`, n-timepoints in descrete time model, required for rank_mat generation -> required for DeepHit Model 131 | :param fast_dev_run: `bool`, run smoke test, default=False 132 | :param cv_partition: `int`, partition to read data from. 133 | :param output_categorical: `bool`, wheter to ourput raw categories (ints -> True) or do 1-hot encoding (False), Default=False 134 | :param exclusion_criteria: `dict`, dict of the form {`sets_apply`: [`train`, `valid`]} -> to which sets to apply exclusion criteria. Default = None 135 | :param kwargs: 136 | """ 137 | def __init__(self, 138 | batch_size=128, 139 | num_workers=8, 140 | tabular_filepath="", 141 | use_batched_ds=False, 142 | features={}, 143 | duration='', 144 | event='', 145 | return_rank_mat=None, 146 | output_dim=None, 147 | clip=False, 148 | fast_dev_run=False, 149 | cv_partition=0, 150 | output_categorical=False, 151 | cohort_definition=None, 152 | oversampling=False, 153 | **kwargs): 154 | super().__init__(batch_size=batch_size, num_workers=num_workers, tabular_filepath=tabular_filepath, 155 | use_batched_ds=use_batched_ds, 156 | return_rank_mat=return_rank_mat, output_dim=output_dim, fast_dev_run=fast_dev_run) 157 | 158 | self.cv_partition=cv_partition 159 | self.cohort_definition = cohort_definition if not isinstance(cohort_definition, DictConfig) \ 160 | else OmegaConf.to_container(cohort_definition, resolve=True) 161 | 162 | assert isinstance(features, (dict, list, ListConfig, DictConfig)), 'Features must be dict or list.' 
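        # Editorial sketch with hypothetical values, not repository code: a `features`
        # dict resolved from the OmegaConf feature config holds grouped column lists, e.g.
        #   {"categorical": {...}, "one_hot_enc": {...},
        #    "general": {"labs": ["albumin", "glucose"], "metabolomics": [...]}}
        # The block below merges the requested groups (one_hot_enc or categorical, plus
        # general) and flattens them into a single list of column names, which is the
        # format TabularDataset expects.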
163 | 164 | features = features if not isinstance(features, (ListConfig, DictConfig)) \ 165 | else OmegaConf.to_container(features, resolve=True) 166 | 167 | if isinstance(features, dict): 168 | if output_categorical == False: 169 | self.features = {**features["one_hot_enc"], **features["general"]} 170 | else: 171 | self.features = {**features["categorical"], **features["general"]} 172 | print(self.features) 173 | self.features = [f for group_list in self.features.values() for f in group_list] 174 | else: 175 | self.features = features 176 | 177 | print(type(self.features)) 178 | 179 | self.duration = duration if not isinstance(duration, (ListConfig, DictConfig)) \ 180 | else OmegaConf.to_container(duration, resolve=True) 181 | self.event = event if not isinstance(event, (ListConfig, DictConfig)) \ 182 | else OmegaConf.to_container(event, resolve=True) 183 | self.clip = clip 184 | self.oversampling = oversampling 185 | 186 | def get_dataset(self, split): 187 | filepath = f'{self.tabular_filepath}/partition_{self.cv_partition}/{split}/data_imputed_normalized.feather' 188 | print(filepath) 189 | if self.cohort_definition is not None: 190 | if split in self.cohort_definition.keys(): 191 | eids = pd.read_feather(f"{self.tabular_filepath}/data_merged.feather").query(self.cohort_definition[split]).eid.to_list() 192 | else: 193 | eids = None 194 | else: 195 | eids = None 196 | 197 | ds = TabularDataset(filepath, self.features, eid_selection_mask=eids) 198 | if self.clip: 199 | upperq = ds.eid_map.quantile(.99) 200 | lowerq = ds.eid_map.quantile(.01) 201 | for c in self.features: 202 | ds.eid_map.loc[:, c] = ds.eid_map[c].clip( 203 | lower=lowerq[c], upper=upperq[c]) 204 | covariate_datasets = [ds] 205 | label_datasets = [TabularDataset(filepath, self.duration, eid_selection_mask=eids), 206 | TabularDataset(filepath, self.event, eid_selection_mask=eids)] 207 | 208 | # make sure we have observations for each label: 209 | print(split) 210 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum()) 211 | 212 | # oversample if needed: 213 | if split == 'train' and self.oversampling: 214 | assert len(self.event) == 1, 'Oversampling only possible for single events.' 215 | pos_eids = label_datasets[1].eid_map.query(f'{self.event[0]}==1').index.values 216 | # augment sets: 217 | for ds_list in [covariate_datasets, label_datasets]: 218 | for ds in ds_list: 219 | pos_ds = pd.concat(10*[ds.eid_map.loc[pos_eids].copy()], axis=0) 220 | print(pos_ds.head()) 221 | pos_ds = pos_ds.reset_index(drop=True) 222 | pos_ds.index.name = 'eid' 223 | print(pos_ds.head()) 224 | ds.eid_map = pd.concat([ds.eid_map, pos_ds], axis=0) 225 | 226 | # make sure we have observations for each label: 227 | print(split) 228 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum()) 229 | 230 | return DatasetWrapper(covariate_datasets, label_datasets) 231 | 232 | 233 | class UKBBSurvivalDatamoduleWithExclusions(UKBBSurvivalDatamodule): 234 | """ 235 | Datamodule for survival training on UKBB data, that explicitly generates exclusion masks for the model. 236 | 237 | :param batch_size: `int`, batchsize needed for outputting the rankmat + loaders 238 | :param num_workers: `int`, num_workers 239 | :param tabular_filepath: `str`, path to the ukbb data file. 240 | :param use_batched_ds: `bool`, whether to use the batched_dataset. 241 | :param features: `Union([dict, list]), features/covariates to use. 
242 | :param duration: `str`, the duration col in the dataset file
243 | :param event: `str`, the event col in the dataset file
244 | :param return_rank_mat: `bool`, whether to return rank_mat (required for the DeepHit model) or not, default=False
245 | :param output_dim: `int`, number of timepoints in the discrete-time model, required for rank_mat generation -> required for the DeepHit model
246 | :param fast_dev_run: `bool`, run smoke test, default=False
247 | :param cv_partition: `int`, partition to read data from.
248 | :param output_categorical: `bool`, whether to output raw categories (ints -> True) or do 1-hot encoding (False), Default=False
249 | :param cohort_definition: `dict`/`DictConfig` with a `general` mapping from split names to pandas query strings (eid selection) and a `task_specific` exclusion-criteria dict passed to the ExclusionMaskDataset. Default = None
250 | :param kwargs:
251 | """
252 | def __init__(self,
253 | batch_size=128,
254 | num_workers=8,
255 | tabular_filepath="",
256 | use_batched_ds=False,
257 | features={},
258 | duration='',
259 | event='',
260 | return_rank_mat=None,
261 | output_dim=None,
262 | clip=False,
263 | fast_dev_run=False,
264 | cv_partition=0,
265 | output_categorical=False,
266 | cohort_definition=None,
267 | oversampling=False,
268 | **kwargs):
269 | super().__init__(
270 | batch_size=batch_size,
271 | num_workers=num_workers,
272 | tabular_filepath=tabular_filepath,
273 | use_batched_ds=use_batched_ds,
274 | features=features,
275 | duration=duration,
276 | event=event,
277 | return_rank_mat=return_rank_mat,
278 | output_dim=output_dim,
279 | clip=clip,
280 | fast_dev_run=fast_dev_run,
281 | cv_partition=cv_partition,
282 | output_categorical=output_categorical,
283 | cohort_definition=None,
284 | oversampling=oversampling)
285 | 
286 | # self.cohort_definition = cohort_definition if not isinstance(cohort_definition, DictConfig) \
287 | # else OmegaConf.to_container(cohort_definition, resolve=True)
288 | self.cohort_definition = cohort_definition
289 | 
290 | def get_dataset(self, split):
291 | filepath = f'{self.tabular_filepath}/partition_{self.cv_partition}/{split}/data_imputed_normalized.feather'
292 | print(filepath)
293 | if self.cohort_definition is not None:
294 | if split in self.cohort_definition.general.keys():
295 | eids = pd.read_feather(f"{self.tabular_filepath}/data_merged.feather").query(self.cohort_definition.general[split]).eid.to_list()
296 | else:
297 | eids = None
298 | else:
299 | eids = None
300 | 
301 | ds = TabularDataset(filepath, self.features, eid_selection_mask=eids)
302 | if self.clip:
303 | upperq = ds.eid_map.quantile(.99)
304 | lowerq = ds.eid_map.quantile(.01)
305 | for c in self.features:
306 | ds.eid_map.loc[:, c] = ds.eid_map[c].clip(
307 | lower=lowerq[c], upper=upperq[c])
308 | mask_ds = ExclusionMaskDataset(filepath, exclusion_criteria_dict=self.cohort_definition.task_specific, eid_selection_mask=eids)
309 | covariate_datasets = [ds, mask_ds]
310 | label_datasets = [TabularDataset(filepath, self.duration, eid_selection_mask=eids),
311 | TabularDataset(filepath, self.event, eid_selection_mask=eids)]
312 | 
313 | # make sure we have observations for each label:
314 | print(split)
315 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum())
316 | 
317 | # oversample if needed:
318 | if split == 'train' and self.oversampling:
319 | assert len(self.event) == 1, 'Oversampling only possible for single events.'
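# note: the training-split oversampling below duplicates every positive case a fixed 10x
# (see the pd.concat(10*[...]) call in the loop).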
320 | pos_eids = label_datasets[1].eid_map.query(f'{self.event[0]}==1').index.values 321 | # augment sets: 322 | for ds_list in [covariate_datasets, label_datasets]: 323 | for ds in ds_list: 324 | pos_ds = pd.concat(10*[ds.eid_map.loc[pos_eids].copy()], axis=0) 325 | print(pos_ds.head()) 326 | pos_ds = pos_ds.reset_index(drop=True) 327 | pos_ds.index.name = 'eid' 328 | print(pos_ds.head()) 329 | ds.eid_map = pd.concat([ds.eid_map, pos_ds], axis=0) 330 | 331 | # make sure we have observations for each label: 332 | print(split) 333 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum()) 334 | 335 | return DatasetWrapper(covariate_datasets, label_datasets) 336 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. 
Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. 
Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. 
Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. 
Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. 438 | -------------------------------------------------------------------------------- /analysis/preprocessing/2_preprocessing_clinical_endpoints.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preprocessing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2020-11-04T12:31:49.436340Z", 16 | "start_time": "2020-11-04T12:31:48.732042Z" 17 | } 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import os\n", 24 | "import yaml\n", 25 | "from tqdm.notebook import tqdm" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "outputs": [], 32 | "source": [ 33 | "dataset_name = \"name_of_your_dataset\"\n", 34 | "path = \"/path/to/mapping/files\"\n", 35 | "data_path = \"/path/to/decoded/output\"\n", 36 | "dataset_path = f\"{data_path}/2_datasets_pre/{dataset_name}\"" 37 | ], 38 | "metadata": { 39 | "collapsed": false, 40 | "pycharm": { 41 | "name": "#%%\n" 42 | } 43 | } 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "ExecuteTime": { 50 | "end_time": "2020-11-04T12:31:49.895222Z", 51 | "start_time": "2020-11-04T12:31:49.891332Z" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "from pathlib import Path\n", 57 | "Path(dataset_path).mkdir(parents=True, exist_ok=True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "ExecuteTime": { 65 | "end_time": "2020-11-04T12:33:14.171198Z", 66 | "start_time": "2020-11-04T12:31:50.204540Z" 67 | } 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "data = pd.read_feather(f\"{data_path}/1_decoded/ukb_data_210517.feather\")\n", 72 | "data_field = pd.read_feather(f\"{data_path}/1_decoded/ukb_data_field_210517.feather\")\n", 73 | "data_columns = 
data.columns.to_list()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Mappings + Vocabulary" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "ExecuteTime": { 88 | "end_time": "2020-11-04T12:34:05.867152Z", 89 | "start_time": "2020-11-04T12:33:16.878773Z" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# Drop obvious missing data\n", 95 | "print(len(data))\n", 96 | "data = data.dropna(subset=[\"sex_f31_0_0\"], axis=0)\n", 97 | "print(len(data))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Starting information" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "ExecuteTime": { 112 | "end_time": "2020-11-04T12:34:05.872216Z", 113 | "start_time": "2020-11-04T12:34:05.869505Z" 114 | } 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "time0_col=\"date_of_attending_assessment_centre_f53_0_0\"" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "# Baseline covariates" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "ExecuteTime": { 133 | "end_time": "2020-11-04T12:34:05.889725Z", 134 | "start_time": "2020-11-04T12:34:05.874587Z" 135 | } 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "def get_fields(fields, data, data_field):\n", 140 | " f = data_field[data_field[\"field.showcase\"].isin(fields) & data_field[\"field.tab\"].str.contains(\"f\\\\.\\\\d+\\\\.0\\\\.\\\\d\")].copy()\n", 141 | " f[\"field\"] = pd.Categorical(f[\"field.showcase\"], categories=fields, ordered=True)\n", 142 | " f = f.sort_values(\"field\").reset_index().drop(\"field\", axis=1)\n", 143 | " return f\n", 144 | "\n", 145 | "def get_fields_all(fields, data, data_field):\n", 146 | " f = data_field[data_field[\"field.showcase\"].isin(fields)].copy()\n", 147 | " f[\"field\"] = pd.Categorical(f[\"field.showcase\"], categories=fields, ordered=True)\n", 148 | " f = f.sort_values(\"field\").reset_index().drop(\"field\", axis=1)\n", 149 | " return f\n", 150 | "\n", 151 | "def get_data_fields(fields, data, data_field):\n", 152 | " f = get_fields(fields, data, data_field)\n", 153 | " return data[[\"eid\"]+f[\"col.name\"].to_list()].copy()\n", 154 | "\n", 155 | "def get_data_fields_all(fields, data, data_field):\n", 156 | " f = get_fields_all(fields, data, data_field)\n", 157 | " return data[[\"eid\"]+f[\"col.name\"].to_list()].copy()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Diagnoses and events" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "ExecuteTime": { 172 | "end_time": "2020-11-04T12:37:14.667281Z", 173 | "start_time": "2020-11-04T12:36:14.427693Z" 174 | } 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "vocab_dir = f\"{data_path}/mapping/athena\"\n", 179 | "vocab = {\n", 180 | " \"concept\": pd.read_csv(f\"{vocab_dir}/CONCEPT.csv\", sep='\\t'),\n", 181 | " \"domain\": pd.read_csv(f\"{vocab_dir}/DOMAIN.csv\", sep='\\t'),\n", 182 | " \"class\": pd.read_csv(f\"{vocab_dir}/CONCEPT_CLASS.csv\", sep='\\t'),\n", 183 | " \"relationship\": pd.read_csv(f\"{vocab_dir}/RELATIONSHIP.csv\", sep='\\t'),\n", 184 | " \"drug_strength\": pd.read_csv(f\"{vocab_dir}/DRUG_STRENGTH.csv\", sep='\\t'),\n", 185 | " \"vocabulary\": 
pd.read_csv(f\"{vocab_dir}/VOCABULARY.csv\", sep='\\t'),\n", 186 | " \"concept_synonym\": pd.read_csv(f\"{vocab_dir}/CONCEPT_SYNONYM.csv\", sep='\\t'),\n", 187 | " \"concept_ancestor\": pd.read_csv(f\"{vocab_dir}/CONCEPT_ANCESTOR.csv\", sep='\\t'),\n", 188 | " \"concept_relationship\": pd.read_csv(f\"{vocab_dir}/CONCEPT_RELATIONSHIP.csv\", sep='\\t') \n", 189 | "}" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### Definitions" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "ExecuteTime": { 204 | "end_time": "2020-11-04T12:37:14.772869Z", 205 | "start_time": "2020-11-04T12:37:14.669541Z" 206 | } 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "coding1836 = pd.read_csv(f\"{path}/codings/coding1836.tsv\", sep=\"\\t\").rename(columns={\"coding\":\"code\"})\n", 211 | "phecodes = pd.read_csv(f\"{path}/phecodes/phecode_icd10.csv\")\n", 212 | "def phenotype_children(phecodes, phenotype_list):\n", 213 | " l={}\n", 214 | " phecodes = phecodes.dropna(subset=[\"Phenotype\"], axis=0)\n", 215 | " for ph, ph_names in phenotype_list.items():\n", 216 | " regex = \"|\".join(ph_names)\n", 217 | " l[ph] = list(phecodes[phecodes.Phenotype.str.contains(regex, case=False)].ICD10.str.replace(\"\\\\.\", \"\").str.slice(0, 3).unique())\n", 218 | " return l" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "diagnoses_codes = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_codes.feather')).drop(\"level\", axis=1)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "death_codes = pd.read_feather(f\"{data_path}/1_decoded/codes_death_records_210115.feather\").query(\"level==1\").drop(\"level\", axis=1)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "endpoint_codes = pd.concat([diagnoses_codes, death_codes[diagnoses_codes.columns]])" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "# Endpoints" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "ExecuteTime": { 260 | "end_time": "2020-11-04T12:39:55.628580Z", 261 | "start_time": "2020-11-04T12:33:33.036Z" 262 | } 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "### define in snomed and get icd codes from there" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### 1. 
Hospital admissions"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "endpoint_list = {\n",
283 | " \"myocardial_infarction\": ['I21', 'I22', 'I23', 'I24', 'I25'],\n",
284 | " \"stroke\": ['G45', \"I63\", \"I64\"],\n",
285 | " \"diabetes\" : ['E10', 'E11', 'E12', 'E13', 'E14'],\n",
286 | " \"diabetes1\" : ['E10'],\n",
287 | " \"diabetes2\" : ['E11', 'E12', 'E13', 'E14'],\n",
288 | " \"atrial_fibrillation\": ['I47', 'I48'],\n",
289 | " 'migraine': ['G43', 'G44'],\n",
290 | " 'rheumatoid_arthritis': ['J99', 'M05', 'M06', 'M08', 'M12', 'M13'],\n",
291 | " \"systemic_lupus_erythematosus\": ['M32'],\n",
292 | " 'severe_mental_illness': ['F20', 'F25', 'F30', 'F31', 'F32', 'F33', 'F44'],\n",
293 | " \"erectile_dysfunction\" : ['F52', 'N48'], \n",
294 | " \"chronic_kidney_disease\": [\"I12\", \"N18\", \"N19\"],\n",
295 | " \"liver_disease\":[\"K70\", \"K71\", \"K72\", \"K73\", \"K74\", \"K75\", \"K76\", \"K77\"],\n",
296 | " \"dementia\":['F00', 'F01', 'F02', 'F03'],\n",
297 | " \"copd\": ['J44'],\n",
298 | " \"M_all_cause_dementia\": [\"F00\", \"F01\", \"F02\", \"F03\", \"G30\", \"G31\"],\n",
299 | " \"M_MACE\": [\"G45\", \"I21\", \"I22\", \"I23\", \"I24\", \"I25\", \"I63\", \"I64\"],\n",
300 | " \"M_type_2_diabetes\": [\"E10\", \"E11\", \"E12\", \"E13\", \"E14\"],\n",
301 | " \"M_liver_disease\": [\"B15\", \"B16\", \"B17\", \"B18\", \"B19\", \"C22\", \"E83\", \"E88\", \"I85\", \n",
302 | " \"K70\", \"K72\", \"K73\", \"K74\", \"K75\", \"K76\", \"R18\", \"Z94\"],\n",
303 | " \"M_renal_disease\": [f\"N{i:02}\" for i in range(20)]+[f\"N{i:02}\" for i in range(25, 30)],\n",
304 | " \"M_atrial_fibrillation\": [\"I48\"],\n",
305 | " \"M_heart_failure\":[\"I50\"],\n",
306 | " \"M_coronary_heart_disease\": [f\"I{i:02}\" for i in range(20, 26)],\n",
307 | " \"M_venous_thrombosis\": [\"I80\", \"I81\", \"I82\"],\n",
308 | " \"M_cerebral_stroke\":[\"I63\", \"I65\", \"I66\"],\n",
309 | " \"M_haemorrhagic_stroke\": [\"I60\", \"I61\", \"I62\"],\n",
310 | " \"M_abdominal_aortic_aneurysm\" : [\"I71\"],\n",
311 | " \"M_peripheral_arterial_disease\": ['I70', 'I71', 'I72', 'I73', 'I74', 'I75', 'I76', 'I77', 'I78', 'I79'],\n",
312 | " \"M_asthma\":[\"J45\", \"J46\"],\n",
313 | " \"M_chronic_obstructuve_pulmonary_disease\":[\"J40\", \"J41\", \"J42\", \"J43\", \"J44\", \"J47\"],\n",
314 | " \"M_lung_cancer\":[\"C33\", \"C34\"],\n",
315 | " \"M_non_melanoma_skin_cancer\":[\"C44\"],\n",
316 | " \"M_stomach_cancer\":[\"C16\"],\n",
317 | " \"M_oesophagus_cancer\":[\"C15\"],\n",
318 | " \"M_colon_cancer\":[\"C18\"],\n",
319 | " \"M_rectal_cancer\":[\"C19\", \"C20\"],\n",
320 | " \"M_prostate_cancer\":[\"C61\"],\n",
321 | " \"M_ovarian_cancer\":[\"C56\", \"C57\"],\n",
322 | " \"M_breast_cancer\":[\"C50\"],\n",
323 | " \"M_uterus_cancer\":[\"C54\"],\n",
324 | " \"M_parkinsons_disease\":[\"G20\", \"G21\", \"G22\"],\n",
325 | " \"M_fractures\":[\"S02\", \"S12\", \"S22\", \"S32\", \"S42\", \"S52\", \"S62\", \"S72\", \"S82\", \"S92\", \"T02\", \"T08\", \"T10\"],\n",
326 | " \"M_cataracts\":[\"H25\", \"H26\"],\n",
327 | " \"M_glaucoma\":[\"H40\"] \n",
328 | "}\n",
329 | "\n",
330 | "with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file:\n",
331 | " yaml.dump(endpoint_list, file, default_flow_style=False)"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "from dateutil.relativedelta import relativedelta\n",
341 | 
"import datetime\n", 342 | "\n", 343 | "def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None):\n", 344 | " if level is not None: diagnoses_codes = diagnoses_codes.query(\"level==@level\")\n", 345 | " diagnoses_codes_time0 = diagnoses_codes.merge(data[[\"eid\", time0_col]], how=\"left\", on=\"eid\")\n", 346 | " \n", 347 | " cens_time_right = datetime.date(2020, 9, 30)\n", 348 | "\n", 349 | " df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) & \n", 350 | " (diagnoses_codes_time0.date < cens_time_right)]\n", 351 | " \n", 352 | " temp = data[[\"eid\", time0_col]].copy()\n", 353 | " for ph, ph_codes in tqdm(endpoint_list.items()):\n", 354 | " regex = \"|\".join(ph_codes)\n", 355 | " ph_df = df_interval[df_interval.meaning.str.contains(regex, case=False)] \\\n", 356 | " .sort_values('date').groupby('eid').head(1).assign(phenotype=1, date=lambda x: x.date)\n", 357 | " temp_ph = temp.merge(ph_df, how=\"left\", on=\"eid\").fillna(0)\n", 358 | " temp[ph+\"_event\"], temp[ph+\"_event_date\"] = temp_ph.phenotype, temp_ph.date\n", 359 | " \n", 360 | " fill_date = {ph+\"_event_date\" : lambda x: [cens_time_right if event==0 else event_date for event, event_date in zip(x[ph+\"_event\"], x[ph+\"_event_date\"])]}\n", 361 | " calc_tte = {ph+\"_event_time\" : lambda x: [(event_date-time0).days/365.25 for time0, event_date in zip(x[time0_col], x[ph+\"_event_date\"])]}\n", 362 | " \n", 363 | " temp = temp.assign(**fill_date).assign(**calc_tte).drop([ph+\"_event_date\"], axis=1)\n", 364 | " \n", 365 | " temp = temp.drop([time0_col], axis=1) \n", 366 | " \n", 367 | " return temp.drop_duplicates()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "basics = pd.read_feather(os.path.join(path, dataset_path, 'temp_basics.feather'))\n", 377 | "endpoints_diagnoses = extract_endpoints_tte(basics, endpoint_codes, endpoint_list, time0_col)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### 2. 
Death registry" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "death_list = {\n", 394 | " \"death_allcause\":[],\n", 395 | " \"death_cvd\":['I{:02}'.format(ID+1) for ID in range(0, 98)],\n", 396 | "}\n", 397 | "\n", 398 | "with open(os.path.join(path, dataset_path, 'death_list.yaml'), 'w') as file:\n", 399 | " yaml.dump(death_list, file, default_flow_style=False)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "endpoints_death = extract_endpoints_tte(basics, death_codes, death_list, time0_col)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "## SCORES" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "scores_list = {\n", 425 | " \"SCORE\":['I{:02}'.format(ID) for ID in [10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 44, 45, 46, 47, 48, 49, 50, 51, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73]],\n", 426 | " \"ASCVD\":['I{:02}'.format(ID) for ID in [20, 21, 22, 23, 24, 25, 63]],\n", 427 | " \"QRISK3\":[\"G45\", \"I20\", \"I21\", \"I22\", \"I23\", \"I24\", \"I25\", \"I63\", \"I64\"],\n", 428 | " \"MACE\":[\"G45\", \"I21\", \"I22\", \"I23\", \"I24\", \"I25\", \"I63\", \"I64\"], \n", 429 | "}\n", 430 | "with open(os.path.join(path, dataset_path, 'scores_list.yaml'), 'w') as file:\n", 431 | " yaml.dump(scores_list, file, default_flow_style=False)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "death_scores = extract_endpoints_tte(basics, death_codes, scores_list, time0_col=time0_col)\n", 441 | "endpoint_scores = extract_endpoints_tte(basics, endpoint_codes, scores_list, time0_col=time0_col)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "endpoints_scores_all = death_scores[[\"eid\", \"SCORE_event\", \"SCORE_event_time\"]].merge(endpoint_scores[[\"eid\", \"ASCVD_event\", \"ASCVD_event_time\", \"QRISK3_event\", \"QRISK3_event_time\", \"MACE_event\", \"MACE_event_time\"]], on=\"eid\")\n", 451 | "endpoints_scores_all.to_feather(os.path.join(path, dataset_path, 'temp_endpoints_scores_all.feather'))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "## Merge Everything" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "data_dfs_dict = {\"endpoints_diagnoses\":endpoints_diagnoses, \n", 468 | " \"endpoints_death\":endpoints_death, \n", 469 | " \"endpoints_scores_all\":endpoints_scores_all}" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "def get_cols_clean(df):\n", 479 | " df.columns = df.columns.str.replace(r'_0_0$', '').str.replace(r'_f[0-9]+$', '').str.replace(\"_automated_reading\", '')\n", 480 | " return df.columns\n", 481 | "\n", 482 | "def clean_df(df):\n", 483 | " df.columns = get_cols_clean(df)\n", 484 | " return df" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "import 
pandas as pd\n", 494 | "from functools import reduce\n", 495 | "\n", 496 | "data_baseline = reduce(lambda x, y: pd.merge(x, y, on = 'eid'), list(data_dfs_dict.values()))\n", 497 | "endpoint_columns = [c[:-11] for c in data_baseline.columns.tolist() if \"_event_time\" in c]" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "outputs": [], 504 | "source": [ 505 | "data_baseline = clean_df(data_baseline)" 506 | ], 507 | "metadata": { 508 | "collapsed": false, 509 | "pycharm": { 510 | "name": "#%%\n" 511 | } 512 | } 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "outputs": [], 518 | "source": [ 519 | "for col in [col for col in list(data_baseline.columns) if (\"_event\" in col) & (\"_time\" not in col)]:\n", 520 | " data_baseline[col] = data_baseline[col].astype(int)" 521 | ], 522 | "metadata": { 523 | "collapsed": false, 524 | "pycharm": { 525 | "name": "#%%\n" 526 | } 527 | } 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "outputs": [], 533 | "source": [ 534 | "covariates = [col for col in list(data_baseline.columns) if not \"_event\" in col]\n", 535 | "targets = [col for col in list(data_baseline.columns) if \"_event\" in col]" 536 | ], 537 | "metadata": { 538 | "collapsed": false, 539 | "pycharm": { 540 | "name": "#%%\n" 541 | } 542 | } 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "# Exporting" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "data_cols = {}\n", 558 | "for topic, df in data_dfs_dict.items(): \n", 559 | " data_cols[\"eid\"] = [\"admin\"]\n", 560 | " data_cols[topic]=list(get_cols_clean(df))[1:]" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "data_cols_single = {}\n", 570 | "for topic, columns in data_cols.items():\n", 571 | " for col in columns:\n", 572 | " data_cols_single[col] = topic" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "for c in [c for c in data_baseline.columns.tolist() if \"comp\" in c]:\n", 582 | " data_cols_single.update({c:\"endpoints_competing\"})" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "dtypes = {\"int32\":\"int\", \"int64\":\"int\", \"float64\":\"float\", \"category\":\"category\", \"object\":\"category\", \"bool\":\"bool\"}\n", 592 | "desc_dict = {\"id\": [*range(1, len(data_baseline.columns.to_list())+1)] , \n", 593 | " \"covariate\": data_baseline.columns.to_list(), \n", 594 | " \"dtype\":[dtypes[str(col)] for col in data_baseline.dtypes.to_list()], \n", 595 | " \"isTarget\":[True if col in targets else False for col in data_baseline.columns.to_list()],\n", 596 | " \"based_on\":[topic for col, topic in data_cols_single.items()],\n", 597 | " \"field\": [np.nan for col in data_baseline.columns.to_list()],\n", 598 | " \"aggr_fn\": [np.nan for col in data_baseline.columns.to_list()]}\n", 599 | "data_baseline_description = pd.DataFrame.from_dict(desc_dict)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "endpoint_dict = {}\n", 609 | "for group in 
data_baseline_description.based_on.unique(): endpoint_dict[group] = data_baseline_description.query(\"based_on==@group\").covariate.to_list()\n", 610 | "with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file: yaml.dump(endpoint_dict, file, default_flow_style=False, allow_unicode=True)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "### WRITE FEATURES IN YAML!!!" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "data_baseline.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints.feather'))\n", 629 | "data_baseline_description.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints_description.feather'))" 630 | ] 631 | } 632 | ], 633 | "metadata": { 634 | "kernelspec": { 635 | "display_name": "Python [conda env:miniconda3-pl1.x]", 636 | "language": "python", 637 | "name": "conda-env-miniconda3-pl1.x-py" 638 | }, 639 | "language_info": { 640 | "codemirror_mode": { 641 | "name": "ipython", 642 | "version": 3 643 | }, 644 | "file_extension": ".py", 645 | "mimetype": "text/x-python", 646 | "name": "python", 647 | "nbconvert_exporter": "python", 648 | "pygments_lexer": "ipython3", 649 | "version": "3.7.8" 650 | }, 651 | "toc-autonumbering": true, 652 | "toc-showcode": false, 653 | "toc-showmarkdowntxt": false 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 4 657 | } -------------------------------------------------------------------------------- /analysis/preprocessing/1_preprocessing_dataportal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# 1. 
Data Portal Preprocessing" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "outputs": [], 16 | "source": [ 17 | "try(library(tidyverse), silent=TRUE)\n", 18 | "library(lubridate)\n", 19 | "library(glue)\n", 20 | "library(data.table)\n", 21 | "library(tidyfast)\n", 22 | "library(\"magrittr\")\n", 23 | "setwd(\"/\")" 24 | ], 25 | "metadata": { 26 | "collapsed": false, 27 | "pycharm": { 28 | "name": "#%%\n" 29 | } 30 | } 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "outputs": [], 36 | "source": [ 37 | "dataset_name = \"name_of_your_dataset\"\n", 38 | "path = \"/path/to/dir/with/decoded/file\"\n", 39 | "data_path = \"/path/for/output\"\n", 40 | "dataset_path = glue(\"{data_path}/2_datasets_pre/{dataset_name}\")" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%%\n" 46 | } 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "outputs": [], 53 | "source": [ 54 | "list.files(path = \"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/\")" 55 | ], 56 | "metadata": { 57 | "collapsed": false, 58 | "pycharm": { 59 | "name": "#%%\n" 60 | } 61 | } 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "## Load Athena Vocabulary" 67 | ], 68 | "metadata": { 69 | "collapsed": false 70 | } 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "outputs": [], 76 | "source": [ 77 | "vocab_dir = glue(\"{data_path}/athena_vocabulary_covid\")\n", 78 | "concept =fread(glue(\"{vocab_dir}/CONCEPT.csv\"), sep='\\t')" 79 | ], 80 | "metadata": { 81 | "collapsed": false, 82 | "pycharm": { 83 | "name": "#%%\n" 84 | } 85 | } 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "outputs": [], 91 | "source": [ 92 | "unique(concept$vocabulary_id)" 93 | ], 94 | "metadata": { 95 | "collapsed": false, 96 | "pycharm": { 97 | "name": "#%%\n" 98 | } 99 | } 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "outputs": [], 105 | "source": [ 106 | "relationship = fread(glue(\"{vocab_dir}/RELATIONSHIP.csv\"), sep='\\t')" 107 | ], 108 | "metadata": { 109 | "collapsed": false, 110 | "pycharm": { 111 | "name": "#%%\n" 112 | } 113 | } 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "outputs": [], 119 | "source": [ 120 | "vocabulary = fread(glue(\"{vocab_dir}/VOCABULARY.csv\"), sep='\\t')" 121 | ], 122 | "metadata": { 123 | "collapsed": false, 124 | "pycharm": { 125 | "name": "#%%\n" 126 | } 127 | } 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "outputs": [], 133 | "source": [ 134 | "concept_relationship = fread(glue(\"{vocab_dir}/CONCEPT_RELATIONSHIP.csv\"), sep='\\t')" 135 | ], 136 | "metadata": { 137 | "collapsed": false, 138 | "pycharm": { 139 | "name": "#%%\n" 140 | } 141 | } 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "outputs": [], 147 | "source": [ 148 | "## Diagnoses" 149 | ], 150 | "metadata": { 151 | "collapsed": false, 152 | "pycharm": { 153 | "name": "#%%\n" 154 | } 155 | } 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "source": [ 160 | "## Hospital Episode Statistics" 161 | ], 162 | "metadata": { 163 | "collapsed": false 164 | } 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "outputs": [], 170 | "source": [ 171 | "hesin = 
fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin.txt\")" 172 | ], 173 | "metadata": { 174 | "collapsed": false, 175 | "pycharm": { 176 | "name": "#%%\n" 177 | } 178 | } 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "outputs": [], 184 | "source": [ 185 | "hesin_diag = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_diag.txt\")" 186 | ], 187 | "metadata": { 188 | "collapsed": false, 189 | "pycharm": { 190 | "name": "#%%\n" 191 | } 192 | } 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "outputs": [], 198 | "source": [ 199 | "hesin_critical = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_critical.txt\")" 200 | ], 201 | "metadata": { 202 | "collapsed": false, 203 | "pycharm": { 204 | "name": "#%%\n" 205 | } 206 | } 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "outputs": [], 212 | "source": [ 213 | "hesin_psych = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_psych.txt\")" 214 | ], 215 | "metadata": { 216 | "collapsed": false, 217 | "pycharm": { 218 | "name": "#%%\n" 219 | } 220 | } 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "outputs": [], 226 | "source": [ 227 | "hesin_delivery = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_delivery.txt\")\n", 228 | "hesin_maternity = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_maternity.txt\")" 229 | ], 230 | "metadata": { 231 | "collapsed": false, 232 | "pycharm": { 233 | "name": "#%%\n" 234 | } 235 | } 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "source": [ 240 | "### Diagnoses - ICD10" 241 | ], 242 | "metadata": { 243 | "collapsed": false 244 | } 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "outputs": [], 250 | "source": [ 251 | "## icd9 to icd10 mapping\n", 252 | "icd9to10_df = fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/codings/coding1836.tsv\")\n", 253 | "icd9to10_mapping = split(icd9to10_df$meaning, icd9to10_df$coding)\n", 254 | "hesin_diag_icd9 = hesin_diag %>% filter(diag_icd9!=\"\") %>% rowwise() %>% mutate(diag_icd10 = list(icd9to10_mapping[[diag_icd9]])) %>% drop_na(diag_icd10)\n", 255 | "hesin_diag = rbind(hesin_diag %>% filter(diag_icd9==\"\") %>% mutate(origin=\"hes_icd10\"), hesin_diag_icd9 %>% mutate(origin=\"hes_icd9\"))" 256 | ], 257 | "metadata": { 258 | "collapsed": false, 259 | "pycharm": { 260 | "name": "#%%\n" 261 | } 262 | } 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "outputs": [], 268 | "source": [ 269 | "hes_join = hesin[hesin_diag, on=c(\"eid\", \"ins_index\")]\n", 270 | "hes_join = hes_join[, c(\"eid\", \"origin\",\"ins_index\", \"arr_index\", \"level\", \"epistart\", \"diag_icd10\")][order(eid, ins_index, arr_index),]" 271 | ], 272 | "metadata": { 273 | "collapsed": false, 274 | "pycharm": { 275 | "name": "#%%\n" 276 | } 277 | } 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "outputs": [], 283 | "source": [ 284 | "hes_join_date = hes_join %>% rename(date=\"epistart\") %>% mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\"))))" 285 | ], 286 | "metadata": { 287 | "collapsed": false, 288 | "pycharm": { 289 | "name": "#%%\n" 290 | } 291 | } 292 | 
}, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "outputs": [], 297 | "source": [ 298 | "hes_diagnoses = hes_join_date %>% drop_na(date) %>% rename(code = \"diag_icd10\") %>% mutate(instance=ins_index) %>% group_by(eid) %>% mutate(n = arr_index)" 299 | ], 300 | "metadata": { 301 | "collapsed": false, 302 | "pycharm": { 303 | "name": "#%%\n" 304 | } 305 | } 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "outputs": [], 311 | "source": [ 312 | "hes_diagnoses = hes_diagnoses %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date))" 313 | ], 314 | "metadata": { 315 | "collapsed": false, 316 | "pycharm": { 317 | "name": "#%%\n" 318 | } 319 | } 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "outputs": [], 325 | "source": [ 326 | "nrow(hes_diagnoses)\n", 327 | "head(hes_diagnoses %>% arrange(desc(date)))" 328 | ], 329 | "metadata": { 330 | "collapsed": false, 331 | "pycharm": { 332 | "name": "#%%\n" 333 | } 334 | } 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "outputs": [], 340 | "source": [ 341 | "arrow::write_feather(hes_diagnoses, glue(\"{path}/codes_hes_diagnoses_210120.feather\"))" 342 | ], 343 | "metadata": { 344 | "collapsed": false, 345 | "pycharm": { 346 | "name": "#%%\n" 347 | } 348 | } 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "source": [ 353 | "### Procedures - Snomed CT" 354 | ], 355 | "metadata": { 356 | "collapsed": false 357 | } 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "outputs": [], 363 | "source": [ 364 | "hesin_oper = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_oper.txt\")" 365 | ], 366 | "metadata": { 367 | "collapsed": false, 368 | "pycharm": { 369 | "name": "#%%\n" 370 | } 371 | } 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "outputs": [], 377 | "source": [ 378 | "hesin_oper[hesin_oper == \"\"] <- NA\n" 379 | ], 380 | "metadata": { 381 | "collapsed": false, 382 | "pycharm": { 383 | "name": "#%%\n" 384 | } 385 | } 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "outputs": [], 391 | "source": [ 392 | "hesin_oper_pre = hesin_oper %>% rename(date=\"opdate\", code=\"oper4\") %>% \n", 393 | " mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\")))) %>%\n", 394 | " mutate(origin=\"hes_opcs4\", instance=ins_index) %>% group_by(eid) %>% mutate(n = arr_index) %>% select(eid, origin, instance, n, level, code, date)" 395 | ], 396 | "metadata": { 397 | "collapsed": false, 398 | "pycharm": { 399 | "name": "#%%\n" 400 | } 401 | } 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "outputs": [], 407 | "source": [ 408 | "concept_ids_opcs4 = concept %>% filter(vocabulary_id == \"OPCS4\") %>% mutate(concept_code = str_replace(concept_code, \"\\\\.\", \"\"))\n", 409 | "concept_ids_snomed = concept %>% filter(vocabulary_id == \"SNOMED\" & domain_id==\"Procedure\") \n", 410 | "\n", 411 | "# check necessary opcs4 concept ids\n", 412 | "concept_ids = concept_ids_opcs4 %>% mutate(concept_id_1 = concept_id)\n", 413 | "\n", 414 | "cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_opcs4$concept_id) %>% filter(concept_id_2 %in% concept_ids_snomed$concept_id) %>% arrange(concept_id_1)" 415 | ], 416 | "metadata": { 417 | "collapsed": false, 418 | "pycharm": { 419 | "name": "#%%\n" 420 | } 421 | } 
422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "outputs": [], 427 | "source": [ 428 | "mapping_opcs4_snomed = concept_ids_opcs4 %>% \n", 429 | " left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c(\"concept_id\"=\"concept_id_1\")) %>% \n", 430 | " left_join(concept_ids_snomed %>% select(concept_id, concept_code, concept_name), by=c(\"concept_id_2\"=\"concept_id\")) %>% \n", 431 | " mutate(code = concept_code.x, meaning=concept_code.y, name=concept_name.y)" 432 | ], 433 | "metadata": { 434 | "collapsed": false, 435 | "pycharm": { 436 | "name": "#%%\n" 437 | } 438 | } 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "outputs": [], 444 | "source": [ 445 | "hes_procedures = hesin_oper_pre %>% left_join(mapping_opcs4_snomed %>% select(code, meaning, name), by=\"code\") %>% select(eid, origin, instance, n, level, date, code, meaning, name)" 446 | ], 447 | "metadata": { 448 | "collapsed": false, 449 | "pycharm": { 450 | "name": "#%%\n" 451 | } 452 | } 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "outputs": [], 458 | "source": [ 459 | "arrow::write_feather(hes_procedures, glue(\"{path}/codes_hes_procedures_210119.feather\"))" 460 | ], 461 | "metadata": { 462 | "collapsed": false, 463 | "pycharm": { 464 | "name": "#%%\n" 465 | } 466 | } 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "source": [ 471 | "## Mortality Records - ICD10" 472 | ], 473 | "metadata": { 474 | "collapsed": false 475 | } 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "outputs": [], 481 | "source": [ 482 | "death = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/death.txt\")\n", 483 | "death_cause = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/death_cause.txt\")" 484 | ], 485 | "metadata": { 486 | "collapsed": false, 487 | "pycharm": { 488 | "name": "#%%\n" 489 | } 490 | } 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "outputs": [], 496 | "source": [ 497 | "death_join = death[death_cause, on=c(\"eid\", \"ins_index\")]\n", 498 | "death_join = death_join[, c(\"eid\", \"ins_index\", \"arr_index\", \"level\", \"date_of_death\", \"cause_icd10\")][order(eid, ins_index, arr_index),]" 499 | ], 500 | "metadata": { 501 | "collapsed": false, 502 | "pycharm": { 503 | "name": "#%%\n" 504 | } 505 | } 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "outputs": [], 511 | "source": [ 512 | "death_join_date = death_join %>% rename(date=\"date_of_death\") %>% rename(code = \"cause_icd10\") %>% mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\"))))" 513 | ], 514 | "metadata": { 515 | "collapsed": false, 516 | "pycharm": { 517 | "name": "#%%\n" 518 | } 519 | } 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "outputs": [], 525 | "source": [ 526 | "codes_death = death_join_date %>% mutate(instance=0) %>% mutate(origin=\"death_records\") %>% group_by(eid) %>% mutate(n=row_number())\n", 527 | "codes_death = codes_death %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date))" 528 | ], 529 | "metadata": { 530 | "collapsed": false, 531 | "pycharm": { 532 | "name": "#%%\n" 533 | } 534 | } 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "outputs": [], 540 | "source": [ 541 | "arrow::write_feather(codes_death, 
glue(\"{path}/codes_death_records_210115.feather\"))" 542 | ], 543 | "metadata": { 544 | "collapsed": false, 545 | "pycharm": { 546 | "name": "#%%\n" 547 | } 548 | } 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "source": [ 553 | "## GP Records" 554 | ], 555 | "metadata": { 556 | "collapsed": false 557 | } 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "outputs": [], 563 | "source": [ 564 | "gp_registrations = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/gp_registrations.txt\")" 565 | ], 566 | "metadata": { 567 | "collapsed": false, 568 | "pycharm": { 569 | "name": "#%%\n" 570 | } 571 | } 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "outputs": [], 577 | "source": [ 578 | "gp_clinical = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/gp_clinical.txt\")" 579 | ], 580 | "metadata": { 581 | "collapsed": false, 582 | "pycharm": { 583 | "name": "#%%\n" 584 | } 585 | } 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "outputs": [], 591 | "source": [ 592 | "gp_clinical[gp_clinical == \"\"] <- NA" 593 | ], 594 | "metadata": { 595 | "collapsed": false, 596 | "pycharm": { 597 | "name": "#%%\n" 598 | } 599 | } 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "outputs": [], 605 | "source": [ 606 | "gp_clinical = gp_clinical %>% rename(date=\"event_dt\") %>% mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\"))))" 607 | ], 608 | "metadata": { 609 | "collapsed": false, 610 | "pycharm": { 611 | "name": "#%%\n" 612 | } 613 | } 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "outputs": [], 619 | "source": [ 620 | "# clean_dates\n", 621 | "# These data are provided in a form which is as close as possible to how they were issued from their source supplier, in order to avoid potential systematic error or bias by attempting to ‘clean’ them by\n", 622 | "# removing or altering invalid or erroneous information. However, to protect individuals, alterations have been made to dates in relation to participant date of birth as follows:\n", 623 | "\n", 624 | "# - where clinical event or prescription date precedes participant date of birth it has been altered to 01/01/1901.\n", 625 | "# - Where the date matches participant date of birth it has been altered to 02/02/1902.\n", 626 | "# - Where the date follows participant date of birth but is in the year of their birth it has been altered to 03/03/1903.\n", 627 | "# - Where the date was in the future this has been changed to 07/07/2037 as these are likely to have been entered as a place-holder or other system default." 
628 | ], 629 | "metadata": { 630 | "collapsed": false, 631 | "pycharm": { 632 | "name": "#%%\n" 633 | } 634 | } 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "outputs": [], 640 | "source": [ 641 | "gp_clinical = gp_clinical %>% filter(date!=\"2037-07-07\")" 642 | ], 643 | "metadata": { 644 | "collapsed": false, 645 | "pycharm": { 646 | "name": "#%%\n" 647 | } 648 | } 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "source": [ 653 | "### Diagnoses - ICD10" 654 | ], 655 | "metadata": { 656 | "collapsed": false 657 | } 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "outputs": [], 663 | "source": [ 664 | "readv2_icd10 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_v2_icd10.csv\"), -3) %>% rename(read_2=\"read_code\", code =\"icd10_code\") %>% select(read_2, code)\n", 665 | "readv3_icd10 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_ctv3_icd10.csv\"), -3)%>% rename(read_3=\"read_code\", code=\"icd10_code\") %>% select(read_3, code)" 666 | ], 667 | "metadata": { 668 | "collapsed": false, 669 | "pycharm": { 670 | "name": "#%%\n" 671 | } 672 | } 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "outputs": [], 678 | "source": [ 679 | "gp_diagnoses_pre = gp_clinical %>% filter(read_2 %in% readv2_icd10$read_2 | read_3 %in% readv3_icd10$read_3)\n", 680 | "gp_diagnoses_readv2 = gp_diagnoses_pre %>% filter(!is.na(read_2)) %>% left_join(readv2_icd10, by=\"read_2\") %>% drop_na(code) %>% mutate(origin=\"gp_read2\") %>% select(eid, origin, code, date)\n", 681 | "gp_diagnoses_readv3 = gp_diagnoses_pre %>% filter(!is.na(read_3)) %>% left_join(readv3_icd10, by=\"read_3\") %>% drop_na(code) %>% mutate(origin=\"gp_read3\") %>% select(eid, origin, code, date)\n", 682 | "gp_diagnoses_raw = rbind(gp_diagnoses_readv2, gp_diagnoses_readv3)" 683 | ], 684 | "metadata": { 685 | "collapsed": false, 686 | "pycharm": { 687 | "name": "#%%\n" 688 | } 689 | } 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "outputs": [], 695 | "source": [ 696 | "gp_diagnoses = gp_diagnoses_raw %>% mutate(instance=0, level=NA) %>% distinct() %>% group_by(eid) %>% mutate(n = row_number()) %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date))" 697 | ], 698 | "metadata": { 699 | "collapsed": false, 700 | "pycharm": { 701 | "name": "#%%\n" 702 | } 703 | } 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "outputs": [], 709 | "source": [ 710 | "arrow::write_feather(gp_diagnoses, glue(\"{path}/codes_gp_diagnoses_210119.feather\"))" 711 | ], 712 | "metadata": { 713 | "collapsed": false, 714 | "pycharm": { 715 | "name": "#%%\n" 716 | } 717 | } 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "source": [ 722 | "### Procedures - Snomed CT" 723 | ], 724 | "metadata": { 725 | "collapsed": false 726 | } 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "outputs": [], 732 | "source": [ 733 | "readv2_opcs4 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_v2_opcs4.csv\"), -3) %>% rename(read_2=\"read_code\", code =\"opcs_4.2_code\") %>% select(read_2, code)\n", 734 | "readv3_opcs4 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_ctv3_opcs4.csv\"), -3)%>% rename(read_3=\"read_code\", code=\"opcs4_code\") %>% 
select(read_3, code)" 735 | ], 736 | "metadata": { 737 | "collapsed": false, 738 | "pycharm": { 739 | "name": "#%%\n" 740 | } 741 | } 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "outputs": [], 747 | "source": [ 748 | "gp_procedures_pre = gp_clinical %>% filter(read_2 %in% readv2_opcs4$read_2 | read_3 %in% readv3_opcs4$read_3)" 749 | ], 750 | "metadata": { 751 | "collapsed": false, 752 | "pycharm": { 753 | "name": "#%%\n" 754 | } 755 | } 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "outputs": [], 761 | "source": [ 762 | "gp_procedures_readv2 = gp_procedures_pre %>% filter(!is.na(read_2)) %>% left_join(readv2_opcs4, on=\"read_2\") %>% drop_na(code) %>% mutate(origin=\"gp_read2\") %>% select(eid, origin, code, date)\n", 763 | "gp_procedures_readv3 = gp_procedures_pre %>% filter(!is.na(read_3)) %>% left_join(readv3_opcs4, on=\"read_3\") %>% drop_na(code) %>% mutate(origin=\"gp_read3\") %>% select(eid, origin, code, date)" 764 | ], 765 | "metadata": { 766 | "collapsed": false, 767 | "pycharm": { 768 | "name": "#%%\n" 769 | } 770 | } 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "outputs": [], 776 | "source": [ 777 | "gp_procedures_raw = rbind(gp_procedures_readv2, gp_procedures_readv3) %>% mutate(instance=0, level=NA) %>% distinct() %>% group_by(eid) %>% mutate(n = row_number()) " 778 | ], 779 | "metadata": { 780 | "collapsed": false, 781 | "pycharm": { 782 | "name": "#%%\n" 783 | } 784 | } 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "outputs": [], 790 | "source": [ 791 | "# opcs4 to snomed mapping\n", 792 | "\n", 793 | "concept_ids_opcs4 = concept %>% filter(vocabulary_id == \"OPCS4\") %>% mutate(concept_code = str_replace(concept_code, \"\\\\.\", \"\"))\n", 794 | "concept_ids_snomed = concept %>% filter(vocabulary_id == \"SNOMED\" & domain_id==\"Procedure\") \n", 795 | "\n", 796 | "# check necessary opcs4 concept ids\n", 797 | "concept_ids = concept_ids_opcs4 %>% mutate(concept_id_1 = concept_id)\n", 798 | "cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_opcs4$concept_id) %>% filter(concept_id_2 %in% concept_ids_snomed$concept_id) %>% arrange(concept_id_1)\n", 799 | "\n", 800 | "mapping_opcs4_snomed = concept_ids_opcs4 %>% \n", 801 | " left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c(\"concept_id\"=\"concept_id_1\")) %>% \n", 802 | " left_join(concept_ids_snomed %>% select(concept_id, concept_code, concept_name), by=c(\"concept_id_2\"=\"concept_id\")) %>% \n", 803 | " mutate(code = concept_code.x, meaning=concept_code.y, name=concept_name.y)" 804 | ], 805 | "metadata": { 806 | "collapsed": false, 807 | "pycharm": { 808 | "name": "#%%\n" 809 | } 810 | } 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "outputs": [], 816 | "source": [ 817 | "gp_procedures = gp_procedures_raw %>% left_join(mapping_opcs4_snomed %>% select(code, meaning, name), by=\"code\") %>% select(eid, origin, instance, n, level, date, code, meaning, name) %>% arrange(eid, date)" 818 | ], 819 | "metadata": { 820 | "collapsed": false, 821 | "pycharm": { 822 | "name": "#%%\n" 823 | } 824 | } 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "outputs": [], 830 | "source": [ 831 | "arrow::write_feather(gp_procedures, glue(\"{path}/codes_gp_procedures_210119.feather\"))" 832 | ], 833 | "metadata": { 834 | "collapsed": false, 835 | "pycharm": { 836 | "name": "#%%\n" 837 | } 838 | } 
839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "source": [ 843 | "### Measurements - Snomed CT" 844 | ], 845 | "metadata": { 846 | "collapsed": false 847 | } 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "outputs": [], 853 | "source": [ 854 | "readv2_readv3 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_v2_read_ctv3.csv\"), -3) %>% rename(read_2=\"READV2_CODE\", code =\"READV3_CODE\", name =\"TERMV3_DESC\") %>% select(read_2, code)" 855 | ], 856 | "metadata": { 857 | "collapsed": false, 858 | "pycharm": { 859 | "name": "#%%\n" 860 | } 861 | } 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "outputs": [], 867 | "source": [ 868 | "gp_meas = gp_clinical %>% filter(!is.na(value1)) %>% distinct()" 869 | ], 870 | "metadata": { 871 | "collapsed": false, 872 | "pycharm": { 873 | "name": "#%%\n" 874 | } 875 | } 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "outputs": [], 881 | "source": [ 882 | "gp_meas_readv2 = gp_meas %>% filter(!is.na(read_2)) %>% left_join(readv2_readv3, by=\"read_2\")" 883 | ], 884 | "metadata": { 885 | "collapsed": false, 886 | "pycharm": { 887 | "name": "#%%\n" 888 | } 889 | } 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "outputs": [], 895 | "source": [ 896 | "gp_meas_readv3 = gp_meas %>% filter(!is.na(read_3)) %>% mutate(code=read_3)" 897 | ], 898 | "metadata": { 899 | "collapsed": false, 900 | "pycharm": { 901 | "name": "#%%\n" 902 | } 903 | } 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "outputs": [], 909 | "source": [ 910 | "gp_meas_all = rbind(gp_meas_readv2, gp_meas_readv3) %>% distinct() %>% group_by(eid) " 911 | ], 912 | "metadata": { 913 | "collapsed": false, 914 | "pycharm": { 915 | "name": "#%%\n" 916 | } 917 | } 918 | }, 919 | { 920 | "cell_type": "code", 921 | "execution_count": null, 922 | "outputs": [], 923 | "source": [ 924 | "readv3_lkp = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_ctv3_lkp.csv\"), -3)%>% rename(code=\"read_code\", name =\"term_description\") %>% select(code, name)\n", 925 | "readv3_sct = fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/CTV3SCTMAP.csv\")%>% rename(SCUI=\"V1\", STUI=\"V2\", TCUI=\"V3\", TTUI=\"V4\")%>% rename(code=\"SCUI\", meaning=\"TCUI\") %>% select(code, meaning)\n", 926 | "#readct_sct = fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/RCTSCTMAP.csv\")%>% rename(SCUI=\"V1\", STUI=\"V2\", TCUI=\"V3\", TTUI=\"V4\")#%>% rename(code=\"read_code\", name =\"term_description\") %>% select(code, name)#" 927 | ], 928 | "metadata": { 929 | "collapsed": false, 930 | "pycharm": { 931 | "name": "#%%\n" 932 | } 933 | } 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": null, 938 | "outputs": [], 939 | "source": [ 940 | "gp_meas = gp_meas_all %>% left_join(readv3_lkp, by=\"code\")" 941 | ], 942 | "metadata": { 943 | "collapsed": false, 944 | "pycharm": { 945 | "name": "#%%\n" 946 | } 947 | } 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "outputs": [], 953 | "source": [ 954 | "concept_ids_snomed = concept %>% filter(vocabulary_id == \"SNOMED\") %>% rename(name=\"concept_name\", meaning=\"concept_code\") %>% select(meaning, name)" 955 | ], 956 | "metadata": { 957 | "collapsed": false, 958 | "pycharm": { 959 | "name": "#%%\n" 960 | } 961 | } 962 | 
}, 963 | { 964 | "cell_type": "code", 965 | "execution_count": null, 966 | "outputs": [], 967 | "source": [ 968 | "gp_meas_uncleaned = gp_meas_all %>% left_join(readv3_sct, by=\"code\") %>% left_join(concept_ids_snomed, by=\"meaning\") %>% distinct()" 969 | ], 970 | "metadata": { 971 | "collapsed": false, 972 | "pycharm": { 973 | "name": "#%%\n" 974 | } 975 | } 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": null, 980 | "outputs": [], 981 | "source": [ 982 | "gp_meas_cleaned_1 = gp_meas_uncleaned %>% select(eid, date, code, value1, value2, value3, meaning, name) %>% distinct() %>% filter(value1!=0)" 983 | ], 984 | "metadata": { 985 | "collapsed": false, 986 | "pycharm": { 987 | "name": "#%%\n" 988 | } 989 | } 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "outputs": [], 995 | "source": [ 996 | "gp_meas_cleaned_2 = gp_meas_cleaned_1 %>% ungroup() %>% filter(!is.na(meaning))" 997 | ], 998 | "metadata": { 999 | "collapsed": false, 1000 | "pycharm": { 1001 | "name": "#%%\n" 1002 | } 1003 | } 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": null, 1008 | "outputs": [], 1009 | "source": [ 1010 | "double_df = gp_meas_cleaned_2 %>% filter(!is.na(as.numeric(value1)) & !is.na(as.numeric(value2))) " 1011 | ], 1012 | "metadata": { 1013 | "collapsed": false, 1014 | "pycharm": { 1015 | "name": "#%%\n" 1016 | } 1017 | } 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": null, 1022 | "outputs": [], 1023 | "source": [ 1024 | "# clean blood pressure and map to systolic and diastolic\n", 1025 | "bp_double_mapped = double_df %>% filter(name %in% c('O/E - blood pressure reading', 'O/E - BP reading normal', 'O/E - BP reading raised',\n", 1026 | " 'O/E - BP borderline raised', 'O/E - Systolic BP reading', 'O/E - Diastolic BP reading', 'Sitting blood pressure', \"Average home systolic blood pressure\",\n", 1027 | " 'Standing blood pressure','24 hr blood pressure monitoring')) %>% \n", 1028 | " #filter(name %in% c('O/E - Systolic BP reading', 'O/E - Diastolic BP reading', \"Average home systolic blood pressure\")) %>%\n", 1029 | " filter(as.numeric(value1)>0) %>% \n", 1030 | " mutate(value_high = pmax(as.numeric(value1), as.numeric(value2)), value_low = pmin(as.numeric(value1), as.numeric(value2))) %>% \n", 1031 | " filter(value_high>40 & value_low>20 & value_high<400 & value_low<300) %>% rename(\"163030003\" = \"value_high\", \"163031004\" = \"value_low\") %>% \n", 1032 | " select(-c(meaning, name)) %>% pivot_longer(c(\"163030003\", \"163031004\"), names_to=\"meaning\", values_to=\"value\") %>% left_join(concept_ids_snomed, by=\"meaning\") %>% distinct() %>% arrange(eid) %>%\n", 1033 | " select(eid, date, code, value1, value2, value3, meaning, name, value)" 1034 | ], 1035 | "metadata": { 1036 | "collapsed": false, 1037 | "pycharm": { 1038 | "name": "#%%\n" 1039 | } 1040 | } 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "outputs": [], 1046 | "source": [ 1047 | "gp_meas_single = gp_meas_cleaned_2 %>% filter(is.na(as.numeric(value1)) | is.na(as.numeric(value2))) %>%\n", 1048 | " mutate(value=case_when(!is.na(as.numeric(value1)) ~ as.numeric(value1), is.na(as.numeric(value1)) ~ as.numeric(value2))) %>% filter(!is.na(value))" 1049 | ], 1050 | "metadata": { 1051 | "collapsed": false, 1052 | "pycharm": { 1053 | "name": "#%%\n" 1054 | } 1055 | } 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "outputs": [], 1061 | "source": [ 1062 | 
"gp_meas_cleaned_3 = rbind(gp_meas_single, bp_double_mapped) %>% distinct() %>% arrange(eid, date)" 1063 | ], 1064 | "metadata": { 1065 | "collapsed": false, 1066 | "pycharm": { 1067 | "name": "#%%\n" 1068 | } 1069 | } 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "outputs": [], 1075 | "source": [ 1076 | "arrow::write_feather(gp_meas_cleaned_3, glue(\"{path}/codes_gp_measurements_210120.feather\"))" 1077 | ], 1078 | "metadata": { 1079 | "collapsed": false, 1080 | "pycharm": { 1081 | "name": "#%%\n" 1082 | } 1083 | } 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "source": [ 1088 | "### Prescriptions - RXNorm" 1089 | ], 1090 | "metadata": { 1091 | "collapsed": false 1092 | } 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": null, 1097 | "outputs": [], 1098 | "source": [ 1099 | "gp_scripts = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/gp_scripts.txt\")" 1100 | ], 1101 | "metadata": { 1102 | "collapsed": false, 1103 | "pycharm": { 1104 | "name": "#%%\n" 1105 | } 1106 | } 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": null, 1111 | "outputs": [], 1112 | "source": [ 1113 | "gp_scripts[gp_scripts == \"\"] <- NA" 1114 | ], 1115 | "metadata": { 1116 | "collapsed": false, 1117 | "pycharm": { 1118 | "name": "#%%\n" 1119 | } 1120 | } 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": null, 1125 | "outputs": [], 1126 | "source": [ 1127 | "gp_scripts = gp_scripts %>% mutate(date = ymd(as.Date(fast_strptime(issue_date, \"%d/%m/%Y\"))))" 1128 | ], 1129 | "metadata": { 1130 | "collapsed": false, 1131 | "pycharm": { 1132 | "name": "#%%\n" 1133 | } 1134 | } 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "outputs": [], 1140 | "source": [ 1141 | "gp_scripts_names_available = gp_scripts %>% filter(!is.na(drug_name))" 1142 | ], 1143 | "metadata": { 1144 | "collapsed": false, 1145 | "pycharm": { 1146 | "name": "#%%\n" 1147 | } 1148 | } 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": null, 1153 | "outputs": [], 1154 | "source": [ 1155 | "gp_scripts_read_available = gp_scripts %>% filter(is.na(drug_name))" 1156 | ], 1157 | "metadata": { 1158 | "collapsed": false, 1159 | "pycharm": { 1160 | "name": "#%%\n" 1161 | } 1162 | } 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "execution_count": null, 1167 | "outputs": [], 1168 | "source": [ 1169 | "drug_names = (gp_scripts_names_available %>% count(drug_name, sort=TRUE))$drug_name" 1170 | ], 1171 | "metadata": { 1172 | "collapsed": false, 1173 | "pycharm": { 1174 | "name": "#%%\n" 1175 | } 1176 | } 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": null, 1181 | "outputs": [], 1182 | "source": [ 1183 | "library(jsonlite)\n", 1184 | "write_json(drug_names, glue(\"{path}/drug_names.json\"))" 1185 | ], 1186 | "metadata": { 1187 | "collapsed": false, 1188 | "pycharm": { 1189 | "name": "#%%\n" 1190 | } 1191 | } 1192 | }, 1193 | { 1194 | "cell_type": "code", 1195 | "execution_count": null, 1196 | "outputs": [], 1197 | "source": [ 1198 | "name_umls_link = arrow::read_feather(glue(\"{path}/drug_names_umls_linked.feather\"))" 1199 | ], 1200 | "metadata": { 1201 | "collapsed": false, 1202 | "pycharm": { 1203 | "name": "#%%\n" 1204 | } 1205 | } 1206 | }, 1207 | { 1208 | "cell_type": "code", 1209 | "execution_count": null, 1210 | "outputs": [], 1211 | "source": [ 1212 | "drugs_rxnorm = 
arrow::read_feather(glue(\"{path}/drug_names_umls_linked_rxnorm.feather\"))" 1213 | ], 1214 | "metadata": { 1215 | "collapsed": false, 1216 | "pycharm": { 1217 | "name": "#%%\n" 1218 | } 1219 | } 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": null, 1224 | "outputs": [], 1225 | "source": [ 1226 | "rx_mapping = concept %>% filter(vocabulary_id %in% c('RxNorm','RxNorm Extension')) %>% select(concept_code, concept_name) %>% rename(rx_code =\"concept_code\", name=\"concept_name\")" 1227 | ], 1228 | "metadata": { 1229 | "collapsed": false, 1230 | "pycharm": { 1231 | "name": "#%%\n" 1232 | } 1233 | } 1234 | }, 1235 | { 1236 | "cell_type": "code", 1237 | "execution_count": null, 1238 | "outputs": [], 1239 | "source": [ 1240 | "rx_norm_mapping_table = drugs_rxnorm %>% select(drug_name, rx_code) %>% filter(rx_code != \"\") %>% distinct() %>% left_join(rx_mapping, by=\"rx_code\")" 1241 | ], 1242 | "metadata": { 1243 | "collapsed": false, 1244 | "pycharm": { 1245 | "name": "#%%\n" 1246 | } 1247 | } 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "outputs": [], 1253 | "source": [ 1254 | "gp_scripts_rxnorm = gp_scripts_names_available %>% left_join(rx_norm_mapping_table, by=\"drug_name\") %>% select(eid, date, drug_name, rx_code, name) %>% distinct()" 1255 | ], 1256 | "metadata": { 1257 | "collapsed": false, 1258 | "pycharm": { 1259 | "name": "#%%\n" 1260 | } 1261 | } 1262 | }, 1263 | { 1264 | "cell_type": "code", 1265 | "execution_count": null, 1266 | "outputs": [], 1267 | "source": [ 1268 | "arrow::write_feather(gp_scripts_rxnorm, glue(\"{path}/codes_gp_prescription_scispacy.feather\"))" 1269 | ], 1270 | "metadata": { 1271 | "collapsed": false, 1272 | "pycharm": { 1273 | "name": "#%%\n" 1274 | } 1275 | } 1276 | } 1277 | ], 1278 | "metadata": { 1279 | "hide_input": false, 1280 | "kernelspec": { 1281 | "display_name": "R [conda env:python]", 1282 | "language": "R", 1283 | "name": "conda-env-python-r" 1284 | }, 1285 | "language_info": { 1286 | "codemirror_mode": "r", 1287 | "file_extension": ".r", 1288 | "mimetype": "text/x-r-source", 1289 | "name": "R", 1290 | "pygments_lexer": "r", 1291 | "version": "4.0.3" 1292 | } 1293 | }, 1294 | "nbformat": 4, 1295 | "nbformat_minor": 4 1296 | } --------------------------------------------------------------------------------