├── metabolomicstatemodel ├── source │ ├── __init__.py │ ├── config │ │ ├── features │ │ │ ├── PANELmetabolitesOverlap.yaml │ │ │ ├── AgeSex.yaml │ │ │ ├── PANELmetabolites.yaml │ │ │ ├── PANEL.yaml │ │ │ └── Metabolomics.yaml │ │ └── config.yaml │ ├── logging.py │ ├── evaluation.py │ ├── losses.py │ ├── utils.py │ ├── callbacks.py │ ├── modules.py │ ├── datasets.py │ └── datamodules.py ├── train.sh └── train.py ├── src ├── fig1.png ├── fig2.png └── msm_logo.png ├── analysis ├── preprocessing │ ├── 0_decode_ukbb.ipynb │ ├── pipeline_metabolomics.py │ ├── 2_preprocessing_clinical_endpoints.ipynb │ └── 1_preprocessing_dataportal.ipynb └── examples │ ├── assets.yaml │ └── sample.csv ├── README.md └── LICENSE /metabolomicstatemodel/source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thbuerg/MetabolomicsCommonDiseases/HEAD/src/fig1.png -------------------------------------------------------------------------------- /src/fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thbuerg/MetabolomicsCommonDiseases/HEAD/src/fig2.png -------------------------------------------------------------------------------- /src/msm_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thbuerg/MetabolomicsCommonDiseases/HEAD/src/msm_logo.png -------------------------------------------------------------------------------- /metabolomicstatemodel/train.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | echo $(hostname) 3 | echo $(which python) 4 | echo $(python -c 'import torch; print(f"found {torch.cuda.device_count()} gpus.")') 5 | echo $CUDA_VISIBLE_DEVICES 6 | 7 | python train.py --config-dir source/config/ --config-name config 8 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/PANELmetabolitesOverlap.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.PANEL 2 | categorical: 3 | basics: [] 4 | questionnaire: [] 5 | one_hot_enc: 6 | basics: [] 7 | questionnaire: [] 8 | general: 9 | pgs: [] 10 | measurements: [] 11 | labs: [ 12 | "albumin", 13 | "cholesterol", 14 | "hdl_cholesterol", 15 | "ldl_direct", 16 | "triglycerides", 17 | "glucose", 18 | "creatinine", 19 | ] 20 | family_history: [] 21 | diagnoses: [] 22 | medications: [] 23 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/AgeSex.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.CVDCoreVariablesWithPGS 2 | categorical: 3 | basics: [ 4 | 'age_at_recruitment', 5 | 'sex' 6 | ] 7 | questionnaire: [ 8 | ] 9 | metabolomics: [] 10 | one_hot_enc: 11 | basics: [ 12 | 'age_at_recruitment', 13 | 'sex' 14 | ] 15 | questionnaire: [ 16 | ] 17 | metabolomics: [] 18 | general: 19 | metabolomics: [] 20 | measurements: [ 21 | ] 22 | labs: [ 23 | ] 24 | family_history: [ 25 | ] 26 | diagnoses: [] 27 | medications: [] -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/PANELmetabolites.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.PANEL 2 | categorical: 3 | basics: [] 4 | questionnaire: [] 5 | one_hot_enc: 6 | basics: [] 7 | questionnaire: [] 8 | general: 9 | pgs: [] 10 | measurements: [] 11 | labs: [ 12 | "albumin", 13 | "cholesterol", 14 | "hdl_cholesterol", 15 | "ldl_direct", 16 | "triglycerides", 17 | "glucose", 18 | "creatinine", 19 | "urea", 20 | "urate", 21 | ] 22 | family_history: [] 23 | diagnoses: [] 24 | medications: [] 25 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/logging.py: -------------------------------------------------------------------------------- 1 | from neptune.new.integrations.pytorch_lightning import NeptuneLogger 2 | 3 | from typing import Any, Dict, Iterable, Optional, Union 4 | from argparse import Namespace 5 | 6 | 7 | class FoolProofNeptuneLogger(NeptuneLogger): 8 | """ 9 | Logger that does only log params if they do not exceed the str len limit. 
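Concretely (see `log_hyperparams` below): any parameter whose string representation is 16384 characters or longer is dropped before the remaining parameters are written to the run, so oversized entries do not trip Neptune's string-length limit.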
10 | """ 11 | def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: 12 | params = self._convert_params(params) 13 | 14 | parameters_key = self.PARAMETERS_KEY 15 | if self._base_namespace: 16 | parameters_key = f'{self._base_namespace}/{parameters_key}' 17 | 18 | keys_to_pop = [] 19 | for k, v in params.items(): 20 | if len(str(v)) >= 16384: 21 | keys_to_pop.append(k) 22 | for k in keys_to_pop: 23 | params.pop(k) 24 | 25 | self.run[parameters_key] = params 26 | 27 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from lifelines import CRCSplineFitter 4 | 5 | 6 | def get_observed_probability(F_t, events, durations, t0: float): 7 | def ccl(p): return np.log(-np.log(1 - p)) 8 | 9 | T = "time" 10 | E = "event" 11 | 12 | predictions_at_t0 = np.clip(F_t, 1e-10, 1 - 1e-10) 13 | prediction_df = pd.DataFrame({f"ccl_at_{t0}": ccl(predictions_at_t0), T: durations, E: events}) 14 | 15 | if any(x <= 1 for x in events): 16 | pass 17 | else: 18 | prediction_df["event"] = [0 if v > 1 else v for v in prediction_df["event"].to_list()] 19 | 20 | index_old = prediction_df.index 21 | prediction_df = prediction_df.dropna() 22 | index_new = prediction_df.index 23 | diff = index_old.difference(index_new) 24 | 25 | knots = 3 26 | regressors = {"beta_": [f"ccl_at_{t0}"], **{f"gamma{i}_": "1" for i in range(knots)}} 27 | 28 | crc = CRCSplineFitter(knots, penalizer=0.001).fit(prediction_df, T, E, regressors=regressors, show_progress=False) 29 | 30 | risk_obs = (1 - crc.predict_survival_function(prediction_df, times=[t0])).T.squeeze() 31 | 32 | return risk_obs, diff.to_list() 33 | -------------------------------------------------------------------------------- /analysis/preprocessing/0_decode_ukbb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "library(ukbtools)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "outputs": [], 16 | "source": [ 17 | "in_dir = \"dir/where/decoded/file/is\"\n", 18 | "out_path = \"dir/where/to/write/feather\"" 19 | ], 20 | "metadata": { 21 | "collapsed": false, 22 | "pycharm": { 23 | "name": "#%%\n" 24 | } 25 | } 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "my_ukb_data <- ukb_df(\"decoded\", in_dir)\n", 34 | "df_field <- ukb_df_field(\"decoded\", in_dir)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "pycharm": { 42 | "name": "#%%\n" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "arrow::write_feather(my_ukb_data, out_path)\n", 48 | "arrow::write_feather(df_field, out_path)" 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "R [conda env:python]", 55 | "language": "R", 56 | "name": "conda-env-python-r" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": "r", 60 | "file_extension": ".r", 61 | "mimetype": "text/x-r-source", 62 | "name": "R", 63 | "pygments_lexer": "r", 64 | "version": "4.0.3" 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 4 69 | } -------------------------------------------------------------------------------- 
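A minimal sketch (not part of the repository) of reading the decoded table back on the Python side for the downstream preprocessing notebooks; the path is a placeholder for whatever `out_path` was set to in the notebook above:

```python
# Sketch only: load the feather file written by 0_decode_ukbb.ipynb.
# Requires pandas with pyarrow installed; the path below is a placeholder.
import pandas as pd

ukb_df = pd.read_feather("dir/where/to/write/feather")  # same path as `out_path` above
print(ukb_df.shape)              # one row per participant, one column per decoded UKB field
print(list(ukb_df.columns[:5]))  # column names follow the ukbtools naming scheme
```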
/metabolomicstatemodel/source/config/features/PANEL.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.feature_set.PANEL 2 | categorical: 3 | basics: [ 4 | 'age_at_recruitment', 5 | 'sex', 6 | ] 7 | questionnaire: [ 8 | ] 9 | one_hot_enc: 10 | basics: [ 11 | 'sex', 12 | 'age_at_recruitment' 13 | ] 14 | questionnaire: [ 15 | 'smoking_status_2.0', 16 | 'alcohol_intake_frequency_2.0', 17 | ] 18 | general: 19 | pgs: [] 20 | measurements: [ 21 | 'daily_physical_activity', 22 | 'daily_healthy_food', 23 | 'education_years', 24 | 'body_mass_index_bmi', 25 | 'waist_hip_ratio', 26 | 'weight', 27 | "standing_height", 28 | 'systolic_blood_pressure', 29 | ] 30 | labs: [ 31 | "cholesterol", 32 | "hdl_cholesterol", 33 | "ldl_direct", 34 | "triglycerides", 35 | "glucose", 36 | "glycated_haemoglobin_hba1c", 37 | "creatinine", 38 | "cystatin_c", 39 | "urea", 40 | "urate", 41 | 'aspartate_aminotransferase', 42 | 'alanine_aminotransferase', 43 | 'alkaline_phosphatase', 44 | 'albumin', 45 | "creactive_protein", 46 | 'red_blood_cell_erythrocyte_count', 47 | 'white_blood_cell_leukocyte_count', 48 | 'platelet_count', 49 | 'haemoglobin_concentration', 50 | 'haematocrit_percentage', 51 | 'mean_corpuscular_haemoglobin', 52 | 'mean_corpuscular_volume', 53 | 'mean_corpuscular_haemoglobin_concentration' 54 | ] 55 | family_history: [ 56 | 'fh_diabetes', 57 | ] 58 | diagnoses: [] 59 | medications: [ 60 | "antihypertensives", 61 | ] 62 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal, LogNormal, Weibull, transform_to 5 | 6 | import numpy as np 7 | 8 | 9 | def cox_ph_loss(logh, durations, events, eps=1e-7): 10 | """ 11 | Simple approximation of the COX-ph. Log hazard is not computed on risk-sets, but on ranked list instead. 12 | This approximation is valid for datamodules w/ low percentage of ties. 13 | Credit to Haavard Kamme/PyCox 14 | :param logh: 15 | :param durations: 16 | :param events: 17 | :param eps: 18 | :return: 19 | """ 20 | # sort: 21 | idx = durations.sort(descending=True, dim=0)[1] 22 | events = events[idx].squeeze(-1) 23 | logh = logh[idx].squeeze(-1) 24 | # calculate loss: 25 | gamma = logh.max() 26 | log_cumsum_h = logh.sub(gamma).exp().cumsum(0).add(eps).log().add(gamma) 27 | if events.sum() > 0: 28 | loss = - logh.sub(log_cumsum_h).mul(events).sum().div(events.sum()) 29 | else: 30 | loss = - logh.sub(log_cumsum_h).mul(events).sum() 31 | return loss 32 | 33 | def DSM_uncensored_loss(logf_ts, ks, events, e=1): 34 | """ 35 | We minimize the ELBO of log P(DATASET_uncensored) 36 | equalling the negative sum over all log hazards. 
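Per the implementation below, each uncensored sample (events == e) contributes log(sum_k softmax(ks)_k * f_k(t)), computed as logsumexp(log_softmax(ks) + logf_ts), and the returned loss is the negative mean of this term over the masked samples.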
37 | inputs are expected to be 2D Tensors of shape [B, k_dim] 38 | :param logf_t: 39 | :param durations: 40 | :param events: 41 | :return: 42 | """ 43 | 44 | e_ = torch.Tensor([e]) 45 | e_ = e_.type_as(logf_ts) 46 | zero_ = torch.Tensor([0]) 47 | zero_ = zero_.type_as(logf_ts) 48 | 49 | elbo = torch.logsumexp(F.log_softmax(ks, dim=1)+logf_ts, dim=1, keepdim=True) 50 | mask = torch.eq(events, e_) 51 | elbo = elbo[mask] 52 | 53 | if torch.eq(mask.sum(), zero_): 54 | return torch.Tensor([1.0]).squeeze().type_as(logf_ts) 55 | else: 56 | return -elbo.sum() / (mask.sum()) 57 | 58 | 59 | def DSM_censored_loss(logS_ts, ks, events, e=1): 60 | """ 61 | NLL on log hazards. 62 | 63 | For competing risks, all other events are treated as administrative censoring. 64 | 65 | :param logh: 66 | :param durations: 67 | :param events: 68 | :return: 69 | """ 70 | e_ = torch.Tensor([e]) 71 | e_ = e_.type_as(logS_ts) 72 | 73 | elbo = torch.logsumexp(F.log_softmax(ks, dim=1)+logS_ts, dim=1, keepdim=True) 74 | mask = torch.ne(events, e_) 75 | elbo = elbo[mask] 76 | 77 | return -elbo.sum()/mask.sum() 78 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytorch_lightning as pl 6 | 7 | from .logging import FoolProofNeptuneLogger 8 | 9 | 10 | #################################################################################################### 11 | # neptune # 12 | #################################################################################################### 13 | 14 | def set_up_neptune(FLAGS={}, close_after_fit=False, **kwargs): 15 | """ 16 | Set up a neptune logger from file. 17 | :param keyfile: 18 | :param project: 19 | :param name: 20 | :param params: 21 | :param tags: 22 | :param close_after_fit: 23 | :param kwargs: 24 | :return: 25 | """ 26 | if not "NEPTUNE_API_TOKEN" in os.environ: 27 | raise EnvironmentError('Please set environment variable `NEPTUNE_API_TOKEN`.') 28 | 29 | neptune_logger = FoolProofNeptuneLogger(api_key=os.environ["NEPTUNE_API_TOKEN"], 30 | close_after_fit=close_after_fit, 31 | **FLAGS.setup) 32 | return neptune_logger 33 | 34 | 35 | def get_default_callbacks(monitor='Ctd_0.9', mode='max', early_stop=True): 36 | """ 37 | Instantate the default callbacks: EarlyStopping and Checkpointing. 
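The returned list holds a ModelCheckpoint (top 3 checkpoints plus the last, tracking `monitor` with `mode`), an EarlyStopping callback (patience 15) when `early_stop` is True, and a LearningRateMonitor.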
38 | 39 | :param monitor: 40 | :param mode: 41 | :return: 42 | """ 43 | checkpoint_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(monitor=monitor, verbose=True, 44 | save_last=True, save_top_k=3, 45 | save_weights_only=False, mode=mode, 46 | period=1) 47 | lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step', log_momentum=False) 48 | if early_stop: 49 | early_stop = pl.callbacks.early_stopping.EarlyStopping(monitor=monitor, min_delta=1e-5, patience=15, 50 | verbose=True, mode=mode, strict=False) 51 | return [checkpoint_callback, early_stop, lr_monitor] 52 | else: 53 | return [checkpoint_callback, lr_monitor] 54 | 55 | 56 | def attribution2df(attributions, feature_names, loader): 57 | attribution_sum = attributions.detach().numpy().sum(0) 58 | attribution_norm_sum = attribution_sum / np.linalg.norm(attribution_sum, ord=1) 59 | axis_data = np.arange(loader.shape[1]) 60 | data_labels = list(map(lambda idx: feature_names[idx], axis_data)) 61 | df = pd.DataFrame({'feature': data_labels, 62 | 'importance': attribution_norm_sum}) 63 | sorted_df = df.reindex(df.importance.abs().sort_values(ascending=False).index) 64 | return sorted_df 65 | 66 | -------------------------------------------------------------------------------- /metabolomicstatemodel/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import warnings 4 | 5 | import hydra 6 | import torch 7 | import torch.nn as nn 8 | import pandas as pd 9 | import pytorch_lightning as pl 10 | import neptune.new as neptune 11 | 12 | from omegaconf import DictConfig, ListConfig, OmegaConf 13 | from torch.nn import Sigmoid, SELU, ReLU 14 | from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau, MultiStepLR 15 | from pytorch_lightning.callbacks import LearningRateMonitor, StochasticWeightAveraging 16 | 17 | from source.datamodules import * 18 | from source.tasks import * 19 | from source.modules import * 20 | from source.utils import set_up_neptune, get_default_callbacks 21 | from source.callbacks import WriteCheckpointLogs, WritePredictionsDataFrame 22 | 23 | 24 | # globals: 25 | warnings.filterwarnings("ignore", category=RuntimeWarning) 26 | warnings.filterwarnings("ignore", category=UserWarning) 27 | pd.options.mode.use_inf_as_na = True 28 | pl.seed_everything(23) #the number of doom 29 | 30 | 31 | assert os.environ['NEPTUNE_API_TOKEN'], 'No Neptune API Token found. Please do `export NEPTUNE_API_TOKEN=`.' 
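# Note: the class names given in the Hydra config (experiment.task, experiment.module,
# experiment.datamodule, experiment.latent_module) are resolved with eval() inside
# train() below, so they must be importable via the star-imports above
# (source.datamodules / source.tasks / source.modules).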
32 | config_path = "source/config/" 33 | 34 | 35 | @hydra.main(config_path, config_name="config") 36 | def train(FLAGS: DictConfig): 37 | OmegaConf.set_struct(FLAGS, False) 38 | FLAGS.config_path = config_path 39 | 40 | # get classes 41 | Task = eval(FLAGS.experiment.task) 42 | Module = eval(FLAGS.experiment.module) 43 | DataModule = eval(FLAGS.experiment.datamodule) 44 | if FLAGS.experiment.latent_module is not None: 45 | LatentModule = eval(FLAGS.experiment.latent_module) 46 | else: 47 | LatentModule = None 48 | 49 | # initialize datamodule 50 | # load features.yaml if necessary: 51 | if FLAGS.experiment.feature_set is not None: 52 | FLAGS.experiment.features = OmegaConf.load(os.path.join(FLAGS.config_path, FLAGS.experiment.features_yaml)) 53 | datamodule = DataModule(**FLAGS.experiment) 54 | datamodule.prepare_data() 55 | datamodule.setup("fit") 56 | FLAGS["data"] = {"feature_names": datamodule.features} 57 | 58 | # get network: 59 | ft_extractor = Module(input_dim=len(datamodule.features), **FLAGS.experiment.module_kwargs) 60 | if LatentModule is not None: 61 | if LatentModule == ResidualHeadMLP: 62 | FLAGS.experiment.latent_module_kwargs.skip_connection_input_dim = len(datamodule.features) 63 | cause_specific = LatentModule(**FLAGS.experiment.latent_module_kwargs) 64 | else: 65 | cause_specific = nn.Identity() 66 | 67 | # initialize Task 68 | task = Task(feature_extractor=ft_extractor, 69 | latent_mlp=cause_specific, 70 | feature_dim=len(datamodule.features), 71 | **FLAGS.experiment) 72 | 73 | # initialize trainer 74 | callbacks = get_default_callbacks(monitor=FLAGS.experiment.monitor) 75 | callbacks.extend([WriteCheckpointLogs(), 76 | WritePredictionsDataFrame( 77 | write_calibrated_predictions=FLAGS.experiment.write_calibrated_predictions) 78 | ] 79 | ) 80 | 81 | trainer = pl.Trainer(**FLAGS.trainer, 82 | callbacks=callbacks, 83 | logger=set_up_neptune(FLAGS)) 84 | 85 | FLAGS["parameters/callbacks"] = [c.__class__.__name__ for c in callbacks] 86 | trainer.logger.run["FLAGS"] = FLAGS 87 | 88 | if FLAGS.trainer.auto_lr_find: 89 | trainer.tune(model=task, datamodule=datamodule) 90 | 91 | # run 92 | trainer.fit(task, datamodule) 93 | trainer.logger.run.stop() 94 | 95 | 96 | if __name__ == '__main__': 97 | train() 98 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pandas as pd 4 | 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | from pytorch_lightning.callbacks.base import Callback 8 | 9 | 10 | class WriteCheckpointLogs(Callback): 11 | """ 12 | Write final logs to neptune. 13 | """ 14 | def on_keyboard_interrupt(self, trainer, pl_module, device='cuda:0'): 15 | self.on_epoch_end(trainer, pl_module) 16 | 17 | def on_epoch_end(self, trainer, pl_module): 18 | if isinstance(trainer.logger, list): 19 | logger = trainer.logger[0] 20 | else: 21 | logger = trainer.logger 22 | if torch.is_tensor(trainer.checkpoint_callback.best_model_score): 23 | logger.run["checkpoint_metric"] = trainer.checkpoint_callback.monitor 24 | logger.run["checkpoint_value"] = str(trainer.checkpoint_callback.best_model_score.item()) 25 | logger.run["checkpoint_path"] = trainer.checkpoint_callback.best_model_path 26 | 27 | 28 | class WritePredictionsDataFrame(Callback): 29 | """ 30 | Write Predictions generated by `predict_dataset` or `predict_dataset_with_uncertainty` that return pd.DataFrames. 
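On fit end (or keyboard interrupt) the best checkpoint is reloaded, predictions for the train/valid/test splits are concatenated together with bookkeeping columns (eid, split, partition, module, net, datamodule, event_names, feature_names), and the result is written to predictions.feather / predictions.csv in a predictions/ folder next to the checkpoints and registered in the Neptune run.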
31 | """ 32 | def __init__(self, write_calibrated_predictions=True, **kwargs): 33 | super().__init__() 34 | self.write_calibrated_predictions = write_calibrated_predictions 35 | 36 | def on_keyboard_interrupt(self, trainer, module, device='cuda:0'): 37 | self.on_fit_end(trainer, module, device) 38 | 39 | def on_fit_end(self, trainer, module, device='cuda:0'): # how to set inference device better? adaptive to train device? 40 | ckpt = torch.load(trainer.checkpoint_callback.best_model_path) 41 | module.load_state_dict(ckpt['state_dict']) 42 | module.eval() 43 | module.to(device) 44 | 45 | time_max = 26 # effective real time max is time_max-2 -> 25 years 46 | times = [e for e in range(1, time_max, 1)] 47 | if self.write_calibrated_predictions: 48 | module.fit_isotonic_regressor(trainer.datamodule.train_ds, times, 100000) 49 | 50 | # write the predictions.csv 51 | predictions = {} 52 | for ds_idx, (ds, ds_name) in enumerate(tqdm([(trainer.datamodule.train_ds, 'train'), 53 | (trainer.datamodule.valid_ds, 'valid'), 54 | (trainer.datamodule.test_ds, 'test')])): 55 | if self.write_calibrated_predictions: 56 | predictions[ds_name] = module.predict_dataset_calibrated(ds, times) 57 | else: 58 | predictions[ds_name] = module.predict_dataset(ds, times) 59 | predictions[ds_name]['eid'] = ds.datasets[0].eid_map.index.values 60 | predictions[ds_name]["split"] = ds_name 61 | predictions_df = pd.concat([*predictions.values()]).reset_index(drop=True) 62 | predictions_df["partition"] = trainer.datamodule.cv_partition 63 | predictions_df["module"] = type(module).__name__ 64 | try: 65 | predictions_df["net"] = type(module.net).__name__ 66 | except AttributeError: 67 | pass 68 | predictions_df["datamodule"] = type(trainer.datamodule).__name__ 69 | predictions_df["event_names"] = str(trainer.datamodule.event) 70 | predictions_df["feature_names"] = str(trainer.datamodule.features) 71 | 72 | self.write_and_log(trainer, predictions_df) 73 | 74 | def write_and_log(self, trainer, predictions_df): 75 | # write the predictions.csv 76 | outdir = os.path.join(Path(trainer.checkpoint_callback.dirpath).parent, "predictions") 77 | if not os.path.exists(outdir): 78 | os.mkdir(outdir) 79 | predictions_df.to_feather(os.path.join(outdir, "predictions.feather")) 80 | predictions_df.to_csv(os.path.join(outdir, "predictions.csv")) 81 | 82 | if isinstance(trainer.logger, list): 83 | trainer.logger[0].run["prediction_available"] = "TRUE" 84 | trainer.logger[0].run["prediction_path"] = os.path.join(outdir, "predictions.feather") 85 | else: 86 | trainer.logger.run["prediction_available"] = "TRUE" 87 | trainer.logger.run["prediction_path"] = os.path.join(outdir, "predictions.feather") 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ![Logo](./src/msm_logo.png?raw=true "Logo") 4 | 5 | ⛑ **Metabolomic profiles predict individual multi-disease outcomes** ⛑ 6 | 7 | [comment]: <> () 15 | 16 |
17 | 18 | ## Description 19 | Code related to the paper "Metabolomic profiles predict individual multi-disease outcomes in the UK Biobank cohort". 20 | This repo provides a Python package for preprocessing UK Biobank data and for training and evaluating the proposed MetabolomicStateModel. 21 | 22 | ![Workflow](./src/fig1.png?raw=true "Workflow") 23 | 24 | ## Methods 25 | The **MetabolomicStateModel** is based on [DeepSurv](https://arxiv.org/abs/1606.00931) (the original implementation can be found [here](https://github.com/jaredleekatzman/DeepSurv)). Using a residual neural network, it learns a shared representation of the NMR metabolomics data to predict log partial hazards for common disease endpoints. 26 | 27 | ![Architecture](./src/fig2.png?raw=true "Architecture") 28 | 29 | ## Assets 30 | This repo contains code to preprocess [UK Biobank](https://www.ukbiobank.ac.uk/) data, train the MetabolomicStateModel and analyze/evaluate its performance. 31 | 32 | - Preprocessing involves parsing primary care records for the desired diagnoses. 33 | - Training involves model specification via PyTorch Lightning and Hydra. 34 | - Evaluation involves extensive benchmarks against linear models and the calculation of bootstrapped metrics. 35 | - Visualization contains the code to generate the figures displayed in the paper. 36 | 37 | ## Use the MetabolomicStateModel on your data 38 | We provide a ready-to-use [Google colab notebook](https://colab.research.google.com/github/thbuerg/MetabolomicsCommonDiseases/blob/main/analysis/examples/MetabolomicsInference.ipynb) with a trained version of our MetabolomicStateModel. Upload your dataset of Nightingale NMR metabolomics and run the model! 39 | 40 | **NOTE**: Data must be provided in [this format](https://github.com/thbuerg/MetabolomicsCommonDiseases/blob/main/analysis/examples/sample.csv). 41 | 42 | **DISCLAIMER**: This model is intended for research use only. We provide the NMR normalization pipeline as fitted on UK Biobank. Cohort-specific rescaling might be advisable. 43 | 44 | ## How to train the MetabolomicStateModel 45 | 1. First, install the dependencies: 46 | ```bash 47 | # clone project 48 | git clone https://github.com/thbuerg/MetabolomicsCommonDiseases 49 | 50 | # install project 51 | cd MetabolomicsCommonDiseases 52 | pip install -e . 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | 2. Download UK Biobank data and execute the preprocessing notebooks on the downloaded data. 57 | 58 | 3. Set up [Neptune.ai](https://www.neptune.ai). 59 | 60 | 4. Edit the `config.yaml` in `metabolomicstatemodel/source/config/`: 61 | ```yaml 62 | data_dir: /path/to/data 63 | code_dir: /path/to/repo_base 64 | setup: 65 | project: / 66 | experiment: 67 | tabular_filepath: /path/to/processed/data 68 | ``` 69 | 70 | 5. Train the MetabolomicStateModel (make sure you are on a machine with a GPU): 71 | ```bash 72 | # module folder 73 | cd metabolomicstatemodel 74 | 75 | # run training 76 | bash train.sh 77 | ``` 78 | 79 | ## License 80 | This work is licensed under a 81 | [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License][cc-by-nc-sa].
82 | 83 | [![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa] 84 | 85 | [cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/ 86 | [cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png 87 | [cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg 88 | 89 | ## Citation 90 | ``` 91 | @article{buergel2022metabolomic, 92 | title={Metabolomic profiles predict individual multidisease outcomes}, 93 | author={Buergel, Thore and Steinfeldt, Jakob and Ruyoga, Greg and Pietzner, Maik and Bizzarri, Daniele and Vojinovic, Dina and Upmeier zu Belzen, Julius and Loock, Lukas and Kittner, Paul and Christmann, Lara and others}, 94 | journal={Nature Medicine}, 95 | pages={1--12}, 96 | year={2022}, 97 | publisher={Nature Publishing Group} 98 | } 99 | ``` 100 | -------------------------------------------------------------------------------- /analysis/examples/assets.yaml: -------------------------------------------------------------------------------- 1 | endpoints: 2 | ['M_MACE', 3 | 'M_all_cause_dementia', 4 | 'M_type_2_diabetes', 5 | 'M_liver_disease', 6 | 'M_renal_disease', 7 | 'M_atrial_fibrillation', 8 | 'M_heart_failure', 9 | 'M_coronary_heart_disease', 10 | 'M_venous_thrombosis', 11 | 'M_cerebral_stroke', 12 | 'M_abdominal_aortic_aneurysm', 13 | 'M_peripheral_arterial_disease', 14 | 'M_asthma', 15 | 'M_chronic_obstructuve_pulmonary_disease', 16 | 'M_lung_cancer', 17 | 'M_non_melanoma_skin_cancer', 18 | 'M_colon_cancer', 19 | 'M_rectal_cancer', 20 | 'M_prostate_cancer', 21 | 'M_breast_cancer', 22 | 'M_parkinsons_disease', 23 | 'M_fractures', 24 | 'M_cataracts', 25 | 'M_glaucoma' 26 | ] 27 | metabolite_labels: 28 | ['NMR_3hydroxybutyrate', 29 | 'NMR_acetate', 30 | 'NMR_acetoacetate', 31 | 'NMR_acetone', 32 | 'NMR_alanine', 33 | 'NMR_albumin', 34 | 'NMR_apolipoprotein_a1', 35 | 'NMR_apolipoprotein_b', 36 | 'NMR_average_diameter_for_hdl_particles', 37 | 'NMR_average_diameter_for_ldl_particles', 38 | 'NMR_average_diameter_for_vldl_particles', 39 | 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 40 | 'NMR_cholesterol_in_idl', 41 | 'NMR_cholesterol_in_large_hdl', 42 | 'NMR_cholesterol_in_large_ldl', 43 | 'NMR_cholesterol_in_large_vldl', 44 | 'NMR_cholesterol_in_medium_hdl', 45 | 'NMR_cholesterol_in_medium_ldl', 46 | 'NMR_cholesterol_in_medium_vldl', 47 | 'NMR_cholesterol_in_small_hdl', 48 | 'NMR_cholesterol_in_small_ldl', 49 | 'NMR_cholesterol_in_small_vldl', 50 | 'NMR_cholesterol_in_very_large_hdl', 51 | 'NMR_cholesterol_in_very_large_vldl', 52 | 'NMR_cholesterol_in_very_small_vldl', 53 | 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 54 | 'NMR_cholesteryl_esters_in_hdl', 55 | 'NMR_cholesteryl_esters_in_idl', 56 | 'NMR_cholesteryl_esters_in_ldl', 57 | 'NMR_cholesteryl_esters_in_large_hdl', 58 | 'NMR_cholesteryl_esters_in_large_ldl', 59 | 'NMR_cholesteryl_esters_in_large_vldl', 60 | 'NMR_cholesteryl_esters_in_medium_hdl', 61 | 'NMR_cholesteryl_esters_in_medium_ldl', 62 | 'NMR_cholesteryl_esters_in_medium_vldl', 63 | 'NMR_cholesteryl_esters_in_small_hdl', 64 | 'NMR_cholesteryl_esters_in_small_ldl', 65 | 'NMR_cholesteryl_esters_in_small_vldl', 66 | 'NMR_cholesteryl_esters_in_vldl', 67 | 'NMR_cholesteryl_esters_in_very_large_hdl', 68 | 'NMR_cholesteryl_esters_in_very_large_vldl', 69 | 'NMR_cholesteryl_esters_in_very_small_vldl', 70 | 'NMR_citrate', 71 | 'NMR_clinical_ldl_cholesterol', 72 | 'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 73 | 'NMR_concentration_of_hdl_particles', 74 | 
'NMR_concentration_of_idl_particles', 75 | 'NMR_concentration_of_ldl_particles', 76 | 'NMR_concentration_of_large_hdl_particles', 77 | 'NMR_concentration_of_large_ldl_particles', 78 | 'NMR_concentration_of_large_vldl_particles', 79 | 'NMR_concentration_of_medium_hdl_particles', 80 | 'NMR_concentration_of_medium_ldl_particles', 81 | 'NMR_concentration_of_medium_vldl_particles', 82 | 'NMR_concentration_of_small_hdl_particles', 83 | 'NMR_concentration_of_small_ldl_particles', 84 | 'NMR_concentration_of_small_vldl_particles', 85 | 'NMR_concentration_of_vldl_particles', 86 | 'NMR_concentration_of_very_large_hdl_particles', 87 | 'NMR_concentration_of_very_large_vldl_particles', 88 | 'NMR_concentration_of_very_small_vldl_particles', 89 | 'NMR_creatinine', 90 | 'NMR_degree_of_unsaturation', 91 | 'NMR_docosahexaenoic_acid', 92 | 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 93 | 'NMR_free_cholesterol_in_hdl', 94 | 'NMR_free_cholesterol_in_idl', 95 | 'NMR_free_cholesterol_in_ldl', 96 | 'NMR_free_cholesterol_in_large_hdl', 97 | 'NMR_free_cholesterol_in_large_ldl', 98 | 'NMR_free_cholesterol_in_large_vldl', 99 | 'NMR_free_cholesterol_in_medium_hdl', 100 | 'NMR_free_cholesterol_in_medium_ldl', 101 | 'NMR_free_cholesterol_in_medium_vldl', 102 | 'NMR_free_cholesterol_in_small_hdl', 103 | 'NMR_free_cholesterol_in_small_ldl', 104 | 'NMR_free_cholesterol_in_small_vldl', 105 | 'NMR_free_cholesterol_in_vldl', 106 | 'NMR_free_cholesterol_in_very_large_hdl', 107 | 'NMR_free_cholesterol_in_very_large_vldl', 108 | 'NMR_free_cholesterol_in_very_small_vldl', 109 | 'NMR_glucose', 110 | 'NMR_glutamine', 111 | 'NMR_glycine', 112 | 'NMR_glycoprotein_acetyls', 113 | 'NMR_hdl_cholesterol', 114 | 'NMR_histidine', 115 | 'NMR_isoleucine', 116 | 'NMR_ldl_cholesterol', 117 | 'NMR_lactate', 118 | 'NMR_leucine', 119 | 'NMR_linoleic_acid', 120 | 'NMR_monounsaturated_fatty_acids', 121 | 'NMR_omega3_fatty_acids', 122 | 'NMR_omega6_fatty_acids', 123 | 'NMR_phenylalanine', 124 | 'NMR_phosphatidylcholines', 125 | 'NMR_phosphoglycerides', 126 | 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 127 | 'NMR_phospholipids_in_hdl', 128 | 'NMR_phospholipids_in_idl', 129 | 'NMR_phospholipids_in_ldl', 130 | 'NMR_phospholipids_in_large_hdl', 131 | 'NMR_phospholipids_in_large_ldl', 132 | 'NMR_phospholipids_in_large_vldl', 133 | 'NMR_phospholipids_in_medium_hdl', 134 | 'NMR_phospholipids_in_medium_ldl', 135 | 'NMR_phospholipids_in_medium_vldl', 136 | 'NMR_phospholipids_in_small_hdl', 137 | 'NMR_phospholipids_in_small_ldl', 138 | 'NMR_phospholipids_in_small_vldl', 139 | 'NMR_phospholipids_in_vldl', 140 | 'NMR_phospholipids_in_very_large_hdl', 141 | 'NMR_phospholipids_in_very_large_vldl', 142 | 'NMR_phospholipids_in_very_small_vldl', 143 | 'NMR_polyunsaturated_fatty_acids', 144 | 'NMR_pyruvate', 145 | 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 146 | 'NMR_saturated_fatty_acids', 147 | 'NMR_sphingomyelins', 148 | 'NMR_total_cholesterol', 149 | 'NMR_total_cholesterol_minus_hdlc', 150 | 'NMR_total_cholines', 151 | 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 152 | 'NMR_total_concentration_of_lipoprotein_particles', 153 | 'NMR_total_esterified_cholesterol', 154 | 'NMR_total_fatty_acids', 155 | 'NMR_total_free_cholesterol', 156 | 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 157 | 'NMR_total_lipids_in_hdl', 158 | 'NMR_total_lipids_in_idl', 159 | 'NMR_total_lipids_in_ldl', 160 | 'NMR_total_lipids_in_large_hdl', 161 | 'NMR_total_lipids_in_large_ldl', 162 | 
'NMR_total_lipids_in_large_vldl', 163 | 'NMR_total_lipids_in_lipoprotein_particles', 164 | 'NMR_total_lipids_in_medium_hdl', 165 | 'NMR_total_lipids_in_medium_ldl', 166 | 'NMR_total_lipids_in_medium_vldl', 167 | 'NMR_total_lipids_in_small_hdl', 168 | 'NMR_total_lipids_in_small_ldl', 169 | 'NMR_total_lipids_in_small_vldl', 170 | 'NMR_total_lipids_in_vldl', 171 | 'NMR_total_lipids_in_very_large_hdl', 172 | 'NMR_total_lipids_in_very_large_vldl', 173 | 'NMR_total_lipids_in_very_small_vldl', 174 | 'NMR_total_phospholipids_in_lipoprotein_particles', 175 | 'NMR_total_triglycerides', 176 | 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 177 | 'NMR_triglycerides_in_hdl', 178 | 'NMR_triglycerides_in_idl', 179 | 'NMR_triglycerides_in_ldl', 180 | 'NMR_triglycerides_in_large_hdl', 181 | 'NMR_triglycerides_in_large_ldl', 182 | 'NMR_triglycerides_in_large_vldl', 183 | 'NMR_triglycerides_in_medium_hdl', 184 | 'NMR_triglycerides_in_medium_ldl', 185 | 'NMR_triglycerides_in_medium_vldl', 186 | 'NMR_triglycerides_in_small_hdl', 187 | 'NMR_triglycerides_in_small_ldl', 188 | 'NMR_triglycerides_in_small_vldl', 189 | 'NMR_triglycerides_in_vldl', 190 | 'NMR_triglycerides_in_very_large_hdl', 191 | 'NMR_triglycerides_in_very_large_vldl', 192 | 'NMR_triglycerides_in_very_small_vldl', 193 | 'NMR_tyrosine', 194 | 'NMR_vldl_cholesterol', 195 | 'NMR_valine' 196 | ] 197 | 198 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/features/Metabolomics.yaml: -------------------------------------------------------------------------------- 1 | # @package experiment.complex_feature_set.Metabolomics 2 | categorical: 3 | basics: [] 4 | questionnaire: [] 5 | one_hot_enc: 6 | basics: [] 7 | questionnaire: [] 8 | general: 9 | metabolomics: [ 10 | 'NMR_3hydroxybutyrate', 11 | 'NMR_acetate', 12 | 'NMR_acetoacetate', 13 | 'NMR_acetone', 14 | 'NMR_alanine', 15 | 'NMR_albumin', 16 | 'NMR_apolipoprotein_a1', 17 | 'NMR_apolipoprotein_b', 18 | 'NMR_average_diameter_for_hdl_particles', 19 | 'NMR_average_diameter_for_ldl_particles', 20 | 'NMR_average_diameter_for_vldl_particles', 21 | 'NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl', 22 | 'NMR_cholesterol_in_idl', 23 | 'NMR_cholesterol_in_large_hdl', 24 | 'NMR_cholesterol_in_large_ldl', 25 | 'NMR_cholesterol_in_large_vldl', 26 | 'NMR_cholesterol_in_medium_hdl', 27 | 'NMR_cholesterol_in_medium_ldl', 28 | 'NMR_cholesterol_in_medium_vldl', 29 | 'NMR_cholesterol_in_small_hdl', 30 | 'NMR_cholesterol_in_small_ldl', 31 | 'NMR_cholesterol_in_small_vldl', 32 | 'NMR_cholesterol_in_very_large_hdl', 33 | 'NMR_cholesterol_in_very_large_vldl', 34 | 'NMR_cholesterol_in_very_small_vldl', 35 | 'NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl', 36 | 'NMR_cholesteryl_esters_in_hdl', 37 | 'NMR_cholesteryl_esters_in_idl', 38 | 'NMR_cholesteryl_esters_in_ldl', 39 | 'NMR_cholesteryl_esters_in_large_hdl', 40 | 'NMR_cholesteryl_esters_in_large_ldl', 41 | 'NMR_cholesteryl_esters_in_large_vldl', 42 | 'NMR_cholesteryl_esters_in_medium_hdl', 43 | 'NMR_cholesteryl_esters_in_medium_ldl', 44 | 'NMR_cholesteryl_esters_in_medium_vldl', 45 | 'NMR_cholesteryl_esters_in_small_hdl', 46 | 'NMR_cholesteryl_esters_in_small_ldl', 47 | 'NMR_cholesteryl_esters_in_small_vldl', 48 | 'NMR_cholesteryl_esters_in_vldl', 49 | 'NMR_cholesteryl_esters_in_very_large_hdl', 50 | 'NMR_cholesteryl_esters_in_very_large_vldl', 51 | 'NMR_cholesteryl_esters_in_very_small_vldl', 52 | 'NMR_citrate', 53 | 'NMR_clinical_ldl_cholesterol', 54 | 
'NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles', 55 | 'NMR_concentration_of_hdl_particles', 56 | 'NMR_concentration_of_idl_particles', 57 | 'NMR_concentration_of_ldl_particles', 58 | 'NMR_concentration_of_large_hdl_particles', 59 | 'NMR_concentration_of_large_ldl_particles', 60 | 'NMR_concentration_of_large_vldl_particles', 61 | 'NMR_concentration_of_medium_hdl_particles', 62 | 'NMR_concentration_of_medium_ldl_particles', 63 | 'NMR_concentration_of_medium_vldl_particles', 64 | 'NMR_concentration_of_small_hdl_particles', 65 | 'NMR_concentration_of_small_ldl_particles', 66 | 'NMR_concentration_of_small_vldl_particles', 67 | 'NMR_concentration_of_vldl_particles', 68 | 'NMR_concentration_of_very_large_hdl_particles', 69 | 'NMR_concentration_of_very_large_vldl_particles', 70 | 'NMR_concentration_of_very_small_vldl_particles', 71 | 'NMR_creatinine', 72 | 'NMR_degree_of_unsaturation', 73 | 'NMR_docosahexaenoic_acid', 74 | 'NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl', 75 | 'NMR_free_cholesterol_in_hdl', 76 | 'NMR_free_cholesterol_in_idl', 77 | 'NMR_free_cholesterol_in_ldl', 78 | 'NMR_free_cholesterol_in_large_hdl', 79 | 'NMR_free_cholesterol_in_large_ldl', 80 | 'NMR_free_cholesterol_in_large_vldl', 81 | 'NMR_free_cholesterol_in_medium_hdl', 82 | 'NMR_free_cholesterol_in_medium_ldl', 83 | 'NMR_free_cholesterol_in_medium_vldl', 84 | 'NMR_free_cholesterol_in_small_hdl', 85 | 'NMR_free_cholesterol_in_small_ldl', 86 | 'NMR_free_cholesterol_in_small_vldl', 87 | 'NMR_free_cholesterol_in_vldl', 88 | 'NMR_free_cholesterol_in_very_large_hdl', 89 | 'NMR_free_cholesterol_in_very_large_vldl', 90 | 'NMR_free_cholesterol_in_very_small_vldl', 91 | 'NMR_glucose', 92 | 'NMR_glutamine', 93 | 'NMR_glycine', 94 | 'NMR_glycoprotein_acetyls', 95 | 'NMR_hdl_cholesterol', 96 | 'NMR_histidine', 97 | 'NMR_isoleucine', 98 | 'NMR_ldl_cholesterol', 99 | 'NMR_lactate', 100 | 'NMR_leucine', 101 | 'NMR_linoleic_acid', 102 | 'NMR_monounsaturated_fatty_acids', 103 | 'NMR_omega3_fatty_acids', 104 | 'NMR_omega6_fatty_acids', 105 | 'NMR_phenylalanine', 106 | 'NMR_phosphatidylcholines', 107 | 'NMR_phosphoglycerides', 108 | 'NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl', 109 | 'NMR_phospholipids_in_hdl', 110 | 'NMR_phospholipids_in_idl', 111 | 'NMR_phospholipids_in_ldl', 112 | 'NMR_phospholipids_in_large_hdl', 113 | 'NMR_phospholipids_in_large_ldl', 114 | 'NMR_phospholipids_in_large_vldl', 115 | 'NMR_phospholipids_in_medium_hdl', 116 | 'NMR_phospholipids_in_medium_ldl', 117 | 'NMR_phospholipids_in_medium_vldl', 118 | 'NMR_phospholipids_in_small_hdl', 119 | 'NMR_phospholipids_in_small_ldl', 120 | 'NMR_phospholipids_in_small_vldl', 121 | 'NMR_phospholipids_in_vldl', 122 | 'NMR_phospholipids_in_very_large_hdl', 123 | 'NMR_phospholipids_in_very_large_vldl', 124 | 'NMR_phospholipids_in_very_small_vldl', 125 | 'NMR_polyunsaturated_fatty_acids', 126 | 'NMR_pyruvate', 127 | 'NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol', 128 | 'NMR_saturated_fatty_acids', 129 | 'NMR_sphingomyelins', 130 | 'NMR_total_cholesterol', 131 | 'NMR_total_cholesterol_minus_hdlc', 132 | 'NMR_total_cholines', 133 | 'NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine', 134 | 'NMR_total_concentration_of_lipoprotein_particles', 135 | 'NMR_total_esterified_cholesterol', 136 | 'NMR_total_fatty_acids', 137 | 'NMR_total_free_cholesterol', 138 | 'NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl', 139 | 'NMR_total_lipids_in_hdl', 140 | 'NMR_total_lipids_in_idl', 141 | 
'NMR_total_lipids_in_ldl', 142 | 'NMR_total_lipids_in_large_hdl', 143 | 'NMR_total_lipids_in_large_ldl', 144 | 'NMR_total_lipids_in_large_vldl', 145 | 'NMR_total_lipids_in_lipoprotein_particles', 146 | 'NMR_total_lipids_in_medium_hdl', 147 | 'NMR_total_lipids_in_medium_ldl', 148 | 'NMR_total_lipids_in_medium_vldl', 149 | 'NMR_total_lipids_in_small_hdl', 150 | 'NMR_total_lipids_in_small_ldl', 151 | 'NMR_total_lipids_in_small_vldl', 152 | 'NMR_total_lipids_in_vldl', 153 | 'NMR_total_lipids_in_very_large_hdl', 154 | 'NMR_total_lipids_in_very_large_vldl', 155 | 'NMR_total_lipids_in_very_small_vldl', 156 | 'NMR_total_phospholipids_in_lipoprotein_particles', 157 | 'NMR_total_triglycerides', 158 | 'NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl', 159 | 'NMR_triglycerides_in_hdl', 160 | 'NMR_triglycerides_in_idl', 161 | 'NMR_triglycerides_in_ldl', 162 | 'NMR_triglycerides_in_large_hdl', 163 | 'NMR_triglycerides_in_large_ldl', 164 | 'NMR_triglycerides_in_large_vldl', 165 | 'NMR_triglycerides_in_medium_hdl', 166 | 'NMR_triglycerides_in_medium_ldl', 167 | 'NMR_triglycerides_in_medium_vldl', 168 | 'NMR_triglycerides_in_small_hdl', 169 | 'NMR_triglycerides_in_small_ldl', 170 | 'NMR_triglycerides_in_small_vldl', 171 | 'NMR_triglycerides_in_vldl', 172 | 'NMR_triglycerides_in_very_large_hdl', 173 | 'NMR_triglycerides_in_very_large_vldl', 174 | 'NMR_triglycerides_in_very_small_vldl', 175 | 'NMR_tyrosine', 176 | 'NMR_vldl_cholesterol', 177 | 'NMR_valine', 178 | ] 179 | basics: [] 180 | questionnaire: [] 181 | measurements: [] 182 | labs: [] 183 | family_history: [] 184 | diagnoses: [] 185 | medications: [] 186 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/config/config.yaml: -------------------------------------------------------------------------------- 1 | #data_dir: /path/to/data 2 | #code_dir: /path/to/repo_base 3 | data_dir: /sc-projects/sc-proj-ukb-cvd 4 | code_dir: /home/buergelt/projects/cardiors/code/MetabolomicsCommonDiseases 5 | setup: 6 | # project: YourNeptune/Project 7 | project: CardioRS/metabolomics 8 | name: MSM 9 | tags: MSM_train 10 | trainer: 11 | default_root_dir: ${data_dir}/results/models 12 | gpus: 1 13 | precision: 16 14 | val_check_interval: 1.0 15 | overfit_batches: 0.0 16 | fast_dev_run: False 17 | track_grad_norm: 0 18 | max_epochs: 100 19 | stochastic_weight_avg: True 20 | auto_lr_find: False 21 | experiment: 22 | seed: 23 23 | num_workers: 4 24 | monitor: "Avg__C_10" 25 | report_train_metrics: False 26 | evaluation_time_points: [10] 27 | evaluation_quantile_bins: None 28 | write_calibrated_predictions: False 29 | task_names: [ 30 | "M_MACE", 31 | "M_all_cause_dementia", 32 | "M_type_2_diabetes", 33 | "M_liver_disease", 34 | "M_renal_disease", 35 | "M_atrial_fibrillation", 36 | "M_heart_failure", 37 | "M_coronary_heart_disease", 38 | "M_venous_thrombosis", 39 | "M_cerebral_stroke", 40 | "M_abdominal_aortic_aneurysm", 41 | "M_peripheral_arterial_disease", 42 | "M_asthma", 43 | "M_chronic_obstructuve_pulmonary_disease", 44 | "M_lung_cancer", 45 | "M_non_melanoma_skin_cancer", 46 | "M_colon_cancer", 47 | "M_rectal_cancer", 48 | "M_prostate_cancer", 49 | "M_breast_cancer", 50 | "M_parkinsons_disease", 51 | "M_fractures", 52 | "M_cataracts", 53 | "M_glaucoma" 54 | ] 55 | task_weights: 56 | M_MACE: 1, 57 | M_all_cause_dementia: 1, 58 | M_type_2_diabetes: 1, 59 | M_liver_disease: 1, 60 | M_renal_disease: 1, 61 | M_atrial_fibrillation: 1, 62 | M_heart_failure: 1, 63 | M_coronary_heart_disease: 1, 64 | 
M_venous_thrombosis: 1, 65 | M_cerebral_stroke: 1, 66 | M_haemorrhagic_stroke: 1, 67 | M_abdominal_aortic_aneurysm: 1, 68 | M_peripheral_arterial_disease: 1, 69 | M_asthma: 1, 70 | M_chronic_obstructuve_pulmonary_disease: 1, 71 | M_lung_cancer: 1, 72 | M_non_melanoma_skin_cancer: 1, 73 | M_colon_cancer: 1, 74 | M_rectal_cancer: 1, 75 | M_prostate_cancer: 1, 76 | M_breast_cancer: 1, 77 | M_parkinsons_disease: 1, 78 | M_fractures: 1, 79 | M_cataracts: 1, 80 | M_glaucoma: 1 81 | event: [ 82 | "M_MACE_event", 83 | "M_all_cause_dementia_event", 84 | "M_type_2_diabetes_event", 85 | "M_liver_disease_event", 86 | "M_renal_disease_event", 87 | "M_atrial_fibrillation_event", 88 | "M_heart_failure_event", 89 | "M_coronary_heart_disease_event", 90 | "M_venous_thrombosis_event", 91 | "M_cerebral_stroke_event", 92 | "M_abdominal_aortic_aneurysm_event", 93 | "M_peripheral_arterial_disease_event", 94 | "M_asthma_event", 95 | "M_chronic_obstructuve_pulmonary_disease_event", 96 | "M_lung_cancer_event", 97 | "M_non_melanoma_skin_cancer_event", 98 | "M_colon_cancer_event", 99 | "M_rectal_cancer_event", 100 | "M_prostate_cancer_event", 101 | "M_breast_cancer_event", 102 | "M_parkinsons_disease_event", 103 | "M_fractures_event", 104 | "M_cataracts_event", 105 | "M_glaucoma_event" 106 | ] 107 | duration: [ 108 | "M_MACE_event_time", 109 | "M_all_cause_dementia_event_time", 110 | "M_type_2_diabetes_event_time", 111 | "M_liver_disease_event_time", 112 | "M_renal_disease_event_time", 113 | "M_atrial_fibrillation_event_time", 114 | "M_heart_failure_event_time", 115 | "M_coronary_heart_disease_event_time", 116 | "M_venous_thrombosis_event_time", 117 | "M_cerebral_stroke_event_time", 118 | "M_abdominal_aortic_aneurysm_event_time", 119 | "M_peripheral_arterial_disease_event_time", 120 | "M_asthma_event_time", 121 | "M_chronic_obstructuve_pulmonary_disease_event_time", 122 | "M_lung_cancer_event_time", 123 | "M_non_melanoma_skin_cancer_event_time", 124 | "M_colon_cancer_event_time", 125 | "M_rectal_cancer_event_time", 126 | "M_prostate_cancer_event_time", 127 | "M_breast_cancer_event_time", 128 | "M_parkinsons_disease_event_time", 129 | "M_fractures_event_time", 130 | "M_cataracts_event_time", 131 | "M_glaucoma_event_time" 132 | ] 133 | cohort_definition: 134 | general: 135 | train: "NMR_FLAG==True" 136 | valid: "NMR_FLAG==True" 137 | test: "NMR_FLAG==True" 138 | task_specific: 139 | M_MACE: "M_MACE==False&statins==False" 140 | M_all_cause_dementia: "M_all_cause_dementia==False" 141 | M_type_2_diabetes: "M_type_2_diabetes==False" 142 | M_liver_disease: "M_liver_disease==False" 143 | M_renal_disease: "M_renal_disease==False" 144 | M_atrial_fibrillation: "M_atrial_fibrillation==False" 145 | M_heart_failure: "M_heart_failure==False" 146 | M_coronary_heart_disease: "M_coronary_heart_disease==False" 147 | M_venous_thrombosis: "M_venous_thrombosis==False" 148 | M_cerebral_stroke: "M_cerebral_stroke==False" 149 | M_abdominal_aortic_aneurysm: "M_abdominal_aortic_aneurysm==False" 150 | M_peripheral_arterial_disease: "M_peripheral_arterial_disease==False" 151 | M_asthma: "M_asthma==False" 152 | M_chronic_obstructuve_pulmonary_disease: "M_chronic_obstructuve_pulmonary_disease==False" 153 | M_lung_cancer: "M_lung_cancer==False" 154 | M_non_melanoma_skin_cancer: "M_non_melanoma_skin_cancer==False" 155 | M_colon_cancer: "M_colon_cancer==False" 156 | M_rectal_cancer: "M_rectal_cancer==False" 157 | M_prostate_cancer: "M_prostate_cancer==False&sex=='Male'" 158 | M_breast_cancer: "M_breast_cancer==False&sex=='Female'" 159 | 
M_parkinsons_disease: "M_parkinsons_disease==False" 160 | M_fractures: "M_fractures==False" 161 | M_cataracts: "M_cataracts==False" 162 | M_glaucoma: "M_glaucoma==False" 163 | task_specific_exclusions: True 164 | datamodule: UKBBSurvivalDatamoduleWithExclusions 165 | task: ResidualMultiTaskSurvivalTraining 166 | cv_partition: 0 167 | feature_set: Metabolomics 168 | features_yaml: ${code_dir}/metabolomicstatemodel/source/config/features/${experiment.feature_set}.yaml 169 | tabular_filepath: ${data_dir}/data/tabular/210714_metabolomics 170 | latent_dim: 512 171 | module: MLP 172 | module_kwargs: 173 | snn_init: False 174 | hidden_dim: [256, 256, 256] 175 | output_dim: ${experiment.latent_dim} 176 | norm_fn: 'nn.BatchNorm1d' 177 | norm_layer: [0] 178 | input_norm: False 179 | final_norm: False 180 | dropout_fn: "nn.Dropout" 181 | dropout: 0.3 182 | dropout_after_norm: False 183 | activation: "nn.SiLU" 184 | final_activation: "nn.SiLU" 185 | latent_module: ResidualHeadMLP 186 | latent_module_kwargs: 187 | latent_dim: 32 188 | mlp: MLP 189 | mlp_kwargs: 190 | snn_init: False 191 | input_dim: ${experiment.latent_dim} 192 | hidden_dim: [256, 128] 193 | output_dim: ${experiment.latent_module_kwargs.latent_dim} 194 | activation_fn: "nn.SiLU" 195 | dropout_fn: "nn.Dropout" 196 | norm_fn: 'nn.BatchNorm1d' 197 | norm_layer: "all" 198 | input_norm: False 199 | final_norm: True 200 | dropout: 0.6 201 | dropout_after_norm: True 202 | activation: "nn.SiLU" 203 | final_activation: "nn.SiLU" 204 | skip_connection_mlp: MLP 205 | skip_connection_mlp_kwargs: 206 | snn_init: False 207 | hidden_dim: [128, 128] 208 | output_dim: ${experiment.latent_module_kwargs.latent_dim} 209 | activation_fn: "nn.SiLU" 210 | dropout_fn: "nn.Dropout" 211 | norm_fn: 'nn.BatchNorm1d' 212 | norm_layer: "all" 213 | input_norm: False 214 | final_norm: True 215 | dropout: 0.6 216 | dropout_after_norm: True 217 | activation: "nn.SiLU" 218 | final_activation: "nn.SiLU" 219 | predictor_mlp: MLP 220 | predictor_mlp_kwargs: 221 | snn_init: False 222 | input_dim: ${experiment.latent_module_kwargs.latent_dim} 223 | hidden_dim: [128, 128] 224 | output_dim: 1 225 | activation_fn: "nn.SiLU" 226 | dropout_fn: "nn.Dropout" 227 | norm_fn: 'nn.BatchNorm1d' 228 | norm_layer: "all" 229 | input_norm: False 230 | final_norm: True 231 | dropout: 0.6 232 | dropout_after_norm: True 233 | activation: "nn.SiLU" 234 | final_activation: "nn.Identity" 235 | optimizer: "torch.optim.Adam" 236 | optimizer_kwargs: {weight_decay: 1e-8} 237 | schedule: MultiStepLR 238 | schedule_kwargs: {milestones:[20, 30, 40], gamma: 0.1, verbose: True} 239 | n_events: 1 240 | batch_size: 1024 241 | lr: 0.001 242 | survival_task: DeepSurv 243 | survival_task_kwargs: 244 | batch_size: ${experiment.batch_size} 245 | num_workers: ${experiment.num_workers} -------------------------------------------------------------------------------- /analysis/examples/sample.csv: -------------------------------------------------------------------------------- 1 | 
,NMR_3hydroxybutyrate,NMR_acetate,NMR_acetoacetate,NMR_acetone,NMR_alanine,NMR_albumin,NMR_apolipoprotein_a1,NMR_apolipoprotein_b,NMR_average_diameter_for_hdl_particles,NMR_average_diameter_for_ldl_particles,NMR_average_diameter_for_vldl_particles,NMR_cholesterol_in_chylomicrons_and_extremely_large_vldl,NMR_cholesterol_in_idl,NMR_cholesterol_in_large_hdl,NMR_cholesterol_in_large_ldl,NMR_cholesterol_in_large_vldl,NMR_cholesterol_in_medium_hdl,NMR_cholesterol_in_medium_ldl,NMR_cholesterol_in_medium_vldl,NMR_cholesterol_in_small_hdl,NMR_cholesterol_in_small_ldl,NMR_cholesterol_in_small_vldl,NMR_cholesterol_in_very_large_hdl,NMR_cholesterol_in_very_large_vldl,NMR_cholesterol_in_very_small_vldl,NMR_cholesteryl_esters_in_chylomicrons_and_extremely_large_vldl,NMR_cholesteryl_esters_in_hdl,NMR_cholesteryl_esters_in_idl,NMR_cholesteryl_esters_in_ldl,NMR_cholesteryl_esters_in_large_hdl,NMR_cholesteryl_esters_in_large_ldl,NMR_cholesteryl_esters_in_large_vldl,NMR_cholesteryl_esters_in_medium_hdl,NMR_cholesteryl_esters_in_medium_ldl,NMR_cholesteryl_esters_in_medium_vldl,NMR_cholesteryl_esters_in_small_hdl,NMR_cholesteryl_esters_in_small_ldl,NMR_cholesteryl_esters_in_small_vldl,NMR_cholesteryl_esters_in_vldl,NMR_cholesteryl_esters_in_very_large_hdl,NMR_cholesteryl_esters_in_very_large_vldl,NMR_cholesteryl_esters_in_very_small_vldl,NMR_citrate,NMR_clinical_ldl_cholesterol,NMR_concentration_of_chylomicrons_and_extremely_large_vldl_particles,NMR_concentration_of_hdl_particles,NMR_concentration_of_idl_particles,NMR_concentration_of_ldl_particles,NMR_concentration_of_large_hdl_particles,NMR_concentration_of_large_ldl_particles,NMR_concentration_of_large_vldl_particles,NMR_concentration_of_medium_hdl_particles,NMR_concentration_of_medium_ldl_particles,NMR_concentration_of_medium_vldl_particles,NMR_concentration_of_small_hdl_particles,NMR_concentration_of_small_ldl_particles,NMR_concentration_of_small_vldl_particles,NMR_concentration_of_vldl_particles,NMR_concentration_of_very_large_hdl_particles,NMR_concentration_of_very_large_vldl_particles,NMR_concentration_of_very_small_vldl_particles,NMR_creatinine,NMR_degree_of_unsaturation,NMR_docosahexaenoic_acid,NMR_free_cholesterol_in_chylomicrons_and_extremely_large_vldl,NMR_free_cholesterol_in_hdl,NMR_free_cholesterol_in_idl,NMR_free_cholesterol_in_ldl,NMR_free_cholesterol_in_large_hdl,NMR_free_cholesterol_in_large_ldl,NMR_free_cholesterol_in_large_vldl,NMR_free_cholesterol_in_medium_hdl,NMR_free_cholesterol_in_medium_ldl,NMR_free_cholesterol_in_medium_vldl,NMR_free_cholesterol_in_small_hdl,NMR_free_cholesterol_in_small_ldl,NMR_free_cholesterol_in_small_vldl,NMR_free_cholesterol_in_vldl,NMR_free_cholesterol_in_very_large_hdl,NMR_free_cholesterol_in_very_large_vldl,NMR_free_cholesterol_in_very_small_vldl,NMR_glucose,NMR_glutamine,NMR_glycine,NMR_glycoprotein_acetyls,NMR_hdl_cholesterol,NMR_histidine,NMR_isoleucine,NMR_ldl_cholesterol,NMR_lactate,NMR_leucine,NMR_linoleic_acid,NMR_monounsaturated_fatty_acids,NMR_omega3_fatty_acids,NMR_omega6_fatty_acids,NMR_phenylalanine,NMR_phosphatidylcholines,NMR_phosphoglycerides,NMR_phospholipids_in_chylomicrons_and_extremely_large_vldl,NMR_phospholipids_in_hdl,NMR_phospholipids_in_idl,NMR_phospholipids_in_ldl,NMR_phospholipids_in_large_hdl,NMR_phospholipids_in_large_ldl,NMR_phospholipids_in_large_vldl,NMR_phospholipids_in_medium_hdl,NMR_phospholipids_in_medium_ldl,NMR_phospholipids_in_medium_vldl,NMR_phospholipids_in_small_hdl,NMR_phospholipids_in_small_ldl,NMR_phospholipids_in_small_vldl,NMR_phospholipids_in_vldl,NMR_phospholipi
ds_in_very_large_hdl,NMR_phospholipids_in_very_large_vldl,NMR_phospholipids_in_very_small_vldl,NMR_polyunsaturated_fatty_acids,NMR_pyruvate,NMR_remnant_cholesterol_nonhdl_nonldl_cholesterol,NMR_saturated_fatty_acids,NMR_sphingomyelins,NMR_total_cholesterol,NMR_total_cholesterol_minus_hdlc,NMR_total_cholines,NMR_total_concentration_of_branchedchain_amino_acids_leucine_isoleucine_valine,NMR_total_concentration_of_lipoprotein_particles,NMR_total_esterified_cholesterol,NMR_total_fatty_acids,NMR_total_free_cholesterol,NMR_total_lipids_in_chylomicrons_and_extremely_large_vldl,NMR_total_lipids_in_hdl,NMR_total_lipids_in_idl,NMR_total_lipids_in_ldl,NMR_total_lipids_in_large_hdl,NMR_total_lipids_in_large_ldl,NMR_total_lipids_in_large_vldl,NMR_total_lipids_in_lipoprotein_particles,NMR_total_lipids_in_medium_hdl,NMR_total_lipids_in_medium_ldl,NMR_total_lipids_in_medium_vldl,NMR_total_lipids_in_small_hdl,NMR_total_lipids_in_small_ldl,NMR_total_lipids_in_small_vldl,NMR_total_lipids_in_vldl,NMR_total_lipids_in_very_large_hdl,NMR_total_lipids_in_very_large_vldl,NMR_total_lipids_in_very_small_vldl,NMR_total_phospholipids_in_lipoprotein_particles,NMR_total_triglycerides,NMR_triglycerides_in_chylomicrons_and_extremely_large_vldl,NMR_triglycerides_in_hdl,NMR_triglycerides_in_idl,NMR_triglycerides_in_ldl,NMR_triglycerides_in_large_hdl,NMR_triglycerides_in_large_ldl,NMR_triglycerides_in_large_vldl,NMR_triglycerides_in_medium_hdl,NMR_triglycerides_in_medium_ldl,NMR_triglycerides_in_medium_vldl,NMR_triglycerides_in_small_hdl,NMR_triglycerides_in_small_ldl,NMR_triglycerides_in_small_vldl,NMR_triglycerides_in_vldl,NMR_triglycerides_in_very_large_hdl,NMR_triglycerides_in_very_large_vldl,NMR_triglycerides_in_very_small_vldl,NMR_tyrosine,NMR_vldl_cholesterol,NMR_valine 2 | sample_0,1.7267629,0.5097766,0.29732332,0.46884453,1.4918994,0.6533935,0.18285494,1.1864842,0.64613235,0.9915879,0.6898927,1.6417483,1.088562,0.18311316,1.0280348,1.7085389,1.3308727,1.3725197,0.453924,0.14175132,0.5277481,0.7697196,0.65141964,0.048153635,0.6881,0.6210777,0.981835,0.14764947,1.4333347,0.5098504,1.5350345,0.3321557,1.7906251,0.4931004,1.3807096,0.52706426,0.30808988,1.2255802,0.9953301,1.7295392,1.0182742,1.324438,1.3735615,0.22325872,0.62359023,1.7462277,1.2364578,1.1424968,0.6403464,1.494473,0.43395197,1.0911344,0.99456245,0.70620626,1.2572117,0.39763924,0.84456956,0.34995952,1.7056545,0.13534041,1.6538748,1.2421536,0.275961,1.61189,0.43145138,0.69299865,1.3780788,0.2530471,0.73021734,1.5449325,1.4830003,0.55509186,0.11249849,0.0051142704,0.4280246,1.4834918,0.90811676,0.1878527,0.20966077,0.43405378,0.620166,0.25039583,0.859951,0.506151,0.49615797,0.9883161,1.315808,0.027282929,1.3034036,0.41346607,1.228288,0.51193565,0.26067233,0.8067817,0.90494514,0.06724546,1.5765679,0.5881223,1.2508178,0.7745326,0.7279988,0.9269167,0.07510055,0.37857637,1.7506536,0.105617344,0.13456018,0.23826805,0.2921966,0.6284125,0.74927133,0.7842055,0.6030325,1.2905078,0.4911587,0.57319283,0.13305517,0.80514073,1.0980389,0.39218083,0.1623933,0.13884185,0.17421694,0.15859717,0.41635695,0.6279078,0.62771,0.8249054,1.5470794,0.7805787,1.6720229,0.5789668,0.98674554,0.15837964,0.07383999,0.7915055,0.28337508,1.1657172,0.71572715,1.6519899,1.3657914,0.4889391,1.1835232,1.6657807,0.8121769,0.40309033,0.6373719,1.0736355,0.25264534,1.0069423,0.86787814,1.3369577,0.50055605,1.3423343,1.3437071,0.22329777,0.1401766,0.22787079,1.5827061,1.6035206,1.7987934,0.124354236,1.4847823,1.3791466,0.7313719,1.4134979,0.5265129,0.204291 3 | 
sample_1,1.1247766,1.4505948,1.5361449,1.4904437,1.5670565,1.2864381,0.077527456,0.48182374,1.4347273,0.70310014,1.1604679,1.4766713,0.82198054,0.69924873,0.47259116,0.019940903,1.2150654,0.14550368,1.1506934,0.42108682,0.72633934,1.4222,1.6842602,0.6014966,1.6811364,0.81836396,0.81108344,1.0707843,0.86859024,0.11722562,0.8970124,1.4451725,1.7431155,0.9812217,1.3683889,1.1414407,0.9885561,0.5018747,1.0990268,0.14177717,1.1253144,0.0690509,1.5061882,0.23769274,0.5989025,1.4955494,0.49969435,0.2531869,1.4209754,1.4839709,1.1655788,1.137409,1.7623557,1.1982107,0.16304106,0.6498071,1.247407,0.007712378,1.4728979,0.97307813,0.9165665,0.3204594,0.24263348,1.2905837,1.4658877,1.6306684,0.00226037,0.039650865,1.7878088,1.5556992,1.191061,0.075704694,1.5558871,1.0816427,1.3992503,0.07463606,0.47904897,1.4666929,0.5326069,0.95497483,0.8767187,1.1244663,1.655131,1.4107636,1.093523,0.29427055,0.24261752,0.33046967,1.2696238,1.3498253,1.6691794,1.4430594,0.29075617,1.5200106,1.0818076,1.1958362,0.035788987,0.7284869,0.34679976,1.4599568,0.3480672,1.3999459,1.3708526,0.68422735,1.1539485,0.35347924,1.0999752,0.6629832,1.677415,1.7063679,0.20748337,0.4156185,1.4872332,0.42577755,0.63565433,1.7240045,0.30661812,1.1443036,0.36064732,1.0750041,1.3607403,1.2786652,0.9076651,0.6570294,0.38894868,0.48623,0.19309354,1.6910648,1.5545182,1.6601219,1.6360918,1.1610199,0.9035126,0.55135554,0.49802023,0.8577952,1.5569063,1.6308388,1.7859721,1.107143,1.5525775,0.7256029,0.056678522,1.4784337,0.506086,0.47641203,1.1579549,0.74990684,1.5955726,1.2277075,0.8925245,1.7089723,0.81651545,1.1531171,1.2951981,0.22069862,0.7634763,0.63415253,1.3337371,1.5506673,0.8664601,0.87574947,0.41530603,0.025238335,0.92178035,1.6352707,0.37116644,1.192257 4 | sample_2,0.6394572,0.048151683,0.0041278093,1.3019447,1.5163862,1.7615204,1.3996884,0.88473135,0.027976334,1.6112314,1.0511037,1.3119624,0.10937966,0.8065622,0.2579499,0.40155205,1.158636,0.100113384,1.0507463,1.4433336,0.8073181,1.4442376,0.66797423,0.53165096,0.9798346,1.5489019,1.334381,0.13057685,1.7061685,0.73331654,1.067463,0.7395286,0.44745353,0.5638262,0.343716,0.26125124,0.042463213,0.68579,1.5634893,1.095418,1.7047849,1.5969012,0.28933054,0.43715376,1.3674892,1.1461415,0.040662605,0.27717274,0.46089447,0.31720972,1.5146904,0.31740576,1.164278,1.2174286,1.0174254,1.7494576,0.1993007,0.28359258,1.6533439,0.4360933,0.7976693,0.2955996,1.4103165,1.1625634,1.166639,1.5002036,1.4426631,1.0293818,1.7243257,1.6771489,1.1876979,0.46074176,1.3435627,0.057508104,1.1589948,0.48598015,1.1272345,1.6984007,0.7455857,0.9537147,1.6460967,1.2778734,0.65043944,0.8658931,1.59622,1.2816808,0.18826872,1.0616916,0.11173987,0.9738986,1.4398922,1.370888,1.0251368,0.68071675,0.8055198,0.4604453,0.05194658,1.2749503,1.6958934,1.2984284,0.47111607,0.835225,0.11417559,1.7460171,0.32695165,0.25663644,0.111282885,1.6801063,1.0840608,0.45405158,1.6737995,0.29428774,0.673308,1.589201,0.21577558,0.02494686,1.2485098,1.0764318,0.38511005,0.1998027,0.54615897,1.6868843,0.62696517,1.7561861,1.4750563,1.45119,1.0248525,1.3861558,1.1287161,1.5323092,0.5878731,1.096504,1.4778537,0.46314618,0.36474022,1.0683476,1.4301068,1.4973894,0.827812,0.6916306,1.7210993,1.0710878,1.6875877,0.085866526,1.0035372,1.7737324,1.0767089,0.084368296,0.1562901,1.0578055,0.27928537,0.16458681,0.8512347,1.240649,1.5773059,1.172978,1.4460613,0.8570547,0.23283778,0.960511,0.8304184,1.5568347,0.18804511,1.2705362,1.7228076,1.7837844,1.7385787,1.7892717 5 | 
sample_3,0.4487589,0.37038893,0.06925146,1.0901536,1.042064,0.32663187,1.4012345,0.49263698,1.6871933,1.1226852,1.6242129,1.7099121,0.18495268,0.87873393,0.7100061,1.525226,1.4485743,0.039899103,0.8911774,1.4572191,1.0084244,0.11996893,0.036249947,1.0465795,1.5553432,0.30086443,0.78687495,0.030546222,1.5204672,0.0752192,0.23986651,1.707968,0.6906252,0.06051089,0.32766625,1.5041764,1.4767783,1.2251022,1.307869,1.2565494,1.7772435,0.31088248,0.8995749,0.26549795,0.15266678,1.2756772,1.6006684,0.4677581,0.23416235,0.6896485,0.4579907,1.3835437,0.554329,1.2575718,1.2434229,0.054920208,0.5825295,0.20535323,0.8275796,0.60950464,0.7588442,1.1936173,1.3331549,1.5392226,0.16763358,0.10467987,1.741956,0.5891317,1.3638477,0.67581654,0.31941622,1.1855899,1.66403,0.6271029,1.1893716,1.0977352,1.534193,0.22538827,0.64260757,0.2408825,1.7362573,1.2060944,0.969551,0.9658206,0.23782036,1.7787619,1.5680085,1.0271211,0.80165523,1.2293843,1.2669885,1.7391648,1.2056175,0.84038806,0.61846817,0.15132122,0.51495504,0.77662504,1.31058,0.79846156,1.2412288,1.4939855,0.1171603,0.74272704,1.7567694,1.7224572,1.3670967,1.3684192,0.66651887,0.88285154,0.73113704,1.71152,0.1984383,1.6891428,1.7598286,0.74715847,0.8938639,1.594826,0.15118295,1.1975237,0.31053978,0.019128645,1.499253,0.88487166,1.5354916,1.2054054,1.4602237,0.47618222,0.011032284,0.18765059,1.4797944,0.5439001,0.35328564,0.0931981,1.7674357,0.06046798,0.95566815,1.6162721,0.35987407,0.52456313,1.2912939,0.025181143,0.6913808,1.3839384,1.311321,1.7093761,0.013740754,0.7665038,0.7897924,1.1775188,1.2336917,1.0273027,0.6498797,0.7412339,0.6385234,1.7824961,0.5776181,0.10206302,0.20797145,1.5722026,0.7791373,0.7737667,1.2712736,0.07551479,1.0713022,1.1955271,0.33608505,1.7409687 6 | sample_4,0.39158726,1.4290415,1.0409038,0.32171452,1.6907369,1.6188732,0.29006386,1.0105348,0.6140596,0.1370683,0.033939946,1.6528398,1.121336,0.70619434,0.13311104,0.795395,1.1870075,0.9588572,0.69247836,0.6088535,1.5259367,1.3447601,0.835985,0.41114345,1.2541314,0.33457318,0.31144038,0.21643135,1.5477998,0.5301468,0.24439298,0.5454397,0.218857,0.7117476,0.77929556,1.7813807,1.6003835,1.7953465,1.4607075,1.2049109,1.48384,0.85762167,1.1429639,0.9870765,0.6465661,0.7455147,0.43265375,1.7432247,1.3255073,1.4863342,1.499107,1.637055,1.4781187,1.7494757,0.38001835,0.4903395,1.426103,0.24009445,0.94217247,1.5151939,1.2099993,0.46371266,0.49783766,1.4946584,0.804823,1.7760794,0.3688554,1.3472563,0.14981924,1.0910748,1.2662728,1.5365591,1.5832489,0.39390194,1.1747075,1.2293701,1.6966463,0.07995371,1.5968765,0.35381973,1.795621,1.3334799,1.6623522,0.114330016,0.9883059,0.42806634,0.42088893,0.33974266,0.8436938,0.5935543,1.4546558,0.93583494,0.39511275,1.756246,0.33147886,1.5322094,1.0483937,1.3000615,1.5142493,1.1211617,0.21773115,0.48247924,1.3405449,0.75711596,0.43254915,1.108704,0.21249163,0.5359951,1.2703377,0.54352325,1.678972,1.2367089,0.20781432,1.7676998,0.2796192,0.5369049,1.552004,0.42126063,1.3163086,0.84839207,0.4736774,0.9796766,0.7934538,1.218545,1.3820137,0.5128056,1.1789824,1.2839047,0.5724214,0.19048418,0.96299917,0.6771659,0.19370678,0.96163434,0.7227618,0.7204046,1.1550516,0.53782845,1.7956933,0.18117389,0.0303778,0.42181036,0.32062125,1.4959301,0.6862654,1.2135113,1.3248559,0.31858745,1.7488276,0.6386698,0.758894,1.2883968,0.17838693,0.20247132,1.6250259,0.8688114,1.6884774,0.46924558,0.2778965,0.5041763,1.3240411,1.5307034,0.07698704,0.08331609,0.14573115,0.9030207,1.0258918,0.30767688 7 | 
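The example matrix above can be loaded directly with pandas for a quick look at the NMR feature space; a minimal sketch, assuming only the relative path from the repository tree and that the unnamed first column holds the sample identifiers:

import pandas as pd

# Load the bundled example feature matrix (sample IDs sit in the unnamed first column).
sample = pd.read_csv("analysis/examples/sample.csv", index_col=0)
print(sample.shape)                       # (n_samples, n_NMR_features)
print(sample.filter(like="NMR_").head())  # every feature column carries the NMR_ prefix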
-------------------------------------------------------------------------------- /metabolomicstatemodel/source/modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from omegaconf.listconfig import ListConfig 5 | 6 | 7 | class SingleLayerNet(nn.Module): 8 | def __init__(self, input_dim=32, output_dim=2, final_activation=None, final_batchnorm=False, **kwargs): 9 | super(SingleLayerNet, self).__init__() 10 | self.input_dim = input_dim 11 | self.output_dim = output_dim 12 | 13 | if final_activation is not None and isinstance(final_activation, str): 14 | m = final_activation.split('.') 15 | final_activation = getattr(nn, m[1]) 16 | print(final_activation) 17 | 18 | predictor_specs = [nn.Linear(self.input_dim, self.output_dim), ] 19 | if final_batchnorm: 20 | predictor_specs.append(nn.BatchNorm1d(self.output_dim)) 21 | if final_activation is not None: 22 | predictor_specs.append(final_activation()) 23 | self.predictor = nn.Sequential(*predictor_specs) 24 | 25 | def forward(self, input): 26 | fts = self.predictor(input) 27 | return fts 28 | 29 | 30 | class MLP(nn.Module): 31 | def __init__(self, 32 | input_dim=32, 33 | output_dim=2, 34 | hidden_dim=256, 35 | n_hidden_layers=None, 36 | activation="nn.SELU", 37 | dropout_fn='nn.Dropout', 38 | norm_fn='nn.BatchNorm1d', 39 | norm_layer="all", 40 | dropout_after_norm=True, 41 | input_norm=False, 42 | final_activation=None, 43 | final_norm=False, 44 | snn_init=True, 45 | dropout=0.5, **kwargs): 46 | """ 47 | A simple feed-forward neural network. 48 | :param input_dim: `int`, dimension ot the input features 49 | :param output_dim: `int`, dimension of the outlayer 50 | :param activation: `nn.Module`, NOT initialized. that is the activation of the last layer, if `None` no activation will be performed. 
51 | :param dropout: `float`, [<1], that specifies the dropout probability 52 | :param kwargs: 53 | """ 54 | super().__init__() 55 | self.input_dim = input_dim 56 | self.output_dim = output_dim 57 | norm_layer = norm_layer if isinstance(norm_layer, (list, tuple, ListConfig)) else [l for l in range(100)] 58 | self.hidden_dim = hidden_dim 59 | self.dropout = dropout 60 | if norm_fn is not None and isinstance(norm_fn, str): 61 | m = norm_fn.split('.') 62 | norm_fn = getattr(nn, m[1]) 63 | self.norm_fn = norm_fn 64 | if dropout_fn is not None and isinstance(dropout_fn, str): 65 | m = dropout_fn.split('.') 66 | dropout_fn = getattr(nn, m[1]) 67 | if activation is not None and isinstance(activation, str): 68 | m = activation.split('.') 69 | activation = getattr(nn, m[1]) 70 | print(activation) 71 | if final_activation is not None and isinstance(final_activation, str): 72 | m = final_activation.split('.') 73 | final_activation = getattr(nn, m[1]) 74 | print(final_activation) 75 | print(self.output_dim) 76 | 77 | if input_norm: 78 | self.input_norm = nn.LayerNorm(self.input_dim) 79 | else: 80 | self.input_norm = None 81 | 82 | if isinstance(hidden_dim, int): 83 | if isinstance(norm_layer, (list, tuple, ListConfig)): norm_fn = self.norm_fn if 0 in norm_layer else None 84 | else: norm_fn = None 85 | mlp_specs = [nn.Linear(input_dim, hidden_dim),] 86 | if dropout_after_norm == True: 87 | mlp_specs.extend([ 88 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 89 | dropout_fn(self.dropout),]) 90 | else: 91 | mlp_specs.extend([ 92 | dropout_fn(self.dropout), 93 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 94 | ]) 95 | mlp_specs.extend([activation(),]) 96 | 97 | for i in range(n_hidden_layers): 98 | if isinstance(norm_layer, (list, tuple, ListConfig)): norm_fn = self.norm_fn if i+1 in norm_layer else None 99 | else: norm_fn = None 100 | mlp_specs.extend([nn.Linear(hidden_dim, hidden_dim),]) 101 | if dropout_after_norm == True: 102 | mlp_specs.extend([ 103 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 104 | dropout_fn(self.dropout), ]) 105 | else: 106 | mlp_specs.extend([ 107 | dropout_fn(self.dropout), 108 | norm_fn(hidden_dim) if norm_fn is not None else nn.Identity(), 109 | ]) 110 | mlp_specs.extend([activation(),]) 111 | self.mlp = nn.Sequential(*mlp_specs) 112 | predictor_specs = [ 113 | nn.Linear(hidden_dim, self.output_dim), 114 | ] 115 | elif isinstance(hidden_dim, (list, tuple, ListConfig)): 116 | assert n_hidden_layers is None, 'Either pass list of hidden_dims, or n_hidden_layers with single hidden_dim' 117 | mlp_specs = [] 118 | for i, h in enumerate(hidden_dim): 119 | if isinstance(norm_layer, (list, tuple, ListConfig)): norm_fn = self.norm_fn if i in norm_layer else None 120 | else: norm_fn = None 121 | mlp_specs.extend([nn.Linear(input_dim if i==0 else hidden_dim[i-1], h),]) 122 | if dropout_after_norm == True: 123 | mlp_specs.extend([ 124 | norm_fn(h) if norm_fn is not None else nn.Identity(), 125 | dropout_fn(self.dropout)]) 126 | else: 127 | mlp_specs.extend([ 128 | dropout_fn(self.dropout), 129 | norm_fn(h) if norm_fn is not None else nn.Identity(), 130 | ]) 131 | mlp_specs.extend([activation(),]) 132 | self.mlp = nn.Sequential(*mlp_specs) 133 | predictor_specs = [ 134 | nn.Linear(hidden_dim[-1], self.output_dim), 135 | ] 136 | else: 137 | raise ValueError('hidden_dim is either int or list of ints') 138 | 139 | if final_norm: 140 | predictor_specs.append(self.norm_fn(self.output_dim)) 141 | if final_activation is not None: 142 | 
predictor_specs.append(final_activation()) 143 | 144 | self.predictor = nn.Sequential(*predictor_specs) 145 | 146 | if snn_init: 147 | self.reset_parameters('predictor') 148 | self.reset_parameters('mlp') 149 | 150 | def forward(self, input): 151 | if self.input_norm is not None: 152 | input = self.input_norm(input) 153 | fts = self.mlp(input) 154 | output = self.predictor(fts) 155 | return output 156 | 157 | def reset_parameters(self, name): 158 | for layer in getattr(self, name): 159 | if not isinstance(layer, nn.Linear): 160 | continue 161 | nn.init.normal_(layer.weight, std=1 / math.sqrt(layer.out_features)) 162 | if layer.bias is not None: 163 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(layer.weight) 164 | bound = 1 / math.sqrt(fan_in) 165 | nn.init.uniform_(layer.bias, -bound, bound) 166 | 167 | 168 | class ResidualHeadMLP(nn.Module): 169 | def __init__(self, 170 | predictor_mlp=MLP, 171 | predictor_mlp_kwargs=dict(input_dim=None, 172 | output_dim=None, 173 | hidden_dim=None, 174 | activation="nn.SiLU", 175 | dropout_fn='nn.Dropout', 176 | dropout=0.2, 177 | final_activation="nn.SiLU", 178 | final_batchnorm=False), 179 | skip_connection_mlp=MLP, 180 | skip_connection_input_dim=32, 181 | skip_connection_mlp_kwargs=dict(input_dim=None, 182 | output_dim=None, 183 | hidden_dim=None, 184 | activation="nn.SiLU", 185 | dropout_fn='nn.Dropout', 186 | dropout=0.2, 187 | final_activation="nn.SiLU", 188 | final_batchnorm=False), 189 | mlp=MLP, 190 | mlp_kwargs=dict(input_dim=None, 191 | output_dim=None, 192 | hidden_dim=None, 193 | activation="nn.SiLU", 194 | dropout_fn='nn.Dropout', 195 | dropout=0.2, 196 | final_activation="nn.SiLU", 197 | final_batchnorm=False), 198 | **kwargs): 199 | super().__init__() 200 | self.skip_connection_input_dim = skip_connection_input_dim 201 | 202 | if predictor_mlp is not None and isinstance(predictor_mlp, str): 203 | self.predictor_mlp = eval(predictor_mlp) 204 | if skip_connection_mlp is not None and isinstance(skip_connection_mlp, str): 205 | self.skip_connection_mlp = eval(skip_connection_mlp) 206 | if mlp is not None and isinstance(mlp, str): 207 | self.mlp = eval(mlp) 208 | 209 | skip_connection_mlp_kwargs['input_dim'] = self.skip_connection_input_dim 210 | 211 | self.predictor = self.predictor_mlp(**predictor_mlp_kwargs) 212 | self.skip_connection = self.skip_connection_mlp(**skip_connection_mlp_kwargs) 213 | self.mlp = self.mlp(**mlp_kwargs) 214 | 215 | def forward(self, input): 216 | features, covariates = input 217 | fts = self.mlp(features) 218 | skip_fts = self.skip_connection(covariates) 219 | h = fts + skip_fts 220 | out = self.predictor(h) 221 | return out 222 | 223 | 224 | class MLPResNetBlock(nn.Module): 225 | """ 226 | MLP version of the ResBlock wrapped by TemporalBlock from: 227 | https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/unet.py#L143 228 | 229 | with less complexity and fts. 
230 | """ 231 | def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, dropout=0.3, 232 | embedding_dim=16, 233 | use_scale_shift_norm=False, 234 | temporal_embedding=False): 235 | super().__init__() 236 | self.input_dim = input_dim 237 | self.hidden_dim = hidden_dim 238 | self.output_dim = output_dim 239 | self.dropout = dropout 240 | self.use_scale_shift_norm = use_scale_shift_norm 241 | self.temporal_embedding=temporal_embedding 242 | 243 | if temporal_embedding: 244 | self.emb_layers = nn.Sequential( 245 | nn.SiLU(), 246 | nn.Linear(embedding_dim, 247 | 2 * self.output_dim if use_scale_shift_norm else self.output_dim), 248 | ) 249 | 250 | self.in_layers = nn.Sequential( 251 | nn.Linear(self.input_dim, self.hidden_dim), 252 | nn.BatchNorm1d(self.hidden_dim), 253 | nn.SiLU(), 254 | nn.Dropout(self.dropout), 255 | nn.Linear(self.hidden_dim, self.output_dim), 256 | nn.BatchNorm1d(self.output_dim), 257 | nn.SiLU(), 258 | nn.Dropout(self.dropout), 259 | ) 260 | 261 | self.skip_connection = nn.Identity() if self.input_dim==self.output_dim else \ 262 | nn.Sequential( 263 | nn.Linear(self.input_dim, self.output_dim), 264 | nn.BatchNorm1d(self.output_dim), 265 | nn.SiLU() 266 | ) 267 | 268 | self.out_layers = nn.Sequential( 269 | nn.SiLU(), 270 | nn.Dropout(p=self.dropout), 271 | nn.Linear(self.output_dim, self.output_dim) 272 | ) 273 | 274 | def forward(self, x, emb): 275 | h = self.in_layers(x) 276 | emb_out = self.emb_layers(emb).type(h.dtype) 277 | while len(emb_out.shape) < len(h.shape): 278 | emb_out = emb_out[..., None] 279 | if self.use_scale_shift_norm: 280 | out_norm, out_rest = self.out_layers[0], self.out_layers[1:] 281 | scale, shift = torch.chunk(emb_out, 2, dim=1) 282 | h = out_norm(h) * (1 + scale) + shift 283 | h = out_rest(h) 284 | else: 285 | h = h + emb_out 286 | h = self.out_layers(h) 287 | return self.skip_connection(x) + h 288 | 289 | def reset_parameters(self, name): 290 | for layer in getattr(self, name): 291 | if not isinstance(layer, nn.Linear): 292 | continue 293 | nn.init.normal_(layer.weight, std=1 / math.sqrt(layer.out_features)) 294 | if layer.bias is not None: 295 | fan_in, _ = nn.init._calculate_fan_in_and_fan_out(layer.weight) 296 | bound = 1 / math.sqrt(fan_in) 297 | nn.init.uniform_(layer.bias, -bound, bound) 298 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | 5 | import pathlib 6 | 7 | import torch 8 | 9 | from collections import OrderedDict, abc as container_abcs 10 | from torch._six import string_classes 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.utils.data.sampler import Sampler 13 | 14 | 15 | class RepeatIterator(object): 16 | """ 17 | creates an iterable which returns each integer in range(length) n_times times. 
18 | example: next(RepeatIterator(2,3)) would return: 0,0,1,1,2,2,3,3 19 | """ 20 | def __init__(self, n_times, length): 21 | self.n_times = n_times 22 | self.length = length 23 | self.idx = 0 24 | self.reps = 0 25 | 26 | def __iter__(self): 27 | return self 28 | 29 | def __next__(self): 30 | #print(f"Number of TTA views Iterator: {self.n_times}") 31 | if self.reps < self.n_times: 32 | self.reps += 1 33 | else: 34 | self.idx += 1 35 | self.reps = 1 36 | if self.idx < self.length-1: 37 | return self.idx 38 | else: 39 | raise StopIteration 40 | 41 | 42 | class RepeatedSampler(Sampler): 43 | """ 44 | Sampler class that wraps the RepeatIterator. Can be used in dataloaders. 45 | """ 46 | def __init__(self, n_times, data_source): 47 | self.n_times = n_times 48 | self.ds_length = len(data_source) 49 | super().__init__(data_source=data_source) 50 | #print(f"Number of TTA views Sampler: {self.n_times}") 51 | #print(f"ds length Sampler: {self.ds_length}") 52 | self.iterator = RepeatIterator(self.n_times, self.ds_length) 53 | 54 | def __iter__(self): 55 | return self.iterator 56 | 57 | def __len__(self): 58 | return int(self.ds_length)*self.n_times 59 | 60 | 61 | class TabularDataset(Dataset): 62 | """ 63 | Dataset wrapper to sit ontop of a feather file, and read specific columns 64 | """ 65 | 66 | def __init__(self, data_fp, features, normalization_dict=None, eid_selection_mask=None, oversampling=None): 67 | super().__init__() 68 | """ 69 | Create a dataset to read h5ad files. 70 | Currently a bit ugly as we create a pd.DataFrame holding the entire dataset. We need this to perform efficient eid selection using df.loc. 71 | df.loc is the perfect method to do that since it sorts the datamodules to the passed argument as well. We can thus make sure that multiple h5adDatasets are in the same order. 72 | :param h5ad_fp: `str`, the filepath to the h5ad that should be read. 73 | :param features: `list` or list-like, contains the strings to select the features to be returned from the h5ad. 74 | :param eid_selection_mask: `list` or list-like, optional (default `None`), contains the eids to select. 75 | """ 76 | # determine wheter file to read is .csv or .feather: 77 | ext = os.path.splitext(data_fp)[1] 78 | assert ext in ['.csv', '.feather'], 'TabularDataset only supports .csv and .feather files' 79 | print(data_fp) 80 | base = pathlib.Path(data_fp).parents[2] 81 | description_fp = os.path.join(base, f'description{ext}') 82 | assert os.path.exists(description_fp), f'Description file not found in {description_fp}' 83 | 84 | # read datamodules: 85 | read_method = pd.read_feather if ext == '.feather' else pd.read_csv 86 | data = read_method(data_fp) 87 | description = read_method(description_fp) 88 | 89 | for f in features: 90 | if f not in data.columns.values: 91 | print(f) 92 | assert all([c in data.columns.values for c in features]), \ 93 | 'Not all passed features were found in datamodules file columns' 94 | 95 | self.features = features 96 | description = description.query("covariate==@self.features") 97 | self.eid_map = data[["eid"]+self.features].copy().astype({'eid': 'int32'}).set_index('eid') 98 | 99 | if eid_selection_mask is not None: #self.eid_map = self.eid_map.reset_index().query("eid == @eid_selection_map").set_index("eid") 100 | ## find intersection of mask and eids: 101 | eid_selection_mask = [int(i) for i in eid_selection_mask] # make sure its int! 
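            # Editorial sketch, not repository code: the intersection below keeps only
            # eids present in both the data file and the caller-supplied mask, e.g.
            #   eid_map.index = [11, 12, 13, 14]  and  mask = [12, 14, 99]  ->  kept [12, 14].
            # The following print then reports the excluded rows, and .loc re-indexes the
            # frame to the intersected index so that multiple TabularDatasets built from
            # the same files and mask stay in the same row order (the assumption
            # DatasetWrapper documents further below).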
102 | #faulty_ids = [i for i in eid_selection_mask if i not in self.eid_map.index.values] 103 | eids_intersection = self.eid_map.index.intersection(eid_selection_mask) 104 | print(f"{len(self.eid_map)-len(eids_intersection)} eids excluded") 105 | self.eid_map = self.eid_map.loc[eids_intersection,:] # make sure this is sorted. 106 | print(len(self.eid_map)) 107 | # normalize values 108 | if normalization_dict is not None: 109 | self.eid_map = self.normalize_df_fixed_params(self.eid_map, normalization_dict) 110 | 111 | # get the idxs of categorical vars: 112 | self.categoricals = description.query("covariate ==@self.features").\ 113 | query("dtype in ['category', 'bool']").covariate.values 114 | self.continuous = description.query("covariate ==@self.features").\ 115 | query("dtype in ['int', 'float']").covariate.values 116 | self.categorical_idxs = [self.eid_map.columns.tolist().index(v) for v in self.categoricals] 117 | self.continuous_idxs = [self.eid_map.columns.tolist().index(v)for v in self.continuous] 118 | 119 | for f in self.features: 120 | self.eid_map[f] = self.eid_map[f].astype(float) 121 | del data 122 | 123 | def normalize_df_fixed_params(self, df, param_dict): 124 | """ 125 | Normalize pd.DF column-wise. 126 | :param df: 127 | :param param_dict: 'dictionary', contains columns of the df as key and tuple (min, max) as scaling factors for spec column 128 | :return: 129 | """ 130 | print('normalizing datamodules...') 131 | for key in param_dict.keys(): 132 | assert key in df.columns 133 | for col in df.columns: 134 | if col in list(param_dict.keys()): 135 | min = param_dict[col][0] 136 | max = param_dict[col][1] 137 | df[col] = (df[col] - min) / (max - min + 0.00001) 138 | return df 139 | 140 | def __getitem__(self, idx): 141 | fts = self.eid_map.values[idx, :] 142 | return torch.Tensor(fts) 143 | 144 | def __len__(self): 145 | return self.eid_map.shape[0] 146 | 147 | 148 | class ExclusionMaskDataset(Dataset): 149 | def __init__(self, data_fp, exclusion_criteria_dict, eid_selection_mask=None): 150 | super().__init__() 151 | # determine wheter file to read is .csv or .feather: 152 | ext = os.path.splitext(data_fp)[1] 153 | assert ext in ['.csv', '.feather'], 'TabularDataset only supports .csv and .feather files' 154 | print(data_fp) 155 | base = pathlib.Path(data_fp).parents[2] 156 | description_fp = os.path.join(base, f'description{ext}') 157 | assert os.path.exists(description_fp), f'Description file not found in {description_fp}' 158 | 159 | # read data: 160 | read_method = pd.read_feather if ext == '.feather' else pd.read_csv 161 | data = read_method(data_fp) 162 | 163 | # store 164 | self.eid_map = data.copy().astype({'eid': 'int32'}).set_index('eid') 165 | 166 | # apply general exclusion criteria: 167 | if eid_selection_mask is not None: 168 | ## find intersection of mask and eids: 169 | eid_selection_mask = [int(i) for i in eid_selection_mask] # make sure its int! 170 | #faulty_ids = [i for i in eid_selection_mask if i not in self.eid_map.index.values] 171 | eids_intersection = self.eid_map.index.intersection(eid_selection_mask) 172 | print(f"{len(self.eid_map)-len(eids_intersection)} eids excluded") 173 | self.eid_map = self.eid_map.loc[eids_intersection,:] # make sure this is sorted. 
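            # Editorial sketch with hypothetical task/column names, not repository code:
            # the exclusion_criteria_dict consumed a few lines below maps one task name to
            # one pandas query string, e.g.
            #   {"myocardial_infarction": "myocardial_infarction_prevalent == 1"}
            # generate_exclusion_masks() sets <task>_exclusion_mask to 0 for every eid
            # matching the query and to 1 for everyone else.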
174 | print(len(self.eid_map)) 175 | 176 | # excl: 177 | self.exclusion_criteria_dict = exclusion_criteria_dict 178 | self.generate_exclusion_masks() 179 | self.eid_map = self.eid_map[[v for v in self.eid_map.columns if v.endswith('exclusion_mask')]].astype(float) 180 | 181 | del data 182 | 183 | def generate_exclusion_masks(self): 184 | for task in self.exclusion_criteria_dict.keys(): 185 | eids = self.eid_map.query(self.exclusion_criteria_dict[task]).index.to_list() 186 | 187 | self.eid_map[f'{task}_exclusion_mask'] = 1 188 | self.eid_map.loc[eids, f'{task}_exclusion_mask'] = 0 189 | 190 | print(task, self.eid_map.shape[0], self.eid_map[f'{task}_exclusion_mask'].sum()) 191 | 192 | def __getitem__(self, idx): 193 | fts = self.eid_map.values[idx, :] 194 | return torch.Tensor(fts) 195 | 196 | def __len__(self): 197 | return self.eid_map.shape[0] 198 | 199 | 200 | class BatchedDS(Dataset): 201 | def __init__(self, dataset, batch_size, attrs=None): 202 | attrs = ['durations', 'events', ] if attrs is None else attrs 203 | for attr in attrs: 204 | try: 205 | setattr(self, attr, getattr(dataset, attr)) 206 | except: 207 | print('Dataset has not attribute %s' % attr) 208 | 209 | self.len = len(dataset) 210 | self.dataset = dataset 211 | self.batch_size = batch_size 212 | 213 | def __len__(self): 214 | return self.len // self.batch_size 215 | 216 | def __getitem__(self, idx): 217 | return self.dataset[idx*self.batch_size:idx*self.batch_size+self.batch_size] 218 | 219 | @staticmethod 220 | def default_collate(batch): 221 | r"""Puts each datamodules field into a tensor with outer dimension batch size""" 222 | elem = batch[0] 223 | elem_type = type(elem) 224 | if isinstance(elem, torch.Tensor): 225 | out = None 226 | if torch.utils.data.get_worker_info() is not None: 227 | # If we're in a background process, concatenate directly into a 228 | # shared memory tensor to avoid an extra copy 229 | numel = sum([x.numel() for x in batch]) 230 | storage = elem.storage()._new_shared(numel) 231 | out = elem.new(storage) 232 | return torch.cat(batch, 0, out=out) 233 | elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ 234 | and elem_type.__name__ != 'string_': 235 | elem = batch[0] 236 | if elem_type.__name__ == 'ndarray': 237 | 238 | return BatchedDS.default_collate([torch.as_tensor(b) for b in batch]) 239 | elif elem.shape == (): # scalars 240 | return torch.as_tensor(batch) 241 | elif isinstance(elem, float): 242 | return torch.tensor(batch, dtype=torch.float64) 243 | elif isinstance(elem, int): 244 | return torch.tensor(batch) 245 | elif isinstance(elem, string_classes): 246 | return batch 247 | elif isinstance(elem, container_abcs.Mapping): 248 | return {key: BatchedDS.default_collate([d[key] for d in batch]) for key in elem} 249 | elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple 250 | return elem_type(*(BatchedDS.default_collate(samples) for samples in zip(*batch))) 251 | elif isinstance(elem, container_abcs.Sequence): 252 | # check to make sure that the elements in batch have consistent size 253 | it = iter(batch) 254 | elem_size = len(next(it)) 255 | if not all(len(elem) == elem_size for elem in it): 256 | raise RuntimeError('each element in list of batch should be of equal size') 257 | transposed = zip(*batch) 258 | return [BatchedDS.default_collate(samples) for samples in transposed] 259 | 260 | 261 | class DatasetWrapper(Dataset): 262 | """ 263 | Wrap multiple datasets (datamodules) with labels (labels). 
264 | Assumes all passed datasets have the same order. 265 | """ 266 | def __init__(self, 267 | covariate_datasets, 268 | label_datasets): 269 | """ 270 | Wrap multiple datasets (datamodules) with labels (labels). 271 | Assumes all passed datasets have the same order (eid-wise). 272 | 273 | :param covariate_datasets: `list-like`, should contain datasets, samples all in the same order 274 | :param label_dataset: `list-like`, shoudl contain datsets 275 | """ 276 | assert all(len(ds) == len(label_datasets[0]) 277 | for ds in covariate_datasets + label_datasets), 'datasets need to be same length' 278 | self.datasets = covariate_datasets 279 | self.label_datasets = label_datasets 280 | 281 | @property 282 | def eid_map(self): 283 | return self.datasets[0].eid_map.values 284 | 285 | @property 286 | def durations(self): 287 | return self.label_datasets[0].eid_map.values 288 | 289 | @property 290 | def events(self): 291 | return self.label_datasets[1].eid_map.values 292 | 293 | def __len__(self): 294 | return len(self.label_datasets[0]) 295 | 296 | def __getitem__(self, idx): 297 | # return a tuple for datasets and a tuple for whatever is in labels 298 | # ((dataset1, dataset2, dataset3, ..)(duration, labels)) 299 | covariates = tuple([ds[idx] for ds in self.datasets]) if len(self.datasets) > 1 else self.datasets[0][idx] 300 | labels = tuple([ds[idx] for ds in self.label_datasets]) if len(self.label_datasets) > 1 else self.label_datasets[0][idx] 301 | 302 | return covariates, labels 303 | 304 | 305 | class LabelPlaceHolderDataset(Dataset): 306 | """ 307 | This dataset is to be used as a Mockup for the labels dataset in the datasetwrapper if no lablels are needed. 308 | """ 309 | def __init__(self, eids, feature_dim=10): 310 | super().__init__() 311 | # construc mockup: 312 | self.feature_dim = feature_dim 313 | mockup_labels = np.zeros((len(eids), )) 314 | self.eid_map = pd.DataFrame(np.stack([np.asarray(eids), mockup_labels], axis=-1), 315 | columns=['eid', 'MockUpCol']).set_index('eid') 316 | def __getitem__(self, idx): 317 | fts = np.zeros(self.feature_dim) 318 | return torch.Tensor(fts) 319 | 320 | def __len__(self): 321 | return self.eid_map.shape[0] 322 | -------------------------------------------------------------------------------- /analysis/preprocessing/pipeline_metabolomics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import pandas as pd 4 | import numpy as np 5 | import prefect as pf 6 | import miceforest as mf 7 | from prefect.engine.results import LocalResult 8 | from prefect.engine.flow_runner import FlowRunner 9 | from prefect.engine.serializers import JSONSerializer 10 | from prefect.executors import LocalDaskExecutor 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.preprocessing import StandardScaler 13 | from collections import OrderedDict 14 | from category_encoders.ordinal import OrdinalEncoder 15 | import pickle 16 | 17 | 18 | output_directory = '/your/output/dir/' 19 | output_name = 'your_dataset_name' 20 | 21 | json_serializer = JSONSerializer() 22 | 23 | 24 | class ApplyImputer(pf.Task): 25 | """ 26 | Takes a list of tuples, where the first pos is the eids_dict, the second is the kernel, the third is the split. 27 | Then applies imputer and saves to file. 28 | """ 29 | def __init__(self, *args, **kwargs): 30 | super().__init__(*args, **kwargs) 31 | 32 | def _update_target(self, cv_partition, split): 33 | """ 34 | Update Target string at runtime. 
35 | :return: 36 | """ 37 | self.target = f"partition_{cv_partition}/{split}_baseline_imputed.csv" 38 | 39 | def run(self, partition_split_dict): 40 | """ 41 | split tuple is a tuple in the form of 42 | ( (partition_idx, eids_dict, (data_merged, data_merged_description) ), imputer, split) 43 | :param partition_split_dict: 44 | :return: 45 | """ 46 | split = partition_split_dict['split'] 47 | partition = partition_split_dict["cv_partition"] 48 | eids = partition_split_dict['eids_dict'][split] 49 | 50 | assert split in ['test', 'train', 'valid'] 51 | self._update_target(partition_split_dict['cv_partition'], split) 52 | data = partition_split_dict['data'].loc[eids] 53 | 54 | # Save partitions 55 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}/{split}" 56 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 57 | data.reset_index().to_feather(f"{data_output_path}/data.feather") 58 | 59 | # Impute data 60 | with open(partition_split_dict['imputer_path'], "rb") as input_file: imputer = pickle.load(input_file) 61 | data_imputed = imputer.impute_new_data(new_data=data).complete_data() 62 | partition_split_dict['data'] = data_imputed 63 | 64 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}/{split}" 65 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 66 | data_imputed.reset_index().to_feather(f"{data_output_path}/data_imputed.feather") 67 | 68 | return partition_split_dict 69 | 70 | 71 | class ApplyNorm(pf.Task): 72 | """ 73 | Takes a list of tuples, where the first pos is the eid_dict, the second is the kernel, the third is the split. 74 | Then applies imputer and saves to file. 75 | """ 76 | def __init__(self, *args, **kwargs): 77 | super().__init__(*args, **kwargs) 78 | 79 | def _update_target(self, cv_partition, split): 80 | """ 81 | Update Target string at runtime. 
82 | :return: 83 | """ 84 | self.target = f"partition_{cv_partition}/{split}_baseline_imputed_normalized.csv" 85 | 86 | def run(self, partition_split_dict): 87 | """ 88 | DICT 89 | :param partition_split_dict: 90 | :return: 91 | """ 92 | split = partition_split_dict['split'] 93 | partition = partition_split_dict['cv_partition'] 94 | self._update_target(partition, split) 95 | 96 | description = partition_split_dict['description'] 97 | 98 | noncategorical_covariates = description.reset_index() \ 99 | .set_index('dtype').loc[['int', "float"]] \ 100 | .query("(isTarget == False) & (based_on != 'diagnoses_emb') & (based_on != 'eid')")['covariate'].values 101 | 102 | 103 | noncat_data = partition_split_dict['data'][noncategorical_covariates].copy() 104 | 105 | # log 1p transform!: 106 | for c in noncat_data.columns: 107 | if c.startswith('NMR'): 108 | noncat_data[c] = np.log1p(noncat_data[c].values) 109 | 110 | noncat_data = noncat_data.values 111 | 112 | noncat_data = pd.DataFrame(partition_split_dict['normalizer'].transform(noncat_data), 113 | columns=noncategorical_covariates) 114 | 115 | for v in noncategorical_covariates: 116 | partition_split_dict['data'][v] = noncat_data[v].values 117 | 118 | # save preprocessed data 119 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}/{split}" 120 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 121 | partition_split_dict['data'].reset_index().to_feather(f"{data_output_path}/data_imputed_normalized.feather") 122 | 123 | # return partition_split_dict 124 | return partition_split_dict['data'] 125 | 126 | @pf.task(target="data_merged_dict.p", 127 | checkpoint=True, 128 | log_stdout=True, 129 | result=LocalResult(dir=f"{output_directory}/{output_name}") 130 | ) 131 | def read_and_merge_data(covariate_paths, input_data_dir): 132 | logger = pf.context.get("logger") 133 | logger.info("Data") 134 | data_dfs = [pd.read_feather(f"{input_data_dir}/{covariate_paths[covariate][0]}").set_index("eid") for covariate in covariate_paths] 135 | data_merged = pd.concat(data_dfs, axis=1).copy() 136 | output_path = f"{output_directory}/{output_name}" 137 | 138 | data_merged.reset_index().to_feather(f"{output_path}/data_merged.feather") 139 | 140 | logger.info("Descriptions") 141 | description_dfs = [pd.read_feather(f"{input_data_dir}/{covariate_paths[covariate][1]}") for covariate in covariate_paths] 142 | description_merged = pd.concat([df if i == 0 else df.tail(-1) for i, df in enumerate(description_dfs)], axis=0).reset_index() 143 | description_merged.reset_index(drop=True).to_feather(f"{output_path}/description_merged.feather") 144 | 145 | return {"data": data_merged.query('NMR_FLAG==True'), "description": description_merged} 146 | 147 | @pf.task(name="encode_categoricals", 148 | target="data_encoded.p", 149 | checkpoint=True, 150 | result=LocalResult(dir=f"{output_directory}/{output_name}") 151 | ) 152 | def encode_categoricals(data_dict): 153 | logger = pf.context.get("logger") 154 | data = data_dict["data"] 155 | description = data_dict["description"] 156 | 157 | cat_cols = [c for c in description.set_index("dtype").loc[["category"]].covariate.to_list() if "date" not in c] 158 | 159 | mapping = [{"col": c, "mapping": {e: i for i, e in enumerate([v for v in data[c].unique().tolist() if v==v])}} for c in cat_cols] 160 | for i, c in enumerate(cat_cols): mapping[i]["mapping"].update({np.nan: -2}) 161 | 162 | enc = OrdinalEncoder(cols=cat_cols, mapping=mapping, handle_missing="return_nan") 163 | data = enc.fit_transform(data) 
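    # Editorial sketch with a hypothetical column, not repository code: each entry of
    # the `mapping` list built above enumerates the observed categories in order of
    # appearance and routes NaN to -2, e.g. for a binary column "sex":
    #   {"col": "sex", "mapping": {"Female": 0, "Male": 1, nan: -2}}
    # OrdinalEncoder(handle_missing="return_nan") rewrites the column with these integer
    # codes; columns with more than two categories are additionally one-hot encoded in
    # the loop below.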
164 | 165 | description["mapping"] = np.nan 166 | for i, c in enumerate(cat_cols): 167 | description.loc[description.covariate == c, 'mapping'] = str(enc.mapping[i]["mapping"]) 168 | if data[c].nunique() > 2: 169 | ohe_encoded = pd.get_dummies(data[c], prefix=c) 170 | data[ohe_encoded.columns] = ohe_encoded 171 | for col in ohe_encoded.columns: 172 | description = description.append( 173 | {"covariate": col, "dtype": "bool", "isTarget": False, 174 | "based_on": description.loc[description.covariate == c, "based_on"].iloc[0], 175 | "aggr_fn": np.nan, "mapping": str(enc.mapping[i]["mapping"])}, ignore_index=True) 176 | description["based_on"] = description["based_on"].astype(str) 177 | 178 | description.reset_index(drop=True).to_feather(f"{output_directory}/{output_name}/description.feather") 179 | 180 | logger.info(f"{len(cat_cols)} columns one-hot-encoded") 181 | return {"data": data, "description": description} 182 | 183 | @pf.task(name="apply_exclusion_criteria", 184 | target="data_merged_excluded_dict.p", 185 | checkpoint=True, 186 | result=LocalResult(dir=f"{output_directory}/{output_name}") 187 | ) 188 | def apply_exclusion_criteria(data_dict, exclusion_criteria): 189 | logger = pf.context.get("logger") 190 | data = data_dict["data"] 191 | data_excl = data.copy().query(exclusion_criteria).reset_index(drop=False).set_index("eid") 192 | output_path = f"{output_directory}/{output_name}" 193 | data_excl.reset_index().to_feather(f"{output_path}/data_excl.feather") 194 | logger.info(f"{len(data)-len(data_excl)} eids excluded") 195 | return {"data": data, "description": data_dict["description"]} 196 | 197 | @pf.task(name="get_eids_for_partitions", 198 | target=f"eids.json", 199 | checkpoint=True, 200 | result=LocalResult(dir=f"{output_directory}/{output_name}", serializer=json_serializer) 201 | ) 202 | 203 | def get_eids_for_partitions(data_dict, partition_column, valid_size=0.1): 204 | logger = pf.context.get("logger") 205 | 206 | data_all = data_dict["data"] 207 | eids_all = data_all.index.values 208 | groups = data_all.reset_index().set_index(partition_column).index.value_counts().index.to_list() 209 | splits = {i: data_all.query(f"{partition_column}==@group").index.tolist() for i, group in enumerate(groups)} 210 | 211 | eids_dict = OrderedDict() 212 | for partition in range(len(groups)): 213 | eids_dict[partition] = {} 214 | eids_test = splits[partition] 215 | eids_notest = sorted(list(set(eids_all) - set(eids_test))) 216 | eids_train, eids_valid = train_test_split(eids_notest, test_size=valid_size, shuffle=False) 217 | 218 | if bool(set(eids_train) & set(eids_valid) & set(eids_test)) == True: 219 | logger.warning(f"Overlap of eids in partition {partition}") 220 | else: 221 | logger.info(f"No overlap of eids in partition {partition}") 222 | 223 | eids_dict[partition]["train"] = eids_train 224 | eids_dict[partition]["valid"] = eids_valid 225 | eids_dict[partition]["test"] = eids_test 226 | 227 | return eids_dict 228 | 229 | @pf.task 230 | def get_partitions(data_dict, eids_dict): 231 | partition_dicts = [{**data_dict, 'cv_partition': partition_idx, 'eids_dict': eids_dict[partition_idx]} for partition_idx in eids_dict.keys()] 232 | return partition_dicts 233 | 234 | 235 | @pf.task(name="fit_imputer", 236 | target="{task_name}/{task_full_name}_kernel.p", 237 | checkpoint=True, 238 | result=LocalResult(dir=os.path.join(output_directory, output_name, "pipeline/")) 239 | ) 240 | def fit_imputer(partition_dict): 241 | """ 242 | Fit an imputer to train set and pickle it 243 | (partition_idx, 
eids_dict, (data, data_descr) ) 244 | """ 245 | eids_train = partition_dict['eids_dict']['train'] 246 | data = partition_dict['data'].loc[eids_train] 247 | partition = partition_dict["cv_partition"] 248 | 249 | missing = data.columns[data.isna().any()].to_list() 250 | missing = [col for col in missing if not "NMR_measurement_quality_flagged" in col] 251 | 252 | events = [col for col in data.columns if "_event" in col] 253 | 254 | variable_schema = {} 255 | for m in missing: 256 | variable_schema[m] = [x for x in missing if x != m]+events 257 | kernel = mf.KernelDataSet(data, 258 | variable_schema=variable_schema, 259 | save_all_iterations=True, 260 | random_state=42) 261 | 262 | # Run the MICE algorithm for 3 iterations 263 | kernel.mice(3, n_jobs=1, n_estimators=8, 264 | max_features="sqrt", bootstrap=True, max_depth=8, verbose=True) 265 | 266 | data_output_path = f"{output_directory}/{output_name}/partition_{partition}" 267 | pathlib.Path(data_output_path).mkdir(parents=True, exist_ok=True) 268 | 269 | imputer_path = f"{data_output_path}/imputer.p" 270 | with open(imputer_path, "wb") as output_file: pickle.dump(kernel, output_file) 271 | del kernel 272 | return imputer_path 273 | 274 | @pf.task 275 | def get_splits_per_partition(partition_dict, imputer_path, splits): 276 | partition_split_dicts = [{**partition_dict, 'imputer_path': imputer_path, 'split': s} for s in splits] 277 | return partition_split_dicts 278 | 279 | @pf.task(name="fit_normalization", 280 | target="{task_name}/{task_full_name}_norm.p", 281 | checkpoint=True, 282 | result=LocalResult(dir=os.path.join(output_directory, output_name, "pipeline/")) 283 | ) 284 | def fit_normalization(partition_split_dicts): 285 | """ 286 | Fit an imputer to train set and pickle it. 287 | 288 | imputed_tuples should be a list of dicts of the form: 289 | data_imputed is the imputed data for a split in the partition for partition idx 290 | 291 | """ 292 | # first get vars: 293 | description = partition_split_dicts[0]['description'] 294 | noncategorical_covariates = description.reset_index() \ 295 | .set_index('dtype').loc[['int', "float"]] \ 296 | .query("(isTarget == False) & (based_on != 'diagnoses_emb') & (based_on != 'eid')")['covariate'].values 297 | 298 | # fit normalizer for each train split: 299 | fitted_normalizers = {} 300 | for d in partition_split_dicts: 301 | if d['split'] == 'train': 302 | if 'eid' in d['data'].columns: 303 | data = d['data'].set_index('eid') 304 | else: 305 | data = d['data'] 306 | noncategorical_data = data[noncategorical_covariates] 307 | 308 | # log 1p transform!: 309 | for c in noncategorical_data.columns: 310 | if c.startswith('NMR'): 311 | noncategorical_data[c] = np.log1p(noncategorical_data[c].values) 312 | 313 | noncategorical_data = noncategorical_data.values 314 | 315 | norm = StandardScaler(with_mean=True, with_std=True, copy=True).fit(noncategorical_data) 316 | fitted_normalizers[d['cv_partition']] = norm 317 | 318 | partition_split_dicts = [{**d, 'normalizer': fitted_normalizers[d['cv_partition']]} for d in partition_split_dicts] 319 | return partition_split_dicts 320 | 321 | 322 | Impute = ApplyImputer(name="apply_imputer", 323 | target=f"partition_23/baseline_imputed.csv", 324 | checkpoint=True, 325 | result=LocalResult(dir=f"{output_directory}/{output_name}/cv_partitions/"), 326 | # serializer=pd_serializer) 327 | ) 328 | 329 | Normalize = ApplyNorm(name="apply_norm", 330 | target=f"partition_23/baseline_imputed_normalized.csv", 331 | checkpoint=True, 332 | 
result=LocalResult(dir=f"{output_directory}/{output_name}/cv_partitions/"), 333 | # serializer=pd_serializer) 334 | ) 335 | 336 | with pf.Flow("ukb_pipeline") as flow: 337 | input_data_dir = pf.Parameter('input_data', 338 | default=f'{output_name}/2_datasets_pre/210709_metabolomics/') 339 | partition_column = pf.Parameter('partition_column', default="uk_biobank_assessment_centre") 340 | valid_size = pf.Parameter('valid_size', default=0.1) 341 | data_filenames = { 342 | "covariates": ("baseline_covariates.feather", "baseline_covariates_description.feather"), 343 | "pgs": ("baseline_pgs.feather", "baseline_pgs_description.feather"), 344 | "endpoints": ("baseline_endpoints.feather", "baseline_endpoints_description.feather"), 345 | } 346 | 347 | 348 | data_dict = read_and_merge_data(data_filenames, input_data_dir) 349 | data_dict = encode_categoricals(data_dict) 350 | eids_dict = get_eids_for_partitions(data_dict, partition_column=partition_column, valid_size=valid_size) 351 | partition_dicts = get_partitions(data_dict, eids_dict) 352 | 353 | # fit imputer per partition 354 | imputer_paths = fit_imputer.map(partition_dict=partition_dicts) 355 | 356 | partition_split_dicts = get_splits_per_partition.map(partition_dicts, 357 | imputer_paths, 358 | splits=pf.unmapped(['train', 'test', 'valid']) 359 | ) 360 | 361 | partition_split_dicts = Impute.map(partition_split_dict=pf.flatten(partition_split_dicts)) 362 | partition_split_dicts = fit_normalization(partition_split_dicts=partition_split_dicts) 363 | 364 | normalized = Normalize.map(partition_split_dict=partition_split_dicts) 365 | 366 | if __name__ == "__main__": 367 | flow.executor = LocalDaskExecutor(scheduler="threads", num_workers=60) 368 | 369 | # run locally 370 | runner = FlowRunner(flow=flow) 371 | flow_state = runner.run(return_tasks=flow.tasks) 372 | -------------------------------------------------------------------------------- /metabolomicstatemodel/source/datamodules.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pandas as pd 3 | import numpy as np 4 | import pytorch_lightning as pl 5 | from torch.utils.data import DataLoader 6 | from omegaconf import OmegaConf, ListConfig, DictConfig 7 | 8 | from .datasets import TabularDataset, DatasetWrapper, BatchedDS, ExclusionMaskDataset 9 | 10 | 11 | class RiskianoDataModule(pl.LightningDataModule): 12 | def __init__(self, batch_size=128, num_workers=8, tabular_filepath='', use_batched_ds=False, 13 | return_rank_mat=None, output_dim=None, fast_dev_run=None, cv_partition=None, **kwargs): 14 | """ 15 | Abstract DataModule Class for Riskiano. 16 | 17 | The __init__ of this calss should be called in every inherited class. 18 | 19 | A few points to consider: 20 | - hardcode filepaths for versioning 21 | - make durations, events and other labels explicit attributes 22 | - define transformations etc in the `__init__()` 23 | - The logic of how exactly the individual datasets are instantiated should be defined in the 24 | `get_dataset()` method. This method NEEDS TO BE DEFINED PER USECASE, and will be called in `setup()`. 25 | 26 | :param batch_size: `int`, batchsize to use, needs to be passed for the BatchedDS 27 | :param num_workers: `int`, number of workers for the DataLoaders 28 | :param use_batched_ds: `bool`, whether to use the batchedDS (`True`) or not (`False`). Defaults to `False`. 29 | :param output_categorical: `bool`, whether to output categorical columns (`True`) vs. 
1-hot/binary columns (`False`) 30 | :param return_rank_mat: `bool`, whether to return the rank_mat for DeepHitTraining 31 | :param output_dim: `int`, output-dimension of the network, needed for cuts and rank_mat calculation, can be ommitted of rank_mat equals False 32 | :param fast_dev_run: `bool`, similar to pl.Trainer FLAG. in this case limits the eid_map to 100 eids. 33 | :param kwargs: 34 | """ 35 | super().__init__() 36 | self.cv_partition = cv_partition 37 | self.batch_size = batch_size 38 | self.num_workers = num_workers 39 | self.tabular_filepath = tabular_filepath 40 | self.use_batched_ds = use_batched_ds 41 | self.fast_dev_run = fast_dev_run 42 | 43 | self.return_rank_mat = return_rank_mat 44 | if self.return_rank_mat: 45 | assert output_dim is not None, 'Rank mat computation needs out_dim!' 46 | self.output_dim = output_dim 47 | self.cuts = None 48 | 49 | def get_batched_ds(self, ds): 50 | if self.return_rank_mat: 51 | raise NotImplementedError() 52 | else: 53 | return BatchedDS(ds, batch_size=self.batch_size) 54 | 55 | def get_dataset(self, split): 56 | raise NotImplementedError('Implement according to usecase.') 57 | 58 | def setup(self, stage=None): 59 | self.train_ds = self.get_dataset('train') 60 | self.valid_ds = self.get_dataset('valid') 61 | try: 62 | self.test_ds = self.get_dataset('test') 63 | except AssertionError: 64 | print('No test split defined to this data.') 65 | 66 | if self.return_rank_mat: 67 | self.cuts = self.get_time_cuts() 68 | 69 | def get_time_cuts(self, max_time=None): 70 | """ 71 | Get the interval borders for the discrete times. 72 | :param n_durations: 73 | :param ds: 74 | :return: 75 | """ 76 | if self.cuts is not None: 77 | return self.cuts 78 | 79 | loader = DataLoader(self.train_ds, batch_size=1024, num_workers=self.num_workers, shuffle=False, drop_last=False) 80 | 81 | if max_time is None: 82 | max_time = -np.inf 83 | for data in loader: 84 | _, (durations, _) = data 85 | max_duration = float(durations.max()) 86 | if max_time < max_duration: 87 | max_time = max_duration 88 | return np.linspace(0, max_time, self.output_dim + 1) 89 | 90 | def train_dataloader(self): 91 | if self.use_batched_ds: 92 | return DataLoader(self.get_batched_ds(self.train_ds), 93 | num_workers=self.num_workers, pin_memory=True, collate_fn=BatchedDS.default_collate, 94 | shuffle=True) 95 | else: 96 | return DataLoader(self.train_ds, batch_size=self.batch_size, 97 | num_workers=self.num_workers, shuffle=True) 98 | 99 | def val_dataloader(self): 100 | if self.use_batched_ds: 101 | return DataLoader(self.get_batched_ds(self.valid_ds), 102 | num_workers=self.num_workers, pin_memory=True, collate_fn=BatchedDS.default_collate, 103 | shuffle=False) 104 | else: 105 | return DataLoader(self.valid_ds, batch_size=self.batch_size, 106 | num_workers=self.num_workers, shuffle=False) 107 | 108 | def test_dataloader(self): 109 | if not self.use_batched_ds: 110 | return DataLoader(self.test_ds, batch_size=self.batch_size, 111 | num_workers=self.num_workers, shuffle=False) 112 | else: 113 | return DataLoader(self.get_batched_ds(self.test_ds), 114 | num_workers=self.num_workers, pin_memory=True, collate_fn=BatchedDS.default_collate, 115 | shuffle=False) 116 | 117 | 118 | class UKBBSurvivalDatamodule(RiskianoDataModule): 119 | """ 120 | Datamodule for survival training on UKBB data. 121 | 122 | :param batch_size: `int`, batchsize needed for outputting the rankmat + loaders 123 | :param num_workers: `int`, num_workers 124 | :param tabular_filepath: `str`, path to the ukbb data file. 
125 | :param use_batched_ds: `bool`, whether to use the batched_dataset. 126 | :param features: `Union([dict, list]), features/covariates to use. 127 | :param duration: `str`, the duration col in the datset file 128 | :param event: `str`, the event col in the datset file 129 | :param return_rank_mat: `bool`, whether to return rank_mat (required to DeepHit Model) or not, default=False 130 | :param output_dim: `int`, n-timepoints in descrete time model, required for rank_mat generation -> required for DeepHit Model 131 | :param fast_dev_run: `bool`, run smoke test, default=False 132 | :param cv_partition: `int`, partition to read data from. 133 | :param output_categorical: `bool`, wheter to ourput raw categories (ints -> True) or do 1-hot encoding (False), Default=False 134 | :param exclusion_criteria: `dict`, dict of the form {`sets_apply`: [`train`, `valid`]} -> to which sets to apply exclusion criteria. Default = None 135 | :param kwargs: 136 | """ 137 | def __init__(self, 138 | batch_size=128, 139 | num_workers=8, 140 | tabular_filepath="", 141 | use_batched_ds=False, 142 | features={}, 143 | duration='', 144 | event='', 145 | return_rank_mat=None, 146 | output_dim=None, 147 | clip=False, 148 | fast_dev_run=False, 149 | cv_partition=0, 150 | output_categorical=False, 151 | cohort_definition=None, 152 | oversampling=False, 153 | **kwargs): 154 | super().__init__(batch_size=batch_size, num_workers=num_workers, tabular_filepath=tabular_filepath, 155 | use_batched_ds=use_batched_ds, 156 | return_rank_mat=return_rank_mat, output_dim=output_dim, fast_dev_run=fast_dev_run) 157 | 158 | self.cv_partition=cv_partition 159 | self.cohort_definition = cohort_definition if not isinstance(cohort_definition, DictConfig) \ 160 | else OmegaConf.to_container(cohort_definition, resolve=True) 161 | 162 | assert isinstance(features, (dict, list, ListConfig, DictConfig)), 'Features must be dict or list.' 
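        # Editorial sketch with hypothetical values, not repository code: a `features`
        # dict resolved from the OmegaConf feature config holds grouped column lists, e.g.
        #   {"categorical": {...}, "one_hot_enc": {...},
        #    "general": {"labs": ["albumin", "glucose"], "metabolomics": [...]}}
        # The block below merges the requested groups (one_hot_enc or categorical, plus
        # general) and flattens them into a single list of column names, which is the
        # format TabularDataset expects.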
163 | 164 | features = features if not isinstance(features, (ListConfig, DictConfig)) \ 165 | else OmegaConf.to_container(features, resolve=True) 166 | 167 | if isinstance(features, dict): 168 | if output_categorical == False: 169 | self.features = {**features["one_hot_enc"], **features["general"]} 170 | else: 171 | self.features = {**features["categorical"], **features["general"]} 172 | print(self.features) 173 | self.features = [f for group_list in self.features.values() for f in group_list] 174 | else: 175 | self.features = features 176 | 177 | print(type(self.features)) 178 | 179 | self.duration = duration if not isinstance(duration, (ListConfig, DictConfig)) \ 180 | else OmegaConf.to_container(duration, resolve=True) 181 | self.event = event if not isinstance(event, (ListConfig, DictConfig)) \ 182 | else OmegaConf.to_container(event, resolve=True) 183 | self.clip = clip 184 | self.oversampling = oversampling 185 | 186 | def get_dataset(self, split): 187 | filepath = f'{self.tabular_filepath}/partition_{self.cv_partition}/{split}/data_imputed_normalized.feather' 188 | print(filepath) 189 | if self.cohort_definition is not None: 190 | if split in self.cohort_definition.keys(): 191 | eids = pd.read_feather(f"{self.tabular_filepath}/data_merged.feather").query(self.cohort_definition[split]).eid.to_list() 192 | else: 193 | eids = None 194 | else: 195 | eids = None 196 | 197 | ds = TabularDataset(filepath, self.features, eid_selection_mask=eids) 198 | if self.clip: 199 | upperq = ds.eid_map.quantile(.99) 200 | lowerq = ds.eid_map.quantile(.01) 201 | for c in self.features: 202 | ds.eid_map.loc[:, c] = ds.eid_map[c].clip( 203 | lower=lowerq[c], upper=upperq[c]) 204 | covariate_datasets = [ds] 205 | label_datasets = [TabularDataset(filepath, self.duration, eid_selection_mask=eids), 206 | TabularDataset(filepath, self.event, eid_selection_mask=eids)] 207 | 208 | # make sure we have observations for each label: 209 | print(split) 210 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum()) 211 | 212 | # oversample if needed: 213 | if split == 'train' and self.oversampling: 214 | assert len(self.event) == 1, 'Oversampling only possible for single events.' 215 | pos_eids = label_datasets[1].eid_map.query(f'{self.event[0]}==1').index.values 216 | # augment sets: 217 | for ds_list in [covariate_datasets, label_datasets]: 218 | for ds in ds_list: 219 | pos_ds = pd.concat(10*[ds.eid_map.loc[pos_eids].copy()], axis=0) 220 | print(pos_ds.head()) 221 | pos_ds = pos_ds.reset_index(drop=True) 222 | pos_ds.index.name = 'eid' 223 | print(pos_ds.head()) 224 | ds.eid_map = pd.concat([ds.eid_map, pos_ds], axis=0) 225 | 226 | # make sure we have observations for each label: 227 | print(split) 228 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum()) 229 | 230 | return DatasetWrapper(covariate_datasets, label_datasets) 231 | 232 | 233 | class UKBBSurvivalDatamoduleWithExclusions(UKBBSurvivalDatamodule): 234 | """ 235 | Datamodule for survival training on UKBB data, that explicitly generates exclusion masks for the model. 236 | 237 | :param batch_size: `int`, batchsize needed for outputting the rankmat + loaders 238 | :param num_workers: `int`, num_workers 239 | :param tabular_filepath: `str`, path to the ukbb data file. 240 | :param use_batched_ds: `bool`, whether to use the batched_dataset. 241 | :param features: `Union([dict, list]), features/covariates to use. 
242 | :param duration: `str`, the duration col in the dataset file
243 | :param event: `str`, the event col in the dataset file
244 | :param return_rank_mat: `bool`, whether to return rank_mat (required for the DeepHit model) or not, default=False
245 | :param output_dim: `int`, number of timepoints in the discrete-time model, required for rank_mat generation -> required for the DeepHit model
246 | :param fast_dev_run: `bool`, run smoke test, default=False
247 | :param cv_partition: `int`, partition to read data from.
248 | :param output_categorical: `bool`, whether to output raw categories (ints -> True) or do 1-hot encoding (False), Default=False
249 | :param cohort_definition: `dict`/`DictConfig` with a `general` mapping from split names to pandas query strings (eid selection) and a `task_specific` exclusion-criteria dict passed to the ExclusionMaskDataset. Default = None
250 | :param kwargs:
251 | """
252 | def __init__(self,
253 | batch_size=128,
254 | num_workers=8,
255 | tabular_filepath="",
256 | use_batched_ds=False,
257 | features={},
258 | duration='',
259 | event='',
260 | return_rank_mat=None,
261 | output_dim=None,
262 | clip=False,
263 | fast_dev_run=False,
264 | cv_partition=0,
265 | output_categorical=False,
266 | cohort_definition=None,
267 | oversampling=False,
268 | **kwargs):
269 | super().__init__(
270 | batch_size=batch_size,
271 | num_workers=num_workers,
272 | tabular_filepath=tabular_filepath,
273 | use_batched_ds=use_batched_ds,
274 | features=features,
275 | duration=duration,
276 | event=event,
277 | return_rank_mat=return_rank_mat,
278 | output_dim=output_dim,
279 | clip=clip,
280 | fast_dev_run=fast_dev_run,
281 | cv_partition=cv_partition,
282 | output_categorical=output_categorical,
283 | cohort_definition=None,
284 | oversampling=oversampling)
285 | 
286 | # self.cohort_definition = cohort_definition if not isinstance(cohort_definition, DictConfig) \
287 | # else OmegaConf.to_container(cohort_definition, resolve=True)
288 | self.cohort_definition = cohort_definition
289 | 
290 | def get_dataset(self, split):
291 | filepath = f'{self.tabular_filepath}/partition_{self.cv_partition}/{split}/data_imputed_normalized.feather'
292 | print(filepath)
293 | if self.cohort_definition is not None:
294 | if split in self.cohort_definition.general.keys():
295 | eids = pd.read_feather(f"{self.tabular_filepath}/data_merged.feather").query(self.cohort_definition.general[split]).eid.to_list()
296 | else:
297 | eids = None
298 | else:
299 | eids = None
300 | 
301 | ds = TabularDataset(filepath, self.features, eid_selection_mask=eids)
302 | if self.clip:
303 | upperq = ds.eid_map.quantile(.99)
304 | lowerq = ds.eid_map.quantile(.01)
305 | for c in self.features:
306 | ds.eid_map.loc[:, c] = ds.eid_map[c].clip(
307 | lower=lowerq[c], upper=upperq[c])
308 | mask_ds = ExclusionMaskDataset(filepath, exclusion_criteria_dict=self.cohort_definition.task_specific, eid_selection_mask=eids)
309 | covariate_datasets = [ds, mask_ds]
310 | label_datasets = [TabularDataset(filepath, self.duration, eid_selection_mask=eids),
311 | TabularDataset(filepath, self.event, eid_selection_mask=eids)]
312 | 
313 | # make sure we have observations for each label:
314 | print(split)
315 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum())
316 | 
317 | # oversample if needed:
318 | if split == 'train' and self.oversampling:
319 | assert len(self.event) == 1, 'Oversampling only possible for single events.'
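# note: the training-split oversampling below duplicates every positive case a fixed 10x
# (see the pd.concat(10*[...]) call in the loop).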
320 | pos_eids = label_datasets[1].eid_map.query(f'{self.event[0]}==1').index.values 321 | # augment sets: 322 | for ds_list in [covariate_datasets, label_datasets]: 323 | for ds in ds_list: 324 | pos_ds = pd.concat(10*[ds.eid_map.loc[pos_eids].copy()], axis=0) 325 | print(pos_ds.head()) 326 | pos_ds = pos_ds.reset_index(drop=True) 327 | pos_ds.index.name = 'eid' 328 | print(pos_ds.head()) 329 | ds.eid_map = pd.concat([ds.eid_map, pos_ds], axis=0) 330 | 331 | # make sure we have observations for each label: 332 | print(split) 333 | print(label_datasets[1].eid_map[[c for c in label_datasets[1].eid_map.columns if 'event' in c]].sum()) 334 | 335 | return DatasetWrapper(covariate_datasets, label_datasets) 336 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. 
Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. 
Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. 
Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. 
Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. 438 | -------------------------------------------------------------------------------- /analysis/preprocessing/2_preprocessing_clinical_endpoints.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preprocessing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2020-11-04T12:31:49.436340Z", 16 | "start_time": "2020-11-04T12:31:48.732042Z" 17 | } 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import os\n", 24 | "import yaml\n", 25 | "from tqdm.notebook import tqdm" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "outputs": [], 32 | "source": [ 33 | "dataset_name = \"name_of_your_dataset\"\n", 34 | "path = \"/path/to/mapping/files\"\n", 35 | "data_path = \"/path/to/decoded/output\"\n", 36 | "dataset_path = f\"{data_path}/2_datasets_pre/{dataset_name}\"" 37 | ], 38 | "metadata": { 39 | "collapsed": false, 40 | "pycharm": { 41 | "name": "#%%\n" 42 | } 43 | } 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "ExecuteTime": { 50 | "end_time": "2020-11-04T12:31:49.895222Z", 51 | "start_time": "2020-11-04T12:31:49.891332Z" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "from pathlib import Path\n", 57 | "Path(dataset_path).mkdir(parents=True, exist_ok=True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "ExecuteTime": { 65 | "end_time": "2020-11-04T12:33:14.171198Z", 66 | "start_time": "2020-11-04T12:31:50.204540Z" 67 | } 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "data = pd.read_feather(f\"{data_path}/1_decoded/ukb_data_210517.feather\")\n", 72 | "data_field = pd.read_feather(f\"{data_path}/1_decoded/ukb_data_field_210517.feather\")\n", 73 | "data_columns = 
data.columns.to_list()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Mappings + Vocabulary" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "ExecuteTime": { 88 | "end_time": "2020-11-04T12:34:05.867152Z", 89 | "start_time": "2020-11-04T12:33:16.878773Z" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# Drop obvious missing data\n", 95 | "print(len(data))\n", 96 | "data = data.dropna(subset=[\"sex_f31_0_0\"], axis=0)\n", 97 | "print(len(data))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Starting information" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "ExecuteTime": { 112 | "end_time": "2020-11-04T12:34:05.872216Z", 113 | "start_time": "2020-11-04T12:34:05.869505Z" 114 | } 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "time0_col=\"date_of_attending_assessment_centre_f53_0_0\"" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "# Baseline covariates" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "ExecuteTime": { 133 | "end_time": "2020-11-04T12:34:05.889725Z", 134 | "start_time": "2020-11-04T12:34:05.874587Z" 135 | } 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "def get_fields(fields, data, data_field):\n", 140 | " f = data_field[data_field[\"field.showcase\"].isin(fields) & data_field[\"field.tab\"].str.contains(\"f\\\\.\\\\d+\\\\.0\\\\.\\\\d\")].copy()\n", 141 | " f[\"field\"] = pd.Categorical(f[\"field.showcase\"], categories=fields, ordered=True)\n", 142 | " f = f.sort_values(\"field\").reset_index().drop(\"field\", axis=1)\n", 143 | " return f\n", 144 | "\n", 145 | "def get_fields_all(fields, data, data_field):\n", 146 | " f = data_field[data_field[\"field.showcase\"].isin(fields)].copy()\n", 147 | " f[\"field\"] = pd.Categorical(f[\"field.showcase\"], categories=fields, ordered=True)\n", 148 | " f = f.sort_values(\"field\").reset_index().drop(\"field\", axis=1)\n", 149 | " return f\n", 150 | "\n", 151 | "def get_data_fields(fields, data, data_field):\n", 152 | " f = get_fields(fields, data, data_field)\n", 153 | " return data[[\"eid\"]+f[\"col.name\"].to_list()].copy()\n", 154 | "\n", 155 | "def get_data_fields_all(fields, data, data_field):\n", 156 | " f = get_fields_all(fields, data, data_field)\n", 157 | " return data[[\"eid\"]+f[\"col.name\"].to_list()].copy()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Diagnoses and events" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "ExecuteTime": { 172 | "end_time": "2020-11-04T12:37:14.667281Z", 173 | "start_time": "2020-11-04T12:36:14.427693Z" 174 | } 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "vocab_dir = f\"{data_path}/mapping/athena\"\n", 179 | "vocab = {\n", 180 | " \"concept\": pd.read_csv(f\"{vocab_dir}/CONCEPT.csv\", sep='\\t'),\n", 181 | " \"domain\": pd.read_csv(f\"{vocab_dir}/DOMAIN.csv\", sep='\\t'),\n", 182 | " \"class\": pd.read_csv(f\"{vocab_dir}/CONCEPT_CLASS.csv\", sep='\\t'),\n", 183 | " \"relationship\": pd.read_csv(f\"{vocab_dir}/RELATIONSHIP.csv\", sep='\\t'),\n", 184 | " \"drug_strength\": pd.read_csv(f\"{vocab_dir}/DRUG_STRENGTH.csv\", sep='\\t'),\n", 185 | " \"vocabulary\": 
pd.read_csv(f\"{vocab_dir}/VOCABULARY.csv\", sep='\\t'),\n", 186 | " \"concept_synonym\": pd.read_csv(f\"{vocab_dir}/CONCEPT_SYNONYM.csv\", sep='\\t'),\n", 187 | " \"concept_ancestor\": pd.read_csv(f\"{vocab_dir}/CONCEPT_ANCESTOR.csv\", sep='\\t'),\n", 188 | " \"concept_relationship\": pd.read_csv(f\"{vocab_dir}/CONCEPT_RELATIONSHIP.csv\", sep='\\t') \n", 189 | "}" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### Definitions" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "ExecuteTime": { 204 | "end_time": "2020-11-04T12:37:14.772869Z", 205 | "start_time": "2020-11-04T12:37:14.669541Z" 206 | } 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "coding1836 = pd.read_csv(f\"{path}/codings/coding1836.tsv\", sep=\"\\t\").rename(columns={\"coding\":\"code\"})\n", 211 | "phecodes = pd.read_csv(f\"{path}/phecodes/phecode_icd10.csv\")\n", 212 | "def phenotype_children(phecodes, phenotype_list):\n", 213 | " l={}\n", 214 | " phecodes = phecodes.dropna(subset=[\"Phenotype\"], axis=0)\n", 215 | " for ph, ph_names in phenotype_list.items():\n", 216 | " regex = \"|\".join(ph_names)\n", 217 | " l[ph] = list(phecodes[phecodes.Phenotype.str.contains(regex, case=False)].ICD10.str.replace(\"\\\\.\", \"\").str.slice(0, 3).unique())\n", 218 | " return l" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "diagnoses_codes = pd.read_feather(os.path.join(path, dataset_path, 'temp_diagnoses_codes.feather')).drop(\"level\", axis=1)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "death_codes = pd.read_feather(f\"{data_path}/1_decoded/codes_death_records_210115.feather\").query(\"level==1\").drop(\"level\", axis=1)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "endpoint_codes = pd.concat([diagnoses_codes, death_codes[diagnoses_codes.columns]])" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "# Endpoints" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "ExecuteTime": { 260 | "end_time": "2020-11-04T12:39:55.628580Z", 261 | "start_time": "2020-11-04T12:33:33.036Z" 262 | } 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "### define in snomed and get icd codes from there" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### 1. 
Hospital admissions"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "endpoint_list = {\n",
283 | " \"myocardial_infarction\": ['I21', 'I22', 'I23', 'I24', 'I25'],\n",
284 | " \"stroke\": ['G45', \"I63\", \"I64\"],\n",
285 | " \"diabetes\" : ['E10', 'E11', 'E12', 'E13', 'E14'],\n",
286 | " \"diabetes1\" : ['E10'],\n",
287 | " \"diabetes2\" : ['E11', 'E12', 'E13', 'E14'],\n",
288 | " \"atrial_fibrillation\": ['I47', 'I48'],\n",
289 | " 'migraine': ['G43', 'G44'],\n",
290 | " 'rheumatoid_arthritis': ['J99', 'M05', 'M06', 'M08', 'M12', 'M13'],\n",
291 | " \"systemic_lupus_erythematosus\": ['M32'],\n",
292 | " 'severe_mental_illness': ['F20', 'F25', 'F30', 'F31', 'F32', 'F33', 'F44'],\n",
293 | " \"erectile_dysfunction\" : ['F52', 'N48'], \n",
294 | " \"chronic_kidney_disease\": [\"I12\", \"N18\", \"N19\"],\n",
295 | " \"liver_disease\":[\"K70\", \"K71\", \"K72\", \"K73\", \"K74\", \"K75\", \"K76\", \"K77\"],\n",
296 | " \"dementia\":['F00', 'F01', 'F02', 'F03'],\n",
297 | " \"copd\": ['J44'],\n",
298 | " \"M_all_cause_dementia\": [\"F00\", \"F01\", \"F02\", \"F03\", \"G30\", \"G31\"],\n",
299 | " \"M_MACE\": [\"G45\", \"I21\", \"I22\", \"I23\", \"I24\", \"I25\", \"I63\", \"I64\"],\n",
300 | " \"M_type_2_diabetes\": [\"E10\", \"E11\", \"E12\", \"E13\", \"E14\"],\n",
301 | " \"M_liver_disease\": [\"B15\", \"B16\", \"B17\", \"B18\", \"B19\", \"C22\", \"E83\", \"E88\", \"I85\", \n",
302 | " \"K70\", \"K72\", \"K73\", \"K74\", \"K75\", \"K76\", \"R18\", \"Z94\"],\n",
303 | " \"M_renal_disease\": [f\"N{i:02}\" for i in range(20)]+[f\"N{i:02}\" for i in range(25, 30)],\n",
304 | " \"M_atrial_fibrillation\": [\"I48\"],\n",
305 | " \"M_heart_failure\":[\"I50\"],\n",
306 | " \"M_coronary_heart_disease\": [f\"I{i:02}\" for i in range(20, 26)],\n",
307 | " \"M_venous_thrombosis\": [\"I80\", \"I81\", \"I82\"],\n",
308 | " \"M_cerebral_stroke\":[\"I63\", \"I65\", \"I66\"],\n",
309 | " \"M_haemorrhagic_stroke\": [\"I60\", \"I61\", \"I62\"],\n",
310 | " \"M_abdominal_aortic_aneurysm\" : [\"I71\"],\n",
311 | " \"M_peripheral_arterial_disease\": ['I70', 'I71', 'I72', 'I73', 'I74', 'I75', 'I76', 'I77', 'I78', 'I79'],\n",
312 | " \"M_asthma\":[\"J45\", \"J46\"],\n",
313 | " \"M_chronic_obstructuve_pulmonary_disease\":[\"J40\", \"J41\", \"J42\", \"J43\", \"J44\", \"J47\"],\n",
314 | " \"M_lung_cancer\":[\"C33\", \"C34\"],\n",
315 | " \"M_non_melanoma_skin_cancer\":[\"C44\"],\n",
316 | " \"M_stomach_cancer\":[\"C16\"],\n",
317 | " \"M_oesophagus_cancer\":[\"C15\"],\n",
318 | " \"M_colon_cancer\":[\"C18\"],\n",
319 | " \"M_rectal_cancer\":[\"C19\", \"C20\"],\n",
320 | " \"M_prostate_cancer\":[\"C61\"],\n",
321 | " \"M_ovarian_cancer\":[\"C56\", \"C57\"],\n",
322 | " \"M_breast_cancer\":[\"C50\"],\n",
323 | " \"M_uterus_cancer\":[\"C54\"],\n",
324 | " \"M_parkinsons_disease\":[\"G20\", \"G21\", \"G22\"],\n",
325 | " \"M_fractures\":[\"S02\", \"S12\", \"S22\", \"S32\", \"S42\", \"S52\", \"S62\", \"S72\", \"S82\", \"S92\", \"T02\", \"T08\", \"T10\"],\n",
326 | " \"M_cataracts\":[\"H25\", \"H26\"],\n",
327 | " \"M_glaucoma\":[\"H40\"] \n",
328 | "}\n",
329 | "\n",
330 | "with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file:\n",
331 | " yaml.dump(endpoint_list, file, default_flow_style=False)"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "from dateutil.relativedelta import relativedelta\n",
341 | 
"import datetime\n", 342 | "\n", 343 | "def extract_endpoints_tte(data, diagnoses_codes, endpoint_list, time0_col, level=None):\n", 344 | " if level is not None: diagnoses_codes = diagnoses_codes.query(\"level==@level\")\n", 345 | " diagnoses_codes_time0 = diagnoses_codes.merge(data[[\"eid\", time0_col]], how=\"left\", on=\"eid\")\n", 346 | " \n", 347 | " cens_time_right = datetime.date(2020, 9, 30)\n", 348 | "\n", 349 | " df_interval = diagnoses_codes_time0[(diagnoses_codes_time0.date > diagnoses_codes_time0[time0_col]) & \n", 350 | " (diagnoses_codes_time0.date < cens_time_right)]\n", 351 | " \n", 352 | " temp = data[[\"eid\", time0_col]].copy()\n", 353 | " for ph, ph_codes in tqdm(endpoint_list.items()):\n", 354 | " regex = \"|\".join(ph_codes)\n", 355 | " ph_df = df_interval[df_interval.meaning.str.contains(regex, case=False)] \\\n", 356 | " .sort_values('date').groupby('eid').head(1).assign(phenotype=1, date=lambda x: x.date)\n", 357 | " temp_ph = temp.merge(ph_df, how=\"left\", on=\"eid\").fillna(0)\n", 358 | " temp[ph+\"_event\"], temp[ph+\"_event_date\"] = temp_ph.phenotype, temp_ph.date\n", 359 | " \n", 360 | " fill_date = {ph+\"_event_date\" : lambda x: [cens_time_right if event==0 else event_date for event, event_date in zip(x[ph+\"_event\"], x[ph+\"_event_date\"])]}\n", 361 | " calc_tte = {ph+\"_event_time\" : lambda x: [(event_date-time0).days/365.25 for time0, event_date in zip(x[time0_col], x[ph+\"_event_date\"])]}\n", 362 | " \n", 363 | " temp = temp.assign(**fill_date).assign(**calc_tte).drop([ph+\"_event_date\"], axis=1)\n", 364 | " \n", 365 | " temp = temp.drop([time0_col], axis=1) \n", 366 | " \n", 367 | " return temp.drop_duplicates()" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "basics = pd.read_feather(os.path.join(path, dataset_path, 'temp_basics.feather'))\n", 377 | "endpoints_diagnoses = extract_endpoints_tte(basics, endpoint_codes, endpoint_list, time0_col)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### 2. 
Death registry" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "death_list = {\n", 394 | " \"death_allcause\":[],\n", 395 | " \"death_cvd\":['I{:02}'.format(ID+1) for ID in range(0, 98)],\n", 396 | "}\n", 397 | "\n", 398 | "with open(os.path.join(path, dataset_path, 'death_list.yaml'), 'w') as file:\n", 399 | " yaml.dump(death_list, file, default_flow_style=False)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "endpoints_death = extract_endpoints_tte(basics, death_codes, death_list, time0_col)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "## SCORES" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "scores_list = {\n", 425 | " \"SCORE\":['I{:02}'.format(ID) for ID in [10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 44, 45, 46, 47, 48, 49, 50, 51, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73]],\n", 426 | " \"ASCVD\":['I{:02}'.format(ID) for ID in [20, 21, 22, 23, 24, 25, 63]],\n", 427 | " \"QRISK3\":[\"G45\", \"I20\", \"I21\", \"I22\", \"I23\", \"I24\", \"I25\", \"I63\", \"I64\"],\n", 428 | " \"MACE\":[\"G45\", \"I21\", \"I22\", \"I23\", \"I24\", \"I25\", \"I63\", \"I64\"], \n", 429 | "}\n", 430 | "with open(os.path.join(path, dataset_path, 'scores_list.yaml'), 'w') as file:\n", 431 | " yaml.dump(scores_list, file, default_flow_style=False)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "death_scores = extract_endpoints_tte(basics, death_codes, scores_list, time0_col=time0_col)\n", 441 | "endpoint_scores = extract_endpoints_tte(basics, endpoint_codes, scores_list, time0_col=time0_col)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "endpoints_scores_all = death_scores[[\"eid\", \"SCORE_event\", \"SCORE_event_time\"]].merge(endpoint_scores[[\"eid\", \"ASCVD_event\", \"ASCVD_event_time\", \"QRISK3_event\", \"QRISK3_event_time\", \"MACE_event\", \"MACE_event_time\"]], on=\"eid\")\n", 451 | "endpoints_scores_all.to_feather(os.path.join(path, dataset_path, 'temp_endpoints_scores_all.feather'))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "## Merge Everything" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "data_dfs_dict = {\"endpoints_diagnoses\":endpoints_diagnoses, \n", 468 | " \"endpoints_death\":endpoints_death, \n", 469 | " \"endpoints_scores_all\":endpoints_scores_all}" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "def get_cols_clean(df):\n", 479 | " df.columns = df.columns.str.replace(r'_0_0$', '').str.replace(r'_f[0-9]+$', '').str.replace(\"_automated_reading\", '')\n", 480 | " return df.columns\n", 481 | "\n", 482 | "def clean_df(df):\n", 483 | " df.columns = get_cols_clean(df)\n", 484 | " return df" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "import 
pandas as pd\n", 494 | "from functools import reduce\n", 495 | "\n", 496 | "data_baseline = reduce(lambda x, y: pd.merge(x, y, on = 'eid'), list(data_dfs_dict.values()))\n", 497 | "endpoint_columns = [c[:-11] for c in data_baseline.columns.tolist() if \"_event_time\" in c]" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "outputs": [], 504 | "source": [ 505 | "data_baseline = clean_df(data_baseline)" 506 | ], 507 | "metadata": { 508 | "collapsed": false, 509 | "pycharm": { 510 | "name": "#%%\n" 511 | } 512 | } 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "outputs": [], 518 | "source": [ 519 | "for col in [col for col in list(data_baseline.columns) if (\"_event\" in col) & (\"_time\" not in col)]:\n", 520 | " data_baseline[col] = data_baseline[col].astype(int)" 521 | ], 522 | "metadata": { 523 | "collapsed": false, 524 | "pycharm": { 525 | "name": "#%%\n" 526 | } 527 | } 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "outputs": [], 533 | "source": [ 534 | "covariates = [col for col in list(data_baseline.columns) if not \"_event\" in col]\n", 535 | "targets = [col for col in list(data_baseline.columns) if \"_event\" in col]" 536 | ], 537 | "metadata": { 538 | "collapsed": false, 539 | "pycharm": { 540 | "name": "#%%\n" 541 | } 542 | } 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "# Exporting" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "data_cols = {}\n", 558 | "for topic, df in data_dfs_dict.items(): \n", 559 | " data_cols[\"eid\"] = [\"admin\"]\n", 560 | " data_cols[topic]=list(get_cols_clean(df))[1:]" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "data_cols_single = {}\n", 570 | "for topic, columns in data_cols.items():\n", 571 | " for col in columns:\n", 572 | " data_cols_single[col] = topic" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "for c in [c for c in data_baseline.columns.tolist() if \"comp\" in c]:\n", 582 | " data_cols_single.update({c:\"endpoints_competing\"})" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "dtypes = {\"int32\":\"int\", \"int64\":\"int\", \"float64\":\"float\", \"category\":\"category\", \"object\":\"category\", \"bool\":\"bool\"}\n", 592 | "desc_dict = {\"id\": [*range(1, len(data_baseline.columns.to_list())+1)] , \n", 593 | " \"covariate\": data_baseline.columns.to_list(), \n", 594 | " \"dtype\":[dtypes[str(col)] for col in data_baseline.dtypes.to_list()], \n", 595 | " \"isTarget\":[True if col in targets else False for col in data_baseline.columns.to_list()],\n", 596 | " \"based_on\":[topic for col, topic in data_cols_single.items()],\n", 597 | " \"field\": [np.nan for col in data_baseline.columns.to_list()],\n", 598 | " \"aggr_fn\": [np.nan for col in data_baseline.columns.to_list()]}\n", 599 | "data_baseline_description = pd.DataFrame.from_dict(desc_dict)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "endpoint_dict = {}\n", 609 | "for group in 
data_baseline_description.based_on.unique(): endpoint_dict[group] = data_baseline_description.query(\"based_on==@group\").covariate.to_list()\n", 610 | "with open(os.path.join(path, dataset_path, 'endpoint_list.yaml'), 'w') as file: yaml.dump(endpoint_dict, file, default_flow_style=False, allow_unicode=True)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "### WRITE FEATURES IN YAML!!!" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "data_baseline.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints.feather'))\n", 629 | "data_baseline_description.to_feather(os.path.join(path, dataset_path, 'baseline_endpoints_description.feather'))" 630 | ] 631 | } 632 | ], 633 | "metadata": { 634 | "kernelspec": { 635 | "display_name": "Python [conda env:miniconda3-pl1.x]", 636 | "language": "python", 637 | "name": "conda-env-miniconda3-pl1.x-py" 638 | }, 639 | "language_info": { 640 | "codemirror_mode": { 641 | "name": "ipython", 642 | "version": 3 643 | }, 644 | "file_extension": ".py", 645 | "mimetype": "text/x-python", 646 | "name": "python", 647 | "nbconvert_exporter": "python", 648 | "pygments_lexer": "ipython3", 649 | "version": "3.7.8" 650 | }, 651 | "toc-autonumbering": true, 652 | "toc-showcode": false, 653 | "toc-showmarkdowntxt": false 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 4 657 | } -------------------------------------------------------------------------------- /analysis/preprocessing/1_preprocessing_dataportal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# 1. 
Data Portal Preprocessing" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "outputs": [], 16 | "source": [ 17 | "try(library(tidyverse), silent=TRUE)\n", 18 | "library(lubridate)\n", 19 | "library(glue)\n", 20 | "library(data.table)\n", 21 | "library(tidyfast)\n", 22 | "library(\"magrittr\")\n", 23 | "setwd(\"/\")" 24 | ], 25 | "metadata": { 26 | "collapsed": false, 27 | "pycharm": { 28 | "name": "#%%\n" 29 | } 30 | } 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "outputs": [], 36 | "source": [ 37 | "dataset_name = \"name_of_your_dataset\"\n", 38 | "path = \"/path/to/dir/with/decoded/file\"\n", 39 | "data_path = \"/path/for/output\"\n", 40 | "dataset_path = glue(\"{data_path}/2_datasets_pre/{dataset_name}\")" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%%\n" 46 | } 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "outputs": [], 53 | "source": [ 54 | "list.files(path = \"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/\")" 55 | ], 56 | "metadata": { 57 | "collapsed": false, 58 | "pycharm": { 59 | "name": "#%%\n" 60 | } 61 | } 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "source": [ 66 | "## Load Athena Vocabulary" 67 | ], 68 | "metadata": { 69 | "collapsed": false 70 | } 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "outputs": [], 76 | "source": [ 77 | "vocab_dir = glue(\"{data_path}/athena_vocabulary_covid\")\n", 78 | "concept =fread(glue(\"{vocab_dir}/CONCEPT.csv\"), sep='\\t')" 79 | ], 80 | "metadata": { 81 | "collapsed": false, 82 | "pycharm": { 83 | "name": "#%%\n" 84 | } 85 | } 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "outputs": [], 91 | "source": [ 92 | "unique(concept$vocabulary_id)" 93 | ], 94 | "metadata": { 95 | "collapsed": false, 96 | "pycharm": { 97 | "name": "#%%\n" 98 | } 99 | } 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "outputs": [], 105 | "source": [ 106 | "relationship = fread(glue(\"{vocab_dir}/RELATIONSHIP.csv\"), sep='\\t')" 107 | ], 108 | "metadata": { 109 | "collapsed": false, 110 | "pycharm": { 111 | "name": "#%%\n" 112 | } 113 | } 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "outputs": [], 119 | "source": [ 120 | "vocabulary = fread(glue(\"{vocab_dir}/VOCABULARY.csv\"), sep='\\t')" 121 | ], 122 | "metadata": { 123 | "collapsed": false, 124 | "pycharm": { 125 | "name": "#%%\n" 126 | } 127 | } 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "outputs": [], 133 | "source": [ 134 | "concept_relationship = fread(glue(\"{vocab_dir}/CONCEPT_RELATIONSHIP.csv\"), sep='\\t')" 135 | ], 136 | "metadata": { 137 | "collapsed": false, 138 | "pycharm": { 139 | "name": "#%%\n" 140 | } 141 | } 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "outputs": [], 147 | "source": [ 148 | "## Diagnoses" 149 | ], 150 | "metadata": { 151 | "collapsed": false, 152 | "pycharm": { 153 | "name": "#%%\n" 154 | } 155 | } 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "source": [ 160 | "## Hospital Episode Statistics" 161 | ], 162 | "metadata": { 163 | "collapsed": false 164 | } 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "outputs": [], 170 | "source": [ 171 | "hesin = 
fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin.txt\")" 172 | ], 173 | "metadata": { 174 | "collapsed": false, 175 | "pycharm": { 176 | "name": "#%%\n" 177 | } 178 | } 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "outputs": [], 184 | "source": [ 185 | "hesin_diag = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_diag.txt\")" 186 | ], 187 | "metadata": { 188 | "collapsed": false, 189 | "pycharm": { 190 | "name": "#%%\n" 191 | } 192 | } 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "outputs": [], 198 | "source": [ 199 | "hesin_critical = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_critical.txt\")" 200 | ], 201 | "metadata": { 202 | "collapsed": false, 203 | "pycharm": { 204 | "name": "#%%\n" 205 | } 206 | } 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "outputs": [], 212 | "source": [ 213 | "hesin_psych = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_psych.txt\")" 214 | ], 215 | "metadata": { 216 | "collapsed": false, 217 | "pycharm": { 218 | "name": "#%%\n" 219 | } 220 | } 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "outputs": [], 226 | "source": [ 227 | "hesin_delivery = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_delivery.txt\")\n", 228 | "hesin_maternity = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_maternity.txt\")" 229 | ], 230 | "metadata": { 231 | "collapsed": false, 232 | "pycharm": { 233 | "name": "#%%\n" 234 | } 235 | } 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "source": [ 240 | "### Diagnoses - ICD10" 241 | ], 242 | "metadata": { 243 | "collapsed": false 244 | } 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "outputs": [], 250 | "source": [ 251 | "## icd9 to icd10 mapping\n", 252 | "icd9to10_df = fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/codings/coding1836.tsv\")\n", 253 | "icd9to10_mapping = split(icd9to10_df$meaning, icd9to10_df$coding)\n", 254 | "hesin_diag_icd9 = hesin_diag %>% filter(diag_icd9!=\"\") %>% rowwise() %>% mutate(diag_icd10 = list(icd9to10_mapping[[diag_icd9]])) %>% drop_na(diag_icd10)\n", 255 | "hesin_diag = rbind(hesin_diag %>% filter(diag_icd9==\"\") %>% mutate(origin=\"hes_icd10\"), hesin_diag_icd9 %>% mutate(origin=\"hes_icd9\"))" 256 | ], 257 | "metadata": { 258 | "collapsed": false, 259 | "pycharm": { 260 | "name": "#%%\n" 261 | } 262 | } 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "outputs": [], 268 | "source": [ 269 | "hes_join = hesin[hesin_diag, on=c(\"eid\", \"ins_index\")]\n", 270 | "hes_join = hes_join[, c(\"eid\", \"origin\",\"ins_index\", \"arr_index\", \"level\", \"epistart\", \"diag_icd10\")][order(eid, ins_index, arr_index),]" 271 | ], 272 | "metadata": { 273 | "collapsed": false, 274 | "pycharm": { 275 | "name": "#%%\n" 276 | } 277 | } 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "outputs": [], 283 | "source": [ 284 | "hes_join_date = hes_join %>% rename(date=\"epistart\") %>% mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\"))))" 285 | ], 286 | "metadata": { 287 | "collapsed": false, 288 | "pycharm": { 289 | "name": "#%%\n" 290 | } 291 | } 292 | 
}, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "outputs": [], 297 | "source": [ 298 | "hes_diagnoses = hes_join_date %>% drop_na(date) %>% rename(code = \"diag_icd10\") %>% mutate(instance=ins_index) %>% group_by(eid) %>% mutate(n = arr_index)" 299 | ], 300 | "metadata": { 301 | "collapsed": false, 302 | "pycharm": { 303 | "name": "#%%\n" 304 | } 305 | } 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "outputs": [], 311 | "source": [ 312 | "hes_diagnoses = hes_diagnoses %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date))" 313 | ], 314 | "metadata": { 315 | "collapsed": false, 316 | "pycharm": { 317 | "name": "#%%\n" 318 | } 319 | } 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "outputs": [], 325 | "source": [ 326 | "nrow(hes_diagnoses)\n", 327 | "head(hes_diagnoses %>% arrange(desc(date)))" 328 | ], 329 | "metadata": { 330 | "collapsed": false, 331 | "pycharm": { 332 | "name": "#%%\n" 333 | } 334 | } 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "outputs": [], 340 | "source": [ 341 | "arrow::write_feather(hes_diagnoses, glue(\"{path}/codes_hes_diagnoses_210120.feather\"))" 342 | ], 343 | "metadata": { 344 | "collapsed": false, 345 | "pycharm": { 346 | "name": "#%%\n" 347 | } 348 | } 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "source": [ 353 | "### Procedures - Snomed CT" 354 | ], 355 | "metadata": { 356 | "collapsed": false 357 | } 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "outputs": [], 363 | "source": [ 364 | "hesin_oper = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/hesin_oper.txt\")" 365 | ], 366 | "metadata": { 367 | "collapsed": false, 368 | "pycharm": { 369 | "name": "#%%\n" 370 | } 371 | } 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "outputs": [], 377 | "source": [ 378 | "hesin_oper[hesin_oper == \"\"] <- NA\n" 379 | ], 380 | "metadata": { 381 | "collapsed": false, 382 | "pycharm": { 383 | "name": "#%%\n" 384 | } 385 | } 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "outputs": [], 391 | "source": [ 392 | "hesin_oper_pre = hesin_oper %>% rename(date=\"opdate\", code=\"oper4\") %>% \n", 393 | " mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\")))) %>%\n", 394 | " mutate(origin=\"hes_opcs4\", instance=ins_index) %>% group_by(eid) %>% mutate(n = arr_index) %>% select(eid, origin, instance, n, level, code, date)" 395 | ], 396 | "metadata": { 397 | "collapsed": false, 398 | "pycharm": { 399 | "name": "#%%\n" 400 | } 401 | } 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "outputs": [], 407 | "source": [ 408 | "concept_ids_opcs4 = concept %>% filter(vocabulary_id == \"OPCS4\") %>% mutate(concept_code = str_replace(concept_code, \"\\\\.\", \"\"))\n", 409 | "concept_ids_snomed = concept %>% filter(vocabulary_id == \"SNOMED\" & domain_id==\"Procedure\") \n", 410 | "\n", 411 | "# check necessary opcs4 concept ids\n", 412 | "concept_ids = concept_ids_opcs4 %>% mutate(concept_id_1 = concept_id)\n", 413 | "\n", 414 | "cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_opcs4$concept_id) %>% filter(concept_id_2 %in% concept_ids_snomed$concept_id) %>% arrange(concept_id_1)" 415 | ], 416 | "metadata": { 417 | "collapsed": false, 418 | "pycharm": { 419 | "name": "#%%\n" 420 | } 421 | } 
422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "outputs": [], 427 | "source": [ 428 | "mapping_opcs4_snomed = concept_ids_opcs4 %>% \n", 429 | " left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c(\"concept_id\"=\"concept_id_1\")) %>% \n", 430 | " left_join(concept_ids_snomed %>% select(concept_id, concept_code, concept_name), by=c(\"concept_id_2\"=\"concept_id\")) %>% \n", 431 | " mutate(code = concept_code.x, meaning=concept_code.y, name=concept_name.y)" 432 | ], 433 | "metadata": { 434 | "collapsed": false, 435 | "pycharm": { 436 | "name": "#%%\n" 437 | } 438 | } 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "outputs": [], 444 | "source": [ 445 | "hes_procedures = hesin_oper_pre %>% left_join(mapping_opcs4_snomed %>% select(code, meaning, name), by=\"code\") %>% select(eid, origin, instance, n, level, date, code, meaning, name)" 446 | ], 447 | "metadata": { 448 | "collapsed": false, 449 | "pycharm": { 450 | "name": "#%%\n" 451 | } 452 | } 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "outputs": [], 458 | "source": [ 459 | "arrow::write_feather(hes_procedures, glue(\"{path}/codes_hes_procedures_210119.feather\"))" 460 | ], 461 | "metadata": { 462 | "collapsed": false, 463 | "pycharm": { 464 | "name": "#%%\n" 465 | } 466 | } 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "source": [ 471 | "## Mortality Records - ICD10" 472 | ], 473 | "metadata": { 474 | "collapsed": false 475 | } 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "outputs": [], 481 | "source": [ 482 | "death = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/death.txt\")\n", 483 | "death_cause = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/death_cause.txt\")" 484 | ], 485 | "metadata": { 486 | "collapsed": false, 487 | "pycharm": { 488 | "name": "#%%\n" 489 | } 490 | } 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "outputs": [], 496 | "source": [ 497 | "death_join = death[death_cause, on=c(\"eid\", \"ins_index\")]\n", 498 | "death_join = death_join[, c(\"eid\", \"ins_index\", \"arr_index\", \"level\", \"date_of_death\", \"cause_icd10\")][order(eid, ins_index, arr_index),]" 499 | ], 500 | "metadata": { 501 | "collapsed": false, 502 | "pycharm": { 503 | "name": "#%%\n" 504 | } 505 | } 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "outputs": [], 511 | "source": [ 512 | "death_join_date = death_join %>% rename(date=\"date_of_death\") %>% rename(code = \"cause_icd10\") %>% mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\"))))" 513 | ], 514 | "metadata": { 515 | "collapsed": false, 516 | "pycharm": { 517 | "name": "#%%\n" 518 | } 519 | } 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "outputs": [], 525 | "source": [ 526 | "codes_death = death_join_date %>% mutate(instance=0) %>% mutate(origin=\"death_records\") %>% group_by(eid) %>% mutate(n=row_number())\n", 527 | "codes_death = codes_death %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date))" 528 | ], 529 | "metadata": { 530 | "collapsed": false, 531 | "pycharm": { 532 | "name": "#%%\n" 533 | } 534 | } 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "outputs": [], 540 | "source": [ 541 | "arrow::write_feather(codes_death, 
glue(\"{path}/codes_death_records_210115.feather\"))" 542 | ], 543 | "metadata": { 544 | "collapsed": false, 545 | "pycharm": { 546 | "name": "#%%\n" 547 | } 548 | } 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "source": [ 553 | "## GP Records" 554 | ], 555 | "metadata": { 556 | "collapsed": false 557 | } 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "outputs": [], 563 | "source": [ 564 | "gp_registrations = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/gp_registrations.txt\")" 565 | ], 566 | "metadata": { 567 | "collapsed": false, 568 | "pycharm": { 569 | "name": "#%%\n" 570 | } 571 | } 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "outputs": [], 577 | "source": [ 578 | "gp_clinical = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/gp_clinical.txt\")" 579 | ], 580 | "metadata": { 581 | "collapsed": false, 582 | "pycharm": { 583 | "name": "#%%\n" 584 | } 585 | } 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "outputs": [], 591 | "source": [ 592 | "gp_clinical[gp_clinical == \"\"] <- NA" 593 | ], 594 | "metadata": { 595 | "collapsed": false, 596 | "pycharm": { 597 | "name": "#%%\n" 598 | } 599 | } 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "outputs": [], 605 | "source": [ 606 | "gp_clinical = gp_clinical %>% rename(date=\"event_dt\") %>% mutate(date = ymd(as.Date(fast_strptime(date, \"%d/%m/%Y\"))))" 607 | ], 608 | "metadata": { 609 | "collapsed": false, 610 | "pycharm": { 611 | "name": "#%%\n" 612 | } 613 | } 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "outputs": [], 619 | "source": [ 620 | "# clean_dates\n", 621 | "# These data are provided in a form which is as close as possible to how they were issued from their source supplier, in order to avoid potential systematic error or bias by attempting to ‘clean’ them by\n", 622 | "# removing or altering invalid or erroneous information. However, to protect individuals, alterations have been made to dates in relation to participant date of birth as follows:\n", 623 | "\n", 624 | "# - where clinical event or prescription date precedes participant date of birth it has been altered to 01/01/1901.\n", 625 | "# - Where the date matches participant date of birth it has been altered to 02/02/1902.\n", 626 | "# - Where the date follows participant date of birth but is in the year of their birth it has been altered to 03/03/1903.\n", 627 | "# - Where the date was in the future this has been changed to 07/07/2037 as these are likely to have been entered as a place-holder or other system default." 
628 | ], 629 | "metadata": { 630 | "collapsed": false, 631 | "pycharm": { 632 | "name": "#%%\n" 633 | } 634 | } 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "outputs": [], 640 | "source": [ 641 | "gp_clinical = gp_clinical %>% filter(date!=\"2037-07-07\")" 642 | ], 643 | "metadata": { 644 | "collapsed": false, 645 | "pycharm": { 646 | "name": "#%%\n" 647 | } 648 | } 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "source": [ 653 | "### Diagnoses - ICD10" 654 | ], 655 | "metadata": { 656 | "collapsed": false 657 | } 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "outputs": [], 663 | "source": [ 664 | "readv2_icd10 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_v2_icd10.csv\"), -3) %>% rename(read_2=\"read_code\", code =\"icd10_code\") %>% select(read_2, code)\n", 665 | "readv3_icd10 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_ctv3_icd10.csv\"), -3)%>% rename(read_3=\"read_code\", code=\"icd10_code\") %>% select(read_3, code)" 666 | ], 667 | "metadata": { 668 | "collapsed": false, 669 | "pycharm": { 670 | "name": "#%%\n" 671 | } 672 | } 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "outputs": [], 678 | "source": [ 679 | "gp_diagnoses_pre = gp_clinical %>% filter(read_2 %in% readv2_icd10$read_2 | read_3 %in% readv3_icd10$read_3)\n", 680 | "gp_diagnoses_readv2 = gp_diagnoses_pre %>% filter(!is.na(read_2)) %>% left_join(readv2_icd10, by=\"read_2\") %>% drop_na(code) %>% mutate(origin=\"gp_read2\") %>% select(eid, origin, code, date)\n", 681 | "gp_diagnoses_readv3 = gp_diagnoses_pre %>% filter(!is.na(read_3)) %>% left_join(readv3_icd10, by=\"read_3\") %>% drop_na(code) %>% mutate(origin=\"gp_read3\") %>% select(eid, origin, code, date)\n", 682 | "gp_diagnoses_raw = rbind(gp_diagnoses_readv2, gp_diagnoses_readv3)" 683 | ], 684 | "metadata": { 685 | "collapsed": false, 686 | "pycharm": { 687 | "name": "#%%\n" 688 | } 689 | } 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "outputs": [], 695 | "source": [ 696 | "gp_diagnoses = gp_diagnoses_raw %>% mutate(instance=0, level=NA) %>% distinct() %>% group_by(eid) %>% mutate(n = row_number()) %>% mutate(meaning=str_sub(code, 1, 3)) %>% select(c(eid, origin, instance, n, level, code, meaning, date))" 697 | ], 698 | "metadata": { 699 | "collapsed": false, 700 | "pycharm": { 701 | "name": "#%%\n" 702 | } 703 | } 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "outputs": [], 709 | "source": [ 710 | "arrow::write_feather(gp_diagnoses, glue(\"{path}/codes_gp_diagnoses_210119.feather\"))" 711 | ], 712 | "metadata": { 713 | "collapsed": false, 714 | "pycharm": { 715 | "name": "#%%\n" 716 | } 717 | } 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "source": [ 722 | "### Procedures - Snomed CT" 723 | ], 724 | "metadata": { 725 | "collapsed": false 726 | } 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "outputs": [], 732 | "source": [ 733 | "readv2_opcs4 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_v2_opcs4.csv\"), -3) %>% rename(read_2=\"read_code\", code =\"opcs_4.2_code\") %>% select(read_2, code)\n", 734 | "readv3_opcs4 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_ctv3_opcs4.csv\"), -3)%>% rename(read_3=\"read_code\", code=\"opcs4_code\") %>% 
select(read_3, code)" 735 | ], 736 | "metadata": { 737 | "collapsed": false, 738 | "pycharm": { 739 | "name": "#%%\n" 740 | } 741 | } 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "outputs": [], 747 | "source": [ 748 | "gp_procedures_pre = gp_clinical %>% filter(read_2 %in% readv2_opcs4$read_2 | read_3 %in% readv3_opcs4$read_3)" 749 | ], 750 | "metadata": { 751 | "collapsed": false, 752 | "pycharm": { 753 | "name": "#%%\n" 754 | } 755 | } 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "outputs": [], 761 | "source": [ 762 | "gp_procedures_readv2 = gp_procedures_pre %>% filter(!is.na(read_2)) %>% left_join(readv2_opcs4, on=\"read_2\") %>% drop_na(code) %>% mutate(origin=\"gp_read2\") %>% select(eid, origin, code, date)\n", 763 | "gp_procedures_readv3 = gp_procedures_pre %>% filter(!is.na(read_3)) %>% left_join(readv3_opcs4, on=\"read_3\") %>% drop_na(code) %>% mutate(origin=\"gp_read3\") %>% select(eid, origin, code, date)" 764 | ], 765 | "metadata": { 766 | "collapsed": false, 767 | "pycharm": { 768 | "name": "#%%\n" 769 | } 770 | } 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "outputs": [], 776 | "source": [ 777 | "gp_procedures_raw = rbind(gp_procedures_readv2, gp_procedures_readv3) %>% mutate(instance=0, level=NA) %>% distinct() %>% group_by(eid) %>% mutate(n = row_number()) " 778 | ], 779 | "metadata": { 780 | "collapsed": false, 781 | "pycharm": { 782 | "name": "#%%\n" 783 | } 784 | } 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "outputs": [], 790 | "source": [ 791 | "# opcs4 to snomed mapping\n", 792 | "\n", 793 | "concept_ids_opcs4 = concept %>% filter(vocabulary_id == \"OPCS4\") %>% mutate(concept_code = str_replace(concept_code, \"\\\\.\", \"\"))\n", 794 | "concept_ids_snomed = concept %>% filter(vocabulary_id == \"SNOMED\" & domain_id==\"Procedure\") \n", 795 | "\n", 796 | "# check necessary opcs4 concept ids\n", 797 | "concept_ids = concept_ids_opcs4 %>% mutate(concept_id_1 = concept_id)\n", 798 | "cr_filtered = concept_relationship %>% filter(concept_id_1 %in% concept_ids_opcs4$concept_id) %>% filter(concept_id_2 %in% concept_ids_snomed$concept_id) %>% arrange(concept_id_1)\n", 799 | "\n", 800 | "mapping_opcs4_snomed = concept_ids_opcs4 %>% \n", 801 | " left_join(cr_filtered %>% select(concept_id_1, concept_id_2), by=c(\"concept_id\"=\"concept_id_1\")) %>% \n", 802 | " left_join(concept_ids_snomed %>% select(concept_id, concept_code, concept_name), by=c(\"concept_id_2\"=\"concept_id\")) %>% \n", 803 | " mutate(code = concept_code.x, meaning=concept_code.y, name=concept_name.y)" 804 | ], 805 | "metadata": { 806 | "collapsed": false, 807 | "pycharm": { 808 | "name": "#%%\n" 809 | } 810 | } 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "outputs": [], 816 | "source": [ 817 | "gp_procedures = gp_procedures_raw %>% left_join(mapping_opcs4_snomed %>% select(code, meaning, name), by=\"code\") %>% select(eid, origin, instance, n, level, date, code, meaning, name) %>% arrange(eid, date)" 818 | ], 819 | "metadata": { 820 | "collapsed": false, 821 | "pycharm": { 822 | "name": "#%%\n" 823 | } 824 | } 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "outputs": [], 830 | "source": [ 831 | "arrow::write_feather(gp_procedures, glue(\"{path}/codes_gp_procedures_210119.feather\"))" 832 | ], 833 | "metadata": { 834 | "collapsed": false, 835 | "pycharm": { 836 | "name": "#%%\n" 837 | } 838 | } 
839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "source": [ 843 | "### Measurements - Snomed CT" 844 | ], 845 | "metadata": { 846 | "collapsed": false 847 | } 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "outputs": [], 853 | "source": [ 854 | "readv2_readv3 = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_v2_read_ctv3.csv\"), -3) %>% rename(read_2=\"READV2_CODE\", code =\"READV3_CODE\", name =\"TERMV3_DESC\") %>% select(read_2, code)" 855 | ], 856 | "metadata": { 857 | "collapsed": false, 858 | "pycharm": { 859 | "name": "#%%\n" 860 | } 861 | } 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "outputs": [], 867 | "source": [ 868 | "gp_meas = gp_clinical %>% filter(!is.na(value1)) %>% distinct()" 869 | ], 870 | "metadata": { 871 | "collapsed": false, 872 | "pycharm": { 873 | "name": "#%%\n" 874 | } 875 | } 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "outputs": [], 881 | "source": [ 882 | "gp_meas_readv2 = gp_meas %>% filter(!is.na(read_2)) %>% left_join(readv2_readv3, by=\"read_2\")" 883 | ], 884 | "metadata": { 885 | "collapsed": false, 886 | "pycharm": { 887 | "name": "#%%\n" 888 | } 889 | } 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "outputs": [], 895 | "source": [ 896 | "gp_meas_readv3 = gp_meas %>% filter(!is.na(read_3)) %>% mutate(code=read_3)" 897 | ], 898 | "metadata": { 899 | "collapsed": false, 900 | "pycharm": { 901 | "name": "#%%\n" 902 | } 903 | } 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "outputs": [], 909 | "source": [ 910 | "gp_meas_all = rbind(gp_meas_readv2, gp_meas_readv3) %>% distinct() %>% group_by(eid) " 911 | ], 912 | "metadata": { 913 | "collapsed": false, 914 | "pycharm": { 915 | "name": "#%%\n" 916 | } 917 | } 918 | }, 919 | { 920 | "cell_type": "code", 921 | "execution_count": null, 922 | "outputs": [], 923 | "source": [ 924 | "readv3_lkp = head(fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/read_ctv3_lkp.csv\"), -3)%>% rename(code=\"read_code\", name =\"term_description\") %>% select(code, name)\n", 925 | "readv3_sct = fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/CTV3SCTMAP.csv\")%>% rename(SCUI=\"V1\", STUI=\"V2\", TCUI=\"V3\", TTUI=\"V4\")%>% rename(code=\"SCUI\", meaning=\"TCUI\") %>% select(code, meaning)\n", 926 | "#readct_sct = fread(\"/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb/mapping/gp_codings/RCTSCTMAP.csv\")%>% rename(SCUI=\"V1\", STUI=\"V2\", TCUI=\"V3\", TTUI=\"V4\")#%>% rename(code=\"read_code\", name =\"term_description\") %>% select(code, name)#" 927 | ], 928 | "metadata": { 929 | "collapsed": false, 930 | "pycharm": { 931 | "name": "#%%\n" 932 | } 933 | } 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": null, 938 | "outputs": [], 939 | "source": [ 940 | "gp_meas = gp_meas_all %>% left_join(readv3_lkp, by=\"code\")" 941 | ], 942 | "metadata": { 943 | "collapsed": false, 944 | "pycharm": { 945 | "name": "#%%\n" 946 | } 947 | } 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "outputs": [], 953 | "source": [ 954 | "concept_ids_snomed = concept %>% filter(vocabulary_id == \"SNOMED\") %>% rename(name=\"concept_name\", meaning=\"concept_code\") %>% select(meaning, name)" 955 | ], 956 | "metadata": { 957 | "collapsed": false, 958 | "pycharm": { 959 | "name": "#%%\n" 960 | } 961 | } 962 | 
}, 963 | { 964 | "cell_type": "code", 965 | "execution_count": null, 966 | "outputs": [], 967 | "source": [ 968 | "gp_meas_uncleaned = gp_meas_all %>% left_join(readv3_sct, by=\"code\") %>% left_join(concept_ids_snomed, by=\"meaning\") %>% distinct()" 969 | ], 970 | "metadata": { 971 | "collapsed": false, 972 | "pycharm": { 973 | "name": "#%%\n" 974 | } 975 | } 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": null, 980 | "outputs": [], 981 | "source": [ 982 | "gp_meas_cleaned_1 = gp_meas_uncleaned %>% select(eid, date, code, value1, value2, value3, meaning, name) %>% distinct() %>% filter(value1!=0)" 983 | ], 984 | "metadata": { 985 | "collapsed": false, 986 | "pycharm": { 987 | "name": "#%%\n" 988 | } 989 | } 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "outputs": [], 995 | "source": [ 996 | "gp_meas_cleaned_2 = gp_meas_cleaned_1 %>% ungroup() %>% filter(!is.na(meaning))" 997 | ], 998 | "metadata": { 999 | "collapsed": false, 1000 | "pycharm": { 1001 | "name": "#%%\n" 1002 | } 1003 | } 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": null, 1008 | "outputs": [], 1009 | "source": [ 1010 | "double_df = gp_meas_cleaned_2 %>% filter(!is.na(as.numeric(value1)) & !is.na(as.numeric(value2))) " 1011 | ], 1012 | "metadata": { 1013 | "collapsed": false, 1014 | "pycharm": { 1015 | "name": "#%%\n" 1016 | } 1017 | } 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": null, 1022 | "outputs": [], 1023 | "source": [ 1024 | "# clean blood pressure and map to systolic and diastolic\n", 1025 | "bp_double_mapped = double_df %>% filter(name %in% c('O/E - blood pressure reading', 'O/E - BP reading normal', 'O/E - BP reading raised',\n", 1026 | " 'O/E - BP borderline raised', 'O/E - Systolic BP reading', 'O/E - Diastolic BP reading', 'Sitting blood pressure', \"Average home systolic blood pressure\",\n", 1027 | " 'Standing blood pressure','24 hr blood pressure monitoring')) %>% \n", 1028 | " #filter(name %in% c('O/E - Systolic BP reading', 'O/E - Diastolic BP reading', \"Average home systolic blood pressure\")) %>%\n", 1029 | " filter(as.numeric(value1)>0) %>% \n", 1030 | " mutate(value_high = pmax(as.numeric(value1), as.numeric(value2)), value_low = pmin(as.numeric(value1), as.numeric(value2))) %>% \n", 1031 | " filter(value_high>40 & value_low>20 & value_high<400 & value_low<300) %>% rename(\"163030003\" = \"value_high\", \"163031004\" = \"value_low\") %>% \n", 1032 | " select(-c(meaning, name)) %>% pivot_longer(c(\"163030003\", \"163031004\"), names_to=\"meaning\", values_to=\"value\") %>% left_join(concept_ids_snomed, by=\"meaning\") %>% distinct() %>% arrange(eid) %>%\n", 1033 | " select(eid, date, code, value1, value2, value3, meaning, name, value)" 1034 | ], 1035 | "metadata": { 1036 | "collapsed": false, 1037 | "pycharm": { 1038 | "name": "#%%\n" 1039 | } 1040 | } 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "outputs": [], 1046 | "source": [ 1047 | "gp_meas_single = gp_meas_cleaned_2 %>% filter(is.na(as.numeric(value1)) | is.na(as.numeric(value2))) %>%\n", 1048 | " mutate(value=case_when(!is.na(as.numeric(value1)) ~ as.numeric(value1), is.na(as.numeric(value1)) ~ as.numeric(value2))) %>% filter(!is.na(value))" 1049 | ], 1050 | "metadata": { 1051 | "collapsed": false, 1052 | "pycharm": { 1053 | "name": "#%%\n" 1054 | } 1055 | } 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": null, 1060 | "outputs": [], 1061 | "source": [ 1062 | 
"gp_meas_cleaned_3 = rbind(gp_meas_single, bp_double_mapped) %>% distinct() %>% arrange(eid, date)" 1063 | ], 1064 | "metadata": { 1065 | "collapsed": false, 1066 | "pycharm": { 1067 | "name": "#%%\n" 1068 | } 1069 | } 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "outputs": [], 1075 | "source": [ 1076 | "arrow::write_feather(gp_meas_cleaned_3, glue(\"{path}/codes_gp_measurements_210120.feather\"))" 1077 | ], 1078 | "metadata": { 1079 | "collapsed": false, 1080 | "pycharm": { 1081 | "name": "#%%\n" 1082 | } 1083 | } 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "source": [ 1088 | "### Prescriptions - RXNorm" 1089 | ], 1090 | "metadata": { 1091 | "collapsed": false 1092 | } 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": null, 1097 | "outputs": [], 1098 | "source": [ 1099 | "gp_scripts = fread(\"/data/project/uk_bb/cvd/data/ukb_downloads/updated_showcase_43098/ukb_data/records/gp_scripts.txt\")" 1100 | ], 1101 | "metadata": { 1102 | "collapsed": false, 1103 | "pycharm": { 1104 | "name": "#%%\n" 1105 | } 1106 | } 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": null, 1111 | "outputs": [], 1112 | "source": [ 1113 | "gp_scripts[gp_scripts == \"\"] <- NA" 1114 | ], 1115 | "metadata": { 1116 | "collapsed": false, 1117 | "pycharm": { 1118 | "name": "#%%\n" 1119 | } 1120 | } 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": null, 1125 | "outputs": [], 1126 | "source": [ 1127 | "gp_scripts = gp_scripts %>% mutate(date = ymd(as.Date(fast_strptime(issue_date, \"%d/%m/%Y\"))))" 1128 | ], 1129 | "metadata": { 1130 | "collapsed": false, 1131 | "pycharm": { 1132 | "name": "#%%\n" 1133 | } 1134 | } 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "outputs": [], 1140 | "source": [ 1141 | "gp_scripts_names_available = gp_scripts %>% filter(!is.na(drug_name))" 1142 | ], 1143 | "metadata": { 1144 | "collapsed": false, 1145 | "pycharm": { 1146 | "name": "#%%\n" 1147 | } 1148 | } 1149 | }, 1150 | { 1151 | "cell_type": "code", 1152 | "execution_count": null, 1153 | "outputs": [], 1154 | "source": [ 1155 | "gp_scripts_read_available = gp_scripts %>% filter(is.na(drug_name))" 1156 | ], 1157 | "metadata": { 1158 | "collapsed": false, 1159 | "pycharm": { 1160 | "name": "#%%\n" 1161 | } 1162 | } 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "execution_count": null, 1167 | "outputs": [], 1168 | "source": [ 1169 | "drug_names = (gp_scripts_names_available %>% count(drug_name, sort=TRUE))$drug_name" 1170 | ], 1171 | "metadata": { 1172 | "collapsed": false, 1173 | "pycharm": { 1174 | "name": "#%%\n" 1175 | } 1176 | } 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": null, 1181 | "outputs": [], 1182 | "source": [ 1183 | "library(jsonlite)\n", 1184 | "write_json(drug_names, glue(\"{path}/drug_names.json\"))" 1185 | ], 1186 | "metadata": { 1187 | "collapsed": false, 1188 | "pycharm": { 1189 | "name": "#%%\n" 1190 | } 1191 | } 1192 | }, 1193 | { 1194 | "cell_type": "code", 1195 | "execution_count": null, 1196 | "outputs": [], 1197 | "source": [ 1198 | "name_umls_link = arrow::read_feather(glue(\"{path}/drug_names_umls_linked.feather\"))" 1199 | ], 1200 | "metadata": { 1201 | "collapsed": false, 1202 | "pycharm": { 1203 | "name": "#%%\n" 1204 | } 1205 | } 1206 | }, 1207 | { 1208 | "cell_type": "code", 1209 | "execution_count": null, 1210 | "outputs": [], 1211 | "source": [ 1212 | "drugs_rxnorm = 
arrow::read_feather(glue(\"{path}/drug_names_umls_linked_rxnorm.feather\"))" 1213 | ], 1214 | "metadata": { 1215 | "collapsed": false, 1216 | "pycharm": { 1217 | "name": "#%%\n" 1218 | } 1219 | } 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": null, 1224 | "outputs": [], 1225 | "source": [ 1226 | "rx_mapping = concept %>% filter(vocabulary_id %in% c('RxNorm','RxNorm Extension')) %>% select(concept_code, concept_name) %>% rename(rx_code =\"concept_code\", name=\"concept_name\")" 1227 | ], 1228 | "metadata": { 1229 | "collapsed": false, 1230 | "pycharm": { 1231 | "name": "#%%\n" 1232 | } 1233 | } 1234 | }, 1235 | { 1236 | "cell_type": "code", 1237 | "execution_count": null, 1238 | "outputs": [], 1239 | "source": [ 1240 | "rx_norm_mapping_table = drugs_rxnorm %>% select(drug_name, rx_code) %>% filter(rx_code != \"\") %>% distinct() %>% left_join(rx_mapping, by=\"rx_code\")" 1241 | ], 1242 | "metadata": { 1243 | "collapsed": false, 1244 | "pycharm": { 1245 | "name": "#%%\n" 1246 | } 1247 | } 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "outputs": [], 1253 | "source": [ 1254 | "gp_scripts_rxnorm = gp_scripts_names_available %>% left_join(rx_norm_mapping_table, by=\"drug_name\") %>% select(eid, date, drug_name, rx_code, name) %>% distinct()" 1255 | ], 1256 | "metadata": { 1257 | "collapsed": false, 1258 | "pycharm": { 1259 | "name": "#%%\n" 1260 | } 1261 | } 1262 | }, 1263 | { 1264 | "cell_type": "code", 1265 | "execution_count": null, 1266 | "outputs": [], 1267 | "source": [ 1268 | "arrow::write_feather(gp_scripts_rxnorm, glue(\"{path}/codes_gp_prescription_scispacy.feather\"))" 1269 | ], 1270 | "metadata": { 1271 | "collapsed": false, 1272 | "pycharm": { 1273 | "name": "#%%\n" 1274 | } 1275 | } 1276 | } 1277 | ], 1278 | "metadata": { 1279 | "hide_input": false, 1280 | "kernelspec": { 1281 | "display_name": "R [conda env:python]", 1282 | "language": "R", 1283 | "name": "conda-env-python-r" 1284 | }, 1285 | "language_info": { 1286 | "codemirror_mode": "r", 1287 | "file_extension": ".r", 1288 | "mimetype": "text/x-r-source", 1289 | "name": "R", 1290 | "pygments_lexer": "r", 1291 | "version": "4.0.3" 1292 | } 1293 | }, 1294 | "nbformat": 4, 1295 | "nbformat_minor": 4 1296 | } --------------------------------------------------------------------------------