├── Cancer ├── experiments.ipynb ├── grouped_embeddings_to_matrices.ipynb ├── process_EHR_data_omics.ipynb └── process_PT_data.ipynb ├── LICENSE ├── Onset of Labor ├── data │ ├── data_generation.ipynb │ ├── ool_EHR_features.csv │ ├── processed_data │ │ ├── RNN_data_codes_with_obs.npy │ │ ├── RNN_data_codes_with_obs_word2vec_from_ool.npy │ │ ├── RNN_data_full_EHR_cohort_with_obs_fixed.npy │ │ ├── RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy │ │ ├── RNN_data_lengths_with_obs.npy │ │ ├── RNN_data_lengths_with_obs_word2vec_from_ool.npy │ │ ├── RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy │ │ ├── RNN_data_outcomes_with_obs.npy │ │ ├── RNN_data_outcomes_with_obs_word2vec_from_ool.npy │ │ ├── ool_proteomics_omop_id.csv │ │ ├── sampleID_indices.csv │ │ ├── sampleID_indices_full_cohort_with_obs_fixed.csv │ │ ├── sampleID_indices_with_obs.csv │ │ └── sampleID_indices_with_obs_word2vec_from_ool.csv │ └── raw_data │ │ └── EHR │ │ ├── EHR_cohort_conditions.csv │ │ ├── EHR_cohort_drugs.csv │ │ ├── EHR_cohort_measurements.csv │ │ ├── EHR_cohort_observations.csv │ │ ├── EHR_cohort_procedures.csv │ │ ├── full_EHR_cohort_conditions.csv │ │ ├── full_EHR_cohort_drugs.csv │ │ ├── full_EHR_cohort_measurements.csv │ │ ├── full_EHR_cohort_observations.csv │ │ └── full_EHR_cohort_procedures.csv ├── experiments.ipynb ├── process_EHR_data_full_PT_cohort.ipynb ├── process_EHR_data_omics_cohort.ipynb └── process_EHR_data_omics_cohort_with_PT_word2vec.ipynb ├── README.md └── environment.yml /Cancer/grouped_embeddings_to_matrices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from tqdm.notebook import tqdm\n", 14 | "import os\n", 15 | "from datetime import timedelta\n", 16 | "import pyspark\n", 17 | "import dxpy\n", 18 | 
"import dxdata\n", 19 | "import pandas as pd\n", 20 | "import random\n", 21 | "from pyspark.sql import functions as F\n", 22 | "from pyspark.sql import SparkSession\n", 23 | "from pyspark.ml.feature import Word2Vec\n", 24 | "from pyspark.sql.functions import col, udf, to_date, mean, expr\n", 25 | "from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType\n", 26 | "from pyspark.ml.feature import Word2Vec\n", 27 | "from pyspark.sql.window import Window\n", 28 | "import ast\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "sc = pyspark.SparkContext()\n", 40 | "spark = pyspark.sql.SparkSession(sc)\n", 41 | "dispensed_database_name = dxpy.find_one_data_object(classname=\"database\", name=\"app*\", folder=\"/\", name_mode=\"glob\", describe=True)[\"describe\"][\"name\"]\n", 42 | "dispensed_dataset_id = dxpy.find_one_data_object(typename=\"Dataset\", name=\"app*.dataset\", folder=\"/\", name_mode=\"glob\")[\"id\"]\n", 43 | "spark.sql(\"USE \" + dispensed_database_name)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Omics Data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "#contains death records\n", 62 | "death = pd.read_csv('/mnt/project/death.csv').drop('Unnamed: 0',axis=1).drop_duplicates()\n", 63 | "death['death_date'] = pd.to_datetime(death['death_date'], yearfirst=True)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "#contains all cancer diagnoses for omics patients\n", 75 | "prot_date = pd.read_csv('/mnt/project/cancer_conds.csv', usecols=['eid','proteomics_date']).drop_duplicates()\n", 76 | "prot_date['proteomics_date'] = 
pd.to_datetime(prot_date['proteomics_date'], yearfirst=True)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "tags": [] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "pdf = prot_date.merge(death, how='inner', on='eid')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "tags": [] 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "pdf = pdf[pdf['death_date'] <= (pdf['proteomics_date'] + pd.DateOffset(years=3))]\n", 99 | "pdf['indicator'] = 1" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "tags": [] 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "patient_day_embeddings = pd.read_csv('/mnt/project/patient_day_embeddings_omics_omicsword2vec_lc.csv').drop('Unnamed: 0',axis=1)\n", 111 | "max_dates = 32" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "patient_day_embeddings = patient_day_embeddings.sort_values(['eid','record_date'])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "tags": [] 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "unique_patients = patient_day_embeddings['eid'].nunique()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#create np matrix to store input data, assign each patient to an index\n", 145 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['eid'].unique())}\n", 146 | "RNN_data = np.full((400, max_dates, unique_patients), np.nan)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "outputs": [], 156 | "source": [ 
157 | "%%time\n", 158 | "#populate np matrix with data\n", 159 | "date_position = {}\n", 160 | "for index, row in tqdm(patient_day_embeddings.iterrows(), total=patient_day_embeddings.shape[0]):\n", 161 | " patient_id = row['eid']\n", 162 | " patient_index = patient_id_to_index[patient_id]\n", 163 | " \n", 164 | " if patient_id not in date_position:\n", 165 | " date_position[patient_id] = 0\n", 166 | " else:\n", 167 | " date_position[patient_id] += 1\n", 168 | " \n", 169 | " date_index = date_position[patient_id]\n", 170 | " \n", 171 | " for feature_index, feature_value in enumerate(row.drop(['eid', 'record_date'])):\n", 172 | " if date_index < max_dates:\n", 173 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "tags": [] 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "RNN_data = RNN_data.transpose(2,1,0)\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "np.save('RNN_data_omics_omicsw2v_lc.npy', RNN_data)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "tags": [] 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "%%bash\n", 207 | "dx upload RNN_data_omics_omicsw2v_lc.npy --path /" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "tags": [] 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('eid_indices_omics_omicsw2v_lc.csv')\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "tags": [] 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "%%bash\n", 230 | "dx upload eid_indices_omics_omicsw2v_lc.csv --path /" 231 | ] 232 
| }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "tags": [] 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "np.save('./visit_count_omics_omicsw2v_lc.npy',patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "tags": [] 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "%%bash\n", 253 | "dx upload visit_count_omics_omicsw2v_lc.npy --path /" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "tags": [] 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "idx_df = pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "tags": [] 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "pdf['eid'] = pdf['eid'].astype(int)\n", 276 | "outcomes = idx_df.merge(pdf[['eid','indicator']], how='left', left_on=0, right_on='eid').fillna(0).sort_values(1)['indicator'].astype(int).values" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "tags": [] 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "np.save('./outcomes_omics_omicsw2v_lc_3yr.npy', outcomes)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "tags": [] 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "%%bash\n", 299 | "dx upload outcomes_omics_omicsw2v_lc_3yr.npy --path /" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "### PT Data" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "tags": [] 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "patient_day_embeddings = 
pd.read_csv('/mnt/project/patient_day_embeddings_PT_lc_LARGER.csv').drop('Unnamed: 0',axis=1)\n", 318 | "max_dates = 32" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "tags": [] 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "patient_day_embeddings = patient_day_embeddings.drop_duplicates(['eid','record_date'])" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "tags": [] 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "eids_omics = pd.read_csv('/mnt/project/eid_indices_omics_omicsw2v_lc.csv')['0']" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "tags": [] 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "patient_day_embeddings = patient_day_embeddings[~patient_day_embeddings['eid'].isin(eids_omics)]" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "patient_day_embeddings.shape" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "tags": [] 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "patient_day_embeddings['eid'].nunique()" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "tags": [] 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "combined_query = spark.sql(\"\"\"\n", 385 | "WITH EarliestCConds AS (\n", 386 | " SELECT \n", 387 | " c.eid,\n", 388 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 389 | " FROM \n", 390 | " omop_condition_occurrence c\n", 391 | " WHERE \n", 392 | " c.condition_source_value LIKE 'C%'\n", 393 | " GROUP BY \n", 394 | " c.eid\n", 395 | "),\n", 396 | "FilteredPatients AS (\n", 397 | " SELECT \n", 398 | " ecc.eid,\n", 399 | " ecc.earliest_cond_date,\n", 400 
| " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 401 | " FROM \n", 402 | " EarliestCConds ecc\n", 403 | " INNER JOIN \n", 404 | " participant_0001 p ON ecc.eid = p.eid\n", 405 | " WHERE \n", 406 | " ecc.earliest_cond_date <= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), 60) AND\n", 407 | " ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 408 | ")\n", 409 | "\n", 410 | "SELECT DISTINCT\n", 411 | " fp.eid, \n", 412 | " d.death_date,\n", 413 | " fp.earliest_cond_date AS proteomics_date\n", 414 | "FROM \n", 415 | " FilteredPatients fp\n", 416 | "JOIN\n", 417 | " omop_death d ON d.eid=fp.eid\n", 418 | "\n", 419 | "\"\"\")" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "tags": [] 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "%%time\n", 431 | "#query for death data and cancer diagnosis date\n", 432 | "combined_query_results = combined_query.collect()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "tags": [] 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "%%time\n", 444 | "pdf = pd.DataFrame(combined_query_results, columns=[field.name for field in combined_query.schema.fields])" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "tags": [] 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "pdf['proteomics_date'] = pd.to_datetime(pdf['proteomics_date'], yearfirst=True)\n", 456 | "pdf['death_date'] = pd.to_datetime(pdf['death_date'], dayfirst=True)\n", 457 | "pdf = pdf.drop_duplicates()\n", 458 | "pdf = pdf[pdf['death_date'] <= (pdf['proteomics_date'] + pd.DateOffset(years=3))]\n", 459 | "pdf['indicator'] = 1" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "tags": [] 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "patient_day_embeddings = 
patient_day_embeddings.sort_values(['eid','record_date'])" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "tags": [] 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "unique_patients = patient_day_embeddings['eid'].nunique()" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "tags": [] 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['eid'].unique())}\n", 493 | "RNN_data = np.full((400, max_dates, unique_patients), np.nan)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "tags": [] 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "%%time\n", 505 | "date_position = {}\n", 506 | "for index, row in tqdm(patient_day_embeddings.iterrows(), total=patient_day_embeddings.shape[0]):\n", 507 | " patient_id = row['eid']\n", 508 | " patient_index = patient_id_to_index[patient_id]\n", 509 | " \n", 510 | " if patient_id not in date_position:\n", 511 | " date_position[patient_id] = 0\n", 512 | " else:\n", 513 | " date_position[patient_id] += 1\n", 514 | " \n", 515 | " date_index = date_position[patient_id]\n", 516 | " \n", 517 | " for feature_index, feature_value in enumerate(row.drop(['eid', 'record_date'])):\n", 518 | " if date_index < max_dates:\n", 519 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "tags": [] 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "RNN_data = RNN_data.transpose(2,1,0)\n" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": { 537 | "tags": [] 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "np.save('RNN_data_PT_lc_LARGER.npy', RNN_data)" 542 | ] 543 | }, 544 | { 545 | 
"cell_type": "code", 546 | "execution_count": null, 547 | "metadata": { 548 | "tags": [] 549 | }, 550 | "outputs": [], 551 | "source": [ 552 | "%%bash\n", 553 | "dx upload RNN_data_PT_lc_LARGER.npy --path /" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": { 560 | "tags": [] 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('eid_indices_PT_lc_LARGER.csv')\n" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "tags": [] 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "%%bash\n", 576 | "dx upload eid_indices_PT_lc_LARGER.csv --path /" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "tags": [] 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "np.save('./visit_count_PT_lc_LARGER.npy',patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": { 594 | "tags": [] 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "%%bash\n", 599 | "dx upload visit_count_PT_lc_LARGER.npy --path /" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": { 606 | "tags": [] 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "patient_day_embeddings['eid'].nunique()" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "tags": [] 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": { 628 | "tags": [] 629 | }, 630 | "outputs": [], 631 | "source": [ 632 | "idx_df = 
pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": { 639 | "tags": [] 640 | }, 641 | "outputs": [], 642 | "source": [ 643 | "#one patient has duplicated death but the two records are a day apart and doesn't affect labeling\n", 644 | "pdf = pdf.drop_duplicates('eid')\n" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": { 651 | "tags": [] 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "pdf['eid'] = pdf['eid'].astype(int)\n", 656 | "outcomes = idx_df.merge(pdf[['eid','indicator']], how='left', left_on=0, right_on='eid').fillna(0).sort_values(1)['indicator'].astype(int).values" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": { 663 | "tags": [] 664 | }, 665 | "outputs": [], 666 | "source": [ 667 | "outcomes.mean()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": { 674 | "tags": [] 675 | }, 676 | "outputs": [], 677 | "source": [ 678 | "np.save('./outcomes_PT_lc_LARGER.npy', outcomes)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "tags": [] 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "%%bash\n", 690 | "dx upload outcomes_PT_lc_LARGER.npy --path /" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": { 697 | "tags": [] 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "len(outcomes)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "metadata": {}, 707 | "source": [ 708 | "### Pull Proteomics" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": { 715 | "tags": [] 716 | }, 717 | "outputs": [], 718 | "source": [ 719 | "%%time\n", 720 | "# Initialize Spark session\n", 721 | "spark = 
SparkSession.builder.appName(\"Proteomics Data Aggregation\").getOrCreate()\n", 722 | "\n", 723 | "# List of table names\n", 724 | "table_names = [f\"olink_instance_0_{str(i).zfill(4)}\" for i in range(1, 13)]\n", 725 | "\n", 726 | "# Create DataFrame for the first table\n", 727 | "combined_df = spark.table(table_names[0])\n", 728 | "\n", 729 | "# Join the rest of the tables\n", 730 | "for table_name in table_names[1:]:\n", 731 | " # Join each table on 'eid'\n", 732 | " next_table_df = spark.table(table_name)\n", 733 | " combined_df = combined_df.join(next_table_df, \"eid\", \"left\")\n", 734 | "\n", 735 | "# Write the result to a CSV file\n", 736 | "combined_df.write.csv(\"all_proteomics\", header=True)\n" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": { 743 | "tags": [] 744 | }, 745 | "outputs": [], 746 | "source": [ 747 | "%%bash\n", 748 | "hdfs dfs -ls ./all_proteomics" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "metadata": { 755 | "tags": [] 756 | }, 757 | "outputs": [], 758 | "source": [ 759 | "%%bash\n", 760 | "hdfs dfs -get ./all_proteomics ./\n" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": { 767 | "tags": [] 768 | }, 769 | "outputs": [], 770 | "source": [ 771 | "%%time\n", 772 | "# Directory containing your CSV files\n", 773 | "directory = './all_proteomics/'\n", 774 | "\n", 775 | "# Read and combine all CSV files in the directory\n", 776 | "all_csvs = [pd.read_csv(os.path.join(directory, file)) for file in os.listdir(directory) if file.endswith('.csv')]\n", 777 | "combined_df = pd.concat(all_csvs, ignore_index=True)\n", 778 | "\n", 779 | "# Write the combined DataFrame to a new CSV file\n", 780 | "combined_df.to_csv('./all_proteomics_lc.csv', index=False)\n" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": { 787 | "tags": [] 788 | }, 789 | "outputs": 
[], 790 | "source": [ 791 | "%%bash\n", 792 | "dx upload all_proteomics_lc.csv --path /" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [] 801 | } 802 | ], 803 | "metadata": { 804 | "kernelspec": { 805 | "display_name": "Python 3", 806 | "language": "python", 807 | "name": "python3" 808 | }, 809 | "language_info": { 810 | "codemirror_mode": { 811 | "name": "ipython", 812 | "version": 3 813 | }, 814 | "file_extension": ".py", 815 | "mimetype": "text/x-python", 816 | "name": "python", 817 | "nbconvert_exporter": "python", 818 | "pygments_lexer": "ipython3", 819 | "version": "3.8.5" 820 | } 821 | }, 822 | "nbformat": 4, 823 | "nbformat_minor": 4 824 | } 825 | -------------------------------------------------------------------------------- /Cancer/process_EHR_data_omics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pyspark\n", 12 | "import dxpy\n", 13 | "import dxdata\n", 14 | "import pandas as pd\n", 15 | "import random\n", 16 | "from pyspark.sql import functions as F\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "from pyspark.ml.feature import Word2Vec\n", 19 | "from pyspark.sql.functions import col, udf, to_date, mean, expr, concat_ws\n", 20 | "from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType\n", 21 | "from pyspark.ml.feature import Word2Vec\n", 22 | "from pyspark.sql.window import Window\n", 23 | "import ast" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "tags": [] 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "sc = pyspark.SparkContext()\n", 35 | "spark = pyspark.sql.SparkSession(sc)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | 
"metadata": { 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "dispensed_database_name = dxpy.find_one_data_object(classname=\"database\", name=\"app*\", folder=\"/\", name_mode=\"glob\", describe=True)[\"describe\"][\"name\"]\n", 47 | "dispensed_dataset_id = dxpy.find_one_data_object(typename=\"Dataset\", name=\"app*.dataset\", folder=\"/\", name_mode=\"glob\")[\"id\"]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "tags": [] 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "spark.sql(\"USE \" + dispensed_database_name)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Word2Vec Omics Cohort " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "\n", 77 | "combined_query = spark.sql(\"\"\"\n", 78 | "WITH EarliestCConds AS (\n", 79 | " SELECT \n", 80 | " c.eid,\n", 81 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 82 | " FROM \n", 83 | " omop_condition_occurrence c\n", 84 | " INNER JOIN \n", 85 | " olink_instance_0_0001 o ON c.eid = o.eid\n", 86 | " WHERE \n", 87 | " c.condition_source_value LIKE 'C%'\n", 88 | " GROUP BY \n", 89 | " c.eid\n", 90 | "),\n", 91 | "FilteredPatients AS (\n", 92 | " SELECT \n", 93 | " ecc.eid,\n", 94 | " ecc.earliest_cond_date,\n", 95 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 96 | " FROM \n", 97 | " EarliestCConds ecc\n", 98 | " INNER JOIN \n", 99 | " participant_0001 p ON ecc.eid = p.eid\n", 100 | " WHERE \n", 101 | " ecc.earliest_cond_date < TO_DATE(p.p53_i0, 'yyyy-MM-dd')\n", 102 | " AND ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 103 | ")\n", 104 | "\n", 105 | "SELECT \n", 106 | " fp.eid, \n", 107 | " c.concept_id, \n", 108 | " c.record_date,\n", 109 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 
110 | "FROM \n", 111 | " FilteredPatients fp\n", 112 | "JOIN (\n", 113 | " SELECT \n", 114 | " o.eid, \n", 115 | " o.condition_concept_id as concept_id, \n", 116 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 117 | " FROM \n", 118 | " omop_condition_occurrence o\n", 119 | " UNION ALL\n", 120 | " SELECT \n", 121 | " o.eid, \n", 122 | " o.procedure_concept_id as concept_id, \n", 123 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 124 | " FROM \n", 125 | " omop_procedure_occurrence o\n", 126 | " UNION ALL\n", 127 | " SELECT \n", 128 | " o.eid, \n", 129 | " o.drug_concept_id as concept_id, \n", 130 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 131 | " FROM \n", 132 | " omop_drug_exposure o\n", 133 | " UNION ALL\n", 134 | " SELECT \n", 135 | " o.eid, \n", 136 | " o.observation_concept_id as concept_id, \n", 137 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 138 | " FROM \n", 139 | " omop_observation o\n", 140 | " UNION ALL\n", 141 | " SELECT \n", 142 | " o.eid, \n", 143 | " o.measurement_concept_id as concept_id, \n", 144 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 145 | " FROM \n", 146 | " omop_measurement o\n", 147 | ") c ON fp.eid = c.eid\n", 148 | "\n", 149 | "\n", 150 | "\"\"\")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "tags": [] 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "%%time\n", 162 | "distinct_eids = combined_query.select(\"eid\").distinct()\n", 163 | "num_distinct_eids = distinct_eids.count()\n", 164 | "\n", 165 | "print(f\"Number of distinct eids: {num_distinct_eids}\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "tags": [] 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "%%time\n", 177 | "\n", 178 | "combined_query.show()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 
184 | "metadata": { 185 | "tags": [] 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "%%time\n", 190 | "# Count the number of rows in the result\n", 191 | "row_count = combined_query.count()\n", 192 | "\n", 193 | "# Print the row count\n", 194 | "print(f\"Number of rows in the query result: {row_count}\")" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "tags": [] 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "%%time\n", 206 | "\n", 207 | "# Initialize Spark Session\n", 208 | "spark = SparkSession.builder.appName(\"Word2Vec Training\").getOrCreate()\n", 209 | "\n", 210 | "combined_query = combined_query.withColumn(\"concept_id\", combined_query[\"concept_id\"].cast(IntegerType()))\n", 211 | "\n", 212 | "# Group by 'eid' (person_id) and 'month_year'\n", 213 | "grouped_data = (combined_query.groupBy(\"eid\", \"formatted_date\")\n", 214 | " .agg(F.collect_list(\"concept_id\").alias(\"concept_ids\")))\n", 215 | "\n", 216 | "# Define a UDF to convert integers to strings\n", 217 | "int_to_string_udf = udf(lambda x: [str(i) for i in x], ArrayType(StringType()))\n", 218 | "\n", 219 | "# Apply the UDF to the 'concept_ids' column\n", 220 | "word2Vec_data = grouped_data.withColumn(\"words\", int_to_string_udf(col(\"concept_ids\")))\n", 221 | "\n", 222 | "# Define the Word2Vec model\n", 223 | "print('started training')\n", 224 | "word2vec = Word2Vec(vectorSize=400, windowSize=100, minCount=5, inputCol=\"words\", outputCol=\"wordVectors\").setMaxIter(3)\n", 225 | "\n", 226 | "# Fit the model\n", 227 | "model = word2vec.fit(word2Vec_data)\n", 228 | "print('done training')\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "tags": [] 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "word_vectors = model.getVectors()\n", 240 | "\n", 241 | "pandas_df = word_vectors.toPandas()\n", 242 | "\n", 243 | "pandas_df.to_csv(\"./omics_lc_word2vec.csv\", 
index=False)\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "%%bash\n", 255 | "dx upload omics_lc_word2vec.csv --path /" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Word2Vec PT Cohort" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "tags": [] 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "combined_query = spark.sql(\"\"\"\n", 274 | "WITH EarliestCConds AS (\n", 275 | " SELECT \n", 276 | " c.eid,\n", 277 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 278 | " FROM \n", 279 | " omop_condition_occurrence c\n", 280 | " WHERE \n", 281 | " c.condition_source_value LIKE 'C%'\n", 282 | " GROUP BY \n", 283 | " c.eid\n", 284 | "),\n", 285 | "FilteredPatients AS (\n", 286 | " SELECT \n", 287 | " ecc.eid,\n", 288 | " ecc.earliest_cond_date,\n", 289 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 290 | " FROM \n", 291 | " EarliestCConds ecc\n", 292 | " INNER JOIN \n", 293 | " participant_0001 p ON ecc.eid = p.eid\n", 294 | " WHERE \n", 295 | " ecc.earliest_cond_date < TO_DATE(p.p53_i0, 'yyyy-MM-dd')\n", 296 | " AND ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 297 | ")\n", 298 | "\n", 299 | "SELECT \n", 300 | " fp.eid, \n", 301 | " c.concept_id, \n", 302 | " c.record_date,\n", 303 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 304 | "FROM \n", 305 | " FilteredPatients fp\n", 306 | "JOIN (\n", 307 | " SELECT \n", 308 | " o.eid, \n", 309 | " o.condition_concept_id as concept_id, \n", 310 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 311 | " FROM \n", 312 | " omop_condition_occurrence o\n", 313 | " UNION ALL\n", 314 | " SELECT \n", 315 | " o.eid, \n", 316 | " o.procedure_concept_id as concept_id, \n", 
317 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 318 | " FROM \n", 319 | " omop_procedure_occurrence o\n", 320 | " UNION ALL\n", 321 | " SELECT \n", 322 | " o.eid, \n", 323 | " o.drug_concept_id as concept_id, \n", 324 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 325 | " FROM \n", 326 | " omop_drug_exposure o\n", 327 | " UNION ALL\n", 328 | " SELECT \n", 329 | " o.eid, \n", 330 | " o.observation_concept_id as concept_id, \n", 331 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 332 | " FROM \n", 333 | " omop_observation o\n", 334 | " UNION ALL\n", 335 | " SELECT \n", 336 | " o.eid, \n", 337 | " o.measurement_concept_id as concept_id, \n", 338 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 339 | " FROM \n", 340 | " omop_measurement o\n", 341 | ") c ON fp.eid = c.eid\n", 342 | "\n", 343 | "\"\"\")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "tags": [] 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "%%time\n", 355 | "# Count the number of rows in the result\n", 356 | "row_count = combined_query.count()\n", 357 | "\n", 358 | "# Print the row count\n", 359 | "print(f\"Number of rows in the query result: {row_count}\")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "tags": [] 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "%%time\n", 371 | "distinct_eids = combined_query.select(\"eid\").distinct()\n", 372 | "num_distinct_eids = distinct_eids.count()\n", 373 | "\n", 374 | "print(f\"Number of distinct eids: {num_distinct_eids}\")" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "tags": [] 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "%%time\n", 386 | "\n", 387 | "# Initialize Spark Session\n", 388 | "spark = SparkSession.builder.appName(\"Word2Vec Training\").getOrCreate()\n", 389 | 
"\n", 390 | "combined_query = combined_query.withColumn(\"concept_id\", combined_query[\"concept_id\"].cast(IntegerType()))\n", 391 | "\n", 392 | "# Group by 'eid' (person_id) and 'month_year'\n", 393 | "grouped_data = (combined_query.groupBy(\"eid\", \"formatted_date\")\n", 394 | " .agg(F.collect_list(\"concept_id\").alias(\"concept_ids\")))\n", 395 | "\n", 396 | "# Define a UDF to convert integers to strings\n", 397 | "int_to_string_udf = udf(lambda x: [str(i) for i in x], ArrayType(StringType()))\n", 398 | "\n", 399 | "# Apply the UDF to the 'concept_ids' column\n", 400 | "word2Vec_data = grouped_data.withColumn(\"words\", int_to_string_udf(col(\"concept_ids\")))\n", 401 | "\n", 402 | "# Define the Word2Vec model\n", 403 | "print('started training')\n", 404 | "word2vec = Word2Vec(vectorSize=400, windowSize=100, minCount=5, inputCol=\"words\", outputCol=\"wordVectors\").setMaxIter(3)\n", 405 | "\n", 406 | "# Fit the model\n", 407 | "model = word2vec.fit(word2Vec_data)\n", 408 | "print('done training')\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "tags": [] 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "word_vectors = model.getVectors()\n", 420 | "\n", 421 | "pandas_df = word_vectors.toPandas()\n", 422 | "\n", 423 | "pandas_df.to_csv(\"./PT_lc_word2vec.csv\", index=False)\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "tags": [] 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "%%bash\n", 435 | "dx upload PT_lc_word2vec.csv --path /" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "### Downstream Processing for omics cohort only" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 
| "tags": [] 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "word_vectors = pd.read_csv('./PT_lc_word2vec.csv')\n", 461 | "word_vectors['vector'] = word_vectors['vector'].apply(ast.literal_eval)\n", 462 | "\n" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "tags": [] 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "combined_query = spark.sql(\"\"\"\n", 474 | "WITH EarliestCConds AS (\n", 475 | " SELECT \n", 476 | " c.eid,\n", 477 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 478 | " FROM \n", 479 | " omop_condition_occurrence c\n", 480 | " INNER JOIN \n", 481 | " olink_instance_0_0001 o ON c.eid = o.eid\n", 482 | " WHERE \n", 483 | " c.condition_source_value LIKE 'C%'\n", 484 | " GROUP BY \n", 485 | " c.eid\n", 486 | "),\n", 487 | "FilteredPatients AS (\n", 488 | " SELECT \n", 489 | " ecc.eid,\n", 490 | " ecc.earliest_cond_date,\n", 491 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 492 | " FROM \n", 493 | " EarliestCConds ecc\n", 494 | " INNER JOIN \n", 495 | " participant_0001 p ON ecc.eid = p.eid\n", 496 | " WHERE \n", 497 | " ecc.earliest_cond_date < TO_DATE(p.p53_i0, 'yyyy-MM-dd')\n", 498 | " AND ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 499 | ")\n", 500 | "\n", 501 | "SELECT \n", 502 | " fp.eid, \n", 503 | " c.concept_id, \n", 504 | " c.record_date,\n", 505 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 506 | "FROM \n", 507 | " FilteredPatients fp\n", 508 | "JOIN (\n", 509 | " SELECT \n", 510 | " o.eid, \n", 511 | " o.condition_concept_id as concept_id, \n", 512 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 513 | " FROM \n", 514 | " omop_condition_occurrence o\n", 515 | " UNION ALL\n", 516 | " SELECT \n", 517 | " o.eid, \n", 518 | " o.procedure_concept_id as concept_id, \n", 519 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 520 | " FROM \n", 
521 | " omop_procedure_occurrence o\n", 522 | " UNION ALL\n", 523 | " SELECT \n", 524 | " o.eid, \n", 525 | " o.drug_concept_id as concept_id, \n", 526 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 527 | " FROM \n", 528 | " omop_drug_exposure o\n", 529 | " UNION ALL\n", 530 | " SELECT \n", 531 | " o.eid, \n", 532 | " o.observation_concept_id as concept_id, \n", 533 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 534 | " FROM \n", 535 | " omop_observation o\n", 536 | " UNION ALL\n", 537 | " SELECT \n", 538 | " o.eid, \n", 539 | " o.measurement_concept_id as concept_id, \n", 540 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 541 | " FROM \n", 542 | " omop_measurement o\n", 543 | ") c ON fp.eid = c.eid AND c.record_date <= fp.proteomics_date\n", 544 | "\n", 545 | "\"\"\")" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": { 552 | "tags": [] 553 | }, 554 | "outputs": [], 555 | "source": [ 556 | "%%time\n", 557 | "combined_query_results = combined_query.collect()" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "tags": [] 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "%%time\n", 569 | "pdf = pd.DataFrame(combined_query_results, columns=[field.name for field in combined_query.schema.fields])" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "tags": [] 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "word_vectors['word'] = word_vectors['word'].astype(str)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "tags": [] 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "pdf = pdf.merge(word_vectors, how='inner', left_on='concept_id', right_on='word').drop(['word','concept_id'],axis=1)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 
| "metadata": { 598 | "tags": [] 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "pdf.shape, pdf['eid'].nunique()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "tags": [] 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "%%time\n", 614 | "embeddings_df = pd.DataFrame(pdf['vector'].tolist(), index=pdf.index)\n", 615 | "embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]\n", 616 | "\n", 617 | "# Join the new DataFrame with the original DataFrame\n", 618 | "embedded_codes = pdf.join(embeddings_df)\n" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": { 625 | "tags": [] 626 | }, 627 | "outputs": [], 628 | "source": [ 629 | "embedded_codes.shape" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": { 636 | "tags": [] 637 | }, 638 | "outputs": [], 639 | "source": [ 640 | "# Convert 'record_date' to datetime format in Pandas\n", 641 | "embedded_codes['record_date'] = pd.to_datetime(embedded_codes['record_date'], format='%Y-%m-%d')" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": { 648 | "tags": [] 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "%%time\n", 653 | "max_dates=32\n", 654 | "\n", 655 | "# 1. Sort the DataFrame\n", 656 | "embedded_codes = embedded_codes.sort_values(by=['eid', 'record_date'], ascending=[True, False])\n", 657 | "\n", 658 | "# 2. Rank within each 'eid' group\n", 659 | "embedded_codes['date_rank'] = embedded_codes.groupby('eid')['record_date'].rank(method='dense', ascending=False)\n", 660 | "\n", 661 | "# 3. Filter based on rank\n", 662 | "filtered_data_pd = embedded_codes[embedded_codes['date_rank'] <= max_dates]\n", 663 | "\n", 664 | "# 4. 
Define your aggregation expressions\n", 665 | "agg_funcs = {f'embedding_{i}': 'mean' for i in range(400)}\n", 666 | "\n", 667 | "# Apply aggregation with the defined expressions\n", 668 | "patient_day_embeddings_pd = filtered_data_pd.groupby(['eid', 'record_date']).agg(agg_funcs)\n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": { 675 | "tags": [] 676 | }, 677 | "outputs": [], 678 | "source": [ 679 | "%%time\n", 680 | "patient_day_embeddings_pd.reset_index().to_csv('./patient_day_embeddings_omics_PTword2vec_lc.csv', header=True)" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": { 687 | "tags": [] 688 | }, 689 | "outputs": [], 690 | "source": [ 691 | "%%bash\n", 692 | "dx upload patient_day_embeddings_omics_PTword2vec_lc.csv --path /" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [] 701 | } 702 | ], 703 | "metadata": { 704 | "kernelspec": { 705 | "display_name": "Python 3", 706 | "language": "python", 707 | "name": "python3" 708 | }, 709 | "language_info": { 710 | "codemirror_mode": { 711 | "name": "ipython", 712 | "version": 3 713 | }, 714 | "file_extension": ".py", 715 | "mimetype": "text/x-python", 716 | "name": "python", 717 | "nbconvert_exporter": "python", 718 | "pygments_lexer": "ipython3", 719 | "version": "3.8.5" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 4 724 | } 725 | -------------------------------------------------------------------------------- /Cancer/process_PT_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pyspark\n", 12 | "import dxpy\n", 13 | "import dxdata\n", 14 | "import pandas as pd\n", 15 | "import random\n", 16 | 
"from pyspark.sql import functions as F\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "from pyspark.ml.feature import Word2Vec\n", 19 | "from pyspark.sql.functions import col, udf, to_date, mean, expr\n", 20 | "from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType\n", 21 | "from pyspark.ml.feature import Word2Vec\n", 22 | "from pyspark.sql.window import Window\n", 23 | "import ast\n", 24 | "import numpy as np\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "tags": [] 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "spark = SparkSession.builder \\\n", 36 | " .appName(\"MyApp\") \\\n", 37 | " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", 38 | " .config(\"spark.kryoserializer.buffer.max\", \"1g\") \\\n", 39 | " .getOrCreate()\n", 40 | "\n", 41 | "# The SparkContext is accessible from the SparkSession as follows:\n", 42 | "sc = spark.sparkContext" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "dispensed_database_name = dxpy.find_one_data_object(classname=\"database\", name=\"app*\", folder=\"/\", name_mode=\"glob\", describe=True)[\"describe\"][\"name\"]\n", 54 | "dispensed_dataset_id = dxpy.find_one_data_object(typename=\"Dataset\", name=\"app*.dataset\", folder=\"/\", name_mode=\"glob\")[\"id\"]" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "DataFrame[]" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "spark.sql(\"USE \" + dispensed_database_name)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "tags": [] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 
| "#cancer patients with initial diagnosis at most 12 months before initial UKBB visit\n", 88 | "combined_query = spark.sql(\"\"\"\n", 89 | "WITH EarliestCConds AS (\n", 90 | " SELECT \n", 91 | " c.eid,\n", 92 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 93 | " FROM \n", 94 | " omop_condition_occurrence c\n", 95 | " WHERE \n", 96 | " c.condition_source_value LIKE 'C%'\n", 97 | " GROUP BY \n", 98 | " c.eid\n", 99 | "),\n", 100 | "FilteredPatients AS (\n", 101 | " SELECT \n", 102 | " ecc.eid,\n", 103 | " ecc.earliest_cond_date,\n", 104 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 105 | " FROM \n", 106 | " EarliestCConds ecc\n", 107 | " INNER JOIN \n", 108 | " participant_0001 p ON ecc.eid = p.eid\n", 109 | " WHERE \n", 110 | " ecc.earliest_cond_date <= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), 60) AND\n", 111 | " ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 112 | ")\n", 113 | "\n", 114 | "SELECT \n", 115 | " fp.eid, \n", 116 | " c.concept_id, \n", 117 | " c.record_date,\n", 118 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 119 | "FROM \n", 120 | " FilteredPatients fp\n", 121 | "JOIN (\n", 122 | " SELECT \n", 123 | " o.eid, \n", 124 | " o.condition_concept_id as concept_id, \n", 125 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 126 | " FROM \n", 127 | " omop_condition_occurrence o\n", 128 | " UNION ALL\n", 129 | " SELECT \n", 130 | " o.eid, \n", 131 | " o.procedure_concept_id as concept_id, \n", 132 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 133 | " FROM \n", 134 | " omop_procedure_occurrence o\n", 135 | " UNION ALL\n", 136 | " SELECT \n", 137 | " o.eid, \n", 138 | " o.drug_concept_id as concept_id, \n", 139 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 140 | " FROM \n", 141 | " omop_drug_exposure o\n", 142 | " UNION ALL\n", 143 | " SELECT \n", 144 | " o.eid, \n", 145 | " 
o.observation_concept_id as concept_id, \n", 146 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 147 | " FROM \n", 148 | " omop_observation o\n", 149 | " UNION ALL\n", 150 | " SELECT \n", 151 | " o.eid, \n", 152 | " o.measurement_concept_id as concept_id, \n", 153 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 154 | " FROM \n", 155 | " omop_measurement o\n", 156 | ") c ON fp.eid = c.eid AND c.record_date <= fp.earliest_cond_date\n", 157 | "\n", 158 | "\"\"\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "tags": [] 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "%%time\n", 170 | "combined_query_results = combined_query.collect()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "tags": [] 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "%%time\n", 182 | "pdf = pd.DataFrame(combined_query_results, columns=[field.name for field in combined_query.schema.fields])" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "tags": [] 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "pdf['eid'].nunique()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "tags": [] 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "%%time\n", 205 | "# Convert 'record_date' to datetime format in Pandas\n", 206 | "pdf['record_date'] = pd.to_datetime(pdf['record_date'], format='%Y-%m-%d')\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "tags": [] 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "%%time\n", 218 | "max_dates=32\n", 219 | "\n", 220 | "# 1. Sort the DataFrame\n", 221 | "pdf = pdf.sort_values(by=['eid', 'record_date'], ascending=[True, False])\n", 222 | "\n", 223 | "# 2. 
Rank within each 'eid' group\n", 224 | "pdf['date_rank'] = pdf.groupby('eid')['record_date'].rank(method='dense', ascending=False)\n", 225 | "\n", 226 | "# 3. Filter based on rank\n", 227 | "filtered_pdf = pdf[pdf['date_rank'] <= max_dates]\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "%%time\n", 239 | "word_vectors = pd.read_csv('/mnt/project/PT_lc_word2vec.csv')\n", 240 | "word_vectors['vector'] = word_vectors['vector'].apply(ast.literal_eval)\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "tags": [] 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "word_vectors['word'] = word_vectors['word'].astype(str)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "tags": [] 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "filtered_pdf = filtered_pdf.merge(word_vectors, how='inner', left_on='concept_id', right_on='word').drop(['word','concept_id'],axis=1)\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "tags": [] 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "filtered_pdf.shape, filtered_pdf['eid'].nunique()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "tags": [] 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "indices = filtered_pdf.index" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "tags": [] 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "%%time\n", 296 | "embeddings_array = np.array(filtered_pdf['vector'].tolist(), dtype=np.float64)\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "tags": [] 304 | }, 305 | "outputs": [], 306 | 
"source": [ 307 | "embeddings_df = pd.DataFrame(embeddings_array, index=indices)\n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "tags": [] 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "%%time\n", 319 | "embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]\n", 320 | "\n", 321 | "# Join the new DataFrame with the original DataFrame\n", 322 | "embedded_codes = filtered_pdf.join(embeddings_df)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "tags": [] 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "%%time\n", 334 | "\n", 335 | "# 4. Define your aggregation expressions\n", 336 | "agg_funcs = {f'embedding_{i}': 'mean' for i in range(400)}\n", 337 | "\n", 338 | "# Apply aggregation with the defined expressions\n", 339 | "patient_day_embeddings_pd = embedded_codes.groupby(['eid', 'record_date']).agg(agg_funcs)\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "tags": [] 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "%%time\n", 351 | "patient_day_embeddings_pd.reset_index().to_csv('./patient_day_embeddings_PT_lc_LARGER.csv', header=True)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "%%bash\n", 363 | "dx upload patient_day_embeddings_PT_lc_LARGER.csv --path /" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": 
"text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.8.5" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 4 395 | } 396 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Samson Mataraso 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Onset of Labor/data/ool_EHR_features.csv: -------------------------------------------------------------------------------- 1 | mom_person_id 2 | 1000000 3 | 1000001 4 | 1000002 5 | 1000003 6 | 1000004 7 | 1000005 8 | 1000006 9 | 1000007 10 | 1000008 11 | 1000009 12 | 1000010 13 | 1000011 14 | 1000012 15 | 1000013 16 | 1000014 17 | 1000015 18 | 1000016 19 | 1000017 20 | 1000018 21 | 1000019 22 | -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_codes_with_obs.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_codes_with_obs.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_lengths_with_obs.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_lengths_with_obs.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 1000000_10000000_T1 3 | 1000000_10000000_T2 4 | 1000000_10000000_T3 5 | 1000001_10000001_T1 6 | 1000001_10000001_T2 7 | 1000001_10000001_T3 8 | 1000002_10000002_T1 9 | 1000002_10000002_T2 10 | 1000002_10000002_T3 11 | 1000003_10000003_T1 12 | 1000003_10000003_T2 13 | 1000003_10000003_T3 14 | 1000004_10000004_T1 15 | 1000004_10000004_T2 16 | 1000004_10000004_T3 17 | 1000005_10000005_T1 18 | 1000005_10000005_T2 19 | 1000005_10000005_T3 20 | 1000006_10000006_T1 21 | 1000006_10000006_T2 22 | 1000006_10000006_T3 23 | 1000007_10000007_T1 24 | 1000007_10000007_T2 25 | 1000007_10000007_T3 26 | 1000008_10000008_T1 27 | 1000008_10000008_T2 28 | 1000008_10000008_T3 29 | 1000009_10000009_T1 30 | 1000009_10000009_T2 31 | 1000009_10000009_T3 32 | 1000010_10000010_T1 33 | 1000010_10000010_T2 34 | 1000010_10000010_T3 35 | 1000011_10000011_T1 36 | 1000011_10000011_T2 37 | 1000011_10000011_T3 38 | 1000012_10000012_T1 39 | 1000012_10000012_T2 40 | 1000012_10000012_T3 41 | 1000013_10000013_T1 42 | 1000013_10000013_T2 43 | 1000013_10000013_T3 44 | 1000014_10000014_T1 45 | 1000014_10000014_T2 46 | 1000014_10000014_T3 47 | 1000015_10000015_T1 48 | 1000015_10000015_T2 49 | 1000015_10000015_T3 50 | 1000016_10000016_T1 51 | 1000016_10000016_T2 52 | 1000016_10000016_T3 53 | 1000017_10000017_T1 54 | 1000017_10000017_T2 55 | 1000017_10000017_T3 56 | 1000018_10000018_T1 57 | 1000018_10000018_T2 58 | 1000018_10000018_T3 59 | 1000019_10000019_T1 60 | 1000019_10000019_T2 61 | 1000019_10000019_T3 62 | 
-------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices_full_cohort_with_obs_fixed.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 0,1000021_10000021,0 3 | 1,1000022_10000022,1 4 | 2,1000023_10000023,2 5 | 3,1000026_10000026,3 6 | 4,1000027_10000027,4 7 | 5,1000031_10000031,5 8 | 6,1000032_10000032,6 9 | 7,1000033_10000033,7 10 | 8,1000035_10000035,8 11 | 9,1000037_10000037,9 12 | 10,1000039_10000039,10 13 | 11,1000045_10000045,11 14 | 12,1000047_10000047,12 15 | 13,1000048_10000048,13 16 | 14,1000050_10000050,14 17 | 15,1000052_10000052,15 18 | 16,1000053_10000053,16 19 | 17,1000054_10000054,17 20 | 18,1000055_10000055,18 21 | 19,1000056_10000056,19 22 | 20,1000058_10000058,20 23 | 21,1000059_10000059,21 24 | 22,1000061_10000061,22 25 | 23,1000062_10000062,23 26 | 24,1000066_10000066,24 27 | 25,1000068_10000068,25 28 | 26,1000069_10000069,26 29 | 27,1000070_10000070,27 30 | 28,1000072_10000072,28 31 | 29,1000073_10000073,29 32 | 30,1000074_10000074,30 33 | 31,1000076_10000076,31 34 | 32,1000083_10000083,32 35 | 33,1000085_10000085,33 36 | 34,1000086_10000086,34 37 | 35,1000087_10000087,35 38 | 36,1000094_10000094,36 39 | 37,1000098_10000098,37 40 | 38,1000100_10000100,38 41 | 39,1000103_10000103,39 42 | 40,1000104_10000104,40 43 | 41,1000107_10000107,41 44 | 42,1000108_10000108,42 45 | 43,1000109_10000109,43 46 | 44,1000112_10000112,44 47 | 45,1000116_10000116,45 48 | 46,1000117_10000117,46 49 | 47,1000118_10000118,47 50 | 48,1000119_10000119,48 51 | 49,1000123_10000123,49 52 | 50,1000124_10000124,50 53 | 51,1000125_10000125,51 54 | 52,1000126_10000126,52 55 | 53,1000135_10000135,53 56 | 54,1000139_10000139,54 57 | 55,1000142_10000142,55 58 | 56,1000144_10000144,56 59 | 57,1000148_10000148,57 60 | 58,1000149_10000149,58 61 | 59,1000150_10000150,59 62 | 60,1000152_10000152,60 63 | 61,1000154_10000154,61 64 | 62,1000155_10000155,62 65 | 
63,1000158_10000158,63 66 | 64,1000159_10000159,64 67 | 65,1000162_10000162,65 68 | 66,1000165_10000165,66 69 | 67,1000166_10000166,67 70 | 68,1000168_10000168,68 71 | 69,1000169_10000169,69 72 | 70,1000171_10000171,70 73 | 71,1000172_10000172,71 74 | 72,1000174_10000174,72 75 | 73,1000176_10000176,73 76 | 74,1000178_10000178,74 77 | 75,1000182_10000182,75 78 | 76,1000185_10000185,76 79 | 77,1000187_10000187,77 80 | 78,1000188_10000188,78 81 | 79,1000191_10000191,79 82 | 80,1000192_10000192,80 83 | 81,1000193_10000193,81 84 | 82,1000198_10000198,82 85 | 83,1000202_10000202,83 86 | 84,1000204_10000204,84 87 | 85,1000205_10000205,85 88 | 86,1000207_10000207,86 89 | 87,1000209_10000209,87 90 | 88,1000210_10000210,88 91 | 89,1000212_10000212,89 92 | 90,1000214_10000214,90 93 | 91,1000215_10000215,91 94 | 92,1000217_10000217,92 95 | 93,1000218_10000218,93 96 | 94,1000220_10000220,94 97 | 95,1000221_10000221,95 98 | 96,1000223_10000223,96 99 | 97,1000224_10000224,97 100 | 98,1000225_10000225,98 101 | 99,1000226_10000226,99 102 | 100,1000227_10000227,100 103 | 101,1000229_10000229,101 104 | 102,1000230_10000230,102 105 | 103,1000232_10000232,103 106 | 104,1000234_10000234,104 107 | 105,1000237_10000237,105 108 | 106,1000240_10000240,106 109 | 107,1000241_10000241,107 110 | 108,1000242_10000242,108 111 | 109,1000243_10000243,109 112 | 110,1000245_10000245,110 113 | 111,1000246_10000246,111 114 | 112,1000249_10000249,112 115 | -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices_with_obs.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 0,1000000_10000000_T1,0 3 | 1,1000000_10000000_T2,1 4 | 2,1000000_10000000_T3,2 5 | 3,1000001_10000001_T1,3 6 | 4,1000001_10000001_T2,4 7 | 5,1000001_10000001_T3,5 8 | 6,1000002_10000002_T1,6 9 | 7,1000002_10000002_T2,7 10 | 8,1000002_10000002_T3,8 11 | 9,1000003_10000003_T1,9 12 | 10,1000003_10000003_T2,10 13 | 
11,1000003_10000003_T3,11 14 | 12,1000004_10000004_T1,12 15 | 13,1000004_10000004_T2,13 16 | 14,1000004_10000004_T3,14 17 | 15,1000005_10000005_T1,15 18 | 16,1000005_10000005_T2,16 19 | 17,1000005_10000005_T3,17 20 | 18,1000006_10000006_T1,18 21 | 19,1000006_10000006_T2,19 22 | 20,1000006_10000006_T3,20 23 | 21,1000007_10000007_T1,21 24 | 22,1000007_10000007_T2,22 25 | 23,1000007_10000007_T3,23 26 | 24,1000008_10000008_T1,24 27 | 25,1000008_10000008_T2,25 28 | 26,1000008_10000008_T3,26 29 | 27,1000011_10000011_T1,27 30 | 28,1000011_10000011_T2,28 31 | 29,1000011_10000011_T3,29 32 | 30,1000013_10000013_T1,30 33 | 31,1000013_10000013_T2,31 34 | 32,1000013_10000013_T3,32 35 | 33,1000015_10000015_T1,33 36 | 34,1000015_10000015_T2,34 37 | 35,1000015_10000015_T3,35 38 | 36,1000017_10000017_T1,36 39 | 37,1000017_10000017_T2,37 40 | 38,1000017_10000017_T3,38 41 | 39,1000018_10000018_T1,39 42 | 40,1000018_10000018_T2,40 43 | 41,1000018_10000018_T3,41 44 | -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices_with_obs_word2vec_from_ool.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 0,1000000_10000000_T1,0 3 | 1,1000000_10000000_T2,1 4 | 2,1000000_10000000_T3,2 5 | 3,1000001_10000001_T1,3 6 | 4,1000001_10000001_T2,4 7 | 5,1000001_10000001_T3,5 8 | 6,1000002_10000002_T1,6 9 | 7,1000002_10000002_T2,7 10 | 8,1000002_10000002_T3,8 11 | 9,1000003_10000003_T1,9 12 | 10,1000003_10000003_T2,10 13 | 11,1000003_10000003_T3,11 14 | 12,1000004_10000004_T1,12 15 | 13,1000004_10000004_T2,13 16 | 14,1000004_10000004_T3,14 17 | 15,1000005_10000005_T1,15 18 | 16,1000005_10000005_T2,16 19 | 17,1000005_10000005_T3,17 20 | 18,1000006_10000006_T1,18 21 | 19,1000006_10000006_T2,19 22 | 20,1000006_10000006_T3,20 23 | 21,1000007_10000007_T1,21 24 | 22,1000007_10000007_T2,22 25 | 23,1000007_10000007_T3,23 26 | 24,1000008_10000008_T1,24 27 | 25,1000008_10000008_T2,25 28 | 
26,1000008_10000008_T3,26 29 | 27,1000011_10000011_T1,27 30 | 28,1000011_10000011_T2,28 31 | 29,1000011_10000011_T3,29 32 | 30,1000013_10000013_T1,30 33 | 31,1000013_10000013_T2,31 34 | 32,1000013_10000013_T3,32 35 | 33,1000015_10000015_T1,33 36 | 34,1000015_10000015_T2,34 37 | 35,1000015_10000015_T3,35 38 | 36,1000017_10000017_T1,36 39 | 37,1000017_10000017_T2,37 40 | 38,1000017_10000017_T3,38 41 | 39,1000018_10000018_T1,39 42 | 40,1000018_10000018_T2,40 43 | 41,1000018_10000018_T3,41 44 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_conditions.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,condition_concept_id,condition_start_DATETIME,child_birth_date 2 | 1000000,10000000,10,2024-04-12 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000000,10000000,3,2023-11-25 19:23:34.042570,2024-05-30 19:23:34.042570 4 | 1000000,10000000,4,2024-04-06 19:23:34.042570,2024-05-30 19:23:34.042570 5 | 1000000,10000000,5,2024-01-21 19:23:34.042570,2024-05-30 19:23:34.042570 6 | 1000001,10000001,5,2023-08-02 19:23:34.042644,2023-12-14 19:23:34.042644 7 | 1000001,10000001,2,2023-07-01 19:23:34.042644,2023-12-14 19:23:34.042644 8 | 1000001,10000001,2,2023-09-17 19:23:34.042644,2023-12-14 19:23:34.042644 9 | 1000002,10000002,10,2023-09-05 19:23:34.042656,2024-05-26 19:23:34.042656 10 | 1000002,10000002,2,2024-04-04 19:23:34.042656,2024-05-26 19:23:34.042656 11 | 1000003,10000003,7,2024-03-31 19:23:34.042666,2024-06-30 19:23:34.042666 12 | 1000003,10000003,3,2023-10-11 19:23:34.042666,2024-06-30 19:23:34.042666 13 | 1000003,10000003,1,2023-12-08 19:23:34.042666,2024-06-30 19:23:34.042666 14 | 1000003,10000003,8,2024-05-12 19:23:34.042666,2024-06-30 19:23:34.042666 15 | 1000004,10000004,6,2024-03-04 19:23:34.042674,2024-03-05 19:23:34.042674 16 | 1000004,10000004,10,2024-01-12 19:23:34.042674,2024-03-05 19:23:34.042674 17 | 
1000004,10000004,6,2023-06-20 19:23:34.042674,2024-03-05 19:23:34.042674 18 | 1000004,10000004,2,2023-08-28 19:23:34.042674,2024-03-05 19:23:34.042674 19 | 1000005,10000005,2,2024-07-08 19:23:34.042682,2024-08-20 19:23:34.042682 20 | 1000005,10000005,4,2024-02-01 19:23:34.042682,2024-08-20 19:23:34.042682 21 | 1000006,10000006,8,2023-09-03 19:23:34.042690,2024-05-30 19:23:34.042690 22 | 1000006,10000006,9,2023-10-29 19:23:34.042690,2024-05-30 19:23:34.042690 23 | 1000007,10000007,2,2023-10-12 19:23:34.042698,2024-05-11 19:23:34.042698 24 | 1000007,10000007,8,2024-03-20 19:23:34.042698,2024-05-11 19:23:34.042698 25 | 1000008,10000008,9,2023-07-07 19:23:34.042706,2024-02-08 19:23:34.042706 26 | 1000008,10000008,1,2023-09-05 19:23:34.042706,2024-02-08 19:23:34.042706 27 | 1000009,10000009,8,2024-06-06 19:23:34.042713,2024-06-14 19:23:34.042713 28 | 1000010,10000010,8,2024-01-19 19:23:34.042720,2024-06-02 19:23:34.042720 29 | 1000011,10000011,3,2023-11-25 19:23:34.042728,2024-04-11 19:23:34.042728 30 | 1000011,10000011,8,2024-01-22 19:23:34.042728,2024-04-11 19:23:34.042728 31 | 1000011,10000011,3,2023-11-01 19:23:34.042728,2024-04-11 19:23:34.042728 32 | 1000012,10000012,10,2023-08-16 19:23:34.042736,2024-05-02 19:23:34.042736 33 | 1000013,10000013,7,2024-03-04 19:23:34.042743,2024-04-13 19:23:34.042743 34 | 1000013,10000013,8,2023-09-26 19:23:34.042743,2024-04-13 19:23:34.042743 35 | 1000013,10000013,2,2023-07-21 19:23:34.042743,2024-04-13 19:23:34.042743 36 | 1000014,10000014,7,2023-11-10 19:23:34.042750,2023-12-27 19:23:34.042750 37 | 1000015,10000015,8,2024-01-01 19:23:34.042758,2024-03-02 19:23:34.042758 38 | 1000015,10000015,8,2023-11-25 19:23:34.042758,2024-03-02 19:23:34.042758 39 | 1000015,10000015,3,2023-08-02 19:23:34.042758,2024-03-02 19:23:34.042758 40 | 1000016,10000016,5,2023-04-26 19:23:34.042765,2023-12-08 19:23:34.042765 41 | 1000017,10000017,1,2023-11-24 19:23:34.042773,2024-04-02 19:23:34.042773 42 | 1000017,10000017,10,2024-03-29 
19:23:34.042773,2024-04-02 19:23:34.042773 43 | 1000017,10000017,7,2023-07-23 19:23:34.042773,2024-04-02 19:23:34.042773 44 | 1000018,10000018,10,2024-01-26 19:23:34.042780,2024-08-19 19:23:34.042780 45 | 1000018,10000018,3,2024-07-09 19:23:34.042780,2024-08-19 19:23:34.042780 46 | 1000018,10000018,7,2024-06-18 19:23:34.042780,2024-08-19 19:23:34.042780 47 | 1000019,10000019,4,2023-11-11 19:23:34.042787,2024-01-01 19:23:34.042787 48 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_drugs.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,drug_concept_id,drug_exposure_start_DATETIME,child_birth_date 2 | 1000000,10000000,4,2023-10-13 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000000,10000000,7,2024-05-18 19:23:34.042570,2024-05-30 19:23:34.042570 4 | 1000001,10000001,6,2023-04-16 19:23:34.042644,2023-12-14 19:23:34.042644 5 | 1000001,10000001,10,2023-10-10 19:23:34.042644,2023-12-14 19:23:34.042644 6 | 1000001,10000001,9,2023-10-31 19:23:34.042644,2023-12-14 19:23:34.042644 7 | 1000002,10000002,4,2024-01-14 19:23:34.042656,2024-05-26 19:23:34.042656 8 | 1000003,10000003,10,2024-02-24 19:23:34.042666,2024-06-30 19:23:34.042666 9 | 1000003,10000003,1,2023-11-13 19:23:34.042666,2024-06-30 19:23:34.042666 10 | 1000004,10000004,4,2023-11-06 19:23:34.042674,2024-03-05 19:23:34.042674 11 | 1000005,10000005,3,2024-01-01 19:23:34.042682,2024-08-20 19:23:34.042682 12 | 1000006,10000006,8,2023-11-15 19:23:34.042690,2024-05-30 19:23:34.042690 13 | 1000006,10000006,5,2024-01-15 19:23:34.042690,2024-05-30 19:23:34.042690 14 | 1000006,10000006,3,2023-10-19 19:23:34.042690,2024-05-30 19:23:34.042690 15 | 1000007,10000007,8,2023-11-22 19:23:34.042698,2024-05-11 19:23:34.042698 16 | 1000007,10000007,8,2023-12-04 19:23:34.042698,2024-05-11 19:23:34.042698 17 | 1000008,10000008,1,2023-08-13 19:23:34.042706,2024-02-08 19:23:34.042706 18 | 
1000009,10000009,7,2024-04-24 19:23:34.042713,2024-06-14 19:23:34.042713 19 | 1000009,10000009,5,2024-02-23 19:23:34.042713,2024-06-14 19:23:34.042713 20 | 1000010,10000010,1,2024-03-14 19:23:34.042720,2024-06-02 19:23:34.042720 21 | 1000011,10000011,6,2023-12-04 19:23:34.042728,2024-04-11 19:23:34.042728 22 | 1000011,10000011,1,2023-09-01 19:23:34.042728,2024-04-11 19:23:34.042728 23 | 1000012,10000012,2,2023-12-29 19:23:34.042736,2024-05-02 19:23:34.042736 24 | 1000013,10000013,10,2023-10-25 19:23:34.042743,2024-04-13 19:23:34.042743 25 | 1000014,10000014,7,2023-06-13 19:23:34.042750,2023-12-27 19:23:34.042750 26 | 1000014,10000014,8,2023-06-08 19:23:34.042750,2023-12-27 19:23:34.042750 27 | 1000014,10000014,1,2023-08-27 19:23:34.042750,2023-12-27 19:23:34.042750 28 | 1000014,10000014,6,2023-04-17 19:23:34.042750,2023-12-27 19:23:34.042750 29 | 1000015,10000015,6,2023-11-26 19:23:34.042758,2024-03-02 19:23:34.042758 30 | 1000015,10000015,1,2023-07-07 19:23:34.042758,2024-03-02 19:23:34.042758 31 | 1000015,10000015,6,2023-08-15 19:23:34.042758,2024-03-02 19:23:34.042758 32 | 1000015,10000015,4,2023-06-18 19:23:34.042758,2024-03-02 19:23:34.042758 33 | 1000016,10000016,4,2023-07-15 19:23:34.042765,2023-12-08 19:23:34.042765 34 | 1000016,10000016,4,2023-05-24 19:23:34.042765,2023-12-08 19:23:34.042765 35 | 1000016,10000016,7,2023-08-03 19:23:34.042765,2023-12-08 19:23:34.042765 36 | 1000016,10000016,9,2023-08-02 19:23:34.042765,2023-12-08 19:23:34.042765 37 | 1000017,10000017,10,2023-11-04 19:23:34.042773,2024-04-02 19:23:34.042773 38 | 1000018,10000018,10,2023-12-01 19:23:34.042780,2024-08-19 19:23:34.042780 39 | 1000018,10000018,9,2024-04-30 19:23:34.042780,2024-08-19 19:23:34.042780 40 | 1000018,10000018,1,2024-06-21 19:23:34.042780,2024-08-19 19:23:34.042780 41 | 1000019,10000019,5,2023-04-23 19:23:34.042787,2024-01-01 19:23:34.042787 42 | 1000019,10000019,7,2023-11-26 19:23:34.042787,2024-01-01 19:23:34.042787 43 | 
-------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_measurements.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,measurement_concept_id,measurement_DATETIME,child_birth_date,value_as_number 2 | 1000000,10000000,9,2024-02-21 19:23:34.042570,2024-05-30 19:23:34.042570,18.81211597237613 3 | 1000001,10000001,9,2023-09-30 19:23:34.042644,2023-12-14 19:23:34.042644,46.36984049399822 4 | 1000001,10000001,4,2023-05-23 19:23:34.042644,2023-12-14 19:23:34.042644,35.33522280260528 5 | 1000001,10000001,5,2023-07-24 19:23:34.042644,2023-12-14 19:23:34.042644,58.365611185087204 6 | 1000002,10000002,9,2024-02-18 19:23:34.042656,2024-05-26 19:23:34.042656,7.773463696498483 7 | 1000003,10000003,7,2024-05-09 19:23:34.042666,2024-06-30 19:23:34.042666,97.43948076661665 8 | 1000003,10000003,9,2024-02-08 19:23:34.042666,2024-06-30 19:23:34.042666,98.62107444796028 9 | 1000003,10000003,5,2024-05-23 19:23:34.042666,2024-06-30 19:23:34.042666,69.81617140197451 10 | 1000003,10000003,10,2024-05-20 19:23:34.042666,2024-06-30 19:23:34.042666,53.60963663441204 11 | 1000004,10000004,10,2023-07-27 19:23:34.042674,2024-03-05 19:23:34.042674,30.952761628632775 12 | 1000004,10000004,3,2023-10-25 19:23:34.042674,2024-03-05 19:23:34.042674,81.37950197069486 13 | 1000004,10000004,2,2023-08-04 19:23:34.042674,2024-03-05 19:23:34.042674,68.47311725538793 14 | 1000005,10000005,9,2024-06-06 19:23:34.042682,2024-08-20 19:23:34.042682,16.26169393448913 15 | 1000005,10000005,10,2024-06-08 19:23:34.042682,2024-08-20 19:23:34.042682,91.09271844938425 16 | 1000005,10000005,6,2023-12-12 19:23:34.042682,2024-08-20 19:23:34.042682,82.2537242923169 17 | 1000006,10000006,8,2023-12-09 19:23:34.042690,2024-05-30 19:23:34.042690,94.9799913291924 18 | 1000006,10000006,9,2023-12-13 19:23:34.042690,2024-05-30 19:23:34.042690,72.571950838836 19 | 1000006,10000006,2,2024-01-30 
19:23:34.042690,2024-05-30 19:23:34.042690,61.34151959357899 20 | 1000007,10000007,5,2024-04-13 19:23:34.042698,2024-05-11 19:23:34.042698,41.82430362906189 21 | 1000008,10000008,1,2023-09-26 19:23:34.042706,2024-02-08 19:23:34.042706,93.27284833540132 22 | 1000008,10000008,7,2023-09-17 19:23:34.042706,2024-02-08 19:23:34.042706,86.60638895004084 23 | 1000009,10000009,4,2024-02-04 19:23:34.042713,2024-06-14 19:23:34.042713,4.521867010618941 24 | 1000010,10000010,3,2023-09-25 19:23:34.042720,2024-06-02 19:23:34.042720,2.6366974497252005 25 | 1000010,10000010,7,2023-12-04 19:23:34.042720,2024-06-02 19:23:34.042720,37.64633668780496 26 | 1000011,10000011,10,2023-09-23 19:23:34.042728,2024-04-11 19:23:34.042728,81.0553330781833 27 | 1000011,10000011,7,2024-03-24 19:23:34.042728,2024-04-11 19:23:34.042728,98.72761293149445 28 | 1000011,10000011,3,2023-12-08 19:23:34.042728,2024-04-11 19:23:34.042728,15.041689110352818 29 | 1000012,10000012,2,2023-12-14 19:23:34.042736,2024-05-02 19:23:34.042736,59.41307153521351 30 | 1000013,10000013,10,2023-12-10 19:23:34.042743,2024-04-13 19:23:34.042743,38.08908566310215 31 | 1000013,10000013,8,2023-11-18 19:23:34.042743,2024-04-13 19:23:34.042743,96.99143978146031 32 | 1000013,10000013,3,2024-04-13 19:23:34.042743,2024-04-13 19:23:34.042743,84.21189231357087 33 | 1000013,10000013,9,2023-10-07 19:23:34.042743,2024-04-13 19:23:34.042743,83.83287047111378 34 | 1000014,10000014,6,2023-09-02 19:23:34.042750,2023-12-27 19:23:34.042750,46.86931597949703 35 | 1000014,10000014,5,2023-10-31 19:23:34.042750,2023-12-27 19:23:34.042750,41.48195023376652 36 | 1000014,10000014,6,2023-10-09 19:23:34.042750,2023-12-27 19:23:34.042750,27.340707193070624 37 | 1000015,10000015,5,2023-08-13 19:23:34.042758,2024-03-02 19:23:34.042758,5.6375496650927115 38 | 1000016,10000016,3,2023-04-26 19:23:34.042765,2023-12-08 19:23:34.042765,86.47223762550531 39 | 1000016,10000016,9,2023-11-19 19:23:34.042765,2023-12-08 19:23:34.042765,81.29010091300776 40 | 
1000016,10000016,9,2023-10-23 19:23:34.042765,2023-12-08 19:23:34.042765,99.97176732861305 41 | 1000016,10000016,1,2023-11-25 19:23:34.042765,2023-12-08 19:23:34.042765,99.66368370739053 42 | 1000017,10000017,6,2024-02-09 19:23:34.042773,2024-04-02 19:23:34.042773,55.543170560262745 43 | 1000018,10000018,7,2024-03-29 19:23:34.042780,2024-08-19 19:23:34.042780,76.89874151805105 44 | 1000018,10000018,10,2024-08-08 19:23:34.042780,2024-08-19 19:23:34.042780,94.47657298824281 45 | 1000018,10000018,8,2024-01-09 19:23:34.042780,2024-08-19 19:23:34.042780,84.96473906774115 46 | 1000019,10000019,5,2023-07-30 19:23:34.042787,2024-01-01 19:23:34.042787,24.734810174319765 47 | 1000019,10000019,10,2023-12-25 19:23:34.042787,2024-01-01 19:23:34.042787,45.05441353100935 48 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_observations.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,observation_concept_id,observation_DATETIME,child_birth_date 2 | 1000000,10000000,3,2024-02-01 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000001,10000001,7,2023-10-18 19:23:34.042644,2023-12-14 19:23:34.042644 4 | 1000001,10000001,10,2023-04-30 19:23:34.042644,2023-12-14 19:23:34.042644 5 | 1000001,10000001,7,2023-08-20 19:23:34.042644,2023-12-14 19:23:34.042644 6 | 1000001,10000001,9,2023-08-10 19:23:34.042644,2023-12-14 19:23:34.042644 7 | 1000002,10000002,10,2024-01-19 19:23:34.042656,2024-05-26 19:23:34.042656 8 | 1000002,10000002,1,2024-01-26 19:23:34.042656,2024-05-26 19:23:34.042656 9 | 1000002,10000002,2,2023-10-01 19:23:34.042656,2024-05-26 19:23:34.042656 10 | 1000002,10000002,6,2024-02-21 19:23:34.042656,2024-05-26 19:23:34.042656 11 | 1000003,10000003,8,2024-04-03 19:23:34.042666,2024-06-30 19:23:34.042666 12 | 1000003,10000003,5,2023-11-07 19:23:34.042666,2024-06-30 19:23:34.042666 13 | 1000003,10000003,7,2023-10-03 
19:23:34.042666,2024-06-30 19:23:34.042666 14 | 1000003,10000003,5,2023-12-22 19:23:34.042666,2024-06-30 19:23:34.042666 15 | 1000004,10000004,3,2023-07-03 19:23:34.042674,2024-03-05 19:23:34.042674 16 | 1000004,10000004,10,2023-12-21 19:23:34.042674,2024-03-05 19:23:34.042674 17 | 1000005,10000005,9,2024-05-27 19:23:34.042682,2024-08-20 19:23:34.042682 18 | 1000005,10000005,5,2024-01-14 19:23:34.042682,2024-08-20 19:23:34.042682 19 | 1000005,10000005,1,2024-07-05 19:23:34.042682,2024-08-20 19:23:34.042682 20 | 1000005,10000005,4,2024-05-19 19:23:34.042682,2024-08-20 19:23:34.042682 21 | 1000006,10000006,10,2023-11-09 19:23:34.042690,2024-05-30 19:23:34.042690 22 | 1000007,10000007,4,2024-04-03 19:23:34.042698,2024-05-11 19:23:34.042698 23 | 1000007,10000007,5,2023-09-01 19:23:34.042698,2024-05-11 19:23:34.042698 24 | 1000008,10000008,10,2023-06-04 19:23:34.042706,2024-02-08 19:23:34.042706 25 | 1000008,10000008,5,2023-06-08 19:23:34.042706,2024-02-08 19:23:34.042706 26 | 1000008,10000008,2,2023-11-01 19:23:34.042706,2024-02-08 19:23:34.042706 27 | 1000009,10000009,10,2023-11-05 19:23:34.042713,2024-06-14 19:23:34.042713 28 | 1000009,10000009,3,2023-10-09 19:23:34.042713,2024-06-14 19:23:34.042713 29 | 1000009,10000009,1,2024-03-11 19:23:34.042713,2024-06-14 19:23:34.042713 30 | 1000009,10000009,8,2024-06-11 19:23:34.042713,2024-06-14 19:23:34.042713 31 | 1000010,10000010,2,2023-09-30 19:23:34.042720,2024-06-02 19:23:34.042720 32 | 1000010,10000010,4,2023-12-26 19:23:34.042720,2024-06-02 19:23:34.042720 33 | 1000010,10000010,2,2024-01-03 19:23:34.042720,2024-06-02 19:23:34.042720 34 | 1000010,10000010,1,2023-12-23 19:23:34.042720,2024-06-02 19:23:34.042720 35 | 1000011,10000011,5,2023-10-16 19:23:34.042728,2024-04-11 19:23:34.042728 36 | 1000012,10000012,1,2024-02-28 19:23:34.042736,2024-05-02 19:23:34.042736 37 | 1000012,10000012,10,2023-11-17 19:23:34.042736,2024-05-02 19:23:34.042736 38 | 1000012,10000012,2,2024-03-21 19:23:34.042736,2024-05-02 19:23:34.042736 
39 | 1000012,10000012,3,2023-08-24 19:23:34.042736,2024-05-02 19:23:34.042736 40 | 1000013,10000013,7,2024-03-10 19:23:34.042743,2024-04-13 19:23:34.042743 41 | 1000013,10000013,8,2024-01-15 19:23:34.042743,2024-04-13 19:23:34.042743 42 | 1000014,10000014,2,2023-09-29 19:23:34.042750,2023-12-27 19:23:34.042750 43 | 1000015,10000015,7,2023-11-19 19:23:34.042758,2024-03-02 19:23:34.042758 44 | 1000015,10000015,10,2023-08-20 19:23:34.042758,2024-03-02 19:23:34.042758 45 | 1000015,10000015,8,2023-11-10 19:23:34.042758,2024-03-02 19:23:34.042758 46 | 1000016,10000016,5,2023-05-06 19:23:34.042765,2023-12-08 19:23:34.042765 47 | 1000016,10000016,4,2023-04-04 19:23:34.042765,2023-12-08 19:23:34.042765 48 | 1000016,10000016,3,2023-05-31 19:23:34.042765,2023-12-08 19:23:34.042765 49 | 1000016,10000016,3,2023-06-15 19:23:34.042765,2023-12-08 19:23:34.042765 50 | 1000017,10000017,2,2024-02-27 19:23:34.042773,2024-04-02 19:23:34.042773 51 | 1000017,10000017,8,2023-12-29 19:23:34.042773,2024-04-02 19:23:34.042773 52 | 1000017,10000017,5,2023-11-04 19:23:34.042773,2024-04-02 19:23:34.042773 53 | 1000017,10000017,1,2023-07-10 19:23:34.042773,2024-04-02 19:23:34.042773 54 | 1000018,10000018,7,2024-08-07 19:23:34.042780,2024-08-19 19:23:34.042780 55 | 1000018,10000018,3,2024-01-16 19:23:34.042780,2024-08-19 19:23:34.042780 56 | 1000018,10000018,1,2023-11-21 19:23:34.042780,2024-08-19 19:23:34.042780 57 | 1000019,10000019,9,2023-10-10 19:23:34.042787,2024-01-01 19:23:34.042787 58 | 1000019,10000019,3,2023-06-29 19:23:34.042787,2024-01-01 19:23:34.042787 59 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_procedures.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,procedure_concept_id,procedure_DATETIME,child_birth_date 2 | 1000000,10000000,3,2023-10-11 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000001,10000001,3,2023-05-21 
19:23:34.042644,2023-12-14 19:23:34.042644 4 | 1000001,10000001,6,2023-07-16 19:23:34.042644,2023-12-14 19:23:34.042644 5 | 1000001,10000001,1,2023-08-17 19:23:34.042644,2023-12-14 19:23:34.042644 6 | 1000002,10000002,1,2024-02-01 19:23:34.042656,2024-05-26 19:23:34.042656 7 | 1000002,10000002,4,2024-02-13 19:23:34.042656,2024-05-26 19:23:34.042656 8 | 1000002,10000002,3,2023-09-16 19:23:34.042656,2024-05-26 19:23:34.042656 9 | 1000002,10000002,9,2024-02-05 19:23:34.042656,2024-05-26 19:23:34.042656 10 | 1000003,10000003,2,2024-01-30 19:23:34.042666,2024-06-30 19:23:34.042666 11 | 1000003,10000003,6,2023-12-20 19:23:34.042666,2024-06-30 19:23:34.042666 12 | 1000003,10000003,9,2024-01-20 19:23:34.042666,2024-06-30 19:23:34.042666 13 | 1000004,10000004,4,2023-09-27 19:23:34.042674,2024-03-05 19:23:34.042674 14 | 1000004,10000004,4,2023-06-03 19:23:34.042674,2024-03-05 19:23:34.042674 15 | 1000004,10000004,3,2023-10-23 19:23:34.042674,2024-03-05 19:23:34.042674 16 | 1000004,10000004,1,2023-10-30 19:23:34.042674,2024-03-05 19:23:34.042674 17 | 1000005,10000005,6,2024-07-30 19:23:34.042682,2024-08-20 19:23:34.042682 18 | 1000005,10000005,3,2024-01-04 19:23:34.042682,2024-08-20 19:23:34.042682 19 | 1000005,10000005,2,2023-12-01 19:23:34.042682,2024-08-20 19:23:34.042682 20 | 1000005,10000005,1,2023-11-18 19:23:34.042682,2024-08-20 19:23:34.042682 21 | 1000006,10000006,4,2024-01-05 19:23:34.042690,2024-05-30 19:23:34.042690 22 | 1000006,10000006,1,2024-04-26 19:23:34.042690,2024-05-30 19:23:34.042690 23 | 1000006,10000006,5,2024-05-14 19:23:34.042690,2024-05-30 19:23:34.042690 24 | 1000006,10000006,3,2024-05-25 19:23:34.042690,2024-05-30 19:23:34.042690 25 | 1000007,10000007,5,2023-09-22 19:23:34.042698,2024-05-11 19:23:34.042698 26 | 1000007,10000007,1,2023-08-06 19:23:34.042698,2024-05-11 19:23:34.042698 27 | 1000007,10000007,3,2023-11-13 19:23:34.042698,2024-05-11 19:23:34.042698 28 | 1000007,10000007,1,2023-11-06 19:23:34.042698,2024-05-11 19:23:34.042698 29 | 
1000008,10000008,5,2023-08-02 19:23:34.042706,2024-02-08 19:23:34.042706 30 | 1000008,10000008,3,2023-09-01 19:23:34.042706,2024-02-08 19:23:34.042706 31 | 1000008,10000008,2,2023-10-04 19:23:34.042706,2024-02-08 19:23:34.042706 32 | 1000008,10000008,6,2023-07-01 19:23:34.042706,2024-02-08 19:23:34.042706 33 | 1000009,10000009,8,2023-12-24 19:23:34.042713,2024-06-14 19:23:34.042713 34 | 1000009,10000009,6,2024-02-22 19:23:34.042713,2024-06-14 19:23:34.042713 35 | 1000010,10000010,10,2024-01-28 19:23:34.042720,2024-06-02 19:23:34.042720 36 | 1000010,10000010,8,2023-09-04 19:23:34.042720,2024-06-02 19:23:34.042720 37 | 1000010,10000010,7,2023-10-01 19:23:34.042720,2024-06-02 19:23:34.042720 38 | 1000011,10000011,7,2023-10-25 19:23:34.042728,2024-04-11 19:23:34.042728 39 | 1000012,10000012,10,2024-04-20 19:23:34.042736,2024-05-02 19:23:34.042736 40 | 1000012,10000012,4,2024-03-28 19:23:34.042736,2024-05-02 19:23:34.042736 41 | 1000013,10000013,3,2023-07-25 19:23:34.042743,2024-04-13 19:23:34.042743 42 | 1000014,10000014,2,2023-08-07 19:23:34.042750,2023-12-27 19:23:34.042750 43 | 1000014,10000014,6,2023-05-27 19:23:34.042750,2023-12-27 19:23:34.042750 44 | 1000014,10000014,9,2023-11-07 19:23:34.042750,2023-12-27 19:23:34.042750 45 | 1000015,10000015,6,2023-12-31 19:23:34.042758,2024-03-02 19:23:34.042758 46 | 1000015,10000015,6,2024-01-05 19:23:34.042758,2024-03-02 19:23:34.042758 47 | 1000016,10000016,10,2023-06-12 19:23:34.042765,2023-12-08 19:23:34.042765 48 | 1000017,10000017,6,2023-07-07 19:23:34.042773,2024-04-02 19:23:34.042773 49 | 1000017,10000017,1,2023-12-24 19:23:34.042773,2024-04-02 19:23:34.042773 50 | 1000018,10000018,5,2024-06-28 19:23:34.042780,2024-08-19 19:23:34.042780 51 | 1000018,10000018,4,2024-05-09 19:23:34.042780,2024-08-19 19:23:34.042780 52 | 1000018,10000018,4,2023-12-02 19:23:34.042780,2024-08-19 19:23:34.042780 53 | 1000018,10000018,3,2024-07-04 19:23:34.042780,2024-08-19 19:23:34.042780 54 | 1000019,10000019,2,2023-08-11 
19:23:34.042787,2024-01-01 19:23:34.042787 55 | 1000019,10000019,10,2023-08-31 19:23:34.042787,2024-01-01 19:23:34.042787 56 | 1000019,10000019,3,2023-07-28 19:23:34.042787,2024-01-01 19:23:34.042787 57 | -------------------------------------------------------------------------------- /Onset of Labor/process_EHR_data_full_PT_cohort.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "87e061dd", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm.notebook import tqdm\n", 13 | "from scipy.stats import pearsonr\n", 14 | "import gensim\n", 15 | "from gensim.models import Word2Vec\n", 16 | "from gensim.models.callbacks import CallbackAny2Vec\n", 17 | "import random\n", 18 | "import pickle\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import dask\n", 22 | "import dask.dataframe as dd\n", 23 | "import os" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "9faa9869", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "directories = [\n", 34 | " 'models',\n", 35 | " 'models/hyperparameters',\n", 36 | " 'models/predictive_models',\n", 37 | " 'results'\n", 38 | "]\n", 39 | "\n", 40 | "# Create each directory\n", 41 | "for directory in directories:\n", 42 | " os.makedirs(directory, exist_ok=True)\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "id": "e9d808da", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "CPU times: user 19.3 ms, sys: 4.52 ms, total: 23.8 ms\n", 56 | "Wall time: 22.1 ms\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "%%time\n", 62 | "#load data\n", 63 | "#the person_id, concept_id, and date columns are extracted from OMOP tables for people who delivered babies at 
Stanford\n", 64 | "conds = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_conditions.csv')\n", 65 | "conds = conds[conds['condition_concept_id'] != 0]\n", 66 | "drugs = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_drugs.csv')\n", 67 | "drugs = drugs[drugs['drug_concept_id'] != 0]\n", 68 | "procs = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_procedures.csv')\n", 69 | "procs = procs[procs['procedure_concept_id'] != 0]\n", 70 | "obs = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_observations.csv')\n", 71 | "obs = obs[obs['observation_concept_id'] != 0]\n", 72 | "\n", 73 | "conds['condition_start_DATETIME'] = pd.to_datetime(conds['condition_start_DATETIME'])\n", 74 | "procs['procedure_DATETIME'] = pd.to_datetime(procs['procedure_DATETIME'])\n", 75 | "drugs['drug_exposure_start_DATETIME'] = pd.to_datetime(drugs['drug_exposure_start_DATETIME'])\n", 76 | "obs['observation_DATETIME'] = pd.to_datetime(obs['observation_DATETIME'])\n", 77 | "\n", 78 | "conds['child_birth_date'] = pd.to_datetime(conds['child_birth_date'])\n", 79 | "procs['child_birth_date'] = pd.to_datetime(procs['child_birth_date'])\n", 80 | "drugs['child_birth_date'] = pd.to_datetime(drugs['child_birth_date'])\n", 81 | "obs['child_birth_date'] = pd.to_datetime(obs['child_birth_date'])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "id": "48776334", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "CPU times: user 8.55 ms, sys: 0 ns, total: 8.55 ms\n", 95 | "Wall time: 6.78 ms\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "%%time\n", 101 | "measurements = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_measurements.csv')\n", 102 | "measurements = measurements[~pd.isnull(measurements['value_as_number'])]\n", 103 | "measurements = measurements[measurements['measurement_concept_id'] != 0]\n", 104 | "measurements['measurement_DATETIME'] = 
pd.to_datetime(measurements['measurement_DATETIME'])\n", 105 | "measurements['child_birth_date'] = pd.to_datetime(measurements['child_birth_date'])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "id": "3cbd4572", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "#identify ppl who have omics data so they can be EXCLUDED from the pre-training cohort\n", 116 | "OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv')['mom_person_id'].values" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 5, 122 | "id": "348331a6", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "#remove EHR data from ppl who are in the omics cohort\n", 127 | "conds = conds[~conds['mom_person_id'].isin(OOL_cohort_omop)]\n", 128 | "procs = procs[~procs['mom_person_id'].isin(OOL_cohort_omop)]\n", 129 | "drugs = drugs[~drugs['mom_person_id'].isin(OOL_cohort_omop)]\n", 130 | "measurements = measurements[~measurements['mom_person_id'].isin(OOL_cohort_omop)]\n", 131 | "obs = obs[~obs['mom_person_id'].isin(OOL_cohort_omop)]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "id": "d4826244", 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "230" 144 | ] 145 | }, 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "conds['mom_person_id'].nunique()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "id": "a1cf6ad5", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "\n", 163 | "def filter_df(df, birth_time, time_col, time_range_days=280):\n", 164 | " \"\"\"\n", 165 | " A function to remove entries in a dataframe prior to time of birth. \n", 166 | " \n", 167 | " df: The dataframe to filter. 
Must contain a column called person_id with the OMOP ID of the mother\n", 168 | " birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME\n", 169 | " time_col: The index of the column with the date of the event in df\n", 170 | " time_range_days: keeps data from delivery up to time_range_days prior \n", 171 | " \n", 172 | " \"\"\"\n", 173 | " print('There were {} patients before filtering.'.format(len(df['person_id'].unique())))\n", 174 | " df = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')\n", 175 | " df['diff'] = df['birth_DATETIME']-df[time_col]\n", 176 | " new_df = df[(df['diff'].dt.days > 0) & (df['diff'].dt.days <= time_range_days)].drop('maternal_OMOP', axis=1)\n", 177 | " print('There were {} patients after filtering.'.format(len(new_df['person_id'].unique())))\n", 178 | " return new_df\n", 179 | "\n", 180 | "#function to help with appropriate sample / patient labeling \n", 181 | "def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):\n", 182 | " df = proteomics[['DOS','mom_person_id','child_person_id']].merge(input_df, how='left', on=['mom_person_id','child_person_id'])\n", 183 | " df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 184 | " df = df[df['delta'] < df['DOS']]\n", 185 | " df['sample_ID'] = df['mom_person_id'].astype(str)+'_'+df['child_person_id'].astype(str)\n", 186 | " return df\n", 187 | " " 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "id": "9def4745", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "#pick a random date during pregnancy for the women so we can create an artificial sampling time\n", 198 | "#we will use EHR data from beginning of pregnancy up until this sampling time for features\n", 199 | "#and number of days from this sampling time to birth as the \"time to onset of labor\" pre-training problem\n", 200 | "time_col_name = 
'condition_start_DATETIME'\n", 201 | "df = conds\n", 202 | "df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 203 | "min_ool = df[['mom_person_id','child_person_id','delta']].groupby(['mom_person_id','child_person_id']).min()\n", 204 | "min_ool.columns = ['min_delta']\n", 205 | "max_ool = df[['mom_person_id','child_person_id','delta']].groupby(['mom_person_id','child_person_id']).max()\n", 206 | "max_ool.columns = ['max_delta']\n", 207 | "sampling_df = pd.concat([min_ool, max_ool],axis=1)\n", 208 | "\n", 209 | "np.random.seed(3)\n", 210 | "sampling_df = sampling_df[((sampling_df['max_delta'] - sampling_df['min_delta']) >= 7) == True]\n", 211 | "sampling_df['DOS'] = ((sampling_df['max_delta'] - sampling_df['min_delta'] - 7) * np.random.rand(sampling_df.shape[0]) + sampling_df['min_delta']).astype(int)\n", 212 | "sampling_df = sampling_df[sampling_df['max_delta'] > -100]\n", 213 | "#sample from last 100 days of pregnancy to mirror design of omics study\n", 214 | "sampling_df['DOS'] = sampling_df.apply(lambda row: int(np.random.uniform(-100, row['max_delta'])), axis=1)\n", 215 | "sampling_df = sampling_df.reset_index()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 9, 221 | "id": "c260a873", 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "CPU times: user 30 ms, sys: 341 µs, total: 30.3 ms\n", 229 | "Wall time: 28.5 ms\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "%%time\n", 235 | "condition_features_EHR = generate_features_EHR_cohort(sampling_df, conds, 'condition_start_DATETIME','condition_concept_id','C')\n", 236 | "procedure_features_EHR = generate_features_EHR_cohort(sampling_df, procs, 'procedure_DATETIME','procedure_concept_id','P')\n", 237 | "drug_features_EHR = generate_features_EHR_cohort(sampling_df, drugs, 'drug_exposure_start_DATETIME','drug_concept_id','D')\n", 238 | "measurement_features_EHR = 
generate_features_EHR_cohort(sampling_df, measurements, 'measurement_DATETIME','measurement_concept_id','M')\n", 239 | "observation_features_EHR = generate_features_EHR_cohort(sampling_df, obs, 'observation_DATETIME','observation_concept_id','O')\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 10, 245 | "id": "7422c01b", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "class EpochLogger(CallbackAny2Vec):\n", 250 | " def __init__(self):\n", 251 | " self.epoch = 0\n", 252 | " self.losses = []\n", 253 | "\n", 254 | " def on_epoch_begin(self, model):\n", 255 | " print(f\"Starting epoch #{self.epoch}\")\n", 256 | "\n", 257 | " def on_epoch_end(self, model):\n", 258 | " print(f\"Finished epoch #{self.epoch}\")\n", 259 | " loss = model.get_latest_training_loss()\n", 260 | " self.losses.append(loss)\n", 261 | " print(self.losses)\n", 262 | " print(f' Loss: {loss}')\n", 263 | " self.epoch += 1" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "id": "7d6a36dd", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "CPU times: user 530 µs, sys: 3.41 ms, total: 3.94 ms\n", 277 | "Wall time: 2.19 ms\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "%%time\n", 283 | "#learn word2vec embeddings\n", 284 | "try:\n", 285 | " model = Word2Vec.load(\"./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n", 286 | "except:\n", 287 | " print('training new model!')\n", 288 | " epoch_logger = EpochLogger()\n", 289 | "\n", 290 | " word2vec_conds = condition_features_EHR[['sample_ID','condition_concept_id','condition_start_DATETIME']]\n", 291 | " word2vec_conds.columns = ['sample_ID','concept_id','ts']\n", 292 | "\n", 293 | " word2vec_procs = procedure_features_EHR[['sample_ID','procedure_concept_id','procedure_DATETIME']]\n", 294 | " word2vec_procs.columns = 
['sample_ID','concept_id','ts']\n", 295 | "\n", 296 | " word2vec_drug = drug_features_EHR[['sample_ID','drug_concept_id','drug_exposure_start_DATETIME']]\n", 297 | " word2vec_drug.columns = ['sample_ID','concept_id','ts']\n", 298 | " \n", 299 | " word2vec_mea = measurement_features_EHR[['sample_ID','measurement_concept_id','measurement_DATETIME']]\n", 300 | " word2vec_mea.columns = ['sample_ID','concept_id','ts']\n", 301 | " \n", 302 | " word2vec_obs = observation_features_EHR[['sample_ID','observation_concept_id','observation_DATETIME']]\n", 303 | " word2vec_obs.columns = ['sample_ID','concept_id','ts']\n", 304 | " \n", 305 | " word2vec_data = pd.concat([word2vec_conds, word2vec_procs, word2vec_drug,word2vec_mea, word2vec_obs],axis=0)\n", 306 | " word2vec_data['date'] = pd.to_datetime(word2vec_data['ts'])\n", 307 | " word2vec_data['date'] = word2vec_data['date'].dt.date\n", 308 | " word2vec_data = word2vec_data.drop('ts',axis=1)\n", 309 | " word2vec_data = word2vec_data[~pd.isnull(word2vec_data['concept_id'])]\n", 310 | " word2vec_data['concept_id'] = word2vec_data['concept_id'].astype(int)\n", 311 | " \n", 312 | " grouped_data = word2vec_data.groupby(['sample_ID', 'date'])\n", 313 | " sentences = []\n", 314 | " for _, group in tqdm(grouped_data):\n", 315 | " codes = group['concept_id'].tolist()\n", 316 | " random.shuffle(codes)\n", 317 | " sentences.append(codes)\n", 318 | " \n", 319 | " print('starting training')\n", 320 | " model = Word2Vec(sentences, vector_size=400, window=100, min_count=5, workers=64)\n", 321 | " model.train(sentences, total_examples=len(sentences), epochs=5, callbacks=[epoch_logger])\n", 322 | " model.save(\"./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 12, 328 | "id": "32b51bd1", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "code_to_embedding = {code: model.wv[code] for code in 
model.wv.index_to_key}" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 13, 338 | "id": "92deadf0", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "#replace EHR data with learned embeddings\n", 343 | "embedded_conds = condition_features_EHR[(condition_features_EHR['condition_concept_id'] != 0)]\n", 344 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['condition_concept_id'])]\n", 345 | "embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]\n" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "id": "422a0471", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "embedded_procs = procedure_features_EHR[(procedure_features_EHR['procedure_concept_id'] != 0)]\n", 356 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['procedure_concept_id'])]\n", 357 | "embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 15, 363 | "id": "47d16d21", 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "embedded_drugs = drug_features_EHR[(drug_features_EHR['drug_concept_id'] != 0)]\n", 368 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['drug_concept_id'])]\n", 369 | "embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 16, 375 | "id": "7942c2ef", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "embedded_measurements = measurement_features_EHR[(measurement_features_EHR['measurement_concept_id'] != 0)]\n", 380 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['measurement_concept_id'])]\n", 381 | "embedded_measurements['embedding'] = [code_to_embedding.get(code) for 
code in embedded_measurements['measurement_concept_id']]\n" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 17, 387 | "id": "08c61942", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "embedded_obs = observation_features_EHR[(observation_features_EHR['observation_concept_id'] != 0)]\n", 392 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['observation_concept_id'])]\n", 393 | "embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]\n" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 18, 399 | "id": "39441058", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "embedded_procs['date'] = pd.to_datetime(embedded_procs['procedure_DATETIME'].dt.date)\n", 404 | "embedded_conds['date'] = pd.to_datetime(embedded_conds['condition_start_DATETIME'].dt.date)\n", 405 | "embedded_drugs['date'] = pd.to_datetime(embedded_drugs['drug_exposure_start_DATETIME'].dt.date)\n", 406 | "embedded_measurements['date'] = pd.to_datetime(embedded_measurements['measurement_DATETIME'].dt.date)\n", 407 | "embedded_obs['date'] = pd.to_datetime(embedded_obs['observation_DATETIME'].dt.date)\n" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 19, 413 | "id": "5efd7c59", 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "CPU times: user 1.65 s, sys: 221 ms, total: 1.88 s\n", 421 | "Wall time: 1.68 s\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "%%time\n", 427 | "# Convert pandas dataframes to dask dataframes\n", 428 | "embedded_conds_dsk = dd.from_pandas(embedded_conds, npartitions=120)\n", 429 | "embedded_procs_dsk = dd.from_pandas(embedded_procs, npartitions=120)\n", 430 | "embedded_drugs_dsk = dd.from_pandas(embedded_drugs, npartitions=120)\n", 431 | "embedded_measurements_dsk = dd.from_pandas(embedded_measurements, npartitions=120)\n", 432 | 
"embedded_obs_dsk = dd.from_pandas(embedded_obs, npartitions=120)\n", 433 | "\n", 434 | "# Filter null embeddings\n", 435 | "embedded_conds_dsk = embedded_conds_dsk[embedded_conds_dsk['embedding'].notnull()]\n", 436 | "embedded_procs_dsk = embedded_procs_dsk[embedded_procs_dsk['embedding'].notnull()]\n", 437 | "embedded_drugs_dsk = embedded_drugs_dsk[embedded_drugs_dsk['embedding'].notnull()]\n", 438 | "embedded_measurements_dsk = embedded_measurements_dsk[embedded_measurements_dsk['embedding'].notnull()]\n", 439 | "embedded_obs_dsk = embedded_obs_dsk[embedded_obs_dsk['embedding'].notnull()]\n", 440 | "\n", 441 | "# Concatenate different EHR tables\n", 442 | "all_data = dd.concat([\n", 443 | " embedded_conds_dsk.drop(['DOS','mom_person_id','child_person_id', 'condition_concept_id',\n", 444 | " 'condition_start_DATETIME','child_birth_date','delta'], axis=1),\n", 445 | " embedded_procs_dsk.drop(['DOS','mom_person_id','child_person_id', 'procedure_concept_id',\n", 446 | " 'procedure_DATETIME','child_birth_date','delta'], axis=1),\n", 447 | " embedded_drugs_dsk.drop(['DOS','mom_person_id','child_person_id', 'drug_concept_id',\n", 448 | " 'drug_exposure_start_DATETIME','child_birth_date','delta'], axis=1),\n", 449 | " embedded_measurements_dsk.drop(['DOS','mom_person_id','child_person_id', 'measurement_concept_id',\n", 450 | " 'measurement_DATETIME','value_as_number','child_birth_date','delta'], axis=1),\n", 451 | " embedded_obs_dsk.drop(['DOS','mom_person_id','child_person_id', 'observation_concept_id',\n", 452 | " 'observation_DATETIME','child_birth_date','delta'], axis=1)\n", 453 | "], ignore_index=True)[['sample_ID','date','embedding']].compute()\n", 454 | "\n", 455 | "all_data.sort_values('date', ascending=False, inplace=True)\n", 456 | "\n", 457 | "\n", 458 | "max_dates = 32\n", 459 | "\n", 460 | "all_data.sort_values(by=['sample_ID', 'date'], ascending=[True, False], inplace=True)\n", 461 | "\n", 462 | "# Create a helper column to rank the unique dates for each 
person_id\n", 463 | "all_data['date_rank'] = all_data.groupby('sample_ID')['date'].transform(lambda x: x.rank(method='dense', ascending=False))\n", 464 | "\n", 465 | "# Filter the rows where date_rank is within the range of 1 to max_dates\n", 466 | "filtered_data = all_data[all_data['date_rank'].between(1, max_dates)]\n", 467 | "filtered_data = filtered_data.drop(columns='date_rank')\n" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 20, 473 | "id": "585d6bf3", 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "CPU times: user 156 ms, sys: 95 µs, total: 156 ms\n", 481 | "Wall time: 154 ms\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "%%time\n", 487 | "#create patient-day embeddings\n", 488 | "patient_day_embeddings = filtered_data.groupby(['sample_ID','date']).mean()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 21, 494 | "id": "fb177a6c", 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "patient_day_embeddings = patient_day_embeddings.reset_index()" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 22, 504 | "id": "52b92faa", 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "patient_day_embeddings['date'] = pd.to_datetime(patient_day_embeddings['date'])\n", 509 | "patient_day_embeddings = patient_day_embeddings.sort_values(['sample_ID', 'date'])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 23, 515 | "id": "247db58c", 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "unique_patients = patient_day_embeddings['sample_ID'].nunique()\n", 520 | "num_features = len(patient_day_embeddings['embedding'].iloc[0])\n" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 24, 526 | "id": "c8f56448", 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "max_dates = 
patient_day_embeddings.groupby('sample_ID')['date'].count().max()\n", 531 | "max_dates = 32" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 25, 537 | "id": "81eb984e", 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "#assign each patient to an index in the data matrix\n", 542 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())}\n" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 26, 548 | "id": "eb9a7d30", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "#create numpy matrix for data\n", 553 | "RNN_data = np.full((num_features, max_dates, unique_patients), np.nan)\n" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 27, 559 | "id": "86ccbe67", 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "application/vnd.jupyter.widget-view+json": { 565 | "model_id": "9b6c1e45999c496b8b95a203b42c5e66", 566 | "version_major": 2, 567 | "version_minor": 0 568 | }, 569 | "text/plain": [ 570 | "0it [00:00, ?it/s]" 571 | ] 572 | }, 573 | "metadata": {}, 574 | "output_type": "display_data" 575 | }, 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "CPU times: user 103 ms, sys: 17 ms, total: 120 ms\n", 581 | "Wall time: 103 ms\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "%%time\n", 587 | "#populate data matrix with input data\n", 588 | "date_position = {}\n", 589 | "for index, row in tqdm(patient_day_embeddings.iterrows()):\n", 590 | " patient_id = row['sample_ID']\n", 591 | " patient_index = patient_id_to_index[patient_id]\n", 592 | " \n", 593 | " if patient_id not in date_position:\n", 594 | " date_position[patient_id] = 0\n", 595 | " else:\n", 596 | " date_position[patient_id] += 1\n", 597 | " \n", 598 | " date_index = date_position[patient_id]\n", 599 | " \n", 600 | " RNN_data[:,date_index, patient_index] = 
row['embedding']\n" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 28, 606 | "id": "95f0c2cf", 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "RNN_data = RNN_data.transpose(2,1,0)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 29, 616 | "id": "25c6a751", 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "(113, 32, 400)" 623 | ] 624 | }, 625 | "execution_count": 29, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "RNN_data.shape" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 30, 637 | "id": "c389e1f4", 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str)+'_'+sampling_df['child_person_id'].astype(str)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 31, 647 | "id": "38a4b841", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "sampling_df = sampling_df.merge(pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T, how='right', left_on='sample_ID', right_on=0)\n" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 32, 657 | "id": "56dfc054", 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "#align outcome data with feature matrix\n", 662 | "sampling_df = sampling_df.merge(patient_day_embeddings.groupby('sample_ID').count()[['date']], how='left', on='sample_ID')" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 33, 668 | "id": "e64bdd52", 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "DOS_outcomes = np.array(sampling_df[['DOS',1]].sort_values(1)['DOS'])" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 34, 678 | "id": "51e0d166", 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "#save 
processed data\n", 683 | "np.save('./data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy', RNN_data)\n", 684 | "np.save('./data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy', DOS_outcomes)\n" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 35, 690 | "id": "0b8aaa6c", 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "num_patient_visits = np.minimum(np.array(sampling_df['date']), 32)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 36, 700 | "id": "baba0b97", 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "np.save('./data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy', num_patient_visits)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 37, 710 | "id": "7f0cffbc", 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "df = pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T\n", 715 | "df.to_csv('./data/processed_data/sampleID_indices_full_cohort_with_obs_fixed.csv')" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "id": "da8cbeee", 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [] 725 | } 726 | ], 727 | "metadata": { 728 | "kernelspec": { 729 | "display_name": "Python 3 (ipykernel)", 730 | "language": "python", 731 | "name": "python3" 732 | }, 733 | "language_info": { 734 | "codemirror_mode": { 735 | "name": "ipython", 736 | "version": 3 737 | }, 738 | "file_extension": ".py", 739 | "mimetype": "text/x-python", 740 | "name": "python", 741 | "nbconvert_exporter": "python", 742 | "pygments_lexer": "ipython3", 743 | "version": "3.10.6" 744 | } 745 | }, 746 | "nbformat": 4, 747 | "nbformat_minor": 5 748 | } 749 | -------------------------------------------------------------------------------- /Onset of Labor/process_EHR_data_omics_cohort.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 41, 6 | "id": "401a48de", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm.notebook import tqdm\n", 13 | "from scipy.stats import pearsonr\n", 14 | "import gensim\n", 15 | "from gensim.models import Word2Vec\n", 16 | "from gensim.models.callbacks import CallbackAny2Vec\n", 17 | "import random\n", 18 | "import pickle\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import torch" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 42, 27 | "id": "fcfe5126", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#load CSV files which are direct extracts from OMOP tables\n", 32 | "conds = pd.read_csv('./data/raw_data/EHR/EHR_cohort_conditions.csv')\n", 33 | "conds = conds[conds['condition_concept_id'] != 0]\n", 34 | "drugs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_drugs.csv')\n", 35 | "drugs = drugs[drugs['drug_concept_id'] != 0]\n", 36 | "procs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_procedures.csv')\n", 37 | "procs = procs[procs['procedure_concept_id'] != 0]\n", 38 | "obs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_observations.csv')\n", 39 | "obs = obs[obs['observation_concept_id'] != 0]\n", 40 | "\n", 41 | "conds['condition_start_DATETIME'] = pd.to_datetime(conds['condition_start_DATETIME'])\n", 42 | "procs['procedure_DATETIME'] = pd.to_datetime(procs['procedure_DATETIME'])\n", 43 | "drugs['drug_exposure_start_DATETIME'] = pd.to_datetime(drugs['drug_exposure_start_DATETIME'])\n", 44 | "obs['observation_DATETIME'] = pd.to_datetime(obs['observation_DATETIME'])\n", 45 | "\n", 46 | "conds['child_birth_date'] = pd.to_datetime(conds['child_birth_date'])\n", 47 | "procs['child_birth_date'] = pd.to_datetime(procs['child_birth_date'])\n", 48 | 
"drugs['child_birth_date'] = pd.to_datetime(drugs['child_birth_date'])\n", 49 | "obs['child_birth_date'] = pd.to_datetime(obs['child_birth_date'])\n", 50 | "\n", 51 | "measurements = pd.read_csv('./data/raw_data/EHR/EHR_cohort_measurements.csv')\n", 52 | "measurements = measurements[~pd.isnull(measurements['value_as_number'])]\n", 53 | "measurements = measurements[measurements['measurement_concept_id'] != 0]\n", 54 | "measurements['measurement_DATETIME'] = pd.to_datetime(measurements['measurement_DATETIME'])\n", 55 | "measurements['child_birth_date'] = pd.to_datetime(measurements['child_birth_date'])" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 43, 61 | "id": "c7f04ac1", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "#load IDs of mothers in omics cohort\n", 66 | "OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv')['mom_person_id'].values" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 44, 72 | "id": "42763cbb", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "#filter data to only mothers in omics cohort\n", 77 | "conds = conds[conds['mom_person_id'].isin(OOL_cohort_omop)]\n", 78 | "drugs = drugs[drugs['mom_person_id'].isin(OOL_cohort_omop)]\n", 79 | "procs = procs[procs['mom_person_id'].isin(OOL_cohort_omop)]\n", 80 | "measurements = measurements[measurements['mom_person_id'].isin(OOL_cohort_omop)]\n", 81 | "obs = obs[obs['mom_person_id'].isin(OOL_cohort_omop)]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 45, 87 | "id": "b504fe9c", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def filter_df(df, birth_time, time_col, time_range_days=280):\n", 92 | " \"\"\"\n", 93 | " A function to remove entries in a dataframe prior to time of birth. \n", 94 | " \n", 95 | " df: The dataframe to filter. 
Must contain a column called person_id with the OMOP ID of the mother\n", 96 | " birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME\n", 97 | " time_col: The index of the column with the date of the event in df\n", 98 | " time_range_days: keeps data from delivery up to time_range_days prior \n", 99 | " \n", 100 | " \"\"\"\n", 101 | " print('There were {} patients before filtering.'.format(len(df['person_id'].unique())))\n", 102 | " df = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')\n", 103 | " df['diff'] = df['birth_DATETIME']-df[time_col]\n", 104 | " new_df = df[(df['diff'].dt.days > 0) & (df['diff'].dt.days <= time_range_days)].drop('maternal_OMOP', axis=1)\n", 105 | " print('There were {} patients after filtering.'.format(len(new_df['person_id'].unique())))\n", 106 | " return new_df\n", 107 | "\n", 108 | "def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):\n", 109 | " df = proteomics[['DOS','mom_person_id','child_person_id','sample_ID']].merge(input_df, how='left', on=['mom_person_id','child_person_id'])\n", 110 | " df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 111 | " df = df[df['delta'] < df['DOS']]\n", 112 | " return df\n", 113 | " " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 46, 119 | "id": "2742da9d", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "#load key file which can be used to map proteomics data to mother person_id\n", 124 | "patient_indices = pd.read_csv('./data/processed_data/sampleID_indices.csv')" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "id": "558feb6e", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "#load and clean proteomics data\n", 135 | "OOL_proteomics = pd.read_csv('./data/processed_data/ool_proteomics_omop_id.csv')\n", 136 | "OOL_proteomics['sample_ID'] = 
OOL_proteomics['maternal_person_id'].astype(str)+'_'+OOL_proteomics['Timepoint'].astype(str)\n", 137 | "OOL_proteomics = OOL_proteomics.drop(['Timepoint','maternal_person_id'],axis=1)\n", 138 | "OOL_proteomics.columns = [str(i)+'_protein' for i in OOL_proteomics.columns]\n", 139 | "OOL_proteomics = OOL_proteomics.rename(columns={'DOS_protein':'DOS_sampling_time', 'sample_ID_protein':'sample_ID'})\n", 140 | "OOL_proteomics = OOL_proteomics[['sample_ID','DOS_sampling_time']]\n", 141 | "OOL_proteomics['mom_person_id'] = OOL_proteomics['sample_ID'].str[0:7].astype(int)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "id": "668c6b85", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "## This block of code creates a dataframe with mom_person_id, child_person_id, min_delta, max_delta \n", 152 | "## (based on the range of EHR data available), days to onset, and a combined sample_ID col which is used as an identifier\n", 153 | "\n", 154 | "# Filter and calculate delta\n", 155 | "time_col_name = 'condition_start_DATETIME'\n", 156 | "df = conds\n", 157 | "df['delta'] = (df[time_col_name] - df['child_birth_date']).dt.days\n", 158 | "\n", 159 | "# Calculate min and max delta in one operation\n", 160 | "ool = df.groupby(['mom_person_id', 'child_person_id'])['delta'].agg(['min', 'max'])\n", 161 | "ool.columns = ['min_delta', 'max_delta']\n", 162 | "\n", 163 | "# Filter for samples with at least 7 days between min and max\n", 164 | "sampling_df = ool[ool['max_delta'] - ool['min_delta'] >= 7].reset_index()\n", 165 | "\n", 166 | "# Create initial sample_ID\n", 167 | "sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str) + '_' + sampling_df['child_person_id'].astype(str)\n", 168 | "\n", 169 | "# Filter based on OOL_sample_IDs\n", 170 | "OOL_sample_IDs = np.unique([i[0:15] for i in list(patient_indices['0'])])\n", 171 | "sampling_df = sampling_df[sampling_df['sample_ID'].str[:15].isin(OOL_sample_IDs)]\n", 172 | 
"\n", 173 | "# Merge with OOL_proteomics\n", 174 | "sampling_df = sampling_df.merge(OOL_proteomics, how='inner', on='mom_person_id', suffixes=('_x', '_y'))\n", 175 | "\n", 176 | "# Set DOS\n", 177 | "sampling_df['DOS'] = sampling_df['DOS_sampling_time']\n", 178 | "\n", 179 | "# Create the correct sample_ID\n", 180 | "sampling_df['sample_ID'] = sampling_df['sample_ID_x'] + sampling_df['sample_ID_y'].str[-3:]\n", 181 | "\n", 182 | "# Drop unnecessary columns\n", 183 | "columns_to_drop = ['sample_ID_x', 'sample_ID_y', 'DOS_sampling_time']\n", 184 | "sampling_df = sampling_df.drop(columns_to_drop, axis=1, errors='ignore')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "id": "3a0c5c4a", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "#filter data so it only occurs within the correct time range (beginning of pregnancy thru sampling)\n", 195 | "condition_features_EHR = generate_features_EHR_cohort(sampling_df, conds, 'condition_start_DATETIME','condition_concept_id','C')\n", 196 | "procedure_features_EHR = generate_features_EHR_cohort(sampling_df, procs, 'procedure_DATETIME','procedure_concept_id','P')\n", 197 | "drug_features_EHR = generate_features_EHR_cohort(sampling_df, drugs, 'drug_exposure_start_DATETIME','drug_concept_id','D')\n", 198 | "measurement_features_EHR = generate_features_EHR_cohort(sampling_df, measurements, 'measurement_DATETIME','measurement_concept_id','M')\n", 199 | "observation_features_EHR = generate_features_EHR_cohort(sampling_df, obs, 'observation_DATETIME','observation_concept_id','O')\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 10, 205 | "id": "a3facc35", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "class EpochLogger(CallbackAny2Vec):\n", 210 | " def __init__(self):\n", 211 | " self.epoch = 0\n", 212 | "\n", 213 | " def on_epoch_begin(self, model):\n", 214 | " print(f\"Starting epoch #{self.epoch}\")\n", 215 | "\n", 216 | " 
def on_epoch_end(self, model):\n", 217 | " print(f\"Finished epoch #{self.epoch}\")\n", 218 | " self.epoch += 1" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "id": "01f87efa", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Train word2vec model\n", 229 | "# NOTE: For word2vec model training, we do NOT do the date filtering and use all data from pregnancy\n", 230 | "try:\n", 231 | " model = Word2Vec.load(\"./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n", 232 | "except:\n", 233 | " \n", 234 | " epoch_logger = EpochLogger()\n", 235 | "\n", 236 | " word2vec_conds = condition_features_EHR[['sample_ID','condition_concept_id','condition_start_DATETIME']]\n", 237 | " word2vec_conds.columns = ['sample_ID','concept_id','ts']\n", 238 | "\n", 239 | " word2vec_procs = procedure_features_EHR[['sample_ID','procedure_concept_id','procedure_DATETIME']]\n", 240 | " word2vec_procs.columns = ['sample_ID','concept_id','ts']\n", 241 | "\n", 242 | " word2vec_drug = drug_features_EHR[['sample_ID','drug_concept_id','drug_exposure_start_DATETIME']]\n", 243 | " word2vec_drug.columns = ['sample_ID','concept_id','ts']\n", 244 | " \n", 245 | " word2vec_mea = measurement_features_EHR[['sample_ID','measurement_concept_id','measurement_DATETIME']]\n", 246 | " word2vec_mea.columns = ['sample_ID','concept_id','ts']\n", 247 | " \n", 248 | " word2vec_obs = observation_features_EHR[['sample_ID','observation_concept_id','observation_DATETIME']]\n", 249 | " word2vec_obs.columns = ['sample_ID','concept_id','ts']\n", 250 | " \n", 251 | " word2vec_data = pd.concat([word2vec_conds, word2vec_procs, word2vec_drug,word2vec_mea, word2vec_obs],axis=0)\n", 252 | " word2vec_data['date'] = pd.to_datetime(word2vec_data['ts'])\n", 253 | " word2vec_data['date'] = word2vec_data['date'].dt.date\n", 254 | " word2vec_data = word2vec_data.drop('ts',axis=1)\n", 255 | " word2vec_data = 
word2vec_data[~pd.isnull(word2vec_data['concept_id'])]\n", 256 | " word2vec_data['concept_id'] = word2vec_data['concept_id'].astype(int)\n", 257 | " \n", 258 | " grouped_data = word2vec_data.groupby(['sample_ID', 'date'])\n", 259 | " sentences = []\n", 260 | " for _, group in tqdm(grouped_data):\n", 261 | " codes = group['concept_id'].tolist()\n", 262 | " random.shuffle(codes)\n", 263 | " sentences.append(codes)\n", 264 | " \n", 265 | " print('starting training')\n", 266 | " model = Word2Vec(sentences, vector_size=400, window=1000, min_count=5, workers=64)\n", 267 | " model.train(sentences, total_examples=len(sentences), epochs=5, callbacks=[epoch_logger])\n", 268 | " model.save(\"./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 12, 274 | "id": "2c112962", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "code_to_embedding = {code: model.wv[code] for code in model.wv.index_to_key}" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 13, 284 | "id": "004203e4", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "#map EHR data to their respective learned embeddings from word2vec\n", 289 | "embedded_conds = condition_features_EHR[(condition_features_EHR['condition_concept_id'] != 0)]\n", 290 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['condition_concept_id'])]\n", 291 | "embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 14, 297 | "id": "0a2def90", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "embedded_procs = procedure_features_EHR[(procedure_features_EHR['procedure_concept_id'] != 0)]\n", 302 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['procedure_concept_id'])]\n", 303 | 
"embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 15, 309 | "id": "ca224cd0", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "embedded_drugs = drug_features_EHR[(drug_features_EHR['drug_concept_id'] != 0)]\n", 314 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['drug_concept_id'])]\n", 315 | "embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 16, 321 | "id": "6d61abf2", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "embedded_measurements = measurement_features_EHR[(measurement_features_EHR['measurement_concept_id'] != 0)]\n", 326 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['measurement_concept_id'])]\n", 327 | "embedded_measurements['embedding'] = [code_to_embedding.get(code) for code in embedded_measurements['measurement_concept_id']]\n" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 17, 333 | "id": "a8d8e620", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "embedded_obs = observation_features_EHR[(observation_features_EHR['observation_concept_id'] != 0)]\n", 338 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['observation_concept_id'])]\n", 339 | "embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 18, 345 | "id": "1b284aab", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "embedded_procs['date'] = pd.to_datetime(embedded_procs['procedure_DATETIME'].dt.date)\n", 350 | "embedded_conds['date'] = pd.to_datetime(embedded_conds['condition_start_DATETIME'].dt.date)\n", 351 | "embedded_drugs['date'] 
= pd.to_datetime(embedded_drugs['drug_exposure_start_DATETIME'].dt.date)\n", 352 | "embedded_measurements['date'] = pd.to_datetime(embedded_measurements['measurement_DATETIME'].dt.date)\n", 353 | "embedded_obs['date'] = pd.to_datetime(embedded_obs['observation_DATETIME'].dt.date)\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 19, 359 | "id": "34f1dc40", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "done making interim dataframe\n", 367 | "CPU times: user 79.5 ms, sys: 12.5 ms, total: 91.9 ms\n", 368 | "Wall time: 89.2 ms\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "#combine all EHR data tables together\n", 375 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['embedding'])]\n", 376 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['embedding'])]\n", 377 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['embedding'])]\n", 378 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['embedding'])]\n", 379 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['embedding'])]\n", 380 | "\n", 381 | "all_data = pd.concat([embedded_conds.drop(['DOS','mom_person_id','child_person_id', 'condition_concept_id',\n", 382 | " 'condition_start_DATETIME','child_birth_date','delta'],axis=1),\n", 383 | " embedded_procs.drop(['DOS','mom_person_id','child_person_id', 'procedure_concept_id',\n", 384 | " 'procedure_DATETIME','child_birth_date','delta'],axis=1),\n", 385 | " embedded_drugs.drop(['DOS','mom_person_id','child_person_id', 'drug_concept_id',\n", 386 | " 'drug_exposure_start_DATETIME','child_birth_date','delta'],axis=1),\n", 387 | " embedded_measurements.drop(['DOS','mom_person_id','child_person_id', 'measurement_concept_id',\n", 388 | " 'measurement_DATETIME','value_as_number','child_birth_date','delta'],axis=1),\n", 389 | " 
embedded_obs.drop(['DOS','mom_person_id','child_person_id', 'observation_concept_id',\n", 390 | " 'observation_DATETIME','child_birth_date','delta'],axis=1)], ignore_index=True)[['sample_ID','date','embedding']]\n", 391 | "\n", 392 | "expanded_embedding_df = pd.DataFrame(all_data['embedding'].tolist())\n", 393 | "print('done making interim dataframe')\n", 394 | "all_data = pd.concat([all_data.reset_index(drop=True).drop('embedding',axis=1), expanded_embedding_df], axis=1)\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 20, 400 | "id": "44e59ec0", 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "CPU times: user 6.59 ms, sys: 0 ns, total: 6.59 ms\n", 408 | "Wall time: 5.22 ms\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "%%time\n", 414 | "#take the mean to compute patient-day embeddings\n", 415 | "patient_day_embeddings = all_data.groupby(['sample_ID','date']).mean()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 21, 421 | "id": "07dfd3a3", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "patient_day_embeddings = patient_day_embeddings.reset_index()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 22, 431 | "id": "2fbf3201", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "patient_day_embeddings['date'] = pd.to_datetime(patient_day_embeddings['date'])\n", 436 | "patient_day_embeddings = patient_day_embeddings.sort_values(['sample_ID', 'date'])" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 23, 442 | "id": "cca3d994", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "unique_patients = patient_day_embeddings['sample_ID'].nunique()\n", 447 | "num_features = len(patient_day_embeddings.columns) - 2 # Subtract patient_id and date columns\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | 
"execution_count": 24, 453 | "id": "ca7e301b", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "max_dates = patient_day_embeddings.groupby('sample_ID')['date'].count().max()\n", 458 | "max_dates = 32" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 25, 464 | "id": "bdeb43c9", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "#assign each patient id to an index in the input data matrix\n", 469 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())}\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 26, 475 | "id": "d569be44", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "#create input data matrix\n", 480 | "RNN_data = np.full((num_features, max_dates, unique_patients), np.nan)\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 27, 486 | "id": "2c0b8f07", 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "application/vnd.jupyter.widget-view+json": { 492 | "model_id": "0704510ca12f4363bda5bce905a00a1b", 493 | "version_major": 2, 494 | "version_minor": 0 495 | }, 496 | "text/plain": [ 497 | "0it [00:00, ?it/s]" 498 | ] 499 | }, 500 | "metadata": {}, 501 | "output_type": "display_data" 502 | }, 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "CPU times: user 408 ms, sys: 16.3 ms, total: 424 ms\n", 508 | "Wall time: 407 ms\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "%%time\n", 514 | "#fill in input data matrix with person-day EHR data embeddings\n", 515 | "date_position = {}\n", 516 | "for index, row in tqdm(patient_day_embeddings.iterrows()):\n", 517 | " patient_id = row['sample_ID']\n", 518 | " patient_index = patient_id_to_index[patient_id]\n", 519 | " \n", 520 | " if patient_id not in date_position:\n", 521 | " date_position[patient_id] = 0\n", 522 | " else:\n", 523 | " 
date_position[patient_id] += 1\n", 524 | " \n", 525 | " date_index = date_position[patient_id]\n", 526 | " \n", 527 | " for feature_index, feature_value in enumerate(row.drop(['sample_ID', 'date'])):\n", 528 | " if date_index < max_dates:\n", 529 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 28, 535 | "id": "c2de670d", 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "(400, 32, 42)" 542 | ] 543 | }, 544 | "execution_count": 28, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "RNN_data.shape" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 29, 556 | "id": "f21c2dd3", 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "RNN_data = RNN_data.transpose(2,1,0)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 30, 566 | "id": "f043f5ae", 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "(42, 32, 400)" 573 | ] 574 | }, 575 | "execution_count": 30, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "RNN_data.shape" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 31, 587 | "id": "98f90b14", 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "#align outcome data with correct index\n", 592 | "sampling_df = sampling_df.merge(pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T, how='right', left_on='sample_ID', right_on=0)\n" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 32, 598 | "id": "e6fd3f73", 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "#align outcome data with correct index\n", 603 | "sampling_df = sampling_df.merge(patient_day_embeddings.groupby('sample_ID').count()[['date']], how='left', 
on='sample_ID')" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 33, 609 | "id": "74645d66", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "DOS_outcomes = np.array(sampling_df[['DOS',1]].sort_values(1)['DOS'])" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 34, 619 | "id": "d6490d23", 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "((42, 32, 400), (42,))" 626 | ] 627 | }, 628 | "execution_count": 34, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "RNN_data.shape, DOS_outcomes.shape" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 35, 640 | "id": "e6427287", 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "#Save processed data below" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 36, 650 | "id": "86cda5eb", 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "np.save('./data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy', RNN_data)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 37, 660 | "id": "862c6ed5", 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "np.save('./data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy', DOS_outcomes)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 38, 670 | "id": "9bb0ee8e", 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "patient_outcomes = torch.tensor(DOS_outcomes).float()\n", 675 | "num_patient_visits = np.minimum(np.array(sampling_df['date']), 32)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 39, 681 | "id": "e3a77d44", 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "np.save('./data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy', num_patient_visits)" 686 | ] 687 | }, 
688 | { 689 | "cell_type": "code", 690 | "execution_count": 40, 691 | "id": "76b41b0b", 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('./data/processed_data/sampleID_indices_with_obs_word2vec_from_ool.csv')\n" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "id": "c6018108", 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "ba06bf46", 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "id": "4dd427e9", 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [] 721 | } 722 | ], 723 | "metadata": { 724 | "kernelspec": { 725 | "display_name": "Python 3 (ipykernel)", 726 | "language": "python", 727 | "name": "python3" 728 | }, 729 | "language_info": { 730 | "codemirror_mode": { 731 | "name": "ipython", 732 | "version": 3 733 | }, 734 | "file_extension": ".py", 735 | "mimetype": "text/x-python", 736 | "name": "python", 737 | "nbconvert_exporter": "python", 738 | "pygments_lexer": "ipython3", 739 | "version": "3.10.6" 740 | } 741 | }, 742 | "nbformat": 4, 743 | "nbformat_minor": 5 744 | } 745 | -------------------------------------------------------------------------------- /Onset of Labor/process_EHR_data_omics_cohort_with_PT_word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "401a48de", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm.notebook import tqdm\n", 13 | "from scipy.stats import pearsonr\n", 14 | "import gensim\n", 15 | "from gensim.models import Word2Vec\n", 16 | "from gensim.models.callbacks import 
CallbackAny2Vec\n", 17 | "import random\n", 18 | "import pickle\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import torch" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "fcfe5126", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#load CSV files which are direct extracts from OMOP tables\n", 32 | "conds = pd.read_csv('./data/raw_data/EHR/EHR_cohort_conditions.csv')\n", 33 | "conds = conds[conds['condition_concept_id'] != 0]\n", 34 | "drugs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_drugs.csv')\n", 35 | "drugs = drugs[drugs['drug_concept_id'] != 0]\n", 36 | "procs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_procedures.csv')\n", 37 | "procs = procs[procs['procedure_concept_id'] != 0]\n", 38 | "obs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_observations.csv')\n", 39 | "obs = obs[obs['observation_concept_id'] != 0]\n", 40 | "\n", 41 | "conds['condition_start_DATETIME'] = pd.to_datetime(conds['condition_start_DATETIME'])\n", 42 | "procs['procedure_DATETIME'] = pd.to_datetime(procs['procedure_DATETIME'])\n", 43 | "drugs['drug_exposure_start_DATETIME'] = pd.to_datetime(drugs['drug_exposure_start_DATETIME'])\n", 44 | "obs['observation_DATETIME'] = pd.to_datetime(obs['observation_DATETIME'])\n", 45 | "\n", 46 | "conds['child_birth_date'] = pd.to_datetime(conds['child_birth_date'])\n", 47 | "procs['child_birth_date'] = pd.to_datetime(procs['child_birth_date'])\n", 48 | "drugs['child_birth_date'] = pd.to_datetime(drugs['child_birth_date'])\n", 49 | "obs['child_birth_date'] = pd.to_datetime(obs['child_birth_date'])\n", 50 | "\n", 51 | "measurements = pd.read_csv('./data/raw_data/EHR/EHR_cohort_measurements.csv')\n", 52 | "measurements = measurements[~pd.isnull(measurements['value_as_number'])]\n", 53 | "measurements = measurements[measurements['measurement_concept_id'] != 0]\n", 54 | "measurements['measurement_DATETIME'] = 
pd.to_datetime(measurements['measurement_DATETIME'])\n", 55 | "measurements['child_birth_date'] = pd.to_datetime(measurements['child_birth_date'])" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "c7f04ac1", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "#load IDs of mothers in omics cohort\n", 66 | "OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv')['mom_person_id'].values" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "42763cbb", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "#filter data to only mothers in omics cohort\n", 77 | "conds = conds[conds['mom_person_id'].isin(OOL_cohort_omop)]\n", 78 | "drugs = drugs[drugs['mom_person_id'].isin(OOL_cohort_omop)]\n", 79 | "procs = procs[procs['mom_person_id'].isin(OOL_cohort_omop)]\n", 80 | "measurements = measurements[measurements['mom_person_id'].isin(OOL_cohort_omop)]\n", 81 | "obs = obs[obs['mom_person_id'].isin(OOL_cohort_omop)]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "id": "b504fe9c", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def filter_df(df, birth_time, time_col, time_range_days=280):\n", 92 | " \"\"\"\n", 93 | " A function to remove entries in a dataframe prior to time of birth. \n", 94 | " \n", 95 | " df: The dataframe to filter. 
Must contain a column called person_id with the OMOP ID of the mother\n", 96 | " birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME\n", 97 | " time_col: The index of the column with the date of the event in df\n", 98 | " time_range_days: keeps data from delivery up to time_range_days prior \n", 99 | " \n", 100 | " \"\"\"\n", 101 | " print('There were {} patients before filtering.'.format(len(df['person_id'].unique())))\n", 102 | " df = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')\n", 103 | " df['diff'] = df['birth_DATETIME']-df[time_col]\n", 104 | " new_df = df[(df['diff'].dt.days > 0) & (df['diff'].dt.days <= time_range_days)].drop('maternal_OMOP', axis=1)\n", 105 | " print('There were {} patients after filtering.'.format(len(new_df['person_id'].unique())))\n", 106 | " return new_df\n", 107 | "\n", 108 | "def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):\n", 109 | " df = proteomics[['DOS','mom_person_id','child_person_id','sample_ID']].merge(input_df, how='left', on=['mom_person_id','child_person_id'])\n", 110 | " df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 111 | " df = df[df['delta'] < df['DOS']]\n", 112 | " return df\n", 113 | " " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "id": "2742da9d", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "#load key file which can be used to map proteomics data to mother person_id\n", 124 | "patient_indices = pd.read_csv('./data/processed_data/sampleID_indices.csv')" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "id": "558feb6e", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "#load and clean proteomics data\n", 135 | "OOL_proteomics = pd.read_csv('./data/processed_data/ool_proteomics_omop_id.csv')\n", 136 | "OOL_proteomics['sample_ID'] = 
OOL_proteomics['maternal_person_id'].astype(str)+'_'+OOL_proteomics['Timepoint'].astype(str)\n", 137 | "OOL_proteomics = OOL_proteomics.drop(['Timepoint','maternal_person_id'],axis=1)\n", 138 | "OOL_proteomics.columns = [str(i)+'_protein' for i in OOL_proteomics.columns]\n", 139 | "OOL_proteomics = OOL_proteomics.rename(columns={'DOS_protein':'DOS_sampling_time', 'sample_ID_protein':'sample_ID'})\n", 140 | "OOL_proteomics = OOL_proteomics[['sample_ID','DOS_sampling_time']]\n", 141 | "OOL_proteomics['mom_person_id'] = OOL_proteomics['sample_ID'].str[0:7].astype(int)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "id": "668c6b85", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "## This block of code creates a dataframe with mom_person_id, child_person_id, min_delta, max_delta \n", 152 | "## (based on the range of EHR data available), days to onset, and a combined sample_ID col which is used as an identifier\n", 153 | "\n", 154 | "# Filter and calculate delta\n", 155 | "time_col_name = 'condition_start_DATETIME'\n", 156 | "df = conds\n", 157 | "df['delta'] = (df[time_col_name] - df['child_birth_date']).dt.days\n", 158 | "\n", 159 | "# Calculate min and max delta in one operation\n", 160 | "ool = df.groupby(['mom_person_id', 'child_person_id'])['delta'].agg(['min', 'max'])\n", 161 | "ool.columns = ['min_delta', 'max_delta']\n", 162 | "\n", 163 | "# Filter for samples with at least 7 days between min and max\n", 164 | "sampling_df = ool[ool['max_delta'] - ool['min_delta'] >= 7].reset_index()\n", 165 | "\n", 166 | "# Create initial sample_ID\n", 167 | "sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str) + '_' + sampling_df['child_person_id'].astype(str)\n", 168 | "\n", 169 | "# Filter based on OOL_sample_IDs\n", 170 | "OOL_sample_IDs = np.unique([i[0:15] for i in list(patient_indices['0'])])\n", 171 | "sampling_df = sampling_df[sampling_df['sample_ID'].str[:15].isin(OOL_sample_IDs)]\n", 172 | 
"\n", 173 | "# Merge with OOL_proteomics\n", 174 | "sampling_df = sampling_df.merge(OOL_proteomics, how='inner', on='mom_person_id', suffixes=('_x', '_y'))\n", 175 | "\n", 176 | "# Set DOS\n", 177 | "sampling_df['DOS'] = sampling_df['DOS_sampling_time']\n", 178 | "\n", 179 | "# Create the correct sample_ID\n", 180 | "sampling_df['sample_ID'] = sampling_df['sample_ID_x'] + sampling_df['sample_ID_y'].str[-3:]\n", 181 | "\n", 182 | "# Drop unnecessary columns\n", 183 | "columns_to_drop = ['sample_ID_x', 'sample_ID_y', 'DOS_sampling_time']\n", 184 | "sampling_df = sampling_df.drop(columns_to_drop, axis=1, errors='ignore')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "id": "3a0c5c4a", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "#filter data so it only occurs within the correct time range (beginning of pregnancy thru sampling)\n", 195 | "condition_features_EHR = generate_features_EHR_cohort(sampling_df, conds, 'condition_start_DATETIME','condition_concept_id','C')\n", 196 | "procedure_features_EHR = generate_features_EHR_cohort(sampling_df, procs, 'procedure_DATETIME','procedure_concept_id','P')\n", 197 | "drug_features_EHR = generate_features_EHR_cohort(sampling_df, drugs, 'drug_exposure_start_DATETIME','drug_concept_id','D')\n", 198 | "measurement_features_EHR = generate_features_EHR_cohort(sampling_df, measurements, 'measurement_DATETIME','measurement_concept_id','M')\n", 199 | "observation_features_EHR = generate_features_EHR_cohort(sampling_df, obs, 'observation_DATETIME','observation_concept_id','O')\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 10, 205 | "id": "a3facc35", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "class EpochLogger(CallbackAny2Vec):\n", 210 | " def __init__(self):\n", 211 | " self.epoch = 0\n", 212 | "\n", 213 | " def on_epoch_begin(self, model):\n", 214 | " print(f\"Starting epoch #{self.epoch}\")\n", 215 | "\n", 216 | " 
def on_epoch_end(self, model):\n", 217 | " print(f\"Finished epoch #{self.epoch}\")\n", 218 | " self.epoch += 1" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "id": "01f87efa", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Train word2vec model\n", 229 | "# NOTE: For word2vec model training, we do NOT do the date filtering and use all data from pregnancy\n", 230 | "try:\n", 231 | " model = Word2Vec.load(\"./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n", 232 | "except:\n", 233 | " print('word2vec model from PT cohort not available, proceeding with training new word2vec model')\n", 234 | " epoch_logger = EpochLogger()\n", 235 | "\n", 236 | " word2vec_conds = condition_features_EHR[['sample_ID','condition_concept_id','condition_start_DATETIME']]\n", 237 | " word2vec_conds.columns = ['sample_ID','concept_id','ts']\n", 238 | "\n", 239 | " word2vec_procs = procedure_features_EHR[['sample_ID','procedure_concept_id','procedure_DATETIME']]\n", 240 | " word2vec_procs.columns = ['sample_ID','concept_id','ts']\n", 241 | "\n", 242 | " word2vec_drug = drug_features_EHR[['sample_ID','drug_concept_id','drug_exposure_start_DATETIME']]\n", 243 | " word2vec_drug.columns = ['sample_ID','concept_id','ts']\n", 244 | " \n", 245 | " word2vec_mea = measurement_features_EHR[['sample_ID','measurement_concept_id','measurement_DATETIME']]\n", 246 | " word2vec_mea.columns = ['sample_ID','concept_id','ts']\n", 247 | " \n", 248 | " word2vec_obs = observation_features_EHR[['sample_ID','observation_concept_id','observation_DATETIME']]\n", 249 | " word2vec_obs.columns = ['sample_ID','concept_id','ts']\n", 250 | " \n", 251 | " word2vec_data = pd.concat([word2vec_conds, word2vec_procs, word2vec_drug,word2vec_mea, word2vec_obs],axis=0)\n", 252 | " word2vec_data['date'] = pd.to_datetime(word2vec_data['ts'])\n", 253 | " word2vec_data['date'] = word2vec_data['date'].dt.date\n", 254 
| " word2vec_data = word2vec_data.drop('ts',axis=1)\n", 255 | " word2vec_data = word2vec_data[~pd.isnull(word2vec_data['concept_id'])]\n", 256 | " word2vec_data['concept_id'] = word2vec_data['concept_id'].astype(int)\n", 257 | " \n", 258 | " grouped_data = word2vec_data.groupby(['sample_ID', 'date'])\n", 259 | " sentences = []\n", 260 | " for _, group in tqdm(grouped_data):\n", 261 | " codes = group['concept_id'].tolist()\n", 262 | " random.shuffle(codes)\n", 263 | " sentences.append(codes)\n", 264 | " \n", 265 | " print('starting training')\n", 266 | " model = Word2Vec(sentences, vector_size=400, window=1000, min_count=5, workers=64)\n", 267 | " model.train(sentences, total_examples=len(sentences), epochs=5, callbacks=[epoch_logger])\n", 268 | " model.save(\"./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 12, 274 | "id": "2c112962", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "code_to_embedding = {code: model.wv[code] for code in model.wv.index_to_key}" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 13, 284 | "id": "004203e4", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "#map EHR data to their respective learned embeddings from word2vec\n", 289 | "embedded_conds = condition_features_EHR[(condition_features_EHR['condition_concept_id'] != 0)]\n", 290 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['condition_concept_id'])]\n", 291 | "embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 14, 297 | "id": "0a2def90", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "embedded_procs = procedure_features_EHR[(procedure_features_EHR['procedure_concept_id'] != 0)]\n", 302 | "embedded_procs = 
embedded_procs[~pd.isnull(embedded_procs['procedure_concept_id'])]\n", 303 | "embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 15, 309 | "id": "ca224cd0", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "embedded_drugs = drug_features_EHR[(drug_features_EHR['drug_concept_id'] != 0)]\n", 314 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['drug_concept_id'])]\n", 315 | "embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 16, 321 | "id": "6d61abf2", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "embedded_measurements = measurement_features_EHR[(measurement_features_EHR['measurement_concept_id'] != 0)]\n", 326 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['measurement_concept_id'])]\n", 327 | "embedded_measurements['embedding'] = [code_to_embedding.get(code) for code in embedded_measurements['measurement_concept_id']]\n" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 17, 333 | "id": "a8d8e620", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "embedded_obs = observation_features_EHR[(observation_features_EHR['observation_concept_id'] != 0)]\n", 338 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['observation_concept_id'])]\n", 339 | "embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 18, 345 | "id": "1b284aab", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "embedded_procs['date'] = pd.to_datetime(embedded_procs['procedure_DATETIME'].dt.date)\n", 350 | "embedded_conds['date'] = 
pd.to_datetime(embedded_conds['condition_start_DATETIME'].dt.date)\n", 351 | "embedded_drugs['date'] = pd.to_datetime(embedded_drugs['drug_exposure_start_DATETIME'].dt.date)\n", 352 | "embedded_measurements['date'] = pd.to_datetime(embedded_measurements['measurement_DATETIME'].dt.date)\n", 353 | "embedded_obs['date'] = pd.to_datetime(embedded_obs['observation_DATETIME'].dt.date)\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 19, 359 | "id": "34f1dc40", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "done making interim dataframe\n", 367 | "CPU times: user 79.1 ms, sys: 4.32 ms, total: 83.4 ms\n", 368 | "Wall time: 80.3 ms\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "#combine all EHR data tables together\n", 375 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['embedding'])]\n", 376 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['embedding'])]\n", 377 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['embedding'])]\n", 378 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['embedding'])]\n", 379 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['embedding'])]\n", 380 | "\n", 381 | "all_data = pd.concat([embedded_conds.drop(['DOS','mom_person_id','child_person_id', 'condition_concept_id',\n", 382 | " 'condition_start_DATETIME','child_birth_date','delta'],axis=1),\n", 383 | " embedded_procs.drop(['DOS','mom_person_id','child_person_id', 'procedure_concept_id',\n", 384 | " 'procedure_DATETIME','child_birth_date','delta'],axis=1),\n", 385 | " embedded_drugs.drop(['DOS','mom_person_id','child_person_id', 'drug_concept_id',\n", 386 | " 'drug_exposure_start_DATETIME','child_birth_date','delta'],axis=1),\n", 387 | " embedded_measurements.drop(['DOS','mom_person_id','child_person_id', 'measurement_concept_id',\n", 388 | " 
'measurement_DATETIME','value_as_number','child_birth_date','delta'],axis=1),\n", 389 | " embedded_obs.drop(['DOS','mom_person_id','child_person_id', 'observation_concept_id',\n", 390 | " 'observation_DATETIME','child_birth_date','delta'],axis=1)], ignore_index=True)[['sample_ID','date','embedding']]\n", 391 | "\n", 392 | "expanded_embedding_df = pd.DataFrame(all_data['embedding'].tolist())\n", 393 | "print('done making interim dataframe')\n", 394 | "all_data = pd.concat([all_data.reset_index(drop=True).drop('embedding',axis=1), expanded_embedding_df], axis=1)\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 20, 400 | "id": "44e59ec0", 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "CPU times: user 0 ns, sys: 7.69 ms, total: 7.69 ms\n", 408 | "Wall time: 5.74 ms\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "%%time\n", 414 | "#take the mean to compute patient-day embeddings\n", 415 | "patient_day_embeddings = all_data.groupby(['sample_ID','date']).mean()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 21, 421 | "id": "07dfd3a3", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "patient_day_embeddings = patient_day_embeddings.reset_index()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 22, 431 | "id": "2fbf3201", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "patient_day_embeddings['date'] = pd.to_datetime(patient_day_embeddings['date'])\n", 436 | "patient_day_embeddings = patient_day_embeddings.sort_values(['sample_ID', 'date'])" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 23, 442 | "id": "cca3d994", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "unique_patients = patient_day_embeddings['sample_ID'].nunique()\n", 447 | "num_features = len(patient_day_embeddings.columns) - 2 # Subtract patient_id and 
date columns\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 24, 453 | "id": "ca7e301b", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "max_dates = patient_day_embeddings.groupby('sample_ID')['date'].count().max()\n", 458 | "max_dates = 32" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 25, 464 | "id": "bdeb43c9", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "#assign each patient id to an index in the input data matrix\n", 469 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())}\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 26, 475 | "id": "d569be44", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "#create input data matrix\n", 480 | "RNN_data = np.full((num_features, max_dates, unique_patients), np.nan)\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 27, 486 | "id": "2c0b8f07", 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "application/vnd.jupyter.widget-view+json": { 492 | "model_id": "3e1a8abc925b4aa5bec5b763552aeccc", 493 | "version_major": 2, 494 | "version_minor": 0 495 | }, 496 | "text/plain": [ 497 | "0it [00:00, ?it/s]" 498 | ] 499 | }, 500 | "metadata": {}, 501 | "output_type": "display_data" 502 | }, 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "CPU times: user 421 ms, sys: 13.9 ms, total: 435 ms\n", 508 | "Wall time: 404 ms\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "%%time\n", 514 | "#fill in input data matrix with person-day EHR data embeddings\n", 515 | "date_position = {}\n", 516 | "for index, row in tqdm(patient_day_embeddings.iterrows()):\n", 517 | " patient_id = row['sample_ID']\n", 518 | " patient_index = patient_id_to_index[patient_id]\n", 519 | " \n", 520 | " if patient_id not in date_position:\n", 521 | " 
date_position[patient_id] = 0\n", 522 | " else:\n", 523 | " date_position[patient_id] += 1\n", 524 | " \n", 525 | " date_index = date_position[patient_id]\n", 526 | " \n", 527 | " for feature_index, feature_value in enumerate(row.drop(['sample_ID', 'date'])):\n", 528 | " if date_index < max_dates:\n", 529 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 28, 535 | "id": "c2de670d", 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "(400, 32, 42)" 542 | ] 543 | }, 544 | "execution_count": 28, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "RNN_data.shape" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 29, 556 | "id": "f21c2dd3", 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "RNN_data = RNN_data.transpose(2,1,0)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 30, 566 | "id": "f043f5ae", 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "(42, 32, 400)" 573 | ] 574 | }, 575 | "execution_count": 30, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "RNN_data.shape" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 31, 587 | "id": "98f90b14", 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "#align outcome data with correct index\n", 592 | "sampling_df = sampling_df.merge(pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T, how='right', left_on='sample_ID', right_on=0)\n" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 32, 598 | "id": "e6fd3f73", 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "#align outcome data with correct index\n", 603 | "sampling_df = 
sampling_df.merge(patient_day_embeddings.groupby('sample_ID').count()[['date']], how='left', on='sample_ID')" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 33, 609 | "id": "74645d66", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "DOS_outcomes = np.array(sampling_df[['DOS',1]].sort_values(1)['DOS'])" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 34, 619 | "id": "d6490d23", 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "((42, 32, 400), (42,))" 626 | ] 627 | }, 628 | "execution_count": 34, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "RNN_data.shape, DOS_outcomes.shape" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 35, 640 | "id": "e6427287", 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "#Save processed data below" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 36, 650 | "id": "86cda5eb", 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "np.save('./data/processed_data/RNN_data_codes_with_obs.npy', RNN_data)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 37, 660 | "id": "862c6ed5", 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "np.save('./data/processed_data/RNN_data_outcomes_with_obs.npy', DOS_outcomes)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 38, 670 | "id": "9bb0ee8e", 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "patient_outcomes = torch.tensor(DOS_outcomes).float()\n", 675 | "num_patient_visits = np.minimum(np.array(sampling_df['date']), 32)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 39, 681 | "id": "e3a77d44", 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | 
"np.save('./data/processed_data/RNN_data_lengths_with_obs.npy', num_patient_visits)" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 40, 691 | "id": "76b41b0b", 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('./data/processed_data/sampleID_indices_with_obs.csv')\n" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "id": "7e81e07a", 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "5956a041", 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "id": "68fc8e43", 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [] 721 | } 722 | ], 723 | "metadata": { 724 | "kernelspec": { 725 | "display_name": "Python 3 (ipykernel)", 726 | "language": "python", 727 | "name": "python3" 728 | }, 729 | "language_info": { 730 | "codemirror_mode": { 731 | "name": "ipython", 732 | "version": 3 733 | }, 734 | "file_extension": ".py", 735 | "mimetype": "text/x-python", 736 | "name": "python", 737 | "nbconvert_exporter": "python", 738 | "pygments_lexer": "ipython3", 739 | "version": "3.10.6" 740 | } 741 | }, 742 | "nbformat": 4, 743 | "nbformat_minor": 5 744 | } 745 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/833866868.svg)](https://doi.org/10.5281/zenodo.13977341) 2 | # COMET: Clinical and Omics Multi-Modal Analysis Enhanced with Transfer Learning 3 | COMET is a machine learning framework that incorporates large, observational electronic health record (EHR) databases and transfer learning to improve the analysis of small datasets from omics studies. 
4 | ## Overview 5 | This repo contains the code used for the analyses and results presented in our manuscript. Due to HIPAA constraints, we cannot share the EHR data used in our study. The proteomics data for the onset of labor cohort can be found [here](https://datadryad.org/stash/dataset/doi:10.5061/dryad.280gb5mpd). Due to UK Biobank policies, we cannot share the proteomics data from the UK Biobank cohort. Researchers who have access to the UK Biobank proteomics data and EHR data can run the provided notebooks from the UK Biobank Research Analysis Platform (RAP) to reproduce our analysis. To generate the necessary processed data files, run ./Cancer/process_EHR_data_omics.ipynb first, then ./Cancer/process_PT_data.ipynb, followed by ./Cancer/grouped_embeddings_to_matrices.ipynb. You can then run ./Cancer/experiments.ipynb. Details about the UK Biobank, including how to get approved as a researcher and access the RAP, can be found on the UK Biobank's [website](https://www.ukbiobank.ac.uk/). 6 | ## Installation and Setup 7 | First, clone the GitHub repo: 8 | ``` 9 | git clone https://github.com/samson920/COMET 10 | ``` 11 | Then, set up the environment: 12 | ``` 13 | conda env create -f environment.yml 14 | conda activate COMET 15 | ``` 16 | The installation should take about 10 minutes. 17 | 18 | ## Demo 19 | We have included some toy data in the ./Onset of Labor/data/ folder to show the expected structure of data for the onset of labor experiments. The EHR data are direct extracts of OMOP tables. The toy data will work with our code, though the results won't be particularly meaningful as the data are randomly generated. You can replace the toy data with your own data from OMOP tables and your own tabular omics data to run COMET on your own datasets. 
To run the data processing scripts, run the Jupyter notebooks in ./Onset of Labor/, starting with process_EHR_data_full_PT_cohort.ipynb, then process_EHR_data_omics_cohort.ipynb, and lastly process_EHR_data_omics_cohort_with_PT_word2vec.ipynb. These notebooks will create the processed EHR data files expected by the experiments.ipynb notebook, which you can run after the data processing notebooks. 20 | 21 | The data processing notebooks will take <1 minute on our toy data, but substantially longer with real, larger datasets. The experiments notebook will take about 20 minutes to run with our toy data on machines with a GPU, but substantially longer with real, larger datasets. We do not recommend running this code on a CPU as it will take a very long time. 22 | 23 | 24 | ## General Repo Organization 25 | There are two folders: Onset of Labor and Cancer. Within each folder, we have Jupyter notebooks used for various aspects of the data processing and analysis. Within the onset of labor folder we have: 26 | - process_EHR_data_full_PT_cohort.ipynb: This notebook contains the code necessary to process EHR data for the pre-training cohort from extracts of OMOP tables to matrices that can be direct inputs to the ML models. This includes the training of the word2vec model to embed EHR codes. 27 | - process_EHR_data_omics_cohort.ipynb: This notebook contains the code necessary to process EHR data for the omics cohort from extracts of OMOP tables to matrices that can be direct inputs to the ML models. This includes the training of the word2vec model to embed EHR codes. 28 | - process_EHR_data_omics_cohort_with_PT_word2vec.ipynb: This notebook is the same as the above, except it uses the word2vec model from the PT cohort, and is for use in the latter experiments which utilize COMET (including the pre-trained word2vec model). 29 | - experiments.ipynb: This notebook contains all other code for experiments and analysis. 
Most notably, it contains the code for the actual architecture of our models, hyperparameter optimization, actual experiments, and downstream analyses including feature importance computation and visualization of the parameter space in Figure 6. 30 | 31 | Within the cancer folder we have: 32 | - process_EHR_data_omics.ipynb: This file contains the queries to pull the patient cohorts and the data necessary to train the word2vec models, and trains the word2vec models for both the omics and pre-training cohorts. This file also contains downstream processing to pull the feature data from the patients in the omics cohort and ultimately saves a CSV containing the person-day embeddings. 33 | - process_PT_data.ipynb: This file contains the queries to pull the feature data from the pre-training cohort and downstream processing to compute person-day embeddings. 34 | - grouped_embeddings_to_matrices.ipynb: contains code to convert person-day embeddings to feature matrix for RNN input, also computes other inputs for ML (length of sequence based on number of days of data, outcome data, mapping between patient ID and indices in the feature matrix), also contains code used to extract all proteomics data 35 | - experiments.ipynb: This notebook contains all other code for experiments and analysis. Most notably, it contains the code for the actual architecture of our models, hyperparameter optimization, actual experiments, and downstream analyses including feature importance computation and visualization of the parameter space in Figure 6. 
36 | 37 | 38 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: COMET 2 | channels: 3 | - anaconda 4 | - pytorch 5 | - bioconda 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _libgcc_mutex=0.1=conda_forge 10 | - _openmp_mutex=4.5=2_kmp_llvm 11 | - _py-xgboost-mutex=2.0=cpu_0 12 | - adjusttext=0.7.3.1=py_1 13 | - alsa-lib=1.2.7.2=h166bdaf_0 14 | - anyio=3.6.2=pyhd8ed1ab_0 15 | - aom=3.5.0=h27087fc_0 16 | - argon2-cffi=21.3.0=pyhd8ed1ab_0 17 | - asttokens=2.0.8=pyhd8ed1ab_0 18 | - attr=2.5.1=h166bdaf_1 19 | - attrs=22.1.0=pyh71513ae_1 20 | - babel=2.10.3=pyhd8ed1ab_0 21 | - backcall=0.2.0=pyh9f0ad1d_0 22 | - backports=1.1=pyhd3eb1b0_0 23 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 24 | - beautifulsoup4=4.11.1=pyha770c72_0 25 | - bioinfokit=2.1.0=pyh7cba7a3_0 26 | - blas=1.0=mkl 27 | - bleach=5.0.1=pyhd8ed1ab_0 28 | - bokeh=3.3.0=pyhd8ed1ab_0 29 | - brotli=1.0.9=h166bdaf_7 30 | - brotli-bin=1.0.9=h166bdaf_7 31 | - bzip2=1.0.8=h7f98852_4 32 | - ca-certificates=2023.08.22=h06a4308_0 33 | - captum=0.6.0=0 34 | - certifi=2023.7.22=pyhd8ed1ab_0 35 | - charset-normalizer=2.1.1=pyhd8ed1ab_0 36 | - click=8.1.3=unix_pyhd8ed1ab_2 37 | - cloudpickle=2.2.1=pyhd8ed1ab_0 38 | - colorama=0.4.5=pyhd8ed1ab_0 39 | - cudatoolkit=11.3.1=h2bc3f7f_2 40 | - cycler=0.11.0=pyhd8ed1ab_0 41 | - dask=2022.2.1=pyhd3eb1b0_0 42 | - dask-core=2022.2.1=pyhd3eb1b0_0 43 | - dbus=1.13.18=hb2f20db_0 44 | - decorator=5.1.1=pyhd8ed1ab_0 45 | - defusedxml=0.7.1=pyhd8ed1ab_0 46 | - distributed=2022.2.1=pyhd3eb1b0_0 47 | - entrypoints=0.4=pyhd8ed1ab_0 48 | - executing=1.1.1=pyhd8ed1ab_0 49 | - expat=2.4.9=h27087fc_0 50 | - ffmpeg=5.1.2=gpl_he10e716_101 51 | - fftw=3.3.10=nompi_hf0379b8_105 52 | - flit-core=3.7.1=pyhd8ed1ab_0 53 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 54 | - font-ttf-inconsolata=3.000=h77eed37_0 55 | - 
font-ttf-source-code-pro=2.038=h77eed37_0 56 | - font-ttf-ubuntu=0.83=hab24e00_0 57 | - fontconfig=2.14.0=hc2a2eb6_1 58 | - fonts-conda-ecosystem=1=0 59 | - fonts-conda-forge=1=0 60 | - freetype=2.12.1=hca18f0e_0 61 | - fsspec=2023.5.0=pyh1a96a4e_0 62 | - gettext=0.21.1=h27087fc_0 63 | - giflib=5.2.1=h36c2ea0_2 64 | - glib=2.74.0=h6239696_0 65 | - glib-tools=2.74.0=h6239696_0 66 | - gmp=6.2.1=h58526e2_0 67 | - gnutls=3.7.8=hf3e180e_0 68 | - gst-plugins-base=1.20.3=h57caac4_2 69 | - gstreamer=1.20.3=hd4edc92_2 70 | - icu=70.1=h27087fc_0 71 | - idna=3.4=pyhd8ed1ab_0 72 | - imageio=2.31.1=pyh24c5eb1_0 73 | - importlib_resources=5.10.0=pyhd8ed1ab_0 74 | - intel-openmp=2021.4.0=h06a4308_3561 75 | - ipykernel=6.16.0=pyh210e3f2_0 76 | - ipython=8.5.0=pyh41d4057_1 77 | - ipython_genutils=0.2.0=py_1 78 | - ipywidgets=8.0.2=pyhd8ed1ab_1 79 | - jack=1.9.21=h2a1e645_0 80 | - jedi=0.18.1=pyhd8ed1ab_2 81 | - jinja2=3.1.2=pyhd8ed1ab_1 82 | - joblib=1.2.0=pyhd8ed1ab_0 83 | - jpeg=9e=h166bdaf_2 84 | - json5=0.9.6=pyhd3eb1b0_0 85 | - jsonschema=4.16.0=pyhd8ed1ab_0 86 | - jupyter_client=7.4.2=pyhd8ed1ab_0 87 | - jupyter_core=4.11.1=py310hff52083_0 88 | - jupyter_server=1.21.0=pyhd8ed1ab_0 89 | - jupyterlab=3.4.8=pyhd8ed1ab_0 90 | - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 91 | - jupyterlab_server=2.16.0=pyhd8ed1ab_0 92 | - jupyterlab_widgets=3.0.3=pyhd8ed1ab_0 93 | - keyutils=1.6.1=h166bdaf_0 94 | - krb5=1.19.3=h3790be6_0 95 | - lame=3.100=h166bdaf_1003 96 | - lcms2=2.12=hddcbb42_0 97 | - ld_impl_linux-64=2.39=hc81fddc_0 98 | - lerc=4.0.0=h27087fc_0 99 | - libblas=3.9.0=12_linux64_mkl 100 | - libbrotlicommon=1.0.9=h166bdaf_7 101 | - libbrotlidec=1.0.9=h166bdaf_7 102 | - libbrotlienc=1.0.9=h166bdaf_7 103 | - libcap=2.66=ha37c62d_0 104 | - libcblas=3.9.0=12_linux64_mkl 105 | - libclang=14.0.6=default_hc1a23ef_0 106 | - libclang13=14.0.6=default_h31cde19_0 107 | - libcups=2.3.3=h3e49a29_2 108 | - libdb=6.2.32=h9c3ff4c_0 109 | - libdeflate=1.14=h166bdaf_0 110 | - libdrm=2.4.113=h166bdaf_0 
111 | - libedit=3.1.20210910=h7f8727e_0 112 | - libevent=2.1.10=h9b69904_4 113 | - libffi=3.4.2=h7f98852_5 114 | - libflac=1.4.1=h27087fc_0 115 | - libgcc-ng=12.2.0=h65d4601_18 116 | - libgfortran-ng=12.2.0=h69a702a_18 117 | - libgfortran5=12.2.0=h337968e_18 118 | - libglib=2.74.0=h7a41b64_0 119 | - libiconv=1.17=h166bdaf_0 120 | - libidn2=2.3.3=h166bdaf_0 121 | - libllvm11=11.1.0=he0ac6c6_5 122 | - libllvm14=14.0.6=he0ac6c6_0 123 | - libnsl=2.0.0=h7f98852_0 124 | - libogg=1.3.5=h27cfd23_1 125 | - libopus=1.3.1=h7f98852_1 126 | - libpciaccess=0.16=h516909a_0 127 | - libpng=1.6.38=h753d276_0 128 | - libpq=14.5=hd77ab85_0 129 | - libsndfile=1.1.0=h27087fc_0 130 | - libsodium=1.0.18=h36c2ea0_1 131 | - libsqlite=3.39.4=h753d276_0 132 | - libstdcxx-ng=12.2.0=h46fd767_18 133 | - libtasn1=4.19.0=h166bdaf_0 134 | - libtiff=4.4.0=h55922b4_4 135 | - libtool=2.4.6=h9c3ff4c_1008 136 | - libudev1=249=h166bdaf_4 137 | - libunistring=0.9.10=h7f98852_0 138 | - libuuid=2.32.1=h7f98852_1000 139 | - libva=2.16.0=h166bdaf_0 140 | - libvorbis=1.3.7=h9c3ff4c_0 141 | - libvpx=1.11.0=h9c3ff4c_3 142 | - libwebp=1.2.4=h522a892_0 143 | - libwebp-base=1.2.4=h166bdaf_0 144 | - libxcb=1.13=h7f98852_1004 145 | - libxgboost=1.6.2=cpu_ha3b9936_1 146 | - libxkbcommon=1.0.3=he3ba5ed_0 147 | - libxml2=2.9.14=h22db469_4 148 | - libzlib=1.2.13=h166bdaf_4 149 | - llvm-openmp=14.0.6=h9e868ea_0 150 | - locket=1.0.0=pyhd8ed1ab_0 151 | - matplotlib-base=3.6.1=py310h8d5ebf3_0 152 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 153 | - matplotlib-venn=0.11.7=pyhd8ed1ab_0 154 | - mistune=2.0.4=pyhd8ed1ab_0 155 | - mkl=2021.4.0=h8d4b97c_729 156 | - mkl_fft=1.3.1=py310h2b4bcf5_1 157 | - mkl_random=1.2.2=py310h00e6091_0 158 | - mpg123=1.30.2=h27087fc_1 159 | - msgpack-python=1.0.5=py310hdf3cbec_0 160 | - munkres=1.1.4=pyh9f0ad1d_0 161 | - mysql-common=8.0.31=haf5c9bc_0 162 | - mysql-libs=8.0.31=h28c427c_0 163 | - nbclassic=0.4.5=pyhd8ed1ab_0 164 | - nbclient=0.7.0=pyhd8ed1ab_0 165 | - nbconvert=7.2.1=pyhd8ed1ab_0 166 | 
- nbconvert-core=7.2.1=pyhd8ed1ab_0 167 | - nbconvert-pandoc=7.2.1=pyhd8ed1ab_0 168 | - nbformat=5.7.0=pyhd8ed1ab_0 169 | - ncurses=6.3=h27087fc_1 170 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 171 | - nettle=3.8.1=hc379101_1 172 | - networkx=3.1=pyhd8ed1ab_0 173 | - notebook=6.5.1=pyha770c72_0 174 | - notebook-shim=0.2.0=pyhd8ed1ab_0 175 | - nspr=4.33=h295c915_0 176 | - nss=3.78=h2350873_0 177 | - numpy-base=1.23.3 178 | - openh264=2.3.1=h27087fc_1 179 | - openssl=1.1.1w=hd590300_0 180 | - p11-kit=0.24.1=hc5aa10d_0 181 | - packaging=21.3=pyhd8ed1ab_0 182 | - pandoc=2.19.2=ha770c72_0 183 | - pandocfilters=1.5.0=pyhd8ed1ab_0 184 | - parso=0.8.3=pyhd8ed1ab_0 185 | - partd=1.4.0=pyhd8ed1ab_0 186 | - patsy=0.5.3=pyhd8ed1ab_0 187 | - pcre2=10.37=hc3806b6_1 188 | - pexpect=4.8.0=pyh9f0ad1d_2 189 | - pickleshare=0.7.5=py_1003 190 | - pip=22.3=pyhd8ed1ab_0 191 | - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 192 | - ply=3.11=py_1 193 | - prometheus_client=0.15.0=pyhd8ed1ab_0 194 | - prompt-toolkit=3.0.31=pyha770c72_0 195 | - pthread-stubs=0.4=h36c2ea0_1001 196 | - ptyprocess=0.7.0=pyhd3deb0d_0 197 | - pulseaudio=14.0=habe0971_10 198 | - pure_eval=0.2.2=pyhd8ed1ab_0 199 | - py-xgboost=1.6.2=cpu_py310hd1aba9c_1 200 | - pycparser=2.21=pyhd8ed1ab_0 201 | - pygments=2.13.0=pyhd8ed1ab_0 202 | - pynndescent=0.5.10=pyh1a96a4e_0 203 | - pyopenssl=22.1.0=pyhd8ed1ab_0 204 | - pyparsing=3.0.9=pyhd8ed1ab_0 205 | - pyqt=5.15.7=py310h29803b5_1 206 | - pysocks=1.7.1=pyha2e5f31_6 207 | - python=3.10.6=h582c2e5_0_cpython 208 | - python-dateutil=2.8.2=pyhd8ed1ab_0 209 | - python-fastjsonschema=2.16.2=pyhd8ed1ab_0 210 | - python-louvain=0.16=pyhd8ed1ab_0 211 | - python_abi=3.10=2_cp310 212 | - pytorch=1.12.1=py3.10_cuda11.3_cudnn8.3.2_0 213 | - pytorch-mutex=1.0=cuda 214 | - pytz=2022.4=pyhd8ed1ab_0 215 | - qt-main=5.15.6=hc525480_0 216 | - readline=8.1.2=h0f457ee_0 217 | - requests=2.28.1=pyhd8ed1ab_1 218 | - seaborn=0.11.2=pyhd3eb1b0_0 219 | - send2trash=1.8.0=pyhd8ed1ab_0 220 | - 
setuptools=65.5.0=pyhd8ed1ab_0 221 | - six=1.16.0=pyh6c4a22f_0 222 | - slicer=0.0.7=pyhd8ed1ab_0 223 | - smart_open=6.3.0=pyhd8ed1ab_1 224 | - sniffio=1.3.0=pyhd8ed1ab_0 225 | - sortedcontainers=2.4.0=pyhd8ed1ab_0 226 | - soupsieve=2.3.2.post1=pyhd8ed1ab_0 227 | - sqlite=3.39.4=h4ff8645_0 228 | - stack_data=0.5.1=pyhd8ed1ab_0 229 | - svt-av1=1.2.1=h27087fc_0 230 | - tabulate=0.9.0=pyhd8ed1ab_1 231 | - tbb=2021.6.0=h924138e_0 232 | - tblib=1.7.0=pyhd8ed1ab_0 233 | - terminado=0.16.0=pyh41d4057_0 234 | - textwrap3=0.9.2=py_0 235 | - threadpoolctl=3.1.0=pyh8a188c0_0 236 | - tinycss2=1.2.1=pyhd8ed1ab_0 237 | - tk=8.6.12=h27826a3_0 238 | - toml=0.10.2=pyhd8ed1ab_0 239 | - tomli=2.0.1=pyhd8ed1ab_0 240 | - toolz=0.12.0=pyhd8ed1ab_0 241 | - tqdm=4.64.1=pyhd8ed1ab_0 242 | - traitlets=5.5.0=pyhd8ed1ab_0 243 | - typing_extensions=4.4.0=pyha770c72_0 244 | - tzdata=2022e=h191b570_0 245 | - urllib3=1.26.11=pyhd8ed1ab_0 246 | - wcwidth=0.2.5=pyh9f0ad1d_2 247 | - webencodings=0.5.1=py_1 248 | - websocket-client=1.4.1=pyhd8ed1ab_0 249 | - wheel=0.37.1=pyhd8ed1ab_0 250 | - widgetsnbextension=4.0.3=pyhd8ed1ab_0 251 | - x264=1!164.3095=h166bdaf_2 252 | - x265=3.5=h924138e_3 253 | - xcb-util=0.4.0=h166bdaf_0 254 | - xcb-util-image=0.4.0=h166bdaf_0 255 | - xcb-util-keysyms=0.4.0=h166bdaf_0 256 | - xcb-util-renderutil=0.3.9=h166bdaf_0 257 | - xcb-util-wm=0.4.1=h166bdaf_0 258 | - xorg-fixesproto=5.0=h7f98852_1002 259 | - xorg-kbproto=1.0.7=h7f98852_1002 260 | - xorg-libx11=1.7.2=h7f98852_0 261 | - xorg-libxau=1.0.9=h7f98852_0 262 | - xorg-libxdmcp=1.1.3=h7f98852_0 263 | - xorg-libxext=1.3.4=h7f98852_1 264 | - xorg-libxfixes=5.0.3=h7f98852_1004 265 | - xorg-xextproto=7.3.0=h7f98852_1002 266 | - xorg-xproto=7.0.31=h7f98852_1007 267 | - xyzservices=2023.2.0=pyhd8ed1ab_0 268 | - xz=5.2.6=h166bdaf_0 269 | - yaml=0.2.5=h7f98852_2 270 | - zeromq=4.3.4=h9c3ff4c_1 271 | - zict=3.0.0=pyhd8ed1ab_0 272 | - zipp=3.9.0=pyhd8ed1ab_0 273 | - zlib=1.2.13=h166bdaf_4 274 | - zstd=1.5.2=h6239696_4 275 | - 
pip: 276 | - appdirs==1.4.4 277 | - argon2-cffi-bindings==21.2.0 278 | - brotlipy==0.7.0 279 | - cffi==1.15.1 280 | - colorcet==3.1.0 281 | - contourpy==1.0.5 282 | - cryptography==38.0.2 283 | - cytoolz==0.12.0 284 | - debugpy==1.6.3 285 | - docker-pycreds==0.4.0 286 | - et-xmlfile==1.1.0 287 | - fonttools==4.37.4 288 | - gensim==4.3.0 289 | - gitdb==4.0.10 290 | - gitpython==3.1.30 291 | - gseapy==1.0.6 292 | - importlib-metadata==4.11.4 293 | - jupyter-core==4.11.1 294 | - kiwisolver==1.4.4 295 | - llvmlite==0.39.1 296 | - loralib==0.1.1 297 | - markupsafe==2.1.1 298 | - matplotlib==3.6.1 299 | - mkl-fft==1.3.1 300 | - mkl-random==1.2.2 301 | - mkl-service==2.4.0 302 | - msgpack==1.0.5 303 | - numba==0.56.4 304 | - numpy==1.23.3 305 | - openpyxl==3.1.2 306 | - pandas==1.5.0 307 | - pathtools==0.1.2 308 | - pillow==9.2.0 309 | - protobuf==4.21.12 310 | - psutil==5.9.3 311 | - pyqt5==5.15.7 312 | - pyqt5-sip==12.11.0 313 | - pyrsistent==0.18.1 314 | - pyyaml==6.0 315 | - pyzmq==24.0.1 316 | - scikit-learn==1.1.2 317 | - scipy==1.9.1 318 | - sentry-sdk==1.15.0 319 | - setproctitle==1.3.2 320 | - shap==0.41.0 321 | - sip==6.7.2 322 | - smmap==5.0.0 323 | - statsmodels==0.13.5 324 | - torch==1.12.1 325 | - torchaudio==0.12.1 326 | - torchvision==0.13.1 327 | - tornado==6.2 328 | - umap-learn==0.5.3 329 | - unicodedata2==14.0.0 330 | - wandb==0.13.10 331 | - xgboost==1.6.2 332 | prefix: /home/samsonm/miniconda3/envs/multi_modal_DL 333 | --------------------------------------------------------------------------------