├── Cancer ├── experiments.ipynb ├── grouped_embeddings_to_matrices.ipynb ├── process_EHR_data_omics.ipynb └── process_PT_data.ipynb ├── LICENSE ├── Onset of Labor ├── data │ ├── data_generation.ipynb │ ├── ool_EHR_features.csv │ ├── processed_data │ │ ├── RNN_data_codes_with_obs.npy │ │ ├── RNN_data_codes_with_obs_word2vec_from_ool.npy │ │ ├── RNN_data_full_EHR_cohort_with_obs_fixed.npy │ │ ├── RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy │ │ ├── RNN_data_lengths_with_obs.npy │ │ ├── RNN_data_lengths_with_obs_word2vec_from_ool.npy │ │ ├── RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy │ │ ├── RNN_data_outcomes_with_obs.npy │ │ ├── RNN_data_outcomes_with_obs_word2vec_from_ool.npy │ │ ├── ool_proteomics_omop_id.csv │ │ ├── sampleID_indices.csv │ │ ├── sampleID_indices_full_cohort_with_obs_fixed.csv │ │ ├── sampleID_indices_with_obs.csv │ │ └── sampleID_indices_with_obs_word2vec_from_ool.csv │ └── raw_data │ │ └── EHR │ │ ├── EHR_cohort_conditions.csv │ │ ├── EHR_cohort_drugs.csv │ │ ├── EHR_cohort_measurements.csv │ │ ├── EHR_cohort_observations.csv │ │ ├── EHR_cohort_procedures.csv │ │ ├── full_EHR_cohort_conditions.csv │ │ ├── full_EHR_cohort_drugs.csv │ │ ├── full_EHR_cohort_measurements.csv │ │ ├── full_EHR_cohort_observations.csv │ │ └── full_EHR_cohort_procedures.csv ├── experiments.ipynb ├── process_EHR_data_full_PT_cohort.ipynb ├── process_EHR_data_omics_cohort.ipynb └── process_EHR_data_omics_cohort_with_PT_word2vec.ipynb ├── README.md └── environment.yml /Cancer/grouped_embeddings_to_matrices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from tqdm.notebook import tqdm\n", 14 | "import os\n", 15 | "from datetime import timedelta\n", 16 | "import pyspark\n", 17 | "import dxpy\n", 18 | 
"import dxdata\n", 19 | "import pandas as pd\n", 20 | "import random\n", 21 | "from pyspark.sql import functions as F\n", 22 | "from pyspark.sql import SparkSession\n", 23 | "from pyspark.ml.feature import Word2Vec\n", 24 | "from pyspark.sql.functions import col, udf, to_date, mean, expr\n", 25 | "from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType\n", 26 | "from pyspark.ml.feature import Word2Vec\n", 27 | "from pyspark.sql.window import Window\n", 28 | "import ast\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "sc = pyspark.SparkContext()\n", 40 | "spark = pyspark.sql.SparkSession(sc)\n", 41 | "dispensed_database_name = dxpy.find_one_data_object(classname=\"database\", name=\"app*\", folder=\"/\", name_mode=\"glob\", describe=True)[\"describe\"][\"name\"]\n", 42 | "dispensed_dataset_id = dxpy.find_one_data_object(typename=\"Dataset\", name=\"app*.dataset\", folder=\"/\", name_mode=\"glob\")[\"id\"]\n", 43 | "spark.sql(\"USE \" + dispensed_database_name)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Omics Data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "#contains death records\n", 62 | "death = pd.read_csv('/mnt/project/death.csv').drop('Unnamed: 0',axis=1).drop_duplicates()\n", 63 | "death['death_date'] = pd.to_datetime(death['death_date'], yearfirst=True)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "#contains all cancer diagnoses for omics patients\n", 75 | "prot_date = pd.read_csv('/mnt/project/cancer_conds.csv', usecols=['eid','proteomics_date']).drop_duplicates()\n", 76 | "prot_date['proteomics_date'] = 
pd.to_datetime(prot_date['proteomics_date'], yearfirst=True)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "tags": [] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "pdf = prot_date.merge(death, how='inner', on='eid')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "tags": [] 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "pdf = pdf[pdf['death_date'] <= (pdf['proteomics_date'] + pd.DateOffset(years=3))]\n", 99 | "pdf['indicator'] = 1" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "tags": [] 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "patient_day_embeddings = pd.read_csv('/mnt/project/patient_day_embeddings_omics_omicsword2vec_lc.csv').drop('Unnamed: 0',axis=1)\n", 111 | "max_dates = 32" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "patient_day_embeddings = patient_day_embeddings.sort_values(['eid','record_date'])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "tags": [] 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "unique_patients = patient_day_embeddings['eid'].nunique()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#create np matrix to store input data, assign each patient to an index\n", 145 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['eid'].unique())}\n", 146 | "RNN_data = np.full((400, max_dates, unique_patients), np.nan)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "tags": [] 154 | }, 155 | "outputs": [], 156 | "source": [ 
157 | "%%time\n", 158 | "#populate np matrix with data\n", 159 | "date_position = {}\n", 160 | "for index, row in tqdm(patient_day_embeddings.iterrows(), total=patient_day_embeddings.shape[0]):\n", 161 | " patient_id = row['eid']\n", 162 | " patient_index = patient_id_to_index[patient_id]\n", 163 | " \n", 164 | " if patient_id not in date_position:\n", 165 | " date_position[patient_id] = 0\n", 166 | " else:\n", 167 | " date_position[patient_id] += 1\n", 168 | " \n", 169 | " date_index = date_position[patient_id]\n", 170 | " \n", 171 | " for feature_index, feature_value in enumerate(row.drop(['eid', 'record_date'])):\n", 172 | " if date_index < max_dates:\n", 173 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "tags": [] 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "RNN_data = RNN_data.transpose(2,1,0)\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "np.save('RNN_data_omics_omicsw2v_lc.npy', RNN_data)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "tags": [] 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "%%bash\n", 207 | "dx upload RNN_data_omics_omicsw2v_lc.npy --path /" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "tags": [] 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('eid_indices_omics_omicsw2v_lc.csv')\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "tags": [] 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "%%bash\n", 230 | "dx upload eid_indices_omics_omicsw2v_lc.csv --path /" 231 | ] 232 
| }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "tags": [] 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "np.save('./visit_count_omics_omicsw2v_lc.npy',patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "tags": [] 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "%%bash\n", 253 | "dx upload visit_count_omics_omicsw2v_lc.npy --path /" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "tags": [] 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "idx_df = pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "tags": [] 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "pdf['eid'] = pdf['eid'].astype(int)\n", 276 | "outcomes = idx_df.merge(pdf[['eid','indicator']], how='left', left_on=0, right_on='eid').fillna(0).sort_values(1)['indicator'].astype(int).values" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "tags": [] 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "np.save('./outcomes_omics_omicsw2v_lc_3yr.npy', outcomes)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "tags": [] 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "%%bash\n", 299 | "dx upload outcomes_omics_omicsw2v_lc_3yr.npy --path /" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "### PT Data" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "tags": [] 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "patient_day_embeddings = 
pd.read_csv('/mnt/project/patient_day_embeddings_PT_lc_LARGER.csv').drop('Unnamed: 0',axis=1)\n", 318 | "max_dates = 32" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "tags": [] 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "patient_day_embeddings = patient_day_embeddings.drop_duplicates(['eid','record_date'])" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "tags": [] 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "eids_omics = pd.read_csv('/mnt/project/eid_indices_omics_omicsw2v_lc.csv')['0']" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "tags": [] 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "patient_day_embeddings = patient_day_embeddings[~patient_day_embeddings['eid'].isin(eids_omics)]" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "patient_day_embeddings.shape" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "tags": [] 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "patient_day_embeddings['eid'].nunique()" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "tags": [] 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "combined_query = spark.sql(\"\"\"\n", 385 | "WITH EarliestCConds AS (\n", 386 | " SELECT \n", 387 | " c.eid,\n", 388 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 389 | " FROM \n", 390 | " omop_condition_occurrence c\n", 391 | " WHERE \n", 392 | " c.condition_source_value LIKE 'C%'\n", 393 | " GROUP BY \n", 394 | " c.eid\n", 395 | "),\n", 396 | "FilteredPatients AS (\n", 397 | " SELECT \n", 398 | " ecc.eid,\n", 399 | " ecc.earliest_cond_date,\n", 400 
| " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 401 | " FROM \n", 402 | " EarliestCConds ecc\n", 403 | " INNER JOIN \n", 404 | " participant_0001 p ON ecc.eid = p.eid\n", 405 | " WHERE \n", 406 | " ecc.earliest_cond_date <= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), 60) AND\n", 407 | " ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 408 | ")\n", 409 | "\n", 410 | "SELECT DISTINCT\n", 411 | " fp.eid, \n", 412 | " d.death_date,\n", 413 | " fp.earliest_cond_date AS proteomics_date\n", 414 | "FROM \n", 415 | " FilteredPatients fp\n", 416 | "JOIN\n", 417 | " omop_death d ON d.eid=fp.eid\n", 418 | "\n", 419 | "\"\"\")" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "tags": [] 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "%%time\n", 431 | "#query for death data and cancer diagnosis date\n", 432 | "combined_query_results = combined_query.collect()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "tags": [] 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "%%time\n", 444 | "pdf = pd.DataFrame(combined_query_results, columns=[field.name for field in combined_query.schema.fields])" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "tags": [] 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "pdf['proteomics_date'] = pd.to_datetime(pdf['proteomics_date'], yearfirst=True)\n", 456 | "pdf['death_date'] = pd.to_datetime(pdf['death_date'], dayfirst=True)\n", 457 | "pdf = pdf.drop_duplicates()\n", 458 | "pdf = pdf[pdf['death_date'] <= (pdf['proteomics_date'] + pd.DateOffset(years=3))]\n", 459 | "pdf['indicator'] = 1" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "tags": [] 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "patient_day_embeddings = 
patient_day_embeddings.sort_values(['eid','record_date'])" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "tags": [] 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "unique_patients = patient_day_embeddings['eid'].nunique()" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "tags": [] 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['eid'].unique())}\n", 493 | "RNN_data = np.full((400, max_dates, unique_patients), np.nan)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "tags": [] 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "%%time\n", 505 | "date_position = {}\n", 506 | "for index, row in tqdm(patient_day_embeddings.iterrows(), total=patient_day_embeddings.shape[0]):\n", 507 | " patient_id = row['eid']\n", 508 | " patient_index = patient_id_to_index[patient_id]\n", 509 | " \n", 510 | " if patient_id not in date_position:\n", 511 | " date_position[patient_id] = 0\n", 512 | " else:\n", 513 | " date_position[patient_id] += 1\n", 514 | " \n", 515 | " date_index = date_position[patient_id]\n", 516 | " \n", 517 | " for feature_index, feature_value in enumerate(row.drop(['eid', 'record_date'])):\n", 518 | " if date_index < max_dates:\n", 519 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "tags": [] 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "RNN_data = RNN_data.transpose(2,1,0)\n" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": { 537 | "tags": [] 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "np.save('RNN_data_PT_lc_LARGER.npy', RNN_data)" 542 | ] 543 | }, 544 | { 545 | 
"cell_type": "code", 546 | "execution_count": null, 547 | "metadata": { 548 | "tags": [] 549 | }, 550 | "outputs": [], 551 | "source": [ 552 | "%%bash\n", 553 | "dx upload RNN_data_PT_lc_LARGER.npy --path /" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": { 560 | "tags": [] 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('eid_indices_PT_lc_LARGER.csv')\n" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "tags": [] 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "%%bash\n", 576 | "dx upload eid_indices_PT_lc_LARGER.csv --path /" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "tags": [] 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "np.save('./visit_count_PT_lc_LARGER.npy',patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": { 594 | "tags": [] 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "%%bash\n", 599 | "dx upload visit_count_PT_lc_LARGER.npy --path /" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": { 606 | "tags": [] 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "patient_day_embeddings['eid'].nunique()" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "tags": [] 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "patient_day_embeddings.groupby('eid').count().sort_values('eid')['record_date'].values" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": { 628 | "tags": [] 629 | }, 630 | "outputs": [], 631 | "source": [ 632 | "idx_df = 
pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": { 639 | "tags": [] 640 | }, 641 | "outputs": [], 642 | "source": [ 643 | "#one patient has duplicated death but the two records are a day apart and doesn't affect labeling\n", 644 | "pdf = pdf.drop_duplicates('eid')\n" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": { 651 | "tags": [] 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "pdf['eid'] = pdf['eid'].astype(int)\n", 656 | "outcomes = idx_df.merge(pdf[['eid','indicator']], how='left', left_on=0, right_on='eid').fillna(0).sort_values(1)['indicator'].astype(int).values" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": { 663 | "tags": [] 664 | }, 665 | "outputs": [], 666 | "source": [ 667 | "outcomes.mean()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": { 674 | "tags": [] 675 | }, 676 | "outputs": [], 677 | "source": [ 678 | "np.save('./outcomes_PT_lc_LARGER.npy', outcomes)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": { 685 | "tags": [] 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "%%bash\n", 690 | "dx upload outcomes_PT_lc_LARGER.npy --path /" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": { 697 | "tags": [] 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "len(outcomes)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "metadata": {}, 707 | "source": [ 708 | "### Pull Proteomics" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": { 715 | "tags": [] 716 | }, 717 | "outputs": [], 718 | "source": [ 719 | "%%time\n", 720 | "# Initialize Spark session\n", 721 | "spark = 
SparkSession.builder.appName(\"Proteomics Data Aggregation\").getOrCreate()\n", 722 | "\n", 723 | "# List of table names\n", 724 | "table_names = [f\"olink_instance_0_{str(i).zfill(4)}\" for i in range(1, 13)]\n", 725 | "\n", 726 | "# Create DataFrame for the first table\n", 727 | "combined_df = spark.table(table_names[0])\n", 728 | "\n", 729 | "# Join the rest of the tables\n", 730 | "for table_name in table_names[1:]:\n", 731 | " # Join each table on 'eid'\n", 732 | " next_table_df = spark.table(table_name)\n", 733 | " combined_df = combined_df.join(next_table_df, \"eid\", \"left\")\n", 734 | "\n", 735 | "# Write the result to a CSV file\n", 736 | "combined_df.write.csv(\"all_proteomics\", header=True)\n" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": { 743 | "tags": [] 744 | }, 745 | "outputs": [], 746 | "source": [ 747 | "%%bash\n", 748 | "hdfs dfs -ls ./all_proteomics" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "metadata": { 755 | "tags": [] 756 | }, 757 | "outputs": [], 758 | "source": [ 759 | "%%bash\n", 760 | "hdfs dfs -get ./all_proteomics ./\n" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": { 767 | "tags": [] 768 | }, 769 | "outputs": [], 770 | "source": [ 771 | "%%time\n", 772 | "# Directory containing your CSV files\n", 773 | "directory = './all_proteomics/'\n", 774 | "\n", 775 | "# Read and combine all CSV files in the directory\n", 776 | "all_csvs = [pd.read_csv(os.path.join(directory, file)) for file in os.listdir(directory) if file.endswith('.csv')]\n", 777 | "combined_df = pd.concat(all_csvs, ignore_index=True)\n", 778 | "\n", 779 | "# Write the combined DataFrame to a new CSV file\n", 780 | "combined_df.to_csv('./all_proteomics_lc.csv', index=False)\n" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": { 787 | "tags": [] 788 | }, 789 | "outputs": 
[], 790 | "source": [ 791 | "%%bash\n", 792 | "dx upload all_proteomics_lc.csv --path /" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [] 801 | } 802 | ], 803 | "metadata": { 804 | "kernelspec": { 805 | "display_name": "Python 3", 806 | "language": "python", 807 | "name": "python3" 808 | }, 809 | "language_info": { 810 | "codemirror_mode": { 811 | "name": "ipython", 812 | "version": 3 813 | }, 814 | "file_extension": ".py", 815 | "mimetype": "text/x-python", 816 | "name": "python", 817 | "nbconvert_exporter": "python", 818 | "pygments_lexer": "ipython3", 819 | "version": "3.8.5" 820 | } 821 | }, 822 | "nbformat": 4, 823 | "nbformat_minor": 4 824 | } 825 | -------------------------------------------------------------------------------- /Cancer/process_EHR_data_omics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pyspark\n", 12 | "import dxpy\n", 13 | "import dxdata\n", 14 | "import pandas as pd\n", 15 | "import random\n", 16 | "from pyspark.sql import functions as F\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "from pyspark.ml.feature import Word2Vec\n", 19 | "from pyspark.sql.functions import col, udf, to_date, mean, expr, concat_ws\n", 20 | "from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType\n", 21 | "from pyspark.ml.feature import Word2Vec\n", 22 | "from pyspark.sql.window import Window\n", 23 | "import ast" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "tags": [] 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "sc = pyspark.SparkContext()\n", 35 | "spark = pyspark.sql.SparkSession(sc)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | 
"metadata": { 42 | "tags": [] 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "dispensed_database_name = dxpy.find_one_data_object(classname=\"database\", name=\"app*\", folder=\"/\", name_mode=\"glob\", describe=True)[\"describe\"][\"name\"]\n", 47 | "dispensed_dataset_id = dxpy.find_one_data_object(typename=\"Dataset\", name=\"app*.dataset\", folder=\"/\", name_mode=\"glob\")[\"id\"]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "tags": [] 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "spark.sql(\"USE \" + dispensed_database_name)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Word2Vec Omics Cohort " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "\n", 77 | "combined_query = spark.sql(\"\"\"\n", 78 | "WITH EarliestCConds AS (\n", 79 | " SELECT \n", 80 | " c.eid,\n", 81 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 82 | " FROM \n", 83 | " omop_condition_occurrence c\n", 84 | " INNER JOIN \n", 85 | " olink_instance_0_0001 o ON c.eid = o.eid\n", 86 | " WHERE \n", 87 | " c.condition_source_value LIKE 'C%'\n", 88 | " GROUP BY \n", 89 | " c.eid\n", 90 | "),\n", 91 | "FilteredPatients AS (\n", 92 | " SELECT \n", 93 | " ecc.eid,\n", 94 | " ecc.earliest_cond_date,\n", 95 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 96 | " FROM \n", 97 | " EarliestCConds ecc\n", 98 | " INNER JOIN \n", 99 | " participant_0001 p ON ecc.eid = p.eid\n", 100 | " WHERE \n", 101 | " ecc.earliest_cond_date < TO_DATE(p.p53_i0, 'yyyy-MM-dd')\n", 102 | " AND ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 103 | ")\n", 104 | "\n", 105 | "SELECT \n", 106 | " fp.eid, \n", 107 | " c.concept_id, \n", 108 | " c.record_date,\n", 109 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 
110 | "FROM \n", 111 | " FilteredPatients fp\n", 112 | "JOIN (\n", 113 | " SELECT \n", 114 | " o.eid, \n", 115 | " o.condition_concept_id as concept_id, \n", 116 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 117 | " FROM \n", 118 | " omop_condition_occurrence o\n", 119 | " UNION ALL\n", 120 | " SELECT \n", 121 | " o.eid, \n", 122 | " o.procedure_concept_id as concept_id, \n", 123 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 124 | " FROM \n", 125 | " omop_procedure_occurrence o\n", 126 | " UNION ALL\n", 127 | " SELECT \n", 128 | " o.eid, \n", 129 | " o.drug_concept_id as concept_id, \n", 130 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 131 | " FROM \n", 132 | " omop_drug_exposure o\n", 133 | " UNION ALL\n", 134 | " SELECT \n", 135 | " o.eid, \n", 136 | " o.observation_concept_id as concept_id, \n", 137 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 138 | " FROM \n", 139 | " omop_observation o\n", 140 | " UNION ALL\n", 141 | " SELECT \n", 142 | " o.eid, \n", 143 | " o.measurement_concept_id as concept_id, \n", 144 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 145 | " FROM \n", 146 | " omop_measurement o\n", 147 | ") c ON fp.eid = c.eid\n", 148 | "\n", 149 | "\n", 150 | "\"\"\")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "tags": [] 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "%%time\n", 162 | "distinct_eids = combined_query.select(\"eid\").distinct()\n", 163 | "num_distinct_eids = distinct_eids.count()\n", 164 | "\n", 165 | "print(f\"Number of distinct eids: {num_distinct_eids}\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "tags": [] 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "%%time\n", 177 | "\n", 178 | "combined_query.show()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 
184 | "metadata": { 185 | "tags": [] 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "%%time\n", 190 | "# Count the number of rows in the result\n", 191 | "row_count = combined_query.count()\n", 192 | "\n", 193 | "# Print the row count\n", 194 | "print(f\"Number of rows in the query result: {row_count}\")" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "tags": [] 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "%%time\n", 206 | "\n", 207 | "# Initialize Spark Session\n", 208 | "spark = SparkSession.builder.appName(\"Word2Vec Training\").getOrCreate()\n", 209 | "\n", 210 | "combined_query = combined_query.withColumn(\"concept_id\", combined_query[\"concept_id\"].cast(IntegerType()))\n", 211 | "\n", 212 | "# Group by 'eid' (person_id) and 'month_year'\n", 213 | "grouped_data = (combined_query.groupBy(\"eid\", \"formatted_date\")\n", 214 | " .agg(F.collect_list(\"concept_id\").alias(\"concept_ids\")))\n", 215 | "\n", 216 | "# Define a UDF to convert integers to strings\n", 217 | "int_to_string_udf = udf(lambda x: [str(i) for i in x], ArrayType(StringType()))\n", 218 | "\n", 219 | "# Apply the UDF to the 'concept_ids' column\n", 220 | "word2Vec_data = grouped_data.withColumn(\"words\", int_to_string_udf(col(\"concept_ids\")))\n", 221 | "\n", 222 | "# Define the Word2Vec model\n", 223 | "print('started training')\n", 224 | "word2vec = Word2Vec(vectorSize=400, windowSize=100, minCount=5, inputCol=\"words\", outputCol=\"wordVectors\").setMaxIter(3)\n", 225 | "\n", 226 | "# Fit the model\n", 227 | "model = word2vec.fit(word2Vec_data)\n", 228 | "print('done training')\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "tags": [] 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "word_vectors = model.getVectors()\n", 240 | "\n", 241 | "pandas_df = word_vectors.toPandas()\n", 242 | "\n", 243 | "pandas_df.to_csv(\"./omics_lc_word2vec.csv\", 
index=False)\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "%%bash\n", 255 | "dx upload omics_lc_word2vec.csv --path /" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Word2Vec PT Cohort" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "tags": [] 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "combined_query = spark.sql(\"\"\"\n", 274 | "WITH EarliestCConds AS (\n", 275 | " SELECT \n", 276 | " c.eid,\n", 277 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 278 | " FROM \n", 279 | " omop_condition_occurrence c\n", 280 | " WHERE \n", 281 | " c.condition_source_value LIKE 'C%'\n", 282 | " GROUP BY \n", 283 | " c.eid\n", 284 | "),\n", 285 | "FilteredPatients AS (\n", 286 | " SELECT \n", 287 | " ecc.eid,\n", 288 | " ecc.earliest_cond_date,\n", 289 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 290 | " FROM \n", 291 | " EarliestCConds ecc\n", 292 | " INNER JOIN \n", 293 | " participant_0001 p ON ecc.eid = p.eid\n", 294 | " WHERE \n", 295 | " ecc.earliest_cond_date < TO_DATE(p.p53_i0, 'yyyy-MM-dd')\n", 296 | " AND ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 297 | ")\n", 298 | "\n", 299 | "SELECT \n", 300 | " fp.eid, \n", 301 | " c.concept_id, \n", 302 | " c.record_date,\n", 303 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 304 | "FROM \n", 305 | " FilteredPatients fp\n", 306 | "JOIN (\n", 307 | " SELECT \n", 308 | " o.eid, \n", 309 | " o.condition_concept_id as concept_id, \n", 310 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 311 | " FROM \n", 312 | " omop_condition_occurrence o\n", 313 | " UNION ALL\n", 314 | " SELECT \n", 315 | " o.eid, \n", 316 | " o.procedure_concept_id as concept_id, \n", 
317 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 318 | " FROM \n", 319 | " omop_procedure_occurrence o\n", 320 | " UNION ALL\n", 321 | " SELECT \n", 322 | " o.eid, \n", 323 | " o.drug_concept_id as concept_id, \n", 324 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 325 | " FROM \n", 326 | " omop_drug_exposure o\n", 327 | " UNION ALL\n", 328 | " SELECT \n", 329 | " o.eid, \n", 330 | " o.observation_concept_id as concept_id, \n", 331 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 332 | " FROM \n", 333 | " omop_observation o\n", 334 | " UNION ALL\n", 335 | " SELECT \n", 336 | " o.eid, \n", 337 | " o.measurement_concept_id as concept_id, \n", 338 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 339 | " FROM \n", 340 | " omop_measurement o\n", 341 | ") c ON fp.eid = c.eid\n", 342 | "\n", 343 | "\"\"\")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "tags": [] 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "%%time\n", 355 | "# Count the number of rows in the result\n", 356 | "row_count = combined_query.count()\n", 357 | "\n", 358 | "# Print the row count\n", 359 | "print(f\"Number of rows in the query result: {row_count}\")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "tags": [] 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "%%time\n", 371 | "distinct_eids = combined_query.select(\"eid\").distinct()\n", 372 | "num_distinct_eids = distinct_eids.count()\n", 373 | "\n", 374 | "print(f\"Number of distinct eids: {num_distinct_eids}\")" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "tags": [] 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "%%time\n", 386 | "\n", 387 | "# Initialize Spark Session\n", 388 | "spark = SparkSession.builder.appName(\"Word2Vec Training\").getOrCreate()\n", 389 | 
"\n", 390 | "combined_query = combined_query.withColumn(\"concept_id\", combined_query[\"concept_id\"].cast(IntegerType()))\n", 391 | "\n", 392 | "# Group by 'eid' (person_id) and 'month_year'\n", 393 | "grouped_data = (combined_query.groupBy(\"eid\", \"formatted_date\")\n", 394 | " .agg(F.collect_list(\"concept_id\").alias(\"concept_ids\")))\n", 395 | "\n", 396 | "# Define a UDF to convert integers to strings\n", 397 | "int_to_string_udf = udf(lambda x: [str(i) for i in x], ArrayType(StringType()))\n", 398 | "\n", 399 | "# Apply the UDF to the 'concept_ids' column\n", 400 | "word2Vec_data = grouped_data.withColumn(\"words\", int_to_string_udf(col(\"concept_ids\")))\n", 401 | "\n", 402 | "# Define the Word2Vec model\n", 403 | "print('started training')\n", 404 | "word2vec = Word2Vec(vectorSize=400, windowSize=100, minCount=5, inputCol=\"words\", outputCol=\"wordVectors\").setMaxIter(3)\n", 405 | "\n", 406 | "# Fit the model\n", 407 | "model = word2vec.fit(word2Vec_data)\n", 408 | "print('done training')\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "tags": [] 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "word_vectors = model.getVectors()\n", 420 | "\n", 421 | "pandas_df = word_vectors.toPandas()\n", 422 | "\n", 423 | "pandas_df.to_csv(\"./PT_lc_word2vec.csv\", index=False)\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "tags": [] 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "%%bash\n", 435 | "dx upload PT_lc_word2vec.csv --path /" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "### Downstream Processing for omics cohort only" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 
| "tags": [] 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "word_vectors = pd.read_csv('./PT_lc_word2vec.csv')\n", 461 | "word_vectors['vector'] = word_vectors['vector'].apply(ast.literal_eval)\n", 462 | "\n" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "tags": [] 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "combined_query = spark.sql(\"\"\"\n", 474 | "WITH EarliestCConds AS (\n", 475 | " SELECT \n", 476 | " c.eid,\n", 477 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 478 | " FROM \n", 479 | " omop_condition_occurrence c\n", 480 | " INNER JOIN \n", 481 | " olink_instance_0_0001 o ON c.eid = o.eid\n", 482 | " WHERE \n", 483 | " c.condition_source_value LIKE 'C%'\n", 484 | " GROUP BY \n", 485 | " c.eid\n", 486 | "),\n", 487 | "FilteredPatients AS (\n", 488 | " SELECT \n", 489 | " ecc.eid,\n", 490 | " ecc.earliest_cond_date,\n", 491 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 492 | " FROM \n", 493 | " EarliestCConds ecc\n", 494 | " INNER JOIN \n", 495 | " participant_0001 p ON ecc.eid = p.eid\n", 496 | " WHERE \n", 497 | " ecc.earliest_cond_date < TO_DATE(p.p53_i0, 'yyyy-MM-dd')\n", 498 | " AND ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 499 | ")\n", 500 | "\n", 501 | "SELECT \n", 502 | " fp.eid, \n", 503 | " c.concept_id, \n", 504 | " c.record_date,\n", 505 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 506 | "FROM \n", 507 | " FilteredPatients fp\n", 508 | "JOIN (\n", 509 | " SELECT \n", 510 | " o.eid, \n", 511 | " o.condition_concept_id as concept_id, \n", 512 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 513 | " FROM \n", 514 | " omop_condition_occurrence o\n", 515 | " UNION ALL\n", 516 | " SELECT \n", 517 | " o.eid, \n", 518 | " o.procedure_concept_id as concept_id, \n", 519 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 520 | " FROM \n", 
521 | " omop_procedure_occurrence o\n", 522 | " UNION ALL\n", 523 | " SELECT \n", 524 | " o.eid, \n", 525 | " o.drug_concept_id as concept_id, \n", 526 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 527 | " FROM \n", 528 | " omop_drug_exposure o\n", 529 | " UNION ALL\n", 530 | " SELECT \n", 531 | " o.eid, \n", 532 | " o.observation_concept_id as concept_id, \n", 533 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 534 | " FROM \n", 535 | " omop_observation o\n", 536 | " UNION ALL\n", 537 | " SELECT \n", 538 | " o.eid, \n", 539 | " o.measurement_concept_id as concept_id, \n", 540 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 541 | " FROM \n", 542 | " omop_measurement o\n", 543 | ") c ON fp.eid = c.eid AND c.record_date <= fp.proteomics_date\n", 544 | "\n", 545 | "\"\"\")" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": { 552 | "tags": [] 553 | }, 554 | "outputs": [], 555 | "source": [ 556 | "%%time\n", 557 | "combined_query_results = combined_query.collect()" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "tags": [] 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "%%time\n", 569 | "pdf = pd.DataFrame(combined_query_results, columns=[field.name for field in combined_query.schema.fields])" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "tags": [] 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "word_vectors['word'] = word_vectors['word'].astype(str)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": { 587 | "tags": [] 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "pdf = pdf.merge(word_vectors, how='inner', left_on='concept_id', right_on='word').drop(['word','concept_id'],axis=1)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 
| "metadata": { 598 | "tags": [] 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "pdf.shape, pdf['eid'].nunique()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "tags": [] 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "%%time\n", 614 | "embeddings_df = pd.DataFrame(pdf['vector'].tolist(), index=pdf.index)\n", 615 | "embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]\n", 616 | "\n", 617 | "# Join the new DataFrame with the original DataFrame\n", 618 | "embedded_codes = pdf.join(embeddings_df)\n" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": { 625 | "tags": [] 626 | }, 627 | "outputs": [], 628 | "source": [ 629 | "embedded_codes.shape" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": { 636 | "tags": [] 637 | }, 638 | "outputs": [], 639 | "source": [ 640 | "# Convert 'record_date' to datetime format in Pandas\n", 641 | "embedded_codes['record_date'] = pd.to_datetime(embedded_codes['record_date'], format='%Y-%m-%d')" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": { 648 | "tags": [] 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "%%time\n", 653 | "max_dates=32\n", 654 | "\n", 655 | "# 1. Sort the DataFrame\n", 656 | "embedded_codes = embedded_codes.sort_values(by=['eid', 'record_date'], ascending=[True, False])\n", 657 | "\n", 658 | "# 2. Rank within each 'eid' group\n", 659 | "embedded_codes['date_rank'] = embedded_codes.groupby('eid')['record_date'].rank(method='dense', ascending=False)\n", 660 | "\n", 661 | "# 3. Filter based on rank\n", 662 | "filtered_data_pd = embedded_codes[embedded_codes['date_rank'] <= max_dates]\n", 663 | "\n", 664 | "# 4. 
Define your aggregation expressions\n", 665 | "agg_funcs = {f'embedding_{i}': 'mean' for i in range(400)}\n", 666 | "\n", 667 | "# Apply aggregation with the defined expressions\n", 668 | "patient_day_embeddings_pd = filtered_data_pd.groupby(['eid', 'record_date']).agg(agg_funcs)\n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": { 675 | "tags": [] 676 | }, 677 | "outputs": [], 678 | "source": [ 679 | "%%time\n", 680 | "patient_day_embeddings_pd.reset_index().to_csv('./patient_day_embeddings_omics_PTword2vec_lc.csv', header=True)" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": { 687 | "tags": [] 688 | }, 689 | "outputs": [], 690 | "source": [ 691 | "%%bash\n", 692 | "dx upload patient_day_embeddings_omics_PTword2vec_lc.csv --path /" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [] 701 | } 702 | ], 703 | "metadata": { 704 | "kernelspec": { 705 | "display_name": "Python 3", 706 | "language": "python", 707 | "name": "python3" 708 | }, 709 | "language_info": { 710 | "codemirror_mode": { 711 | "name": "ipython", 712 | "version": 3 713 | }, 714 | "file_extension": ".py", 715 | "mimetype": "text/x-python", 716 | "name": "python", 717 | "nbconvert_exporter": "python", 718 | "pygments_lexer": "ipython3", 719 | "version": "3.8.5" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 4 724 | } 725 | -------------------------------------------------------------------------------- /Cancer/process_PT_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pyspark\n", 12 | "import dxpy\n", 13 | "import dxdata\n", 14 | "import pandas as pd\n", 15 | "import random\n", 16 | 
"from pyspark.sql import functions as F\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "from pyspark.ml.feature import Word2Vec\n", 19 | "from pyspark.sql.functions import col, udf, to_date, mean, expr\n", 20 | "from pyspark.sql.types import StringType, ArrayType, IntegerType, DoubleType\n", 21 | "from pyspark.ml.feature import Word2Vec\n", 22 | "from pyspark.sql.window import Window\n", 23 | "import ast\n", 24 | "import numpy as np\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "tags": [] 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "spark = SparkSession.builder \\\n", 36 | " .appName(\"MyApp\") \\\n", 37 | " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", 38 | " .config(\"spark.kryoserializer.buffer.max\", \"1g\") \\\n", 39 | " .getOrCreate()\n", 40 | "\n", 41 | "# The SparkContext is accessible from the SparkSession as follows:\n", 42 | "sc = spark.sparkContext" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "dispensed_database_name = dxpy.find_one_data_object(classname=\"database\", name=\"app*\", folder=\"/\", name_mode=\"glob\", describe=True)[\"describe\"][\"name\"]\n", 54 | "dispensed_dataset_id = dxpy.find_one_data_object(typename=\"Dataset\", name=\"app*.dataset\", folder=\"/\", name_mode=\"glob\")[\"id\"]" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "DataFrame[]" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "spark.sql(\"USE \" + dispensed_database_name)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "tags": [] 84 | }, 85 | "outputs": [], 86 | "source": [ 87 
| "#cancer patients with initial diagnosis at most 12 months before initial UKBB visit\n", 88 | "combined_query = spark.sql(\"\"\"\n", 89 | "WITH EarliestCConds AS (\n", 90 | " SELECT \n", 91 | " c.eid,\n", 92 | " MIN(TO_DATE(c.condition_start_date, 'dd/MM/yyyy')) as earliest_cond_date\n", 93 | " FROM \n", 94 | " omop_condition_occurrence c\n", 95 | " WHERE \n", 96 | " c.condition_source_value LIKE 'C%'\n", 97 | " GROUP BY \n", 98 | " c.eid\n", 99 | "),\n", 100 | "FilteredPatients AS (\n", 101 | " SELECT \n", 102 | " ecc.eid,\n", 103 | " ecc.earliest_cond_date,\n", 104 | " TO_DATE(p.p53_i0, 'yyyy-MM-dd') AS proteomics_date \n", 105 | " FROM \n", 106 | " EarliestCConds ecc\n", 107 | " INNER JOIN \n", 108 | " participant_0001 p ON ecc.eid = p.eid\n", 109 | " WHERE \n", 110 | " ecc.earliest_cond_date <= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), 60) AND\n", 111 | " ecc.earliest_cond_date >= ADD_MONTHS(TO_DATE(p.p53_i0, 'yyyy-MM-dd'), -12)\n", 112 | ")\n", 113 | "\n", 114 | "SELECT \n", 115 | " fp.eid, \n", 116 | " c.concept_id, \n", 117 | " c.record_date,\n", 118 | " DATE_FORMAT(c.record_date, 'yyyy-MM-dd') as formatted_date\n", 119 | "FROM \n", 120 | " FilteredPatients fp\n", 121 | "JOIN (\n", 122 | " SELECT \n", 123 | " o.eid, \n", 124 | " o.condition_concept_id as concept_id, \n", 125 | " TO_DATE(o.condition_start_date, 'dd/MM/yyyy') as record_date\n", 126 | " FROM \n", 127 | " omop_condition_occurrence o\n", 128 | " UNION ALL\n", 129 | " SELECT \n", 130 | " o.eid, \n", 131 | " o.procedure_concept_id as concept_id, \n", 132 | " TO_DATE(o.procedure_date, 'dd/MM/yyyy') as record_date\n", 133 | " FROM \n", 134 | " omop_procedure_occurrence o\n", 135 | " UNION ALL\n", 136 | " SELECT \n", 137 | " o.eid, \n", 138 | " o.drug_concept_id as concept_id, \n", 139 | " TO_DATE(o.drug_exposure_start_date, 'dd/MM/yyyy') as record_date\n", 140 | " FROM \n", 141 | " omop_drug_exposure o\n", 142 | " UNION ALL\n", 143 | " SELECT \n", 144 | " o.eid, \n", 145 | " 
o.observation_concept_id as concept_id, \n", 146 | " TO_DATE(o.observation_date, 'dd/MM/yyyy') as record_date\n", 147 | " FROM \n", 148 | " omop_observation o\n", 149 | " UNION ALL\n", 150 | " SELECT \n", 151 | " o.eid, \n", 152 | " o.measurement_concept_id as concept_id, \n", 153 | " TO_DATE(o.measurement_date, 'dd/MM/yyyy') as record_date\n", 154 | " FROM \n", 155 | " omop_measurement o\n", 156 | ") c ON fp.eid = c.eid AND c.record_date <= fp.earliest_cond_date\n", 157 | "\n", 158 | "\"\"\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "tags": [] 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "%%time\n", 170 | "combined_query_results = combined_query.collect()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "tags": [] 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "%%time\n", 182 | "pdf = pd.DataFrame(combined_query_results, columns=[field.name for field in combined_query.schema.fields])" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "tags": [] 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "pdf['eid'].nunique()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "tags": [] 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "%%time\n", 205 | "# Convert 'record_date' to datetime format in Pandas\n", 206 | "pdf['record_date'] = pd.to_datetime(pdf['record_date'], format='%Y-%m-%d')\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "tags": [] 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "%%time\n", 218 | "max_dates=32\n", 219 | "\n", 220 | "# 1. Sort the DataFrame\n", 221 | "pdf = pdf.sort_values(by=['eid', 'record_date'], ascending=[True, False])\n", 222 | "\n", 223 | "# 2. 
Rank within each 'eid' group\n", 224 | "pdf['date_rank'] = pdf.groupby('eid')['record_date'].rank(method='dense', ascending=False)\n", 225 | "\n", 226 | "# 3. Filter based on rank\n", 227 | "filtered_pdf = pdf[pdf['date_rank'] <= max_dates]\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "%%time\n", 239 | "word_vectors = pd.read_csv('/mnt/project/PT_lc_word2vec.csv')\n", 240 | "word_vectors['vector'] = word_vectors['vector'].apply(ast.literal_eval)\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "tags": [] 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "word_vectors['word'] = word_vectors['word'].astype(str)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "tags": [] 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "filtered_pdf = filtered_pdf.merge(word_vectors, how='inner', left_on='concept_id', right_on='word').drop(['word','concept_id'],axis=1)\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "tags": [] 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "filtered_pdf.shape, filtered_pdf['eid'].nunique()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "tags": [] 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "indices = filtered_pdf.index" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "tags": [] 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "%%time\n", 296 | "embeddings_array = np.array(filtered_pdf['vector'].tolist(), dtype=np.float64)\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "tags": [] 304 | }, 305 | "outputs": [], 306 | 
"source": [ 307 | "embeddings_df = pd.DataFrame(embeddings_array, index=indices)\n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "tags": [] 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "%%time\n", 319 | "embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]\n", 320 | "\n", 321 | "# Join the new DataFrame with the original DataFrame\n", 322 | "embedded_codes = filtered_pdf.join(embeddings_df)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "tags": [] 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "%%time\n", 334 | "\n", 335 | "# 4. Define your aggregation expressions\n", 336 | "agg_funcs = {f'embedding_{i}': 'mean' for i in range(400)}\n", 337 | "\n", 338 | "# Apply aggregation with the defined expressions\n", 339 | "patient_day_embeddings_pd = embedded_codes.groupby(['eid', 'record_date']).agg(agg_funcs)\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "tags": [] 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "%%time\n", 351 | "patient_day_embeddings_pd.reset_index().to_csv('./patient_day_embeddings_PT_lc_LARGER.csv', header=True)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "%%bash\n", 363 | "dx upload patient_day_embeddings_PT_lc_LARGER.csv --path /" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": 
"text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.8.5" 391 | } 392 | }, 393 | "nbformat": 4, 394 | "nbformat_minor": 4 395 | } 396 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Samson Mataraso 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Onset of Labor/data/ool_EHR_features.csv: -------------------------------------------------------------------------------- 1 | mom_person_id 2 | 1000000 3 | 1000001 4 | 1000002 5 | 1000003 6 | 1000004 7 | 1000005 8 | 1000006 9 | 1000007 10 | 1000008 11 | 1000009 12 | 1000010 13 | 1000011 14 | 1000012 15 | 1000013 16 | 1000014 17 | 1000015 18 | 1000016 19 | 1000017 20 | 1000018 21 | 1000019 22 | -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_codes_with_obs.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_codes_with_obs.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_lengths_with_obs.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_lengths_with_obs.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/samson920/COMET/cc582803da8c3b0bc2b4375adaae3b48ce67f83f/Onset of Labor/data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 1000000_10000000_T1 3 | 1000000_10000000_T2 4 | 1000000_10000000_T3 5 | 1000001_10000001_T1 6 | 1000001_10000001_T2 7 | 1000001_10000001_T3 8 | 1000002_10000002_T1 9 | 1000002_10000002_T2 10 | 1000002_10000002_T3 11 | 1000003_10000003_T1 12 | 1000003_10000003_T2 13 | 1000003_10000003_T3 14 | 1000004_10000004_T1 15 | 1000004_10000004_T2 16 | 1000004_10000004_T3 17 | 1000005_10000005_T1 18 | 1000005_10000005_T2 19 | 1000005_10000005_T3 20 | 1000006_10000006_T1 21 | 1000006_10000006_T2 22 | 1000006_10000006_T3 23 | 1000007_10000007_T1 24 | 1000007_10000007_T2 25 | 1000007_10000007_T3 26 | 1000008_10000008_T1 27 | 1000008_10000008_T2 28 | 1000008_10000008_T3 29 | 1000009_10000009_T1 30 | 1000009_10000009_T2 31 | 1000009_10000009_T3 32 | 1000010_10000010_T1 33 | 1000010_10000010_T2 34 | 1000010_10000010_T3 35 | 1000011_10000011_T1 36 | 1000011_10000011_T2 37 | 1000011_10000011_T3 38 | 1000012_10000012_T1 39 | 1000012_10000012_T2 40 | 1000012_10000012_T3 41 | 1000013_10000013_T1 42 | 1000013_10000013_T2 43 | 1000013_10000013_T3 44 | 1000014_10000014_T1 45 | 1000014_10000014_T2 46 | 1000014_10000014_T3 47 | 1000015_10000015_T1 48 | 1000015_10000015_T2 49 | 1000015_10000015_T3 50 | 1000016_10000016_T1 51 | 1000016_10000016_T2 52 | 1000016_10000016_T3 53 | 1000017_10000017_T1 54 | 1000017_10000017_T2 55 | 1000017_10000017_T3 56 | 1000018_10000018_T1 57 | 1000018_10000018_T2 58 | 1000018_10000018_T3 59 | 1000019_10000019_T1 60 | 1000019_10000019_T2 61 | 1000019_10000019_T3 62 | 
-------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices_full_cohort_with_obs_fixed.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 0,1000021_10000021,0 3 | 1,1000022_10000022,1 4 | 2,1000023_10000023,2 5 | 3,1000026_10000026,3 6 | 4,1000027_10000027,4 7 | 5,1000031_10000031,5 8 | 6,1000032_10000032,6 9 | 7,1000033_10000033,7 10 | 8,1000035_10000035,8 11 | 9,1000037_10000037,9 12 | 10,1000039_10000039,10 13 | 11,1000045_10000045,11 14 | 12,1000047_10000047,12 15 | 13,1000048_10000048,13 16 | 14,1000050_10000050,14 17 | 15,1000052_10000052,15 18 | 16,1000053_10000053,16 19 | 17,1000054_10000054,17 20 | 18,1000055_10000055,18 21 | 19,1000056_10000056,19 22 | 20,1000058_10000058,20 23 | 21,1000059_10000059,21 24 | 22,1000061_10000061,22 25 | 23,1000062_10000062,23 26 | 24,1000066_10000066,24 27 | 25,1000068_10000068,25 28 | 26,1000069_10000069,26 29 | 27,1000070_10000070,27 30 | 28,1000072_10000072,28 31 | 29,1000073_10000073,29 32 | 30,1000074_10000074,30 33 | 31,1000076_10000076,31 34 | 32,1000083_10000083,32 35 | 33,1000085_10000085,33 36 | 34,1000086_10000086,34 37 | 35,1000087_10000087,35 38 | 36,1000094_10000094,36 39 | 37,1000098_10000098,37 40 | 38,1000100_10000100,38 41 | 39,1000103_10000103,39 42 | 40,1000104_10000104,40 43 | 41,1000107_10000107,41 44 | 42,1000108_10000108,42 45 | 43,1000109_10000109,43 46 | 44,1000112_10000112,44 47 | 45,1000116_10000116,45 48 | 46,1000117_10000117,46 49 | 47,1000118_10000118,47 50 | 48,1000119_10000119,48 51 | 49,1000123_10000123,49 52 | 50,1000124_10000124,50 53 | 51,1000125_10000125,51 54 | 52,1000126_10000126,52 55 | 53,1000135_10000135,53 56 | 54,1000139_10000139,54 57 | 55,1000142_10000142,55 58 | 56,1000144_10000144,56 59 | 57,1000148_10000148,57 60 | 58,1000149_10000149,58 61 | 59,1000150_10000150,59 62 | 60,1000152_10000152,60 63 | 61,1000154_10000154,61 64 | 62,1000155_10000155,62 65 | 
63,1000158_10000158,63 66 | 64,1000159_10000159,64 67 | 65,1000162_10000162,65 68 | 66,1000165_10000165,66 69 | 67,1000166_10000166,67 70 | 68,1000168_10000168,68 71 | 69,1000169_10000169,69 72 | 70,1000171_10000171,70 73 | 71,1000172_10000172,71 74 | 72,1000174_10000174,72 75 | 73,1000176_10000176,73 76 | 74,1000178_10000178,74 77 | 75,1000182_10000182,75 78 | 76,1000185_10000185,76 79 | 77,1000187_10000187,77 80 | 78,1000188_10000188,78 81 | 79,1000191_10000191,79 82 | 80,1000192_10000192,80 83 | 81,1000193_10000193,81 84 | 82,1000198_10000198,82 85 | 83,1000202_10000202,83 86 | 84,1000204_10000204,84 87 | 85,1000205_10000205,85 88 | 86,1000207_10000207,86 89 | 87,1000209_10000209,87 90 | 88,1000210_10000210,88 91 | 89,1000212_10000212,89 92 | 90,1000214_10000214,90 93 | 91,1000215_10000215,91 94 | 92,1000217_10000217,92 95 | 93,1000218_10000218,93 96 | 94,1000220_10000220,94 97 | 95,1000221_10000221,95 98 | 96,1000223_10000223,96 99 | 97,1000224_10000224,97 100 | 98,1000225_10000225,98 101 | 99,1000226_10000226,99 102 | 100,1000227_10000227,100 103 | 101,1000229_10000229,101 104 | 102,1000230_10000230,102 105 | 103,1000232_10000232,103 106 | 104,1000234_10000234,104 107 | 105,1000237_10000237,105 108 | 106,1000240_10000240,106 109 | 107,1000241_10000241,107 110 | 108,1000242_10000242,108 111 | 109,1000243_10000243,109 112 | 110,1000245_10000245,110 113 | 111,1000246_10000246,111 114 | 112,1000249_10000249,112 115 | -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices_with_obs.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 0,1000000_10000000_T1,0 3 | 1,1000000_10000000_T2,1 4 | 2,1000000_10000000_T3,2 5 | 3,1000001_10000001_T1,3 6 | 4,1000001_10000001_T2,4 7 | 5,1000001_10000001_T3,5 8 | 6,1000002_10000002_T1,6 9 | 7,1000002_10000002_T2,7 10 | 8,1000002_10000002_T3,8 11 | 9,1000003_10000003_T1,9 12 | 10,1000003_10000003_T2,10 13 | 
11,1000003_10000003_T3,11 14 | 12,1000004_10000004_T1,12 15 | 13,1000004_10000004_T2,13 16 | 14,1000004_10000004_T3,14 17 | 15,1000005_10000005_T1,15 18 | 16,1000005_10000005_T2,16 19 | 17,1000005_10000005_T3,17 20 | 18,1000006_10000006_T1,18 21 | 19,1000006_10000006_T2,19 22 | 20,1000006_10000006_T3,20 23 | 21,1000007_10000007_T1,21 24 | 22,1000007_10000007_T2,22 25 | 23,1000007_10000007_T3,23 26 | 24,1000008_10000008_T1,24 27 | 25,1000008_10000008_T2,25 28 | 26,1000008_10000008_T3,26 29 | 27,1000011_10000011_T1,27 30 | 28,1000011_10000011_T2,28 31 | 29,1000011_10000011_T3,29 32 | 30,1000013_10000013_T1,30 33 | 31,1000013_10000013_T2,31 34 | 32,1000013_10000013_T3,32 35 | 33,1000015_10000015_T1,33 36 | 34,1000015_10000015_T2,34 37 | 35,1000015_10000015_T3,35 38 | 36,1000017_10000017_T1,36 39 | 37,1000017_10000017_T2,37 40 | 38,1000017_10000017_T3,38 41 | 39,1000018_10000018_T1,39 42 | 40,1000018_10000018_T2,40 43 | 41,1000018_10000018_T3,41 44 | -------------------------------------------------------------------------------- /Onset of Labor/data/processed_data/sampleID_indices_with_obs_word2vec_from_ool.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 0,1000000_10000000_T1,0 3 | 1,1000000_10000000_T2,1 4 | 2,1000000_10000000_T3,2 5 | 3,1000001_10000001_T1,3 6 | 4,1000001_10000001_T2,4 7 | 5,1000001_10000001_T3,5 8 | 6,1000002_10000002_T1,6 9 | 7,1000002_10000002_T2,7 10 | 8,1000002_10000002_T3,8 11 | 9,1000003_10000003_T1,9 12 | 10,1000003_10000003_T2,10 13 | 11,1000003_10000003_T3,11 14 | 12,1000004_10000004_T1,12 15 | 13,1000004_10000004_T2,13 16 | 14,1000004_10000004_T3,14 17 | 15,1000005_10000005_T1,15 18 | 16,1000005_10000005_T2,16 19 | 17,1000005_10000005_T3,17 20 | 18,1000006_10000006_T1,18 21 | 19,1000006_10000006_T2,19 22 | 20,1000006_10000006_T3,20 23 | 21,1000007_10000007_T1,21 24 | 22,1000007_10000007_T2,22 25 | 23,1000007_10000007_T3,23 26 | 24,1000008_10000008_T1,24 27 | 25,1000008_10000008_T2,25 28 | 
26,1000008_10000008_T3,26 29 | 27,1000011_10000011_T1,27 30 | 28,1000011_10000011_T2,28 31 | 29,1000011_10000011_T3,29 32 | 30,1000013_10000013_T1,30 33 | 31,1000013_10000013_T2,31 34 | 32,1000013_10000013_T3,32 35 | 33,1000015_10000015_T1,33 36 | 34,1000015_10000015_T2,34 37 | 35,1000015_10000015_T3,35 38 | 36,1000017_10000017_T1,36 39 | 37,1000017_10000017_T2,37 40 | 38,1000017_10000017_T3,38 41 | 39,1000018_10000018_T1,39 42 | 40,1000018_10000018_T2,40 43 | 41,1000018_10000018_T3,41 44 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_conditions.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,condition_concept_id,condition_start_DATETIME,child_birth_date 2 | 1000000,10000000,10,2024-04-12 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000000,10000000,3,2023-11-25 19:23:34.042570,2024-05-30 19:23:34.042570 4 | 1000000,10000000,4,2024-04-06 19:23:34.042570,2024-05-30 19:23:34.042570 5 | 1000000,10000000,5,2024-01-21 19:23:34.042570,2024-05-30 19:23:34.042570 6 | 1000001,10000001,5,2023-08-02 19:23:34.042644,2023-12-14 19:23:34.042644 7 | 1000001,10000001,2,2023-07-01 19:23:34.042644,2023-12-14 19:23:34.042644 8 | 1000001,10000001,2,2023-09-17 19:23:34.042644,2023-12-14 19:23:34.042644 9 | 1000002,10000002,10,2023-09-05 19:23:34.042656,2024-05-26 19:23:34.042656 10 | 1000002,10000002,2,2024-04-04 19:23:34.042656,2024-05-26 19:23:34.042656 11 | 1000003,10000003,7,2024-03-31 19:23:34.042666,2024-06-30 19:23:34.042666 12 | 1000003,10000003,3,2023-10-11 19:23:34.042666,2024-06-30 19:23:34.042666 13 | 1000003,10000003,1,2023-12-08 19:23:34.042666,2024-06-30 19:23:34.042666 14 | 1000003,10000003,8,2024-05-12 19:23:34.042666,2024-06-30 19:23:34.042666 15 | 1000004,10000004,6,2024-03-04 19:23:34.042674,2024-03-05 19:23:34.042674 16 | 1000004,10000004,10,2024-01-12 19:23:34.042674,2024-03-05 19:23:34.042674 17 | 
1000004,10000004,6,2023-06-20 19:23:34.042674,2024-03-05 19:23:34.042674 18 | 1000004,10000004,2,2023-08-28 19:23:34.042674,2024-03-05 19:23:34.042674 19 | 1000005,10000005,2,2024-07-08 19:23:34.042682,2024-08-20 19:23:34.042682 20 | 1000005,10000005,4,2024-02-01 19:23:34.042682,2024-08-20 19:23:34.042682 21 | 1000006,10000006,8,2023-09-03 19:23:34.042690,2024-05-30 19:23:34.042690 22 | 1000006,10000006,9,2023-10-29 19:23:34.042690,2024-05-30 19:23:34.042690 23 | 1000007,10000007,2,2023-10-12 19:23:34.042698,2024-05-11 19:23:34.042698 24 | 1000007,10000007,8,2024-03-20 19:23:34.042698,2024-05-11 19:23:34.042698 25 | 1000008,10000008,9,2023-07-07 19:23:34.042706,2024-02-08 19:23:34.042706 26 | 1000008,10000008,1,2023-09-05 19:23:34.042706,2024-02-08 19:23:34.042706 27 | 1000009,10000009,8,2024-06-06 19:23:34.042713,2024-06-14 19:23:34.042713 28 | 1000010,10000010,8,2024-01-19 19:23:34.042720,2024-06-02 19:23:34.042720 29 | 1000011,10000011,3,2023-11-25 19:23:34.042728,2024-04-11 19:23:34.042728 30 | 1000011,10000011,8,2024-01-22 19:23:34.042728,2024-04-11 19:23:34.042728 31 | 1000011,10000011,3,2023-11-01 19:23:34.042728,2024-04-11 19:23:34.042728 32 | 1000012,10000012,10,2023-08-16 19:23:34.042736,2024-05-02 19:23:34.042736 33 | 1000013,10000013,7,2024-03-04 19:23:34.042743,2024-04-13 19:23:34.042743 34 | 1000013,10000013,8,2023-09-26 19:23:34.042743,2024-04-13 19:23:34.042743 35 | 1000013,10000013,2,2023-07-21 19:23:34.042743,2024-04-13 19:23:34.042743 36 | 1000014,10000014,7,2023-11-10 19:23:34.042750,2023-12-27 19:23:34.042750 37 | 1000015,10000015,8,2024-01-01 19:23:34.042758,2024-03-02 19:23:34.042758 38 | 1000015,10000015,8,2023-11-25 19:23:34.042758,2024-03-02 19:23:34.042758 39 | 1000015,10000015,3,2023-08-02 19:23:34.042758,2024-03-02 19:23:34.042758 40 | 1000016,10000016,5,2023-04-26 19:23:34.042765,2023-12-08 19:23:34.042765 41 | 1000017,10000017,1,2023-11-24 19:23:34.042773,2024-04-02 19:23:34.042773 42 | 1000017,10000017,10,2024-03-29 
19:23:34.042773,2024-04-02 19:23:34.042773 43 | 1000017,10000017,7,2023-07-23 19:23:34.042773,2024-04-02 19:23:34.042773 44 | 1000018,10000018,10,2024-01-26 19:23:34.042780,2024-08-19 19:23:34.042780 45 | 1000018,10000018,3,2024-07-09 19:23:34.042780,2024-08-19 19:23:34.042780 46 | 1000018,10000018,7,2024-06-18 19:23:34.042780,2024-08-19 19:23:34.042780 47 | 1000019,10000019,4,2023-11-11 19:23:34.042787,2024-01-01 19:23:34.042787 48 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_drugs.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,drug_concept_id,drug_exposure_start_DATETIME,child_birth_date 2 | 1000000,10000000,4,2023-10-13 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000000,10000000,7,2024-05-18 19:23:34.042570,2024-05-30 19:23:34.042570 4 | 1000001,10000001,6,2023-04-16 19:23:34.042644,2023-12-14 19:23:34.042644 5 | 1000001,10000001,10,2023-10-10 19:23:34.042644,2023-12-14 19:23:34.042644 6 | 1000001,10000001,9,2023-10-31 19:23:34.042644,2023-12-14 19:23:34.042644 7 | 1000002,10000002,4,2024-01-14 19:23:34.042656,2024-05-26 19:23:34.042656 8 | 1000003,10000003,10,2024-02-24 19:23:34.042666,2024-06-30 19:23:34.042666 9 | 1000003,10000003,1,2023-11-13 19:23:34.042666,2024-06-30 19:23:34.042666 10 | 1000004,10000004,4,2023-11-06 19:23:34.042674,2024-03-05 19:23:34.042674 11 | 1000005,10000005,3,2024-01-01 19:23:34.042682,2024-08-20 19:23:34.042682 12 | 1000006,10000006,8,2023-11-15 19:23:34.042690,2024-05-30 19:23:34.042690 13 | 1000006,10000006,5,2024-01-15 19:23:34.042690,2024-05-30 19:23:34.042690 14 | 1000006,10000006,3,2023-10-19 19:23:34.042690,2024-05-30 19:23:34.042690 15 | 1000007,10000007,8,2023-11-22 19:23:34.042698,2024-05-11 19:23:34.042698 16 | 1000007,10000007,8,2023-12-04 19:23:34.042698,2024-05-11 19:23:34.042698 17 | 1000008,10000008,1,2023-08-13 19:23:34.042706,2024-02-08 19:23:34.042706 18 | 
1000009,10000009,7,2024-04-24 19:23:34.042713,2024-06-14 19:23:34.042713 19 | 1000009,10000009,5,2024-02-23 19:23:34.042713,2024-06-14 19:23:34.042713 20 | 1000010,10000010,1,2024-03-14 19:23:34.042720,2024-06-02 19:23:34.042720 21 | 1000011,10000011,6,2023-12-04 19:23:34.042728,2024-04-11 19:23:34.042728 22 | 1000011,10000011,1,2023-09-01 19:23:34.042728,2024-04-11 19:23:34.042728 23 | 1000012,10000012,2,2023-12-29 19:23:34.042736,2024-05-02 19:23:34.042736 24 | 1000013,10000013,10,2023-10-25 19:23:34.042743,2024-04-13 19:23:34.042743 25 | 1000014,10000014,7,2023-06-13 19:23:34.042750,2023-12-27 19:23:34.042750 26 | 1000014,10000014,8,2023-06-08 19:23:34.042750,2023-12-27 19:23:34.042750 27 | 1000014,10000014,1,2023-08-27 19:23:34.042750,2023-12-27 19:23:34.042750 28 | 1000014,10000014,6,2023-04-17 19:23:34.042750,2023-12-27 19:23:34.042750 29 | 1000015,10000015,6,2023-11-26 19:23:34.042758,2024-03-02 19:23:34.042758 30 | 1000015,10000015,1,2023-07-07 19:23:34.042758,2024-03-02 19:23:34.042758 31 | 1000015,10000015,6,2023-08-15 19:23:34.042758,2024-03-02 19:23:34.042758 32 | 1000015,10000015,4,2023-06-18 19:23:34.042758,2024-03-02 19:23:34.042758 33 | 1000016,10000016,4,2023-07-15 19:23:34.042765,2023-12-08 19:23:34.042765 34 | 1000016,10000016,4,2023-05-24 19:23:34.042765,2023-12-08 19:23:34.042765 35 | 1000016,10000016,7,2023-08-03 19:23:34.042765,2023-12-08 19:23:34.042765 36 | 1000016,10000016,9,2023-08-02 19:23:34.042765,2023-12-08 19:23:34.042765 37 | 1000017,10000017,10,2023-11-04 19:23:34.042773,2024-04-02 19:23:34.042773 38 | 1000018,10000018,10,2023-12-01 19:23:34.042780,2024-08-19 19:23:34.042780 39 | 1000018,10000018,9,2024-04-30 19:23:34.042780,2024-08-19 19:23:34.042780 40 | 1000018,10000018,1,2024-06-21 19:23:34.042780,2024-08-19 19:23:34.042780 41 | 1000019,10000019,5,2023-04-23 19:23:34.042787,2024-01-01 19:23:34.042787 42 | 1000019,10000019,7,2023-11-26 19:23:34.042787,2024-01-01 19:23:34.042787 43 | 
-------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_measurements.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,measurement_concept_id,measurement_DATETIME,child_birth_date,value_as_number 2 | 1000000,10000000,9,2024-02-21 19:23:34.042570,2024-05-30 19:23:34.042570,18.81211597237613 3 | 1000001,10000001,9,2023-09-30 19:23:34.042644,2023-12-14 19:23:34.042644,46.36984049399822 4 | 1000001,10000001,4,2023-05-23 19:23:34.042644,2023-12-14 19:23:34.042644,35.33522280260528 5 | 1000001,10000001,5,2023-07-24 19:23:34.042644,2023-12-14 19:23:34.042644,58.365611185087204 6 | 1000002,10000002,9,2024-02-18 19:23:34.042656,2024-05-26 19:23:34.042656,7.773463696498483 7 | 1000003,10000003,7,2024-05-09 19:23:34.042666,2024-06-30 19:23:34.042666,97.43948076661665 8 | 1000003,10000003,9,2024-02-08 19:23:34.042666,2024-06-30 19:23:34.042666,98.62107444796028 9 | 1000003,10000003,5,2024-05-23 19:23:34.042666,2024-06-30 19:23:34.042666,69.81617140197451 10 | 1000003,10000003,10,2024-05-20 19:23:34.042666,2024-06-30 19:23:34.042666,53.60963663441204 11 | 1000004,10000004,10,2023-07-27 19:23:34.042674,2024-03-05 19:23:34.042674,30.952761628632775 12 | 1000004,10000004,3,2023-10-25 19:23:34.042674,2024-03-05 19:23:34.042674,81.37950197069486 13 | 1000004,10000004,2,2023-08-04 19:23:34.042674,2024-03-05 19:23:34.042674,68.47311725538793 14 | 1000005,10000005,9,2024-06-06 19:23:34.042682,2024-08-20 19:23:34.042682,16.26169393448913 15 | 1000005,10000005,10,2024-06-08 19:23:34.042682,2024-08-20 19:23:34.042682,91.09271844938425 16 | 1000005,10000005,6,2023-12-12 19:23:34.042682,2024-08-20 19:23:34.042682,82.2537242923169 17 | 1000006,10000006,8,2023-12-09 19:23:34.042690,2024-05-30 19:23:34.042690,94.9799913291924 18 | 1000006,10000006,9,2023-12-13 19:23:34.042690,2024-05-30 19:23:34.042690,72.571950838836 19 | 1000006,10000006,2,2024-01-30 
19:23:34.042690,2024-05-30 19:23:34.042690,61.34151959357899 20 | 1000007,10000007,5,2024-04-13 19:23:34.042698,2024-05-11 19:23:34.042698,41.82430362906189 21 | 1000008,10000008,1,2023-09-26 19:23:34.042706,2024-02-08 19:23:34.042706,93.27284833540132 22 | 1000008,10000008,7,2023-09-17 19:23:34.042706,2024-02-08 19:23:34.042706,86.60638895004084 23 | 1000009,10000009,4,2024-02-04 19:23:34.042713,2024-06-14 19:23:34.042713,4.521867010618941 24 | 1000010,10000010,3,2023-09-25 19:23:34.042720,2024-06-02 19:23:34.042720,2.6366974497252005 25 | 1000010,10000010,7,2023-12-04 19:23:34.042720,2024-06-02 19:23:34.042720,37.64633668780496 26 | 1000011,10000011,10,2023-09-23 19:23:34.042728,2024-04-11 19:23:34.042728,81.0553330781833 27 | 1000011,10000011,7,2024-03-24 19:23:34.042728,2024-04-11 19:23:34.042728,98.72761293149445 28 | 1000011,10000011,3,2023-12-08 19:23:34.042728,2024-04-11 19:23:34.042728,15.041689110352818 29 | 1000012,10000012,2,2023-12-14 19:23:34.042736,2024-05-02 19:23:34.042736,59.41307153521351 30 | 1000013,10000013,10,2023-12-10 19:23:34.042743,2024-04-13 19:23:34.042743,38.08908566310215 31 | 1000013,10000013,8,2023-11-18 19:23:34.042743,2024-04-13 19:23:34.042743,96.99143978146031 32 | 1000013,10000013,3,2024-04-13 19:23:34.042743,2024-04-13 19:23:34.042743,84.21189231357087 33 | 1000013,10000013,9,2023-10-07 19:23:34.042743,2024-04-13 19:23:34.042743,83.83287047111378 34 | 1000014,10000014,6,2023-09-02 19:23:34.042750,2023-12-27 19:23:34.042750,46.86931597949703 35 | 1000014,10000014,5,2023-10-31 19:23:34.042750,2023-12-27 19:23:34.042750,41.48195023376652 36 | 1000014,10000014,6,2023-10-09 19:23:34.042750,2023-12-27 19:23:34.042750,27.340707193070624 37 | 1000015,10000015,5,2023-08-13 19:23:34.042758,2024-03-02 19:23:34.042758,5.6375496650927115 38 | 1000016,10000016,3,2023-04-26 19:23:34.042765,2023-12-08 19:23:34.042765,86.47223762550531 39 | 1000016,10000016,9,2023-11-19 19:23:34.042765,2023-12-08 19:23:34.042765,81.29010091300776 40 | 
1000016,10000016,9,2023-10-23 19:23:34.042765,2023-12-08 19:23:34.042765,99.97176732861305 41 | 1000016,10000016,1,2023-11-25 19:23:34.042765,2023-12-08 19:23:34.042765,99.66368370739053 42 | 1000017,10000017,6,2024-02-09 19:23:34.042773,2024-04-02 19:23:34.042773,55.543170560262745 43 | 1000018,10000018,7,2024-03-29 19:23:34.042780,2024-08-19 19:23:34.042780,76.89874151805105 44 | 1000018,10000018,10,2024-08-08 19:23:34.042780,2024-08-19 19:23:34.042780,94.47657298824281 45 | 1000018,10000018,8,2024-01-09 19:23:34.042780,2024-08-19 19:23:34.042780,84.96473906774115 46 | 1000019,10000019,5,2023-07-30 19:23:34.042787,2024-01-01 19:23:34.042787,24.734810174319765 47 | 1000019,10000019,10,2023-12-25 19:23:34.042787,2024-01-01 19:23:34.042787,45.05441353100935 48 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_observations.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,observation_concept_id,observation_DATETIME,child_birth_date 2 | 1000000,10000000,3,2024-02-01 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000001,10000001,7,2023-10-18 19:23:34.042644,2023-12-14 19:23:34.042644 4 | 1000001,10000001,10,2023-04-30 19:23:34.042644,2023-12-14 19:23:34.042644 5 | 1000001,10000001,7,2023-08-20 19:23:34.042644,2023-12-14 19:23:34.042644 6 | 1000001,10000001,9,2023-08-10 19:23:34.042644,2023-12-14 19:23:34.042644 7 | 1000002,10000002,10,2024-01-19 19:23:34.042656,2024-05-26 19:23:34.042656 8 | 1000002,10000002,1,2024-01-26 19:23:34.042656,2024-05-26 19:23:34.042656 9 | 1000002,10000002,2,2023-10-01 19:23:34.042656,2024-05-26 19:23:34.042656 10 | 1000002,10000002,6,2024-02-21 19:23:34.042656,2024-05-26 19:23:34.042656 11 | 1000003,10000003,8,2024-04-03 19:23:34.042666,2024-06-30 19:23:34.042666 12 | 1000003,10000003,5,2023-11-07 19:23:34.042666,2024-06-30 19:23:34.042666 13 | 1000003,10000003,7,2023-10-03 
19:23:34.042666,2024-06-30 19:23:34.042666 14 | 1000003,10000003,5,2023-12-22 19:23:34.042666,2024-06-30 19:23:34.042666 15 | 1000004,10000004,3,2023-07-03 19:23:34.042674,2024-03-05 19:23:34.042674 16 | 1000004,10000004,10,2023-12-21 19:23:34.042674,2024-03-05 19:23:34.042674 17 | 1000005,10000005,9,2024-05-27 19:23:34.042682,2024-08-20 19:23:34.042682 18 | 1000005,10000005,5,2024-01-14 19:23:34.042682,2024-08-20 19:23:34.042682 19 | 1000005,10000005,1,2024-07-05 19:23:34.042682,2024-08-20 19:23:34.042682 20 | 1000005,10000005,4,2024-05-19 19:23:34.042682,2024-08-20 19:23:34.042682 21 | 1000006,10000006,10,2023-11-09 19:23:34.042690,2024-05-30 19:23:34.042690 22 | 1000007,10000007,4,2024-04-03 19:23:34.042698,2024-05-11 19:23:34.042698 23 | 1000007,10000007,5,2023-09-01 19:23:34.042698,2024-05-11 19:23:34.042698 24 | 1000008,10000008,10,2023-06-04 19:23:34.042706,2024-02-08 19:23:34.042706 25 | 1000008,10000008,5,2023-06-08 19:23:34.042706,2024-02-08 19:23:34.042706 26 | 1000008,10000008,2,2023-11-01 19:23:34.042706,2024-02-08 19:23:34.042706 27 | 1000009,10000009,10,2023-11-05 19:23:34.042713,2024-06-14 19:23:34.042713 28 | 1000009,10000009,3,2023-10-09 19:23:34.042713,2024-06-14 19:23:34.042713 29 | 1000009,10000009,1,2024-03-11 19:23:34.042713,2024-06-14 19:23:34.042713 30 | 1000009,10000009,8,2024-06-11 19:23:34.042713,2024-06-14 19:23:34.042713 31 | 1000010,10000010,2,2023-09-30 19:23:34.042720,2024-06-02 19:23:34.042720 32 | 1000010,10000010,4,2023-12-26 19:23:34.042720,2024-06-02 19:23:34.042720 33 | 1000010,10000010,2,2024-01-03 19:23:34.042720,2024-06-02 19:23:34.042720 34 | 1000010,10000010,1,2023-12-23 19:23:34.042720,2024-06-02 19:23:34.042720 35 | 1000011,10000011,5,2023-10-16 19:23:34.042728,2024-04-11 19:23:34.042728 36 | 1000012,10000012,1,2024-02-28 19:23:34.042736,2024-05-02 19:23:34.042736 37 | 1000012,10000012,10,2023-11-17 19:23:34.042736,2024-05-02 19:23:34.042736 38 | 1000012,10000012,2,2024-03-21 19:23:34.042736,2024-05-02 19:23:34.042736 
39 | 1000012,10000012,3,2023-08-24 19:23:34.042736,2024-05-02 19:23:34.042736 40 | 1000013,10000013,7,2024-03-10 19:23:34.042743,2024-04-13 19:23:34.042743 41 | 1000013,10000013,8,2024-01-15 19:23:34.042743,2024-04-13 19:23:34.042743 42 | 1000014,10000014,2,2023-09-29 19:23:34.042750,2023-12-27 19:23:34.042750 43 | 1000015,10000015,7,2023-11-19 19:23:34.042758,2024-03-02 19:23:34.042758 44 | 1000015,10000015,10,2023-08-20 19:23:34.042758,2024-03-02 19:23:34.042758 45 | 1000015,10000015,8,2023-11-10 19:23:34.042758,2024-03-02 19:23:34.042758 46 | 1000016,10000016,5,2023-05-06 19:23:34.042765,2023-12-08 19:23:34.042765 47 | 1000016,10000016,4,2023-04-04 19:23:34.042765,2023-12-08 19:23:34.042765 48 | 1000016,10000016,3,2023-05-31 19:23:34.042765,2023-12-08 19:23:34.042765 49 | 1000016,10000016,3,2023-06-15 19:23:34.042765,2023-12-08 19:23:34.042765 50 | 1000017,10000017,2,2024-02-27 19:23:34.042773,2024-04-02 19:23:34.042773 51 | 1000017,10000017,8,2023-12-29 19:23:34.042773,2024-04-02 19:23:34.042773 52 | 1000017,10000017,5,2023-11-04 19:23:34.042773,2024-04-02 19:23:34.042773 53 | 1000017,10000017,1,2023-07-10 19:23:34.042773,2024-04-02 19:23:34.042773 54 | 1000018,10000018,7,2024-08-07 19:23:34.042780,2024-08-19 19:23:34.042780 55 | 1000018,10000018,3,2024-01-16 19:23:34.042780,2024-08-19 19:23:34.042780 56 | 1000018,10000018,1,2023-11-21 19:23:34.042780,2024-08-19 19:23:34.042780 57 | 1000019,10000019,9,2023-10-10 19:23:34.042787,2024-01-01 19:23:34.042787 58 | 1000019,10000019,3,2023-06-29 19:23:34.042787,2024-01-01 19:23:34.042787 59 | -------------------------------------------------------------------------------- /Onset of Labor/data/raw_data/EHR/EHR_cohort_procedures.csv: -------------------------------------------------------------------------------- 1 | mom_person_id,child_person_id,procedure_concept_id,procedure_DATETIME,child_birth_date 2 | 1000000,10000000,3,2023-10-11 19:23:34.042570,2024-05-30 19:23:34.042570 3 | 1000001,10000001,3,2023-05-21 
19:23:34.042644,2023-12-14 19:23:34.042644 4 | 1000001,10000001,6,2023-07-16 19:23:34.042644,2023-12-14 19:23:34.042644 5 | 1000001,10000001,1,2023-08-17 19:23:34.042644,2023-12-14 19:23:34.042644 6 | 1000002,10000002,1,2024-02-01 19:23:34.042656,2024-05-26 19:23:34.042656 7 | 1000002,10000002,4,2024-02-13 19:23:34.042656,2024-05-26 19:23:34.042656 8 | 1000002,10000002,3,2023-09-16 19:23:34.042656,2024-05-26 19:23:34.042656 9 | 1000002,10000002,9,2024-02-05 19:23:34.042656,2024-05-26 19:23:34.042656 10 | 1000003,10000003,2,2024-01-30 19:23:34.042666,2024-06-30 19:23:34.042666 11 | 1000003,10000003,6,2023-12-20 19:23:34.042666,2024-06-30 19:23:34.042666 12 | 1000003,10000003,9,2024-01-20 19:23:34.042666,2024-06-30 19:23:34.042666 13 | 1000004,10000004,4,2023-09-27 19:23:34.042674,2024-03-05 19:23:34.042674 14 | 1000004,10000004,4,2023-06-03 19:23:34.042674,2024-03-05 19:23:34.042674 15 | 1000004,10000004,3,2023-10-23 19:23:34.042674,2024-03-05 19:23:34.042674 16 | 1000004,10000004,1,2023-10-30 19:23:34.042674,2024-03-05 19:23:34.042674 17 | 1000005,10000005,6,2024-07-30 19:23:34.042682,2024-08-20 19:23:34.042682 18 | 1000005,10000005,3,2024-01-04 19:23:34.042682,2024-08-20 19:23:34.042682 19 | 1000005,10000005,2,2023-12-01 19:23:34.042682,2024-08-20 19:23:34.042682 20 | 1000005,10000005,1,2023-11-18 19:23:34.042682,2024-08-20 19:23:34.042682 21 | 1000006,10000006,4,2024-01-05 19:23:34.042690,2024-05-30 19:23:34.042690 22 | 1000006,10000006,1,2024-04-26 19:23:34.042690,2024-05-30 19:23:34.042690 23 | 1000006,10000006,5,2024-05-14 19:23:34.042690,2024-05-30 19:23:34.042690 24 | 1000006,10000006,3,2024-05-25 19:23:34.042690,2024-05-30 19:23:34.042690 25 | 1000007,10000007,5,2023-09-22 19:23:34.042698,2024-05-11 19:23:34.042698 26 | 1000007,10000007,1,2023-08-06 19:23:34.042698,2024-05-11 19:23:34.042698 27 | 1000007,10000007,3,2023-11-13 19:23:34.042698,2024-05-11 19:23:34.042698 28 | 1000007,10000007,1,2023-11-06 19:23:34.042698,2024-05-11 19:23:34.042698 29 | 
1000008,10000008,5,2023-08-02 19:23:34.042706,2024-02-08 19:23:34.042706 30 | 1000008,10000008,3,2023-09-01 19:23:34.042706,2024-02-08 19:23:34.042706 31 | 1000008,10000008,2,2023-10-04 19:23:34.042706,2024-02-08 19:23:34.042706 32 | 1000008,10000008,6,2023-07-01 19:23:34.042706,2024-02-08 19:23:34.042706 33 | 1000009,10000009,8,2023-12-24 19:23:34.042713,2024-06-14 19:23:34.042713 34 | 1000009,10000009,6,2024-02-22 19:23:34.042713,2024-06-14 19:23:34.042713 35 | 1000010,10000010,10,2024-01-28 19:23:34.042720,2024-06-02 19:23:34.042720 36 | 1000010,10000010,8,2023-09-04 19:23:34.042720,2024-06-02 19:23:34.042720 37 | 1000010,10000010,7,2023-10-01 19:23:34.042720,2024-06-02 19:23:34.042720 38 | 1000011,10000011,7,2023-10-25 19:23:34.042728,2024-04-11 19:23:34.042728 39 | 1000012,10000012,10,2024-04-20 19:23:34.042736,2024-05-02 19:23:34.042736 40 | 1000012,10000012,4,2024-03-28 19:23:34.042736,2024-05-02 19:23:34.042736 41 | 1000013,10000013,3,2023-07-25 19:23:34.042743,2024-04-13 19:23:34.042743 42 | 1000014,10000014,2,2023-08-07 19:23:34.042750,2023-12-27 19:23:34.042750 43 | 1000014,10000014,6,2023-05-27 19:23:34.042750,2023-12-27 19:23:34.042750 44 | 1000014,10000014,9,2023-11-07 19:23:34.042750,2023-12-27 19:23:34.042750 45 | 1000015,10000015,6,2023-12-31 19:23:34.042758,2024-03-02 19:23:34.042758 46 | 1000015,10000015,6,2024-01-05 19:23:34.042758,2024-03-02 19:23:34.042758 47 | 1000016,10000016,10,2023-06-12 19:23:34.042765,2023-12-08 19:23:34.042765 48 | 1000017,10000017,6,2023-07-07 19:23:34.042773,2024-04-02 19:23:34.042773 49 | 1000017,10000017,1,2023-12-24 19:23:34.042773,2024-04-02 19:23:34.042773 50 | 1000018,10000018,5,2024-06-28 19:23:34.042780,2024-08-19 19:23:34.042780 51 | 1000018,10000018,4,2024-05-09 19:23:34.042780,2024-08-19 19:23:34.042780 52 | 1000018,10000018,4,2023-12-02 19:23:34.042780,2024-08-19 19:23:34.042780 53 | 1000018,10000018,3,2024-07-04 19:23:34.042780,2024-08-19 19:23:34.042780 54 | 1000019,10000019,2,2023-08-11 
19:23:34.042787,2024-01-01 19:23:34.042787 55 | 1000019,10000019,10,2023-08-31 19:23:34.042787,2024-01-01 19:23:34.042787 56 | 1000019,10000019,3,2023-07-28 19:23:34.042787,2024-01-01 19:23:34.042787 57 | -------------------------------------------------------------------------------- /Onset of Labor/process_EHR_data_full_PT_cohort.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "87e061dd", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm.notebook import tqdm\n", 13 | "from scipy.stats import pearsonr\n", 14 | "import gensim\n", 15 | "from gensim.models import Word2Vec\n", 16 | "from gensim.models.callbacks import CallbackAny2Vec\n", 17 | "import random\n", 18 | "import pickle\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import dask\n", 22 | "import dask.dataframe as dd\n", 23 | "import os" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "9faa9869", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "directories = [\n", 34 | " 'models',\n", 35 | " 'models/hyperparameters',\n", 36 | " 'models/predictive_models',\n", 37 | " 'results'\n", 38 | "]\n", 39 | "\n", 40 | "# Create each directory\n", 41 | "for directory in directories:\n", 42 | " os.makedirs(directory, exist_ok=True)\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "id": "e9d808da", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "CPU times: user 19.3 ms, sys: 4.52 ms, total: 23.8 ms\n", 56 | "Wall time: 22.1 ms\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "%%time\n", 62 | "#load data\n", 63 | "#the person_id, concept_id, and date columns are extracted from OMOP tables for people who delivered babies at 
Stanford\n", 64 | "conds = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_conditions.csv')\n", 65 | "conds = conds[conds['condition_concept_id'] != 0]\n", 66 | "drugs = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_drugs.csv')\n", 67 | "drugs = drugs[drugs['drug_concept_id'] != 0]\n", 68 | "procs = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_procedures.csv')\n", 69 | "procs = procs[procs['procedure_concept_id'] != 0]\n", 70 | "obs = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_observations.csv')\n", 71 | "obs = obs[obs['observation_concept_id'] != 0]\n", 72 | "\n", 73 | "conds['condition_start_DATETIME'] = pd.to_datetime(conds['condition_start_DATETIME'])\n", 74 | "procs['procedure_DATETIME'] = pd.to_datetime(procs['procedure_DATETIME'])\n", 75 | "drugs['drug_exposure_start_DATETIME'] = pd.to_datetime(drugs['drug_exposure_start_DATETIME'])\n", 76 | "obs['observation_DATETIME'] = pd.to_datetime(obs['observation_DATETIME'])\n", 77 | "\n", 78 | "conds['child_birth_date'] = pd.to_datetime(conds['child_birth_date'])\n", 79 | "procs['child_birth_date'] = pd.to_datetime(procs['child_birth_date'])\n", 80 | "drugs['child_birth_date'] = pd.to_datetime(drugs['child_birth_date'])\n", 81 | "obs['child_birth_date'] = pd.to_datetime(obs['child_birth_date'])" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "id": "48776334", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "CPU times: user 8.55 ms, sys: 0 ns, total: 8.55 ms\n", 95 | "Wall time: 6.78 ms\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "%%time\n", 101 | "measurements = pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_measurements.csv')\n", 102 | "measurements = measurements[~pd.isnull(measurements['value_as_number'])]\n", 103 | "measurements = measurements[measurements['measurement_concept_id'] != 0]\n", 104 | "measurements['measurement_DATETIME'] = 
pd.to_datetime(measurements['measurement_DATETIME'])\n", 105 | "measurements['child_birth_date'] = pd.to_datetime(measurements['child_birth_date'])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "id": "3cbd4572", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "#identify ppl who have omics data so they can be EXCLUDED from the pre-training cohort\n", 116 | "OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv')['mom_person_id'].values" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 5, 122 | "id": "348331a6", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "#remove EHR data from ppl who are in the omics cohort\n", 127 | "conds = conds[~conds['mom_person_id'].isin(OOL_cohort_omop)]\n", 128 | "procs = procs[~procs['mom_person_id'].isin(OOL_cohort_omop)]\n", 129 | "drugs = drugs[~drugs['mom_person_id'].isin(OOL_cohort_omop)]\n", 130 | "measurements = measurements[~measurements['mom_person_id'].isin(OOL_cohort_omop)]\n", 131 | "obs = obs[~obs['mom_person_id'].isin(OOL_cohort_omop)]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "id": "d4826244", 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "230" 144 | ] 145 | }, 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "conds['mom_person_id'].nunique()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "id": "a1cf6ad5", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "\n", 163 | "def filter_df(df, birth_time, time_col, time_range_days=280):\n", 164 | " \"\"\"\n", 165 | " A function to remove entries in a dataframe prior to time of birth. \n", 166 | " \n", 167 | " df: The dataframe to filter. 
Must contain a column called person_id with the OMOP ID of the mother\n", 168 | " birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME\n", 169 | " time_col: The index of the column with the date of the event in df\n", 170 | " time_range_days: keeps data from delivery up to time_range_days prior \n", 171 | " \n", 172 | " \"\"\"\n", 173 | " print('There were {} patients before filtering.'.format(len(df['person_id'].unique())))\n", 174 | " df = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')\n", 175 | " df['diff'] = df['birth_DATETIME']-df[time_col]\n", 176 | " new_df = df[(df['diff'].dt.days > 0) & (df['diff'].dt.days <= time_range_days)].drop('maternal_OMOP', axis=1)\n", 177 | " print('There were {} patients after filtering.'.format(len(new_df['person_id'].unique())))\n", 178 | " return new_df\n", 179 | "\n", 180 | "#function to help with appropriate sample / patient labeling \n", 181 | "def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):\n", 182 | " df = proteomics[['DOS','mom_person_id','child_person_id']].merge(input_df, how='left', on=['mom_person_id','child_person_id'])\n", 183 | " df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 184 | " df = df[df['delta'] < df['DOS']]\n", 185 | " df['sample_ID'] = df['mom_person_id'].astype(str)+'_'+df['child_person_id'].astype(str)\n", 186 | " return df\n", 187 | " " 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "id": "9def4745", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "#pick a random date during pregnancy for the women so we can create an artificial sampling time\n", 198 | "#we will use EHR data from beginning of pregnancy up until this sampling time for features\n", 199 | "#and number of days from this sampling time to birth as the \"time to onset of labor\" pre-training problem\n", 200 | "time_col_name = 
'condition_start_DATETIME'\n", 201 | "df = conds\n", 202 | "df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 203 | "min_ool = df[['mom_person_id','child_person_id','delta']].groupby(['mom_person_id','child_person_id']).min()\n", 204 | "min_ool.columns = ['min_delta']\n", 205 | "max_ool = df[['mom_person_id','child_person_id','delta']].groupby(['mom_person_id','child_person_id']).max()\n", 206 | "max_ool.columns = ['max_delta']\n", 207 | "sampling_df = pd.concat([min_ool, max_ool],axis=1)\n", 208 | "\n", 209 | "np.random.seed(3)\n", 210 | "sampling_df = sampling_df[((sampling_df['max_delta'] - sampling_df['min_delta']) >= 7) == True]\n", 211 | "sampling_df['DOS'] = ((sampling_df['max_delta'] - sampling_df['min_delta'] - 7) * np.random.rand(sampling_df.shape[0]) + sampling_df['min_delta']).astype(int)\n", 212 | "sampling_df = sampling_df[sampling_df['max_delta'] > -100]\n", 213 | "#sample from last 100 days of pregnancy to mirror design of omics study\n", 214 | "sampling_df['DOS'] = sampling_df.apply(lambda row: int(np.random.uniform(-100, row['max_delta'])), axis=1)\n", 215 | "sampling_df = sampling_df.reset_index()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 9, 221 | "id": "c260a873", 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "CPU times: user 30 ms, sys: 341 µs, total: 30.3 ms\n", 229 | "Wall time: 28.5 ms\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "%%time\n", 235 | "condition_features_EHR = generate_features_EHR_cohort(sampling_df, conds, 'condition_start_DATETIME','condition_concept_id','C')\n", 236 | "procedure_features_EHR = generate_features_EHR_cohort(sampling_df, procs, 'procedure_DATETIME','procedure_concept_id','P')\n", 237 | "drug_features_EHR = generate_features_EHR_cohort(sampling_df, drugs, 'drug_exposure_start_DATETIME','drug_concept_id','D')\n", 238 | "measurement_features_EHR = 
generate_features_EHR_cohort(sampling_df, measurements, 'measurement_DATETIME','measurement_concept_id','M')\n", 239 | "observation_features_EHR = generate_features_EHR_cohort(sampling_df, obs, 'observation_DATETIME','observation_concept_id','O')\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 10, 245 | "id": "7422c01b", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "class EpochLogger(CallbackAny2Vec):\n", 250 | " def __init__(self):\n", 251 | " self.epoch = 0\n", 252 | " self.losses = []\n", 253 | "\n", 254 | " def on_epoch_begin(self, model):\n", 255 | " print(f\"Starting epoch #{self.epoch}\")\n", 256 | "\n", 257 | " def on_epoch_end(self, model):\n", 258 | " print(f\"Finished epoch #{self.epoch}\")\n", 259 | " loss = model.get_latest_training_loss()\n", 260 | " self.losses.append(loss)\n", 261 | " print(self.losses)\n", 262 | " print(f' Loss: {loss}')\n", 263 | " self.epoch += 1" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "id": "7d6a36dd", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "CPU times: user 530 µs, sys: 3.41 ms, total: 3.94 ms\n", 277 | "Wall time: 2.19 ms\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "%%time\n", 283 | "#learn word2vec embeddings\n", 284 | "try:\n", 285 | " model = Word2Vec.load(\"./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n", 286 | "except:\n", 287 | " print('training new model!')\n", 288 | " epoch_logger = EpochLogger()\n", 289 | "\n", 290 | " word2vec_conds = condition_features_EHR[['sample_ID','condition_concept_id','condition_start_DATETIME']]\n", 291 | " word2vec_conds.columns = ['sample_ID','concept_id','ts']\n", 292 | "\n", 293 | " word2vec_procs = procedure_features_EHR[['sample_ID','procedure_concept_id','procedure_DATETIME']]\n", 294 | " word2vec_procs.columns = 
['sample_ID','concept_id','ts']\n", 295 | "\n", 296 | " word2vec_drug = drug_features_EHR[['sample_ID','drug_concept_id','drug_exposure_start_DATETIME']]\n", 297 | " word2vec_drug.columns = ['sample_ID','concept_id','ts']\n", 298 | " \n", 299 | " word2vec_mea = measurement_features_EHR[['sample_ID','measurement_concept_id','measurement_DATETIME']]\n", 300 | " word2vec_mea.columns = ['sample_ID','concept_id','ts']\n", 301 | " \n", 302 | " word2vec_obs = observation_features_EHR[['sample_ID','observation_concept_id','observation_DATETIME']]\n", 303 | " word2vec_obs.columns = ['sample_ID','concept_id','ts']\n", 304 | " \n", 305 | " word2vec_data = pd.concat([word2vec_conds, word2vec_procs, word2vec_drug,word2vec_mea, word2vec_obs],axis=0)\n", 306 | " word2vec_data['date'] = pd.to_datetime(word2vec_data['ts'])\n", 307 | " word2vec_data['date'] = word2vec_data['date'].dt.date\n", 308 | " word2vec_data = word2vec_data.drop('ts',axis=1)\n", 309 | " word2vec_data = word2vec_data[~pd.isnull(word2vec_data['concept_id'])]\n", 310 | " word2vec_data['concept_id'] = word2vec_data['concept_id'].astype(int)\n", 311 | " \n", 312 | " grouped_data = word2vec_data.groupby(['sample_ID', 'date'])\n", 313 | " sentences = []\n", 314 | " for _, group in tqdm(grouped_data):\n", 315 | " codes = group['concept_id'].tolist()\n", 316 | " random.shuffle(codes)\n", 317 | " sentences.append(codes)\n", 318 | " \n", 319 | " print('starting training')\n", 320 | " model = Word2Vec(sentences, vector_size=400, window=100, min_count=5, workers=64)\n", 321 | " model.train(sentences, total_examples=len(sentences), epochs=5, callbacks=[epoch_logger])\n", 322 | " model.save(\"./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 12, 328 | "id": "32b51bd1", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "code_to_embedding = {code: model.wv[code] for code in 
model.wv.index_to_key}" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 13, 338 | "id": "92deadf0", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "#replace EHR data with learned embeddings\n", 343 | "embedded_conds = condition_features_EHR[(condition_features_EHR['condition_concept_id'] != 0)]\n", 344 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['condition_concept_id'])]\n", 345 | "embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]\n" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "id": "422a0471", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "embedded_procs = procedure_features_EHR[(procedure_features_EHR['procedure_concept_id'] != 0)]\n", 356 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['procedure_concept_id'])]\n", 357 | "embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 15, 363 | "id": "47d16d21", 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "embedded_drugs = drug_features_EHR[(drug_features_EHR['drug_concept_id'] != 0)]\n", 368 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['drug_concept_id'])]\n", 369 | "embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 16, 375 | "id": "7942c2ef", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "embedded_measurements = measurement_features_EHR[(measurement_features_EHR['measurement_concept_id'] != 0)]\n", 380 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['measurement_concept_id'])]\n", 381 | "embedded_measurements['embedding'] = [code_to_embedding.get(code) for 
code in embedded_measurements['measurement_concept_id']]\n" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 17, 387 | "id": "08c61942", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "embedded_obs = observation_features_EHR[(observation_features_EHR['observation_concept_id'] != 0)]\n", 392 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['observation_concept_id'])]\n", 393 | "embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]\n" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 18, 399 | "id": "39441058", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "embedded_procs['date'] = pd.to_datetime(embedded_procs['procedure_DATETIME'].dt.date)\n", 404 | "embedded_conds['date'] = pd.to_datetime(embedded_conds['condition_start_DATETIME'].dt.date)\n", 405 | "embedded_drugs['date'] = pd.to_datetime(embedded_drugs['drug_exposure_start_DATETIME'].dt.date)\n", 406 | "embedded_measurements['date'] = pd.to_datetime(embedded_measurements['measurement_DATETIME'].dt.date)\n", 407 | "embedded_obs['date'] = pd.to_datetime(embedded_obs['observation_DATETIME'].dt.date)\n" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 19, 413 | "id": "5efd7c59", 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "CPU times: user 1.65 s, sys: 221 ms, total: 1.88 s\n", 421 | "Wall time: 1.68 s\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "%%time\n", 427 | "# Convert pandas dataframes to dask dataframes\n", 428 | "embedded_conds_dsk = dd.from_pandas(embedded_conds, npartitions=120)\n", 429 | "embedded_procs_dsk = dd.from_pandas(embedded_procs, npartitions=120)\n", 430 | "embedded_drugs_dsk = dd.from_pandas(embedded_drugs, npartitions=120)\n", 431 | "embedded_measurements_dsk = dd.from_pandas(embedded_measurements, npartitions=120)\n", 432 | 
"embedded_obs_dsk = dd.from_pandas(embedded_obs, npartitions=120)\n", 433 | "\n", 434 | "# Filter null embeddings\n", 435 | "embedded_conds_dsk = embedded_conds_dsk[embedded_conds_dsk['embedding'].notnull()]\n", 436 | "embedded_procs_dsk = embedded_procs_dsk[embedded_procs_dsk['embedding'].notnull()]\n", 437 | "embedded_drugs_dsk = embedded_drugs_dsk[embedded_drugs_dsk['embedding'].notnull()]\n", 438 | "embedded_measurements_dsk = embedded_measurements_dsk[embedded_measurements_dsk['embedding'].notnull()]\n", 439 | "embedded_obs_dsk = embedded_obs_dsk[embedded_obs_dsk['embedding'].notnull()]\n", 440 | "\n", 441 | "# Concatenate different EHR tables\n", 442 | "all_data = dd.concat([\n", 443 | " embedded_conds_dsk.drop(['DOS','mom_person_id','child_person_id', 'condition_concept_id',\n", 444 | " 'condition_start_DATETIME','child_birth_date','delta'], axis=1),\n", 445 | " embedded_procs_dsk.drop(['DOS','mom_person_id','child_person_id', 'procedure_concept_id',\n", 446 | " 'procedure_DATETIME','child_birth_date','delta'], axis=1),\n", 447 | " embedded_drugs_dsk.drop(['DOS','mom_person_id','child_person_id', 'drug_concept_id',\n", 448 | " 'drug_exposure_start_DATETIME','child_birth_date','delta'], axis=1),\n", 449 | " embedded_measurements_dsk.drop(['DOS','mom_person_id','child_person_id', 'measurement_concept_id',\n", 450 | " 'measurement_DATETIME','value_as_number','child_birth_date','delta'], axis=1),\n", 451 | " embedded_obs_dsk.drop(['DOS','mom_person_id','child_person_id', 'observation_concept_id',\n", 452 | " 'observation_DATETIME','child_birth_date','delta'], axis=1)\n", 453 | "], ignore_index=True)[['sample_ID','date','embedding']].compute()\n", 454 | "\n", 455 | "all_data.sort_values('date', ascending=False, inplace=True)\n", 456 | "\n", 457 | "\n", 458 | "max_dates = 32\n", 459 | "\n", 460 | "all_data.sort_values(by=['sample_ID', 'date'], ascending=[True, False], inplace=True)\n", 461 | "\n", 462 | "# Create a helper column to rank the unique dates for each 
person_id\n", 463 | "all_data['date_rank'] = all_data.groupby('sample_ID')['date'].transform(lambda x: x.rank(method='dense', ascending=False))\n", 464 | "\n", 465 | "# Filter the rows where date_rank is within the range of 1 to max_dates\n", 466 | "filtered_data = all_data[all_data['date_rank'].between(1, max_dates)]\n", 467 | "filtered_data = filtered_data.drop(columns='date_rank')\n" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 20, 473 | "id": "585d6bf3", 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "CPU times: user 156 ms, sys: 95 µs, total: 156 ms\n", 481 | "Wall time: 154 ms\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "%%time\n", 487 | "#create patient-day embeddings\n", 488 | "patient_day_embeddings = filtered_data.groupby(['sample_ID','date']).mean()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 21, 494 | "id": "fb177a6c", 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "patient_day_embeddings = patient_day_embeddings.reset_index()" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 22, 504 | "id": "52b92faa", 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "patient_day_embeddings['date'] = pd.to_datetime(patient_day_embeddings['date'])\n", 509 | "patient_day_embeddings = patient_day_embeddings.sort_values(['sample_ID', 'date'])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 23, 515 | "id": "247db58c", 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "unique_patients = patient_day_embeddings['sample_ID'].nunique()\n", 520 | "num_features = len(patient_day_embeddings['embedding'].iloc[0])\n" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 24, 526 | "id": "c8f56448", 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "max_dates = 
patient_day_embeddings.groupby('sample_ID')['date'].count().max()\n", 531 | "max_dates = 32" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 25, 537 | "id": "81eb984e", 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "#assign each patient to an index in the data matrix\n", 542 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())}\n" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 26, 548 | "id": "eb9a7d30", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "#create numpy matrix for data\n", 553 | "RNN_data = np.full((num_features, max_dates, unique_patients), np.nan)\n" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 27, 559 | "id": "86ccbe67", 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "application/vnd.jupyter.widget-view+json": { 565 | "model_id": "9b6c1e45999c496b8b95a203b42c5e66", 566 | "version_major": 2, 567 | "version_minor": 0 568 | }, 569 | "text/plain": [ 570 | "0it [00:00, ?it/s]" 571 | ] 572 | }, 573 | "metadata": {}, 574 | "output_type": "display_data" 575 | }, 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "CPU times: user 103 ms, sys: 17 ms, total: 120 ms\n", 581 | "Wall time: 103 ms\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "%%time\n", 587 | "#populate data matrix with input data\n", 588 | "date_position = {}\n", 589 | "for index, row in tqdm(patient_day_embeddings.iterrows()):\n", 590 | " patient_id = row['sample_ID']\n", 591 | " patient_index = patient_id_to_index[patient_id]\n", 592 | " \n", 593 | " if patient_id not in date_position:\n", 594 | " date_position[patient_id] = 0\n", 595 | " else:\n", 596 | " date_position[patient_id] += 1\n", 597 | " \n", 598 | " date_index = date_position[patient_id]\n", 599 | " \n", 600 | " RNN_data[:,date_index, patient_index] = 
row['embedding']\n" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 28, 606 | "id": "95f0c2cf", 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "RNN_data = RNN_data.transpose(2,1,0)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 29, 616 | "id": "25c6a751", 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "(113, 32, 400)" 623 | ] 624 | }, 625 | "execution_count": 29, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "RNN_data.shape" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 30, 637 | "id": "c389e1f4", 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str)+'_'+sampling_df['child_person_id'].astype(str)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 31, 647 | "id": "38a4b841", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "sampling_df = sampling_df.merge(pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T, how='right', left_on='sample_ID', right_on=0)\n" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 32, 657 | "id": "56dfc054", 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "#align outcome data with feature matrix\n", 662 | "sampling_df = sampling_df.merge(patient_day_embeddings.groupby('sample_ID').count()[['date']], how='left', on='sample_ID')" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 33, 668 | "id": "e64bdd52", 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "DOS_outcomes = np.array(sampling_df[['DOS',1]].sort_values(1)['DOS'])" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 34, 678 | "id": "51e0d166", 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "#save 
processed data\n", 683 | "np.save('./data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy', RNN_data)\n", 684 | "np.save('./data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy', DOS_outcomes)\n" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 35, 690 | "id": "0b8aaa6c", 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "num_patient_visits = np.minimum(np.array(sampling_df['date']), 32)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 36, 700 | "id": "baba0b97", 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "np.save('./data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy', num_patient_visits)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 37, 710 | "id": "7f0cffbc", 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "df = pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T\n", 715 | "df.to_csv('./data/processed_data/sampleID_indices_full_cohort_with_obs_fixed.csv')" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "id": "da8cbeee", 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [] 725 | } 726 | ], 727 | "metadata": { 728 | "kernelspec": { 729 | "display_name": "Python 3 (ipykernel)", 730 | "language": "python", 731 | "name": "python3" 732 | }, 733 | "language_info": { 734 | "codemirror_mode": { 735 | "name": "ipython", 736 | "version": 3 737 | }, 738 | "file_extension": ".py", 739 | "mimetype": "text/x-python", 740 | "name": "python", 741 | "nbconvert_exporter": "python", 742 | "pygments_lexer": "ipython3", 743 | "version": "3.10.6" 744 | } 745 | }, 746 | "nbformat": 4, 747 | "nbformat_minor": 5 748 | } 749 | -------------------------------------------------------------------------------- /Onset of Labor/process_EHR_data_omics_cohort.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 41, 6 | "id": "401a48de", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm.notebook import tqdm\n", 13 | "from scipy.stats import pearsonr\n", 14 | "import gensim\n", 15 | "from gensim.models import Word2Vec\n", 16 | "from gensim.models.callbacks import CallbackAny2Vec\n", 17 | "import random\n", 18 | "import pickle\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import torch" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 42, 27 | "id": "fcfe5126", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#load CSV files which are direct extracts from OMOP tables\n", 32 | "conds = pd.read_csv('./data/raw_data/EHR/EHR_cohort_conditions.csv')\n", 33 | "conds = conds[conds['condition_concept_id'] != 0]\n", 34 | "drugs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_drugs.csv')\n", 35 | "drugs = drugs[drugs['drug_concept_id'] != 0]\n", 36 | "procs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_procedures.csv')\n", 37 | "procs = procs[procs['procedure_concept_id'] != 0]\n", 38 | "obs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_observations.csv')\n", 39 | "obs = obs[obs['observation_concept_id'] != 0]\n", 40 | "\n", 41 | "conds['condition_start_DATETIME'] = pd.to_datetime(conds['condition_start_DATETIME'])\n", 42 | "procs['procedure_DATETIME'] = pd.to_datetime(procs['procedure_DATETIME'])\n", 43 | "drugs['drug_exposure_start_DATETIME'] = pd.to_datetime(drugs['drug_exposure_start_DATETIME'])\n", 44 | "obs['observation_DATETIME'] = pd.to_datetime(obs['observation_DATETIME'])\n", 45 | "\n", 46 | "conds['child_birth_date'] = pd.to_datetime(conds['child_birth_date'])\n", 47 | "procs['child_birth_date'] = pd.to_datetime(procs['child_birth_date'])\n", 48 | 
"drugs['child_birth_date'] = pd.to_datetime(drugs['child_birth_date'])\n", 49 | "obs['child_birth_date'] = pd.to_datetime(obs['child_birth_date'])\n", 50 | "\n", 51 | "measurements = pd.read_csv('./data/raw_data/EHR/EHR_cohort_measurements.csv')\n", 52 | "measurements = measurements[~pd.isnull(measurements['value_as_number'])]\n", 53 | "measurements = measurements[measurements['measurement_concept_id'] != 0]\n", 54 | "measurements['measurement_DATETIME'] = pd.to_datetime(measurements['measurement_DATETIME'])\n", 55 | "measurements['child_birth_date'] = pd.to_datetime(measurements['child_birth_date'])" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 43, 61 | "id": "c7f04ac1", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "#load IDs of mothers in omics cohort\n", 66 | "OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv')['mom_person_id'].values" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 44, 72 | "id": "42763cbb", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "#filter data to only mothers in omics cohort\n", 77 | "conds = conds[conds['mom_person_id'].isin(OOL_cohort_omop)]\n", 78 | "drugs = drugs[drugs['mom_person_id'].isin(OOL_cohort_omop)]\n", 79 | "procs = procs[procs['mom_person_id'].isin(OOL_cohort_omop)]\n", 80 | "measurements = measurements[measurements['mom_person_id'].isin(OOL_cohort_omop)]\n", 81 | "obs = obs[obs['mom_person_id'].isin(OOL_cohort_omop)]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 45, 87 | "id": "b504fe9c", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def filter_df(df, birth_time, time_col, time_range_days=280):\n", 92 | " \"\"\"\n", 93 | " A function to remove entries in a dataframe prior to time of birth. \n", 94 | " \n", 95 | " df: The dataframe to filter. 
Must contain a column called person_id with the OMOP ID of the mother\n", 96 | " birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME\n", 97 | " time_col: The index of the column with the date of the event in df\n", 98 | " time_range_days: keeps data from delivery up to time_range_days prior \n", 99 | " \n", 100 | " \"\"\"\n", 101 | " print('There were {} patients before filtering.'.format(len(df['person_id'].unique())))\n", 102 | " df = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')\n", 103 | " df['diff'] = df['birth_DATETIME']-df[time_col]\n", 104 | " new_df = df[(df['diff'].dt.days > 0) & (df['diff'].dt.days <= time_range_days)].drop('maternal_OMOP', axis=1)\n", 105 | " print('There were {} patients after filtering.'.format(len(new_df['person_id'].unique())))\n", 106 | " return new_df\n", 107 | "\n", 108 | "def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):\n", 109 | " df = proteomics[['DOS','mom_person_id','child_person_id','sample_ID']].merge(input_df, how='left', on=['mom_person_id','child_person_id'])\n", 110 | " df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 111 | " df = df[df['delta'] < df['DOS']]\n", 112 | " return df\n", 113 | " " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 46, 119 | "id": "2742da9d", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "#load key file which can be used to map proteomics data to mother person_id\n", 124 | "patient_indices = pd.read_csv('./data/processed_data/sampleID_indices.csv')" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "id": "558feb6e", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "#load and clean proteomics data\n", 135 | "OOL_proteomics = pd.read_csv('./data/processed_data/ool_proteomics_omop_id.csv')\n", 136 | "OOL_proteomics['sample_ID'] = 
OOL_proteomics['maternal_person_id'].astype(str)+'_'+OOL_proteomics['Timepoint'].astype(str)\n", 137 | "OOL_proteomics = OOL_proteomics.drop(['Timepoint','maternal_person_id'],axis=1)\n", 138 | "OOL_proteomics.columns = [str(i)+'_protein' for i in OOL_proteomics.columns]\n", 139 | "OOL_proteomics = OOL_proteomics.rename(columns={'DOS_protein':'DOS_sampling_time', 'sample_ID_protein':'sample_ID'})\n", 140 | "OOL_proteomics = OOL_proteomics[['sample_ID','DOS_sampling_time']]\n", 141 | "OOL_proteomics['mom_person_id'] = OOL_proteomics['sample_ID'].str[0:7].astype(int)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "id": "668c6b85", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "## This block of code creates a dataframe with mom_person_id, child_person_id, min_delta, max_delta \n", 152 | "## (based on the range of EHR data available), days to onset, and a combined sample_ID col which is used as an identifier\n", 153 | "\n", 154 | "# Filter and calculate delta\n", 155 | "time_col_name = 'condition_start_DATETIME'\n", 156 | "df = conds\n", 157 | "df['delta'] = (df[time_col_name] - df['child_birth_date']).dt.days\n", 158 | "\n", 159 | "# Calculate min and max delta in one operation\n", 160 | "ool = df.groupby(['mom_person_id', 'child_person_id'])['delta'].agg(['min', 'max'])\n", 161 | "ool.columns = ['min_delta', 'max_delta']\n", 162 | "\n", 163 | "# Filter for samples with at least 7 days between min and max\n", 164 | "sampling_df = ool[ool['max_delta'] - ool['min_delta'] >= 7].reset_index()\n", 165 | "\n", 166 | "# Create initial sample_ID\n", 167 | "sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str) + '_' + sampling_df['child_person_id'].astype(str)\n", 168 | "\n", 169 | "# Filter based on OOL_sample_IDs\n", 170 | "OOL_sample_IDs = np.unique([i[0:15] for i in list(patient_indices['0'])])\n", 171 | "sampling_df = sampling_df[sampling_df['sample_ID'].str[:15].isin(OOL_sample_IDs)]\n", 172 | 
"\n", 173 | "# Merge with OOL_proteomics\n", 174 | "sampling_df = sampling_df.merge(OOL_proteomics, how='inner', on='mom_person_id', suffixes=('_x', '_y'))\n", 175 | "\n", 176 | "# Set DOS\n", 177 | "sampling_df['DOS'] = sampling_df['DOS_sampling_time']\n", 178 | "\n", 179 | "# Create the correct sample_ID\n", 180 | "sampling_df['sample_ID'] = sampling_df['sample_ID_x'] + sampling_df['sample_ID_y'].str[-3:]\n", 181 | "\n", 182 | "# Drop unnecessary columns\n", 183 | "columns_to_drop = ['sample_ID_x', 'sample_ID_y', 'DOS_sampling_time']\n", 184 | "sampling_df = sampling_df.drop(columns_to_drop, axis=1, errors='ignore')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "id": "3a0c5c4a", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "#filter data so it only occurs within the correct time range (beginning of pregnancy thru sampling)\n", 195 | "condition_features_EHR = generate_features_EHR_cohort(sampling_df, conds, 'condition_start_DATETIME','condition_concept_id','C')\n", 196 | "procedure_features_EHR = generate_features_EHR_cohort(sampling_df, procs, 'procedure_DATETIME','procedure_concept_id','P')\n", 197 | "drug_features_EHR = generate_features_EHR_cohort(sampling_df, drugs, 'drug_exposure_start_DATETIME','drug_concept_id','D')\n", 198 | "measurement_features_EHR = generate_features_EHR_cohort(sampling_df, measurements, 'measurement_DATETIME','measurement_concept_id','M')\n", 199 | "observation_features_EHR = generate_features_EHR_cohort(sampling_df, obs, 'observation_DATETIME','observation_concept_id','O')\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 10, 205 | "id": "a3facc35", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "class EpochLogger(CallbackAny2Vec):\n", 210 | " def __init__(self):\n", 211 | " self.epoch = 0\n", 212 | "\n", 213 | " def on_epoch_begin(self, model):\n", 214 | " print(f\"Starting epoch #{self.epoch}\")\n", 215 | "\n", 216 | " 
def on_epoch_end(self, model):\n", 217 | " print(f\"Finished epoch #{self.epoch}\")\n", 218 | " self.epoch += 1" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "id": "01f87efa", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Train word2vec model\n", 229 | "# NOTE: For word2vec model training, we do NOT do the date filtering and use all data from pregnancy\n", 230 | "try:\n", 231 | " model = Word2Vec.load(\"./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n", 232 | "except:\n", 233 | " \n", 234 | " epoch_logger = EpochLogger()\n", 235 | "\n", 236 | " word2vec_conds = condition_features_EHR[['sample_ID','condition_concept_id','condition_start_DATETIME']]\n", 237 | " word2vec_conds.columns = ['sample_ID','concept_id','ts']\n", 238 | "\n", 239 | " word2vec_procs = procedure_features_EHR[['sample_ID','procedure_concept_id','procedure_DATETIME']]\n", 240 | " word2vec_procs.columns = ['sample_ID','concept_id','ts']\n", 241 | "\n", 242 | " word2vec_drug = drug_features_EHR[['sample_ID','drug_concept_id','drug_exposure_start_DATETIME']]\n", 243 | " word2vec_drug.columns = ['sample_ID','concept_id','ts']\n", 244 | " \n", 245 | " word2vec_mea = measurement_features_EHR[['sample_ID','measurement_concept_id','measurement_DATETIME']]\n", 246 | " word2vec_mea.columns = ['sample_ID','concept_id','ts']\n", 247 | " \n", 248 | " word2vec_obs = observation_features_EHR[['sample_ID','observation_concept_id','observation_DATETIME']]\n", 249 | " word2vec_obs.columns = ['sample_ID','concept_id','ts']\n", 250 | " \n", 251 | " word2vec_data = pd.concat([word2vec_conds, word2vec_procs, word2vec_drug,word2vec_mea, word2vec_obs],axis=0)\n", 252 | " word2vec_data['date'] = pd.to_datetime(word2vec_data['ts'])\n", 253 | " word2vec_data['date'] = word2vec_data['date'].dt.date\n", 254 | " word2vec_data = word2vec_data.drop('ts',axis=1)\n", 255 | " word2vec_data = 
word2vec_data[~pd.isnull(word2vec_data['concept_id'])]\n", 256 | " word2vec_data['concept_id'] = word2vec_data['concept_id'].astype(int)\n", 257 | " \n", 258 | " grouped_data = word2vec_data.groupby(['sample_ID', 'date'])\n", 259 | " sentences = []\n", 260 | " for _, group in tqdm(grouped_data):\n", 261 | " codes = group['concept_id'].tolist()\n", 262 | " random.shuffle(codes)\n", 263 | " sentences.append(codes)\n", 264 | " \n", 265 | " print('starting training')\n", 266 | " model = Word2Vec(sentences, vector_size=400, window=1000, min_count=5, workers=64)\n", 267 | " model.train(sentences, total_examples=len(sentences), epochs=5, callbacks=[epoch_logger])\n", 268 | " model.save(\"./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 12, 274 | "id": "2c112962", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "code_to_embedding = {code: model.wv[code] for code in model.wv.index_to_key}" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 13, 284 | "id": "004203e4", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "#map EHR data to their respective learned embeddings from word2vec\n", 289 | "embedded_conds = condition_features_EHR[(condition_features_EHR['condition_concept_id'] != 0)]\n", 290 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['condition_concept_id'])]\n", 291 | "embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 14, 297 | "id": "0a2def90", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "embedded_procs = procedure_features_EHR[(procedure_features_EHR['procedure_concept_id'] != 0)]\n", 302 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['procedure_concept_id'])]\n", 303 | 
"embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 15, 309 | "id": "ca224cd0", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "embedded_drugs = drug_features_EHR[(drug_features_EHR['drug_concept_id'] != 0)]\n", 314 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['drug_concept_id'])]\n", 315 | "embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 16, 321 | "id": "6d61abf2", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "embedded_measurements = measurement_features_EHR[(measurement_features_EHR['measurement_concept_id'] != 0)]\n", 326 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['measurement_concept_id'])]\n", 327 | "embedded_measurements['embedding'] = [code_to_embedding.get(code) for code in embedded_measurements['measurement_concept_id']]\n" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 17, 333 | "id": "a8d8e620", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "embedded_obs = observation_features_EHR[(observation_features_EHR['observation_concept_id'] != 0)]\n", 338 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['observation_concept_id'])]\n", 339 | "embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 18, 345 | "id": "1b284aab", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "embedded_procs['date'] = pd.to_datetime(embedded_procs['procedure_DATETIME'].dt.date)\n", 350 | "embedded_conds['date'] = pd.to_datetime(embedded_conds['condition_start_DATETIME'].dt.date)\n", 351 | "embedded_drugs['date'] 
= pd.to_datetime(embedded_drugs['drug_exposure_start_DATETIME'].dt.date)\n", 352 | "embedded_measurements['date'] = pd.to_datetime(embedded_measurements['measurement_DATETIME'].dt.date)\n", 353 | "embedded_obs['date'] = pd.to_datetime(embedded_obs['observation_DATETIME'].dt.date)\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 19, 359 | "id": "34f1dc40", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "done making interim dataframe\n", 367 | "CPU times: user 79.5 ms, sys: 12.5 ms, total: 91.9 ms\n", 368 | "Wall time: 89.2 ms\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "#combine all EHR data tables together\n", 375 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['embedding'])]\n", 376 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['embedding'])]\n", 377 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['embedding'])]\n", 378 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['embedding'])]\n", 379 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['embedding'])]\n", 380 | "\n", 381 | "all_data = pd.concat([embedded_conds.drop(['DOS','mom_person_id','child_person_id', 'condition_concept_id',\n", 382 | " 'condition_start_DATETIME','child_birth_date','delta'],axis=1),\n", 383 | " embedded_procs.drop(['DOS','mom_person_id','child_person_id', 'procedure_concept_id',\n", 384 | " 'procedure_DATETIME','child_birth_date','delta'],axis=1),\n", 385 | " embedded_drugs.drop(['DOS','mom_person_id','child_person_id', 'drug_concept_id',\n", 386 | " 'drug_exposure_start_DATETIME','child_birth_date','delta'],axis=1),\n", 387 | " embedded_measurements.drop(['DOS','mom_person_id','child_person_id', 'measurement_concept_id',\n", 388 | " 'measurement_DATETIME','value_as_number','child_birth_date','delta'],axis=1),\n", 389 | " 
embedded_obs.drop(['DOS','mom_person_id','child_person_id', 'observation_concept_id',\n", 390 | " 'observation_DATETIME','child_birth_date','delta'],axis=1)], ignore_index=True)[['sample_ID','date','embedding']]\n", 391 | "\n", 392 | "expanded_embedding_df = pd.DataFrame(all_data['embedding'].tolist())\n", 393 | "print('done making interim dataframe')\n", 394 | "all_data = pd.concat([all_data.reset_index(drop=True).drop('embedding',axis=1), expanded_embedding_df], axis=1)\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 20, 400 | "id": "44e59ec0", 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "CPU times: user 6.59 ms, sys: 0 ns, total: 6.59 ms\n", 408 | "Wall time: 5.22 ms\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "%%time\n", 414 | "#take the mean to compute patient-day embeddings\n", 415 | "patient_day_embeddings = all_data.groupby(['sample_ID','date']).mean()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 21, 421 | "id": "07dfd3a3", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "patient_day_embeddings = patient_day_embeddings.reset_index()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 22, 431 | "id": "2fbf3201", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "patient_day_embeddings['date'] = pd.to_datetime(patient_day_embeddings['date'])\n", 436 | "patient_day_embeddings = patient_day_embeddings.sort_values(['sample_ID', 'date'])" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 23, 442 | "id": "cca3d994", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "unique_patients = patient_day_embeddings['sample_ID'].nunique()\n", 447 | "num_features = len(patient_day_embeddings.columns) - 2 # Subtract patient_id and date columns\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | 
"execution_count": 24, 453 | "id": "ca7e301b", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "max_dates = patient_day_embeddings.groupby('sample_ID')['date'].count().max()\n", 458 | "max_dates = 32" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 25, 464 | "id": "bdeb43c9", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "#assign each patient id to an index in the input data matrix\n", 469 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())}\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 26, 475 | "id": "d569be44", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "#create input data matrix\n", 480 | "RNN_data = np.full((num_features, max_dates, unique_patients), np.nan)\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 27, 486 | "id": "2c0b8f07", 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "application/vnd.jupyter.widget-view+json": { 492 | "model_id": "0704510ca12f4363bda5bce905a00a1b", 493 | "version_major": 2, 494 | "version_minor": 0 495 | }, 496 | "text/plain": [ 497 | "0it [00:00, ?it/s]" 498 | ] 499 | }, 500 | "metadata": {}, 501 | "output_type": "display_data" 502 | }, 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "CPU times: user 408 ms, sys: 16.3 ms, total: 424 ms\n", 508 | "Wall time: 407 ms\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "%%time\n", 514 | "#fill in input data matrix with person-day EHR data embeddings\n", 515 | "date_position = {}\n", 516 | "for index, row in tqdm(patient_day_embeddings.iterrows()):\n", 517 | " patient_id = row['sample_ID']\n", 518 | " patient_index = patient_id_to_index[patient_id]\n", 519 | " \n", 520 | " if patient_id not in date_position:\n", 521 | " date_position[patient_id] = 0\n", 522 | " else:\n", 523 | " 
date_position[patient_id] += 1\n", 524 | " \n", 525 | " date_index = date_position[patient_id]\n", 526 | " \n", 527 | " for feature_index, feature_value in enumerate(row.drop(['sample_ID', 'date'])):\n", 528 | " if date_index < max_dates:\n", 529 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 28, 535 | "id": "c2de670d", 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "(400, 32, 42)" 542 | ] 543 | }, 544 | "execution_count": 28, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "RNN_data.shape" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 29, 556 | "id": "f21c2dd3", 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "RNN_data = RNN_data.transpose(2,1,0)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 30, 566 | "id": "f043f5ae", 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "(42, 32, 400)" 573 | ] 574 | }, 575 | "execution_count": 30, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "RNN_data.shape" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 31, 587 | "id": "98f90b14", 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "#align outcome data with correct index\n", 592 | "sampling_df = sampling_df.merge(pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T, how='right', left_on='sample_ID', right_on=0)\n" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 32, 598 | "id": "e6fd3f73", 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "#align outcome data with correct index\n", 603 | "sampling_df = sampling_df.merge(patient_day_embeddings.groupby('sample_ID').count()[['date']], how='left', 
on='sample_ID')" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 33, 609 | "id": "74645d66", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "DOS_outcomes = np.array(sampling_df[['DOS',1]].sort_values(1)['DOS'])" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 34, 619 | "id": "d6490d23", 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "((42, 32, 400), (42,))" 626 | ] 627 | }, 628 | "execution_count": 34, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "RNN_data.shape, DOS_outcomes.shape" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 35, 640 | "id": "e6427287", 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "#Save processed data below" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 36, 650 | "id": "86cda5eb", 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "np.save('./data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy', RNN_data)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 37, 660 | "id": "862c6ed5", 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "np.save('./data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy', DOS_outcomes)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 38, 670 | "id": "9bb0ee8e", 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "patient_outcomes = torch.tensor(DOS_outcomes).float()\n", 675 | "num_patient_visits = np.minimum(np.array(sampling_df['date']), 32)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 39, 681 | "id": "e3a77d44", 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "np.save('./data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy', num_patient_visits)" 686 | ] 687 | }, 
688 | { 689 | "cell_type": "code", 690 | "execution_count": 40, 691 | "id": "76b41b0b", 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('./data/processed_data/sampleID_indices_with_obs_word2vec_from_ool.csv')\n" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "id": "c6018108", 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "ba06bf46", 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "id": "4dd427e9", 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [] 721 | } 722 | ], 723 | "metadata": { 724 | "kernelspec": { 725 | "display_name": "Python 3 (ipykernel)", 726 | "language": "python", 727 | "name": "python3" 728 | }, 729 | "language_info": { 730 | "codemirror_mode": { 731 | "name": "ipython", 732 | "version": 3 733 | }, 734 | "file_extension": ".py", 735 | "mimetype": "text/x-python", 736 | "name": "python", 737 | "nbconvert_exporter": "python", 738 | "pygments_lexer": "ipython3", 739 | "version": "3.10.6" 740 | } 741 | }, 742 | "nbformat": 4, 743 | "nbformat_minor": 5 744 | } 745 | -------------------------------------------------------------------------------- /Onset of Labor/process_EHR_data_omics_cohort_with_PT_word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "401a48de", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from tqdm.notebook import tqdm\n", 13 | "from scipy.stats import pearsonr\n", 14 | "import gensim\n", 15 | "from gensim.models import Word2Vec\n", 16 | "from gensim.models.callbacks import 
CallbackAny2Vec\n", 17 | "import random\n", 18 | "import pickle\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import torch" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "fcfe5126", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#load CSV files which are direct extracts from OMOP tables\n", 32 | "conds = pd.read_csv('./data/raw_data/EHR/EHR_cohort_conditions.csv')\n", 33 | "conds = conds[conds['condition_concept_id'] != 0]\n", 34 | "drugs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_drugs.csv')\n", 35 | "drugs = drugs[drugs['drug_concept_id'] != 0]\n", 36 | "procs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_procedures.csv')\n", 37 | "procs = procs[procs['procedure_concept_id'] != 0]\n", 38 | "obs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_observations.csv')\n", 39 | "obs = obs[obs['observation_concept_id'] != 0]\n", 40 | "\n", 41 | "conds['condition_start_DATETIME'] = pd.to_datetime(conds['condition_start_DATETIME'])\n", 42 | "procs['procedure_DATETIME'] = pd.to_datetime(procs['procedure_DATETIME'])\n", 43 | "drugs['drug_exposure_start_DATETIME'] = pd.to_datetime(drugs['drug_exposure_start_DATETIME'])\n", 44 | "obs['observation_DATETIME'] = pd.to_datetime(obs['observation_DATETIME'])\n", 45 | "\n", 46 | "conds['child_birth_date'] = pd.to_datetime(conds['child_birth_date'])\n", 47 | "procs['child_birth_date'] = pd.to_datetime(procs['child_birth_date'])\n", 48 | "drugs['child_birth_date'] = pd.to_datetime(drugs['child_birth_date'])\n", 49 | "obs['child_birth_date'] = pd.to_datetime(obs['child_birth_date'])\n", 50 | "\n", 51 | "measurements = pd.read_csv('./data/raw_data/EHR/EHR_cohort_measurements.csv')\n", 52 | "measurements = measurements[~pd.isnull(measurements['value_as_number'])]\n", 53 | "measurements = measurements[measurements['measurement_concept_id'] != 0]\n", 54 | "measurements['measurement_DATETIME'] = 
pd.to_datetime(measurements['measurement_DATETIME'])\n", 55 | "measurements['child_birth_date'] = pd.to_datetime(measurements['child_birth_date'])" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "c7f04ac1", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "#load IDs of mothers in omics cohort\n", 66 | "OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv')['mom_person_id'].values" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "42763cbb", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "#filter data to only mothers in omics cohort\n", 77 | "conds = conds[conds['mom_person_id'].isin(OOL_cohort_omop)]\n", 78 | "drugs = drugs[drugs['mom_person_id'].isin(OOL_cohort_omop)]\n", 79 | "procs = procs[procs['mom_person_id'].isin(OOL_cohort_omop)]\n", 80 | "measurements = measurements[measurements['mom_person_id'].isin(OOL_cohort_omop)]\n", 81 | "obs = obs[obs['mom_person_id'].isin(OOL_cohort_omop)]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "id": "b504fe9c", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "def filter_df(df, birth_time, time_col, time_range_days=280):\n", 92 | " \"\"\"\n", 93 | " A function to remove entries in a dataframe prior to time of birth. \n", 94 | " \n", 95 | " df: The dataframe to filter. 
Must contain a column called person_id with the OMOP ID of the mother\n", 96 | " birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME\n", 97 | " time_col: The index of the column with the date of the event in df\n", 98 | " time_range_days: keeps data from delivery up to time_range_days prior \n", 99 | " \n", 100 | " \"\"\"\n", 101 | " print('There were {} patients before filtering.'.format(len(df['person_id'].unique())))\n", 102 | " df = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')\n", 103 | " df['diff'] = df['birth_DATETIME']-df[time_col]\n", 104 | " new_df = df[(df['diff'].dt.days > 0) & (df['diff'].dt.days <= time_range_days)].drop('maternal_OMOP', axis=1)\n", 105 | " print('There were {} patients after filtering.'.format(len(new_df['person_id'].unique())))\n", 106 | " return new_df\n", 107 | "\n", 108 | "def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):\n", 109 | " df = proteomics[['DOS','mom_person_id','child_person_id','sample_ID']].merge(input_df, how='left', on=['mom_person_id','child_person_id'])\n", 110 | " df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days\n", 111 | " df = df[df['delta'] < df['DOS']]\n", 112 | " return df\n", 113 | " " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "id": "2742da9d", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "#load key file which can be used to map proteomics data to mother person_id\n", 124 | "patient_indices = pd.read_csv('./data/processed_data/sampleID_indices.csv')" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 7, 130 | "id": "558feb6e", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "#load and clean proteomics data\n", 135 | "OOL_proteomics = pd.read_csv('./data/processed_data/ool_proteomics_omop_id.csv')\n", 136 | "OOL_proteomics['sample_ID'] = 
OOL_proteomics['maternal_person_id'].astype(str)+'_'+OOL_proteomics['Timepoint'].astype(str)\n", 137 | "OOL_proteomics = OOL_proteomics.drop(['Timepoint','maternal_person_id'],axis=1)\n", 138 | "OOL_proteomics.columns = [str(i)+'_protein' for i in OOL_proteomics.columns]\n", 139 | "OOL_proteomics = OOL_proteomics.rename(columns={'DOS_protein':'DOS_sampling_time', 'sample_ID_protein':'sample_ID'})\n", 140 | "OOL_proteomics = OOL_proteomics[['sample_ID','DOS_sampling_time']]\n", 141 | "OOL_proteomics['mom_person_id'] = OOL_proteomics['sample_ID'].str[0:7].astype(int)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "id": "668c6b85", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "## This block of code creates a dataframe with mom_person_id, child_person_id, min_delta, max_delta \n", 152 | "## (based on the range of EHR data available), days to onset, and a combined sample_ID col which is used as an identifier\n", 153 | "\n", 154 | "# Filter and calculate delta\n", 155 | "time_col_name = 'condition_start_DATETIME'\n", 156 | "df = conds\n", 157 | "df['delta'] = (df[time_col_name] - df['child_birth_date']).dt.days\n", 158 | "\n", 159 | "# Calculate min and max delta in one operation\n", 160 | "ool = df.groupby(['mom_person_id', 'child_person_id'])['delta'].agg(['min', 'max'])\n", 161 | "ool.columns = ['min_delta', 'max_delta']\n", 162 | "\n", 163 | "# Filter for samples with at least 7 days between min and max\n", 164 | "sampling_df = ool[ool['max_delta'] - ool['min_delta'] >= 7].reset_index()\n", 165 | "\n", 166 | "# Create initial sample_ID\n", 167 | "sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str) + '_' + sampling_df['child_person_id'].astype(str)\n", 168 | "\n", 169 | "# Filter based on OOL_sample_IDs\n", 170 | "OOL_sample_IDs = np.unique([i[0:15] for i in list(patient_indices['0'])])\n", 171 | "sampling_df = sampling_df[sampling_df['sample_ID'].str[:15].isin(OOL_sample_IDs)]\n", 172 | 
"\n", 173 | "# Merge with OOL_proteomics\n", 174 | "sampling_df = sampling_df.merge(OOL_proteomics, how='inner', on='mom_person_id', suffixes=('_x', '_y'))\n", 175 | "\n", 176 | "# Set DOS\n", 177 | "sampling_df['DOS'] = sampling_df['DOS_sampling_time']\n", 178 | "\n", 179 | "# Create the correct sample_ID\n", 180 | "sampling_df['sample_ID'] = sampling_df['sample_ID_x'] + sampling_df['sample_ID_y'].str[-3:]\n", 181 | "\n", 182 | "# Drop unnecessary columns\n", 183 | "columns_to_drop = ['sample_ID_x', 'sample_ID_y', 'DOS_sampling_time']\n", 184 | "sampling_df = sampling_df.drop(columns_to_drop, axis=1, errors='ignore')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "id": "3a0c5c4a", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "#filter data so it only occurs within the correct time range (beginning of pregnancy thru sampling)\n", 195 | "condition_features_EHR = generate_features_EHR_cohort(sampling_df, conds, 'condition_start_DATETIME','condition_concept_id','C')\n", 196 | "procedure_features_EHR = generate_features_EHR_cohort(sampling_df, procs, 'procedure_DATETIME','procedure_concept_id','P')\n", 197 | "drug_features_EHR = generate_features_EHR_cohort(sampling_df, drugs, 'drug_exposure_start_DATETIME','drug_concept_id','D')\n", 198 | "measurement_features_EHR = generate_features_EHR_cohort(sampling_df, measurements, 'measurement_DATETIME','measurement_concept_id','M')\n", 199 | "observation_features_EHR = generate_features_EHR_cohort(sampling_df, obs, 'observation_DATETIME','observation_concept_id','O')\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 10, 205 | "id": "a3facc35", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "class EpochLogger(CallbackAny2Vec):\n", 210 | " def __init__(self):\n", 211 | " self.epoch = 0\n", 212 | "\n", 213 | " def on_epoch_begin(self, model):\n", 214 | " print(f\"Starting epoch #{self.epoch}\")\n", 215 | "\n", 216 | " 
def on_epoch_end(self, model):\n", 217 | " print(f\"Finished epoch #{self.epoch}\")\n", 218 | " self.epoch += 1" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "id": "01f87efa", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Train word2vec model\n", 229 | "# NOTE: For word2vec model training, we do NOT do the date filtering and use all data from pregnancy\n", 230 | "try:\n", 231 | " model = Word2Vec.load(\"./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n", 232 | "except:\n", 233 | " print('word2vec model from PT cohort not available, proceeding with training new word2vec model')\n", 234 | " epoch_logger = EpochLogger()\n", 235 | "\n", 236 | " word2vec_conds = condition_features_EHR[['sample_ID','condition_concept_id','condition_start_DATETIME']]\n", 237 | " word2vec_conds.columns = ['sample_ID','concept_id','ts']\n", 238 | "\n", 239 | " word2vec_procs = procedure_features_EHR[['sample_ID','procedure_concept_id','procedure_DATETIME']]\n", 240 | " word2vec_procs.columns = ['sample_ID','concept_id','ts']\n", 241 | "\n", 242 | " word2vec_drug = drug_features_EHR[['sample_ID','drug_concept_id','drug_exposure_start_DATETIME']]\n", 243 | " word2vec_drug.columns = ['sample_ID','concept_id','ts']\n", 244 | " \n", 245 | " word2vec_mea = measurement_features_EHR[['sample_ID','measurement_concept_id','measurement_DATETIME']]\n", 246 | " word2vec_mea.columns = ['sample_ID','concept_id','ts']\n", 247 | " \n", 248 | " word2vec_obs = observation_features_EHR[['sample_ID','observation_concept_id','observation_DATETIME']]\n", 249 | " word2vec_obs.columns = ['sample_ID','concept_id','ts']\n", 250 | " \n", 251 | " word2vec_data = pd.concat([word2vec_conds, word2vec_procs, word2vec_drug,word2vec_mea, word2vec_obs],axis=0)\n", 252 | " word2vec_data['date'] = pd.to_datetime(word2vec_data['ts'])\n", 253 | " word2vec_data['date'] = word2vec_data['date'].dt.date\n", 254 
| " word2vec_data = word2vec_data.drop('ts',axis=1)\n", 255 | " word2vec_data = word2vec_data[~pd.isnull(word2vec_data['concept_id'])]\n", 256 | " word2vec_data['concept_id'] = word2vec_data['concept_id'].astype(int)\n", 257 | " \n", 258 | " grouped_data = word2vec_data.groupby(['sample_ID', 'date'])\n", 259 | " sentences = []\n", 260 | " for _, group in tqdm(grouped_data):\n", 261 | " codes = group['concept_id'].tolist()\n", 262 | " random.shuffle(codes)\n", 263 | " sentences.append(codes)\n", 264 | " \n", 265 | " print('starting training')\n", 266 | " model = Word2Vec(sentences, vector_size=400, window=1000, min_count=5, workers=64)\n", 267 | " model.train(sentences, total_examples=len(sentences), epochs=5, callbacks=[epoch_logger])\n", 268 | " model.save(\"./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model\")\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 12, 274 | "id": "2c112962", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "code_to_embedding = {code: model.wv[code] for code in model.wv.index_to_key}" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 13, 284 | "id": "004203e4", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "#map EHR data to their respective learned embeddings from word2vec\n", 289 | "embedded_conds = condition_features_EHR[(condition_features_EHR['condition_concept_id'] != 0)]\n", 290 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['condition_concept_id'])]\n", 291 | "embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 14, 297 | "id": "0a2def90", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "embedded_procs = procedure_features_EHR[(procedure_features_EHR['procedure_concept_id'] != 0)]\n", 302 | "embedded_procs = 
embedded_procs[~pd.isnull(embedded_procs['procedure_concept_id'])]\n", 303 | "embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 15, 309 | "id": "ca224cd0", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "embedded_drugs = drug_features_EHR[(drug_features_EHR['drug_concept_id'] != 0)]\n", 314 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['drug_concept_id'])]\n", 315 | "embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 16, 321 | "id": "6d61abf2", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "embedded_measurements = measurement_features_EHR[(measurement_features_EHR['measurement_concept_id'] != 0)]\n", 326 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['measurement_concept_id'])]\n", 327 | "embedded_measurements['embedding'] = [code_to_embedding.get(code) for code in embedded_measurements['measurement_concept_id']]\n" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 17, 333 | "id": "a8d8e620", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "embedded_obs = observation_features_EHR[(observation_features_EHR['observation_concept_id'] != 0)]\n", 338 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['observation_concept_id'])]\n", 339 | "embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 18, 345 | "id": "1b284aab", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "embedded_procs['date'] = pd.to_datetime(embedded_procs['procedure_DATETIME'].dt.date)\n", 350 | "embedded_conds['date'] = 
pd.to_datetime(embedded_conds['condition_start_DATETIME'].dt.date)\n", 351 | "embedded_drugs['date'] = pd.to_datetime(embedded_drugs['drug_exposure_start_DATETIME'].dt.date)\n", 352 | "embedded_measurements['date'] = pd.to_datetime(embedded_measurements['measurement_DATETIME'].dt.date)\n", 353 | "embedded_obs['date'] = pd.to_datetime(embedded_obs['observation_DATETIME'].dt.date)\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 19, 359 | "id": "34f1dc40", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "done making interim dataframe\n", 367 | "CPU times: user 79.1 ms, sys: 4.32 ms, total: 83.4 ms\n", 368 | "Wall time: 80.3 ms\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "#combine all EHR data tables together\n", 375 | "embedded_conds = embedded_conds[~pd.isnull(embedded_conds['embedding'])]\n", 376 | "embedded_procs = embedded_procs[~pd.isnull(embedded_procs['embedding'])]\n", 377 | "embedded_drugs = embedded_drugs[~pd.isnull(embedded_drugs['embedding'])]\n", 378 | "embedded_measurements = embedded_measurements[~pd.isnull(embedded_measurements['embedding'])]\n", 379 | "embedded_obs = embedded_obs[~pd.isnull(embedded_obs['embedding'])]\n", 380 | "\n", 381 | "all_data = pd.concat([embedded_conds.drop(['DOS','mom_person_id','child_person_id', 'condition_concept_id',\n", 382 | " 'condition_start_DATETIME','child_birth_date','delta'],axis=1),\n", 383 | " embedded_procs.drop(['DOS','mom_person_id','child_person_id', 'procedure_concept_id',\n", 384 | " 'procedure_DATETIME','child_birth_date','delta'],axis=1),\n", 385 | " embedded_drugs.drop(['DOS','mom_person_id','child_person_id', 'drug_concept_id',\n", 386 | " 'drug_exposure_start_DATETIME','child_birth_date','delta'],axis=1),\n", 387 | " embedded_measurements.drop(['DOS','mom_person_id','child_person_id', 'measurement_concept_id',\n", 388 | " 
'measurement_DATETIME','value_as_number','child_birth_date','delta'],axis=1),\n", 389 | " embedded_obs.drop(['DOS','mom_person_id','child_person_id', 'observation_concept_id',\n", 390 | " 'observation_DATETIME','child_birth_date','delta'],axis=1)], ignore_index=True)[['sample_ID','date','embedding']]\n", 391 | "\n", 392 | "expanded_embedding_df = pd.DataFrame(all_data['embedding'].tolist())\n", 393 | "print('done making interim dataframe')\n", 394 | "all_data = pd.concat([all_data.reset_index(drop=True).drop('embedding',axis=1), expanded_embedding_df], axis=1)\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 20, 400 | "id": "44e59ec0", 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "CPU times: user 0 ns, sys: 7.69 ms, total: 7.69 ms\n", 408 | "Wall time: 5.74 ms\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "%%time\n", 414 | "#take the mean to compute patient-day embeddings\n", 415 | "patient_day_embeddings = all_data.groupby(['sample_ID','date']).mean()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 21, 421 | "id": "07dfd3a3", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "patient_day_embeddings = patient_day_embeddings.reset_index()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 22, 431 | "id": "2fbf3201", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "patient_day_embeddings['date'] = pd.to_datetime(patient_day_embeddings['date'])\n", 436 | "patient_day_embeddings = patient_day_embeddings.sort_values(['sample_ID', 'date'])" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 23, 442 | "id": "cca3d994", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "unique_patients = patient_day_embeddings['sample_ID'].nunique()\n", 447 | "num_features = len(patient_day_embeddings.columns) - 2 # Subtract patient_id and 
date columns\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 24, 453 | "id": "ca7e301b", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "max_dates = patient_day_embeddings.groupby('sample_ID')['date'].count().max()\n", 458 | "max_dates = 32" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 25, 464 | "id": "bdeb43c9", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "#assign each patient id to an index in the input data matrix\n", 469 | "patient_id_to_index = {patient_id: index for index, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())}\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 26, 475 | "id": "d569be44", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "#create input data matrix\n", 480 | "RNN_data = np.full((num_features, max_dates, unique_patients), np.nan)\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 27, 486 | "id": "2c0b8f07", 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "application/vnd.jupyter.widget-view+json": { 492 | "model_id": "3e1a8abc925b4aa5bec5b763552aeccc", 493 | "version_major": 2, 494 | "version_minor": 0 495 | }, 496 | "text/plain": [ 497 | "0it [00:00, ?it/s]" 498 | ] 499 | }, 500 | "metadata": {}, 501 | "output_type": "display_data" 502 | }, 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "CPU times: user 421 ms, sys: 13.9 ms, total: 435 ms\n", 508 | "Wall time: 404 ms\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "%%time\n", 514 | "#fill in input data matrix with person-day EHR data embeddings\n", 515 | "date_position = {}\n", 516 | "for index, row in tqdm(patient_day_embeddings.iterrows()):\n", 517 | " patient_id = row['sample_ID']\n", 518 | " patient_index = patient_id_to_index[patient_id]\n", 519 | " \n", 520 | " if patient_id not in date_position:\n", 521 | " 
date_position[patient_id] = 0\n", 522 | " else:\n", 523 | " date_position[patient_id] += 1\n", 524 | " \n", 525 | " date_index = date_position[patient_id]\n", 526 | " \n", 527 | " for feature_index, feature_value in enumerate(row.drop(['sample_ID', 'date'])):\n", 528 | " if date_index < max_dates:\n", 529 | " RNN_data[feature_index, date_index, patient_index] = feature_value\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 28, 535 | "id": "c2de670d", 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "(400, 32, 42)" 542 | ] 543 | }, 544 | "execution_count": 28, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "RNN_data.shape" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 29, 556 | "id": "f21c2dd3", 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "RNN_data = RNN_data.transpose(2,1,0)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 30, 566 | "id": "f043f5ae", 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "(42, 32, 400)" 573 | ] 574 | }, 575 | "execution_count": 30, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "RNN_data.shape" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 31, 587 | "id": "98f90b14", 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "#align outcome data with correct index\n", 592 | "sampling_df = sampling_df.merge(pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).T, how='right', left_on='sample_ID', right_on=0)\n" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 32, 598 | "id": "e6fd3f73", 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "#align outcome data with correct index\n", 603 | "sampling_df = 
sampling_df.merge(patient_day_embeddings.groupby('sample_ID').count()[['date']], how='left', on='sample_ID')" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 33, 609 | "id": "74645d66", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "DOS_outcomes = np.array(sampling_df[['DOS',1]].sort_values(1)['DOS'])" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 34, 619 | "id": "d6490d23", 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "((42, 32, 400), (42,))" 626 | ] 627 | }, 628 | "execution_count": 34, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "RNN_data.shape, DOS_outcomes.shape" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 35, 640 | "id": "e6427287", 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "#Save processed data below" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 36, 650 | "id": "86cda5eb", 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "np.save('./data/processed_data/RNN_data_codes_with_obs.npy', RNN_data)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 37, 660 | "id": "862c6ed5", 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "np.save('./data/processed_data/RNN_data_outcomes_with_obs.npy', DOS_outcomes)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 38, 670 | "id": "9bb0ee8e", 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "patient_outcomes = torch.tensor(DOS_outcomes).float()\n", 675 | "num_patient_visits = np.minimum(np.array(sampling_df['date']), 32)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 39, 681 | "id": "e3a77d44", 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | 
"np.save('./data/processed_data/RNN_data_lengths_with_obs.npy', num_patient_visits)" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 40, 691 | "id": "76b41b0b", 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "pd.DataFrame([list(patient_id_to_index.keys()),list(patient_id_to_index.values())]).T.to_csv('./data/processed_data/sampleID_indices_with_obs.csv')\n" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "id": "7e81e07a", 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "5956a041", 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "id": "68fc8e43", 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [] 721 | } 722 | ], 723 | "metadata": { 724 | "kernelspec": { 725 | "display_name": "Python 3 (ipykernel)", 726 | "language": "python", 727 | "name": "python3" 728 | }, 729 | "language_info": { 730 | "codemirror_mode": { 731 | "name": "ipython", 732 | "version": 3 733 | }, 734 | "file_extension": ".py", 735 | "mimetype": "text/x-python", 736 | "name": "python", 737 | "nbconvert_exporter": "python", 738 | "pygments_lexer": "ipython3", 739 | "version": "3.10.6" 740 | } 741 | }, 742 | "nbformat": 4, 743 | "nbformat_minor": 5 744 | } 745 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/833866868.svg)](https://doi.org/10.5281/zenodo.13977341) 2 | # COMET: Clinical and Omics Multi-Modal Analysis Enhanced with Transfer Learning 3 | COMET is a machine learning framework that incorporates large, observational electronic health record (EHR) databases and transfer learning to improve the analysis of small datasets from omics studies. 
4 | ## Overview 5 | This repo contains the code used for the analyses and results presented in our manuscript. Due to HIPAA constraints, we cannot share the EHR data used in our study. The proteomics data for the onset of labor cohort can be found [here](https://datadryad.org/stash/dataset/doi:10.5061/dryad.280gb5mpd). Due to UK Biobank policies, we cannot share the proteomics data from the UK Biobank cohort. Researchers who have access to the UK Biobank proteomics data and EHR data can run the provided notebooks from the UK Biobank Research Analysis Platform (RAP) to reproduce our analysis. To generate the necessary processed data files, run ./Cancer/process_EHR_data_omics.ipynb first, then ./Cancer/process_PT_data.ipynb, followed by ./Cancer/grouped_embeddings_to_matrices.ipynb. You can then run ./Cancer/experiments.ipynb. Details about the UK Biobank, including how to get approved as a researcher and access the RAP, can be found on the UK Biobank's [website](https://www.ukbiobank.ac.uk/). 6 | ## Installation and Setup 7 | First, clone the GitHub repo: 8 | ``` 9 | git clone https://github.com/samson920/COMET 10 | ``` 11 | Then, set up the environment: 12 | ``` 13 | conda env create -f environment.yml 14 | conda activate COMET 15 | ``` 16 | The installation should take about 10 minutes. 17 | 18 | ## Demo 19 | We have included some toy data in the ./Onset of Labor/data/ folder to show the expected structure of data for the onset of labor experiments. The EHR data are direct extracts of OMOP tables. The toy data will work with our code, though the results won't be particularly meaningful as the data are randomly generated. You can replace the toy data with your own data from OMOP tables and your own tabular omics data to run COMET on your own datasets. 
To run the data processing scripts, run the Jupyter notebooks in ./Onset of Labor/, starting with process_EHR_data_full_PT_cohort.ipynb, then process_EHR_data_omics_cohort.ipynb, and lastly process_EHR_data_omics_cohort_with_PT_word2vec.ipynb. These notebooks will create the processed EHR data files expected by the experiments.ipynb notebook, which you can run after the data processing notebooks. 20 | 21 | The data processing notebooks will take <1 minute on our toy data, but substantially longer with real, larger datasets. The experiments notebook will take about 20 minutes to run with our toy data on machines with a GPU, but substantially longer with real, larger datasets. We do not recommend running this code on a CPU as it will take a very long time. 22 | 23 | 24 | ## General Repo Organization 25 | There are two folders: Onset of Labor and Cancer. Within each folder, we have Jupyter notebooks used for various aspects of the data processing and analysis. Within the onset of labor folder we have: 26 | - process_EHR_data_full_PT_cohort.ipynb: This notebook contains the code necessary to process EHR data for the pre-training cohort from extracts of OMOP tables to matrices that can be direct inputs to the ML models. This includes the training of the word2vec model to embed EHR codes. 27 | - process_EHR_data_omics_cohort.ipynb: This notebook contains the code necessary to process EHR data for the omics cohort from extracts of OMOP tables to matrices that can be direct inputs to the ML models. This includes the training of the word2vec model to embed EHR codes. 28 | - process_EHR_data_omics_cohort_with_PT_word2vec.ipynb: This notebook is the same as the above, except it uses the word2vec model from the PT cohort, and is for use in the latter experiments which utilize COMET (including the pre-trained word2vec model). 29 | - experiments.ipynb: This notebook contains all other code for experiments and analysis. 
Most notably, it contains the code for the actual architecture of our models, hyperparameter optimization, actual experiments, and downstream analyses including feature importance computation and visualization of the parameter space in Figure 6. 30 | 31 | Within the cancer folder we have: 32 | - process_EHR_data_omics.ipynb: This file contains the queries to pull the patient cohorts and the data necessary to train the word2vec models, and trains the word2vec models for both the omics and pre-training cohorts. This file also contains downstream processing to pull the feature data from the patients in the omics cohort and ultimately saves a CSV containing the person-day embeddings. 33 | - process_PT_data.ipynb: This file contains the queries to pull the feature data from the pre-training cohort and downstream processing to compute person-day embeddings. 34 | - grouped_embeddings_to_matrices.ipynb: contains code to convert person-day embeddings to feature matrix for RNN input, also computes other inputs for ML (length of sequence based on number of days of data, outcome data, mapping between patient ID and indices in the feature matrix), also contains code used to extract all proteomics data 35 | - experiments.ipynb: This notebook contains all other code for experiments and analysis. Most notably, it contains the code for the actual architecture of our models, hyperparameter optimization, actual experiments, and downstream analyses including feature importance computation and visualization of the parameter space in Figure 6. 
36 | 37 | 38 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: COMET 2 | channels: 3 | - anaconda 4 | - pytorch 5 | - bioconda 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _libgcc_mutex=0.1=conda_forge 10 | - _openmp_mutex=4.5=2_kmp_llvm 11 | - _py-xgboost-mutex=2.0=cpu_0 12 | - adjusttext=0.7.3.1=py_1 13 | - alsa-lib=1.2.7.2=h166bdaf_0 14 | - anyio=3.6.2=pyhd8ed1ab_0 15 | - aom=3.5.0=h27087fc_0 16 | - argon2-cffi=21.3.0=pyhd8ed1ab_0 17 | - asttokens=2.0.8=pyhd8ed1ab_0 18 | - attr=2.5.1=h166bdaf_1 19 | - attrs=22.1.0=pyh71513ae_1 20 | - babel=2.10.3=pyhd8ed1ab_0 21 | - backcall=0.2.0=pyh9f0ad1d_0 22 | - backports=1.1=pyhd3eb1b0_0 23 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 24 | - beautifulsoup4=4.11.1=pyha770c72_0 25 | - bioinfokit=2.1.0=pyh7cba7a3_0 26 | - blas=1.0=mkl 27 | - bleach=5.0.1=pyhd8ed1ab_0 28 | - bokeh=3.3.0=pyhd8ed1ab_0 29 | - brotli=1.0.9=h166bdaf_7 30 | - brotli-bin=1.0.9=h166bdaf_7 31 | - bzip2=1.0.8=h7f98852_4 32 | - ca-certificates=2023.08.22=h06a4308_0 33 | - captum=0.6.0=0 34 | - certifi=2023.7.22=pyhd8ed1ab_0 35 | - charset-normalizer=2.1.1=pyhd8ed1ab_0 36 | - click=8.1.3=unix_pyhd8ed1ab_2 37 | - cloudpickle=2.2.1=pyhd8ed1ab_0 38 | - colorama=0.4.5=pyhd8ed1ab_0 39 | - cudatoolkit=11.3.1=h2bc3f7f_2 40 | - cycler=0.11.0=pyhd8ed1ab_0 41 | - dask=2022.2.1=pyhd3eb1b0_0 42 | - dask-core=2022.2.1=pyhd3eb1b0_0 43 | - dbus=1.13.18=hb2f20db_0 44 | - decorator=5.1.1=pyhd8ed1ab_0 45 | - defusedxml=0.7.1=pyhd8ed1ab_0 46 | - distributed=2022.2.1=pyhd3eb1b0_0 47 | - entrypoints=0.4=pyhd8ed1ab_0 48 | - executing=1.1.1=pyhd8ed1ab_0 49 | - expat=2.4.9=h27087fc_0 50 | - ffmpeg=5.1.2=gpl_he10e716_101 51 | - fftw=3.3.10=nompi_hf0379b8_105 52 | - flit-core=3.7.1=pyhd8ed1ab_0 53 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 54 | - font-ttf-inconsolata=3.000=h77eed37_0 55 | - 
font-ttf-source-code-pro=2.038=h77eed37_0 56 | - font-ttf-ubuntu=0.83=hab24e00_0 57 | - fontconfig=2.14.0=hc2a2eb6_1 58 | - fonts-conda-ecosystem=1=0 59 | - fonts-conda-forge=1=0 60 | - freetype=2.12.1=hca18f0e_0 61 | - fsspec=2023.5.0=pyh1a96a4e_0 62 | - gettext=0.21.1=h27087fc_0 63 | - giflib=5.2.1=h36c2ea0_2 64 | - glib=2.74.0=h6239696_0 65 | - glib-tools=2.74.0=h6239696_0 66 | - gmp=6.2.1=h58526e2_0 67 | - gnutls=3.7.8=hf3e180e_0 68 | - gst-plugins-base=1.20.3=h57caac4_2 69 | - gstreamer=1.20.3=hd4edc92_2 70 | - icu=70.1=h27087fc_0 71 | - idna=3.4=pyhd8ed1ab_0 72 | - imageio=2.31.1=pyh24c5eb1_0 73 | - importlib_resources=5.10.0=pyhd8ed1ab_0 74 | - intel-openmp=2021.4.0=h06a4308_3561 75 | - ipykernel=6.16.0=pyh210e3f2_0 76 | - ipython=8.5.0=pyh41d4057_1 77 | - ipython_genutils=0.2.0=py_1 78 | - ipywidgets=8.0.2=pyhd8ed1ab_1 79 | - jack=1.9.21=h2a1e645_0 80 | - jedi=0.18.1=pyhd8ed1ab_2 81 | - jinja2=3.1.2=pyhd8ed1ab_1 82 | - joblib=1.2.0=pyhd8ed1ab_0 83 | - jpeg=9e=h166bdaf_2 84 | - json5=0.9.6=pyhd3eb1b0_0 85 | - jsonschema=4.16.0=pyhd8ed1ab_0 86 | - jupyter_client=7.4.2=pyhd8ed1ab_0 87 | - jupyter_core=4.11.1=py310hff52083_0 88 | - jupyter_server=1.21.0=pyhd8ed1ab_0 89 | - jupyterlab=3.4.8=pyhd8ed1ab_0 90 | - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 91 | - jupyterlab_server=2.16.0=pyhd8ed1ab_0 92 | - jupyterlab_widgets=3.0.3=pyhd8ed1ab_0 93 | - keyutils=1.6.1=h166bdaf_0 94 | - krb5=1.19.3=h3790be6_0 95 | - lame=3.100=h166bdaf_1003 96 | - lcms2=2.12=hddcbb42_0 97 | - ld_impl_linux-64=2.39=hc81fddc_0 98 | - lerc=4.0.0=h27087fc_0 99 | - libblas=3.9.0=12_linux64_mkl 100 | - libbrotlicommon=1.0.9=h166bdaf_7 101 | - libbrotlidec=1.0.9=h166bdaf_7 102 | - libbrotlienc=1.0.9=h166bdaf_7 103 | - libcap=2.66=ha37c62d_0 104 | - libcblas=3.9.0=12_linux64_mkl 105 | - libclang=14.0.6=default_hc1a23ef_0 106 | - libclang13=14.0.6=default_h31cde19_0 107 | - libcups=2.3.3=h3e49a29_2 108 | - libdb=6.2.32=h9c3ff4c_0 109 | - libdeflate=1.14=h166bdaf_0 110 | - libdrm=2.4.113=h166bdaf_0 
111 | - libedit=3.1.20210910=h7f8727e_0 112 | - libevent=2.1.10=h9b69904_4 113 | - libffi=3.4.2=h7f98852_5 114 | - libflac=1.4.1=h27087fc_0 115 | - libgcc-ng=12.2.0=h65d4601_18 116 | - libgfortran-ng=12.2.0=h69a702a_18 117 | - libgfortran5=12.2.0=h337968e_18 118 | - libglib=2.74.0=h7a41b64_0 119 | - libiconv=1.17=h166bdaf_0 120 | - libidn2=2.3.3=h166bdaf_0 121 | - libllvm11=11.1.0=he0ac6c6_5 122 | - libllvm14=14.0.6=he0ac6c6_0 123 | - libnsl=2.0.0=h7f98852_0 124 | - libogg=1.3.5=h27cfd23_1 125 | - libopus=1.3.1=h7f98852_1 126 | - libpciaccess=0.16=h516909a_0 127 | - libpng=1.6.38=h753d276_0 128 | - libpq=14.5=hd77ab85_0 129 | - libsndfile=1.1.0=h27087fc_0 130 | - libsodium=1.0.18=h36c2ea0_1 131 | - libsqlite=3.39.4=h753d276_0 132 | - libstdcxx-ng=12.2.0=h46fd767_18 133 | - libtasn1=4.19.0=h166bdaf_0 134 | - libtiff=4.4.0=h55922b4_4 135 | - libtool=2.4.6=h9c3ff4c_1008 136 | - libudev1=249=h166bdaf_4 137 | - libunistring=0.9.10=h7f98852_0 138 | - libuuid=2.32.1=h7f98852_1000 139 | - libva=2.16.0=h166bdaf_0 140 | - libvorbis=1.3.7=h9c3ff4c_0 141 | - libvpx=1.11.0=h9c3ff4c_3 142 | - libwebp=1.2.4=h522a892_0 143 | - libwebp-base=1.2.4=h166bdaf_0 144 | - libxcb=1.13=h7f98852_1004 145 | - libxgboost=1.6.2=cpu_ha3b9936_1 146 | - libxkbcommon=1.0.3=he3ba5ed_0 147 | - libxml2=2.9.14=h22db469_4 148 | - libzlib=1.2.13=h166bdaf_4 149 | - llvm-openmp=14.0.6=h9e868ea_0 150 | - locket=1.0.0=pyhd8ed1ab_0 151 | - matplotlib-base=3.6.1=py310h8d5ebf3_0 152 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 153 | - matplotlib-venn=0.11.7=pyhd8ed1ab_0 154 | - mistune=2.0.4=pyhd8ed1ab_0 155 | - mkl=2021.4.0=h8d4b97c_729 156 | - mkl_fft=1.3.1=py310h2b4bcf5_1 157 | - mkl_random=1.2.2=py310h00e6091_0 158 | - mpg123=1.30.2=h27087fc_1 159 | - msgpack-python=1.0.5=py310hdf3cbec_0 160 | - munkres=1.1.4=pyh9f0ad1d_0 161 | - mysql-common=8.0.31=haf5c9bc_0 162 | - mysql-libs=8.0.31=h28c427c_0 163 | - nbclassic=0.4.5=pyhd8ed1ab_0 164 | - nbclient=0.7.0=pyhd8ed1ab_0 165 | - nbconvert=7.2.1=pyhd8ed1ab_0 166 | 
- nbconvert-core=7.2.1=pyhd8ed1ab_0 167 | - nbconvert-pandoc=7.2.1=pyhd8ed1ab_0 168 | - nbformat=5.7.0=pyhd8ed1ab_0 169 | - ncurses=6.3=h27087fc_1 170 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 171 | - nettle=3.8.1=hc379101_1 172 | - networkx=3.1=pyhd8ed1ab_0 173 | - notebook=6.5.1=pyha770c72_0 174 | - notebook-shim=0.2.0=pyhd8ed1ab_0 175 | - nspr=4.33=h295c915_0 176 | - nss=3.78=h2350873_0 177 | - numpy-base=1.23.3 178 | - openh264=2.3.1=h27087fc_1 179 | - openssl=1.1.1w=hd590300_0 180 | - p11-kit=0.24.1=hc5aa10d_0 181 | - packaging=21.3=pyhd8ed1ab_0 182 | - pandoc=2.19.2=ha770c72_0 183 | - pandocfilters=1.5.0=pyhd8ed1ab_0 184 | - parso=0.8.3=pyhd8ed1ab_0 185 | - partd=1.4.0=pyhd8ed1ab_0 186 | - patsy=0.5.3=pyhd8ed1ab_0 187 | - pcre2=10.37=hc3806b6_1 188 | - pexpect=4.8.0=pyh9f0ad1d_2 189 | - pickleshare=0.7.5=py_1003 190 | - pip=22.3=pyhd8ed1ab_0 191 | - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 192 | - ply=3.11=py_1 193 | - prometheus_client=0.15.0=pyhd8ed1ab_0 194 | - prompt-toolkit=3.0.31=pyha770c72_0 195 | - pthread-stubs=0.4=h36c2ea0_1001 196 | - ptyprocess=0.7.0=pyhd3deb0d_0 197 | - pulseaudio=14.0=habe0971_10 198 | - pure_eval=0.2.2=pyhd8ed1ab_0 199 | - py-xgboost=1.6.2=cpu_py310hd1aba9c_1 200 | - pycparser=2.21=pyhd8ed1ab_0 201 | - pygments=2.13.0=pyhd8ed1ab_0 202 | - pynndescent=0.5.10=pyh1a96a4e_0 203 | - pyopenssl=22.1.0=pyhd8ed1ab_0 204 | - pyparsing=3.0.9=pyhd8ed1ab_0 205 | - pyqt=5.15.7=py310h29803b5_1 206 | - pysocks=1.7.1=pyha2e5f31_6 207 | - python=3.10.6=h582c2e5_0_cpython 208 | - python-dateutil=2.8.2=pyhd8ed1ab_0 209 | - python-fastjsonschema=2.16.2=pyhd8ed1ab_0 210 | - python-louvain=0.16=pyhd8ed1ab_0 211 | - python_abi=3.10=2_cp310 212 | - pytorch=1.12.1=py3.10_cuda11.3_cudnn8.3.2_0 213 | - pytorch-mutex=1.0=cuda 214 | - pytz=2022.4=pyhd8ed1ab_0 215 | - qt-main=5.15.6=hc525480_0 216 | - readline=8.1.2=h0f457ee_0 217 | - requests=2.28.1=pyhd8ed1ab_1 218 | - seaborn=0.11.2=pyhd3eb1b0_0 219 | - send2trash=1.8.0=pyhd8ed1ab_0 220 | - 
setuptools=65.5.0=pyhd8ed1ab_0 221 | - six=1.16.0=pyh6c4a22f_0 222 | - slicer=0.0.7=pyhd8ed1ab_0 223 | - smart_open=6.3.0=pyhd8ed1ab_1 224 | - sniffio=1.3.0=pyhd8ed1ab_0 225 | - sortedcontainers=2.4.0=pyhd8ed1ab_0 226 | - soupsieve=2.3.2.post1=pyhd8ed1ab_0 227 | - sqlite=3.39.4=h4ff8645_0 228 | - stack_data=0.5.1=pyhd8ed1ab_0 229 | - svt-av1=1.2.1=h27087fc_0 230 | - tabulate=0.9.0=pyhd8ed1ab_1 231 | - tbb=2021.6.0=h924138e_0 232 | - tblib=1.7.0=pyhd8ed1ab_0 233 | - terminado=0.16.0=pyh41d4057_0 234 | - textwrap3=0.9.2=py_0 235 | - threadpoolctl=3.1.0=pyh8a188c0_0 236 | - tinycss2=1.2.1=pyhd8ed1ab_0 237 | - tk=8.6.12=h27826a3_0 238 | - toml=0.10.2=pyhd8ed1ab_0 239 | - tomli=2.0.1=pyhd8ed1ab_0 240 | - toolz=0.12.0=pyhd8ed1ab_0 241 | - tqdm=4.64.1=pyhd8ed1ab_0 242 | - traitlets=5.5.0=pyhd8ed1ab_0 243 | - typing_extensions=4.4.0=pyha770c72_0 244 | - tzdata=2022e=h191b570_0 245 | - urllib3=1.26.11=pyhd8ed1ab_0 246 | - wcwidth=0.2.5=pyh9f0ad1d_2 247 | - webencodings=0.5.1=py_1 248 | - websocket-client=1.4.1=pyhd8ed1ab_0 249 | - wheel=0.37.1=pyhd8ed1ab_0 250 | - widgetsnbextension=4.0.3=pyhd8ed1ab_0 251 | - x264=1!164.3095=h166bdaf_2 252 | - x265=3.5=h924138e_3 253 | - xcb-util=0.4.0=h166bdaf_0 254 | - xcb-util-image=0.4.0=h166bdaf_0 255 | - xcb-util-keysyms=0.4.0=h166bdaf_0 256 | - xcb-util-renderutil=0.3.9=h166bdaf_0 257 | - xcb-util-wm=0.4.1=h166bdaf_0 258 | - xorg-fixesproto=5.0=h7f98852_1002 259 | - xorg-kbproto=1.0.7=h7f98852_1002 260 | - xorg-libx11=1.7.2=h7f98852_0 261 | - xorg-libxau=1.0.9=h7f98852_0 262 | - xorg-libxdmcp=1.1.3=h7f98852_0 263 | - xorg-libxext=1.3.4=h7f98852_1 264 | - xorg-libxfixes=5.0.3=h7f98852_1004 265 | - xorg-xextproto=7.3.0=h7f98852_1002 266 | - xorg-xproto=7.0.31=h7f98852_1007 267 | - xyzservices=2023.2.0=pyhd8ed1ab_0 268 | - xz=5.2.6=h166bdaf_0 269 | - yaml=0.2.5=h7f98852_2 270 | - zeromq=4.3.4=h9c3ff4c_1 271 | - zict=3.0.0=pyhd8ed1ab_0 272 | - zipp=3.9.0=pyhd8ed1ab_0 273 | - zlib=1.2.13=h166bdaf_4 274 | - zstd=1.5.2=h6239696_4 275 | - 
pip: 276 | - appdirs==1.4.4 277 | - argon2-cffi-bindings==21.2.0 278 | - brotlipy==0.7.0 279 | - cffi==1.15.1 280 | - colorcet==3.1.0 281 | - contourpy==1.0.5 282 | - cryptography==38.0.2 283 | - cytoolz==0.12.0 284 | - debugpy==1.6.3 285 | - docker-pycreds==0.4.0 286 | - et-xmlfile==1.1.0 287 | - fonttools==4.37.4 288 | - gensim==4.3.0 289 | - gitdb==4.0.10 290 | - gitpython==3.1.30 291 | - gseapy==1.0.6 292 | - importlib-metadata==4.11.4 293 | - jupyter-core==4.11.1 294 | - kiwisolver==1.4.4 295 | - llvmlite==0.39.1 296 | - loralib==0.1.1 297 | - markupsafe==2.1.1 298 | - matplotlib==3.6.1 299 | - mkl-fft==1.3.1 300 | - mkl-random==1.2.2 301 | - mkl-service==2.4.0 302 | - msgpack==1.0.5 303 | - numba==0.56.4 304 | - numpy==1.23.3 305 | - openpyxl==3.1.2 306 | - pandas==1.5.0 307 | - pathtools==0.1.2 308 | - pillow==9.2.0 309 | - protobuf==4.21.12 310 | - psutil==5.9.3 311 | - pyqt5==5.15.7 312 | - pyqt5-sip==12.11.0 313 | - pyrsistent==0.18.1 314 | - pyyaml==6.0 315 | - pyzmq==24.0.1 316 | - scikit-learn==1.1.2 317 | - scipy==1.9.1 318 | - sentry-sdk==1.15.0 319 | - setproctitle==1.3.2 320 | - shap==0.41.0 321 | - sip==6.7.2 322 | - smmap==5.0.0 323 | - statsmodels==0.13.5 324 | - torch==1.12.1 325 | - torchaudio==0.12.1 326 | - torchvision==0.13.1 327 | - tornado==6.2 328 | - umap-learn==0.5.3 329 | - unicodedata2==14.0.0 330 | - wandb==0.13.10 331 | - xgboost==1.6.2 332 | prefix: /home/samsonm/miniconda3/envs/multi_modal_DL 333 | --------------------------------------------------------------------------------