├── 0_Load_data.py ├── 1_Synthea_exploration.sql ├── 2_Synthea_cooccurrence.py ├── 3_Synthea_predict_breast_cancer.py ├── LICENSE ├── ML_with_simulated_EMR.pptx ├── README.md ├── docs ├── 0_Load_Data.html ├── 1_Synthea_exploration.html ├── 2_Synthea_cooccurrence.html ├── 3_Synthea_predict_breast_cancer.html ├── README.md └── synthea_cooccurrence_demo.html ├── extra_credit.sql └── sample_data.zip /0_Load_data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC The github repo contains a small dataset, which we will load into a database called "emr_sample"; this will let you run through the exercises, but it is not really large enough to train a good ML model. A single-node cluster is sufficient for working with this dataset. 5 | # MAGIC 6 | # MAGIC We will provide a SAS token to people who want to stay after the workshop and re-run the model on the larger "missouri" dataset. If you want to work with the larger datast, you should set up a multi-node cluster, the run the "Load big dataset from storage container" section below. The only modification you should have to make in the other notebooks is to change 'use emr_sample' to 'use missouri'. 
7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC 12 | # MAGIC # Load sample data from github repo 13 | 14 | # COMMAND ---------- 15 | 16 | displayHTML(f''' 17 | 23 | ''') 24 | 25 | # wdSlideIndex=8 26 | 27 | # COMMAND ---------- 28 | 29 | import os 30 | import zipfile 31 | 32 | data_path = '/FileStore/emr_sample' 33 | local_path = '/dbfs' + data_path 34 | 35 | dbutils.fs.mkdirs(local_path) 36 | 37 | with zipfile.ZipFile("sample_data.zip", "r") as zip_ref: 38 | zip_ref.extractall(local_path) 39 | 40 | ## If you change your mind: 41 | # dbutils.fs.rm(data_path, recurse=True) 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %fs 46 | # MAGIC 47 | # MAGIC ls /FileStore 48 | 49 | # COMMAND ---------- 50 | 51 | !ls -R /dbfs/FileStore 52 | 53 | # COMMAND ---------- 54 | 55 | import os 56 | import re 57 | 58 | DB_NAME = "emr_sample" 59 | 60 | spark.sql(f"create database if not exists {DB_NAME}") 61 | spark.sql(f"use {DB_NAME}") 62 | 63 | for file_info in dbutils.fs.ls('/FileStore/emr_sample/csv'): 64 | table_name = re.sub('(.*)\\.csv$', '\\1', file_info.name).lower() 65 | print(f"creating table '{DB_NAME}.{table_name}' from file {file_info.path}") 66 | spark.read.options(header=True).csv(file_info.path).write.mode('overwrite').saveAsTable(table_name) 67 | 68 | 69 | ## If you change your mind: 70 | # spark.sql(f"drop database {DB_NAME} cascade") 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %md 75 | # MAGIC 76 | # MAGIC # Load big dataset from storage container 77 | 78 | # COMMAND ---------- 79 | 80 | # DBTITLE 1,Enter your data connection information here 81 | secrets = {'storage_account_name':'syntheauploadsa', 82 | 'container_name':'syntheadata1', 83 | 'data_path': '/missouri/2021_07_11T17_42_12Z_parquet', 84 | 'sas_token':'PUT_YOUR_SAS_TOKEN_HERE'} 85 | 86 | # COMMAND ---------- 87 | 88 | if secrets['sas_token'] == 'PUT_YOUR_SAS_TOKEN_HERE': 89 | displayHTML('''

You need to enter your connection info in the 'secrets' dict!

''') 90 | 91 | # COMMAND ---------- 92 | 93 | # DBTITLE 1,Mount storage container to DBFS 94 | DATA_SOURCE = "wasbs://{container_name}@{storage_account_name}.blob.core.windows.net".format(**secrets) 95 | 96 | DATA_PATH = secrets['data_path'] 97 | 98 | DB_NAME = 'missouri' 99 | 100 | MOUNT_POINT = f"/mnt/{DB_NAME}" 101 | 102 | EXTRA_CONFIGS = {"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net".format(**secrets): secrets['sas_token']} 103 | 104 | CURRENTLY_MOUNTED = {mount_info.mountPoint for mount_info in dbutils.fs.mounts()} 105 | if MOUNT_POINT in CURRENTLY_MOUNTED: 106 | dbutils.fs.unmount(MOUNT_POINT) 107 | 108 | dbutils.fs.mount( 109 | source = DATA_SOURCE + DATA_PATH, 110 | mount_point = MOUNT_POINT, 111 | extra_configs = EXTRA_CONFIGS 112 | ) 113 | 114 | [f.name for f in dbutils.fs.ls(MOUNT_POINT)] 115 | 116 | # COMMAND ---------- 117 | 118 | # DBTITLE 1,Create new database and tables 119 | import re 120 | 121 | spark.sql(f"create database if not exists {DB_NAME}") 122 | spark.sql(f"use {DB_NAME}") 123 | 124 | for file_info in dbutils.fs.ls(MOUNT_POINT): 125 | table_name = re.sub('(.*)\\.parquet/$', '\\1', file_info.name).lower() 126 | print(f"creating table '{DB_NAME}.{table_name}' from file {file_info.path}") 127 | spark.read.parquet(file_info.path).write.mode('overwrite').saveAsTable(table_name) 128 | 129 | ## If you change your mind: 130 | # spark.sql(f"drop database {DB_NAME} cascade") 131 | 132 | # COMMAND ---------- 133 | 134 | 135 | -------------------------------------------------------------------------------- /1_Synthea_exploration.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %python 3 | -- MAGIC 4 | -- MAGIC # displayHTML(f''' 5 | -- MAGIC # 12 | -- MAGIC # ''') 13 | 14 | -- COMMAND ---------- 15 | 16 | -- MAGIC %md 17 | -- MAGIC 18 | -- MAGIC # Explore Synthea data 19 | 20 | -- COMMAND ---------- 21 | 22 | -- MAGIC %md 23 | -- MAGIC 
## Databricks Magic Commands 24 | -- MAGIC To explore data a useful feature of databricks notebook is the magic commands that can come at the beginning of each cell and change the interpretation of the cell content. 25 | -- MAGIC 26 | -- MAGIC **Mixed Languages** 27 | -- MAGIC The default language of this notebook is SQL. However, you can easily switch to other languages by magic commands: "%python", "%R", "%SQL", "%scala" 28 | -- MAGIC 29 | -- MAGIC **Auxiliary Cells** 30 | -- MAGIC - "%md": mark-down 31 | -- MAGIC - "%sh": run shell code 32 | -- MAGIC - "%fs": run dbutils filesystem commands; e.g. _%fs ls_ instead of _"dbutils.fs.ls"_ 33 | -- MAGIC 34 | -- MAGIC More information on [Databricks Notebook Utilities](https://docs.databricks.com/notebooks/notebooks-use.html#mix-languages) 35 | 36 | -- COMMAND ---------- 37 | 38 | -- MAGIC %md 39 | -- MAGIC 40 | -- MAGIC ## Examine the filesystem(s) 41 | -- MAGIC A databricks cluster have a driver node and potentially multiple worker nodes. The file root path on databricks depends the code executed; whether it is executed locally or on a distributed cluster. 42 | -- MAGIC 43 | -- MAGIC This is because the cluster is dealing with two filesystems: 44 | -- MAGIC - Local filesystem (e.g., driver's) 45 | -- MAGIC - Distributed DBFS filesystem 46 | -- MAGIC 47 | -- MAGIC 48 | -- MAGIC Below you can see & examine the commands and their default filesystems & root-path. You can also learn more about [Accessing Files on Databricks](https://docs.databricks.com/files/index.html). 49 | 50 | -- COMMAND ---------- 51 | 52 | -- MAGIC %md 53 | -- MAGIC 54 | -- MAGIC ### Local driver filesystem 55 | -- MAGIC The block storage volume attached to the driver is the root path for code executed locally. 
These include command-types: 56 | -- MAGIC - %sh 57 | -- MAGIC - Most Python code (not PySpark) 58 | -- MAGIC - Most Scala code (not Spark) 59 | 60 | -- COMMAND ---------- 61 | 62 | -- MAGIC %md 63 | -- MAGIC As you are running the notebook from the "Repos", your current directory is the 'EMR-data-science' 64 | 65 | -- COMMAND ---------- 66 | 67 | -- MAGIC %python 68 | -- MAGIC 69 | -- MAGIC import os 70 | -- MAGIC os.getcwd() 71 | 72 | -- COMMAND ---------- 73 | 74 | -- MAGIC %md 75 | -- MAGIC While your root directory shows the driver file-system root directory: 76 | 77 | -- COMMAND ---------- 78 | 79 | -- MAGIC %python 80 | -- MAGIC os.listdir("/") 81 | 82 | -- COMMAND ---------- 83 | 84 | -- MAGIC %md 85 | -- MAGIC Similarly, %sh codes are executed locally: 86 | 87 | -- COMMAND ---------- 88 | 89 | -- MAGIC %sh 90 | -- MAGIC ls -alh 91 | 92 | -- COMMAND ---------- 93 | 94 | --%sh 95 | --ls /dbfs/ 96 | 97 | -- COMMAND ---------- 98 | 99 | --%fs 100 | --ls file:/ 101 | 102 | -- COMMAND ---------- 103 | 104 | -- MAGIC %md ### Distributed filesystem 105 | -- MAGIC The DBFS root is the root path for Spark and DBFS commands. These include command-types: 106 | -- MAGIC - Spark SQL 107 | -- MAGIC - DataFrames 108 | -- MAGIC - dbutils.fs 109 | -- MAGIC - %fs 110 | -- MAGIC 111 | -- MAGIC By default these commands are executed in a distributed fashion, and their default filesystem is DBFS. 112 | 113 | -- COMMAND ---------- 114 | 115 | -- MAGIC %fs 116 | -- MAGIC 117 | -- MAGIC ls / 118 | 119 | -- COMMAND ---------- 120 | 121 | -- MAGIC %fs 122 | -- MAGIC 123 | -- MAGIC ls /FileStore/ 124 | 125 | -- COMMAND ---------- 126 | 127 | -- MAGIC %md 128 | -- MAGIC 129 | -- MAGIC On previous notebook, we uploaded our data to DBFS as a relational database "emr_sample". 
We can see the corresponding files on the dbfs hive directory as below: 130 | 131 | -- COMMAND ---------- 132 | 133 | -- MAGIC %fs 134 | -- MAGIC 135 | -- MAGIC ls /user/hive/warehouse/emr_sample.db/ 136 | 137 | -- COMMAND ---------- 138 | 139 | -- MAGIC %md 140 | -- MAGIC ## Explore the database 141 | -- MAGIC This section we use SQL queries and plotting tools to explore and understand Synthea data. 142 | -- MAGIC 143 | -- MAGIC Reference for [Spark SQL built-in functions](https://spark.apache.org/docs/latest/api/sql/) 144 | -- MAGIC 145 | -- MAGIC **Note** Some questions in this section are marked as _Extra Credit_. If you can skip or if you got extra time, try writing query to answer those. The correct queries are not unique, but you can find some suggested queries _extra_credit.sql_ file; feel free to check them if you are interested. 146 | 147 | -- COMMAND ---------- 148 | 149 | -- MAGIC %md 150 | -- MAGIC ### Database Structure 151 | -- MAGIC The first step in data analytics is understanding the data. Using SQL queries we can get a sense of the data organization (database, tables, attributes) and common values. 
152 | 153 | -- COMMAND ---------- 154 | 155 | -- MAGIC %sql 156 | -- MAGIC 157 | -- MAGIC -- this should fail if you haven't selected the right database 158 | -- MAGIC 159 | -- MAGIC select * from encounters 160 | 161 | -- COMMAND ---------- 162 | 163 | -- MAGIC %md 164 | -- MAGIC Review all the database in your DBFS environment, and find the one we have uploaded on previous notebook: 165 | 166 | -- COMMAND ---------- 167 | 168 | -- hidden on html-version due to privacy 169 | show databases 170 | 171 | -- COMMAND ---------- 172 | 173 | -- MAGIC %md 174 | -- MAGIC Print all the tables in emr_sample database: 175 | 176 | -- COMMAND ---------- 177 | 178 | use emr_sample; 179 | 180 | show tables; 181 | 182 | -- COMMAND ---------- 183 | 184 | -- MAGIC %md 185 | -- MAGIC Take a peek at the tables you are curious about: 186 | 187 | -- COMMAND ---------- 188 | 189 | select * from encounters limit 5 190 | 191 | -- COMMAND ---------- 192 | 193 | -- MAGIC %md 194 | -- MAGIC Or check their schema description: 195 | 196 | -- COMMAND ---------- 197 | 198 | desc encounters 199 | 200 | -- COMMAND ---------- 201 | 202 | -- MAGIC %md 203 | -- MAGIC Sometimes it is easier to switch to Python to explore the data content. We can simply run spark sql query & convert the result to pandas dataframe and play with it: 204 | 205 | -- COMMAND ---------- 206 | 207 | -- MAGIC %python 208 | -- MAGIC 209 | -- MAGIC # print all the tables & describe them 210 | -- MAGIC tables = spark.sql("show tables").toPandas() 211 | -- MAGIC synthea_tables = tables[tables.database == 'emr_sample']['tableName'].values 212 | -- MAGIC 213 | -- MAGIC for syntab in synthea_tables: 214 | -- MAGIC print(f'{syntab}') 215 | -- MAGIC print(spark.sql(f'describe table {syntab}').toPandas()) 216 | 217 | -- COMMAND ---------- 218 | 219 | -- MAGIC %md 220 | -- MAGIC You rarely need to get all the rows from patient table and download it! 
If the data is too large, the UI only shows the first 1000 records: 221 | 222 | -- COMMAND ---------- 223 | 224 | select * from patients 225 | 226 | -- COMMAND ---------- 227 | 228 | -- MAGIC %md 229 | -- MAGIC 230 | -- MAGIC _Extra Credit_: What is the total number of patients? 231 | 232 | -- COMMAND ---------- 233 | 234 | -- MAGIC %md 235 | -- MAGIC ### Stat Check 236 | 237 | -- COMMAND ---------- 238 | 239 | -- MAGIC %md 240 | -- MAGIC Check specialities and see if the numbers match what you expect in reality: 241 | 242 | -- COMMAND ---------- 243 | 244 | select speciality, count(*) tally 245 | from providers 246 | group by speciality 247 | 248 | -- COMMAND ---------- 249 | 250 | -- DBTITLE 1,Encounters 251 | select * from encounters 252 | 253 | -- COMMAND ---------- 254 | 255 | -- MAGIC %md 256 | -- MAGIC 257 | -- MAGIC _Extra Credit:_ What was the date of the most recent encounter for each patient? 258 | 259 | -- COMMAND ---------- 260 | 261 | -- MAGIC %md 262 | -- MAGIC ### Freq. & Cardinalities 263 | 264 | -- COMMAND ---------- 265 | 266 | -- MAGIC %md 267 | -- MAGIC One "code" could map to multiple descriptions: 268 | 269 | -- COMMAND ---------- 270 | 271 | select count(*) tally, code, collect_set(description) description_list 272 | from observations 273 | group by code 274 | order by size(description_list) desc 275 | 276 | -- COMMAND ---------- 277 | 278 | select count(*) tally, code, collect_set(description) description_list 279 | from conditions 280 | group by code 281 | order by size(description_list) desc 282 | 283 | 284 | -- COMMAND ---------- 285 | 286 | -- MAGIC %python 287 | -- MAGIC 288 | -- MAGIC # 72514-3 "Pain severity - 0-10 verbal numeric rating [Score] - Reported" 289 | -- MAGIC 290 | -- MAGIC sql = "select int(value) pain_level, count(*) tally from observations where code = '72514-3' group by pain_level order by pain_level" 291 | -- MAGIC 292 | -- MAGIC display(spark.sql(sql).toPandas().plot.bar(x='pain_level', y='tally')) 293 | 294 | -- 
COMMAND ---------- 295 | 296 | -- MAGIC %md 297 | -- MAGIC 298 | -- MAGIC _Extra Credit:_ How would you discover observations related to 'pain'? 299 | 300 | -- COMMAND ---------- 301 | 302 | -- MAGIC %md 303 | -- MAGIC 304 | -- MAGIC _Extra Credit:_ What are the different kinds of encounters, and how many of each are in the database? 305 | 306 | -- COMMAND ---------- 307 | 308 | select * from conditions 309 | 310 | -- COMMAND ---------- 311 | 312 | -- MAGIC %md 313 | -- MAGIC Calculate patients' ages at the condition start: 314 | 315 | -- COMMAND ---------- 316 | 317 | -- DBTITLE 1,Age at onset 318 | select 319 | p.first, 320 | p.last, 321 | floor( 322 | datediff(date(c.start), date(p.birthdate)) / 365.24 323 | ) age_at_onset, 324 | c.description condition_description 325 | from 326 | conditions c, patients p 327 | where c.patient = p.id 328 | 329 | -- COMMAND ---------- 330 | 331 | -- MAGIC %md 332 | -- MAGIC Let's see what are the codes for the breast-cancer related conditions: 333 | 334 | -- COMMAND ---------- 335 | 336 | select 337 | c.description, 338 | count(*) 339 | from 340 | conditions c 341 | where 342 | lower(c.description) rlike('breast') 343 | group by 344 | c.description 345 | 346 | -- conditions: 'Pathological fracture due to osteoporosis (disorder)'' 1026 347 | -- observations: 'DXA [T-score] Bone density' 348 | -- select c.* from conditions c where c.description = 'Malignant neoplasm of breast (disorder)' 349 | -- 'Whiplash injury to neck' 5161 350 | -- 'Dislocation of hip joint (disorder)' 63 351 | -- 'Osteoarthritis of hip' 1657 352 | -- 'Closed fracture of hip' 948 353 | 354 | -- COMMAND ---------- 355 | 356 | -- select description, value from observations where description rlike('DXA') 357 | -- show tables -- medications, procedures, conditions, observations 358 | 359 | select 360 | p.gender, 361 | cast(o.value as float) T_score 362 | from 363 | observations o 364 | join patients p on o.patient = p.id 365 | where 366 | o.description == 'DXA 
[T-score] Bone density' 367 | 368 | -- COMMAND ---------- 369 | 370 | -- DBTITLE 1,Medications 371 | select 372 | * 373 | from 374 | medications 375 | limit 376 | 10 377 | 378 | -- COMMAND ---------- 379 | 380 | -- MAGIC %md 381 | -- MAGIC ### Plots in R 382 | -- MAGIC Plots assist us in investigating the data and getting quick insight and understanding. 383 | -- MAGIC 384 | -- MAGIC One can switch to R to use its powerful packages for data wrangling and plotting; e.g., dplyr, ggplot2 and sparklyr. 385 | 386 | -- COMMAND ---------- 387 | 388 | -- MAGIC %r 389 | -- MAGIC options(repr.plot.width=600, repr.plot.height=1200) 390 | 391 | -- COMMAND ---------- 392 | 393 | -- MAGIC %md 394 | -- MAGIC Age-gender distribution for different conditions. 395 | 396 | -- COMMAND ---------- 397 | 398 | -- MAGIC %r 399 | -- MAGIC library(dplyr) 400 | -- MAGIC library(sparklyr) 401 | -- MAGIC library(ggplot2) 402 | -- MAGIC 403 | -- MAGIC sc <- spark_connect(method = "databricks") 404 | -- MAGIC 405 | -- MAGIC conditions <- c( 406 | -- MAGIC 'Dislocation of hip joint (disorder)', 407 | -- MAGIC 'Closed fracture of hip', 408 | -- MAGIC 'Osteoarthritis of hip', 409 | -- MAGIC 'Malignant neoplasm of breast (disorder)') 410 | -- MAGIC 411 | -- MAGIC sql <- sprintf( 412 | -- MAGIC "select 413 | -- MAGIC p.first, 414 | -- MAGIC p.last, 415 | -- MAGIC p.gender, 416 | -- MAGIC c.description condition, 417 | -- MAGIC floor(datediff(date(c.start), date(p.birthdate))/365.24) age_at_onset 418 | -- MAGIC from conditions c join patients p 419 | -- MAGIC where c.patient = p.id 420 | -- MAGIC and c.description in ('%s')", 421 | -- MAGIC paste(conditions, collapse="','") 422 | -- MAGIC ) 423 | -- MAGIC 424 | -- MAGIC 425 | -- MAGIC sdf_sql(sc, sql) %>% ggplot(aes(x=age_at_onset, fill=gender)) + geom_density(alpha=0.5) + facet_grid(condition ~ ., scales='free_y') 426 | 427 | -- COMMAND ---------- 428 | 429 | -- MAGIC %md 430 | -- MAGIC A finer-grain look reveals some issues or questionable 
observations in the data: 431 | 432 | -- COMMAND ---------- 433 | 434 | -- MAGIC %r 435 | -- MAGIC 436 | -- MAGIC sdf_sql(sc, sql) %>% 437 | -- MAGIC ggplot(aes(x=age_at_onset, fill=gender)) + geom_histogram(binwidth = 1) + facet_grid(condition ~ gender, scales='free_y') 438 | 439 | -- COMMAND ---------- 440 | 441 | -- MAGIC %md 442 | -- MAGIC A normal distribution is what we expect in real population, not bi-modal one. 443 | 444 | -- COMMAND ---------- 445 | 446 | -- MAGIC %r 447 | -- MAGIC 448 | -- MAGIC library(dplyr) 449 | -- MAGIC library(sparklyr) 450 | -- MAGIC library(ggplot2) 451 | -- MAGIC 452 | -- MAGIC sc <- spark_connect(method = "databricks") 453 | -- MAGIC sql <- " 454 | -- MAGIC select 455 | -- MAGIC description, 456 | -- MAGIC cast(value as float) T_score 457 | -- MAGIC from observations 458 | -- MAGIC where description == 'DXA [T-score] Bone density'" 459 | -- MAGIC 460 | -- MAGIC bone_density <- sdf_sql(sc, sql) 461 | -- MAGIC bone_density %>% ggplot(aes(x=T_score)) + geom_density(fill='blue', alpha=0.5) 462 | 463 | -- COMMAND ---------- 464 | 465 | -- select max(cast(value as float)) max_T_score from observations where description == 'DXA [T-score] Bone density' 466 | select 467 | value T_score, 468 | count(*) tally 469 | from 470 | observations 471 | where 472 | description == 'DXA [T-score] Bone density' 473 | group by 474 | value 475 | order by 476 | cast(value as float) 477 | 478 | -- COMMAND ---------- 479 | 480 | -- MAGIC %r 481 | -- MAGIC sql <- " 482 | -- MAGIC select 483 | -- MAGIC p.gender, 484 | -- MAGIC cast(o.value as float) T_score 485 | -- MAGIC from observations o, patients p 486 | -- MAGIC where 487 | -- MAGIC o.patient = p.id 488 | -- MAGIC AND o.description == 'DXA [T-score] Bone density'" 489 | -- MAGIC sdf_sql(sc, sql) %>% ggplot(aes(x=T_score, fill=gender)) + geom_density(alpha=0.5) 490 | 491 | -- COMMAND ---------- 492 | 493 | -- select * from encounters limit 10 494 | select 495 | code, 496 | collect_set(description), 497 
| count(*) tally 498 | from 499 | encounters 500 | group by 501 | code 502 | order by 503 | tally desc 504 | 505 | -- encounterclass 506 | -- 'General examination of patient (procedure)', "Encounter for 'check-up'" 507 | -- 162673000 General examination of patient (procedure) 508 | -- 185349003 Encounter for check up (procedure) 509 | 510 | -- COMMAND ---------- 511 | 512 | -- select count(*) from conditions where code = 254837009 -- 1369 513 | with encounter_condition as ( 514 | select 515 | e.id encounter, 516 | e.code encounter_code, 517 | collect_set(c.code) as condition_code_list 518 | from 519 | encounters e 520 | join conditions c on c.encounter = e.id 521 | -- where e.code in ('162673000', '185349003') 522 | group by 523 | e.id, 524 | e.code 525 | ), 526 | encounter_bcadx as ( 527 | select 528 | encounter, 529 | encounter_code, 530 | case 531 | when array_contains(condition_code_list, '254837009') then 'Y' 532 | else 'N' 533 | end as breast_cancer_dx 534 | from 535 | encounter_condition 536 | ) 537 | select 538 | * 539 | from 540 | encounter_bcadx 541 | where 542 | breast_cancer_dx = 'Y' 543 | 544 | -- select breast_cancer_dx, count(*) tally from encounter_bcadx group by breast_cancer_dx 545 | 546 | -- COMMAND ---------- 547 | 548 | 549 | -- select count(*) from conditions where code = 254837009 -- 1369 550 | -- code 254837009 'Malignant neoplasm of breast (disorder)' 551 | 552 | with 553 | encounter_condition as ( 554 | select e.id encounter, e.code encounter_code, collect_set(c.code) as condition_code_list 555 | from encounters e join conditions c on c.encounter = e.id 556 | group by e.id, e.code 557 | ), 558 | encounter_bcadx as ( 559 | select encounter, encounter_code, case when array_contains(condition_code_list, '254837009') then 'Y' else 'N' end as breast_cancer_dx 560 | from encounter_condition 561 | ) 562 | select * from encounter_bcadx where breast_cancer_dx = 'Y' 563 | -- select breast_cancer_dx, count(*) tally from encounter_bcadx group by 
breast_cancer_dx 564 | -------------------------------------------------------------------------------- /2_Synthea_cooccurrence.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC This notebook uses the [vis.js](https://visjs.org/) Javascript language to create interactive visualizations of co-occurrence graphs. 5 | # MAGIC 6 | # MAGIC The documentation for [Network](https://visjs.github.io/vis-network/docs/network/) structures describes the options available for [nodes](https://visjs.github.io/vis-network/docs/network/nodes.html) and [edges](https://visjs.github.io/vis-network/docs/network/edges.html). 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC 12 | # MAGIC # Library of Functions 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %python 17 | # MAGIC 18 | # MAGIC import pandas as pd 19 | # MAGIC import numpy as np 20 | # MAGIC import re 21 | # MAGIC # import json 22 | # MAGIC 23 | # MAGIC # import datetime 24 | # MAGIC 25 | # MAGIC def get_nodes_and_edges_from_item_pair_stats(cooccurrence_pdf): 26 | # MAGIC item_stats = {r['item1']:{'count':r['item1_count'], 'prevalence':r['item1_prevalence']} 27 | # MAGIC for idx, r in cooccurrence_pdf.iterrows()} 28 | # MAGIC 29 | # MAGIC item_stats.update({r['item2']:{'count':r['item2_count'], 'prevalence':r['item2_prevalence']} 30 | # MAGIC for idx, r in cooccurrence_pdf.iterrows()}) 31 | # MAGIC 32 | # MAGIC nodes_df = pd.DataFrame([{'label':k,'count':v['count'], 'prevalence':v['prevalence']} 33 | # MAGIC for k,v in item_stats.items()]) 34 | # MAGIC nodes_df['id'] = nodes_df.index 35 | # MAGIC 36 | # MAGIC edges_df = cooccurrence_pdf.copy() 37 | # MAGIC node_id = {r['label']:r['id'] for idx, r in nodes_df.iterrows()} 38 | # MAGIC edges_df['from'] = [node_id[nn] for nn in edges_df['item1']] 39 | # MAGIC edges_df['to'] = [node_id[nn] for nn in edges_df['item2']] 40 | # MAGIC 41 | # MAGIC print("Your graph 
will have {0} nodes and {1} edges.".format( len(nodes_df), len(edges_df) )) 42 | # MAGIC 43 | # MAGIC return nodes_df, edges_df[[ 'from', 'to', 'both_count', 'confidence', 'lift']] 44 | # MAGIC 45 | # MAGIC 46 | # MAGIC 47 | # MAGIC def export_to_vis_js(nodes_df, edges_df, title, html_file_name): 48 | # MAGIC """ 49 | # MAGIC Generate vis_js graph from cooccurrence Pandas dataframe and write to HTML file. 50 | # MAGIC """ 51 | # MAGIC default_metric = 'lift' 52 | # MAGIC max_lift = np.quantile(edges_df['lift'], 0.95) 53 | # MAGIC 54 | # MAGIC nodes_str = nodes_df.to_json(orient='records') 55 | # MAGIC edges_str = edges_df.to_json(orient='records') 56 | # MAGIC 57 | # MAGIC html_string = ( 58 | # MAGIC '\n' 59 | # MAGIC '\n' 60 | # MAGIC '\n' 61 | # MAGIC ' \n' 62 | # MAGIC f' {title}\n' 63 | # MAGIC ' \n' 64 | # MAGIC f' \n' 65 | # MAGIC ' \n' 66 | # MAGIC ' \n' 67 | # MAGIC ' \n' 68 | # MAGIC '
\n' 69 | # MAGIC ' \n' 74 | # MAGIC '
\n' 75 | # MAGIC '
\n' 76 | # MAGIC ' \n' 123 | # MAGIC ' \n' 124 | # MAGIC ' \n' 125 | # MAGIC ) 126 | # MAGIC with open(html_file_name, "wt") as html_file: 127 | # MAGIC html_file.write(html_string) 128 | 129 | # COMMAND ---------- 130 | 131 | # MAGIC %md 132 | # MAGIC 133 | # MAGIC # Compute item-pair statistics 134 | 135 | # COMMAND ---------- 136 | 137 | # MAGIC %md 138 | # MAGIC 139 | # MAGIC We're going to name items by their descriptions, so we need to check that each item only has one description. 140 | 141 | # COMMAND ---------- 142 | 143 | # MAGIC %sql 144 | # MAGIC 145 | # MAGIC use emr_sample; 146 | # MAGIC 147 | # MAGIC show tables; 148 | 149 | # COMMAND ---------- 150 | 151 | # DBTITLE 1,Check for denormalized descriptions in medications 152 | # MAGIC %sql 153 | # MAGIC 154 | # MAGIC select code, collect_list(distinct lower(description)) description_list from medications group by code order by size(description_list) desc 155 | # MAGIC 156 | # MAGIC -- medication code '999999' appears to be bogus; it is used for 4 different things. All the other differences in description are just in capitalization. 
157 | 158 | # COMMAND ---------- 159 | 160 | # DBTITLE 1,Check for denormalized descriptions in conditions 161 | # MAGIC %sql 162 | # MAGIC select code, count(*) tally, collect_list(distinct description) descriptions from conditions group by code order by size(descriptions) desc 163 | # MAGIC 164 | # MAGIC --- only 4 codes have multiple descriptions; 3 of these are trivial differences 165 | # MAGIC --- code '427089005' could be either "Male Infertility" or "Diabetes from Cystic Fibrosis"; we'll skip that code 166 | 167 | # COMMAND ---------- 168 | 169 | # DBTITLE 1,Collect 'baskets' and 'items' 170 | # MAGIC %sql 171 | # MAGIC 172 | # MAGIC create or replace temporary view basket_item as 173 | # MAGIC with 174 | # MAGIC pe1 as ( 175 | # MAGIC select enc.id encounter 176 | # MAGIC , floor(datediff(enc.start, pat.birthdate)/365.24) age 177 | # MAGIC , pat.race 178 | # MAGIC , pat.ethnicity 179 | # MAGIC , pat.gender 180 | # MAGIC from patients pat join encounters enc on enc.patient=pat.id 181 | # MAGIC where enc.encounterclass in ('inpatient', 'outpatient') 182 | # MAGIC ) 183 | # MAGIC , 184 | # MAGIC pe2 as ( 185 | # MAGIC select encounter, 186 | # MAGIC concat_ws('_', 'gender', gender) gender, 187 | # MAGIC concat_ws('_', 'ethnicity', ethnicity) ethnicity, 188 | # MAGIC concat_ws('_', 'race', race) race, 189 | # MAGIC case -- approximately 'MeSH' age ranges according to https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3825015/ 190 | # MAGIC when age < 2 then 'age_00_01' 191 | # MAGIC when age < 5 then 'age_02_04' 192 | # MAGIC when age < 12 then 'age_05_11' 193 | # MAGIC when age < 18 then 'age_12_17' 194 | # MAGIC when age < 24 then 'age_18_23' 195 | # MAGIC when age < 44 then 'age_24_43' 196 | # MAGIC when age < 65 then 'age_44_64' 197 | # MAGIC when age < 80 then 'age_65_79' 198 | # MAGIC when age >=80 then 'age_80_plus' 199 | # MAGIC else 'age_unknown' 200 | # MAGIC end age_group 201 | # MAGIC from pe1 202 | # MAGIC ) 203 | # MAGIC , 204 | # MAGIC code_tally as ( 
205 | # MAGIC select code, count(*) tally, first(description) description 206 | # MAGIC from conditions 207 | # MAGIC where code != '427089005' -- could be either "Male Infertility" or "Diabetes from Cystic Fibrosis" 208 | # MAGIC group by code 209 | # MAGIC ) 210 | # MAGIC , 211 | # MAGIC encounter_condition_long as ( 212 | # MAGIC select e.id encounter, ct.description condition 213 | # MAGIC from encounters e 214 | # MAGIC join conditions c on c.patient = e.patient 215 | # MAGIC join code_tally ct on ct.code = c.code 216 | # MAGIC join pe2 on e.id = pe2.encounter 217 | # MAGIC where ct.tally > 100 218 | # MAGIC and c.start < e.stop 219 | # MAGIC and (c.stop > e.stop or c.stop is null) 220 | # MAGIC ) 221 | # MAGIC , 222 | # MAGIC bmi as ( 223 | # MAGIC select encounter, value as bmi, 224 | # MAGIC case -- https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html 225 | # MAGIC when value < 18.5 then 'bmi_underweight' 226 | # MAGIC when value < 25 then 'bmi_healthy weight' 227 | # MAGIC when value < 30 then 'bmi_overweight' 228 | # MAGIC when value < 40 then 'bmi_obese' 229 | # MAGIC when value >= 40 then 'bmi_morbidly_obese' 230 | # MAGIC else 'bmi_unknown' 231 | # MAGIC end as bmi_category 232 | # MAGIC from observations where code = '39156-5' 233 | # MAGIC ) 234 | # MAGIC , 235 | # MAGIC patient_features_long as ( 236 | # MAGIC select encounter, stack(4, gender, ethnicity, race, age_group) as feature from pe2 237 | # MAGIC ) 238 | # MAGIC select encounter as basket, concat('CONDITION:', condition) as item from encounter_condition_long 239 | # MAGIC union 240 | # MAGIC select encounter as basket, concat('PATIENT:', feature) as item from patient_features_long 241 | # MAGIC union 242 | # MAGIC select encounter as basket, concat('MEDICATION:', lower(description)) as item from medications where code != '999999' 243 | # MAGIC union 244 | # MAGIC select encounter as basket, concat('OBSERVATION:', bmi_category) as item from bmi 245 | # MAGIC union 246 | # 
MAGIC select encounter, concat('OBSERVATION:', value) from observations where description = 'Tobacco smoking status NHIS' 247 | # MAGIC ; 248 | 249 | # COMMAND ---------- 250 | 251 | # MAGIC %sql 252 | # MAGIC 253 | # MAGIC -- select count(*) from basket_item; -- 25754861 254 | # MAGIC 255 | # MAGIC select * from basket_item; 256 | 257 | # COMMAND ---------- 258 | 259 | # DBTITLE 1,Calculate item-pair statistics 260 | # MAGIC %sql 261 | # MAGIC -- MIN_COUNT = 200 262 | # MAGIC 263 | # MAGIC drop table if exists item_pair_stats; 264 | # MAGIC 265 | # MAGIC create table item_pair_stats as 266 | # MAGIC with 267 | # MAGIC bi as ( 268 | # MAGIC select basket, item 269 | # MAGIC from basket_item 270 | # MAGIC group by basket, item 271 | # MAGIC ), 272 | # MAGIC item_counts as ( 273 | # MAGIC select item, count(*) item_count 274 | # MAGIC from bi 275 | # MAGIC group by item 276 | # MAGIC ), 277 | # MAGIC bi_count as ( 278 | # MAGIC select bi.*, ic.item_count 279 | # MAGIC from bi 280 | # MAGIC join item_counts ic on bi.item=ic.item 281 | # MAGIC where ic.item_count > 200 282 | # MAGIC ), 283 | # MAGIC item_pair_stats as ( 284 | # MAGIC select bi1.item item1, bi2.item item2, 285 | # MAGIC bi1.item_count item1_count, bi2.item_count item2_count, 286 | # MAGIC count(*) as both_count 287 | # MAGIC from bi_count bi1 288 | # MAGIC join bi_count bi2 289 | # MAGIC on bi1.basket = bi2.basket and bi1.item != bi2.item 290 | # MAGIC group by bi1.item, bi1.item_count, 291 | # MAGIC bi2.item, bi2.item_count 292 | # MAGIC ), 293 | # MAGIC cc as ( 294 | # MAGIC SELECT item1, item2, item1_count, item2_count, both_count, 295 | # MAGIC CAST(item1_count AS FLOAT)/(select count(distinct basket) from basket_item) as item1_prevalence, 296 | # MAGIC CAST(item2_count AS FLOAT)/(select count(distinct basket) from basket_item) as item2_prevalence, 297 | # MAGIC CAST(both_count AS FLOAT)/CAST(item1_count AS FLOAT) AS confidence 298 | # MAGIC FROM item_pair_stats 299 | # MAGIC ) 300 | # MAGIC select 
*, confidence/item2_prevalence lift from cc 301 | 302 | # COMMAND ---------- 303 | 304 | # MAGIC %md 305 | # MAGIC 306 | # MAGIC # Explore item-pair statistics 307 | 308 | # COMMAND ---------- 309 | 310 | # MAGIC %sql 311 | # MAGIC 312 | # MAGIC select * from item_pair_stats order by confidence desc; 313 | 314 | # COMMAND ---------- 315 | 316 | # MAGIC %sql 317 | # MAGIC select item1, item2, confidence, lift from item_pair_stats 318 | # MAGIC where item2 rlike 'Non-small cell lung cancer' 319 | # MAGIC and item1 rlike 'MEDICATION' 320 | # MAGIC order by lift desc; 321 | 322 | # COMMAND ---------- 323 | 324 | # MAGIC %md 325 | # MAGIC 326 | # MAGIC ### Extra credit: 327 | # MAGIC 328 | # MAGIC * How would you find the low-confidence examples? 329 | # MAGIC 330 | # MAGIC * What medication has the highest lift for predicting Non-small cell lung cancer? Is it reasonable to use this as a predictor? 331 | # MAGIC 332 | # MAGIC * What does a lift less than 1 mean? 333 | 334 | # COMMAND ---------- 335 | 336 | # MAGIC %md 337 | # MAGIC 338 | # MAGIC # Generate Interactive Co-occurrence Graph 339 | 340 | # COMMAND ---------- 341 | 342 | # MAGIC %md 343 | # MAGIC 344 | # MAGIC We can't plot all the edges in this graph, so we need to filter out the weak ones. 
First let's plot a distribution and decide where to make the cut-off: 345 | 346 | # COMMAND ---------- 347 | 348 | # MAGIC %python 349 | # MAGIC 350 | # MAGIC # select all the item pairs with confidence greater than 0.5 351 | # MAGIC ip_stats = spark.sql("select * from item_pair_stats where confidence > 0.5").toPandas() 352 | # MAGIC 353 | # MAGIC # reformat as two separate tables, one for nodes and the other for edges 354 | # MAGIC nodes, edges = get_nodes_and_edges_from_item_pair_stats(ip_stats) 355 | # MAGIC 356 | # MAGIC # decide which colors to use for the different categories of nodes 357 | # MAGIC color_map = {'PATIENT': '#FF9999', 'CONDITION': '#9999FF', 'MEDICATION': '#99FF99', 'OBSERVATION':'#FFFF99'} 358 | # MAGIC 359 | # MAGIC # split off the category type from the node label 360 | # MAGIC label_parts = [lbl.split(':') for lbl in nodes['label']] 361 | # MAGIC 362 | # MAGIC # make separate colums for node characteristics (to be used by the vis.js library) 363 | # MAGIC nodes['category'] = [lp[0] for lp in label_parts] 364 | # MAGIC 365 | # MAGIC # 'label' is the text that appears on the node 366 | # MAGIC nodes['label'] = [lp[1] for lp in label_parts] 367 | # MAGIC # yup, color 368 | # MAGIC nodes['color'] = [color_map[cat] for cat in nodes['category']] 369 | # MAGIC # 'title' is the text that appears on mouseover 370 | # MAGIC nodes['title'] = [ '\n'.join([row['category'], 371 | # MAGIC row['label'], 372 | # MAGIC 'count: ' + str(row['count']), 373 | # MAGIC 'prevalence: ' + str(row['prevalence'])]) 374 | # MAGIC for i, row in nodes.iterrows()] 375 | # MAGIC 376 | # MAGIC nodes 377 | 378 | # COMMAND ---------- 379 | 380 | # MAGIC %python 381 | # MAGIC 382 | # MAGIC display(ip_stats.hist(column='confidence', bins=15)[0][0]) ### ??? 
383 | 384 | # COMMAND ---------- 385 | 386 | 387 | 388 | # COMMAND ---------- 389 | 390 | # MAGIC %python 391 | # MAGIC 392 | # MAGIC # make sure the plots directory exists, then save the cooccurrence plot there 393 | # MAGIC dbutils.fs.mkdirs('/FileStore/plots') 394 | # MAGIC export_to_vis_js(nodes, edges, 'Synthea Co-occurrence Demo', '/dbfs/FileStore/plots/synthea_cooccurrence_demo.html') 395 | 396 | # COMMAND ---------- 397 | 398 | # MAGIC %md 399 | # MAGIC 400 | # MAGIC This is just a demo of linking to the file store. You will need to customize the hyperlink by copying the correct number from your own Databricks URL: 401 | # MAGIC 402 | # MAGIC `https://adb-1953517438448055.15.azuredatabricks.net/?o=`__1953517438448055__`#notebook/3520119352938610/command/2583312897290483` 403 | # MAGIC 404 | # MAGIC 405 | # MAGIC View results [here](https://adb-7320327251662587.7.azuredatabricks.net/files/plots/synthea_cooccurrence_demo.html?o=7320327251662587) 406 | 407 | # COMMAND ---------- 408 | 409 | 410 | -------------------------------------------------------------------------------- /3_Synthea_predict_breast_cancer.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Train an Explainable ML Classifier 5 | 6 | # COMMAND ---------- 7 | 8 | # If you have not permanently installed this package on your cluster, you can just install it temporarily by removing the # sign from the next line: 9 | #! pip install interpret 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %sql 14 | # MAGIC use emr_sample; 15 | 16 | # COMMAND ---------- 17 | 18 | # MAGIC %md 19 | # MAGIC 20 | # MAGIC # Breast Cancer 21 | # MAGIC 22 | # MAGIC Predict whether breast cancer will be diagnosed in a given encounter. Exclude patients who have a current diagnosis of breast cancer. 
23 | 24 | # COMMAND ---------- 25 | 26 | # MAGIC %sql 27 | # MAGIC 28 | # MAGIC select description, count(*) tally from conditions where code = '254837009' group by description 29 | 30 | # COMMAND ---------- 31 | 32 | # DBTITLE 1,Feature engineering 33 | # MAGIC %sql 34 | # MAGIC create or replace temporary view patient_breast_cancer as 35 | # MAGIC with 36 | # MAGIC retro_numbered_encounters as ( 37 | # MAGIC SELECT *, 38 | # MAGIC ROW_NUMBER() OVER (PARTITION BY patient ORDER BY date(start) DESC) AS row_number 39 | # MAGIC FROM encounters 40 | # MAGIC ), 41 | # MAGIC most_recent_encounter as ( 42 | # MAGIC select * from retro_numbered_encounters where row_number = 1 43 | # MAGIC ), 44 | # MAGIC breast_ca_conditions as ( 45 | # MAGIC select * from conditions c where c.code = 254837009 -- 'Malignant neoplasm of breast (disorder)' 46 | # MAGIC ) 47 | # MAGIC select concat_ws(' ', p.first, p.last) patient_name, p.gender, p.race, p.ethnicity, 48 | # MAGIC floor (datediff(date(e.start), date(p.birthdate))/365.24) age, 49 | # MAGIC case when c.code is null then 0 else 1 end as breast_cancer 50 | # MAGIC from most_recent_encounter e 51 | # MAGIC join patients p on e.patient = p.id 52 | # MAGIC left outer join breast_ca_conditions c on c.patient = e.patient; 53 | # MAGIC 54 | # MAGIC 55 | # MAGIC 56 | # MAGIC select * from patient_breast_cancer limit 5; 57 | 58 | # COMMAND ---------- 59 | 60 | # MAGIC %python 61 | # MAGIC 62 | # MAGIC pbc = spark.sql('select * from patient_breast_cancer').toPandas() 63 | # MAGIC # type(pbc.age[0]) # decimal.Decimal 64 | # MAGIC pbc['age'] = pbc['age'].astype(float) 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %python 69 | # MAGIC 70 | # MAGIC pbc['age'].dtypes 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %python 75 | # MAGIC 76 | # MAGIC import pandas as pd 77 | # MAGIC from sklearn.model_selection import train_test_split 78 | # MAGIC from interpret.glassbox import ExplainableBoostingClassifier 79 | # MAGIC 80 | # MAGIC X = 
pbc[['gender', 'race', 'ethnicity', 'age']] 81 | # MAGIC y = pbc['breast_cancer'] 82 | # MAGIC 83 | # MAGIC seed = 1 84 | # MAGIC X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed) 85 | # MAGIC 86 | # MAGIC ebm = ExplainableBoostingClassifier(random_state=seed) 87 | # MAGIC ebm.fit(X_train, y_train) 88 | 89 | # COMMAND ---------- 90 | 91 | # MAGIC %python 92 | # MAGIC 93 | # MAGIC from interpret import show 94 | # MAGIC 95 | # MAGIC ebm_global = ebm.explain_global() 96 | # MAGIC show(ebm_global) 97 | 98 | # COMMAND ---------- 99 | 100 | # MAGIC %python 101 | # MAGIC 102 | # MAGIC ebm_local = ebm.explain_local(X_test[10:15], y_test[10:15]) 103 | # MAGIC show(ebm_local) 104 | 105 | # COMMAND ---------- 106 | 107 | # MAGIC %python 108 | # MAGIC # y_test 109 | # MAGIC 110 | # MAGIC p_test = ebm.predict_proba(X_test)[:,1] 111 | # MAGIC 112 | # MAGIC actual_predicted_pdf = pd.DataFrame({'actual':y_test, 'predicted_probability':p_test}) 113 | # MAGIC 114 | # MAGIC ## I'll plot these densities in R. 
Export the data to the database: 115 | # MAGIC spark.createDataFrame(actual_predicted_pdf).createOrReplaceTempView("actual_predicted") 116 | # MAGIC 117 | # MAGIC ## or make a permanent table: 118 | # MAGIC # actual_predicted_pdf.write.mode("overwrite").saveAsTable("actual_predicted") 119 | 120 | # COMMAND ---------- 121 | 122 | # MAGIC %r 123 | # MAGIC options(repr.plot.width=800, repr.plot.height=400) 124 | 125 | # COMMAND ---------- 126 | 127 | # MAGIC %r 128 | # MAGIC library(dplyr) 129 | # MAGIC library(sparklyr) 130 | # MAGIC library(ggplot2) 131 | # MAGIC 132 | # MAGIC sc <- spark_connect(method = "databricks") 133 | # MAGIC 134 | # MAGIC spark_read_table(sc, "actual_predicted") %>% 135 | # MAGIC collect %>% # download it locally 136 | # MAGIC mutate(actual=factor(actual)) %>% 137 | # MAGIC ggplot(aes(x=predicted_probability, fill=actual)) + geom_density(alpha=0.5) 138 | 139 | # COMMAND ---------- 140 | 141 | # MAGIC %python 142 | # MAGIC 143 | # MAGIC from interpret import perf 144 | # MAGIC roc = perf.ROC(ebm.predict_proba, feature_names=X_train.columns) 145 | # MAGIC 146 | # MAGIC roc_explanation = roc.explain_perf(X_test, y_test) 147 | # MAGIC show(roc_explanation) 148 | 149 | # COMMAND ---------- 150 | 151 | 152 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Robert M. 
Horton, PhD 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ML_with_simulated_EMR.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmhorton/EMR-data-science/56efdf97f4961f9b948b3b2ced88d0637ca9c27c/ML_with_simulated_EMR.pptx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EMR-data-science: Introduction to Data Science with Simulated Electronic Medical Record Data 2 | 3 | ## Sanity checking skills for clinical informatics 4 | 5 | This is a collection of open source educational materials (mostly Databricks notebooks) for introducing fundamental concepts of data science to a clinical audience. 
We focus on exploratory analysis, visualization, and interpretable machine learning (ML) models assuming that these will be particularly useful skills for clinician data scientists involved in planning and oversight of research, who will need to sanity check various findings. 6 | 7 | The data for these exercises was generated by Synthea using the standard collection of modules. ML is most useful in situations where classifications or predictions of outcomes must be made on the basis of many weak associations (if they can be made based on a small number of strong associations, you probably don't need ML). Unfortunately, Synthea data often lacks the subtle statistical relationships among variables that would make for compelling machine learning demonstrations. The missing subtlety is sometimes manifested in associations that have not been included in the simulation, and sometimes in associations that are overly significant. This makes some outcomes impossible to predict, while others can be predicted with far too great certainty. 8 | 9 | However, the same assortment of statistically inappropriate relationships that make it difficult to demonstrate ML on this data make it a treasure trove for sanity checking! Clinicians will easily be able to identify associations between disorders, treatments, observations, and patient characteristics that are either suspiciously strong or conspicuously absent. 10 | 11 | After negotiating some potential pitfalls, we are able to identify a set of features correlated (but not too strongly correlated) with a clinical outcome, which lets us demonstrate a machine learning classifier. The model we use is an Explainable Boosting Machine (EBM), a form of generalized additive model that comes with its own visualization tools for understanding the contribution of each feature to the prediction. 
12 | 13 | These are the HTML versions of the notebooks: 14 | 15 | - [0_Load_Data](https://rmhorton.github.io/EMR-data-science/0_Load_Data.html) 16 | - [1_Synthea_exploration](https://rmhorton.github.io/EMR-data-science/1_Synthea_exploration.html) 17 | - [2_Synthea_cooccurrence](https://rmhorton.github.io/EMR-data-science/2_Synthea_cooccurrence.html) 18 | - [3_Synthea_predict_breast_cancer](https://rmhorton.github.io/EMR-data-science/3_Synthea_predict_breast_cancer.html) 19 | 20 | Co-occurrence plots, using various metrics: 21 | - [confidence](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html) 22 | - [lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=lift) 23 | - [log2lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=log2lift) 24 | 25 | 26 | ## Sample Data 27 | 28 | The 'sample_data.zip' archive contains CSV files copied from the "[Synthetic Mass](https://synthetichealth.github.io/synthea-sample-data/downloads/synthea_sample_data_csv_apr2020.zip)" 1k patient sample. 29 | 30 | This dataset is described in this reference: 31 | ``` 32 | Walonoski J, Klaus S, Granger E, Hall D, Gregorowicz A, Neyarapally G, Watson A, Eastman J. 33 | Synthea™ Novel coronavirus (COVID-19) model and synthetic data set. 34 | Intelligence-Based Medicine. 2020 Nov;1:100007. https://doi.org/10.1016/j.ibmed.2020.100007 35 | ``` 36 | 37 | ## Workshop Instructions 38 | 39 | The workshop instructions are in the [ML_with_simulated_EMR.pptx](ML_with_simulated_EMR.pptx) file; see Part 0: Setting up Databricks. -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | HTML files from open source workshop materials. 
2 | 3 | Copies of notebooks: 4 | - [0_Load_Data](https://rmhorton.github.io/EMR-data-science/0_Load_Data.html) 5 | - [1_Synthea_exploration](https://rmhorton.github.io/EMR-data-science/1_Synthea_exploration.html) 6 | - [2_Synthea_cooccurrence](https://rmhorton.github.io/EMR-data-science/2_Synthea_cooccurrence.html) 7 | - [3_Synthea_predict_breast_cancer](https://rmhorton.github.io/EMR-data-science/3_Synthea_predict_breast_cancer.html) 8 | 9 | - test[3_Synthea_predict_breast_cancer](https://rmhorton.github.io/virtual-generalist/workshop/3_Synthea_predict_breast_cancer.html) 10 | 11 | Interactive visualizations: 12 | - [confidence](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html) 13 | - [lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=lift) 14 | - [log2lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=log2lift) 15 | -------------------------------------------------------------------------------- /extra_credit.sql: -------------------------------------------------------------------------------- 1 | -- What is the total number of patients? 2 | select count(distinct id) from patients; 3 | ; 4 | 5 | -- What was the date of the most recent encounter for each patient? 6 | select patient, max(date(START)) most_recent_encounter from encounters group by patient 7 | ; 8 | 9 | -- How would you discover observations related to 'pain'? 10 | select description, count(*) tally from observations where lower(description) rlike 'pain' group by description 11 | ; 12 | 13 | --- What are the different kinds of encounters, and how many of each are in the database? 14 | select encounterclass, count(*) tally from encounters group by encounterclass order by tally desc 15 | ; 16 | 17 | -- What is the most common medication and dose? 
18 | select description, count(*) tally from medications group by description order by tally desc 19 | ; 20 | 21 | -- What is the most common disorder treated by medication? 22 | select reasondescription, count(*) tally from medications group by reasondescription order by tally desc 23 | ; 24 | 25 | --- What are the most common prescriptions for hypertension? 26 | select description, count(*) tally from medications where reasondescription == 'Hypertension' group by description order by tally desc 27 | ; 28 | 29 | -- How would you get only the latest measurement for each patient? 30 | -------------------------------------------------------------------------------- /sample_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmhorton/EMR-data-science/56efdf97f4961f9b948b3b2ced88d0637ca9c27c/sample_data.zip --------------------------------------------------------------------------------