\n'
75 | # MAGIC ' \n'
76 | # MAGIC ' \n'
123 | # MAGIC ' \n'
124 | # MAGIC ' \n'
125 | # MAGIC )
126 | # MAGIC with open(html_file_name, "wt") as html_file:
127 | # MAGIC html_file.write(html_string)
128 |
129 | # COMMAND ----------
130 |
131 | # MAGIC %md
132 | # MAGIC
133 | # MAGIC # Compute item-pair statistics
134 |
135 | # COMMAND ----------
136 |
137 | # MAGIC %md
138 | # MAGIC
139 | # MAGIC We're going to name items by their descriptions, so we need to check that each item only has one description.
140 |
141 | # COMMAND ----------
142 |
143 | # MAGIC %sql
144 | # MAGIC
145 | # MAGIC use emr_sample;
146 | # MAGIC
147 | # MAGIC show tables;
148 |
149 | # COMMAND ----------
150 |
151 | # DBTITLE 1,Check for denormalized descriptions in medications
152 | # MAGIC %sql
153 | # MAGIC
154 | # MAGIC select code, collect_list(distinct lower(description)) description_list from medications group by code order by size(description_list) desc
155 | # MAGIC
156 | # MAGIC -- medication code '999999' appears to be bogus; it is used for 4 different things. All the other differences in description are just in capitalization.
157 |
158 | # COMMAND ----------
159 |
160 | # DBTITLE 1,Check for denormalized descriptions in conditions
161 | # MAGIC %sql
162 | # MAGIC select code, count(*) tally, collect_list(distinct description) descriptions from conditions group by code order by size(descriptions) desc
163 | # MAGIC
164 | # MAGIC --- only 4 codes have multiple descriptions; 3 of these are trivial differences
165 | # MAGIC --- code '427089005' could be either "Male Infertility" or "Diabetes from Cystic Fibrosis"; we'll skip that code
166 |
167 | # COMMAND ----------
168 |
169 | # DBTITLE 1,Collect 'baskets' and 'items'
170 | # MAGIC %sql
171 | # MAGIC
172 | # MAGIC create or replace temporary view basket_item as
173 | # MAGIC with
174 | # MAGIC pe1 as (
175 | # MAGIC select enc.id encounter
176 | # MAGIC , floor(datediff(enc.start, pat.birthdate)/365.24) age
177 | # MAGIC , pat.race
178 | # MAGIC , pat.ethnicity
179 | # MAGIC , pat.gender
180 | # MAGIC from patients pat join encounters enc on enc.patient=pat.id
181 | # MAGIC where enc.encounterclass in ('inpatient', 'outpatient')
182 | # MAGIC )
183 | # MAGIC ,
184 | # MAGIC pe2 as (
185 | # MAGIC select encounter,
186 | # MAGIC concat_ws('_', 'gender', gender) gender,
187 | # MAGIC concat_ws('_', 'ethnicity', ethnicity) ethnicity,
188 | # MAGIC concat_ws('_', 'race', race) race,
189 | # MAGIC case -- approximately 'MeSH' age ranges according to https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3825015/
190 | # MAGIC when age < 2 then 'age_00_01'
191 | # MAGIC when age < 5 then 'age_02_04'
192 | # MAGIC when age < 12 then 'age_05_11'
193 | # MAGIC when age < 18 then 'age_12_17'
194 | # MAGIC when age < 24 then 'age_18_23'
195 | # MAGIC when age < 44 then 'age_24_43'
196 | # MAGIC when age < 65 then 'age_44_64'
197 | # MAGIC when age < 80 then 'age_65_79'
198 | # MAGIC when age >=80 then 'age_80_plus'
199 | # MAGIC else 'age_unknown'
200 | # MAGIC end age_group
201 | # MAGIC from pe1
202 | # MAGIC )
203 | # MAGIC ,
204 | # MAGIC code_tally as (
205 | # MAGIC select code, count(*) tally, first(description) description
206 | # MAGIC from conditions
207 | # MAGIC where code != '427089005' -- could be either "Male Infertility" or "Diabetes from Cystic Fibrosis"
208 | # MAGIC group by code
209 | # MAGIC )
210 | # MAGIC ,
211 | # MAGIC encounter_condition_long as ( -- pairs each encounter with the patient's conditions that were active when the encounter ended
212 | # MAGIC select e.id encounter, ct.description condition
213 | # MAGIC from encounters e
214 | # MAGIC join conditions c on c.patient = e.patient
215 | # MAGIC join code_tally ct on ct.code = c.code -- use code_tally's single description per code as the condition name
216 | # MAGIC join pe2 on e.id = pe2.encounter -- keep only encounters that appear in pe2
217 | # MAGIC where ct.tally > 100 -- skip codes with 100 or fewer rows in conditions
218 | # MAGIC and c.start < e.stop -- condition started before the encounter ended
219 | # MAGIC and (c.stop > e.stop or c.stop is null) -- and had not stopped by then (null stop: no end recorded)
220 | # MAGIC )
221 | # MAGIC ,
222 | # MAGIC bmi as ( -- bucket each observation with code '39156-5' into a weight category
223 | # MAGIC select encounter, value as bmi,
224 | # MAGIC case -- https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html
225 | # MAGIC when value < 18.5 then 'bmi_underweight'
226 | # MAGIC when value < 25 then 'bmi_healthy_weight' -- underscore, not space, to match the other item labels
227 | # MAGIC when value < 30 then 'bmi_overweight'
228 | # MAGIC when value < 40 then 'bmi_obese'
229 | # MAGIC when value >= 40 then 'bmi_morbidly_obese'
230 | # MAGIC else 'bmi_unknown' -- reached only when value is null or not comparable
231 | # MAGIC end as bmi_category
232 | # MAGIC from observations where code = '39156-5'
233 | # MAGIC )
234 | # MAGIC ,
235 | # MAGIC patient_features_long as (
236 | # MAGIC select encounter, stack(4, gender, ethnicity, race, age_group) as feature from pe2
237 | # MAGIC )
238 | # MAGIC select encounter as basket, concat('CONDITION:', condition) as item from encounter_condition_long
239 | # MAGIC union
240 | # MAGIC select encounter as basket, concat('PATIENT:', feature) as item from patient_features_long
241 | # MAGIC union
242 | # MAGIC select encounter as basket, concat('MEDICATION:', lower(description)) as item from medications where code != '999999'
243 | # MAGIC union
244 | # MAGIC select encounter as basket, concat('OBSERVATION:', bmi_category) as item from bmi
245 | # MAGIC union
246 | # MAGIC select encounter, concat('OBSERVATION:', value) from observations where description = 'Tobacco smoking status NHIS'
247 | # MAGIC ;
248 |
249 | # COMMAND ----------
250 |
251 | # MAGIC %sql
252 | # MAGIC
253 | # MAGIC -- select count(*) from basket_item; -- 25754861
254 | # MAGIC
255 | # MAGIC select * from basket_item;
256 |
257 | # COMMAND ----------
258 |
259 | # DBTITLE 1,Calculate item-pair statistics
260 | # MAGIC %sql
261 | # MAGIC -- MIN_COUNT = 200
262 | # MAGIC
263 | # MAGIC drop table if exists item_pair_stats;
264 | # MAGIC
265 | # MAGIC create table item_pair_stats as
266 | # MAGIC with
267 | # MAGIC bi as (
268 | # MAGIC select basket, item
269 | # MAGIC from basket_item
270 | # MAGIC group by basket, item
271 | # MAGIC ),
272 | # MAGIC item_counts as (
273 | # MAGIC select item, count(*) item_count
274 | # MAGIC from bi
275 | # MAGIC group by item
276 | # MAGIC ),
277 | # MAGIC bi_count as (
278 | # MAGIC select bi.*, ic.item_count
279 | # MAGIC from bi
280 | # MAGIC join item_counts ic on bi.item=ic.item
281 | # MAGIC where ic.item_count > 200
282 | # MAGIC ),
283 | # MAGIC item_pair_stats as (
284 | # MAGIC select bi1.item item1, bi2.item item2,
285 | # MAGIC bi1.item_count item1_count, bi2.item_count item2_count,
286 | # MAGIC count(*) as both_count
287 | # MAGIC from bi_count bi1
288 | # MAGIC join bi_count bi2
289 | # MAGIC on bi1.basket = bi2.basket and bi1.item != bi2.item
290 | # MAGIC group by bi1.item, bi1.item_count,
291 | # MAGIC bi2.item, bi2.item_count
292 | # MAGIC ),
293 | # MAGIC cc as ( -- per-pair prevalence and confidence; DOUBLE avoids single-precision rounding of large counts
294 | # MAGIC SELECT item1, item2, item1_count, item2_count, both_count,
295 | # MAGIC CAST(item1_count AS DOUBLE)/(select count(distinct basket) from basket_item) as item1_prevalence, -- denominator: all distinct baskets in basket_item
296 | # MAGIC CAST(item2_count AS DOUBLE)/(select count(distinct basket) from basket_item) as item2_prevalence,
297 | # MAGIC CAST(both_count AS DOUBLE)/CAST(item1_count AS DOUBLE) AS confidence -- share of item1 baskets that also contain item2
298 | # MAGIC FROM item_pair_stats
299 | # MAGIC )
300 | # MAGIC select *, confidence/item2_prevalence lift from cc
301 |
302 | # COMMAND ----------
303 |
304 | # MAGIC %md
305 | # MAGIC
306 | # MAGIC # Explore item-pair statistics
307 |
308 | # COMMAND ----------
309 |
310 | # MAGIC %sql
311 | # MAGIC
312 | # MAGIC select * from item_pair_stats order by confidence desc;
313 |
314 | # COMMAND ----------
315 |
316 | # MAGIC %sql
317 | # MAGIC select item1, item2, confidence, lift from item_pair_stats
318 | # MAGIC where item2 rlike 'Non-small cell lung cancer'
319 | # MAGIC and item1 rlike 'MEDICATION'
320 | # MAGIC order by lift desc;
321 |
322 | # COMMAND ----------
323 |
324 | # MAGIC %md
325 | # MAGIC
326 | # MAGIC ### Extra credit:
327 | # MAGIC
328 | # MAGIC * How would you find the low-confidence examples?
329 | # MAGIC
330 | # MAGIC * What medication has the highest lift for predicting Non-small cell lung cancer? Is it reasonable to use this as a predictor?
331 | # MAGIC
332 | # MAGIC * What does a lift less than 1 mean?
333 |
334 | # COMMAND ----------
335 |
336 | # MAGIC %md
337 | # MAGIC
338 | # MAGIC # Generate Interactive Co-occurrence Graph
339 |
340 | # COMMAND ----------
341 |
342 | # MAGIC %md
343 | # MAGIC
344 | # MAGIC We can't plot all the edges in this graph, so we need to filter out the weak ones. First let's plot a distribution and decide where to make the cut-off:
345 |
346 | # COMMAND ----------
347 |
348 | # MAGIC %python
349 | # MAGIC
350 | # MAGIC # select all the item pairs with confidence greater than 0.5
351 | # MAGIC ip_stats = spark.sql("select * from item_pair_stats where confidence > 0.5").toPandas()
352 | # MAGIC
353 | # MAGIC # reformat as two separate tables, one for nodes and the other for edges
354 | # MAGIC nodes, edges = get_nodes_and_edges_from_item_pair_stats(ip_stats)
355 | # MAGIC
356 | # MAGIC # decide which colors to use for the different categories of nodes
357 | # MAGIC color_map = {'PATIENT': '#FF9999', 'CONDITION': '#9999FF', 'MEDICATION': '#99FF99', 'OBSERVATION':'#FFFF99'}
358 | # MAGIC
359 | # MAGIC # split off the category type from the node label
360 | # MAGIC label_parts = [lbl.split(':') for lbl in nodes['label']]
361 | # MAGIC
362 | # MAGIC # make separate colums for node characteristics (to be used by the vis.js library)
363 | # MAGIC nodes['category'] = [lp[0] for lp in label_parts]
364 | # MAGIC
365 | # MAGIC # 'label' is the text that appears on the node
366 | # MAGIC nodes['label'] = [lp[1] for lp in label_parts]
367 | # MAGIC # yup, color
368 | # MAGIC nodes['color'] = [color_map[cat] for cat in nodes['category']]
369 | # MAGIC # 'title' is the text that appears on mouseover
370 | # MAGIC nodes['title'] = [ '\n'.join([row['category'],
371 | # MAGIC row['label'],
372 | # MAGIC 'count: ' + str(row['count']),
373 | # MAGIC 'prevalence: ' + str(row['prevalence'])])
374 | # MAGIC for i, row in nodes.iterrows()]
375 | # MAGIC
376 | # MAGIC nodes
377 |
378 | # COMMAND ----------
379 |
380 | # MAGIC %python
381 | # MAGIC
382 | # MAGIC display(ip_stats.hist(column='confidence', bins=15)[0][0]) ### ???
383 |
384 | # COMMAND ----------
385 |
386 |
387 |
388 | # COMMAND ----------
389 |
390 | # MAGIC %python
391 | # MAGIC
392 | # MAGIC # make sure the plots directory exists, then save the cooccurrence plot there
393 | # MAGIC dbutils.fs.mkdirs('/FileStore/plots')
394 | # MAGIC export_to_vis_js(nodes, edges, 'Synthea Co-occurrence Demo', '/dbfs/FileStore/plots/synthea_cooccurrence_demo.html')
395 |
396 | # COMMAND ----------
397 |
398 | # MAGIC %md
399 | # MAGIC
400 | # MAGIC This is just a demo of linking to the file store. You will need to customize the hyperlink by copying the correct number from your own Databricks URL:
401 | # MAGIC
402 | # MAGIC `https://adb-1953517438448055.15.azuredatabricks.net/?o=`__1953517438448055__`#notebook/3520119352938610/command/2583312897290483`
403 | # MAGIC
404 | # MAGIC
405 | # MAGIC View results [here](https://adb-7320327251662587.7.azuredatabricks.net/files/plots/synthea_cooccurrence_demo.html?o=7320327251662587)
406 |
407 | # COMMAND ----------
408 |
409 |
410 |
--------------------------------------------------------------------------------
/3_Synthea_predict_breast_cancer.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC
4 | # MAGIC # Train an Explainable ML Classifier
5 |
6 | # COMMAND ----------
7 |
8 | # If you have not permanently installed this package on your cluster, you can just install it temporarily by removing the # sign from the next line:
9 | #! pip install interpret
10 |
11 | # COMMAND ----------
12 |
13 | # MAGIC %sql
14 | # MAGIC use emr_sample;
15 |
16 | # COMMAND ----------
17 |
18 | # MAGIC %md
19 | # MAGIC
20 | # MAGIC # Breast Cancer
21 | # MAGIC
22 | # MAGIC Predict whether breast cancer will be diagnosed in a given encounter. Exclude patients who have a current diagnosis of breast cancer.
23 |
24 | # COMMAND ----------
25 |
26 | # MAGIC %sql
27 | # MAGIC
28 | # MAGIC select description, count(*) tally from conditions where code = '254837009' group by description
29 |
30 | # COMMAND ----------
31 |
32 | # DBTITLE 1,Feature engineering
33 | # MAGIC %sql
34 | # MAGIC create or replace temporary view patient_breast_cancer as
35 | # MAGIC with
36 | # MAGIC retro_numbered_encounters as (
37 | # MAGIC SELECT *,
38 | # MAGIC ROW_NUMBER() OVER (PARTITION BY patient ORDER BY date(start) DESC) AS row_number
39 | # MAGIC FROM encounters
40 | # MAGIC ),
41 | # MAGIC most_recent_encounter as (
42 | # MAGIC select * from retro_numbered_encounters where row_number = 1
43 | # MAGIC ),
44 | # MAGIC breast_ca_conditions as ( -- at most one row per patient, so the left join below cannot duplicate a patient
45 | # MAGIC select distinct c.patient, c.code from conditions c where c.code = '254837009' -- 'Malignant neoplasm of breast (disorder)'; code is compared as a string, as elsewhere
46 | # MAGIC )
47 | # MAGIC select concat_ws(' ', p.first, p.last) patient_name, p.gender, p.race, p.ethnicity,
48 | # MAGIC floor (datediff(date(e.start), date(p.birthdate))/365.24) age,
49 | # MAGIC case when c.code is null then 0 else 1 end as breast_cancer
50 | # MAGIC from most_recent_encounter e
51 | # MAGIC join patients p on e.patient = p.id
52 | # MAGIC left outer join breast_ca_conditions c on c.patient = e.patient;
53 | # MAGIC
54 | # MAGIC
55 | # MAGIC
56 | # MAGIC select * from patient_breast_cancer limit 5;
57 |
58 | # COMMAND ----------
59 |
60 | # MAGIC %python
61 | # MAGIC
62 | # MAGIC pbc = spark.sql('select * from patient_breast_cancer').toPandas()
63 | # MAGIC # type(pbc.age[0]) # decimal.Decimal
64 | # MAGIC pbc['age'] = pbc['age'].astype(float)
65 |
66 | # COMMAND ----------
67 |
68 | # MAGIC %python
69 | # MAGIC
70 | # MAGIC pbc['age'].dtypes
71 |
72 | # COMMAND ----------
73 |
74 | # MAGIC %python
75 | # MAGIC
76 | # MAGIC import pandas as pd
77 | # MAGIC from sklearn.model_selection import train_test_split
78 | # MAGIC from interpret.glassbox import ExplainableBoostingClassifier
79 | # MAGIC
80 | # MAGIC X = pbc[['gender', 'race', 'ethnicity', 'age']]
81 | # MAGIC y = pbc['breast_cancer']
82 | # MAGIC
83 | # MAGIC seed = 1
84 | # MAGIC X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)
85 | # MAGIC
86 | # MAGIC ebm = ExplainableBoostingClassifier(random_state=seed)
87 | # MAGIC ebm.fit(X_train, y_train)
88 |
89 | # COMMAND ----------
90 |
91 | # MAGIC %python
92 | # MAGIC
93 | # MAGIC from interpret import show
94 | # MAGIC
95 | # MAGIC ebm_global = ebm.explain_global()
96 | # MAGIC show(ebm_global)
97 |
98 | # COMMAND ----------
99 |
100 | # MAGIC %python
101 | # MAGIC
102 | # MAGIC ebm_local = ebm.explain_local(X_test[10:15], y_test[10:15])
103 | # MAGIC show(ebm_local)
104 |
105 | # COMMAND ----------
106 |
107 | # MAGIC %python
108 | # MAGIC # y_test
109 | # MAGIC
110 | # MAGIC p_test = ebm.predict_proba(X_test)[:,1]
111 | # MAGIC
112 | # MAGIC actual_predicted_pdf = pd.DataFrame({'actual':y_test, 'predicted_probability':p_test})
113 | # MAGIC
114 | # MAGIC ## I'll plot these densities in R. Export the data to the database:
115 | # MAGIC spark.createDataFrame(actual_predicted_pdf).createOrReplaceTempView("actual_predicted")
116 | # MAGIC
117 | # MAGIC ## or make a permanent table:
118 | # MAGIC # spark.createDataFrame(actual_predicted_pdf).write.mode("overwrite").saveAsTable("actual_predicted")
119 |
120 | # COMMAND ----------
121 |
122 | # MAGIC %r
123 | # MAGIC options(repr.plot.width=800, repr.plot.height=400)
124 |
125 | # COMMAND ----------
126 |
127 | # MAGIC %r
128 | # MAGIC library(dplyr)
129 | # MAGIC library(sparklyr)
130 | # MAGIC library(ggplot2)
131 | # MAGIC
132 | # MAGIC sc <- spark_connect(method = "databricks")
133 | # MAGIC
134 | # MAGIC spark_read_table(sc, "actual_predicted") %>%
135 | # MAGIC collect %>% # download it locally
136 | # MAGIC mutate(actual=factor(actual)) %>%
137 | # MAGIC ggplot(aes(x=predicted_probability, fill=actual)) + geom_density(alpha=0.5)
138 |
139 | # COMMAND ----------
140 |
141 | # MAGIC %python
142 | # MAGIC
143 | # MAGIC from interpret import perf
144 | # MAGIC roc = perf.ROC(ebm.predict_proba, feature_names=X_train.columns)
145 | # MAGIC
146 | # MAGIC roc_explanation = roc.explain_perf(X_test, y_test)
147 | # MAGIC show(roc_explanation)
148 |
149 | # COMMAND ----------
150 |
151 |
152 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Robert M. Horton, PhD
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/ML_with_simulated_EMR.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rmhorton/EMR-data-science/56efdf97f4961f9b948b3b2ced88d0637ca9c27c/ML_with_simulated_EMR.pptx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EMR-data-science: Introduction to Data Science with Simulated Electronic Medical Record Data
2 |
3 | ## Sanity checking skills for clinical informatics
4 |
5 | This is a collection of open source educational materials (mostly Databricks notebooks) for introducing fundamental concepts of data science to a clinical audience. We focus on exploratory analysis, visualization, and interpretable machine learning (ML) models assuming that these will be particularly useful skills for clinician data scientists involved in planning and oversight of research, who will need to sanity check various findings.
6 |
7 | The data for these exercises was generated by Synthea using the standard collection of modules. ML is most useful in situations where classifications or predictions of outcomes must be made on the basis of many weak associations (if they can be made based on a small number of strong associations, you probably don't need ML). Unfortunately, Synthea data often lacks the subtle statistical relationships among variables that would make for compelling machine learning demonstrations. The missing subtlety is sometimes manifested in associations that have not been included in the simulation, and sometimes in associations that are overly significant. This makes some outcomes impossible to predict, while others can be predicted with far too great certainty.
8 |
9 | However, the same assortment of statistically inappropriate relationships that make it difficult to demonstrate ML on this data make it a treasure trove for sanity checking! Clinicians will easily be able to identify associations between disorders, treatments, observations, and patient characteristics that are either suspiciously strong or conspicuously absent.
10 |
11 | After negotiating some potential pitfalls, we are able to identify a set of features correlated (but not too strongly correlated) with a clinical outcome, which lets us demonstrate a machine learning classifier. The model we use is an Explainable Boosting Machine (EBM), a form of generalized additive model that comes with its own visualization tools for understanding the contribution of each feature to the prediction.
12 |
13 | These are the HTML versions of the notebooks:
14 |
15 | - [0_Load_Data](https://rmhorton.github.io/EMR-data-science/0_Load_Data.html)
16 | - [1_Synthea_exploration](https://rmhorton.github.io/EMR-data-science/1_Synthea_exploration.html)
17 | - [2_Synthea_cooccurrence](https://rmhorton.github.io/EMR-data-science/2_Synthea_cooccurrence.html)
18 | - [3_Synthea_predict_breast_cancer](https://rmhorton.github.io/EMR-data-science/3_Synthea_predict_breast_cancer.html)
19 |
20 | Co-occurrence plots, using various metrics:
21 | - [confidence](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html)
22 | - [lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=lift)
23 | - [log2lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=log2lift)
24 |
25 |
26 | ## Sample Data
27 |
28 | The 'sample_data.zip' archive contains CSV files copied from the "[Synthetic Mass](https://synthetichealth.github.io/synthea-sample-data/downloads/synthea_sample_data_csv_apr2020.zip)" 1k patient sample.
29 |
30 | This dataset is described in this reference:
31 | ```
32 | Walonoski J, Klaus S, Granger E, Hall D, Gregorowicz A, Neyarapally G, Watson A, Eastman J.
33 | Synthea™ Novel coronavirus (COVID-19) model and synthetic data set.
34 | Intelligence-Based Medicine. 2020 Nov;1:100007. https://doi.org/10.1016/j.ibmed.2020.100007
35 | ```
36 |
37 | ## Workshop Instructions
38 |
39 | The workshop instructions are in the [ML_with_simulated_EMR.pptx](ML_with_simulated_EMR.pptx) file; see Part 0: Setting up Databricks.
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | HTML files from open source workshop materials.
2 |
3 | Copies of notebooks:
4 | - [0_Load_Data](https://rmhorton.github.io/EMR-data-science/0_Load_Data.html)
5 | - [1_Synthea_exploration](https://rmhorton.github.io/EMR-data-science/1_Synthea_exploration.html)
6 | - [2_Synthea_cooccurrence](https://rmhorton.github.io/EMR-data-science/2_Synthea_cooccurrence.html)
7 | - [3_Synthea_predict_breast_cancer](https://rmhorton.github.io/EMR-data-science/3_Synthea_predict_breast_cancer.html)
8 |
9 | - test[3_Synthea_predict_breast_cancer](https://rmhorton.github.io/virtual-generalist/workshop/3_Synthea_predict_breast_cancer.html)
10 |
11 | Interactive visualizations:
12 | - [confidence](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html)
13 | - [lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=lift)
14 | - [log2lift](https://rmhorton.github.io/EMR-data-science/synthea_cooccurrence_demo.html?metric=log2lift)
15 |
--------------------------------------------------------------------------------
/extra_credit.sql:
--------------------------------------------------------------------------------
1 | -- What is the total number of patients?
2 | select count(distinct id) from patients
3 | ;
4 |
5 | -- What was the date of the most recent encounter for each patient?
6 | select patient, max(date(START)) most_recent_encounter from encounters group by patient
7 | ;
8 |
9 | -- How would you discover observations related to 'pain'?
10 | select description, count(*) tally from observations where lower(description) rlike 'pain' group by description
11 | ;
12 |
13 | --- What are the different kinds of encounters, and how many of each are in the database?
14 | select encounterclass, count(*) tally from encounters group by encounterclass order by tally desc
15 | ;
16 |
17 | -- What is the most common medication and dose?
18 | select description, count(*) tally from medications group by description order by tally desc
19 | ;
20 |
21 | -- What is the most common disorder treated by medication?
22 | select reasondescription, count(*) tally from medications group by reasondescription order by tally desc
23 | ;
24 |
25 | -- What are the most common prescriptions for hypertension?
26 | select description, count(*) tally from medications where reasondescription = 'Hypertension' group by description order by tally desc
27 | ;
28 |
29 | -- How would you get only the latest measurement for each patient?
30 |
--------------------------------------------------------------------------------
/sample_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rmhorton/EMR-data-science/56efdf97f4961f9b948b3b2ced88d0637ca9c27c/sample_data.zip
--------------------------------------------------------------------------------