├── py ├── terra_widgets │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_workspace_paths.py │ ├── workspace_metadata.py │ ├── workspace_paths.py │ └── html_snapshots.py ├── requirements.txt ├── setup.py ├── README.md └── py_cromwell_setup.py ├── storage-snippets ├── snippets_setup.R ├── snippets_setup.py ├── interact_with_html_snapshots.py ├── list_objects_in_bucket.R ├── list_objects_in_bucket.py ├── copy_file_from_workspace_bucket.R ├── copy_file_from_workspace_bucket.py ├── copy_data_to_workspace_bucket.py ├── copy_data_to_workspace_bucket.R └── README.md ├── .gitignore ├── sql-snippets ├── total_number_of_participants.sql ├── number_of_participants_with_measurements.sql ├── number_of_participants_with_med_conditions.sql ├── measurement_of_interest_by_site.ggplot ├── measurement_of_interest_by_site.plotnine ├── measurement_of_interest_by_sex_at_birth.plotnine ├── most_recent_measurement_of_interest_by_site.ggplot ├── measurement_of_interest_by_sex_at_birth.ggplot ├── most_recent_measurement_of_interest_by_sex_at_birth.plotnine ├── most_recent_measurement_of_interest_by_site.plotnine ├── most_recent_measurement_of_interest_by_sex_at_birth.ggplot ├── measurement_of_interest_by_age_and_sex_at_birth.ggplot ├── most_recent_measurement_of_interest_by_age_and_sex_at_birth.ggplot ├── snippets_setup.py ├── measurement_of_interest_by_age_and_sex_at_birth.plotnine ├── measurement_of_interest.sql ├── most_recent_measurement_of_interest_by_age_and_sex_at_birth.plotnine ├── snippets_setup.R ├── most_recent_measurement_of_interest.sql ├── measurements_of_interest_summary.sql ├── measurements_of_interest_summary_test.py ├── most_recent_measurement_of_interest_test.py ├── measurement_of_interest_test.py └── README.md ├── dataset-snippets ├── summarize_a_dataframe.R ├── summarize_a_dataframe.py ├── add_age_to_demographics.py ├── join_dataframes.py ├── add_age_to_demographics.R ├── snippets_setup.py ├── join_dataframes.R ├── snippets_setup.R ├── 
summarize_a_survey_module.R ├── summarize_a_survey_module.py ├── measurement_by_sex_at_birth.ggplot ├── measurement_by_sex_at_birth.plotnine ├── measurement_by_age_and_sex_at_birth.ggplot ├── summarize_a_survey_by_question_concept_id.py ├── summarize_a_survey_by_question_concept_id.R ├── measurement_by_age_and_sex_at_birth.plotnine └── README.md ├── .github └── PULL_REQUEST_TEMPLATE.md ├── README.md ├── LICENSE.txt ├── r └── r_cromwell_setup.R └── CONTRIBUTING.md /py/terra_widgets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /py/terra_widgets/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /py/requirements.txt: -------------------------------------------------------------------------------- 1 | firecloud 2 | ipython 3 | ipywidgets 4 | multiprocess 5 | pandas 6 | -------------------------------------------------------------------------------- /storage-snippets/snippets_setup.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) # Data wrangling packages. 
2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *smoke_test.R 6 | *smoke_test.py 7 | *.json 8 | *.html 9 | -------------------------------------------------------------------------------- /storage-snippets/snippets_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | -------------------------------------------------------------------------------- /storage-snippets/interact_with_html_snapshots.py: -------------------------------------------------------------------------------- 1 | from terra_widgets.html_snapshots import display_html_snapshots_widget 2 | 3 | # This will display a user interface to interact with HTML snapshots stored in the workspace bucket. 4 | display_html_snapshots_widget() 5 | -------------------------------------------------------------------------------- /sql-snippets/total_number_of_participants.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute the count of unique participants in our All of Us cohort. 
3 | SELECT 4 | COUNT(DISTINCT person_id) AS total_number_of_participants 5 | FROM 6 | `{CDR}.person` 7 | WHERE 8 | person_id IN ({COHORT_QUERY}) 9 | -------------------------------------------------------------------------------- /storage-snippets/list_objects_in_bucket.R: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code lists objects in your Google Bucket 4 | 5 | # Get the bucket name 6 | my_bucket <- Sys.getenv('WORKSPACE_BUCKET') 7 | 8 | # List objects in the bucket 9 | system(paste0("gsutil ls -r ", my_bucket), intern=T) 10 | 11 | 12 | -------------------------------------------------------------------------------- /sql-snippets/number_of_participants_with_measurements.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute the count of unique participants in our All of Us cohort 3 | -- that have at least one measurement. 4 | SELECT 5 | COUNT(DISTINCT person_id) AS number_of_participants_with_measurements 6 | FROM 7 | `{CDR}.measurement` 8 | WHERE 9 | person_id IN ({COHORT_QUERY}) 10 | -------------------------------------------------------------------------------- /sql-snippets/number_of_participants_with_med_conditions.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute the count of unique participants in our All of Us cohort 3 | -- that have at least one condition. 
4 | SELECT 5 | COUNT(DISTINCT person_id) AS number_of_participants_with_med_conditions 6 | FROM 7 | `{CDR}.condition_occurrence` 8 | WHERE 9 | person_id IN ({COHORT_QUERY}) 10 | -------------------------------------------------------------------------------- /storage-snippets/list_objects_in_bucket.py: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code lists objects in your Google Bucket 4 | 5 | # Get the bucket name 6 | my_bucket = os.getenv('WORKSPACE_BUCKET') 7 | 8 | # List objects in the bucket 9 | print(subprocess.check_output(f"gsutil ls -r {my_bucket}", shell=True).decode('utf-8')) 10 | 11 | 12 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_dataframe.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_dataframe' to display summary statistics for a dataframe. 2 | # It assumes snippet 'Setup' has been executed. 3 | # See also https://www.rdocumentation.org/packages/skimr/versions/1.0.7/topics/skim 4 | 5 | 6 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 7 | print(skim(YOUR_DATASET_NAME_person_df)) 8 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | \ 2 | 3 | Unfortunately we don't have automated testing configured for the code in this 4 | repository yet so we set up this checklist as an *automatic reminder*: 5 | 6 | - [ ] Ensure that the smoke tests pass using the current (or upcoming) CDR 7 | - [ ] Update documentation relevant to this pull request 8 | 9 | Questions? See [CONTRIBUTING.md](https://github.com/all-of-us/workbench-snippets/blob/master/CONTRIBUTING.md) 10 | or file an issue so that we can get it documented! 
11 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_dataframe.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_dataframe' to display summary statistics for a dataframe. 2 | # It assumes snippet 'Setup' has been executed. 3 | # See also https://towardsdatascience.com/exploring-your-data-with-just-1-line-of-python-4b35ce21a82d 4 | 5 | 6 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 7 | YOUR_DATASET_NAME_person_df.loc[:10000,:].profile_report() # Examine up to the first 10,000 rows. Larger 8 | # dataframes can be profiled, but it takes more time. 9 | -------------------------------------------------------------------------------- /dataset-snippets/add_age_to_demographics.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'add_age_to_demographics' to calculate the age of people in your demographics. 2 | # It assumes the 'Setup' snippet has been executed. 3 | # It also assumes that you got your demographics dataframe from Dataset Builder 4 | 5 | # Note: This snippet calculates current age and does not take into account whether the person is already dead 6 | 7 | 8 | ## -----[ CHANGE THE DATAFRAME NAME(S) `YOUR_DATASET_NAME_person_df` TO MATCH YOURS FROM DATASET BUILDER] ----- 9 | YOUR_DATASET_NAME_person_df['age'] = pd.to_datetime('today').year - YOUR_DATASET_NAME_person_df['date_of_birth'].dt.year 10 | -------------------------------------------------------------------------------- /dataset-snippets/join_dataframes.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'join_dataframes' to join together two dataframes. 2 | # It assumes the 'Setup' snippet has been executed. 
3 | # 4 | # In the example below, it joins Demographics '_person_df' and Measurements '_measurement_df' using 5 | # any columns they have in common, which in this case should only be 'person_id'. 6 | # 7 | # See also https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/pandas.merge.html 8 | 9 | 10 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 11 | measurement_df = pd.merge(left=YOUR_DATASET_NAME_person_df, right=YOUR_DATASET_NAME_measurement_df, how='inner') 12 | 13 | measurement_df.shape 14 | -------------------------------------------------------------------------------- /dataset-snippets/add_age_to_demographics.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'add_age_to_demographics' to calculate the age of people in your demographics. 2 | # It assumes the 'Setup' snippet has been executed. 3 | # It also assumes that you got your demographics dataframe from Dataset Builder 4 | 5 | # Note: This snippet calculates current age and does not take into account whether the person is already dead 6 | 7 | 8 | ## -----[ CHANGE THE DATAFRAME NAME(S) `YOUR_DATASET_NAME_person_df` TO MATCH YOURS FROM DATASET BUILDER] ----- 9 | YOUR_DATASET_NAME_person_df <- YOUR_DATASET_NAME_person_df %>% 10 | mutate_if(is.list, as.character) %>% 11 | mutate(age = year(today()) - year(YOUR_DATASET_NAME_person_df$date_of_birth)) 12 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_site.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 
6 | ggplot(aes(x = src_id, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | coord_flip() + 12 | ylab(str_glue('{UNIT_NAME}')) + 13 | labs(title = str_glue('All {MEASUREMENT_NAME} measurements, by site'), 14 | caption = 'Source: All Of Us Data') 15 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_site.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | 3 | # meas_filter is a column of True and False 4 | meas_filter = measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 5 | (ggplot(measurement_of_interest_df[meas_filter], aes(x = 'src_id', y = 'value_as_number')) + 6 | geom_boxplot() + 7 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 8 | position = position_dodge(width = 0.9), va = 'top') + 9 | # scale_y_log10() + # Uncomment if the data looks skewed. 10 | coord_flip() + 11 | ylab(f'{UNIT_NAME}') + 12 | ggtitle(f'All {MEASUREMENT_NAME} measurements, by site\nSource: All Of Us Data') + 13 | theme(figure_size=(12, 6))) 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Workbench snippets 2 | 3 | ## How to use the workbench snippets 4 | 5 | Please see the welcome page for the [All of Us Researcher Workbench](https://workbench.researchallofus.org/). It has both a tutorial video and several articles in user support documentation. 6 | 7 | ## How to add/update a workbench snippet 8 | 9 | First see [CONTRIBUTING](./CONTRIBUTING.md) for general getting started instructions. 
10 | 11 | If you want to add/modify a snippet that uses a dataframe from Dataset Builder as its input, then see [dataset-snippets/README](./dataset-snippets/README.md). 12 | 13 | Otherwise, see the other snippets collections such as 14 | 15 | * [sql-snippets/README](./sql-snippets/README.md) 16 | * [storage-snippets/README](./storage-snippets/README.md) 17 | * [terra-widgets/README](./py/README.md) 18 | -------------------------------------------------------------------------------- /dataset-snippets/snippets_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import pandas_profiling 5 | import plotnine 6 | from plotnine import * # Provides a ggplot-like interface to matplotlib. 7 | from IPython.display import display 8 | 9 | ## Plot setup. 10 | theme_set(theme_bw(base_size = 11)) # Default theme for plots. 11 | 12 | def get_boxplot_fun_data(df): 13 | """Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 14 | 15 | Args: 16 | d: A data frame. 17 | Returns: 18 | A data frame with column y as max and column label as length. 19 | """ 20 | d = {'y': max(df), 'label': f'N = {len(df)}'} 21 | return(pd.DataFrame(data=d, index=[0])) 22 | 23 | # NOTE: if you get any errors from this cell, restart your kernel and run it again. 24 | -------------------------------------------------------------------------------- /dataset-snippets/join_dataframes.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'join_dataframes' to join together two dataframes. 2 | # It assumes the 'Setup' snippet has been executed. 3 | # 4 | # In the example below, it joins Demographics '_person_df' and Measurements '_measurement_df' using 5 | # any columns they have in common, which in this case should only be 'person_id'. 
6 | # 7 | # See also https://dplyr.tidyverse.org/reference/join.html and https://r4ds.had.co.nz/relational-data.html#understanding-joins 8 | 9 | 10 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 11 | measurement_df <- inner_join(YOUR_DATASET_NAME_person_df, 12 | YOUR_DATASET_NAME_measurement_df) %>% 13 | mutate_if(is.list, as.character) # Convert column type list as character. 14 | 15 | dim(measurement_df) 16 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | 3 | # meas_filter is a column of True and False 4 | meas_filter = measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 5 | (ggplot(measurement_of_interest_df[meas_filter], aes(x = 'sex_at_birth', y = 'value_as_number')) + 6 | geom_boxplot() + 7 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 8 | position = position_dodge(width = 0.9), va = 'top') + 9 | # scale_y_log10() + # Uncomment if the data looks skewed. 10 | ylab(f'{UNIT_NAME}') + 11 | ggtitle(f'All {MEASUREMENT_NAME} measurements, by site\nSource: All Of Us Data') + 12 | theme(figure_size=(12, 6), axis_text_x = element_text(angle=25, hjust=1))) 13 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_site.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | most_recent_measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 
6 | ggplot(aes(x = src_id, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | coord_flip() + 12 | ylab(str_glue('{UNIT_NAME}')) + 13 | labs(title = str_glue('Most recent {MEASUREMENT_NAME} measurement\nper person, by site'), 14 | caption = 'Source: All Of Us Data') 15 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | ggplot(aes(x = sex_at_birth, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | ylab(str_glue('{UNIT_NAME}')) + 12 | labs(title = str_glue('All {MEASUREMENT_NAME} measurements, by sex_at_birth'), 13 | caption = 'Source: All Of Us Data') + 14 | theme(axis.text.x = element_text(angle=25, hjust=1)) 15 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | 3 | meas_filter = most_recent_measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 
4 | (ggplot(most_recent_measurement_of_interest_df[meas_filter], aes(x = 'sex_at_birth', y = 'value_as_number')) + 5 | geom_boxplot() + 6 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 7 | position = position_dodge(width = 0.9), va = 'top') + 8 | # scale_y_log10() + # Uncomment if the data looks skewed. 9 | ylab(f'{UNIT_NAME}') + 10 | ggtitle(f'Most recent {MEASUREMENT_NAME} measurement\nper person, by sex_at_birth\nSource: All Of Us Data') + 11 | theme(figure_size=(12, 6), axis_text_x = element_text(angle=25, hjust=1))) 12 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_site.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | 3 | # meas_filter is a column of True and False 4 | meas_filter = most_recent_measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 5 | (ggplot(most_recent_measurement_of_interest_df[meas_filter], aes(x = 'src_id', y = 'value_as_number')) + 6 | geom_boxplot() + 7 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 8 | position = position_dodge(width = 0.9), va = 'top') + 9 | # scale_y_log10() + # Uncomment if the data looks skewed. 10 | coord_flip() + 11 | ylab(f'{UNIT_NAME}') + 12 | ggtitle(f'Most recent {MEASUREMENT_NAME} measurement\nper person, by site\nSource: All Of Us Data') + 13 | theme(figure_size=(12, 6))) 14 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 
2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | most_recent_measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | ggplot(aes(x = sex_at_birth, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | ylab(str_glue('{UNIT_NAME}')) + 12 | labs(title = str_glue('Most recent {MEASUREMENT_NAME} measurement\nper person, by sex_at_birth'), 13 | caption = 'Source: All Of Us Data') + 14 | theme(axis.text.x = element_text(angle=25, hjust=1)) 15 | -------------------------------------------------------------------------------- /storage-snippets/copy_file_from_workspace_bucket.R: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code copies a file from your Google Bucket into a dataframe 4 | 5 | # replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks) 6 | name_of_file_in_bucket <- 'test.csv' 7 | 8 | ######################################################################## 9 | ## 10 | ################# DON'T CHANGE FROM HERE ############################### 11 | ## 12 | ######################################################################## 13 | 14 | # Get the bucket name 15 | my_bucket <- Sys.getenv('WORKSPACE_BUCKET') 16 | 17 | # Copy the file from current workspace to the bucket 18 | system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern=T) 19 | 20 | # Load the file into a dataframe 21 | my_dataframe <- read_csv(name_of_file_in_bucket) 22 | head(my_dataframe) 23 | -------------------------------------------------------------------------------- /dataset-snippets/snippets_setup.R: 
-------------------------------------------------------------------------------- 1 | lapply(c('viridis', 'ggthemes', 'skimr'), 2 | function(pkg_name) { if(! pkg_name %in% installed.packages()) { install.packages(pkg_name)} } ) 3 | 4 | library(viridis) # A nice color scheme for plots. 5 | library(ggthemes) # Common themes to change the look and feel of plots. 6 | library(scales) # Graphical scales map data to aesthetics in plots. 7 | library(skimr) # Better summaries of data. 8 | library(lubridate) # Date library from the tidyverse. 9 | library(tidyverse) # Data wrangling packages. 10 | library(bigrquery) # Data extraction from Google BigQuery 11 | 12 | ## Plot setup. 13 | theme_set(theme_bw(base_size = 14)) # Default theme for plots. 14 | 15 | #' Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 16 | #' 17 | #' @param d A data frame. 18 | #' @return A data frame with column y as max and column label as length. 19 | get_boxplot_fun_data <- function(df) { 20 | return(data.frame(y = max(df), label = stringr::str_c('N = ', length(df)))) 21 | } 22 | -------------------------------------------------------------------------------- /storage-snippets/copy_file_from_workspace_bucket.py: -------------------------------------------------------------------------------- 1 | # This snippet assumes you run setup first 2 | 3 | # This code copies file in your Google Bucket and loads it into a dataframe 4 | 5 | # Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks) 6 | name_of_file_in_bucket = 'test.csv' 7 | 8 | ######################################################################## 9 | ## 10 | ################# DON'T CHANGE FROM HERE ############################### 11 | ## 12 | ######################################################################## 13 | 14 | # get the bucket name 15 | my_bucket = os.getenv('WORKSPACE_BUCKET') 16 | 17 | # copy csv file from the bucket to the 
current working space 18 | os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .") 19 | 20 | print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space') 21 | # save dataframe in a csv file in the same workspace as the notebook 22 | my_dataframe = pd.read_csv(name_of_file_in_bucket) 23 | my_dataframe.head() 24 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_age_and_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 20, repr.plot.width = 16) 3 | 4 | measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | mutate(age_at_measurement = year(as.period(interval(start = birth_datetime, end = measurement_date)))) %>% 7 | ggplot(aes(x = cut_width(age_at_measurement, width = 10, boundary = 0), y = value_as_number)) + 8 | geom_boxplot() + 9 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 10 | position = position_dodge(width = 0.9), vjust = -0.8) + 11 | # scale_y_log10() + # Uncomment if the data looks skewed. 
12 | coord_flip() + 13 | facet_wrap(~ sex_at_birth, nrow = length(unique(measurement_of_interest_df$sex_at_birth))) + 14 | xlab('age') + 15 | ylab(str_glue('{UNIT_NAME}')) + 16 | labs(title = str_glue('All {MEASUREMENT_NAME} measurements, by age, faceted by sex_at_birth'), 17 | caption = 'Source: All Of Us Data') 18 | -------------------------------------------------------------------------------- /storage-snippets/copy_data_to_workspace_bucket.py: -------------------------------------------------------------------------------- 1 | # This snippet assumes you run setup first 2 | 3 | # This code saves your dataframe into a csv file in a "data" folder in Google Bucket 4 | 5 | # Replace df with THE NAME OF YOUR DATAFRAME 6 | my_dataframe = df 7 | 8 | # Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks) 9 | destination_filename = 'test.csv' 10 | 11 | ######################################################################## 12 | ## 13 | ################# DON'T CHANGE FROM HERE ############################### 14 | ## 15 | ######################################################################## 16 | 17 | # save dataframe in a csv file in the same workspace as the notebook 18 | my_dataframe.to_csv(destination_filename, index=False) 19 | 20 | # get the bucket name 21 | my_bucket = os.getenv('WORKSPACE_BUCKET') 22 | 23 | # copy csv file to the bucket 24 | args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"] 25 | output = subprocess.run(args, capture_output=True) 26 | 27 | # print output from gsutil 28 | output.stderr 29 | -------------------------------------------------------------------------------- /storage-snippets/copy_data_to_workspace_bucket.R: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code saves your dataframe into a csv file in a "data" folder in Google Bucket 4 | 5 | # Replace 
df with THE NAME OF YOUR DATAFRAME 6 | my_dataframe <- df 7 | 8 | # Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks) 9 | destination_filename <- 'test.csv' 10 | 11 | ######################################################################## 12 | ## 13 | ################# DON'T CHANGE FROM HERE ############################### 14 | ## 15 | ######################################################################## 16 | 17 | # store the dataframe in current workspace 18 | write_excel_csv(my_dataframe, destination_filename) 19 | 20 | # Get the bucket name 21 | my_bucket <- Sys.getenv('WORKSPACE_BUCKET') 22 | 23 | # Copy the file from current workspace to the bucket 24 | system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern=T) 25 | 26 | # Check if file is in the bucket 27 | system(paste0("gsutil ls ", my_bucket, "/data/*.csv"), intern=T) 28 | -------------------------------------------------------------------------------- /py/setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based module for PIP installation of the Terra widgets package.""" 2 | 3 | import pathlib 4 | from setuptools import find_packages 5 | from setuptools import setup 6 | 7 | here = pathlib.Path(__file__).parent.resolve() 8 | # Get the requirements from the requirements file 9 | requirements = (here / 'requirements.txt').read_text(encoding='utf-8') 10 | # Get the long description from the README file 11 | long_description = (here / 'README.md').read_text(encoding='utf-8') 12 | 13 | setup( 14 | name='terra-widgets', 15 | version='0.0.1', 16 | license='BSD', 17 | 18 | description='Terra Notebook widgets', 19 | long_description=long_description, 20 | long_description_content_type='text/markdown', 21 | 22 | python_requires='>=3.7', 23 | install_requires=requirements, 24 | packages=find_packages(), 25 | 26 | 
url='https://github.com/all-of-us/workbench-snippets', 27 | project_urls={ 28 | 'Bug Reports': 'https://github.com/all-of-us/workbench-snippets/issues', 29 | 'Source': 'https://github.com/all-of-us/workbench-snippets/blob/main/py', 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_age_and_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 20, repr.plot.width = 16) 3 | 4 | most_recent_measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | mutate(age_at_measurement = year(as.period(interval(start = birth_datetime, end = measurement_date)))) %>% 7 | ggplot(aes(x = cut_width(age_at_measurement, width = 10, boundary = 0), y = value_as_number)) + 8 | geom_boxplot() + 9 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 10 | position = position_dodge(width = 0.9), vjust = -0.8) + 11 | # scale_y_log10() + # Uncomment if the data looks skewed. 
12 | coord_flip() + 13 | facet_wrap(~ sex_at_birth, nrow = length(unique(most_recent_measurement_of_interest_df$sex_at_birth))) + 14 | xlab('age') + 15 | ylab(str_glue('{UNIT_NAME}')) + 16 | labs(title = str_glue('Most recent {MEASUREMENT_NAME} measurement\nper person, by age, faceted by sex_at_birth'), 17 | caption = 'Source: All Of Us Data') 18 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_module.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to print a table of participant counts by question in a module 2 | # The snippet assumes that a dataframe containing survey questions and answers already exists 3 | 4 | # Update the next 3 lines 5 | 6 | survey_df <- YOUR_DATASET_NAME_survey_df 7 | module_name <- 'The Basics' 8 | denominator <- NULL 9 | 10 | #################################################################################### 11 | # DON'T CHANGE FROM HERE 12 | #################################################################################### 13 | summarize_a_module <- function(df, module=NULL, denominator=NULL) { 14 | if (!is.null(module)){ 15 | df <- df %>% filter(tolower(survey) == tolower(module)) 16 | } 17 | data <- df %>% group_by(survey, question_concept_id, question) %>% 18 | summarize(n_participant = n_distinct(person_id)) 19 | if (!is.null(denominator)) { 20 | data <- data %>% mutate(response_rate = paste0(round(100*n_participant/denominator,2),'%')) 21 | } 22 | data 23 | } 24 | 25 | summarize_a_module(survey_df, module_name, denominator) 26 | 27 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_module.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to print a table of participant counts by question in a module 2 | # The snippet assumes that a 
dataframe containing survey questions and answers already exists 3 | 4 | # Update the next 3 lines 5 | survey_df = YOUR_DATASET_NAME_survey_df 6 | module_name = 'The Basics' # e.g: 'The Basics', 'Lifestyle', 'Overall Health', etc. 7 | denominator = None # e.g: 200000 8 | 9 | #################################################################################### 10 | # DON'T CHANGE FROM HERE 11 | #################################################################################### 12 | 13 | def summarize_a_module(df, module=None, denominator=None): 14 | if module: 15 | df = df[df['survey'].str.lower() == module.lower()].copy() 16 | data = (df.groupby(['survey','question_concept_id','question'])['person_id'].nunique() 17 | .reset_index() 18 | .rename(columns={'person_id':'n_participant'})) 19 | if denominator: 20 | data['response_rate'] = round(100*data['n_participant']/denominator,2) 21 | return data 22 | 23 | summarize_a_module(df=survey_df, module=module_name, denominator=denominator) 24 | 25 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://r4ds.had.co.nz/data-visualisation.html 5 | 6 | 7 | options(repr.plot.height = 10, repr.plot.width = 16) 8 | 9 | # There could be many different measurements in the dataframe. By default, plot the first one. 
10 | measurement_to_plot <- unique(measurement_df$standard_concept_name)[1] 11 | 12 | measurement_df %>% 13 | filter(standard_concept_name == measurement_to_plot) %>% 14 | filter(!unit_concept_name %in% c('No matching concept', 'NULL')) %>% 15 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 16 | ggplot(aes(x = sex_at_birth, y = value_as_number)) + 17 | geom_boxplot() + 18 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 19 | position = position_dodge(width = 0.9), vjust = -0.8) + 20 | # scale_y_log10() + # Uncomment if the data looks skewed. 21 | facet_wrap(standard_concept_name ~ unit_concept_name, ncol = 2, scales = 'free') + 22 | labs(title = str_glue('Numeric values of measurements, by sex_at_birth'), caption = 'Source: All Of Us Data') + 23 | theme(axis.text.x = element_text(angle=25, hjust=1)) 24 | -------------------------------------------------------------------------------- /sql-snippets/snippets_setup.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | import plotnine 6 | from plotnine import * # Provides a ggplot-like interface to matplotlib. 7 | 8 | # Get the BigQuery curated dataset for the current workspace context. 9 | CDR = os.environ['WORKSPACE_CDR'] 10 | 11 | ## Plot setup. 12 | theme_set(theme_bw(base_size = 11)) # Default theme for plots. 13 | 14 | def get_boxplot_fun_data(df): 15 | """Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 16 | 17 | Args: 18 | df: A data frame. 19 | Returns: 20 | A data frame with column y as max and column label as length. 21 | """ 22 | d = {'y': max(df), 'label': f'N = {len(df)}'} 23 | return(pd.DataFrame(data=d, index=[0])) 24 | 25 | ## ---------------[ CHANGE THESE AS NEEDED] --------------------------------------- 26 | # Set default parameter values so that all snippets run successfully with no edits needed.
27 | COHORT_QUERY = f'SELECT person_id FROM `{CDR}.person`' # Default to all participants. 28 | MEASUREMENT_OF_INTEREST = 'hemoglobin' 29 | # Tip: the next four parameters could be set programmatically using one row from 30 | # the result of measurements_of_interest_summary.sql 31 | MEASUREMENT_CONCEPT_ID = 3004410 # Hemoglobin A1c 32 | UNIT_CONCEPT_ID = 8554 # percent 33 | MEASUREMENT_NAME = '' 34 | UNIT_NAME = '' 35 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019 All of Us Research Program 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://plotnine.readthedocs.io/en/stable/ 5 | 6 | 7 | # There could be many different measurements in the dataframe. By default, plot the first one. 8 | measurement_to_plot = measurement_df.standard_concept_name.unique()[0] 9 | 10 | # meas_filter is a column of True and False. 11 | meas_filter = ((measurement_df.standard_concept_name == measurement_to_plot) 12 | & (measurement_df.unit_concept_name != 'No matching concept') 13 | & (measurement_df.unit_concept_name.notna()) 14 | & (measurement_df.value_as_number < 9999999) # Get rid of nonsensical outliers. 15 | ) 16 | 17 | (ggplot(measurement_df[meas_filter], aes(x = 'sex_at_birth', y = 'value_as_number')) + 18 | geom_boxplot() + 19 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 20 | position = position_dodge(width = 0.9), va = 'top') + 21 | # scale_y_log10() + # Uncomment if the data looks skewed. 
22 | facet_wrap(('standard_concept_name', 'unit_concept_name'), ncol = 2, scales = 'free') + 23 | ggtitle(f'Numeric values of measurements, by sex_at_birth\nSource: All Of Us Data') + 24 | theme(figure_size=(12, 6), panel_spacing = .5, axis_text_x = element_text(angle=25, hjust=1))) 25 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_age_and_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | 3 | measurement_of_interest_df['age_at_measurement'] = ((pd.to_datetime(measurement_of_interest_df['measurement_date']) 4 | - measurement_of_interest_df['birth_datetime'].dt.tz_localize(None)).dt.days)//365.24 5 | measurement_of_interest_df['age_group'] = pd.cut(measurement_of_interest_df['age_at_measurement'], 6 | [-np.inf, 34.5, 49.5, 64.5, np.inf], 7 | labels=["<35", "35-49", "50-64", "65+"]) 8 | # meas_filter is a column of True and False 9 | meas_filter = measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 10 | age_group_not_null = (measurement_of_interest_df['age_group'].notnull()) 11 | 12 | (ggplot(measurement_of_interest_df[meas_filter & age_group_not_null], aes(x = 'age_group', y = 'value_as_number')) + 13 | geom_boxplot() + 14 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 15 | position = position_dodge(width = 0.9), va = 'top') + 16 | # scale_y_log10() + # Uncomment if the data looks skewed. 
17 | coord_flip() + 18 | facet_wrap('~ sex_at_birth', nrow = len(measurement_of_interest_df.sex_at_birth.unique())) + 19 | xlab('age') + 20 | ylab(f'{UNIT_NAME}') + 21 | ggtitle(f'All {MEASUREMENT_NAME} measurements, by age, faceted by sex_at_birth\nSource: All Of Us Data') + 22 | theme(figure_size=(12, 12))) 23 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Return row level data for a measurement for our cohort. 3 | -- 4 | -- PARAMETERS: 5 | -- MEASUREMENT_CONCEPT_ID: for example 3004410 # Hemoglobin A1c 6 | -- UNIT_CONCEPT_ID: for example 8554 # percent 7 | 8 | WITH 9 | -- 10 | -- Retrieve participants birthdate and sex_at_birth. 11 | -- 12 | persons AS ( 13 | SELECT 14 | person_id, 15 | birth_datetime, 16 | concept_name AS sex_at_birth 17 | FROM 18 | `{CDR}.person` 19 | LEFT JOIN `{CDR}.concept` ON concept_id = sex_at_birth_concept_id), 20 | -- 21 | -- Retrieve the row-level data for our measurement of interest. 22 | -- 23 | measurements AS ( 24 | SELECT 25 | person_id, 26 | measurement_id, 27 | measurement_concept_id, 28 | measurement_date, 29 | measurement_datetime, 30 | measurement_type_concept_id, 31 | operator_concept_id, 32 | value_as_number, 33 | value_as_concept_id, 34 | unit_concept_id, 35 | range_low, 36 | range_high 37 | FROM 38 | `{CDR}.measurement` 39 | WHERE 40 | measurement_concept_id = {MEASUREMENT_CONCEPT_ID} 41 | AND unit_concept_id = {UNIT_CONCEPT_ID} 42 | AND person_id IN ({COHORT_QUERY})) 43 | -- 44 | -- Lastly, JOIN all this data together so that we have the birthdate, sex_at_birth and site for each measurement. 
45 | -- 46 | SELECT 47 | persons.*, 48 | src_id, 49 | measurements.* EXCEPT(person_id, measurement_id) 50 | FROM 51 | measurements 52 | LEFT JOIN 53 | persons USING (person_id) 54 | LEFT JOIN 55 | `{CDR}.measurement_ext` USING (measurement_id) 56 | ORDER BY 57 | person_id, 58 | measurement_id 59 | 60 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_age_and_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_age_and_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://r4ds.had.co.nz/data-visualisation.html 5 | 6 | 7 | options(repr.plot.height = 16, repr.plot.width = 16) 8 | 9 | # There could be many different measurements in the dataframe. By default, plot the first one. 10 | measurement_to_plot <- unique(measurement_df$standard_concept_name)[1] 11 | 12 | measurement_df %>% 13 | filter(standard_concept_name == measurement_to_plot) %>% 14 | filter(!unit_concept_name %in% c('No matching concept', 'NULL')) %>% 15 | filter(sex_at_birth != 'No matching concept') %>% 16 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 17 | mutate(age_at_measurement = year(as.period(interval(start = date_of_birth, end = measurement_datetime)))) %>% 18 | ggplot(aes(x = cut_width(age_at_measurement, width = 5, boundary = 0), y = value_as_number)) + 19 | geom_boxplot() + 20 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 2, 21 | position = position_dodge(width = 0.9), vjust = -0.8) + 22 | # scale_y_log10() + # Uncomment if the data looks skewed. 
23 | coord_flip() + 24 | facet_wrap(standard_concept_name + unit_concept_name ~ sex_at_birth, ncol = 2, scales = 'free') + 25 | xlab('age group') + 26 | labs(title = str_glue('Numeric values of measurements by age and sex_at_birth'), caption = 'Source: All Of Us Data') 27 | -------------------------------------------------------------------------------- /py/README.md: -------------------------------------------------------------------------------- 1 | # Terra widgets 2 | 3 | A python package for ipywidget-based user interfaces for performing tasks within the context of Python Jupyter notebooks running in either the Terra or All of Us workbench environments. 4 | 5 | 6 | ## Create and view HTML snapshots of notebooks 7 | 8 | The workbench takes care of saving the current version of your notebooks for you. But what if you want to know **what your notebook looked like two weeks ago?** Use `display_html_snapshots_widget()` to display a widget which can save snapshots of a notebook for later review, allowing users to track changes to results in notebooks over time. To do this, it: 9 | 10 | 1. Converts the selected notebook to an HTML file (without re-running the notebook). 11 | 1. And then copies that HTML file to a subfolder within the same workspace bucket where the notebook file is stored. 12 | 13 | Use this interface to create an HTML snapshot each time you make a major change to your notebook. You can choose notebooks from **any of your workspaces!** 14 | 15 | Implementation details: 16 | 17 | * The user interface controls are implemented using the [ipywidgets](https://ipywidgets.readthedocs.io/en/latest/) Python package. 18 | 19 | * Notebooks are converted from `.ipynb` to `.html` using [nbconvert](https://nbconvert.readthedocs.io/en/latest/).
20 | 21 | * Files are transferred back and forth from the workspace bucket using both: 22 | * [gsutil](https://cloud.google.com/storage/docs/gsutil) 23 | * [Tensorflow GFile](https://www.tensorflow.org/api_docs/python/tf/io/gfile/GFile). 24 | 25 | * The few files of code implementing this interface are preinstalled as a [Python library](https://github.com/all-of-us/workbench-snippets/blob/main/py/setup.py) on the AoU workbench. 26 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_age_and_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | 3 | most_recent_measurement_of_interest_df['age_at_measurement'] = ((pd.to_datetime(most_recent_measurement_of_interest_df['measurement_date']) 4 | - most_recent_measurement_of_interest_df['birth_datetime'].dt.tz_localize(None)).dt.days)//365.24 5 | most_recent_measurement_of_interest_df['age_group'] = pd.cut(most_recent_measurement_of_interest_df['age_at_measurement'], 6 | [-np.inf, 34.5, 49.5, 64.5, np.inf], 7 | labels=["<35", "35-49", "50-64", "65+"]) 8 | # meas_filter is a column of True and False 9 | meas_filter = most_recent_measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 10 | age_group_not_null = (most_recent_measurement_of_interest_df['age_group'].notnull()) 11 | 12 | (ggplot(most_recent_measurement_of_interest_df[meas_filter & age_group_not_null], aes(x = 'age_group', y = 'value_as_number')) + 13 | geom_boxplot() + 14 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 15 | position = position_dodge(width = 0.9), va = 'top') + 16 | # scale_y_log10() + # Uncomment if the data looks skewed.
17 | coord_flip() + 18 | facet_wrap('~ sex_at_birth', nrow = len(most_recent_measurement_of_interest_df.sex_at_birth.unique())) + 19 | xlab('age') + 20 | ylab(f'{UNIT_NAME}') + 21 | ggtitle(f'Most recent {MEASUREMENT_NAME} measurement\nper person, by age, faceted by sex_at_birth\nSource: All Of Us Data') + 22 | theme(figure_size=(12, 6))) 23 | -------------------------------------------------------------------------------- /sql-snippets/snippets_setup.R: -------------------------------------------------------------------------------- 1 | lapply(c('viridis', 'ggthemes', 'skimr'), 2 | function(pkg_name) { if(! pkg_name %in% installed.packages()) { install.packages(pkg_name)} } ) 3 | 4 | library(viridis) # A nice color scheme for plots. 5 | library(ggthemes) # Common themes to change the look and feel of plots. 6 | library(scales) # Graphical scales map data to aesthetics in plots. 7 | library(skimr) # Better summaries of data. 8 | library(lubridate) # Date library from the tidyverse. 9 | library(bigrquery) # BigQuery R client. 10 | library(tidyverse) # Data wrangling packages. 11 | 12 | ## BigQuery setup. 13 | BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT') 14 | # Get the BigQuery curated dataset for the current workspace context. 15 | CDR <- Sys.getenv('WORKSPACE_CDR') 16 | 17 | ## Plot setup. 18 | theme_set(theme_bw(base_size = 14)) # Default theme for plots. 19 | 20 | #' Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 21 | #' 22 | #' @param df A data frame. 23 | #' @return A data frame with column y as max and column label as length. 24 | get_boxplot_fun_data <- function(df) { 25 | return(data.frame(y = max(df), label = stringr::str_c('N = ', length(df)))) 26 | } 27 | 28 | ## ---------------[ CHANGE THESE AS NEEDED] --------------------------------------- 29 | # Set default parameter values so that all snippets run successfully with no edits needed.
30 | COHORT_QUERY <- str_glue('SELECT person_id FROM `{CDR}.person`') # Default to all participants. 31 | MEASUREMENT_OF_INTEREST <- 'hemoglobin' 32 | # Tip: the next four parameters could be set programmatically using one row from 33 | # the result of measurements_of_interest_summary.sql 34 | MEASUREMENT_CONCEPT_ID <- 3004410 # Hemoglobin A1c 35 | UNIT_CONCEPT_ID <- 8554 # percent 36 | MEASUREMENT_NAME <- '' 37 | UNIT_NAME <- '' 38 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_by_question_concept_id.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to output a table and a graph of 2 | # participant counts by response for one question_concept_id 3 | # The snippet assumes that a dataframe containing survey questions and answers already exists 4 | # The snippet also assumes that setup has been run 5 | 6 | # Update the next 3 lines 7 | survey_df = YOUR_DATASET_NAME_survey_df 8 | question_concept_id = 1585940 9 | denominator = None # e.g: 200000 10 | 11 | #################################################################################### 12 | # DON'T CHANGE FROM HERE 13 | #################################################################################### 14 | def summarize_a_question_concept_id(df, question_concept_id, denominator=None): 15 | df = df.loc[df['question_concept_id'] == question_concept_id].copy() 16 | new_df = df.groupby(['answer_concept_id', 'answer'])['person_id']\ 17 | .nunique()\ 18 | .reset_index()\ 19 | .rename(columns=dict(person_id='n_participant'))\ 20 | .assign(answer_concept_id = lambda x: np.int32(x.answer_concept_id)) 21 | if denominator: 22 | new_df['response_rate'] = round(100*new_df['n_participant']/denominator,2) 23 | if question_concept_id in df['question_concept_id'].unique(): 24 | print(f"Distribution of response to {df.loc[df['question_concept_id'] == question_concept_id, 
'question'].unique()[0]}") 25 | # show table 26 | display(new_df) 27 | # show graph 28 | display(ggplot(data=new_df) + 29 | geom_bar(aes(x='answer', y='n_participant'), stat='identity') + 30 | coord_flip() + 31 | labs(y="Participant count", x="") + 32 | theme_bw()) 33 | else: 34 | print("There is an error with your question_concept_id") 35 | 36 | summarize_a_question_concept_id(survey_df, question_concept_id, denominator) 37 | 38 | 39 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_by_question_concept_id.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to output a table and a graph of 2 | # participant counts by response for one question_concept_id 3 | # The snippet assumes that a dataframe containing survey questions and answers already exists 4 | # The snippet also assumes that setup has been run 5 | 6 | # Update the next 3 lines 7 | survey_df <- YOUR_DATASET_NAME_survey_df 8 | question_concept_id <- 1585940 9 | denominator <- NULL 10 | 11 | #################################################################################### 12 | # DON'T CHANGE FROM HERE 13 | #################################################################################### 14 | summarize_a_question_concept_id <- function(df, q_concept_id, denominator=NULL){ 15 | df <- df %>% 16 | mutate(question_concept_id = as.numeric(question_concept_id)) %>% 17 | filter(question_concept_id == q_concept_id) 18 | 19 | new_df <- df %>% group_by(answer_concept_id, answer) %>% 20 | summarize(n_participant = n_distinct(person_id)) %>% 21 | ungroup() %>% 22 | mutate(answer_concept_id = as.integer(answer_concept_id)) 23 | if (!is.null(denominator)) { 24 | new_df <- new_df %>% mutate(response_rate = paste0(round(100*n_participant/denominator,2),'%')) 25 | } 26 | 27 | if (q_concept_id %in% as.vector(unique(df[['question_concept_id']]))){ 28 | question_name <- 
as.vector(unique(df$question)) 29 | print(str_glue("Distribution of response to {question_name}")) 30 | 31 | # show table 32 | print(new_df) 33 | 34 | # show graph 35 | options(repr.plot.width=12, repr.plot.height=6) 36 | ggplot(new_df) + 37 | geom_bar(aes(x=answer, y=n_participant), stat='identity') + 38 | coord_flip() + 39 | labs(y="Participant count", x="") + 40 | theme_bw() 41 | } 42 | else { 43 | print("There is an error with your question_concept_id") 44 | } 45 | } 46 | 47 | summarize_a_question_concept_id(survey_df, question_concept_id, denominator) 48 | 49 | 50 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_age_and_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_age_and_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://plotnine.readthedocs.io/en/stable/ 5 | 6 | 7 | # There could be many different measurements in the dataframe. By default, plot the first one. 8 | measurement_to_plot = measurement_df.standard_concept_name.unique()[0] 9 | 10 | # Create a derived variable for age group. 
11 | measurement_df['age_at_measurement'] = ((measurement_df['measurement_datetime'].dt.tz_localize(None) 12 | - measurement_df['date_of_birth'].dt.tz_localize(None)).dt.days)//365.24 13 | measurement_df['age_group'] = pd.cut(measurement_df['age_at_measurement'], 14 | [-np.inf, 34.5, 49.5, 64.5, np.inf], 15 | labels=["<35", "35-49", "50-64", "65+"]) 16 | 17 | # meas_filter is a column of True and False 18 | meas_filter = ((measurement_df.standard_concept_name == measurement_to_plot) 19 | & (measurement_df.unit_concept_name != 'No matching concept') 20 | & (measurement_df.unit_concept_name.notna()) 21 | & (measurement_df.sex_at_birth != 'No matching concept') 22 | & (measurement_df.value_as_number < 9999999) 23 | & (measurement_df['age_at_measurement'].notnull()) # Get rid of nonsensical outliers. 24 | ) 25 | 26 | (ggplot(measurement_df[meas_filter], aes(x = 'age_group', y = 'value_as_number')) + 27 | geom_boxplot() + 28 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 29 | position = position_dodge(width = 0.9), va = 'top') + 30 | # scale_y_log10() + # Uncomment if the data looks skewed. 31 | coord_flip() + 32 | facet_wrap(['standard_concept_name + ": " + unit_concept_name', 'sex_at_birth'], ncol = 2, scales = 'free') + 33 | xlab('age group') + 34 | ggtitle('Numeric values of measurements by age and sex_at_birth\nSource: All Of Us Data') + 35 | theme(figure_size = (12, 12), panel_spacing = .5)) 36 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Return row level data for a measurement, limited to only the most recent result per person in our cohort. 
3 | -- 4 | -- PARAMETERS: 5 | -- MEASUREMENT_CONCEPT_ID: for example 3004410 # Hemoglobin A1c 6 | -- UNIT_CONCEPT_ID: for example 8554 # percent 7 | 8 | WITH 9 | -- 10 | -- Retrieve participants birthdate and sex_at_birth. 11 | -- 12 | persons AS ( 13 | SELECT 14 | person_id, 15 | birth_datetime, 16 | concept_name AS sex_at_birth 17 | FROM 18 | `{CDR}.person` 19 | LEFT JOIN `{CDR}.concept` ON concept_id = sex_at_birth_concept_id), 20 | -- 21 | -- Retrieve the row-level data for our measurement of interest. Also compute 22 | -- a new column for the recency rank of the measurement per person, a rank 23 | -- of 1 being the most recent lab result for that person. 24 | -- 25 | measurements AS ( 26 | SELECT 27 | person_id, 28 | measurement_id, 29 | measurement_concept_id, 30 | unit_concept_id, 31 | measurement_date, 32 | measurement_datetime, 33 | measurement_type_concept_id, 34 | operator_concept_id, 35 | value_as_number, 36 | value_as_concept_id, 37 | range_low, 38 | range_high, 39 | ROW_NUMBER() OVER (PARTITION BY person_id 40 | ORDER BY measurement_date DESC, 41 | measurement_datetime DESC, 42 | measurement_id DESC) AS recency_rank 43 | 44 | FROM 45 | `{CDR}.measurement` 46 | WHERE 47 | measurement_concept_id = {MEASUREMENT_CONCEPT_ID} 48 | AND unit_concept_id = {UNIT_CONCEPT_ID} 49 | AND person_id IN ({COHORT_QUERY})) 50 | -- 51 | -- Lastly, JOIN all this data together so that we have the birthdate, sex_at_birth and site for each 52 | -- measurement, retaining only the most recent result per person.
53 | -- 54 | SELECT 55 | persons.*, 56 | src_id, 57 | measurements.* EXCEPT(person_id, measurement_id, recency_rank) 58 | FROM 59 | measurements 60 | LEFT JOIN 61 | persons USING (person_id) 62 | LEFT JOIN 63 | `{CDR}.measurement_ext` USING (measurement_id) 64 | WHERE 65 | recency_rank = 1 66 | ORDER BY 67 | person_id, 68 | measurement_id 69 | 70 | -------------------------------------------------------------------------------- /sql-snippets/measurements_of_interest_summary.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute summary information for our measurements of interest for our cohort. 3 | -- 4 | -- PARAMETERS: 5 | -- MEASUREMENT_OF_INTEREST: a case-insensitive string, such as "hemoglobin", to be compared 6 | -- to all measurement concept names to identify those of interest 7 | 8 | WITH 9 | -- 10 | -- Use a case insensitive string to search the measurement concept names of those 11 | -- measurements we do have in the measurements table. 12 | -- 13 | labs_of_interest AS ( 14 | SELECT 15 | measurement_concept_id, 16 | measurement_concept.concept_name AS measurement_name, 17 | unit_concept_id, 18 | unit_concept.concept_name AS unit_name 19 | FROM 20 | `{CDR}.measurement` 21 | LEFT JOIN `{CDR}.concept` AS measurement_concept 22 | ON measurement_concept.concept_id = measurement_concept_id 23 | LEFT JOIN `{CDR}.concept` AS unit_concept 24 | ON unit_concept.concept_id = unit_concept_id 25 | WHERE 26 | REGEXP_CONTAINS(measurement_concept.concept_name, r"(?i){MEASUREMENT_OF_INTEREST}") 27 | GROUP BY 28 | measurement_concept_id, 29 | unit_concept_id, 30 | measurement_concept.concept_name, 31 | unit_concept.concept_name 32 | ) 33 | -- 34 | -- Summarize the information about each measurement concept of interest that our 35 | -- prior query identified. 
36 | -- 37 | SELECT 38 | measurement_name AS measurement, 39 | IFNULL(unit_name, "NA") AS unit, 40 | COUNT(1) AS N, 41 | COUNTIF(value_as_number IS NULL 42 | AND (value_as_concept_id IS NULL 43 | OR value_as_concept_id = 0)) AS missing, 44 | MIN(value_as_number) AS min, 45 | MAX(value_as_number) AS max, 46 | AVG(value_as_number) AS avg, 47 | STDDEV(value_as_number) AS stddev, 48 | APPROX_QUANTILES(value_as_number, 4) AS quantiles, 49 | COUNTIF(value_as_number IS NOT NULL) AS num_numeric_values, 50 | COUNTIF(value_as_concept_id IS NOT NULL 51 | AND value_as_concept_id != 0) AS num_concept_values, 52 | COUNTIF(operator_concept_id IS NOT NULL) AS num_operators, 53 | IF(src_id = "PPI/PM", "PPI", "EHR") AS measurement_source, 54 | measurement_concept_id, 55 | unit_concept_id 56 | FROM 57 | `{CDR}.measurement` 58 | INNER JOIN 59 | labs_of_interest USING(measurement_concept_id, unit_concept_id) 60 | LEFT JOIN 61 | `{CDR}.measurement_ext` USING(measurement_id) 62 | WHERE 63 | person_id IN ({COHORT_QUERY}) 64 | GROUP BY 65 | measurement_concept_id, 66 | measurement_name, 67 | measurement_source, 68 | unit_concept_id, 69 | unit_name 70 | ORDER BY 71 | N DESC 72 | 73 | -------------------------------------------------------------------------------- /sql-snippets/measurements_of_interest_summary_test.py: -------------------------------------------------------------------------------- 1 | """Tests for query measurements_of_interest_summary.sql. 2 | 3 | See https://github.com/verilylifesciences/analysis-py-utils for more details 4 | about the testing framework. 
5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from ddt import ddt 12 | import os 13 | import unittest 14 | from verily.bigquery_wrapper import bq_test_case 15 | 16 | SQL_TEMPLATE = "measurements_of_interest_summary.sql" 17 | 18 | 19 | @ddt 20 | class QueryTest(bq_test_case.BQTestCase): 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | """Set up class.""" 25 | super(QueryTest, cls).setUpClass(use_mocks=False) 26 | cls.sql_to_test = open( 27 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 28 | SQL_TEMPLATE), "r").read() 29 | 30 | @classmethod 31 | def create_mock_tables(cls): 32 | """Create mock tables.""" 33 | 34 | cls.client.create_table_from_query(""" 35 | SELECT * FROM UNNEST([ 36 | STRUCT 39 | (1001, '1990-12-31 00:00:00 UTC', 501), 40 | (1002, '1950-08-01 00:00:00 UTC', 500), 41 | (1003, '1965-06-30 00:00:00 UTC', 500) 42 | ]) 43 | """, cls.client.path("person")) 44 | 45 | cls.client.create_table_from_query(""" 46 | SELECT * FROM UNNEST([ 47 | STRUCT 50 | (123, 'Hemoglobin', 'LOINC'), 51 | (456, 'gram per deciliter', 'UCUM') 52 | ]) 53 | """, cls.client.path("concept")) 54 | 55 | cls.client.create_table_from_query(""" 56 | SELECT * FROM UNNEST([ 57 | STRUCT 59 | (1, 'EHR site1'), 60 | (2, 'EHR site1'), 61 | (3, 'EHR site1'), 62 | (4, 'EHR site2'), 63 | (5, 'EHR site2'), 64 | (6, 'PPI/PM') 65 | ]) 66 | """, cls.client.path("measurement_ext")) 67 | 68 | cls.client.create_table_from_query(""" 69 | SELECT * FROM UNNEST([ 70 | STRUCT 78 | (1, 1001, 123, 123, 456, NULL, 42.0, NULL), 79 | (2, 1001, 123, 123, 456, NULL, 13.5, NULL), 80 | (3, 1002, 123, 123, 456, NULL, NULL, 100), 81 | (4, 1002, 123, 123, 456, NULL, NULL, NULL), 82 | (5, 1002, 123, 123, 456, 789, 7.2, NULL), 83 | # This measurement is for someone not in our cohort. 
84 | (6, 1003, 123, 123, 456, NULL, 500, NULL) 85 | ]) 86 | """, cls.client.path("measurement")) 87 | 88 | # Get the project id and dataset name where the temp tables are stored. 89 | (project_id, dataset_id, _) = cls.client.parse_table_path( 90 | cls.client.path("any_temp_table")) 91 | cls.src_dataset = ".".join([project_id, dataset_id]) 92 | 93 | def test(self): 94 | sql = self.sql_to_test.format( 95 | CDR=self.src_dataset, 96 | COHORT_QUERY="SELECT person_id FROM `{}.person` WHERE person_id <= 1002".format(self.src_dataset), 97 | MEASUREMENT_OF_INTEREST="hemoglobin") 98 | 99 | expected = [ 100 | # measurement unit N missing min max avg stddev quantiles num_numeric_values num_concept_values num_operators measurement_source measurement_concept_id unit_concept_id 101 | ("Hemoglobin", "gram per deciliter", 5, 1, 7.2, 42.0, 20.9, 18.542653531789888, [7.2, 7.2, 13.5, 42.0, 42.0], 3, 1, 1, "EHR", 123, 456) 102 | ] 103 | self.expect_query_result(query=sql, expected=expected) 104 | 105 | if __name__ == "__main__": 106 | unittest.main() 107 | 108 | -------------------------------------------------------------------------------- /storage-snippets/README.md: -------------------------------------------------------------------------------- 1 | # Cloud Storage snippets 2 | 3 | The snippets in this subdirectory are for workbench users who directly use the workspace bucket. 4 | 5 | # Get setup for GitHub 6 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#get-setup-for-github) for the details. If you are new to `git`, please see the example commands there. 7 | 8 | # How to contribute a snippet to the Cloud Storage snippets menu group 9 | 10 | 1. Write your snippet of code in your preferred language, R or Python. 11 | * Try to make your snippet consistent with other snippets in this collection. 12 | * For data wrangling, use [dplyr](https://dplyr.tidyverse.org/) for R and [pandas](https://pandas.pydata.org/) for Python.
   * Choose a good prefix and suffix for your snippet file name.
     * See the names of the other files for examples.
     * The file name helps users decide whether the snippet will be useful to them.
   * Put some comments at the top of your snippet to explain its purpose and any assumptions.
1. After you are happy with your new snippet, port it to the other language or file a GitHub issue asking for help from someone else to do this.
1. If your snippet has any inputs or parameters, add default values for those parameters to both [`snippets_setup.R`](./snippets_setup.R) and [`snippets_setup.py`](./snippets_setup.py) so that your snippet will work as-is.
1. Update [r_gcs_snippets_menu_config.yml](../build/r_gcs_snippets_menu_config.yml) and [py_gcs_snippets_menu_config.yml](../build/py_gcs_snippets_menu_config.yml) to add your snippet wherever you would like it to be displayed within the menu.
1. Send your pull request!

Don't like these conventions? We can change them! This is just a starting point. Keep in mind we'll need to reflect those changes in the auto-generation script described in the next section.

# Auto-generation of Jupyter 'Snippets Menu' configuration

The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#auto-generation-of-jupyter-snippets-menu-configuration) for the details.

# Testing

## Snippet tests
To test individual snippets, the best thing to do is copy and paste them into a notebook on the workbench.

## Integration 'smoke tests'

If the smoke tests are run from the workbench environment and there are no obvious bugs in the snippets, they will run start-to-finish without error. (This won't necessarily catch all bugs, but it's a good start.)
36 | 37 | * The script to auto-generate the Jupyter Snippets Menu configuration also emits both `r_gcs_snippets_menu_config_smoke_test.R` and `py_gcs_snippets_menu_config_smoke_test.py`. 38 | * Those scripts each include, respectively, all the R Cloud Storage snippets and all the Python Cloud Storage snippets. 39 | * Additional configuration needed for the smoke tests can be defined in [r_gcs_snippets_menu_config.smoke_test_setup](../build/r_gcs_snippets_menu_config.smoke_test_setup) and [py_gcs_snippets_menu_config.smoke_test_setup](../build/py_gcs_snippets_menu_config.smoke_test_setup), respectively. Update it as needed. 40 | 41 | After opening a notebook in the production workbench environment, upload these smoke test files into Jupyter and then execute the following code from the Jupyter terminal or a Python notebook in the same directory. They will emit _"Smoke test complete!"_ when they have completed successfully. 42 | 43 | To run the R Cloud Storage snippets smoke tests: 44 | ``` 45 | %%bash 46 | 47 | Rscript r_gcs_snippets_menu_config_smoke_test.R # There will be output, but there should be no errors. 48 | ``` 49 | 50 | To run the Python Cloud Storage snippets smoke tests: 51 | ``` 52 | %%bash 53 | 54 | python3 py_gcs_snippets_menu_config_smoke_test.py # There will be output, but there should be no errors. 55 | ``` 56 | 57 | # Deployment 58 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#deployment) for the details. 
59 | -------------------------------------------------------------------------------- /py/terra_widgets/tests/test_workspace_paths.py: -------------------------------------------------------------------------------- 1 | """Tests for the WorkspacePaths class.""" 2 | 3 | import os 4 | import unittest 5 | from terra_widgets.workspace_paths import WorkspacePaths 6 | 7 | 8 | class TestWorkspacePaths(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.wp = WorkspacePaths(workspace_bucket='fc-fake-bucket') 12 | os.environ['OWNER_EMAIL'] = 'testUser@somecompany.com' 13 | 14 | def tearDown(self): 15 | os.unsetenv('OWNER_EMAIL') 16 | 17 | def test_destinations(self): 18 | notebook_paths = ['gs://fc-fake-bucket/notebooks/test1.ipynb', 19 | 'gs://fc-fake-bucket/notebooks/test2.ipynb'] 20 | destinations = self.wp.formulate_destination_paths(notebooks=notebook_paths) 21 | self.assertSetEqual(set(destinations.keys()), set(notebook_paths)) 22 | self.assertRegex( 23 | destinations[notebook_paths[0]].html_file, 24 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test1.html') 25 | self.assertRegex( 26 | destinations[notebook_paths[0]].comment_file, 27 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test1.comment.txt') 28 | self.assertRegex( 29 | destinations[notebook_paths[1]].html_file, 30 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test2.html') 31 | self.assertRegex( 32 | destinations[notebook_paths[1]].comment_file, 33 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test2.comment.txt') 34 | 35 | def test_fail_destinations(self): 36 | with self.assertRaisesRegex( 37 | ValueError, 38 | r'"gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html" does not match "gs://fc-fake-bucket/notebooks/\*\.ipynb"'): 39 | self.wp.formulate_destination_paths(notebooks=['gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html']) 40 | 41 | def 
test_glob_for_aou(self): 42 | input_path = 'gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000' 43 | expected = os.path.join(input_path, '*.html') 44 | self.assertEqual(self.wp.add_html_glob_to_path(input_path), expected) 45 | 46 | def test_glob_for_terra(self): 47 | wp = WorkspacePaths(workspace_bucket='fc-fake-bucket') 48 | input_path = 'gs://fc-fake-bucket/reports/test@somecompany.com/20200701/120000' 49 | expected = os.path.join(input_path, '*.html') 50 | self.assertEqual(wp.add_html_glob_to_path(input_path), expected) 51 | 52 | def test_glob_path_already_complete(self): 53 | # Pass a complete path to an HTML file when instead we should pass a partial path to it. 54 | with self.assertRaisesRegex(ValueError, '"gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html" does not match'): 55 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html') 56 | 57 | def test_glob_path_missing_time(self): 58 | with self.assertRaisesRegex(ValueError, 'does not match'): 59 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/') 60 | 61 | def test_glob_path_missing_date(self): 62 | with self.assertRaisesRegex(ValueError, 'does not match'): 63 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/test@researchallofus.org/120000/') 64 | 65 | def test_glob_path_missing_user(self): 66 | with self.assertRaisesRegex(ValueError, 'does not match'): 67 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/20200701/120000/') 68 | 69 | def test_glob_path_missing_report_folder(self): 70 | with self.assertRaisesRegex(ValueError, 'does not match'): 71 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/test@researchallofus.org/20200701/120000/') 72 | 73 | def test_glob_wrong_path(self): 74 | # Pass a path to a notebook when instead we should pass a partial path to a report. 
75 | with self.assertRaisesRegex(ValueError, '"gs://fc-fake-bucket/notebooks/test1.ipynb" does not match'): 76 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/notebooks/test1.ipynb') 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | 82 | -------------------------------------------------------------------------------- /dataset-snippets/README.md: -------------------------------------------------------------------------------- 1 | # Dataset Builder snippets 2 | 3 | This snippets in this subdirectory are for workbench users who use Dataset Builder to retrieve their data. 4 | 5 | # Get setup for GitHub 6 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#get-setup-for-github) for the details. If you are new to `git`, please see the example commands there. 7 | 8 | # How to contribute a snippet to the Dataset Builder snippets menu group 9 | 10 | 1. Write your snippet of code in your preferred language, R or Python. 11 | * Try to make your snippet consistent with other snippets in this collection. 12 | * For data wrangling, use [dplyr](https://dplyr.tidyverse.org/) for R and [pandas](https://pandas.pydata.org/) for Python. 13 | * For static plots, use [ggplot2](https://ggplot2.tidyverse.org/) for R and [plotnine](https://plotnine.readthedocs.io/en/stable/) for Python. 14 | * Choose a good prefix and suffix for your snippet file name. 15 | * See the names of the other files for examples. 16 | * The file name helps users decide whether the snippet will be useful to them. 17 | * Put some comments at the top of your snippet to explain its purpose and any assumptions. 18 | 1. After you are happy with your new snippet, port it to the other language or file a GitHub issue asking for help from someone else to do this. 19 | 1. 
If your snippet has any inputs or parameters other than a dataframe created by Dataset Builder, add default values for those parameters to both [`snippets_setup.R`](./snippets_setup.R) and [`snippets_setup.py`](./snippets_setup.py) so that your snippet will work as-is.
1. Update [r_dataset_snippets_menu_config.yml](../build/r_dataset_snippets_menu_config.yml) and [py_dataset_snippets_menu_config.yml](../build/py_dataset_snippets_menu_config.yml) to add your snippet wherever you would like it to be displayed within the menu.
1. Send your pull request!

Don't like these conventions? We can change them! This is just a starting point. Keep in mind we'll need to reflect those changes in the auto-generation script described in the next section.

# Auto-generation of Jupyter 'Snippets Menu' configuration

The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#auto-generation-of-jupyter-snippets-menu-configuration) for the details.

# Testing

## Snippet tests
To test individual snippets such as plots, the best thing to do is copy and paste them into a notebook on the workbench.

## Integration 'smoke tests'

If the smoke tests are run from the workbench environment and there are no obvious bugs in the snippets, they will run start-to-finish without error. (This won't necessarily catch all bugs, but it's a good start.)

* The script to auto-generate the Jupyter Snippets Menu configuration also emits both `r_dataset_snippets_menu_config_smoke_test.R` and `py_dataset_snippets_menu_config_smoke_test.py`.
* Those scripts each include, respectively, all the R Dataset Builder snippets and all the Python Dataset Builder snippets.
40 | * The Dataset from Dataset Builder is defined in [r_dataset_snippets_menu_config.smoke_test_setup](../build/r_dataset_snippets_menu_config.smoke_test_setup) and [py_dataset_snippets_menu_config.smoke_test_setup](../build/py_dataset_snippets_menu_config.smoke_test_setup), respectively. Update it as needed. 41 | 42 | After opening a notebook in the production workbench environment, upload these smoke test files into Jupyter and then execute the following code from the Jupyter terminal or a Python notebook in the same directory. They will emit _"Smoke test complete!"_ when they have completed successfully. 43 | 44 | To run the R Dataset Builder snippets smoke tests: 45 | ``` 46 | %%bash 47 | 48 | Rscript r_dataset_snippets_menu_config_smoke_test.R # There will be output, but there should be no errors. 49 | ``` 50 | 51 | To run the Python Dataset Builder snippets smoke tests: 52 | ``` 53 | %%bash 54 | 55 | # Any notebook '!' commands won't work in this context. Comment them out and run them explicitly first. 56 | perl -i -pe 's/!pip/#!pip/g' py_dataset_snippets_menu_config_smoke_test.py 57 | pip3 install --user pandas_profiling 58 | 59 | python3 py_dataset_snippets_menu_config_smoke_test.py # There will be output, but there should be no errors. 60 | ``` 61 | 62 | # Deployment 63 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#deployment) for the details. 

--------------------------------------------------------------------------------
/py/terra_widgets/workspace_metadata.py:
--------------------------------------------------------------------------------
"""Methods to obtain workspace metadata for the current user in various formats."""

import json
import os
from typing import Dict

import firecloud.api as fapi
from IPython import get_ipython


class WorkspaceMetadata:
    """Encapsulate all logic for obtaining workspace metadata."""

    # Email domain that identifies All of Us (AoU) researcher accounts.
    AOU_DOMAIN = '@researchallofus.org'
    # Terra access levels that grant edit rights on a workspace.
    EDIT_ACCESS_LEVELS = ['WRITER', 'OWNER', 'PROJECT_OWNER']
    # Default AoU production workspaces endpoint; used when RW_API_BASE_URL is unset.
    AOU_PROD_API = 'https://api.workbench.researchallofus.org/v1/workspaces'

    def __init__(self):
        # NOTE(review): os.getenv returns None when OWNER_EMAIL is unset, which
        # would make the endswith() call below raise AttributeError — confirm the
        # runtime environment always provides OWNER_EMAIL.
        self.user = os.getenv('OWNER_EMAIL')
        self.terra_workspaces = fapi.list_workspaces().json()
        if self.user.endswith(self.AOU_DOMAIN):
            aou_api = os.getenv('RW_API_BASE_URL')
            if not aou_api:
                aou_api = self.AOU_PROD_API
            # Use the All of Us API to get the human-readable workspace names. For All of Us
            # workspaces, in the Terra workspace metadata the workspace names are actually
            # the AoU workspace ids.
            aou_response = get_ipython().getoutput(f'''curl -H "Content-Type: application/json" \
                -H "Authorization: Bearer $(gcloud auth print-access-token)" \
                "{aou_api}" 2>/dev/null | jq .''')
            self.aou_workspaces = json.loads(''.join(aou_response))['items']
        else:
            # Non-AoU (plain Terra) user: only the Terra metadata is available.
            self.aou_workspaces = None

    def get_workspace_name_to_id_mapping(self, include_private_readonly: bool = False, include_all: bool = False) -> Dict[str, str]:
        """Retrieve a mapping of workspace names to ids.

        Args:
          include_private_readonly: whether to include private workspaces for which the current user only has read access.
          include_all: whether to include all workspaces visible to the user
        Returns:
          A dictionary of workspace names to workspace ids.
        """
        if self.aou_workspaces:
            return {ws['workspace']['name']: ws['workspace']['id'] for ws in self.aou_workspaces
                    if include_all
                    or (include_private_readonly and not ws['workspace']['published'])
                    or ws['accessLevel'] in self.EDIT_ACCESS_LEVELS}
        else:
            return {ws['workspace']['name']: ws['workspace']['workspaceId'] for ws in self.terra_workspaces
                    if include_all
                    or (include_private_readonly and not ws['public'])
                    or ws['accessLevel'] in self.EDIT_ACCESS_LEVELS}

    def get_workspace_name_to_bucket_mapping(self, include_private_readonly: bool = False, include_all: bool = False) -> Dict[str, str]:
        """Retrieve a mapping of workspace names to Cloud Storage bucket names.

        Args:
          include_private_readonly: whether to include private workspaces for which the current user only has read access.
          include_all: whether to include all workspaces visible to the user
        Returns:
          A dictionary of workspace names to workspace bucket names.
        """
        ws_mapping = self.get_workspace_name_to_id_mapping(include_private_readonly=include_private_readonly,
                                                           include_all=include_all)
        if self.aou_workspaces:
            # For All of Us workspaces, in the Terra workspace metadata the workspace names
            # are actually the AoU workspace ids.
            terra_ws_names = ws_mapping.values()
        else:
            terra_ws_names = ws_mapping.keys()
        return {ws['workspace']['name']: ws['workspace']['bucketName'] for ws in self.terra_workspaces
                if ws['workspace']['name'] in terra_ws_names}

    def get_workspace_id_to_bucket_mapping(self, include_private_readonly: bool = False, include_all: bool = False) -> Dict[str, str]:
        """Retrieve a mapping of workspace ids to Cloud Storage bucket names.

        Args:
          include_private_readonly: whether to include private workspaces for which the current user only has read access.
          include_all: whether to include all workspaces visible to the user
        Returns:
          A dictionary of workspace ids to workspace bucket names.
        """
        ws_mapping = self.get_workspace_name_to_id_mapping(include_private_readonly=include_private_readonly,
                                                           include_all=include_all)
        if self.aou_workspaces:
            # For All of Us workspaces, in the Terra workspace metadata the workspace names
            # are actually the AoU workspace ids.
            terra_metadata_key = 'name'
        else:
            terra_metadata_key = 'workspaceId'
        return {ws['workspace'][terra_metadata_key]: ws['workspace']['bucketName'] for ws in self.terra_workspaces
                if ws['workspace'][terra_metadata_key] in ws_mapping.values()}

--------------------------------------------------------------------------------
/sql-snippets/most_recent_measurement_of_interest_test.py:
--------------------------------------------------------------------------------
"""Tests for query most_recent_measurement_of_interest.sql.

See https://github.com/verilylifesciences/analysis-py-utils for more details
about the testing framework.
5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from datetime import date 12 | from datetime import datetime 13 | from dateutil import tz 14 | from ddt import ddt 15 | import os 16 | import unittest 17 | from verily.bigquery_wrapper import bq_test_case 18 | 19 | SQL_TEMPLATE = "most_recent_measurement_of_interest.sql" 20 | 21 | 22 | @ddt 23 | class QueryTest(bq_test_case.BQTestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | """Set up class.""" 28 | super(QueryTest, cls).setUpClass(use_mocks=False) 29 | cls.sql_to_test = open( 30 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 31 | SQL_TEMPLATE), "r").read() 32 | 33 | @classmethod 34 | def create_mock_tables(cls): 35 | """Create mock tables.""" 36 | 37 | cls.client.create_table_from_query(""" 38 | SELECT * FROM UNNEST([ 39 | STRUCT 42 | (1001, '1990-12-31 00:00:00 UTC', 501), 43 | (1002, '1950-08-01 00:00:00 UTC', 500), 44 | (1003, '1965-06-30 00:00:00 UTC', 500) 45 | ]) 46 | """, cls.client.path("person")) 47 | 48 | cls.client.create_table_from_query(""" 49 | SELECT * FROM UNNEST([ 50 | STRUCT 52 | ( 0, 'No matching concept'), 53 | (123, 'Hemoglobin'), 54 | (456, 'gram per deciliter'), 55 | (500, 'FEMALE'), 56 | (501, 'MALE') 57 | ]) 58 | """, cls.client.path("concept")) 59 | 60 | cls.client.create_table_from_query(""" 61 | SELECT * FROM UNNEST([ 62 | STRUCT 64 | (1, 'EHR site1'), 65 | (2, 'EHR site1'), 66 | (3, 'PPI/PM'), 67 | (4, 'EHR site2'), 68 | (5, 'EHR site2'), 69 | (6, 'EHR site2') 70 | ]) 71 | """, cls.client.path("measurement_ext")) 72 | 73 | cls.client.create_table_from_query(""" 74 | SELECT * FROM UNNEST([ 75 | STRUCT 87 | (1, 1001, 123, 456, NULL, '2005-12-31', '2005-12-31 10:30:00 UTC', NULL, 42.0, NULL, 0, 999), 88 | (2, 1001, 123, 456, NULL, '2007-09-11', '2007-09-11 08:00:00 UTC', NULL, 13.5, NULL, 0, 999), 89 | (3, 1001, 123, 456, NULL, '2007-09-11', '2007-09-11 20:59:00 UTC', 
NULL, NULL, 100, 0, 999), 90 | (4, 1002, 123, 456, NULL, '2008-02-10', '2008-02-10 23:30:00 UTC', NULL, NULL, NULL, 0, 999), 91 | (5, 1002, 123, 456, 789, '2008-02-10', '2008-02-10 23:30:00 UTC', NULL, 7.2, NULL, 0, 999), 92 | # This measurement is for someone not in our cohort. 93 | (6, 1003, 123, 456, 789, '2010-01-01', '2010-10-01 23:30:00 UTC', NULL, 500, NULL, 0, 999) 94 | ]) 95 | """, cls.client.path("measurement")) 96 | 97 | # Get the project id and dataset name where the temp tables are stored. 98 | (project_id, dataset_id, _) = cls.client.parse_table_path( 99 | cls.client.path("any_temp_table")) 100 | cls.src_dataset = ".".join([project_id, dataset_id]) 101 | 102 | def test(self): 103 | sql = self.sql_to_test.format( 104 | CDR=self.src_dataset, 105 | COHORT_QUERY="SELECT person_id FROM `{}.person` WHERE person_id <= 1002".format(self.src_dataset), 106 | MEASUREMENT_CONCEPT_ID=123, 107 | UNIT_CONCEPT_ID=456) 108 | 109 | expected = [ 110 | # person_id birth_datetime sex_at_birth src_id measurement_concept_id unit_concept_id measurement_date measurement_datetime measurement_type_concept_id operator_concept_id value_as_number value_as_concept_id range_low range_high 111 | (1001, datetime(1990, 12, 31, 0, 0, tzinfo=tz.gettz("UTC")), "MALE", "PPI/PM", 123, 456, date(2007, 9, 11), datetime(2007, 9, 11, 20, 59, tzinfo=tz.gettz("UTC")), None, None, None, 100, 0, 999), 112 | (1002, datetime(1950, 8, 1, 0, 0, tzinfo=tz.gettz("UTC")), "FEMALE", "EHR site2", 123, 456, date(2008, 2, 10), datetime(2008, 2, 10, 23, 30, tzinfo=tz.gettz("UTC")), None, 789, 7.2, None, 0, 999) 113 | ] 114 | self.expect_query_result(query=sql, expected=expected) 115 | 116 | if __name__ == "__main__": 117 | unittest.main() 118 | 119 | -------------------------------------------------------------------------------- /py/terra_widgets/workspace_paths.py: -------------------------------------------------------------------------------- 1 | """Methods to obtains paths to files within the workspace 
bucket.""" 2 | 3 | import datetime 4 | import fnmatch 5 | import os 6 | from typing import Dict 7 | from typing import List 8 | from typing import NamedTuple 9 | 10 | 11 | WorkspaceDestination = NamedTuple('WorkspaceDestination', [('html_file', str), ('comment_file', str)]) 12 | 13 | 14 | class WorkspacePaths: 15 | """Encapsulate all logic for manipulating workspace paths. 16 | 17 | Paths are of the form: 18 | gs:///reports///