├── py ├── terra_widgets │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_workspace_paths.py │ ├── workspace_metadata.py │ ├── workspace_paths.py │ └── html_snapshots.py ├── requirements.txt ├── setup.py ├── README.md └── py_cromwell_setup.py ├── storage-snippets ├── snippets_setup.R ├── snippets_setup.py ├── interact_with_html_snapshots.py ├── list_objects_in_bucket.R ├── list_objects_in_bucket.py ├── copy_file_from_workspace_bucket.R ├── copy_file_from_workspace_bucket.py ├── copy_data_to_workspace_bucket.py ├── copy_data_to_workspace_bucket.R └── README.md ├── .gitignore ├── sql-snippets ├── total_number_of_participants.sql ├── number_of_participants_with_measurements.sql ├── number_of_participants_with_med_conditions.sql ├── measurement_of_interest_by_site.ggplot ├── measurement_of_interest_by_site.plotnine ├── measurement_of_interest_by_sex_at_birth.plotnine ├── most_recent_measurement_of_interest_by_site.ggplot ├── measurement_of_interest_by_sex_at_birth.ggplot ├── most_recent_measurement_of_interest_by_sex_at_birth.plotnine ├── most_recent_measurement_of_interest_by_site.plotnine ├── most_recent_measurement_of_interest_by_sex_at_birth.ggplot ├── measurement_of_interest_by_age_and_sex_at_birth.ggplot ├── most_recent_measurement_of_interest_by_age_and_sex_at_birth.ggplot ├── snippets_setup.py ├── measurement_of_interest_by_age_and_sex_at_birth.plotnine ├── measurement_of_interest.sql ├── most_recent_measurement_of_interest_by_age_and_sex_at_birth.plotnine ├── snippets_setup.R ├── most_recent_measurement_of_interest.sql ├── measurements_of_interest_summary.sql ├── measurements_of_interest_summary_test.py ├── most_recent_measurement_of_interest_test.py ├── measurement_of_interest_test.py └── README.md ├── dataset-snippets ├── summarize_a_dataframe.R ├── summarize_a_dataframe.py ├── add_age_to_demographics.py ├── join_dataframes.py ├── add_age_to_demographics.R ├── snippets_setup.py ├── join_dataframes.R ├── snippets_setup.R ├── 
summarize_a_survey_module.R ├── summarize_a_survey_module.py ├── measurement_by_sex_at_birth.ggplot ├── measurement_by_sex_at_birth.plotnine ├── measurement_by_age_and_sex_at_birth.ggplot ├── summarize_a_survey_by_question_concept_id.py ├── summarize_a_survey_by_question_concept_id.R ├── measurement_by_age_and_sex_at_birth.plotnine └── README.md ├── .github └── PULL_REQUEST_TEMPLATE.md ├── README.md ├── LICENSE.txt ├── r └── r_cromwell_setup.R └── CONTRIBUTING.md /py/terra_widgets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /py/terra_widgets/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /py/requirements.txt: -------------------------------------------------------------------------------- 1 | firecloud 2 | ipython 3 | ipywidgets 4 | multiprocess 5 | pandas 6 | -------------------------------------------------------------------------------- /storage-snippets/snippets_setup.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) # Data wrangling packages. 
2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *smoke_test.R 6 | *smoke_test.py 7 | *.json 8 | *.html 9 | -------------------------------------------------------------------------------- /storage-snippets/snippets_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | -------------------------------------------------------------------------------- /storage-snippets/interact_with_html_snapshots.py: -------------------------------------------------------------------------------- 1 | from terra_widgets.html_snapshots import display_html_snapshots_widget 2 | 3 | # This will display a user interface to interact with HTML snapshots stored in the workspace bucket. 4 | display_html_snapshots_widget() 5 | -------------------------------------------------------------------------------- /sql-snippets/total_number_of_participants.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute the count of unique participants in our All of Us cohort. 
3 | SELECT 4 | COUNT(DISTINCT person_id) AS total_number_of_participants 5 | FROM 6 | `{CDR}.person` 7 | WHERE 8 | person_id IN ({COHORT_QUERY}) 9 | -------------------------------------------------------------------------------- /storage-snippets/list_objects_in_bucket.R: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code lists objects in your Google Bucket 4 | 5 | # Get the bucket name 6 | my_bucket <- Sys.getenv('WORKSPACE_BUCKET') 7 | 8 | # List objects in the bucket 9 | system(paste0("gsutil ls -r ", my_bucket), intern=T) 10 | 11 | 12 | -------------------------------------------------------------------------------- /sql-snippets/number_of_participants_with_measurements.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute the count of unique participants in our All of Us cohort 3 | -- that have at least one measurement. 4 | SELECT 5 | COUNT(DISTINCT person_id) AS number_of_participants_with_measurements 6 | FROM 7 | `{CDR}.measurement` 8 | WHERE 9 | person_id IN ({COHORT_QUERY}) 10 | -------------------------------------------------------------------------------- /sql-snippets/number_of_participants_with_med_conditions.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute the count of unique participants in our All of Us cohort 3 | -- that have at least one condition. 
4 | SELECT 5 | COUNT(DISTINCT person_id) AS number_of_participants_with_med_conditions 6 | FROM 7 | `{CDR}.condition_occurrence` 8 | WHERE 9 | person_id IN ({COHORT_QUERY}) 10 | -------------------------------------------------------------------------------- /storage-snippets/list_objects_in_bucket.py: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code lists objects in your Google Bucket 4 | 5 | # Get the bucket name 6 | my_bucket = os.getenv('WORKSPACE_BUCKET') 7 | 8 | # List objects in the bucket 9 | print(subprocess.check_output(f"gsutil ls -r {my_bucket}", shell=True).decode('utf-8')) 10 | 11 | 12 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_dataframe.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_dataframe' to display summary statistics for a dataframe. 2 | # It assumes snippet 'Setup' has been executed. 3 | # See also https://www.rdocumentation.org/packages/skimr/versions/1.0.7/topics/skim 4 | 5 | 6 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 7 | print(skim(YOUR_DATASET_NAME_person_df)) 8 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | \ 2 | 3 | Unfortunately we don't have automated testing configured for the code in this 4 | repository yet so we set up this checklist as an *automatic reminder*: 5 | 6 | - [ ] Ensure that the smoke tests pass using the current (or upcoming) CDR 7 | - [ ] Update documentation relevant to this pull request 8 | 9 | Questions? See [CONTRIBUTING.md](https://github.com/all-of-us/workbench-snippets/blob/master/CONTRIBUTING.md) 10 | or file an issue so that we can get it documented! 
11 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_dataframe.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_dataframe' to display summary statistics for a dataframe. 2 | # It assumes snippet 'Setup' has been executed. 3 | # See also https://towardsdatascience.com/exploring-your-data-with-just-1-line-of-python-4b35ce21a82d 4 | 5 | 6 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 7 | YOUR_DATASET_NAME_person_df.loc[:10000,:].profile_report() # Examine up to the first 10,000 rows. Larger 8 | # dataframes can be profiled, but it takes more time. 9 | -------------------------------------------------------------------------------- /dataset-snippets/add_age_to_demographics.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'add_age_to_demographics' to calculate the age of people in your demographics. 2 | # It assumes the 'Setup' snippet has been executed. 3 | # It also assumes that you got your demographics dataframe from Dataset Builder 4 | 5 | # Note: This snippet calculates current age and does not take into account whether the person is already dead 6 | 7 | 8 | ## -----[ CHANGE THE DATAFRAME NAME(S) `YOUR_DATASET_NAME_person_df` TO MATCH YOURS FROM DATASET BUILDER] ----- 9 | YOUR_DATASET_NAME_person_df['age'] = pd.to_datetime('today').year - YOUR_DATASET_NAME_person_df['date_of_birth'].dt.year 10 | -------------------------------------------------------------------------------- /dataset-snippets/join_dataframes.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'join_dataframes' to join together two dataframes. 2 | # It assumes the 'Setup' snippet has been executed. 
3 | # 4 | # In the example below, it joins Demographics '_person_df' and Measurements '_measurement_df' using 5 | # any columns they have in common, which in this case should only be 'person_id'. 6 | # 7 | # See also https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/pandas.merge.html 8 | 9 | 10 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 11 | measurement_df = pd.merge(left=YOUR_DATASET_NAME_person_df, right=YOUR_DATASET_NAME_measurement_df, how='inner') 12 | 13 | measurement_df.shape 14 | -------------------------------------------------------------------------------- /dataset-snippets/add_age_to_demographics.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'add_age_to_demographics' to calculate the age of people in your demographics. 2 | # It assumes the 'Setup' snippet has been executed. 3 | # It also assumes that you got your demographics dataframe from Dataset Builder 4 | 5 | # Note: This snippet calculates current age and does not take into account whether the person is already dead 6 | 7 | 8 | ## -----[ CHANGE THE DATAFRAME NAME(S) `YOUR_DATASET_NAME_person_df` TO MATCH YOURS FROM DATASET BUILDER] ----- 9 | YOUR_DATASET_NAME_person_df <- YOUR_DATASET_NAME_person_df %>% 10 | mutate_if(is.list, as.character) %>% 11 | mutate(age = year(today()) - year(YOUR_DATASET_NAME_person_df$date_of_birth)) 12 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_site.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 
6 | ggplot(aes(x = src_id, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | coord_flip() + 12 | ylab(str_glue('{UNIT_NAME}')) + 13 | labs(title = str_glue('All {MEASUREMENT_NAME} measurements, by site'), 14 | caption = 'Source: All Of Us Data') 15 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_site.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | 3 | # meas_filter is a column of True and False 4 | meas_filter = measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 5 | (ggplot(measurement_of_interest_df[meas_filter], aes(x = 'src_id', y = 'value_as_number')) + 6 | geom_boxplot() + 7 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 8 | position = position_dodge(width = 0.9), va = 'top') + 9 | # scale_y_log10() + # Uncomment if the data looks skewed. 10 | coord_flip() + 11 | ylab(f'{UNIT_NAME}') + 12 | ggtitle(f'All {MEASUREMENT_NAME} measurements, by site\nSource: All Of Us Data') + 13 | theme(figure_size=(12, 6))) 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Workbench snippets 2 | 3 | ## How to use the workbench snippets 4 | 5 | Please see the welcome page for the [All of Us Researcher Workbench](https://workbench.researchallofus.org/). It has both a tutorial video and several articles in user support documentation. 6 | 7 | ## How to add/update a workbench snippet 8 | 9 | First see [CONTRIBUTING](./CONTRIBUTING.md) for general getting started instructions. 
10 | 11 | If you want to add/modify a snippet that uses a dataframe from Dataset Builder as its input, then see [dataset-snippets/README](./dataset-snippets/README.md). 12 | 13 | Otherwise, see the other snippets collections such as 14 | 15 | * [sql-snippets/README](./sql-snippets/README.md) 16 | * [storage-snippets/README](./storage-snippets/README.md) 17 | * [terra-widgets/README](./py/README.md) 18 | -------------------------------------------------------------------------------- /dataset-snippets/snippets_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import pandas_profiling 5 | import plotnine 6 | from plotnine import * # Provides a ggplot-like interface to matplotlib. 7 | from IPython.display import display 8 | 9 | ## Plot setup. 10 | theme_set(theme_bw(base_size = 11)) # Default theme for plots. 11 | 12 | def get_boxplot_fun_data(df): 13 | """Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 14 | 15 | Args: 16 | d: A data frame. 17 | Returns: 18 | A data frame with column y as max and column label as length. 19 | """ 20 | d = {'y': max(df), 'label': f'N = {len(df)}'} 21 | return(pd.DataFrame(data=d, index=[0])) 22 | 23 | # NOTE: if you get any errors from this cell, restart your kernel and run it again. 24 | -------------------------------------------------------------------------------- /dataset-snippets/join_dataframes.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'join_dataframes' to join together two dataframes. 2 | # It assumes the 'Setup' snippet has been executed. 3 | # 4 | # In the example below, it joins Demographics '_person_df' and Measurements '_measurement_df' using 5 | # any columns they have in common, which in this case should only be 'person_id'. 
6 | # 7 | # See also https://dplyr.tidyverse.org/reference/join.html and https://r4ds.had.co.nz/relational-data.html#understanding-joins 8 | 9 | 10 | ## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] ----- 11 | measurement_df <- inner_join(YOUR_DATASET_NAME_person_df, 12 | YOUR_DATASET_NAME_measurement_df) %>% 13 | mutate_if(is.list, as.character) # Convert column type list as character. 14 | 15 | dim(measurement_df) 16 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | 3 | # meas_filter is a column of True and False 4 | meas_filter = measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 5 | (ggplot(measurement_of_interest_df[meas_filter], aes(x = 'sex_at_birth', y = 'value_as_number')) + 6 | geom_boxplot() + 7 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 8 | position = position_dodge(width = 0.9), va = 'top') + 9 | # scale_y_log10() + # Uncomment if the data looks skewed. 10 | ylab(f'{UNIT_NAME}') + 11 | ggtitle(f'All {MEASUREMENT_NAME} measurements, by site\nSource: All Of Us Data') + 12 | theme(figure_size=(12, 6), axis_text_x = element_text(angle=25, hjust=1))) 13 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_site.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | most_recent_measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 
6 | ggplot(aes(x = src_id, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | coord_flip() + 12 | ylab(str_glue('{UNIT_NAME}')) + 13 | labs(title = str_glue('Most recent {MEASUREMENT_NAME} measurement\nper person, by site'), 14 | caption = 'Source: All Of Us Data') 15 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | ggplot(aes(x = sex_at_birth, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | ylab(str_glue('{UNIT_NAME}')) + 12 | labs(title = str_glue('All {MEASUREMENT_NAME} measurements, by sex_at_birth'), 13 | caption = 'Source: All Of Us Data') + 14 | theme(axis.text.x = element_text(angle=25, hjust=1)) 15 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | 3 | meas_filter = most_recent_measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 
4 | (ggplot(most_recent_measurement_of_interest_df[meas_filter], aes(x = 'sex_at_birth', y = 'value_as_number')) + 5 | geom_boxplot() + 6 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 7 | position = position_dodge(width = 0.9), va = 'top') + 8 | # scale_y_log10() + # Uncomment if the data looks skewed. 9 | ylab(f'{UNIT_NAME}') + 10 | ggtitle(f'Most recent {MEASUREMENT_NAME} measurement\nper person, by sex_at_birth\nSource: All Of Us Data') + 11 | theme(figure_size=(12, 6), axis_text_x = element_text(angle=25, hjust=1))) 12 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_site.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | 3 | # meas_filter is a column of True and False 4 | meas_filter = most_recent_measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 5 | (ggplot(most_recent_measurement_of_interest_df[meas_filter], aes(x = 'src_id', y = 'value_as_number')) + 6 | geom_boxplot() + 7 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 8 | position = position_dodge(width = 0.9), va = 'top') + 9 | # scale_y_log10() + # Uncomment if the data looks skewed. 10 | coord_flip() + 11 | ylab(f'{UNIT_NAME}') + 12 | ggtitle(f'Most recent {MEASUREMENT_NAME} measurement\nper person, by site\nSource: All Of Us Data') + 13 | theme(figure_size=(12, 6))) 14 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 
2 | options(repr.plot.height = 8, repr.plot.width = 16) 3 | 4 | most_recent_measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | ggplot(aes(x = sex_at_birth, y = value_as_number)) + 7 | geom_boxplot() + 8 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 9 | position = position_dodge(width = 0.9), vjust = -0.8) + 10 | # scale_y_log10() + # Uncomment if the data looks skewed. 11 | ylab(str_glue('{UNIT_NAME}')) + 12 | labs(title = str_glue('Most recent {MEASUREMENT_NAME} measurement\nper person, by sex_at_birth'), 13 | caption = 'Source: All Of Us Data') + 14 | theme(axis.text.x = element_text(angle=25, hjust=1)) 15 | -------------------------------------------------------------------------------- /storage-snippets/copy_file_from_workspace_bucket.R: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code copies a file from your Google Bucket into a dataframe 4 | 5 | # replace 'test.csv' with the name of the file in your google bucket (don't delete the quotation marks) 6 | name_of_file_in_bucket <- 'test.csv' 7 | 8 | ######################################################################## 9 | ## 10 | ################# DON'T CHANGE FROM HERE ############################### 11 | ## 12 | ######################################################################## 13 | 14 | # Get the bucket name 15 | my_bucket <- Sys.getenv('WORKSPACE_BUCKET') 16 | 17 | # Copy the file from current workspace to the bucket 18 | system(paste0("gsutil cp ", my_bucket, "/data/", name_of_file_in_bucket, " ."), intern=T) 19 | 20 | # Load the file into a dataframe 21 | my_dataframe <- read_csv(name_of_file_in_bucket) 22 | head(my_dataframe) 23 | -------------------------------------------------------------------------------- /dataset-snippets/snippets_setup.R: 
-------------------------------------------------------------------------------- 1 | lapply(c('viridis', 'ggthemes', 'skimr'), 2 | function(pkg_name) { if(! pkg_name %in% installed.packages()) { install.packages(pkg_name)} } ) 3 | 4 | library(viridis) # A nice color scheme for plots. 5 | library(ggthemes) # Common themes to change the look and feel of plots. 6 | library(scales) # Graphical scales map data to aesthetics in plots. 7 | library(skimr) # Better summaries of data. 8 | library(lubridate) # Date library from the tidyverse. 9 | library(tidyverse) # Data wrangling packages. 10 | library(bigrquery) # Data extraction from Google BigQuery 11 | 12 | ## Plot setup. 13 | theme_set(theme_bw(base_size = 14)) # Default theme for plots. 14 | 15 | #' Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 16 | #' 17 | #' @param d A data frame. 18 | #' @return A data frame with column y as max and column label as length. 19 | get_boxplot_fun_data <- function(df) { 20 | return(data.frame(y = max(df), label = stringr::str_c('N = ', length(df)))) 21 | } 22 | -------------------------------------------------------------------------------- /storage-snippets/copy_file_from_workspace_bucket.py: -------------------------------------------------------------------------------- 1 | # This snippet assumes you run setup first 2 | 3 | # This code copies file in your Google Bucket and loads it into a dataframe 4 | 5 | # Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks) 6 | name_of_file_in_bucket = 'test.csv' 7 | 8 | ######################################################################## 9 | ## 10 | ################# DON'T CHANGE FROM HERE ############################### 11 | ## 12 | ######################################################################## 13 | 14 | # get the bucket name 15 | my_bucket = os.getenv('WORKSPACE_BUCKET') 16 | 17 | # copy csv file from the bucket to the 
current working space 18 | os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .") 19 | 20 | print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space') 21 | # save dataframe in a csv file in the same workspace as the notebook 22 | my_dataframe = pd.read_csv(name_of_file_in_bucket) 23 | my_dataframe.head() 24 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_age_and_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 20, repr.plot.width = 16) 3 | 4 | measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | mutate(age_at_measurement = year(as.period(interval(start = birth_datetime, end = measurement_date)))) %>% 7 | ggplot(aes(x = cut_width(age_at_measurement, width = 10, boundary = 0), y = value_as_number)) + 8 | geom_boxplot() + 9 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 10 | position = position_dodge(width = 0.9), vjust = -0.8) + 11 | # scale_y_log10() + # Uncomment if the data looks skewed. 
12 | coord_flip() + 13 | facet_wrap(~ sex_at_birth, nrow = length(unique(measurement_of_interest_df$sex_at_birth))) + 14 | xlab('age') + 15 | ylab(str_glue('{UNIT_NAME}')) + 16 | labs(title = str_glue('All {MEASUREMENT_NAME} measurements, by age, faceted by sex_at_birth'), 17 | caption = 'Source: All Of Us Data') 18 | -------------------------------------------------------------------------------- /storage-snippets/copy_data_to_workspace_bucket.py: -------------------------------------------------------------------------------- 1 | # This snippet assumes you run setup first 2 | 3 | # This code saves your dataframe into a csv file in a "data" folder in Google Bucket 4 | 5 | # Replace df with THE NAME OF YOUR DATAFRAME 6 | my_dataframe = df 7 | 8 | # Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks) 9 | destination_filename = 'test.csv' 10 | 11 | ######################################################################## 12 | ## 13 | ################# DON'T CHANGE FROM HERE ############################### 14 | ## 15 | ######################################################################## 16 | 17 | # save dataframe in a csv file in the same workspace as the notebook 18 | my_dataframe.to_csv(destination_filename, index=False) 19 | 20 | # get the bucket name 21 | my_bucket = os.getenv('WORKSPACE_BUCKET') 22 | 23 | # copy csv file to the bucket 24 | args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"] 25 | output = subprocess.run(args, capture_output=True) 26 | 27 | # print output from gsutil 28 | output.stderr 29 | -------------------------------------------------------------------------------- /storage-snippets/copy_data_to_workspace_bucket.R: -------------------------------------------------------------------------------- 1 | # This snippet assumes that you run setup first 2 | 3 | # This code saves your dataframe into a csv file in a "data" folder in Google Bucket 4 | 5 | # Replace 
df with THE NAME OF YOUR DATAFRAME 6 | my_dataframe <- df 7 | 8 | # Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks) 9 | destination_filename <- 'test.csv' 10 | 11 | ######################################################################## 12 | ## 13 | ################# DON'T CHANGE FROM HERE ############################### 14 | ## 15 | ######################################################################## 16 | 17 | # store the dataframe in current workspace 18 | write_excel_csv(my_dataframe, destination_filename) 19 | 20 | # Get the bucket name 21 | my_bucket <- Sys.getenv('WORKSPACE_BUCKET') 22 | 23 | # Copy the file from current workspace to the bucket 24 | system(paste0("gsutil cp ./", destination_filename, " ", my_bucket, "/data/"), intern=T) 25 | 26 | # Check if file is in the bucket 27 | system(paste0("gsutil ls ", my_bucket, "/data/*.csv"), intern=T) 28 | -------------------------------------------------------------------------------- /py/setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based module for PIP installation of the Terra widgets package.""" 2 | 3 | import pathlib 4 | from setuptools import find_packages 5 | from setuptools import setup 6 | 7 | here = pathlib.Path(__file__).parent.resolve() 8 | # Get the requirements from the requirements file 9 | requirements = (here / 'requirements.txt').read_text(encoding='utf-8') 10 | # Get the long description from the README file 11 | long_description = (here / 'README.md').read_text(encoding='utf-8') 12 | 13 | setup( 14 | name='terra-widgets', 15 | version='0.0.1', 16 | license='BSD', 17 | 18 | description='Terra Notebook widgets', 19 | long_description=long_description, 20 | long_description_content_type='text/markdown', 21 | 22 | python_requires='>=3.7', 23 | install_requires=requirements, 24 | packages=find_packages(), 25 | 26 | 
url='https://github.com/all-of-us/workbench-snippets', 27 | project_urls={ 28 | 'Bug Reports': 'https://github.com/all-of-us/workbench-snippets/issues', 29 | 'Source': 'https://github.com/all-of-us/workbench-snippets/blob/main/py', 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_age_and_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | options(repr.plot.height = 20, repr.plot.width = 16) 3 | 4 | most_recent_measurement_of_interest_df %>% 5 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 6 | mutate(age_at_measurement = year(as.period(interval(start = birth_datetime, end = measurement_date)))) %>% 7 | ggplot(aes(x = cut_width(age_at_measurement, width = 10, boundary = 0), y = value_as_number)) + 8 | geom_boxplot() + 9 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 10 | position = position_dodge(width = 0.9), vjust = -0.8) + 11 | # scale_y_log10() + # Uncomment if the data looks skewed. 
12 | coord_flip() + 13 | facet_wrap(~ sex_at_birth, nrow = length(unique(most_recent_measurement_of_interest_df$sex_at_birth))) + 14 | xlab('age') + 15 | ylab(str_glue('{UNIT_NAME}')) + 16 | labs(title = str_glue('Most recent {MEASUREMENT_NAME} measurement\nper person, by age, faceted by sex_at_birth'), 17 | caption = 'Source: All Of Us Data') 18 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_module.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to print a table of participant counts by question in a module 2 | # The snippet assumes that a dataframe containing survey questions and answers already exists 3 | 4 | # Update the next 3 lines 5 | 6 | survey_df <- YOUR_DATASET_NAME_survey_df 7 | module_name <- 'The Basics' 8 | denominator <- NULL 9 | 10 | #################################################################################### 11 | # DON'T CHANGE FROM HERE 12 | #################################################################################### 13 | summarize_a_module <- function(df, module=NULL, denominator=NULL) { 14 | if (!is.null(module)){ 15 | df <- df %>% filter(tolower(survey) == tolower(module)) 16 | } 17 | data <- df %>% group_by(survey, question_concept_id, question) %>% 18 | summarize(n_participant = n_distinct(person_id)) 19 | if (!is.null(denominator)) { 20 | data <- data %>% mutate(response_rate = paste0(round(100*n_participant/denominator,2),'%')) 21 | } 22 | data 23 | } 24 | 25 | summarize_a_module(survey_df, module_name, denominator) 26 | 27 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_module.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to print a table of participant counts by question in a module 2 | # The snippet assumes that a 
dataframe containing survey questions and answers already exists 3 | 4 | # Update the next 3 lines 5 | survey_df = YOUR_DATASET_NAME_survey_df 6 | module_name = 'The Basics' # e.g: 'The Basics', 'Lifestyle', 'Overall Health', etc. 7 | denominator = None # e.g: 200000 8 | 9 | #################################################################################### 10 | # DON'T CHANGE FROM HERE 11 | #################################################################################### 12 | 13 | def summarize_a_module(df, module=None, denominator=None): 14 | if module: 15 | df = df[df['survey'].str.lower() == module.lower()].copy() 16 | data = (df.groupby(['survey','question_concept_id','question'])['person_id'].nunique() 17 | .reset_index() 18 | .rename(columns={'person_id':'n_participant'})) 19 | if denominator: 20 | data['response_rate'] = round(100*data['n_participant']/denominator,2) 21 | return data 22 | 23 | summarize_a_module(df=survey_df, module=module_name, denominator=denominator) 24 | 25 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://r4ds.had.co.nz/data-visualisation.html 5 | 6 | 7 | options(repr.plot.height = 10, repr.plot.width = 16) 8 | 9 | # There could be many different measurements in the dataframe. By default, plot the first one. 
10 | measurement_to_plot <- unique(measurement_df$standard_concept_name)[1] 11 | 12 | measurement_df %>% 13 | filter(standard_concept_name == measurement_to_plot) %>% 14 | filter(!unit_concept_name %in% c('No matching concept', 'NULL')) %>% 15 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 16 | ggplot(aes(x = sex_at_birth, y = value_as_number)) + 17 | geom_boxplot() + 18 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 4, 19 | position = position_dodge(width = 0.9), vjust = -0.8) + 20 | # scale_y_log10() + # Uncomment if the data looks skewed. 21 | facet_wrap(standard_concept_name ~ unit_concept_name, ncol = 2, scales = 'free') + 22 | labs(title = str_glue('Numeric values of measurements, by sex_at_birth'), caption = 'Source: All Of Us Data') + 23 | theme(axis.text.x = element_text(angle=25, hjust=1)) 24 | -------------------------------------------------------------------------------- /sql-snippets/snippets_setup.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | import plotnine 6 | from plotnine import * # Provides a ggplot-like interface to matplotlib. 7 | 8 | # Get the BigQuery curated dataset for the current workspace context. 9 | CDR = os.environ['WORKSPACE_CDR'] 10 | 11 | ## Plot setup. 12 | theme_set(theme_bw(base_size = 11)) # Default theme for plots. 13 | 14 | def get_boxplot_fun_data(df): 15 | """Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 16 | 17 | Args: 18 | df: A data frame. 19 | Returns: 20 | A data frame with column y as max and column label as length. 21 | """ 22 | d = {'y': max(df), 'label': f'N = {len(df)}'} 23 | return(pd.DataFrame(data=d, index=[0])) 24 | 25 | ## ---------------[ CHANGE THESE AS NEEDED] --------------------------------------- 26 | # Set default parameter values so that all snippets run successfully with no edits needed.
27 | COHORT_QUERY = f'SELECT person_id FROM `{CDR}.person`' # Default to all participants. 28 | MEASUREMENT_OF_INTEREST = 'hemoglobin' 29 | # Tip: the next four parameters could be set programmatically using one row from 30 | # the result of measurements_of_interest_summary.sql 31 | MEASUREMENT_CONCEPT_ID = 3004410 # Hemoglobin A1c 32 | UNIT_CONCEPT_ID = 8554 # percent 33 | MEASUREMENT_NAME = '' 34 | UNIT_NAME = '' 35 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019 All of Us Research Program 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://plotnine.readthedocs.io/en/stable/ 5 | 6 | 7 | # There could be many different measurements in the dataframe. By default, plot the first one. 8 | measurement_to_plot = measurement_df.standard_concept_name.unique()[0] 9 | 10 | # meas_filter is a column of True and False. 11 | meas_filter = ((measurement_df.standard_concept_name == measurement_to_plot) 12 | & (measurement_df.unit_concept_name != 'No matching concept') 13 | & (measurement_df.unit_concept_name.notna()) 14 | & (measurement_df.value_as_number < 9999999) # Get rid of nonsensical outliers. 15 | ) 16 | 17 | (ggplot(measurement_df[meas_filter], aes(x = 'sex_at_birth', y = 'value_as_number')) + 18 | geom_boxplot() + 19 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 20 | position = position_dodge(width = 0.9), va = 'top') + 21 | # scale_y_log10() + # Uncomment if the data looks skewed. 
22 | facet_wrap(('standard_concept_name', 'unit_concept_name'), ncol = 2, scales = 'free') + 23 | ggtitle(f'Numeric values of measurements, by sex_at_birth\nSource: All Of Us Data') + 24 | theme(figure_size=(12, 6), panel_spacing = .5, axis_text_x = element_text(angle=25, hjust=1))) 25 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest_by_age_and_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that measurement_of_interest.sql has been run. 2 | 3 | measurement_of_interest_df['age_at_measurement'] = ((pd.to_datetime(measurement_of_interest_df['measurement_date']) 4 | - measurement_of_interest_df['birth_datetime'].dt.tz_localize(None)).dt.days)//365.24 5 | measurement_of_interest_df['age_group'] = pd.cut(measurement_of_interest_df['age_at_measurement'], 6 | [-np.inf, 34.5, 49.5, 64.5, np.inf], 7 | labels=["<35", "35-49", "50-64", "65+"]) 8 | # meas_filter is a column of True and False 9 | meas_filter = measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 10 | age_group_not_null = (measurement_of_interest_df['age_group'].notnull()) 11 | 12 | (ggplot(measurement_of_interest_df[meas_filter & age_group_not_null], aes(x = 'age_group', y = 'value_as_number')) + 13 | geom_boxplot() + 14 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 15 | position = position_dodge(width = 0.9), va = 'top') + 16 | # scale_y_log10() + # Uncomment if the data looks skewed. 
17 | coord_flip() + 18 | facet_wrap('~ sex_at_birth', nrow = len(measurement_of_interest_df.sex_at_birth.unique())) + 19 | xlab('age') + 20 | ylab(f'{UNIT_NAME}') + 21 | ggtitle(f'All {MEASUREMENT_NAME} measurements, by age, faceted by sex_at_birth\nSource: All Of Us Data') + 22 | theme(figure_size=(12, 12))) 23 | -------------------------------------------------------------------------------- /sql-snippets/measurement_of_interest.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Return row level data for a measurement for our cohort. 3 | -- 4 | -- PARAMETERS: 5 | -- MEASUREMENT_CONCEPT_ID: for example 3004410 # Hemoglobin A1c 6 | -- UNIT_CONCEPT_ID: for example 8554 # percent 7 | 8 | WITH 9 | -- 10 | -- Retrieve participants birthdate and sex_at_birth. 11 | -- 12 | persons AS ( 13 | SELECT 14 | person_id, 15 | birth_datetime, 16 | concept_name AS sex_at_birth 17 | FROM 18 | `{CDR}.person` 19 | LEFT JOIN `{CDR}.concept` ON concept_id = sex_at_birth_concept_id), 20 | -- 21 | -- Retrieve the row-level data for our measurement of interest. 22 | -- 23 | measurements AS ( 24 | SELECT 25 | person_id, 26 | measurement_id, 27 | measurement_concept_id, 28 | measurement_date, 29 | measurement_datetime, 30 | measurement_type_concept_id, 31 | operator_concept_id, 32 | value_as_number, 33 | value_as_concept_id, 34 | unit_concept_id, 35 | range_low, 36 | range_high 37 | FROM 38 | `{CDR}.measurement` 39 | WHERE 40 | measurement_concept_id = {MEASUREMENT_CONCEPT_ID} 41 | AND unit_concept_id = {UNIT_CONCEPT_ID} 42 | AND person_id IN ({COHORT_QUERY})) 43 | -- 44 | -- Lastly, JOIN all this data together so that we have the birthdate, sex_at_birth and site for each measurement. 
45 | -- 46 | SELECT 47 | persons.*, 48 | src_id, 49 | measurements.* EXCEPT(person_id, measurement_id) 50 | FROM 51 | measurements 52 | LEFT JOIN 53 | persons USING (person_id) 54 | LEFT JOIN 55 | `{CDR}.measurement_ext` USING (measurement_id) 56 | ORDER BY 57 | person_id, 58 | measurement_id 59 | 60 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_age_and_sex_at_birth.ggplot: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_age_and_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://r4ds.had.co.nz/data-visualisation.html 5 | 6 | 7 | options(repr.plot.height = 16, repr.plot.width = 16) 8 | 9 | # There could be many different measurements in the dataframe. By default, plot the first one. 10 | measurement_to_plot <- unique(measurement_df$standard_concept_name)[1] 11 | 12 | measurement_df %>% 13 | filter(standard_concept_name == measurement_to_plot) %>% 14 | filter(!unit_concept_name %in% c('No matching concept', 'NULL')) %>% 15 | filter(sex_at_birth != 'No matching concept') %>% 16 | filter(value_as_number < 9999999) %>% # Get rid of nonsensical outliers. 17 | mutate(age_at_measurement = year(as.period(interval(start = date_of_birth, end = measurement_datetime)))) %>% 18 | ggplot(aes(x = cut_width(age_at_measurement, width = 5, boundary = 0), y = value_as_number)) + 19 | geom_boxplot() + 20 | stat_summary(fun.data = get_boxplot_fun_data, geom = 'text', size = 2, 21 | position = position_dodge(width = 0.9), vjust = -0.8) + 22 | # scale_y_log10() + # Uncomment if the data looks skewed. 
23 | coord_flip() + 24 | facet_wrap(standard_concept_name + unit_concept_name ~ sex_at_birth, ncol = 2, scales = 'free') + 25 | xlab('age group') + 26 | labs(title = str_glue('Numeric values of measurements by age and sex_at_birth'), caption = 'Source: All Of Us Data') 27 | -------------------------------------------------------------------------------- /py/README.md: -------------------------------------------------------------------------------- 1 | # Terra widgets 2 | 3 | A python package for ipywidget-based user interfaces for performing tasks within the context of Python Jupyter notebooks running in either the Terra or All of Us workbench environments. 4 | 5 | 6 | ## Create and view HTML snapshots of notebooks 7 | 8 | The workbench takes care of saving the current version of your notebooks for you. But what if you want to know **what your notebook looked like two weeks ago?** Use `display_html_snapshots_widget()` to display a widget which can save snapshots of a notebook for later review, allowing users to track changes to results in notebooks over time. To do this, it: 9 | 10 | 1. Converts the selected notebook to an HTML file (without re-running the notebook). 11 | 1. And then copies that HTML file to a subfolder within the same workspace bucket where the notebook file is stored. 12 | 13 | Use this interface to create an HTML snapshot each time you make a major change to your notebook. You can choose notebooks from **any of your workspaces!** 14 | 15 | Implementation details: 16 | 17 | * The user interface controls are implemented using the [ipywidgets](https://ipywidgets.readthedocs.io/en/latest/) Python package. 18 | 19 | * Notebooks are converted from `.ipynb` to `.html` using [nbconvert](https://nbconvert.readthedocs.io/en/latest/).
20 | 21 | * Files are transferred back and forth from the workspace bucket using both: 22 | * [gsutil](https://cloud.google.com/storage/docs/gsutil) 23 | * [Tensorflow GFile](https://www.tensorflow.org/api_docs/python/tf/io/gfile/GFile). 24 | 25 | * The few files of code implementing this interface are preinstalled as a [Python library](https://github.com/all-of-us/workbench-snippets/blob/main/py/setup.py) on the AoU workbench. 26 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest_by_age_and_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # This plot assumes that most_recent_measurement_of_interest.sql has been run. 2 | 3 | most_recent_measurement_of_interest_df['age_at_measurement'] = ((pd.to_datetime(most_recent_measurement_of_interest_df['measurement_date']) 4 | - most_recent_measurement_of_interest_df['birth_datetime'].dt.tz_localize(None)).dt.days)//365.24 5 | most_recent_measurement_of_interest_df['age_group'] = pd.cut(most_recent_measurement_of_interest_df['age_at_measurement'], 6 | [-np.inf, 34.5, 49.5, 64.5, np.inf], 7 | labels=["<35", "35-49", "50-64", "65+"]) 8 | # meas_filter is a column of True and False 9 | meas_filter = most_recent_measurement_of_interest_df['value_as_number'] < 9999999 # Get rid of nonsensical outliers. 10 | age_group_not_null = (most_recent_measurement_of_interest_df['age_group'].notnull()) 11 | 12 | (ggplot(most_recent_measurement_of_interest_df[meas_filter & age_group_not_null], aes(x = 'age_group', y = 'value_as_number')) + 13 | geom_boxplot() + 14 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 15 | position = position_dodge(width = 0.9), va = 'top') + 16 | # scale_y_log10() + # Uncomment if the data looks skewed.
17 | coord_flip() + 18 | facet_wrap('~ sex_at_birth', nrow = len(most_recent_measurement_of_interest_df.sex_at_birth.unique())) + 19 | xlab('age') + 20 | ylab(f'{UNIT_NAME}') + 21 | ggtitle(f'Most recent {MEASUREMENT_NAME} measurement\nper person, by age, faceted by sex_at_birth\nSource: All Of Us Data') + 22 | theme(figure_size=(12, 6))) 23 | -------------------------------------------------------------------------------- /sql-snippets/snippets_setup.R: -------------------------------------------------------------------------------- 1 | lapply(c('viridis', 'ggthemes', 'skimr'), 2 | function(pkg_name) { if(! pkg_name %in% installed.packages()) { install.packages(pkg_name)} } ) 3 | 4 | library(viridis) # A nice color scheme for plots. 5 | library(ggthemes) # Common themes to change the look and feel of plots. 6 | library(scales) # Graphical scales map data to aesthetics in plots. 7 | library(skimr) # Better summaries of data. 8 | library(lubridate) # Date library from the tidyverse. 9 | library(bigrquery) # BigQuery R client. 10 | library(tidyverse) # Data wrangling packages. 11 | 12 | ## BigQuery setup. 13 | BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT') 14 | # Get the BigQuery curated dataset for the current workspace context. 15 | CDR <- Sys.getenv('WORKSPACE_CDR') 16 | 17 | ## Plot setup. 18 | theme_set(theme_bw(base_size = 14)) # Default theme for plots. 19 | 20 | #' Returns a data frame with a y position and a label, for use annotating ggplot boxplots. 21 | #' 22 | #' @param df A data frame. 23 | #' @return A data frame with column y as max and column label as length. 24 | get_boxplot_fun_data <- function(df) { 25 | return(data.frame(y = max(df), label = stringr::str_c('N = ', length(df)))) 26 | } 27 | 28 | ## ---------------[ CHANGE THESE AS NEEDED] --------------------------------------- 29 | # Set default parameter values so that all snippets run successfully with no edits needed.
30 | COHORT_QUERY <- str_glue('SELECT person_id FROM `{CDR}.person`') # Default to all participants. 31 | MEASUREMENT_OF_INTEREST <- 'hemoglobin' 32 | # Tip: the next four parameters could be set programmatically using one row from 33 | # the result of measurements_of_interest_summary.sql 34 | MEASUREMENT_CONCEPT_ID <- 3004410 # Hemoglobin A1c 35 | UNIT_CONCEPT_ID <- 8554 # percent 36 | MEASUREMENT_NAME <- '' 37 | UNIT_NAME <- '' 38 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_by_question_concept_id.py: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to output a table and a graph of 2 | # participant counts by response for one question_concept_id 3 | # The snippet assumes that a dataframe containing survey questions and answers already exists 4 | # The snippet also assumes that setup has been run 5 | 6 | # Update the next 3 lines 7 | survey_df = YOUR_DATASET_NAME_survey_df 8 | question_concept_id = 1585940 9 | denominator = None # e.g: 200000 10 | 11 | #################################################################################### 12 | # DON'T CHANGE FROM HERE 13 | #################################################################################### 14 | def summarize_a_question_concept_id(df, question_concept_id, denominator=None): 15 | df = df.loc[df['question_concept_id'] == question_concept_id].copy() 16 | new_df = df.groupby(['answer_concept_id', 'answer'])['person_id']\ 17 | .nunique()\ 18 | .reset_index()\ 19 | .rename(columns=dict(person_id='n_participant'))\ 20 | .assign(answer_concept_id = lambda x: np.int32(x.answer_concept_id)) 21 | if denominator: 22 | new_df['response_rate'] = round(100*new_df['n_participant']/denominator,2) 23 | if question_concept_id in df['question_concept_id'].unique(): 24 | print(f"Distribution of response to {df.loc[df['question_concept_id'] == question_concept_id, 
'question'].unique()[0]}") 25 | # show table 26 | display(new_df) 27 | # show graph 28 | display(ggplot(data=new_df) + 29 | geom_bar(aes(x='answer', y='n_participant'), stat='identity') + 30 | coord_flip() + 31 | labs(y="Participant count", x="") + 32 | theme_bw()) 33 | else: 34 | print("There is an error with your question_concept_id") 35 | 36 | summarize_a_question_concept_id(survey_df, question_concept_id, denominator) 37 | 38 | 39 | -------------------------------------------------------------------------------- /dataset-snippets/summarize_a_survey_by_question_concept_id.R: -------------------------------------------------------------------------------- 1 | # Use snippet 'summarize_a_survey_module' to output a table and a graph of 2 | # participant counts by response for one question_concept_id 3 | # The snippet assumes that a dataframe containing survey questions and answers already exists 4 | # The snippet also assumes that setup has been run 5 | 6 | # Update the next 3 lines 7 | survey_df <- YOUR_DATASET_NAME_survey_df 8 | question_concept_id <- 1585940 9 | denominator <- NULL 10 | 11 | #################################################################################### 12 | # DON'T CHANGE FROM HERE 13 | #################################################################################### 14 | summarize_a_question_concept_id <- function(df, q_concept_id, denominator=NULL){ 15 | df <- df %>% 16 | mutate(question_concept_id = as.numeric(question_concept_id)) %>% 17 | filter(question_concept_id == q_concept_id) 18 | 19 | new_df <- df %>% group_by(answer_concept_id, answer) %>% 20 | summarize(n_participant = n_distinct(person_id)) %>% 21 | ungroup() %>% 22 | mutate(answer_concept_id = as.integer(answer_concept_id)) 23 | if (!is.null(denominator)) { 24 | new_df <- new_df %>% mutate(response_rate = paste0(round(100*n_participant/denominator,2),'%')) 25 | } 26 | 27 | if (q_concept_id %in% as.vector(unique(df[['question_concept_id']]))){ 28 | question_name <- 
as.vector(unique(df$question)) 29 | print(str_glue("Distribution of response to {question_name}")) 30 | 31 | # show table 32 | print(new_df) 33 | 34 | # show graph 35 | options(repr.plot.width=12, repr.plot.height=6) 36 | ggplot(new_df) + 37 | geom_bar(aes(x=answer, y=n_participant), stat='identity') + 38 | coord_flip() + 39 | labs(y="Participant count", x="") + 40 | theme_bw() 41 | } 42 | else { 43 | print("There is an error with your question_concept_id") 44 | } 45 | } 46 | 47 | summarize_a_question_concept_id(survey_df, question_concept_id, denominator) 48 | 49 | 50 | -------------------------------------------------------------------------------- /dataset-snippets/measurement_by_age_and_sex_at_birth.plotnine: -------------------------------------------------------------------------------- 1 | # Use snippet 'measurement_by_age_and_sex_at_birth' to plot joined demographics and measurements dataframes. 2 | # This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to 3 | # join together demographics and measurements dataframes. 4 | # See also https://plotnine.readthedocs.io/en/stable/ 5 | 6 | 7 | # There could be many different measurements in the dataframe. By default, plot the first one. 8 | measurement_to_plot = measurement_df.standard_concept_name.unique()[0] 9 | 10 | # Create a derived variable for age group. 
11 | measurement_df['age_at_measurement'] = ((measurement_df['measurement_datetime'].dt.tz_localize(None) 12 | - measurement_df['date_of_birth'].dt.tz_localize(None)).dt.days)//365.24 13 | measurement_df['age_group'] = pd.cut(measurement_df['age_at_measurement'], 14 | [-np.inf, 34.5, 49.5, 64.5, np.inf], 15 | labels=["<35", "35-49", "50-64", "65+"]) 16 | 17 | # meas_filter is a column of True and False 18 | meas_filter = ((measurement_df.standard_concept_name == measurement_to_plot) 19 | & (measurement_df.unit_concept_name != 'No matching concept') 20 | & (measurement_df.unit_concept_name.notna()) 21 | & (measurement_df.sex_at_birth != 'No matching concept') 22 | & (measurement_df.value_as_number < 9999999) 23 | & (measurement_df['age_at_measurement'].notnull()) # Get rid of nonsensical outliers. 24 | ) 25 | 26 | (ggplot(measurement_df[meas_filter], aes(x = 'age_group', y = 'value_as_number')) + 27 | geom_boxplot() + 28 | stat_summary(fun_data = get_boxplot_fun_data, geom = 'text', size = 10, 29 | position = position_dodge(width = 0.9), va = 'top') + 30 | # scale_y_log10() + # Uncomment if the data looks skewed. 31 | coord_flip() + 32 | facet_wrap(['standard_concept_name + ": " + unit_concept_name', 'sex_at_birth'], ncol = 2, scales = 'free') + 33 | xlab('age group') + 34 | ggtitle('Numeric values of measurements by age and sex_at_birth\nSource: All Of Us Data') + 35 | theme(figure_size = (12, 12), panel_spacing = .5)) 36 | -------------------------------------------------------------------------------- /sql-snippets/most_recent_measurement_of_interest.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Return row level data for a measurement, limited to only the most recent result per person in our cohort. 
3 | -- 4 | -- PARAMETERS: 5 | -- MEASUREMENT_CONCEPT_ID: for example 3004410 # Hemoglobin A1c 6 | -- UNIT_CONCEPT_ID: for example 8554 # percent 7 | 8 | WITH 9 | -- 10 | -- Retrieve participants birthdate and sex_at_birth. 11 | -- 12 | persons AS ( 13 | SELECT 14 | person_id, 15 | birth_datetime, 16 | concept_name AS sex_at_birth 17 | FROM 18 | `{CDR}.person` 19 | LEFT JOIN `{CDR}.concept` ON concept_id = sex_at_birth_concept_id), 20 | -- 21 | -- Retrieve the row-level data for our measurement of interest. Also compute 22 | -- a new column for the recency rank of the measurement per person, a rank 23 | -- of 1 being the most recent lab result for that person. 24 | -- 25 | measurements AS ( 26 | SELECT 27 | person_id, 28 | measurement_id, 29 | measurement_concept_id, 30 | unit_concept_id, 31 | measurement_date, 32 | measurement_datetime, 33 | measurement_type_concept_id, 34 | operator_concept_id, 35 | value_as_number, 36 | value_as_concept_id, 37 | range_low, 38 | range_high, 39 | ROW_NUMBER() OVER (PARTITION BY person_id 40 | ORDER BY measurement_date DESC, 41 | measurement_datetime DESC, 42 | measurement_id DESC) AS recency_rank 43 | 44 | FROM 45 | `{CDR}.measurement` 46 | WHERE 47 | measurement_concept_id = {MEASUREMENT_CONCEPT_ID} 48 | AND unit_concept_id = {UNIT_CONCEPT_ID} 49 | AND person_id IN ({COHORT_QUERY})) 50 | -- 51 | -- Lastly, JOIN all this data together so that we have the birthdate, sex_at_birth and site for each 52 | -- measurement, retaining only the most recent result per person.
53 | -- 54 | SELECT 55 | persons.*, 56 | src_id, 57 | measurements.* EXCEPT(person_id, measurement_id, recency_rank) 58 | FROM 59 | measurements 60 | LEFT JOIN 61 | persons USING (person_id) 62 | LEFT JOIN 63 | `{CDR}.measurement_ext` USING (measurement_id) 64 | WHERE 65 | recency_rank = 1 66 | ORDER BY 67 | person_id, 68 | measurement_id 69 | 70 | -------------------------------------------------------------------------------- /sql-snippets/measurements_of_interest_summary.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Compute summary information for our measurements of interest for our cohort. 3 | -- 4 | -- PARAMETERS: 5 | -- MEASUREMENT_OF_INTEREST: a case-insensitive string, such as "hemoglobin", to be compared 6 | -- to all measurement concept names to identify those of interest 7 | 8 | WITH 9 | -- 10 | -- Use a case insensitive string to search the measurement concept names of those 11 | -- measurements we do have in the measurements table. 12 | -- 13 | labs_of_interest AS ( 14 | SELECT 15 | measurement_concept_id, 16 | measurement_concept.concept_name AS measurement_name, 17 | unit_concept_id, 18 | unit_concept.concept_name AS unit_name 19 | FROM 20 | `{CDR}.measurement` 21 | LEFT JOIN `{CDR}.concept` AS measurement_concept 22 | ON measurement_concept.concept_id = measurement_concept_id 23 | LEFT JOIN `{CDR}.concept` AS unit_concept 24 | ON unit_concept.concept_id = unit_concept_id 25 | WHERE 26 | REGEXP_CONTAINS(measurement_concept.concept_name, r"(?i){MEASUREMENT_OF_INTEREST}") 27 | GROUP BY 28 | measurement_concept_id, 29 | unit_concept_id, 30 | measurement_concept.concept_name, 31 | unit_concept.concept_name 32 | ) 33 | -- 34 | -- Summarize the information about each measurement concept of interest that our 35 | -- prior query identified. 
36 | -- 37 | SELECT 38 | measurement_name AS measurement, 39 | IFNULL(unit_name, "NA") AS unit, 40 | COUNT(1) AS N, 41 | COUNTIF(value_as_number IS NULL 42 | AND (value_as_concept_id IS NULL 43 | OR value_as_concept_id = 0)) AS missing, 44 | MIN(value_as_number) AS min, 45 | MAX(value_as_number) AS max, 46 | AVG(value_as_number) AS avg, 47 | STDDEV(value_as_number) AS stddev, 48 | APPROX_QUANTILES(value_as_number, 4) AS quantiles, 49 | COUNTIF(value_as_number IS NOT NULL) AS num_numeric_values, 50 | COUNTIF(value_as_concept_id IS NOT NULL 51 | AND value_as_concept_id != 0) AS num_concept_values, 52 | COUNTIF(operator_concept_id IS NOT NULL) AS num_operators, 53 | IF(src_id = "PPI/PM", "PPI", "EHR") AS measurement_source, 54 | measurement_concept_id, 55 | unit_concept_id 56 | FROM 57 | `{CDR}.measurement` 58 | INNER JOIN 59 | labs_of_interest USING(measurement_concept_id, unit_concept_id) 60 | LEFT JOIN 61 | `{CDR}.measurement_ext` USING(measurement_id) 62 | WHERE 63 | person_id IN ({COHORT_QUERY}) 64 | GROUP BY 65 | measurement_concept_id, 66 | measurement_name, 67 | measurement_source, 68 | unit_concept_id, 69 | unit_name 70 | ORDER BY 71 | N DESC 72 | 73 | -------------------------------------------------------------------------------- /sql-snippets/measurements_of_interest_summary_test.py: -------------------------------------------------------------------------------- 1 | """Tests for query measurements_of_interest_summary.sql. 2 | 3 | See https://github.com/verilylifesciences/analysis-py-utils for more details 4 | about the testing framework. 
5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from ddt import ddt 12 | import os 13 | import unittest 14 | from verily.bigquery_wrapper import bq_test_case 15 | 16 | SQL_TEMPLATE = "measurements_of_interest_summary.sql" 17 | 18 | 19 | @ddt 20 | class QueryTest(bq_test_case.BQTestCase): 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | """Set up class.""" 25 | super(QueryTest, cls).setUpClass(use_mocks=False) 26 | cls.sql_to_test = open( 27 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 28 | SQL_TEMPLATE), "r").read() 29 | 30 | @classmethod 31 | def create_mock_tables(cls): 32 | """Create mock tables.""" 33 | 34 | cls.client.create_table_from_query(""" 35 | SELECT * FROM UNNEST([ 36 | STRUCT 39 | (1001, '1990-12-31 00:00:00 UTC', 501), 40 | (1002, '1950-08-01 00:00:00 UTC', 500), 41 | (1003, '1965-06-30 00:00:00 UTC', 500) 42 | ]) 43 | """, cls.client.path("person")) 44 | 45 | cls.client.create_table_from_query(""" 46 | SELECT * FROM UNNEST([ 47 | STRUCT 50 | (123, 'Hemoglobin', 'LOINC'), 51 | (456, 'gram per deciliter', 'UCUM') 52 | ]) 53 | """, cls.client.path("concept")) 54 | 55 | cls.client.create_table_from_query(""" 56 | SELECT * FROM UNNEST([ 57 | STRUCT 59 | (1, 'EHR site1'), 60 | (2, 'EHR site1'), 61 | (3, 'EHR site1'), 62 | (4, 'EHR site2'), 63 | (5, 'EHR site2'), 64 | (6, 'PPI/PM') 65 | ]) 66 | """, cls.client.path("measurement_ext")) 67 | 68 | cls.client.create_table_from_query(""" 69 | SELECT * FROM UNNEST([ 70 | STRUCT 78 | (1, 1001, 123, 123, 456, NULL, 42.0, NULL), 79 | (2, 1001, 123, 123, 456, NULL, 13.5, NULL), 80 | (3, 1002, 123, 123, 456, NULL, NULL, 100), 81 | (4, 1002, 123, 123, 456, NULL, NULL, NULL), 82 | (5, 1002, 123, 123, 456, 789, 7.2, NULL), 83 | # This measurement is for someone not in our cohort. 
84 | (6, 1003, 123, 123, 456, NULL, 500, NULL) 85 | ]) 86 | """, cls.client.path("measurement")) 87 | 88 | # Get the project id and dataset name where the temp tables are stored. 89 | (project_id, dataset_id, _) = cls.client.parse_table_path( 90 | cls.client.path("any_temp_table")) 91 | cls.src_dataset = ".".join([project_id, dataset_id]) 92 | 93 | def test(self): 94 | sql = self.sql_to_test.format( 95 | CDR=self.src_dataset, 96 | COHORT_QUERY="SELECT person_id FROM `{}.person` WHERE person_id <= 1002".format(self.src_dataset), 97 | MEASUREMENT_OF_INTEREST="hemoglobin") 98 | 99 | expected = [ 100 | # measurement unit N missing min max avg stddev quantiles num_numeric_values num_concept_values num_operators measurement_source measurement_concept_id unit_concept_id 101 | ("Hemoglobin", "gram per deciliter", 5, 1, 7.2, 42.0, 20.9, 18.542653531789888, [7.2, 7.2, 13.5, 42.0, 42.0], 3, 1, 1, "EHR", 123, 456) 102 | ] 103 | self.expect_query_result(query=sql, expected=expected) 104 | 105 | if __name__ == "__main__": 106 | unittest.main() 107 | 108 | -------------------------------------------------------------------------------- /storage-snippets/README.md: -------------------------------------------------------------------------------- 1 | # Cloud Storage snippets 2 | 3 | The snippets in this subdirectory are for workbench users who directly use the workspace bucket. 4 | 5 | # Get setup for GitHub 6 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#get-setup-for-github) for the details. If you are new to `git`, please see the example commands there. 7 | 8 | # How to contribute a snippet to the Cloud Storage snippets menu group 9 | 10 | 1. Write your snippet of code in your preferred language, R or Python. 11 | * Try to make your snippet consistent with other snippets in this collection. 12 | * For data wrangling, use [dplyr](https://dplyr.tidyverse.org/) for R and [pandas](https://pandas.pydata.org/) for Python.
   * Choose a good prefix and suffix for your snippet file name.
     * See the names of the other files for examples.
     * The file name helps users decide whether the snippet will be useful to them.
   * Put some comments at the top of your snippet to explain its purpose and any assumptions.
1. After you are happy with your new snippet, port it to the other language or file a GitHub issue asking for help from someone else to do this.
1. If your snippet has any inputs or parameters, add default values for those parameters to both [`snippets_setup.R`](./snippets_setup.R) and [`snippets_setup.py`](./snippets_setup.py) so that your snippet will work as-is.
1. Update [r_gcs_snippets_menu_config.yml](../build/r_gcs_snippets_menu_config.yml) and [py_gcs_snippets_menu_config.yml](../build/py_gcs_snippets_menu_config.yml) to add your snippet wherever you would like it to be displayed within the menu.
1. Send your pull request!

Don't like these conventions? We can change them! This is just a starting point. Keep in mind we'll need to reflect those changes in the auto-generation script described in the next section.

# Auto-generation of Jupyter 'Snippets Menu' configuration

The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#auto-generation-of-jupyter-snippets-menu-configuration) for the details.

# Testing

## Snippet tests
To test individual snippets, the best thing to do is copy and paste them into a notebook on the workbench.

## Integration 'smoke tests'

If the smoke tests are run from the workbench environment and there are no obvious bugs in the snippets, they will run start-to-finish without error. (This won't necessarily catch all bugs, but it's a good start.)
36 | 37 | * The script to auto-generate the Jupyter Snippets Menu configuration also emits both `r_gcs_snippets_menu_config_smoke_test.R` and `py_gcs_snippets_menu_config_smoke_test.py`. 38 | * Those scripts each include, respectively, all the R Cloud Storage snippets and all the Python Cloud Storage snippets. 39 | * Additional configuration needed for the smoke tests can be defined in [r_gcs_snippets_menu_config.smoke_test_setup](../build/r_gcs_snippets_menu_config.smoke_test_setup) and [py_gcs_snippets_menu_config.smoke_test_setup](../build/py_gcs_snippets_menu_config.smoke_test_setup), respectively. Update it as needed. 40 | 41 | After opening a notebook in the production workbench environment, upload these smoke test files into Jupyter and then execute the following code from the Jupyter terminal or a Python notebook in the same directory. They will emit _"Smoke test complete!"_ when they have completed successfully. 42 | 43 | To run the R Cloud Storage snippets smoke tests: 44 | ``` 45 | %%bash 46 | 47 | Rscript r_gcs_snippets_menu_config_smoke_test.R # There will be output, but there should be no errors. 48 | ``` 49 | 50 | To run the Python Cloud Storage snippets smoke tests: 51 | ``` 52 | %%bash 53 | 54 | python3 py_gcs_snippets_menu_config_smoke_test.py # There will be output, but there should be no errors. 55 | ``` 56 | 57 | # Deployment 58 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#deployment) for the details. 
59 | -------------------------------------------------------------------------------- /py/terra_widgets/tests/test_workspace_paths.py: -------------------------------------------------------------------------------- 1 | """Tests for the WorkspacePaths class.""" 2 | 3 | import os 4 | import unittest 5 | from terra_widgets.workspace_paths import WorkspacePaths 6 | 7 | 8 | class TestWorkspacePaths(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.wp = WorkspacePaths(workspace_bucket='fc-fake-bucket') 12 | os.environ['OWNER_EMAIL'] = 'testUser@somecompany.com' 13 | 14 | def tearDown(self): 15 | os.unsetenv('OWNER_EMAIL') 16 | 17 | def test_destinations(self): 18 | notebook_paths = ['gs://fc-fake-bucket/notebooks/test1.ipynb', 19 | 'gs://fc-fake-bucket/notebooks/test2.ipynb'] 20 | destinations = self.wp.formulate_destination_paths(notebooks=notebook_paths) 21 | self.assertSetEqual(set(destinations.keys()), set(notebook_paths)) 22 | self.assertRegex( 23 | destinations[notebook_paths[0]].html_file, 24 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test1.html') 25 | self.assertRegex( 26 | destinations[notebook_paths[0]].comment_file, 27 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test1.comment.txt') 28 | self.assertRegex( 29 | destinations[notebook_paths[1]].html_file, 30 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test2.html') 31 | self.assertRegex( 32 | destinations[notebook_paths[1]].comment_file, 33 | r'gs://fc-fake-bucket/reports/testUser@somecompany.com/\d{8}/\d{6}/test2.comment.txt') 34 | 35 | def test_fail_destinations(self): 36 | with self.assertRaisesRegex( 37 | ValueError, 38 | r'"gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html" does not match "gs://fc-fake-bucket/notebooks/\*\.ipynb"'): 39 | self.wp.formulate_destination_paths(notebooks=['gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html']) 40 | 41 | def 
test_glob_for_aou(self): 42 | input_path = 'gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000' 43 | expected = os.path.join(input_path, '*.html') 44 | self.assertEqual(self.wp.add_html_glob_to_path(input_path), expected) 45 | 46 | def test_glob_for_terra(self): 47 | wp = WorkspacePaths(workspace_bucket='fc-fake-bucket') 48 | input_path = 'gs://fc-fake-bucket/reports/test@somecompany.com/20200701/120000' 49 | expected = os.path.join(input_path, '*.html') 50 | self.assertEqual(wp.add_html_glob_to_path(input_path), expected) 51 | 52 | def test_glob_path_already_complete(self): 53 | # Pass a complete path to an HTML file when instead we should pass a partial path to it. 54 | with self.assertRaisesRegex(ValueError, '"gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html" does not match'): 55 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/120000/test1.html') 56 | 57 | def test_glob_path_missing_time(self): 58 | with self.assertRaisesRegex(ValueError, 'does not match'): 59 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/test@researchallofus.org/20200701/') 60 | 61 | def test_glob_path_missing_date(self): 62 | with self.assertRaisesRegex(ValueError, 'does not match'): 63 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/test@researchallofus.org/120000/') 64 | 65 | def test_glob_path_missing_user(self): 66 | with self.assertRaisesRegex(ValueError, 'does not match'): 67 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/reports/20200701/120000/') 68 | 69 | def test_glob_path_missing_report_folder(self): 70 | with self.assertRaisesRegex(ValueError, 'does not match'): 71 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/test@researchallofus.org/20200701/120000/') 72 | 73 | def test_glob_wrong_path(self): 74 | # Pass a path to a notebook when instead we should pass a partial path to a report. 
75 | with self.assertRaisesRegex(ValueError, '"gs://fc-fake-bucket/notebooks/test1.ipynb" does not match'): 76 | self.wp.add_html_glob_to_path('gs://fc-fake-bucket/notebooks/test1.ipynb') 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | 82 | -------------------------------------------------------------------------------- /dataset-snippets/README.md: -------------------------------------------------------------------------------- 1 | # Dataset Builder snippets 2 | 3 | This snippets in this subdirectory are for workbench users who use Dataset Builder to retrieve their data. 4 | 5 | # Get setup for GitHub 6 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#get-setup-for-github) for the details. If you are new to `git`, please see the example commands there. 7 | 8 | # How to contribute a snippet to the Dataset Builder snippets menu group 9 | 10 | 1. Write your snippet of code in your preferred language, R or Python. 11 | * Try to make your snippet consistent with other snippets in this collection. 12 | * For data wrangling, use [dplyr](https://dplyr.tidyverse.org/) for R and [pandas](https://pandas.pydata.org/) for Python. 13 | * For static plots, use [ggplot2](https://ggplot2.tidyverse.org/) for R and [plotnine](https://plotnine.readthedocs.io/en/stable/) for Python. 14 | * Choose a good prefix and suffix for your snippet file name. 15 | * See the names of the other files for examples. 16 | * The file name helps users decide whether the snippet will be useful to them. 17 | * Put some comments at the top of your snippet to explain its purpose and any assumptions. 18 | 1. After you are happy with your new snippet, port it to the other language or file a GitHub issue asking for help from someone else to do this. 19 | 1. 
If your snippet has any inputs or parameters other than a dataframe created by Dataset Builder, add default values for those parameters to both [`snippets_setup.R`](./snippets_setup.R) and [`snippets_setup.py`](./snippets_setup.py) so that your snippet will work as-is.
1. Update [r_dataset_snippets_menu_config.yml](../build/r_dataset_snippets_menu_config.yml) and [py_dataset_snippets_menu_config.yml](../build/py_dataset_snippets_menu_config.yml) to add your snippet wherever you would like it to be displayed within the menu.
1. Send your pull request!

Don't like these conventions? We can change them! This is just a starting point. Keep in mind we'll need to reflect those changes in the auto-generation script described in the next section.

# Auto-generation of Jupyter 'Snippets Menu' configuration

The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#auto-generation-of-jupyter-snippets-menu-configuration) for the details.

# Testing

## Snippet tests
To test individual snippets such as plots, the best thing to do is copy and paste them into a notebook on the workbench.

## Integration 'smoke tests'

If the smoke tests are run from the workbench environment and there are no obvious bugs in the snippets, they will run start-to-finish without error. (This won't necessarily catch all bugs, but it's a good start.)

* The script to auto-generate the Jupyter Snippets Menu configuration also emits both `r_dataset_snippets_menu_config_smoke_test.R` and `py_dataset_snippets_menu_config_smoke_test.py`.
* Those scripts each include, respectively, all the R Dataset Builder snippets and all the Python Dataset Builder snippets.
40 | * The Dataset from Dataset Builder is defined in [r_dataset_snippets_menu_config.smoke_test_setup](../build/r_dataset_snippets_menu_config.smoke_test_setup) and [py_dataset_snippets_menu_config.smoke_test_setup](../build/py_dataset_snippets_menu_config.smoke_test_setup), respectively. Update it as needed. 41 | 42 | After opening a notebook in the production workbench environment, upload these smoke test files into Jupyter and then execute the following code from the Jupyter terminal or a Python notebook in the same directory. They will emit _"Smoke test complete!"_ when they have completed successfully. 43 | 44 | To run the R Dataset Builder snippets smoke tests: 45 | ``` 46 | %%bash 47 | 48 | Rscript r_dataset_snippets_menu_config_smoke_test.R # There will be output, but there should be no errors. 49 | ``` 50 | 51 | To run the Python Dataset Builder snippets smoke tests: 52 | ``` 53 | %%bash 54 | 55 | # Any notebook '!' commands won't work in this context. Comment them out and run them explicitly first. 56 | perl -i -pe 's/!pip/#!pip/g' py_dataset_snippets_menu_config_smoke_test.py 57 | pip3 install --user pandas_profiling 58 | 59 | python3 py_dataset_snippets_menu_config_smoke_test.py # There will be output, but there should be no errors. 60 | ``` 61 | 62 | # Deployment 63 | The instructions are identical for all of the snippets collections. See [CONTRIBUTING](../CONTRIBUTING.md#deployment) for the details. 

--------------------------------------------------------------------------------
/py/terra_widgets/workspace_metadata.py:
--------------------------------------------------------------------------------
"""Methods to obtain workspace metadata for the current user in various formats."""

import json
import os
from typing import Dict

import firecloud.api as fapi
from IPython import get_ipython


class WorkspaceMetadata:
    """Encapsulate all logic for obtaining workspace metadata."""

    # Email domain that identifies All of Us (AoU) researcher accounts.
    AOU_DOMAIN = '@researchallofus.org'
    # Terra access levels that grant edit rights on a workspace.
    EDIT_ACCESS_LEVELS = ['WRITER', 'OWNER', 'PROJECT_OWNER']
    # Default AoU production workspaces endpoint; used when RW_API_BASE_URL is unset.
    AOU_PROD_API = 'https://api.workbench.researchallofus.org/v1/workspaces'

    def __init__(self):
        # NOTE(review): os.getenv returns None when OWNER_EMAIL is unset, which
        # would make the endswith() call below raise AttributeError — confirm the
        # runtime environment always provides OWNER_EMAIL.
        self.user = os.getenv('OWNER_EMAIL')
        self.terra_workspaces = fapi.list_workspaces().json()
        if self.user.endswith(self.AOU_DOMAIN):
            aou_api = os.getenv('RW_API_BASE_URL')
            if not aou_api:
                aou_api = self.AOU_PROD_API
            # Use the All of Us API to get the human-readable workspace names. For All of Us
            # workspaces, in the Terra workspace metadata the workspace names are actually
            # the AoU workspace ids.
            aou_response = get_ipython().getoutput(f'''curl -H "Content-Type: application/json" \
                -H "Authorization: Bearer $(gcloud auth print-access-token)" \
                "{aou_api}" 2>/dev/null | jq .''')
            self.aou_workspaces = json.loads(''.join(aou_response))['items']
        else:
            # Non-AoU (plain Terra) user: only the Terra metadata is available.
            self.aou_workspaces = None

    def get_workspace_name_to_id_mapping(self, include_private_readonly: bool = False, include_all: bool = False) -> Dict[str, str]:
        """Retrieve a mapping of workspace names to ids.

        Args:
          include_private_readonly: whether to include private workspaces for which the current user only has read access.
          include_all: whether to include all workspaces visible to the user
        Returns:
          A dictionary of workspace names to workspace ids.
        """
        if self.aou_workspaces:
            return {ws['workspace']['name']: ws['workspace']['id'] for ws in self.aou_workspaces
                    if include_all
                    or (include_private_readonly and not ws['workspace']['published'])
                    or ws['accessLevel'] in self.EDIT_ACCESS_LEVELS}
        else:
            return {ws['workspace']['name']: ws['workspace']['workspaceId'] for ws in self.terra_workspaces
                    if include_all
                    or (include_private_readonly and not ws['public'])
                    or ws['accessLevel'] in self.EDIT_ACCESS_LEVELS}

    def get_workspace_name_to_bucket_mapping(self, include_private_readonly: bool = False, include_all: bool = False) -> Dict[str, str]:
        """Retrieve a mapping of workspace names to Cloud Storage bucket names.

        Args:
          include_private_readonly: whether to include private workspaces for which the current user only has read access.
          include_all: whether to include all workspaces visible to the user
        Returns:
          A dictionary of workspace names to workspace bucket names.
        """
        ws_mapping = self.get_workspace_name_to_id_mapping(include_private_readonly=include_private_readonly,
                                                           include_all=include_all)
        if self.aou_workspaces:
            # For All of Us workspaces, in the Terra workspace metadata the workspace names
            # are actually the AoU workspace ids.
            terra_ws_names = ws_mapping.values()
        else:
            terra_ws_names = ws_mapping.keys()
        return {ws['workspace']['name']: ws['workspace']['bucketName'] for ws in self.terra_workspaces
                if ws['workspace']['name'] in terra_ws_names}

    def get_workspace_id_to_bucket_mapping(self, include_private_readonly: bool = False, include_all: bool = False) -> Dict[str, str]:
        """Retrieve a mapping of workspace ids to Cloud Storage bucket names.

        Args:
          include_private_readonly: whether to include private workspaces for which the current user only has read access.
          include_all: whether to include all workspaces visible to the user
        Returns:
          A dictionary of workspace ids to workspace bucket names.
        """
        ws_mapping = self.get_workspace_name_to_id_mapping(include_private_readonly=include_private_readonly,
                                                           include_all=include_all)
        if self.aou_workspaces:
            # For All of Us workspaces, in the Terra workspace metadata the workspace names
            # are actually the AoU workspace ids.
            terra_metadata_key = 'name'
        else:
            terra_metadata_key = 'workspaceId'
        return {ws['workspace'][terra_metadata_key]: ws['workspace']['bucketName'] for ws in self.terra_workspaces
                if ws['workspace'][terra_metadata_key] in ws_mapping.values()}

--------------------------------------------------------------------------------
/sql-snippets/most_recent_measurement_of_interest_test.py:
--------------------------------------------------------------------------------
"""Tests for query most_recent_measurement_of_interest.sql.

See https://github.com/verilylifesciences/analysis-py-utils for more details
about the testing framework.
5 | """ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from datetime import date 12 | from datetime import datetime 13 | from dateutil import tz 14 | from ddt import ddt 15 | import os 16 | import unittest 17 | from verily.bigquery_wrapper import bq_test_case 18 | 19 | SQL_TEMPLATE = "most_recent_measurement_of_interest.sql" 20 | 21 | 22 | @ddt 23 | class QueryTest(bq_test_case.BQTestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | """Set up class.""" 28 | super(QueryTest, cls).setUpClass(use_mocks=False) 29 | cls.sql_to_test = open( 30 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 31 | SQL_TEMPLATE), "r").read() 32 | 33 | @classmethod 34 | def create_mock_tables(cls): 35 | """Create mock tables.""" 36 | 37 | cls.client.create_table_from_query(""" 38 | SELECT * FROM UNNEST([ 39 | STRUCT 42 | (1001, '1990-12-31 00:00:00 UTC', 501), 43 | (1002, '1950-08-01 00:00:00 UTC', 500), 44 | (1003, '1965-06-30 00:00:00 UTC', 500) 45 | ]) 46 | """, cls.client.path("person")) 47 | 48 | cls.client.create_table_from_query(""" 49 | SELECT * FROM UNNEST([ 50 | STRUCT 52 | ( 0, 'No matching concept'), 53 | (123, 'Hemoglobin'), 54 | (456, 'gram per deciliter'), 55 | (500, 'FEMALE'), 56 | (501, 'MALE') 57 | ]) 58 | """, cls.client.path("concept")) 59 | 60 | cls.client.create_table_from_query(""" 61 | SELECT * FROM UNNEST([ 62 | STRUCT 64 | (1, 'EHR site1'), 65 | (2, 'EHR site1'), 66 | (3, 'PPI/PM'), 67 | (4, 'EHR site2'), 68 | (5, 'EHR site2'), 69 | (6, 'EHR site2') 70 | ]) 71 | """, cls.client.path("measurement_ext")) 72 | 73 | cls.client.create_table_from_query(""" 74 | SELECT * FROM UNNEST([ 75 | STRUCT 87 | (1, 1001, 123, 456, NULL, '2005-12-31', '2005-12-31 10:30:00 UTC', NULL, 42.0, NULL, 0, 999), 88 | (2, 1001, 123, 456, NULL, '2007-09-11', '2007-09-11 08:00:00 UTC', NULL, 13.5, NULL, 0, 999), 89 | (3, 1001, 123, 456, NULL, '2007-09-11', '2007-09-11 20:59:00 UTC', 
NULL, NULL, 100, 0, 999), 90 | (4, 1002, 123, 456, NULL, '2008-02-10', '2008-02-10 23:30:00 UTC', NULL, NULL, NULL, 0, 999), 91 | (5, 1002, 123, 456, 789, '2008-02-10', '2008-02-10 23:30:00 UTC', NULL, 7.2, NULL, 0, 999), 92 | # This measurement is for someone not in our cohort. 93 | (6, 1003, 123, 456, 789, '2010-01-01', '2010-10-01 23:30:00 UTC', NULL, 500, NULL, 0, 999) 94 | ]) 95 | """, cls.client.path("measurement")) 96 | 97 | # Get the project id and dataset name where the temp tables are stored. 98 | (project_id, dataset_id, _) = cls.client.parse_table_path( 99 | cls.client.path("any_temp_table")) 100 | cls.src_dataset = ".".join([project_id, dataset_id]) 101 | 102 | def test(self): 103 | sql = self.sql_to_test.format( 104 | CDR=self.src_dataset, 105 | COHORT_QUERY="SELECT person_id FROM `{}.person` WHERE person_id <= 1002".format(self.src_dataset), 106 | MEASUREMENT_CONCEPT_ID=123, 107 | UNIT_CONCEPT_ID=456) 108 | 109 | expected = [ 110 | # person_id birth_datetime sex_at_birth src_id measurement_concept_id unit_concept_id measurement_date measurement_datetime measurement_type_concept_id operator_concept_id value_as_number value_as_concept_id range_low range_high 111 | (1001, datetime(1990, 12, 31, 0, 0, tzinfo=tz.gettz("UTC")), "MALE", "PPI/PM", 123, 456, date(2007, 9, 11), datetime(2007, 9, 11, 20, 59, tzinfo=tz.gettz("UTC")), None, None, None, 100, 0, 999), 112 | (1002, datetime(1950, 8, 1, 0, 0, tzinfo=tz.gettz("UTC")), "FEMALE", "EHR site2", 123, 456, date(2008, 2, 10), datetime(2008, 2, 10, 23, 30, tzinfo=tz.gettz("UTC")), None, 789, 7.2, None, 0, 999) 113 | ] 114 | self.expect_query_result(query=sql, expected=expected) 115 | 116 | if __name__ == "__main__": 117 | unittest.main() 118 | 119 | -------------------------------------------------------------------------------- /py/terra_widgets/workspace_paths.py: -------------------------------------------------------------------------------- 1 | """Methods to obtains paths to files within the workspace 
bucket.""" 2 | 3 | import datetime 4 | import fnmatch 5 | import os 6 | from typing import Dict 7 | from typing import List 8 | from typing import NamedTuple 9 | 10 | 11 | WorkspaceDestination = NamedTuple('WorkspaceDestination', [('html_file', str), ('comment_file', str)]) 12 | 13 | 14 | class WorkspacePaths: 15 | """Encapsulate all logic for manipulating workspace paths. 16 | 17 | Paths are of the form: 18 | gs:///reports///