├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ └── ml4cvd-issue-template.md └── workflows │ ├── RELEASE.md │ ├── increment-version.yml │ ├── publish-to-gcr-ghcr.yml │ ├── publish-to-pypi.yml │ ├── publish-to-terra.yml │ └── python-package.yml ├── .gitignore ├── .lfsconfig ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── NOTICE.txt ├── README.md ├── RECIPE_EXAMPLES.md ├── benchmarks ├── BENCHMARKS.md ├── benchmark.py └── data.py ├── docker ├── DOCKER.md ├── ml4h_deploy │ ├── Dockerfile │ ├── README.md │ └── process_files.py └── vm_boot_images │ ├── CREATE_DOCKER_IMAGES.md │ ├── Dockerfile │ ├── build.sh │ └── config │ ├── fastai-requirements.txt │ ├── fastai.sh │ ├── pyukbb.sh │ ├── tensorflow-requirements.txt │ └── ubuntu.sh ├── git_secrets_provider_ml4h.txt ├── go.mod ├── go.sum ├── ingest ├── bulkprocess │ ├── README.md │ ├── dicom-metadata.go │ ├── dicom-overlay.go │ ├── field-ids.go │ ├── output.go │ ├── renamer.go │ └── zip-metadata.go ├── cmd │ ├── batcher │ │ ├── batcher.linux │ │ ├── dicom.go │ │ ├── functions.go │ │ └── main.go │ ├── build_curl_command.py │ ├── dicom2jpeg │ │ ├── dicom2jpeg.linux │ │ └── main.go │ ├── downloader │ │ ├── downloader.linux │ │ └── main.go │ ├── gene2chrpos │ │ ├── gene2chrpos.osx │ │ ├── lookups │ │ │ ├── ensembl.grch37.p13.genes │ │ │ └── url.txt │ │ └── main.go │ ├── manifester │ │ ├── main.go │ │ └── manifester.linux │ └── merge-lvef │ │ ├── main.go │ │ └── merge-lvef.linux ├── partners_ecg │ ├── organize_xmls.py │ └── remove_xml_duplicates.py └── ukbb_csv_bigquery │ ├── README.md │ ├── censor │ ├── censor.go │ ├── censor_result.go │ ├── main.go │ ├── query_single.go │ └── time_handling.go │ ├── convertcoding │ ├── cc_test.go │ └── main.go │ ├── convertdict │ ├── cd_test.go │ └── main.go │ ├── convertpheno │ ├── flagslice.go │ └── main.go │ ├── convertsample │ └── main.go │ ├── decrypt_all.sh │ ├── do_all.sh │ ├── firstdate │ ├── README.md │ └── main.go │ ├── importcensor │ ├── censor.json │ └── import.sh │ ├── importcoding │ ├── coding.json │ └── import.sh │ ├── importdict │ ├── dictionary.json │ └── import.sh │ ├── importhesin │ ├── hesin.json │ ├── hesin_diag.json │ ├── hesin_diag10.json │ ├── hesin_diag9.json │ ├── hesin_lubitz.json │ ├── hesin_oper.json │ └── import.sh │ ├── importpheno │ ├── append.sh │ └── phenotype.json │ ├── importsample │ ├── import.sh │ └── sample.json │ ├── inspect_screenshot.png │ └── preprocessing_data.ipynb ├── ml4h ├── DATA_MODELING_TESTS.md ├── DatabaseClient.py ├── TensorMap.py ├── __init__.py ├── applications │ ├── feature_selection │ │ ├── 2020.11.30_analysis_cleaned2.r │ │ ├── coxnet_training_testing_evaluating.py │ │ └── xgboost_training_testing_evaluating.py │ ├── ingest │ │ ├── ingest_autosegment.py │ │ ├── ingest_mri.py │ │ ├── ingest_xml_metadata.py │ │ ├── requirements.txt │ │ └── two_d_projection.py │ └── jpp_inference_rv │ │ ├── README.md │ │ ├── infer_on_sax.py │ │ └── infer_to_hd5_local.sh ├── arguments.py ├── data_descriptions.py ├── defines.py ├── explorations.py ├── hypertuning.py ├── logger.py ├── make_tensor_maps_for_partners_ecg_labels.py ├── metrics.py ├── ml4ht_integration │ ├── __init__.py │ ├── tensor_generator.py │ └── tensor_map.py ├── models │ ├── Block.py │ ├── __init__.py │ ├── basic_blocks.py │ ├── conv_blocks.py │ ├── diffusion_blocks.py │ ├── inspect.py │ ├── layer_wrappers.py │ ├── legacy_models.py │ ├── merge_blocks.py │ ├── model_factory.py │ ├── perceiver_blocks.py │ ├── pretrained_blocks.py │ ├── train.py │ ├── train_diffusion.py │ ├── 
transformer_blocks.py │ └── transformer_blocks_embedding.py ├── normalizer.py ├── optimizers.py ├── plots.py ├── recipes.py ├── runtime_data_defines.py ├── tensor_generators.py ├── tensorize │ ├── PARTNERS.md │ ├── README.md │ ├── TENSORIZE.md │ ├── __init__.py │ ├── dataflow │ │ ├── __init__.py │ │ ├── bigquery_ukb_queries.py │ │ ├── fieldids.json │ │ ├── load_fieldids.sh │ │ ├── ml4h_dataflow.yml │ │ └── requirements_ml4h_dataflow.txt │ ├── merge_hd5s.py │ ├── tensor_writer_mgb.py │ ├── tensor_writer_ukbb.py │ └── tensorize_dataflow.py ├── tensormap │ ├── __init__.py │ ├── celeba.py │ ├── gatk.py │ ├── general.py │ ├── mgb │ │ ├── __init__.py │ │ ├── dynamic.py │ │ ├── ecg.py │ │ └── xdl.py │ ├── mnist.py │ ├── tensor_map_maker.py │ ├── text.py │ └── ukb │ │ ├── __init__.py │ │ ├── categorical.py │ │ ├── continuous.py │ │ ├── demographics.py │ │ ├── disease.py │ │ ├── dxa.py │ │ ├── ecg.py │ │ ├── embedding.py │ │ ├── genetics.py │ │ ├── mri.py │ │ ├── mri_brain.py │ │ ├── mri_ecg.py │ │ ├── mri_vtk.py │ │ └── survival.py ├── test_utils.py └── visualization_tools │ ├── __init__.py │ ├── annotation_storage.py │ ├── annotations.py │ ├── annotations_schema.json │ ├── batch_image_annotations.py │ ├── dicom_interactive_plots.py │ ├── dicom_plots.py │ ├── ecg_interactive_plots.py │ ├── ecg_reshape.py │ ├── ecg_static_plots.py │ ├── facets.py │ └── hd5_mri_plots.py ├── model_zoo ├── DROID-MVP │ ├── droid_mvp_checkpoint │ │ ├── checkpoint │ │ ├── chkp.data-00000-of-00001 │ │ └── chkp.index │ ├── droid_mvp_inference.py │ ├── droid_mvp_model_description.py │ ├── movinet_a2_base │ │ ├── checkpoint │ │ ├── ckpt-1.data-00000-of-00001 │ │ └── ckpt-1.index │ └── readme.md ├── DROID-RV │ ├── droid_rv_checkpoint │ │ ├── checkpoint │ │ ├── chkp.data-00000-of-00001 │ │ └── chkp.index │ ├── droid_rv_inference.py │ ├── droid_rv_model_description.py │ ├── droid_rvef_checkpoint │ │ ├── checkpoint │ │ ├── chkp.data-00000-of-00001 │ │ └── chkp.index │ ├── movinet_a2_base │ │ ├── checkpoint │ │ ├── ckpt-1.data-00000-of-00001 │ │ └── ckpt-1.index │ └── readme.md ├── DROID │ ├── README.md │ ├── data_descriptions │ │ ├── __init__.py │ │ ├── echo.py │ │ └── wide_file.py │ ├── echo_defines.py │ ├── echo_supervised_inference_recipe.py │ ├── echo_supervised_training_recipe.py │ ├── echo_to_lmdb.py │ ├── encoders │ │ ├── LA_DROID │ │ │ └── model │ │ │ │ ├── checkpoint │ │ │ │ ├── chkp.data-00000-of-00001 │ │ │ │ └── chkp.index │ │ └── LV_DROID │ │ │ └── model │ │ │ ├── checkpoint │ │ │ ├── chkp.data-00000-of-00001 │ │ │ └── chkp.index │ └── model_descriptions │ │ ├── __init__.py │ │ └── echo.py ├── ECG2AF │ ├── README.md │ ├── architecture.png │ ├── ecg2af_infer.ipynb │ ├── ecg2af_quintuplet_v2024_01_13.keras │ ├── ecg_5000_survival_curve_af_quadruple_task_mgh_v2021_05_21.h5 │ ├── km.jpg │ ├── salience.jpg │ ├── strip_II_survival_curve_af_v2021_06_15.h5 │ ├── strip_I_survival_curve_af_v2021_06_15.h5 │ └── study_design.jpg ├── ECG_PheWAS │ ├── README.md │ ├── decoder_median.h5 │ ├── decoder_median.keras │ ├── ecg_median_autoencoder_latent_space_infer.ipynb │ ├── ecg_median_autoencoder_reconstruct_phecode.ipynb │ ├── ecg_write_biosppy_medians.ipynb │ ├── encoder_median.h5 │ ├── encoder_median.keras │ ├── latent_space_phewas.ipynb │ ├── mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.h5 │ ├── mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.keras │ ├── pandas_boxplots_phewas.ipynb │ ├── phecode_projection.ipynb │ └── ukb_phewas.png ├── PCLR │ ├── PCLR.h5 │ ├── PCLR │ │ ├── saved_model.pb │ │ └── variables │ │ │ 
├── variables.data-00000-of-00001 │ │ │ └── variables.index │ ├── PCLR_lead_I │ │ ├── saved_model.pb │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ ├── PCLR_lead_II │ │ ├── saved_model.pb │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ ├── README.md │ ├── build_model.py │ ├── get_representations.py │ ├── preprocess_ecg.py │ └── requirements.txt ├── README.md ├── adiposity_mlandepi │ ├── README.md │ ├── compute_projections.py │ ├── downstream_associations_v3.ipynb │ ├── ingest.py │ ├── shrinkage_loss.py │ └── train.py ├── cardiac_mri_derived_left_ventricular_mass │ ├── Lreg.png │ ├── Lseg.png │ ├── README.md │ ├── arguments_2020-11-13_11-47.txt │ ├── calibrations_sax_all_diastole_segmented.png │ ├── metric_history_sax_diastole_segment_no_flat.png │ ├── per_class_roc_sax_all_diastole_segmented.png │ ├── precision_recall_sax_all_diastole_segmented.png │ └── sax_diastole_segment_no_flat.h5 ├── dropfuse │ ├── README.md │ ├── decoder_ecg_rest_median_raw_10.h5 │ ├── decoder_lax_4ch_heart_center.h5 │ ├── dropout_pair_contrastive_lax_4ch_cycle_ecg_median_10_pretrained_256d_v2020_06_07.h5 │ ├── encoder_ecg_rest_median_raw_10.h5 │ ├── encoder_lax_4ch_heart_center.h5 │ └── overview.png ├── left_ventricular_mass_from_ecg_student_and_mri_teacher │ ├── README.md │ ├── TrainingAndTestSets.jpg │ ├── ecg_rest_raw_age_sex_bmi_lvm_asymmetric_loss.h5 │ ├── ecg_rest_raw_lvm_asymmetric_loss.h5 │ └── ecg_rest_raw_lvm_symmetric_loss.h5 ├── liver_fat_from_mri_ukb │ ├── README.md │ ├── liver_fat_from_echo.h5 │ ├── liver_fat_from_echo_teacher_model.png │ ├── liver_fat_from_ideal.h5 │ └── liver_fat_from_ideal_student_model.png ├── mi_feature_selection │ ├── 2020.11.30_analysis_cleaned2.ipynb │ ├── 2020.11.30_analysis_cleaned2.r │ ├── README.md │ ├── coxnet_training_testing_evaluating.py │ ├── models │ │ ├── coxnet_survival_05_final.pickle │ │ └── xgcox_model.json │ ├── requirements.txt │ └── xgboost_training_testing_evaluating.py ├── registration_reveals_genetics │ ├── README.md │ ├── latent_space_comparisons.ipynb │ ├── registration.png │ ├── table1.png │ └── table2.png └── silhouette_mri │ ├── README.md │ ├── callbacks.py │ ├── shrinkage_loss.py │ └── train_models.py ├── notebooks ├── ML4H_ML_intro.ipynb ├── ML4H_Model_Factory_Intro.ipynb ├── latent_space_bias_detection.ipynb ├── mnist_survival_analysis_demo.ipynb ├── review_results │ ├── identify_a_sample_to_review.ipynb │ ├── image_annotations.ipynb │ ├── review_one_sample.ipynb │ └── test_error_handling_for_notebook_visualizations.ipynb ├── terra_featured_workspace │ ├── generate_synthetic_tabular_data.ipynb │ ├── image_annotations_demo.ipynb │ ├── ml4h_setup.ipynb │ ├── review_model_results_interactive.ipynb │ ├── review_one_sample_interactive.ipynb │ └── workspace_description.md └── typecast_column_for_hesin.ipynb ├── phenotype_labels ├── disease │ ├── DATES.md │ ├── README.md │ ├── main.go │ ├── materialized_hesin_dates.sql │ ├── materialized_special_dates.sql │ ├── result.go │ ├── special_fields.go │ ├── tabfile.go │ └── time_handling.go └── phecodes │ ├── README.md │ ├── __init__.py │ ├── icd10.json │ ├── load_phecodes.sh │ ├── map_phecodes.py │ ├── phecode_dictionary.json │ └── phecode_mapping.json ├── pylintrc ├── scripts ├── create_dev_dataset.py ├── detach_disk.sh ├── jupyter.sh ├── latent_space_gwas.py ├── merge_hd5s.sh ├── tensorize.sh ├── tf.sh ├── train_subsets.sh ├── validate_tensors.sh ├── vm_image │ └── ml4cvd-image.sh ├── vm_launch │ ├── launch_dl_instance.sh │ ├── 
launch_instance.sh │ └── run_once.sh └── vm_start.sh ├── setup.py └── tests ├── conftest.py ├── ml4ht_integration └── test_tensor_map.py ├── test_arguments.py ├── test_models.py ├── test_recipes.py └── test_tensor_generators.py /.github/ISSUE_TEMPLATE/ml4cvd-issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ml4h issue template 3 | about: default issue template 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **What** 11 | Summarize the issue in 1-2 sentences. 12 | 13 | **Why** 14 | Describe why this issue should be solved, the new feature implemented, etc. 15 | 16 | **How** 17 | Give a high-level overview of how you propose to address it. 18 | 19 | **Acceptance Criteria** 20 | Unambiguous milestones; if any are incomplete, the PR cannot be merged. 21 | -------------------------------------------------------------------------------- /.github/workflows/RELEASE.md: -------------------------------------------------------------------------------- 1 | Release process 2 | 3 | 4 | For PRs and after merge, testing is run with: 5 | [python-package.yml](python-package.yml) 6 | 7 | 8 | 9 | Manually navigate to GitHub's Releases page and select Draft a new release. 10 | https://github.com/broadinstitute/ml4h/releases 11 | 12 | This process should automatically kick off the following workflows: 13 | 14 | Creation of updated Docker images from the CPU and GPU base images, published to GCR and GHCR 15 | [publish-to-gcr-ghcr.yml](publish-to-gcr-ghcr.yml) 16 | 17 | Images are named: 18 | tf2.9-latest-cpu 19 | tf2.9-latest-gpu 20 | And can be found on [GitHub's Container Registry](https://github.com/broadinstitute/ml4h/pkgs/container/ml4h) 21 | 22 | Updating of the ml4h library and publishing it to PyPI 23 | [publish-to-pypi.yml](publish-to-pypi.yml) 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/increment-version.yml: -------------------------------------------------------------------------------- 1 | name: Increment version 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | update_version: 12 | if: ${{ github.event.release.tag_name != '' }} 13 | name: Update version 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Check out source code 17 | uses: actions/checkout@v4 18 | env: 19 | GIT_LFS_SKIP_SMUDGE: 1 20 | 21 | - name: Checkout main for version edit 22 | run: | 23 | export GIT_LFS_SKIP_SMUDGE=1 24 | # Note: the following account information will not work on GHES 25 | export GIT_LFS_SKIP_SMUDGE=1 26 | git config --global user.name "github-actions[bot]" 27 | git config --global user.email {user.id}+{user.login}@users.noreply.github.com 28 | git fetch 29 | git checkout main 30 | 31 | - name: Replace string in file 32 | run: | 33 | grep "version" setup.py 34 | if [[ ${{ github.event.release.tag_name }} =~ [v0-9.]* ]]; then 35 | sed -i "s/version='[v0-9.]*',/version='${{ github.event.release.tag_name }}',/g" setup.py 36 | else 37 | echo "Tag is an unexpected value and no version update will occur" 38 | fi 39 | 40 | - name: Check for version update 41 | run: cat setup.py 42 | 43 | - name: Push to git 44 | run: | 45 | git config lfs.https://github.com/broadinstitute/ml4h.locksverify false 46 | git config lfs.https://github.com/broadinstitute/ml4h.git.locksverify false 47 | git remote set-url origin https://${{ secrets.GHCR_USERNAME }}:${{ secrets.GHCR_TOKEN }}@github.com/${{ github.repository }} 48 | git add setup.py 49
| git commit -m "Version bump to ${{ github.event.release.tag_name }}" 50 | git push 51 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-gcr-ghcr.yml: -------------------------------------------------------------------------------- 1 | name: Push to GCR/GHCR GitHub Action 2 | on: 3 | push: 4 | tags: 5 | - '*' # Push events to every tag not containing / 6 | workflow_dispatch: 7 | 8 | 9 | jobs: 10 | build-and-push-to-gcr-service-account: 11 | name: Build & push to GCR/GHCR 12 | runs-on: self-hosted 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Authenticate to Google Cloud 16 | id: auth 17 | uses: google-github-actions/auth@v2 18 | with: 19 | credentials_json: '${{ secrets.B64_GCLOUD_SERVICE_ACCOUNT_JSON }}' 20 | - name: Building and pushing the image 21 | run: | 22 | echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u "${{ secrets.GHCR_USERNAME }}" --password-stdin 23 | yes | gcloud auth configure-docker gcr.io 24 | docker system prune --all --force 25 | ./docker/vm_boot_images/build.sh -P 26 | docker system prune --all --force 27 | ./docker/vm_boot_images/build.sh -c -P 28 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' # Push events to every tag not containing / 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Build distribution 📦 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.12] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | pip cache purge 26 | python -m pip install --upgrade pip 27 | # Install the ml4h Python package. 28 | pip install . 29 | pip install -r docker/vm_boot_images/config/tensorflow-requirements.txt 30 | - name: Install pypa/build 31 | run: >- 32 | python -m pip install build --user 33 | - name: Build a binary wheel and a source tarball 34 | run: python -m build 35 | - name: Store the distribution packages 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: python-package-distributions-${{ matrix.python-version }} 39 | path: dist/ 40 | 41 | publish-to-pypi: 42 | name: >- 43 | Publish Python 🐍 distribution 📦 to PyPI 44 | needs: 45 | - build 46 | runs-on: ubuntu-latest 47 | environment: 48 | name: pypi 49 | url: https://pypi.org/p/ml4h 50 | permissions: 51 | id-token: write # IMPORTANT: mandatory for trusted publishing 52 | 53 | steps: 54 | - name: Download all the dists 55 | uses: actions/download-artifact@v4 56 | with: 57 | name: python-package-distributions-3.12 58 | path: dist/ 59 | - name: Publish distribution 📦 to PyPI 60 | uses: pypa/gh-action-pypi-publish@release/v1 61 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install the ml4h Python package and run its tests. 
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Test ml4h Python package 5 | 6 | on: 7 | workflow_dispatch: 8 | # Allows manually triggering workflow in GitHub UI on selected branch. 9 | # GitHub doc: https://docs.github.com/en/free-pro-team@latest/actions/reference/events-that-trigger-workflows#workflow_dispatch. 10 | # GitHub blog demo: https://github.blog/changelog/2020-07-06-github-actions-manual-triggers-with-workflow_dispatch/. 11 | 12 | push: 13 | branches: [ master ] 14 | 15 | pull_request: 16 | branches: [ master ] 17 | 18 | jobs: 19 | build: 20 | 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: [3.11, 3.12] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | - name: Install dependencies 33 | run: | 34 | pip cache purge 35 | python -m pip install --upgrade pip 36 | # Install the ml4h Python package. 37 | pip install . 38 | - name: Test with pytest and pytest-xdist 39 | run: | 40 | pytest tests -m "not slow" -n auto 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .idea/* 4 | *.linux 5 | *.log 6 | **/__pycache__/* 7 | trained_models/* 8 | recipes_output/* 9 | .ipynb_checkpoints 10 | ml4h.egg-info/* 11 | .Rproj.user 12 | docker/terra_image/*/** 13 | build/ 14 | dist/ 15 | -------------------------------------------------------------------------------- /.lfsconfig: -------------------------------------------------------------------------------- 1 | [lfs] 2 | url = git@github.com:broadinstitute/ml4h.git 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/kynan/nbstripout 3 | rev: 0.6.1 4 | hooks: 5 | - id: nbstripout 6 | files: ".ipynb" 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.4.0 9 | hooks: 10 | - id: trailing-whitespace 11 | files: ".py" 12 | - id: end-of-file-fixer 13 | files: ".py" 14 | - id: debug-statements 15 | files: ".py" 16 | - repo: https://github.com/Lucas-C/pre-commit-hooks 17 | rev: v1.3.1 18 | hooks: 19 | - id: remove-tabs 20 | files: ".py" 21 | - repo: https://github.com/asottile/add-trailing-comma 22 | rev: v2.4.0 23 | hooks: 24 | - id: add-trailing-comma 25 | exclude: "ml4h/tensormap/ukb/by_script.py" 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | ML4H is released under the following BSD 3-Clause License: 2 | 3 | Copyright (c) 2025, Broad Institute, Inc. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 
14 | 15 | * Neither the name Broad Institute, Inc. 16 | nor the names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include docker/vm_boot_images/config/tensorflow-requirements.txt -------------------------------------------------------------------------------- /benchmarks/BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # Data generation benchmarks :straight_ruler: 2 | Benchmarks keep track of how quickly we can produce data for training models. 3 | To run all benchmarks: 4 | ```bash 5 | python benchmarks/benchmark.py 6 | ``` 7 | 8 | 9 | ## Index 10 | * [Running benchmarks](#running-benchmarks) 11 | * [Contributing benchmarks](#contributing-benchmarks) 12 | 13 | 14 | ## Running benchmarks 15 | Benchmarks are run using [benchmark.py](./benchmark.py). 16 | You can run specific benchmarks using 17 | ```bash 18 | python benchmarks/benchmark.py [name of benchmark] [name of another benchmark] ... 19 | ``` 20 | The names of the currently available benchmarks are the directory names under [benchmark_results](./benchmark_results). 21 | 22 | ## Contributing benchmarks 23 | 24 | There are two components to a benchmark: 25 | 1. [The type of data](#data-descriptions) 26 | 2. [The way the data is iterated over](#generatorfactories) 27 | 28 | Both can be tinkered with. 29 | 30 | ### Data descriptions 31 | Synthetic data is produced in [data.py](./data.py) using the function `data.build_example`. 32 | A synthetic datum is described by a 3-tuple 33 | ``` 34 | [name, shape, data type] 35 | ``` 36 | For example, the `ecg_single_task` benchmark simulates reading ECG-BMI pairs, 37 | so it has two data descriptions: 38 | ```python 39 | ('ecg', (5000, 12), StorageType.CONTINUOUS), 40 | ('bmi', (1,), StorageType.CONTINUOUS), 41 | ``` 42 | 43 | ### GeneratorFactories 44 | `GeneratorFactory`s prepare the data for a class of generators and produce data generators that use that data. 45 | For example, `benchmark.TensorGeneratorFactory` builds `ml4h.TensorGenerator`s with `hd5`s to store data. 46 | `GeneratorFactory`s must implement a `setup` function which prepares the synthetic data given a `DataDescription` 47 | and a number of synthetic samples to build. 48 | 49 | `GeneratorFactory`s must also implement `__call__(batch_size: int, num_workers: int)` 50 | which produces a data generator with that batch size and number of multiprocessing workers.
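To make the interface concrete, here is a minimal sketch of a factory that keeps its synthetic data in memory. The class name, the plain-dict batches, and the in-memory storage are illustrative assumptions for this sketch, not the actual implementations in [benchmark.py](./benchmark.py):
```python
from typing import Iterator, List, Tuple

import numpy as np

# A DataDescription here is the 3-tuple described above: (name, shape, data type).
# The data type is left as a plain object because this sketch never inspects it.
DataDescription = Tuple[str, tuple, object]


class InMemoryGeneratorFactory:
    """Illustrative factory: builds synthetic samples in setup, then yields random batches."""

    def setup(self, data_descriptions: List[DataDescription], num_samples: int) -> None:
        # Prepare `num_samples` synthetic examples, one array per data description.
        self.samples = [
            {name: np.random.random(shape) for name, shape, _ in data_descriptions}
            for _ in range(num_samples)
        ]

    def __call__(self, batch_size: int, num_workers: int) -> Iterator[list]:
        # num_workers is ignored in this single-process sketch; batch_size must
        # not exceed the number of samples prepared in setup.
        def generate():
            while True:
                idx = np.random.choice(len(self.samples), size=batch_size, replace=False)
                yield [self.samples[i] for i in idx]

        return generate()
```
A real factory would typically write its data to disk in `setup` (as `TensorGeneratorFactory` does with `hd5`s) and use `num_workers` to parallelize the generator it returns.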
51 | -------------------------------------------------------------------------------- /docker/DOCKER.md: -------------------------------------------------------------------------------- 1 | # ML4H Docker 2 | 3 | ## Editing and pushing the docker 4 | 5 | To edit the packages inside the ML4H docker container, first edit: 6 | ``` 7 | ml4h/docker/vm_boot_images/config/tensorflow-requirements.txt 8 | ``` 9 | Add a line for each package, with optional version numbers. 10 | 11 | Then, the Docker image should be pushed to both the [Google Container Registry](https://console.cloud.google.com/gcr/images/broad-ml4cvd/GLOBAL/deeplearning) and the [GitHub Container Registry (GHCR)](https://github.com/broadinstitute/ml4h/pkgs/container/ml4h). 12 | 13 | For GHCR, you will need to generate a [personal access token](https://github.com/settings/tokens) on GitHub and grant docker access: 14 | ``` 15 | docker login ghcr.io -u GITHUB_USERNAME -p ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 16 | ``` 17 | 18 | Finally, use the ```build.sh``` script to build, tag, and push an ML4H image. Use ```-c``` for the CPU-only image: 19 | ``` 20 | cd ml4h 21 | ./docker/vm_boot_images/build.sh -P 22 | ./docker/vm_boot_images/build.sh -c -P 23 | ``` 24 | Note that each image will have two tags: a short unique SHA1 tag from ```HEAD```, and either ```tf2.9-latest-gpu``` or ```tf2.9-latest-cpu```. 25 | -------------------------------------------------------------------------------- /docker/ml4h_deploy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM us-central1-docker.pkg.dev/broad-ml4cvd/deeplearning/ml4h:tf2.19-latest-cpu 2 | 3 | # Set the working directory 4 | WORKDIR /app 5 | 6 | # Install TensorFlow (or any other necessary libraries) 7 | RUN pip install tensorflow 8 | 9 | # Copy the Keras model file into the Docker image 10 | COPY ecg_5000_hf_quintuplet_dropout_v2023_04_17.keras /app/ecg_5000_hf_quintuplet_dropout_v2023_04_17.keras 11 | 12 | # Copy the Python script 13 | COPY process_files.py /app/process_files.py 14 | 15 | RUN pip install ml4h 16 | 17 | # Define the command to run the script 18 | CMD ["python", "process_files.py", "/data"] -------------------------------------------------------------------------------- /docker/ml4h_deploy/README.md: -------------------------------------------------------------------------------- 1 | # Make a deployment docker with a model from the Model Factory 2 | Edit `Dockerfile` and `process_files.py` to copy and load your `.keras` model file. 3 | Then build the docker image: 4 | ```bash 5 | docker build -t ecg2hf_finngen_deploy . 6 | ``` 7 | Then run the docker image: 8 | ```bash 9 | docker run -v /home/sam/ecg_xml:/data -v /home/sam:/output ecg2hf_finngen_deploy 10 | ``` 11 | If it works, you should see the output in `/home/sam`. Then save your docker image as a tarball: 12 | ```bash 13 | docker save ecg2hf_finngen_deploy:latest -o ecg2hf_finngen_deploy.tar 14 | ``` 15 | 16 | ## Deploy to FinnGEN 17 | Download the tarball (possibly a huge 20GB+ file). Then split it into smaller files, because FinnGEN has a limit of 5GB per file: 18 | ```bash 19 | split -b 2300M ecg2hf_finngen_deploy.tar ecg2hf_finngen_deploy_part_ 20 | ``` 21 | Log in to your FinnGEN account and navigate to the green bucket Google Console page. 22 | The address depends on the sandbox version.
Currently, it is at: [https://console.cloud.google.com/storage/browser/fg-production-sandbox-6_greenuploads/sam](https://console.cloud.google.com/storage/browser/fg-production-sandbox-6_greenuploads/sam). 23 | Upload all the parts here. Then after they pass the virus scan, which takes ~20 minutes, they will show up in your FinnGEN IVM at the path `/finngen/green/sam`. 24 | You can replace `sam` with any folder name you want, but must be consistent between the upload and the IVM path. 25 | 26 | 27 | More docs are here: [https://docs.finngen.fi/working-in-the-sandbox/quirks-and-features/how-to-upload-to-your-own-ivm-via-finngen-green](https://docs.finngen.fi/working-in-the-sandbox/quirks-and-features/how-to-upload-to-your-own-ivm-via-finngen-green) 28 | 29 | Once all the pieces have been uploaded, reassemble them in the sandbox: 30 | ```bash 31 | cd /finngen/green/sam 32 | cat ecg2hf_finngen_deploy_part_* > ~/ecg2hf_finngen_deploy.tar 33 | ``` 34 | 35 | Load the docker image: 36 | ```bash 37 | cd ~ 38 | docker load -i ecg2hf_finngen_deploy.tar 39 | ``` 40 | Then run the docker image: 41 | ``` 42 | docker run -v /finngen/library-red/EAS_HEART_FAILURE_1.0/data/ecg:/data -v /home/ivm/output:/output ecg2hf_finngen_deploy 43 | ``` -------------------------------------------------------------------------------- /docker/vm_boot_images/Dockerfile: -------------------------------------------------------------------------------- 1 | # The suggested base images are: 2 | # - ufoym/deepo:all-py36-jupyter for GPU-enabled machines 3 | # - ufoym/deepo:all-py36-jupyter-cpu for CPU-only (non-GPU-enabled) machines 4 | # BASE_IMAGE can be specified at build time by adding the following argument: 5 | # --build_arg BASE_IMAGE="some_other_image" 6 | 7 | ARG BASE_IMAGE 8 | FROM ${BASE_IMAGE} 9 | 10 | LABEL maintainer="Sam Freesun Friedman " 11 | 12 | # Setup time zone (or else docker build hangs) 13 | ENV TZ=America/New_York 14 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 15 | 16 | COPY ./config/* /app/ 17 | WORKDIR /app 18 | 19 | # Note that some layers are kept separate to encourage layer re-use and to try 20 | # to minimize full recompilation where possible. 21 | 22 | # Basic setup 23 | #RUN rm /etc/apt/sources.list.d/cuda.list 24 | ##RUN rm /etc/apt/sources.list.d/nvidia-ml.list 25 | #RUN apt-key del 7fa2af80 26 | #RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub 27 | #RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub 28 | RUN ./ubuntu.sh 29 | 30 | # Point any MLflow tracking hooks at the main MLflow instance on Cloud Run 31 | ENV MLFLOW_TRACKING_URI='https://mlflow-783282864357.us-central1.run.app' 32 | 33 | # FastAI. See the Developer Install under https://github.com/fastai/fastai/ to 34 | # understand this odd sequence of installing then uninstalling fastai before 35 | # installing it from github. (Basically, to get its deps.) 
36 | # RUN pip3 install -r fastai-requirements.txt 37 | # RUN pip3 uninstall -y fastai 38 | # RUN ./fastai.sh 39 | 40 | RUN apt-get update 41 | RUN apt-get upgrade -y 42 | RUN apt-get install python3 python3-pip python3-tk libgl1-mesa-glx libxt-dev -y 43 | RUN apt-get install -y wget unzip curl python3-pydot graphviz git ffmpeg 44 | 45 | # Requirements for the tensorflow project 46 | RUN pip3 install --upgrade pip 47 | #RUN pip3 install -r pre_requirements.txt 48 | RUN pip3 install -r tensorflow-requirements.txt 49 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/fastai-requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html 2 | torch_nightly 3 | fastai 4 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/fastai.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install the github repo version 4 | git clone https://github.com/fastai/fastai 5 | cd fastai 6 | 7 | # Peg our version to a known-working SHA, since they make 8 | # post-1.0 breaking changes literally every day... 9 | git reset --hard 14868ca69483afbaa8e28d4e281c148d1dad1c89 10 | 11 | tools/run-after-git-clone 12 | pip install -e .[dev] 13 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/pyukbb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fetch the latest available version 4 | wget https://storage.googleapis.com/ml4cvd/ml4cvd-master.zip 5 | unzip ml4cvd-master.zip 6 | cd ml4cvd-master/pyukbb 7 | 8 | tools/run-after-git-clone 9 | pip install -e .[dev] 10 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/tensorflow-requirements.txt: -------------------------------------------------------------------------------- 1 | pydot 2 | nibabel==4.0.2 3 | pydicom==1.2.2 4 | seaborn 5 | scikit-image 6 | peakutils 7 | biosppy 8 | imageio 9 | ipywidgets>=7.5.1 10 | bokeh 11 | pillow 12 | notebook 13 | pytest 14 | pytest-xdist 15 | pysam 16 | tensorflow==2.19.0 17 | tensorflow_hub 18 | tensorflow_probability 19 | tensorflow-text 20 | tf-models-official 21 | keras-tuner 22 | numcodecs 23 | beautifulsoup4 24 | lxml 25 | xmltodict 26 | google-cloud-bigquery 27 | google-cloud-bigquery-storage 28 | pandas_gbq 29 | pyarrow 30 | altair 31 | facets-overview 32 | plotnine 33 | vega 34 | ipycanvas>=0.7.0 35 | ipyannotations>=0.2.1 36 | torch==2.2.2 37 | opencv-python 38 | blosc 39 | boto3 40 | ml4ht==0.0.10 41 | google-cloud-storage 42 | umap-learn[plot] 43 | neurite 44 | voxelmorph 45 | pystrum 46 | av 47 | lmdb 48 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Other necessities 4 | apt-get update 5 | 6 | echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections 7 | 8 | apt-get install -y wget unzip curl python3-pydot python3-pydot-ng graphviz ttf-mscorefonts-installer git pip ffmpeg 9 | 10 | wget https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb 11 | dpkg 
-i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb 12 | cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-local-8138232B-keyring.gpg /usr/share/keyrings/ 13 | apt-get update 14 | apt-get -y install cudnn 15 | -------------------------------------------------------------------------------- /git_secrets_provider_ml4h.txt: -------------------------------------------------------------------------------- 1 | private_key 2 | private_key_id 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module go_ml4h 2 | 3 | go 1.20 4 | 5 | require ( 6 | cloud.google.com/go v0.110.0 // indirect 7 | cloud.google.com/go/bigquery v1.51.2 // indirect 8 | cloud.google.com/go/compute v1.19.0 // indirect 9 | cloud.google.com/go/compute/metadata v0.2.3 // indirect 10 | cloud.google.com/go/iam v0.13.0 // indirect 11 | cloud.google.com/go/storage v1.29.0 // indirect 12 | github.com/andybalholm/brotli v1.0.4 // indirect 13 | github.com/apache/arrow/go/v12 v12.0.0 // indirect 14 | github.com/apache/thrift v0.16.0 // indirect 15 | github.com/carbocation/genomisc v0.0.0-20221110225648-66a475457014 // indirect 16 | github.com/carbocation/pfx v0.0.0-20210408121254-ad6c6d3ac2f0 // indirect 17 | github.com/csimplestring/go-csv v0.0.0-20180328183906-5b8b3cd94f2c // indirect 18 | github.com/goccy/go-json v0.9.11 // indirect 19 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 20 | github.com/golang/protobuf v1.5.3 // indirect 21 | github.com/golang/snappy v0.0.4 // indirect 22 | github.com/google/flatbuffers v2.0.8+incompatible // indirect 23 | github.com/google/go-cmp v0.5.9 // indirect 24 | github.com/google/s2a-go v0.1.0 // indirect 25 | github.com/google/uuid v1.3.0 // indirect 26 | github.com/googleapis/enterprise-certificate-proxy v0.2.3 // indirect 27 | github.com/googleapis/gax-go/v2 v2.8.0 // indirect 28 | github.com/klauspost/asmfmt v1.3.2 // indirect 29 | github.com/klauspost/compress v1.15.9 // indirect 30 | github.com/klauspost/cpuid/v2 v2.0.9 // indirect 31 | github.com/krolaw/zipstream v0.0.0-20180621105154-0a2661891f94 // indirect 32 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect 33 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect 34 | github.com/pierrec/lz4/v4 v4.1.15 // indirect 35 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect 36 | github.com/zeebo/xxh3 v1.0.2 // indirect 37 | go.opencensus.io v0.24.0 // indirect 38 | golang.org/x/crypto v0.7.0 // indirect 39 | golang.org/x/mod v0.8.0 // indirect 40 | golang.org/x/net v0.9.0 // indirect 41 | golang.org/x/oauth2 v0.7.0 // indirect 42 | golang.org/x/sync v0.1.0 // indirect 43 | golang.org/x/sys v0.7.0 // indirect 44 | golang.org/x/text v0.9.0 // indirect 45 | golang.org/x/tools v0.6.0 // indirect 46 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 47 | google.golang.org/api v0.118.0 // indirect 48 | google.golang.org/appengine v1.6.7 // indirect 49 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect 50 | google.golang.org/grpc v1.55.0 // indirect 51 | google.golang.org/protobuf v1.30.0 // indirect 52 | gopkg.in/guregu/null.v3 v3.5.0 // indirect 53 | ) 54 | -------------------------------------------------------------------------------- /ingest/bulkprocess/README.md: -------------------------------------------------------------------------------- 1 | # Downloading UKBB bulk data (cardiac 
MRI) 2 | 3 | Prepare permissions 4 | 1. Make sure that you have created a `.ukbkey` file containing the application ID on line 1 and the private key on line 2 (directly downloadable as an attachment from the email that you received from the UKBB). This file should not be readable by anyone without proper UKBB permissions, so consider setting this to be user-readable only. 5 | 6 | ```bash 7 | ./ukbunpack 6764.enc .ukbkey 8 | ./ukbconv 6764.enc_ukb bulk -s20216 9 | mv ~/ml/ingest/cmd/downloader/main.go . 10 | go run main.go 11 | ``` 12 | Download data 13 | 1. Download the encrypted file (`ukb21481.enc`) and decrypt it to the encoded file (`ukb21481.enc_ukb`) 14 | 1. Extract the list of all samples with the field of interest. 20208 is Heart MRI Long Axis `ukbconv ukb21481.enc_ukb bulk -s20208` 15 | * Atttempt #2: Try to get all MRI fields at once. `ukbconv ukb21481.enc_ukb bulk -ifields.list` 16 | 1. Inspect: `wc -l ukb21481.bulk` and you can see that there is one entry per person for whom this data exists 17 | 1. You cannot download more than 1,000 samples' bulk files at a time. So, iteratively do it: 18 | * For now, just take 50 19 | * `head -n 50 ukb21481.bulk > heart.50` 20 | * `ukbfetch -bheart.50` *(Note: no space between `-b` and `heart.50`)* -------------------------------------------------------------------------------- /ingest/bulkprocess/field-ids.go: -------------------------------------------------------------------------------- 1 | package bulkprocess 2 | 3 | type FieldID string 4 | 5 | const ( 6 | AorticDistensibility FieldID = "20210" 7 | BloodFlow FieldID = "20213" 8 | CineTagging FieldID = "20211" 9 | SHMOLLI FieldID = "20214" 10 | LVOT FieldID = "20212" 11 | LAX FieldID = "20208" 12 | Scout FieldID = "20207" 13 | SAX FieldID = "20209" 14 | ) 15 | -------------------------------------------------------------------------------- /ingest/bulkprocess/output.go: -------------------------------------------------------------------------------- 1 | package bulkprocess 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/araddon/dateparse" 8 | ) 9 | 10 | type DicomOutput struct { 11 | SampleID string 12 | ZipFile string 13 | FieldID string 14 | Instance string 15 | Index string 16 | Dicom DicomRow 17 | DicomMeta DicomMeta 18 | } 19 | 20 | type DicomRow struct { 21 | Filename string 22 | PatientID string 23 | StudyID string 24 | StudyDescription string 25 | Date string 26 | SeriesID string 27 | SeriesDescription string 28 | Modality string // Not always present 29 | AET string 30 | Host string 31 | } 32 | 33 | func (d DicomRow) ParsedDate() (time.Time, error) { 34 | res, err := dateparse.ParseAny(d.Date) 35 | if err == nil { 36 | return res, nil 37 | } 38 | 39 | // Try some known values that dateparse fails to understand 40 | return time.Parse("02-Jan-2006 15:04:05", d.Date) 41 | } 42 | 43 | func stringSliceToDicomStruct(input []string) (out DicomOutput, err error) { 44 | if l := len(input); l < 9 || l > 10 { 45 | return out, fmt.Errorf("Expected 9 or 10 fields, found %d", l) 46 | } 47 | 48 | out.Dicom.Filename = input[0] 49 | out.Dicom.PatientID = input[1] 50 | out.Dicom.StudyID = input[2] 51 | out.Dicom.StudyDescription = input[3] 52 | out.Dicom.Date = input[4] 53 | out.Dicom.SeriesID = input[5] 54 | out.Dicom.SeriesDescription = input[6] 55 | 56 | if len(input) == 10 { 57 | out.Dicom.Modality = input[7] 58 | out.Dicom.AET = input[8] 59 | out.Dicom.Host = input[9] 60 | } else { 61 | out.Dicom.AET = input[7] 62 | out.Dicom.Host = input[8] 63 | } 64 | 65 | return 66 | } 67 | 
-------------------------------------------------------------------------------- /ingest/bulkprocess/zip-metadata.go: -------------------------------------------------------------------------------- 1 | package bulkprocess 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | "strings" 7 | ) 8 | 9 | type ZipMetadata struct { 10 | SampleID string 11 | FieldID string 12 | Instance string 13 | Index string 14 | } 15 | 16 | func zipPathToMetadata(path string) (ZipMetadata, error) { 17 | filename := filepath.Base(path) 18 | 19 | // Remove .zip 20 | name := strings.Split(filename, ".")[0] 21 | 22 | data := strings.Split(name, "_") 23 | 24 | if len(data) != 4 { 25 | return ZipMetadata{}, fmt.Errorf("Expected filename to be of format sampleID_fieldID_instance_index.zip, but found %d parts instead of 4", len(data)) 26 | } 27 | 28 | return ZipMetadata{data[0], data[1], data[2], data[3]}, nil 29 | } 30 | -------------------------------------------------------------------------------- /ingest/cmd/batcher/batcher.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6a3c75db2e09bbe690b9ed85525a5a71643d86da353cec65a73896fd0e2acc71 3 | size 18518072 4 | -------------------------------------------------------------------------------- /ingest/cmd/batcher/dicom.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "image" 7 | "image/color" 8 | "image/jpeg" 9 | "io" 10 | "io/ioutil" 11 | 12 | "github.com/gradienthealth/dicom" 13 | "github.com/gradienthealth/dicom/dicomtag" 14 | ) 15 | 16 | // Takes in a dicom file (in bytes), outputs one or more jpeg file equivalents 17 | // (in bytes) 18 | func DicomToJpeg(dicomReader io.Reader) ([][]byte, error) { 19 | dcm, err := ioutil.ReadAll(dicomReader) 20 | if err != nil { 21 | return nil, err 22 | } 23 | 24 | p, err := dicom.NewParserFromBytes(dcm, nil) 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | parsedData, err := p.Parse(dicom.ParseOptions{DropPixelData: false}) 30 | if parsedData == nil || err != nil { 31 | return nil, fmt.Errorf("Error reading dicom: %v", err) 32 | } 33 | 34 | var output [][]byte 35 | 36 | for _, elem := range parsedData.Elements { 37 | if elem.Tag != dicomtag.PixelData { 38 | continue 39 | } 40 | 41 | data := elem.Value[0].(dicom.PixelDataInfo) 42 | 43 | for _, frame := range data.Frames { 44 | 45 | // Encapsulated 46 | 47 | if frame.IsEncapsulated { 48 | output = append(output, frame.EncapsulatedData.Data) 49 | continue 50 | } 51 | 52 | // Unencapsulated 53 | 54 | img := image.NewGray16(image.Rect(0, 0, frame.NativeData.Cols, frame.NativeData.Rows)) 55 | for j := 0; j < len(frame.NativeData.Data); j++ { 56 | // for now, assume we're not overflowing uint16, assume gray image 57 | img.SetGray16(j%frame.NativeData.Cols, j/frame.NativeData.Rows, color.Gray16{Y: uint16(frame.NativeData.Data[j][0])}) 58 | } 59 | buf := new(bytes.Buffer) 60 | jpeg.Encode(buf, img, &jpeg.Options{Quality: 100}) 61 | output = append(output, buf.Bytes()) 62 | } 63 | } 64 | 65 | return output, nil 66 | } 67 | -------------------------------------------------------------------------------- /ingest/cmd/batcher/functions.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math/rand" 5 | "time" 6 | ) 7 | 8 | func init() { 9 | // Ensure different folder names on each run 10 | 
rand.Seed(time.Now().UTC().UnixNano()) 11 | } 12 | 13 | // RandOrthoglyphs produces a string of length n randomly. 14 | func RandOrthoglyphs(n int) string { 15 | var letters = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") 16 | lenLetters := len(letters) 17 | b := make([]rune, n) 18 | for i := range b { 19 | b[i] = letters[rand.Intn(lenLetters)] 20 | } 21 | return string(b) 22 | } 23 | -------------------------------------------------------------------------------- /ingest/cmd/build_curl_command.py: -------------------------------------------------------------------------------- 1 | """ 2 | To build curl commands from copy pasted forms from the biobank website 3 | """ 4 | 5 | import sys 6 | 7 | FORM_TEXT = """ 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 | """ 18 | 19 | 20 | NAME = "DOWNLOAD.enc" # Downloaded file's name 21 | 22 | 23 | test = """ 24 |
25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | """ 33 | 34 | 35 | def get_fields(txt): 36 | i = txt.find('''name="fetch"''') 37 | if i == -1: 38 | print('Fetch form not in text') 39 | return 40 | action, i = get_field(txt, i, '''action="''') 41 | fields = {'action': action} 42 | for field in ['id', 's', 't', 'i', 'v']: 43 | fields[field], i = get_field(txt, i) 44 | return fields 45 | 46 | 47 | def get_field(txt, start, target='''value="'''): 48 | start = txt.find(target, start) 49 | end = txt.find('''"''', start + len(target)) 50 | return txt[start + len(target): end], end 51 | 52 | 53 | def fields_to_curl(name, action, id, s, t, i, v): 54 | return f""" 55 | curl -d "id={id}&s={s}&t={t}&i={i}&v={v}&submit=Fetch" \ 56 | -X POST {action} \ 57 | -o {name} 58 | """ 59 | 60 | 61 | def txt_to_curl(name, txt): 62 | return fields_to_curl(name, **get_fields(txt)) 63 | 64 | # check to see if an argument was provided (single argument with path to form text in a file) 65 | if len(sys.argv) > 1: 66 | try: 67 | with open (sys.argv[1], "r") as form_text_file: 68 | FORM_TEXT = form_text_file.read() 69 | except: 70 | print(f'This program expects the input argument, if provided, to be a path') 71 | print(f'to a file containing the form data from the ukbiobank website.') 72 | exit(1) 73 | 74 | print(txt_to_curl(NAME, FORM_TEXT)) 75 | -------------------------------------------------------------------------------- /ingest/cmd/dicom2jpeg/dicom2jpeg.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:54810ae1323a965d0c24db89967cfc23f7aca642e809f421090e5b27faa1409c 3 | size 17949402 4 | -------------------------------------------------------------------------------- /ingest/cmd/downloader/downloader.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4b997debdc843297b109d57d99ef18fd0e337aba677d94e6370597687d3c7c64 3 | size 2494206 4 | -------------------------------------------------------------------------------- /ingest/cmd/downloader/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/exec" 10 | "time" 11 | ) 12 | 13 | func main() { 14 | // Consume a .bulk file 15 | // Download all data with ukbfetch 16 | 17 | var bulkPath, ukbKey, ukbFetch string 18 | var concurrency int 19 | 20 | flag.StringVar(&bulkPath, "bulk", "", "Path to *.bulk file, as specified by UKBB.") 21 | flag.StringVar(&ukbFetch, "ukbfetch", "ukbfetch", "Path to the ukbfetch utility (if not already in your PATH as ukbfetch).") 22 | flag.StringVar(&ukbKey, "ukbkey", ".ukbkey", "Path to the .ukbkey file with the app ID and special key.") 23 | flag.IntVar(&concurrency, "concurrency", 10, "Number of simultaneous connections to UK Biobank servers.") 24 | 25 | flag.Parse() 26 | 27 | log.Println("Note: This tool only checks for pre-existing files in the order specified by the bulk file.") 28 | 29 | if bulkPath == "" { 30 | flag.PrintDefaults() 31 | os.Exit(1) 32 | } 33 | 34 | f, err := os.Open(bulkPath) 35 | if err != nil { 36 | log.Fatalln(err) 37 | } 38 | 39 | c := csv.NewReader(f) 40 | c.Comma = ' ' 41 | 42 | entries, err := c.ReadAll() 43 | if err != nil { 44 | log.Fatalln(err) 45 | } 46 | 47 | // Note: The UK Biobank updated their rules to permit only 10 simultaneous 48 | // downloads per application in 3/2019. 
49 | log.Println("Using up to", concurrency, "simultaneous downloads") 50 | 51 | // Make it 1-based 52 | concurrency = concurrency - 1 53 | 54 | sem := make(chan bool, concurrency) 55 | 56 | finishedCheckingExisting := false 57 | for i, row := range entries { 58 | exists := false 59 | zipFile := "" 60 | 61 | if !finishedCheckingExisting { 62 | // Since statting on a GCSFuse filesystem is slow, we assume sorted 63 | // order. If that is true, then once we stop finding files we have 64 | // already downloaded, we can stop checking. 65 | for _, suffix := range []string{"zip", "cram", "cram.crai"} { 66 | zipFile = fmt.Sprintf("%s_%s.%s", row[0], row[1], suffix) 67 | 68 | // If we already downloaded this file, skip it 69 | if _, err := os.Stat(zipFile); !os.IsNotExist(err) { 70 | log.Println(i, len(entries), "Already downloaded", zipFile) 71 | exists = true 72 | break 73 | } 74 | } 75 | } 76 | 77 | if exists { 78 | continue 79 | } 80 | 81 | finishedCheckingExisting = true 82 | 83 | log.Println(i, len(entries), "Downloading", zipFile) 84 | 85 | sem <- true 86 | go func(row []string) { 87 | defer func() { <-sem }() 88 | 89 | nErrors := 0 90 | for { 91 | if out, err := exec.Command(ukbFetch, fmt.Sprintf("-a%s", ukbKey), fmt.Sprintf("-e%s", row[0]), fmt.Sprintf("-d%s", row[1])).CombinedOutput(); err != nil && nErrors < 3 { 92 | nErrors++ 93 | log.Println(fmt.Errorf("Output: %s | Error: %s", string(out), err.Error())) 94 | log.Println("Sleeping 30 seconds and retrying") 95 | time.Sleep(30 * time.Second) 96 | continue 97 | } 98 | 99 | // If we already errored 3x or we had no error, break the loop 100 | break 101 | } 102 | }(append([]string{}, row...)) 103 | 104 | } 105 | 106 | for i := 0; i < cap(sem); i++ { 107 | sem <- true 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/gene2chrpos.osx: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5d83cce426c001f4e40e50ab096e0450b3ffc5e936bb240a0c04cb989b49f087 3 | size 12456236 4 | -------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/lookups/ensembl.grch37.p13.genes: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3415c81f7b51ccd8ead5928b06aece7e146c575560e30f420c59f93f7067e2a6 3 | size 20482401 4 | -------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/lookups/url.txt: -------------------------------------------------------------------------------- 1 | http://grch37.ensembl.org/biomart/martview/6f9488c78379ccab56985d13f802be0f?VIRTUALSCHEMANAME=default&ATTRIBUTES=hsapiens_gene_ensembl.default.feature_page.ensembl_gene_id|hsapiens_gene_ensembl.default.feature_page.ensembl_transcript_id|hsapiens_gene_ensembl.default.feature_page.ensembl_peptide_id|hsapiens_gene_ensembl.default.feature_page.chromosome_name|hsapiens_gene_ensembl.default.feature_page.start_position|hsapiens_gene_ensembl.default.feature_page.end_position|hsapiens_gene_ensembl.default.feature_page.strand|hsapiens_gene_ensembl.default.feature_page.transcript_start|hsapiens_gene_ensembl.default.feature_page.transcript_end|hsapiens_gene_ensembl.default.feature_page.transcript_length|hsapiens_gene_ensembl.default.feature_page.external_gene_name&FILTERS=&VISIBLEPANEL=attributepanel 
-------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/csv" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | "strings" 11 | 12 | "github.com/gobuffalo/packr" 13 | ) 14 | 15 | const ( 16 | GeneStableID int = iota 17 | TranscriptStableID 18 | ProteinStableID 19 | Chromosome 20 | GeneStartOneBased 21 | GeneEndOneBased 22 | Strand 23 | TranscriptStartOneBased 24 | TranscriptEndOneBased 25 | TranscriptLengthIncludingUTRAndCDS 26 | GeneName 27 | ) 28 | 29 | func main() { 30 | var geneName string 31 | 32 | flag.StringVar(&geneName, "gene", "", "Name of the gene whose GRCH37 transcript's chr:pos you would like to lookup.") 33 | flag.Parse() 34 | 35 | if geneName == "" { 36 | flag.PrintDefaults() 37 | return 38 | } 39 | 40 | if err := Lookup(geneName); err != nil { 41 | log.Fatalln(err) 42 | } 43 | } 44 | 45 | func Lookup(geneName string) error { 46 | lookups := packr.NewBox("./lookups") 47 | 48 | file := lookups.Bytes("ensembl.grch37.p13.genes") 49 | buf := bytes.NewBuffer(file) 50 | cr := csv.NewReader(buf) 51 | cr.Comma = '\t' 52 | 53 | results := make([][]string, 0) 54 | 55 | header := make([]string, 0) 56 | var i int64 57 | for { 58 | rec, err := cr.Read() 59 | if err != nil && err == io.EOF { 60 | break 61 | } else if err != nil { 62 | return err 63 | } 64 | 65 | i++ 66 | if i == 1 { 67 | header = append(header, rec...) 68 | 69 | continue 70 | } 71 | 72 | if rec[GeneName] != geneName { 73 | continue 74 | } 75 | 76 | strand := "-" 77 | if rec[Strand] == "1" { 78 | strand = "+" 79 | } 80 | 81 | results = append(results, []string{rec[GeneName], rec[Chromosome], rec[TranscriptStartOneBased], rec[TranscriptEndOneBased], rec[TranscriptLengthIncludingUTRAndCDS], strand}) 82 | } 83 | 84 | if len(results) < 1 { 85 | return fmt.Errorf("No results were found for %s. 
Were you using a transcript name instead of a gene name?", geneName) 86 | } 87 | 88 | fmt.Println("Gene\tChromosome\tTranscriptStart\tTranscriptEnd\tTranscriptLength\tStrand") 89 | for _, result := range results { 90 | fmt.Println(strings.Join(result, "\t")) 91 | } 92 | 93 | return nil 94 | } 95 | -------------------------------------------------------------------------------- /ingest/cmd/manifester/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "runtime" 10 | "strings" 11 | 12 | "github.com/broadinstitute/ml4h/go/bulkprocess" 13 | ) 14 | 15 | func main() { 16 | // Makes one big combined manifest 17 | // Emits to stdout 18 | 19 | var path string 20 | 21 | flag.StringVar(&path, "path", "./", "Path where the UKBB bulk .zip files are being held.") 22 | 23 | flag.Parse() 24 | 25 | files, err := ioutil.ReadDir(path) 26 | if err != nil { 27 | log.Fatalln(err) 28 | } 29 | 30 | // Read each zip (names are significant) 31 | fmt.Printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", 32 | "sample_id", 33 | "field_id", 34 | "instance", 35 | "index", 36 | "zip_file", 37 | "dicom_file", 38 | "series", 39 | "date", 40 | "instance_number", 41 | "overlay_text", 42 | "overlay_fraction", 43 | "overlay_rows", 44 | "overlay_cols", 45 | "image_x", 46 | "image_y", 47 | "image_z", 48 | ) 49 | 50 | concurrency := 4 * runtime.NumCPU() 51 | 52 | results := make(chan string, concurrency) 53 | doneListening := make(chan struct{}) 54 | go func() { 55 | defer func() { doneListening <- struct{}{} }() 56 | // Serialize results so you don't dump text haphazardly into os.Stdout 57 | // (which is not goroutine safe). 58 | for { 59 | select { 60 | case res, ok := <-results: 61 | if !ok { 62 | return 63 | } 64 | 65 | fmt.Println(res) 66 | } 67 | } 68 | 69 | }() 70 | 71 | semaphore := make(chan struct{}, concurrency) 72 | 73 | for _, file := range files { 74 | 75 | // Will block after `concurrency` simultaneous goroutines are running 76 | semaphore <- struct{}{} 77 | 78 | go func(file os.FileInfo) { 79 | 80 | // Be sure to permit unblocking once we finish 81 | defer func() { <-semaphore }() 82 | 83 | if !strings.HasSuffix(file.Name(), ".zip") { 84 | return 85 | } 86 | 87 | err := bulkprocess.CardiacMRIZipIterator(path+file.Name(), func(dcm bulkprocess.DicomOutput) error { 88 | if err := PrintCSVRow(dcm, results); err != nil { 89 | log.Printf("Error parsing %+v\n", dcm) 90 | return err 91 | } 92 | 93 | return nil 94 | }) 95 | if err != nil { 96 | log.Println("Error parsing", path+file.Name()) 97 | log.Fatalln(err) 98 | } 99 | }(file) 100 | } 101 | 102 | // Make sure we finish all the reads before we exit, otherwise we'll lose 103 | // the last `concurrency` lines. 
104 | for i := 0; i < cap(semaphore); i++ { 105 | semaphore <- struct{}{} 106 | } 107 | 108 | // Close the results channel and make sure we are done listening 109 | close(results) 110 | <-doneListening 111 | } 112 | 113 | func PrintCSVRow(row bulkprocess.DicomOutput, results chan<- string) error { 114 | studyDate, err := row.Dicom.ParsedDate() 115 | if err != nil { 116 | return err 117 | } 118 | 119 | overlayText := "NoOverlay" 120 | if row.DicomMeta.HasOverlay { 121 | overlayText = "HasOverlay" 122 | } 123 | 124 | results <- fmt.Sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.8f\t%d\t%d\t%.2f\t%.2f\t%.2f", 125 | row.SampleID, row.FieldID, row.Instance, row.Index, row.ZipFile, 126 | row.Dicom.Filename, row.Dicom.SeriesDescription, studyDate.Format("2006-01-02"), 127 | row.DicomMeta.InstanceNumber, overlayText, row.DicomMeta.OverlayFraction, row.DicomMeta.OverlayRows, row.DicomMeta.OverlayCols, 128 | row.DicomMeta.PatientX, row.DicomMeta.PatientY, row.DicomMeta.PatientZ) 129 | return nil 130 | } 131 | -------------------------------------------------------------------------------- /ingest/cmd/manifester/manifester.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:15317181e392337e49d20a4e5bc04e96e19cd3b9880e0bed0daee4dedd8a1413 3 | size 18546323 4 | -------------------------------------------------------------------------------- /ingest/cmd/merge-lvef/merge-lvef.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:520b3aab986215f83f816ff157e422662e94ced2d5311f2126ec3f4787fba6c3 3 | size 2390746 4 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/censor_result.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "time" 7 | 8 | "gopkg.in/guregu/null.v3" 9 | ) 10 | 11 | type CensorResult struct { 12 | SampleID int64 13 | 14 | // In the database, populate with a list of fields that we would like to 15 | // have (e.g., month of birth, lost to followup) but which were not present, 16 | // so we know if the table was constructed from incomplete data 17 | Missing []string 18 | 19 | // Guaranteed 20 | enrolled time.Time 21 | computed time.Time // Date this was computed 22 | 23 | // May be null appropriately 24 | died null.Time 25 | lost null.Time 26 | 27 | // Unsure 28 | phenoCensored time.Time 29 | deathCensored time.Time 30 | 31 | // convenience / not exported 32 | bornYear string 33 | bornMonth string 34 | } 35 | 36 | func (s CensorResult) Born() time.Time { 37 | // If we know year + month, then neutral assumption is that birthday is on 38 | // the middle day of the month. If we just know year, then assumption is 39 | // being born midway through the year (July 2). 
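// For example, a bornYear of "1948" with bornMonth "11" parses to 1948-11-15,
// while a missing month falls back to July 2 of that year (1948-07-02).
// If parsing fails, the zero time.Time is returned.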
40 | month := s.bornMonth 41 | day := "15" 42 | 43 | if month == "" { 44 | month = "7" 45 | day = "02" 46 | } 47 | 48 | dt, err := time.Parse("2006-01-02", fmt.Sprintf("%04s-%02s-%02s", s.bornYear, month, day)) 49 | if err != nil { 50 | return time.Time{} 51 | } 52 | 53 | return dt 54 | } 55 | 56 | func (s CensorResult) DiedString() string { 57 | if !s.died.Valid { 58 | return NullMarker 59 | } 60 | 61 | return TimeToUKBDate(s.died.Time) 62 | } 63 | 64 | func (s CensorResult) DeathCensored() time.Time { 65 | if s.died.Valid { 66 | return s.died.Time 67 | } 68 | 69 | if s.lost.Valid { 70 | return s.lost.Time 71 | } 72 | 73 | return s.deathCensored 74 | } 75 | 76 | func (s CensorResult) PhenoCensored() time.Time { 77 | if s.died.Valid { 78 | return s.died.Time 79 | } 80 | 81 | if s.lost.Valid { 82 | return s.lost.Time 83 | } 84 | 85 | return s.phenoCensored 86 | } 87 | 88 | func (s CensorResult) MissingToString() string { 89 | if res := strings.Join(s.Missing, "|"); len(res) > 0 { 90 | return res 91 | } 92 | 93 | return "NA" 94 | } 95 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "log" 7 | "os" 8 | "cloud.google.com/go/bigquery" 9 | ) 10 | 11 | const NullMarker = "NA" 12 | 13 | type SamplePheno struct { 14 | SampleID int64 `bigquery:"sample_id"` 15 | Value string `bigquery:"value"` 16 | FieldID int64 `bigquery:"FieldID"` 17 | Instance int64 `bigquery:"instance"` 18 | ArrayIDX int64 `bigquery:"array_idx"` 19 | CodingFileID bigquery.NullInt64 `bigquery:"coding_file_id"` 20 | } 21 | 22 | type WrappedBigQuery struct { 23 | Context context.Context 24 | Client *bigquery.Client 25 | Project string 26 | Database string 27 | } 28 | 29 | func main() { 30 | var ( 31 | phenoCensorDateString string 32 | deathCensorDateString string 33 | BQ = &WrappedBigQuery{} 34 | ) 35 | 36 | flag.StringVar(&phenoCensorDateString, "pheno_censor", "", "With format YYYY-MM-DD, please provide the Hospital Data censor date from https://biobank.ctsu.ox.ac.uk/crystal/exinfo.cgi?src=Data_providers_and_dates") 37 | flag.StringVar(&deathCensorDateString, "death_censor", "", "With format YYYY-MM-DD, please provide the Death censor date from https://biobank.ctsu.ox.ac.uk/crystal/exinfo.cgi?src=Data_providers_and_dates") 38 | flag.StringVar(&BQ.Project, "project", "broad-ml4cvd", "Name of the Google Cloud project that hosts your BigQuery database instance") 39 | flag.StringVar(&BQ.Database, "bigquery", "", "BigQuery source database name") 40 | flag.Parse() 41 | 42 | if phenoCensorDateString == "" || deathCensorDateString == "" || BQ.Project == "" || BQ.Database == "" { 43 | flag.PrintDefaults() 44 | os.Exit(1) 45 | } 46 | 47 | log.Println("Using bigquery database", BQ.Database) 48 | log.Println("Output uses", NullMarker, "in place of null values. 
Please specify this when loading data into bigquery.") 49 | 50 | log.Println("Producing censoring table") 51 | if err := Censor(BQ, deathCensorDateString, phenoCensorDateString); err != nil { 52 | log.Fatalln(err) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/query_single.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "cloud.google.com/go/bigquery" 7 | "google.golang.org/api/iterator" 8 | ) 9 | 10 | func BigQuerySingleFieldFirst(wbq *WrappedBigQuery, fieldID int64) (map[int64]string, error) { 11 | out := make(map[int64]string) 12 | 13 | query := wbq.Client.Query(fmt.Sprintf(`SELECT * 14 | FROM %s.phenotype 15 | WHERE 1=1 16 | AND FieldID=@FieldID 17 | ORDER BY instance ASC, array_idx ASC 18 | 19 | -- Uncomment for testing 20 | -- ORDER BY sample_id DESC 21 | -- LIMIT 10 22 | `, wbq.Database)) 23 | 24 | query.QueryConfig.Parameters = append(query.QueryConfig.Parameters, []bigquery.QueryParameter{ 25 | {Name: "FieldID", Value: fieldID}, 26 | }...) 27 | 28 | itr, err := query.Read(wbq.Context) 29 | if err != nil { 30 | return nil, err 31 | } 32 | for { 33 | var values SamplePheno 34 | err := itr.Next(&values) 35 | if err == iterator.Done { 36 | break 37 | } 38 | if err != nil { 39 | return nil, err 40 | } 41 | 42 | // Take only the first, since we use this for things like enrollment 43 | // date. If someone came to a follow-up visit, we don't want to say that 44 | // they "enrolled" at the time of their follow-up, for example. Relies 45 | // on sort order specified above in the query. 46 | if _, exists := out[values.SampleID]; exists { 47 | continue 48 | } 49 | out[values.SampleID] = values.Value 50 | } 51 | 52 | return out, nil 53 | } 54 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/time_handling.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | func TimeToUKBDate(t time.Time) string { 9 | if t.Equal(time.Time{}) { 10 | return NullMarker 11 | } 12 | 13 | return t.Format("2006-01-02") 14 | } 15 | 16 | func TimesToFractionalYears(earlier, later time.Time) string { 17 | if later.Before(earlier) { 18 | return NullMarker 19 | } 20 | y, m, d, h, min, sec := time_diff(earlier, later) 21 | 22 | return fmt.Sprintf("%.6f", float64(y)+float64(m)/12+float64(d)/(12*30)+float64(h)/(24*365)+float64(min)/(60*24*365)+float64(sec)/(60*60*24*365)) 23 | } 24 | 25 | // Taken directly from https://stackoverflow.com/a/36531443/199475 26 | func time_diff(a, b time.Time) (year, month, day, hour, min, sec int) { 27 | if a.Location() != b.Location() { 28 | b = b.In(a.Location()) 29 | } 30 | if a.After(b) { 31 | a, b = b, a 32 | } 33 | y1, M1, d1 := a.Date() 34 | y2, M2, d2 := b.Date() 35 | 36 | h1, m1, s1 := a.Clock() 37 | h2, m2, s2 := b.Clock() 38 | 39 | year = int(y2 - y1) 40 | month = int(M2 - M1) 41 | day = int(d2 - d1) 42 | hour = int(h2 - h1) 43 | min = int(m2 - m1) 44 | sec = int(s2 - s1) 45 | 46 | // Normalize negative values 47 | if sec < 0 { 48 | sec += 60 49 | min-- 50 | } 51 | if min < 0 { 52 | min += 60 53 | hour-- 54 | } 55 | if hour < 0 { 56 | hour += 24 57 | day-- 58 | } 59 | if day < 0 { 60 | // days in month: 61 | t := time.Date(y1, M1, 32, 0, 0, 0, 0, time.UTC) 62 | day += 32 - t.Day() 63 | month-- 64 | } 65 | if month < 0 { 66 | month += 12 67 | year-- 
68 | } 69 | 70 | return 71 | } 72 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertcoding/cc_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "testing" 4 | 5 | func Test2(t *testing.T) { 6 | main() 7 | } 8 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertcoding/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/csv" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | "net/http" 11 | "strings" 12 | ) 13 | 14 | const ( 15 | ExpectedRows = 3 16 | ) 17 | 18 | func main() { 19 | var ( 20 | codingPath string 21 | ) 22 | 23 | flag.StringVar(&codingPath, "coding", "https://raw.githubusercontent.com/OxWearables/ukb_download_and_prep_template/main/Codings_Showcase.csv", "URL to CSV file with the UKBB data encodings") 24 | flag.Parse() 25 | 26 | if codingPath == "" { 27 | flag.PrintDefaults() 28 | log.Fatalln() 29 | } 30 | 31 | if err := ImportCoding(codingPath); err != nil { 32 | log.Fatalln(err) 33 | } 34 | } 35 | 36 | func ImportCoding(url string) error { 37 | log.Printf("Importing from %s\n", url) 38 | 39 | resp, err := http.Get(url) 40 | if err != nil { 41 | return err 42 | } 43 | reader := csv.NewReader(resp.Body) 44 | reader.Comma = ',' 45 | reader.LazyQuotes = true 46 | 47 | header := make([]string, 0) 48 | j := 0 49 | for ; ; j++ { 50 | row, err := reader.Read() 51 | if err != nil && err == io.EOF { 52 | resp.Body.Close() 53 | break 54 | } else if err != nil { 55 | buf := bytes.NewBuffer(nil) 56 | io.Copy(buf, resp.Body) 57 | if strings.Contains(buf.String(), "internal error") { 58 | log.Println("Coding File is not permitted to be downloaded from the UKBB") 59 | continue 60 | } 61 | } 62 | 63 | // Handle the header 64 | if j == 0 { 65 | log.Printf("Header (%d elements): %+v\n", len(row), row) 66 | header = append(header, row...) 
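// Rename the showcase header columns to the names expected by the BigQuery
// coding table schema (importcoding/coding.json):
// Coding -> coding_file_id, Value -> coding, Meaning -> meaning.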
67 | for k, v := range header { 68 | if v == "Coding" { 69 | header[k] = "coding_file_id" 70 | } else if v == "Value" { 71 | header[k] = "coding" 72 | } else if v == "Meaning" { 73 | header[k] = "meaning" 74 | } 75 | } 76 | 77 | if nCols := len(header); nCols != ExpectedRows { 78 | return fmt.Errorf("Expected a CSV with %d columns; got one with %d", ExpectedRows, nCols) 79 | } 80 | 81 | fmt.Println(strings.Join(header, "\t")) 82 | 83 | continue 84 | } 85 | 86 | // Handle the entries 87 | if len(row) == ExpectedRows { 88 | fmt.Println(strings.Join(row, "\t")) 89 | } 90 | } 91 | 92 | log.Println("Created coding file with", j, "entries") 93 | 94 | return nil 95 | } 96 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertdict/cd_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "testing" 4 | 5 | func Test4(t *testing.T) { 6 | main() 7 | } 8 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertdict/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/csv" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | 11 | // "os" 12 | 13 | "net/http" 14 | "strings" 15 | ) 16 | 17 | const ( 18 | ExpectedRows = 17 19 | ) 20 | 21 | func main() { 22 | var ( 23 | dictPath string 24 | ) 25 | 26 | flag.StringVar(&dictPath, "dict", "https://biobank.ndph.ox.ac.uk/~bbdatan/Data_Dictionary_Showcase.tsv", "URL to CSV file with the UKBB data dictionary") 27 | // flag.StringVar(&dictPath, "dict", "/home/anamika/ml4h/data_dictionary/Data_Dictionary_Showcase.tsv", "URL to CSV file with the UKBB data dictionary") 28 | flag.Parse() 29 | 30 | if dictPath == "" { 31 | flag.PrintDefaults() 32 | log.Fatalln() 33 | } 34 | 35 | if err := ImportDictionary(dictPath); err != nil { 36 | log.Fatalln(err) 37 | } 38 | } 39 | 40 | func ImportDictionary(url string) error { 41 | log.Printf("Importing from %s\n", url) 42 | 43 | resp, err := http.Get(url) 44 | // resp, err := os.Open(url) 45 | if err != nil { 46 | return err 47 | } 48 | reader := csv.NewReader(resp.Body) 49 | // reader := csv.NewReader(resp) 50 | reader.Comma = '\t' 51 | reader.LazyQuotes = true 52 | 53 | header := make([]string, 0) 54 | j := 0 55 | for ; ; j++ { 56 | // log.Printf("Count J %d\n", j) 57 | row, err := reader.Read() 58 | if err != nil && err == io.EOF { 59 | resp.Body.Close() 60 | // resp.Close() 61 | break 62 | } else if err != nil { 63 | buf := bytes.NewBuffer(nil) 64 | io.Copy(buf, resp.Body) 65 | // io.Copy(buf, resp) 66 | if strings.Contains(buf.String(), "internal error") { 67 | log.Println("Dictionary File is not permitted to be downloaded from the UKBB") 68 | continue 69 | } 70 | } 71 | 72 | // Handle the header 73 | if j == 0 { 74 | log.Printf("Header (%d elements): %+v\n", len(row), row) 75 | header = append(header, row...) 
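// Only the "Coding" column is renamed here; coding_file_id matches the column
// name used by the coding table, so the dictionary table can be joined against it.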
76 | for k, v := range header { 77 | if v == "Coding" { 78 | header[k] = "coding_file_id" 79 | break 80 | } 81 | } 82 | 83 | if nCols := len(header); nCols != ExpectedRows { 84 | return fmt.Errorf("Expected a CSV with %d columns; got one with %d", ExpectedRows, nCols) 85 | } 86 | 87 | fmt.Println(strings.Join(header, "\t")) 88 | 89 | continue 90 | } 91 | 92 | // Handle the entries 93 | if len(row) == ExpectedRows { 94 | fmt.Println(strings.Join(row, "\t")) 95 | } 96 | } 97 | 98 | log.Println("Created dictionary file with", j, "entries") 99 | 100 | return nil 101 | } 102 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertpheno/flagslice.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "strings" 4 | 5 | type flagSlice []string 6 | 7 | func (i *flagSlice) String() string { 8 | if i == nil { 9 | return "" 10 | } 11 | 12 | return strings.Join([]string(*i), "\t") 13 | } 14 | 15 | func (i *flagSlice) Set(value string) error { 16 | *i = append(*i, value) 17 | return nil 18 | } 19 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertsample/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "flag" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | 11 | "github.com/carbocation/genomisc" 12 | ) 13 | 14 | const ( 15 | // .sample file field columns 16 | ID_1 = iota 17 | ID_2 18 | missing 19 | sex 20 | ) 21 | 22 | func main() { 23 | var ( 24 | samplePath string 25 | ) 26 | 27 | flag.StringVar(&samplePath, "sample", "", "genotyping .sample file for the UKBB") 28 | flag.Parse() 29 | 30 | if samplePath == "" { 31 | flag.PrintDefaults() 32 | os.Exit(1) 33 | } 34 | 35 | samplePath = genomisc.ExpandHome(samplePath) 36 | log.Printf("Importing %s\n", samplePath) 37 | 38 | // .sample file 39 | 40 | f, err := os.Open(samplePath) 41 | if err != nil { 42 | log.Fatalln(err) 43 | } 44 | defer f.Close() 45 | 46 | delim := genomisc.DetermineDelimiter(f) 47 | 48 | f.Seek(0, 0) 49 | fileCSV := csv.NewReader(f) 50 | fileCSV.Comma = delim 51 | 52 | // .sample files have 2 header rows that we will discard 53 | fileCSV.Read() 54 | fileCSV.Read() 55 | 56 | i := 0 57 | fmt.Printf("sample_id\tfile_row\n") 58 | for ; ; i++ { 59 | row, err := fileCSV.Read() 60 | if err != nil && err == io.EOF { 61 | break 62 | } else if err != nil { 63 | log.Fatalln(err) 64 | } 65 | 66 | fmt.Printf("%s\t%d\n", row[ID_1], i) 67 | } 68 | 69 | log.Println("Extracted", i, "records from the .sample file") 70 | } 71 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/decrypt_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | 6 | #this script takes a folder of .enc files from UKBB, keys, and produces .csv.gz 7 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | enc_directory=${__dir}/uk_biobank_4_1_2019 9 | 10 | for file in 28112 23300 23301 23302 11 | do 12 | ${__dir}/ukbunpack ${enc_directory}/ukb${file}.enc ${__dir}/k17488_${file}.key 13 | ${__dir}/ukbconv ${enc_directory}/ukb${file}.enc_ukb csv 14 | gzip ${enc_directory}/ukb${file}.csv 15 | done 16 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/firstdate/README.md: 
-------------------------------------------------------------------------------- 1 | #THIS IS ARCHIVAL -- no code/notes here are meant to be used, but we want to keep the notes around 2 | # Dates in the UK Biobank 3 | 4 | ## Attended assessment center 5 | 6 | * Date FieldID 53 7 | 8 | Useful for: 9 | 10 | 1. Defining threshold date for incidence 11 | 1. Defining dates for things that don't otherwise have an associated date 12 | 13 | ## Birth 14 | 15 | * Date FieldID 34: Year of birth 16 | * Date FieldID 52: Month of birth 17 | * Date Field 33: birth date (*Note: this field is restricted due to its precision*) 18 | 19 | ## Lost to follow-up 20 | 21 | * Date FieldID 191 22 | 23 | ## Died 24 | 25 | * Date FieldID 40000 26 | 27 | ## ICD10 28 | 29 | * Date FieldID ==> derived from HESIN 30 | * Main ICD10: 41202 31 | * Secondary ICD10: 41204 32 | * ICD10 Primary Cause of Death: 40001 33 | * ICD10 Secondary Cause of Death: 40002 34 | 35 | ## ICD9 36 | 37 | * Date FieldID ==> derived from HESIN 38 | * Main ICD9: 41203 39 | * Secondary ICD9: 41205 40 | 41 | ## Operation (OPCS4) 42 | 43 | * Date FieldID ==> derived from HESIN 44 | * Main OPCS4: 41200 45 | * Secondary OPCS4: 41210 46 | * Self-reported: 47 | * FieldID: 20004 48 | * *float32 Year*: 20010 (need to truncate and add month/day) 49 | 50 | ## Special cases 51 | 52 | * Myocardial infarction: 53 | * FieldID: 42001 54 | * Date: 42000 55 | * Non-cancer illness: 56 | * FieldID: 20002 57 | * Date: 20008 58 | * Cancer: 59 | * FieldID: 20001 60 | * Date: 20006 61 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/firstdate/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func main() { 4 | // http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=41253 5 | 6 | // Run and merge from 3 separate queries: 7 | // 8 | // 1) for ICD/opcode fields, look into hesin 9 | // 2) for special fields with dates, look at their dates 10 | // 3) for all other fields, use the enrollment date based on their array_idx 11 | 12 | // assign ICDs to their transformed FieldIDs - different for main and secondary 13 | 14 | // Then, the output: 15 | // SampleID FieldID Value Date 16 | 17 | // Then left join on censor 18 | // GROUP BY sample_id 19 | 20 | // Downstream: 21 | // Pass 1: Fetch censor data for everyone 22 | // Pass 2: 23 | } 24 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcensor/censor.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "sample_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "NULLABLE", 9 | "name": "birthdate", 10 | "type": "DATE" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "enroll_date", 15 | "type": "DATE" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "enroll_age", 20 | "type": "FLOAT" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "death_date", 25 | "type": "DATE" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "death_age", 30 | "type": "FLOAT" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "death_censor_date", 35 | "type": "DATE" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "death_censor_age", 40 | "type": "FLOAT" 41 | }, 42 | { 43 | "mode": "NULLABLE", 44 | "name": "phenotype_censor_date", 45 | "type": "DATE" 46 | }, 47 | { 48 | "mode": "NULLABLE", 49 | "name": "phenotype_censor_age", 50 | "type": "FLOAT" 51 | }, 52 | { 53 | "mode": 
"NULLABLE", 54 | "name": "lost_to_followup_date", 55 | "type": "DATE" 56 | }, 57 | { 58 | "mode": "NULLABLE", 59 | "name": "lost_to_followup_age", 60 | "type": "FLOAT" 61 | }, 62 | { 63 | "mode": "NULLABLE", 64 | "name": "computed_date", 65 | "type": "DATE" 66 | }, 67 | { 68 | "mode": "NULLABLE", 69 | "name": "missing_fields", 70 | "type": "STRING" 71 | } 72 | ] 73 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcensor/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. "ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | bq --location=${GEO} load \ 14 | --field_delimiter "\t" \ 15 | --quote "" \ 16 | --replace \ 17 | --source_format=CSV \ 18 | --null_marker "NA" \ 19 | --skip_leading_rows 1 \ 20 | ${DATASET}.censor ${BUCKET}/censor.tsv.gz \ 21 | ${__dir}/censor.json 22 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcoding/coding.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "coding_file_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "coding", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "meaning", 15 | "type": "STRING" 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcoding/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. 
"ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | bq --location=${GEO} load \ 14 | --field_delimiter "\t" \ 15 | --replace \ 16 | --quote "" \ 17 | --source_format=CSV \ 18 | --skip_leading_rows 1 \ 19 | ${DATASET}.coding ${BUCKET}/coding.tsv.gz \ 20 | ${__dir}/coding.json 21 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importdict/dictionary.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "Path", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "Category", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "FieldID", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "Field", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "REQUIRED", 24 | "name": "Participants", 25 | "type": "INTEGER" 26 | }, 27 | { 28 | "mode": "REQUIRED", 29 | "name": "Items", 30 | "type": "INTEGER" 31 | }, 32 | { 33 | "mode": "REQUIRED", 34 | "name": "Stability", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "REQUIRED", 39 | "name": "ValueType", 40 | "type": "STRING" 41 | }, 42 | { 43 | "name": "Units", 44 | "type": "STRING" 45 | }, 46 | { 47 | "mode": "REQUIRED", 48 | "name": "ItemType", 49 | "type": "STRING" 50 | }, 51 | { 52 | "mode": "REQUIRED", 53 | "name": "Strata", 54 | "type": "STRING" 55 | }, 56 | { 57 | "mode": "REQUIRED", 58 | "name": "Sexed", 59 | "type": "STRING" 60 | }, 61 | { 62 | "mode": "REQUIRED", 63 | "name": "Instances", 64 | "type": "INTEGER" 65 | }, 66 | { 67 | "mode": "REQUIRED", 68 | "name": "Array", 69 | "type": "INTEGER" 70 | }, 71 | { 72 | "name": "coding_file_id", 73 | "type": "INTEGER" 74 | }, 75 | { 76 | "name": "Notes", 77 | "type": "STRING" 78 | }, 79 | { 80 | "mode": "REQUIRED", 81 | "name": "Link", 82 | "type": "STRING" 83 | } 84 | ] 85 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importdict/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. 
"ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | #note hardcoded dictionary.json, dictionary.tsv.gz, dictionary table 14 | # Special for dict: need to disable quotes 15 | bq --location=${GEO} load \ 16 | --field_delimiter "\t" \ 17 | --replace \ 18 | --quote "" \ 19 | --source_format=CSV \ 20 | --skip_leading_rows 1 \ 21 | ${DATASET}.dictionary ${BUCKET}/dictionary.tsv.gz \ 22 | ${__dir}/dictionary.json 23 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_diag.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "ins_index", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "level", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "diag_icd9", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "diag_icd9_nb", 30 | "type": "INTEGER" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "diag_icd10", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "diag_icd10_nb", 40 | "type": "STRING" 41 | } 42 | ] 43 | 44 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_diag10.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "diag_icd10", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "diag_icd10_nb", 25 | "type": "STRING" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_diag9.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "diag_icd9", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "diag_icd9_nb", 25 | "type": "STRING" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_lubitz.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "admidate", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "cause_icd10", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "cause_icd10_nb", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "diag_icd10", 30 | "type": "STRING" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | 
"name": "diag_icd10_nb", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "diag_icd9", 40 | "type": "STRING" 41 | }, 42 | { 43 | "mode": "NULLABLE", 44 | "name": "diag_icd9_nb", 45 | "type": "STRING" 46 | }, 47 | { 48 | "mode": "NULLABLE", 49 | "name": "disdate", 50 | "type": "STRING" 51 | }, 52 | { 53 | "mode": "NULLABLE", 54 | "name": "epiend", 55 | "type": "STRING" 56 | }, 57 | { 58 | "mode": "NULLABLE", 59 | "name": "epistart", 60 | "type": "STRING" 61 | }, 62 | { 63 | "mode": "NULLABLE", 64 | "name": "opdate", 65 | "type": "STRING" 66 | }, 67 | { 68 | "mode": "NULLABLE", 69 | "name": "oper4", 70 | "type": "STRING" 71 | }, 72 | { 73 | "mode": "NULLABLE", 74 | "name": "oper4_nb", 75 | "type": "STRING" 76 | }, 77 | { 78 | "mode": "NULLABLE", 79 | "name": "operstat", 80 | "type": "INTEGER" 81 | }, 82 | { 83 | "mode": "NULLABLE", 84 | "name": "source", 85 | "type": "STRING" 86 | } 87 | ] 88 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_oper.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "NULLABLE", 9 | "name": "ins_index", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "level", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "opdate", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "oper3", 30 | "type": "STRING" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "oper3_nb", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "oper4", 40 | "type": "STRING" 41 | }, 42 | { 43 | "mode": "NULLABLE", 44 | "name": "oper4_nb", 45 | "type": "STRING" 46 | }, 47 | { 48 | "mode": "NULLABLE", 49 | "name": "posopdur", 50 | "type": "INTEGER" 51 | }, 52 | { 53 | "mode": "NULLABLE", 54 | "name": "preopdur", 55 | "type": "INTEGER" 56 | } 57 | ] 58 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. "ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | 14 | #for lubitz, must replace hesin.json with hesin_lubitz.json 15 | #for NAME in hesin hesin_diag10 hesin_diag9 hesin_oper 16 | for NAME in hesin hesin_diag hesin_oper 17 | do 18 | bq --location=${GEO} load \ 19 | --field_delimiter "\t" \ 20 | --replace \ 21 | --source_format=CSV \ 22 | --skip_leading_rows 1 \ 23 | ${DATASET}.${NAME} ${BUCKET}/${NAME}.tsv.gz \ 24 | ${__dir}/${NAME}.json 25 | done 26 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importpheno/append.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. 
"ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | # For phenotypes, we expect to add repeatedly, so we don't replace here. Note: 14 | # if you run append.sh with the same data twice, you'll just duplicate the 15 | # contents of the table. 16 | bq --location=${GEO} load \ 17 | --field_delimiter "\t" \ 18 | --quote "" \ 19 | --null_marker "NULL" \ 20 | --source_format=CSV \ 21 | --skip_leading_rows 1 \ 22 | ${DATASET}.phenotype ${BUCKET}/phenotype.tsv \ 23 | ${__dir}/phenotype.json 24 | 25 | 26 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importpheno/phenotype.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "sample_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "FieldID", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "instance", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "array_idx", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | 24 | "name": "value", 25 | "type": "STRING" 26 | }, 27 | { 28 | "name": "coding_file_id", 29 | "type": "INTEGER" 30 | } 31 | ] 32 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importsample/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. "ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | NAME="sample" 12 | 13 | 14 | bq --location=${GEO} load \ 15 | --field_delimiter "\t" \ 16 | --replace \ 17 | --quote "" \ 18 | --source_format=CSV \ 19 | --skip_leading_rows 1 \ 20 | ${DATASET}.${NAME} ${TABLE_LOC}/${NAME}.tsv.gz 21 | ${NAME}.json 22 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importsample/sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "sample_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "file_row", 10 | "type": "INTEGER" 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/inspect_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ingest/ukbb_csv_bigquery/inspect_screenshot.png -------------------------------------------------------------------------------- /ml4h/DATA_MODELING_TESTS.md: -------------------------------------------------------------------------------- 1 | # Data/Modeling/Tests 2 | ## Running tests 3 | Tests can be run in Docker with 4 | ``` 5 | ${HOME}/ml4h/scripts/tf.sh -T ${HOME}/ml4h/tests 6 | ``` 7 | Tests can be run locally in a conda environment with 8 | ``` 9 | python -m pytest ${HOME}/ml4h/tests 10 | ``` 11 | Some of the tests are slow due to creating, saving and loading `tensorflow` models. 12 | To skip those tests to move quickly, run 13 | ``` 14 | python -m pytest ${HOME}/ml4h/tests -m "not slow" 15 | ``` 16 | pytest can also run specific tests using `::`. 
For example 17 | ``` 18 | python -m pytest ${HOME}/ml4h/tests/test_models.py::TestMakeMultimodalMultitaskModel::test_u_connect_segment 19 | ``` 20 | For more pytest usage information, check out the [usage guide](https://docs.pytest.org/en/latest/usage.html). 21 | 22 | ### Phenotypic SQLite database 23 | Data for 500k people containing almost everything available in the UK Biobank Showcase. 24 | 25 | `/mnt/disks/data/raw/sql/ukbb7089.r10data.db` 26 | 27 | To access the data using `sqlite`: 28 | 29 | `sqlite3 /mnt/disks/data/raw/sql/ukbb7089.r10data.db` 30 | 31 | The data can also be accessed through [BigQuery](https://console.cloud.google.com/bigquery?project=broad-ml4cvd&p=broad-ml4cvd&page=project). 32 | 33 | 34 | ### Cardiac MRI 35 | 212,158 individual zip files from ~20k people. DICOM-formatted files inside: 36 | 37 | `/mnt/disks/data/raw/mris/cardiac/*.zip` 38 | 39 | ### Liver MRI 40 | 10,132 individual zip files from ~10k people. DICOM-formatted files inside: 41 | 42 | `/mnt/disks/data/raw/mris/liver/*.zip` 43 | 44 | ### ECG: XML 45 | 119,097 ECGs (12-lead resting and 3-lead exercise): 46 | 47 | `/mnt/disks/data/raw/ecgs/*.xml` 48 | 49 | ### Direct Genotypes 50 | ~800k variants/person: 51 | 52 | `/mnt/imputed_v2` 53 | 54 | ### Imputed Genotypes 55 | 90 million variants/person: 56 | 57 | `/mnt/imputed_v3` 58 | 59 | ## Modeling with TensorFlow 60 | Once you have a virtual machine and an environment set up, it is time to start learning. 61 | The first step is to create training data by writing tensors to the disk. 62 | 63 | To write tensors with default categorical and continuous phenotypes, and no MRI or ECG data 64 | ``` 65 | ${HOME}/ml/scripts/tf.sh ${HOME}/ml/ml4h/recipes.py --mode tensorize --tensors ${HOME}/my_tensors/ --max_sample_id 1003000 --mri_field_id --xml_field_id 66 | ``` 67 | This should take about a minute to run and will output the SQL queries as well as the counts for the phenotype categories and responses that it finds. Now let's train a model: 68 | ``` 69 | ${HOME}/ml/scripts/tf.sh ${HOME}/ml/ml4h/recipes.py --mode train --tensors ${HOME}/my_tensors/ --input_tensors categorical-phenotypes-94 --output_tensors coronary_artery_disease_soft --id my_first_mlp_for_cvd 70 | ``` 71 | This model should achieve about 75% validation set accuracy on predicting from the phenotypes whether this person was labelled with an ICD code corresponding to cardiovascular disease. 
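As a quick sanity check of the phenotypic SQLite database described above, you can also query it directly from the shell. This is only a sketch: it assumes the SQLite file mirrors the BigQuery `phenotype` schema (`sample_id`, `FieldID`, `instance`, `array_idx`, `value`), and FieldID 31 is used purely as an example, so run `.tables` and `.schema` first to confirm the actual table and column names.
```
sqlite3 /mnt/disks/data/raw/sql/ukbb7089.r10data.db '.tables'
sqlite3 /mnt/disks/data/raw/sql/ukbb7089.r10data.db 'SELECT COUNT(DISTINCT sample_id) FROM phenotype WHERE FieldID = 31;'
```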
72 | -------------------------------------------------------------------------------- /ml4h/DatabaseClient.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC 2 | 3 | import sqlite3 4 | from google.cloud.bigquery import Client 5 | 6 | 7 | class DatabaseClient(ABC): 8 | def __init__(self, client): 9 | self.client = client 10 | super(DatabaseClient, self).__init__() 11 | 12 | @abstractmethod 13 | def execute(self, query: str): 14 | pass 15 | 16 | 17 | class BigQueryDatabaseClient(DatabaseClient): 18 | """ If running locally, run the following commandline to authenticate yourself: 19 | gcloud auth application-default login 20 | """ 21 | 22 | def __init__(self, client=None, credentials_file=None): 23 | if client is not None: 24 | super(BigQueryDatabaseClient, self).__init__(client) 25 | else: 26 | if credentials_file is not None: 27 | bigquery_client = Client.from_service_account_json(credentials_file) 28 | else: 29 | raise ValueError("BigQueryDatabaseClient requires a client or a credentials_file.") 30 | super(BigQueryDatabaseClient, self).__init__(bigquery_client) 31 | 32 | def execute(self, query: str): 33 | query_job = self.client.query(query) # API request 34 | rows = query_job.result() # Waits for query to finish 35 | return rows 36 | 37 | 38 | class SqLiteDatabaseClient(DatabaseClient): 39 | def __init__(self, client=None, db_file=None): 40 | if client is not None: 41 | super(SqLiteDatabaseClient, self).__init__(client) 42 | else: 43 | if db_file is not None: 44 | super(SqLiteDatabaseClient, self).__init__(sqlite3.connect(db_file).cursor()) 45 | else: 46 | raise ValueError("SqLiteDatabaseClient requires a client or a db_file.") 47 | 48 | def execute(self, query: str): 49 | return self.client.execute(query) 50 | 51 | 52 | if '__main__' == __name__: 53 | credentials_file = '/Users/kyuksel/ml4h/bigquery-viewer-credentials.json' 54 | db_client = BigQueryDatabaseClient(credentials_file=credentials_file) 55 | 56 | dataset = 'broad-ml4cvd.ukbb7089_r10data' 57 | 58 | dictionary_table = f"`{dataset}.dictionary`" 59 | phenotype_table = f"`{dataset}.phenotype`" 60 | coding_table = f"`{dataset}.coding`" 61 | 62 | fid = 20001 63 | fids = [3143, 3144] 64 | sample_id = 2907043 65 | 66 | job_title_field_id = 22600 67 | icd10_field = 41202 68 | query = \ 69 | f"SELECT value FROM {phenotype_table} WHERE fieldid={icd10_field} AND sample_id={sample_id}" 70 | 71 | rows = db_client.execute(query.format()) 72 | for row in rows: 73 | print(row) 74 | -------------------------------------------------------------------------------- /ml4h/__init__.py: -------------------------------------------------------------------------------- 1 | from . import defines 2 | -------------------------------------------------------------------------------- /ml4h/applications/ingest/requirements.txt: -------------------------------------------------------------------------------- 1 | fastparquet 2 | blosc 3 | xxhash 4 | zstandard 5 | cv2 6 | scipy 7 | pandas 8 | numpy 9 | h5py 10 | -------------------------------------------------------------------------------- /ml4h/applications/jpp_inference_rv/README.md: -------------------------------------------------------------------------------- 1 | # Reproduction scripts: inference RV 2 | 3 | This folder contains scripts to reproduce the models used in: 4 | 5 | **Genetic Analysis of Right Heart Structure and Function in 45,000 People**. James P. Pirruccello*, Paolo Di Achille*, Victor Nauffal*, Mahan Nekoui, Samuel N. 
Friedman, Marcus D. R. Klarqvist, Mark D. Chaffin, Shaan Khurshid, Carolina Roselli, Puneet Batra, Kenney Ng, Steven A. Lubitz, Jennifer E. Ho, Mark E. Lindsay, Anthony Philippakis, Patrick T. Ellinor. [To appear] 6 | 7 | ## Example 8 | 9 | Given a pre-trained semantic segmentation model `sax_slices_jamesp_4b_hyperopted_dropout_pap_dupe.h5` and the `ml4h.tensormap` that was used to generate the data, we can proceed to run inference on new data. 10 | 11 | ```py 12 | import glob 13 | import ml4h.tensormap.ukb.mri 14 | from infer_on_sax import prepare_model, split_files_for_parallel_computing, jpp_infer_short_axis  # Local file 15 | # Load the pre-trained model with the TensorMap that was used to generate the data. 16 | model = prepare_model("/tf/sax_slices_jamesp_4b_hyperopted_dropout_pap_dupe.h5", ml4h.tensormap.ukb.mri.cine_segmented_sax_slice_jamesp) 17 | # Enumerate the target files of interest. 18 | files = glob.glob('/mnt/disks/annotated-cardiac-tensors-44k/2020-09-21/*.hd5') 19 | # Partition the files into buckets and retrieve the files corresponding to that bucket. For example, for 20 | # embarrassingly parallel computation across 50 GCP VMs with NVidia P4 GPUs using the provided shell script. 21 | files = split_files_for_parallel_computing(files, partition_number=0, total_partitions=50) 22 | jpp_infer_short_axis(files, model, output_path='/tf/') 23 | ``` 24 | 25 | The provided shell script `infer_to_hd5_local.sh` streamlines the procedure of spawning multiple GCP VMs with attached disks and GPUs for inference. Make sure you modify this file to execute the appropriate commands on the VMs. 26 | -------------------------------------------------------------------------------- /ml4h/applications/jpp_inference_rv/infer_to_hd5_local.sh: -------------------------------------------------------------------------------- 1 | # Snapshot to spawn. 2 | target_vm_snapshot=rv-parameterization 3 | # Disk(s) to mount. 4 | target_disk=annotated-cardiac-tensors-45k 5 | # Base prefix name for the VM. 6 | vm_base_name=pdiachil-rv-inference 7 | # Username. 8 | user=pdiachil 9 | # Branch of ML4H to use. 10 | ml4h_branch=pd_sf_blox 11 | 12 | for j in {0..4} 13 | do 14 | start=$((j*10)) 15 | end=$((start+10-1)) 16 | for k in {0..2} 17 | do 18 | for i in $(seq $start $end) 19 | do 20 | # Spawn a VM instance given prepared VM snapshot. 21 | gcloud compute instances create ${vm_base_name}-${i} \ 22 | --machine-type=n1-standard-8 \ 23 | --boot-disk-size=150 \ 24 | --image=${target_vm_snapshot} \ 25 | --maintenance-policy=TERMINATE \ 26 | --accelerator=type=nvidia-tesla-t4,count=1 & 27 | sleep 1 # Sleep for a second 28 | done # End loop i 29 | wait # Wait until completion 30 | done # End loop k 31 | wait # Wait until completion 32 | for i in $(seq $start $end) 33 | do 34 | # Attach disk(s) to the spawned VM instance. 
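# The disk is attached with --mode ro (read-only) below, so every VM can safely share the same source data disk.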
35 | gcloud compute instances attach-disk ${vm_base_name}-${i} --disk=${target_disk} --mode ro & 36 | sleep 1 # Sleep for a second 37 | done # End loop i 38 | wait # Wait until completeion 39 | sleep 25 # Sleep for 25 seconds 40 | for i in $(seq $start $end) 41 | do 42 | gcloud compute ssh ${vm_base_name}-${i} --command="cd /home/${user};cd ml;git pull;git checkout ${ml4h_branch};git pull;nohup bash /home/${user}/ml/scripts/infer_to_hd5.sh $i > /home/${user}/out_${i}.out 2> /home/${user}/out_${i}.err < /dev/null &" & 43 | sleep 1 # Sleep for a second 44 | done # End loop i 45 | done # End outer VM loop 46 | -------------------------------------------------------------------------------- /ml4h/logger.py: -------------------------------------------------------------------------------- 1 | """Provides config settings for the logger and a way to load them""" 2 | 3 | import sys 4 | import os 5 | import errno 6 | import logging 7 | 8 | 9 | def load_config(log_level, log_dir, log_file_basename, log_file_suffix): 10 | from logging import config as logging_config 11 | 12 | try: 13 | os.makedirs(log_dir) 14 | except OSError as e: 15 | if e.errno != errno.EEXIST: 16 | raise e 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | log_file = "{}/{}_{}.log".format(log_dir, log_file_basename, log_file_suffix) 21 | 22 | try: 23 | logging_config.dictConfig(_create_config(log_level, log_file)) 24 | success_msg = "Logging configuration was loaded. Log messages can be found at {}.".format(log_file) 25 | logger.info(success_msg) 26 | except Exception as e: 27 | logger.error("Failed to load logging config!") 28 | raise e 29 | 30 | 31 | def _create_config(log_level, log_file): 32 | return { 33 | 'version': 1, 34 | 'disable_existing_loggers': False, 35 | 'formatters': { 36 | 'simple': { 37 | 'format': '%(asctime)s - %(module)s:%(lineno)d - %(levelname)s - %(message)s', 38 | }, 39 | 'detailed': { 40 | 'format': '%(name)s:%(levelname)s %(module)s:%(lineno)d: %(message)s', 41 | }, 42 | }, 43 | 'handlers': { 44 | 'console': { 45 | 'level': log_level, 46 | 'class': 'logging.StreamHandler', 47 | 'formatter': 'simple', 48 | 'stream': sys.stdout, 49 | }, 50 | 'file': { 51 | 'level': log_level, 52 | 'class': 'logging.FileHandler', 53 | 'formatter': 'simple', 54 | 'filename': log_file, 55 | 'mode': 'w', 56 | }, 57 | }, 58 | 'loggers': { 59 | '': { 60 | 'handlers': ['console', 'file'], 61 | 'level': log_level, 62 | }, 63 | }, 64 | } 65 | -------------------------------------------------------------------------------- /ml4h/ml4ht_integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/ml4ht_integration/__init__.py -------------------------------------------------------------------------------- /ml4h/models/Block.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Callable 3 | 4 | import tensorflow as tf 5 | 6 | from ml4h.TensorMap import TensorMap 7 | 8 | Tensor = tf.Tensor 9 | Block = Callable[[Tensor, Dict[TensorMap, List[Tensor]]], Tensor] 10 | 11 | 12 | class Block(ABC): 13 | @abstractmethod 14 | def __call__(self, x: Tensor, intermediates: Dict[TensorMap, List[Tensor]] = None) -> Tensor: 15 | pass 16 | 17 | def can_apply(self): 18 | return True 19 | -------------------------------------------------------------------------------- /ml4h/models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/models/__init__.py -------------------------------------------------------------------------------- /ml4h/normalizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | from ml4h.defines import EPS 5 | from tensorflow.keras.applications import imagenet_utils 6 | 7 | 8 | class Normalizer(ABC): 9 | @abstractmethod 10 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 11 | """Shape preserving transformation""" 12 | pass 13 | 14 | def normalize_loading_option(self, tensor: np.ndarray, _) -> np.ndarray: 15 | """Shape preserving transformation for use with DataDescription. 16 | Defaults to the normalize function if not defined in the descendant""" 17 | return self.normalize(tensor) 18 | 19 | def un_normalize(self, tensor: np.ndarray) -> np.ndarray: 20 | """The inverse of normalize if possible. Otherwise identity.""" 21 | return tensor 22 | 23 | 24 | class Standardize(Normalizer): 25 | def __init__(self, mean: float, std: float): 26 | self.mean, self.std = mean, std 27 | 28 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 29 | return (tensor - self.mean) / self.std 30 | 31 | def un_normalize(self, tensor: np.ndarray) -> np.ndarray: 32 | return tensor * self.std + self.mean 33 | 34 | 35 | class ZeroMeanStd1(Normalizer): 36 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 37 | tensor -= np.mean(tensor) 38 | tensor /= np.std(tensor) + EPS 39 | return tensor 40 | 41 | 42 | class NonZeroNormalize(Normalizer): 43 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 44 | nonzero = tensor > 0 45 | tensor[nonzero] = (tensor[nonzero] - tensor[nonzero].mean() + 1e-9) / ( 46 | tensor[nonzero].std() + 1e-9 47 | ) 48 | return tensor 49 | 50 | 51 | class TopKNormalize(Normalizer): 52 | def __init__(self, n_top: int = 50): 53 | self.n_top = n_top 54 | 55 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 56 | """Find top K itensity voxels are set upper range to the mean of those""" 57 | upper = np.mean(sorted(np.max(tensor, axis=-1).flatten())[::-1][0:self.n_top]) 58 | tensor = np.where(tensor >= upper, upper, tensor) 59 | tensor /= tensor.max() 60 | return tensor 61 | 62 | 63 | class ImagenetNormalizeTorch(Normalizer): 64 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 65 | # This is equivalent to: 66 | # x /= 255. 
67 | # mean = [0.485, 0.456, 0.406] 68 | # std = [0.229, 0.224, 0.225] 69 | # when mode is torch 70 | return imagenet_utils.preprocess_input(tensor, data_format=None, mode="torch") 71 | 72 | 73 | class RandomStandardize(Normalizer): 74 | def __init__(self, mean: float, std: float, ratio: float = 0.5): 75 | self.mean, self.std, self.ratio = mean, std, ratio 76 | 77 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 78 | if np.random.rand() > self.ratio: 79 | return (tensor - self.mean) / (self.std + EPS) 80 | else: 81 | return (tensor - np.mean(tensor)) / (np.std(tensor) + EPS) 82 | 83 | def un_normalize(self, tensor: np.ndarray) -> np.ndarray: 84 | return tensor * self.std + self.mean 85 | -------------------------------------------------------------------------------- /ml4h/tensorize/PARTNERS.md: -------------------------------------------------------------------------------- 1 | # Partners ECG 2 | Organizing and Tensorizing MUSE 12-lead ECGs 3 | 4 | ## Table of Contents 5 | 1. [Organizing XMLs and Removing Duplicates](#organizing-xmls-and-removing-duplicates) 6 | 2. [Tensorizing XMLs to HDF5](#tensorizing-xmls-to-hdf5) 7 | 3. [ECG Data Structure](#ecg-data-structure) 8 | 4. [Extracting ECG Metadata](#extracting-ecg-metadata) 9 | 5. [Other documentation](#other-documentation) 10 | 11 | ## Organizing XMLs and Removing Duplicates 12 | `ingest/partners_ecg/organize_xml.py` moves XML files from a single directory into the appropriate yyyy-mm directory. 13 | 14 | `ingest/partners_ecg/remove_xml_duplicates.py` finds and removes exact duplicate XML files, as defined by every bit of two files being identical, determined via SHA-256 hashing. 15 | 16 | ## Tensorizing XMLs to HDF5 17 | `tensorize_partners` mode in `recipes.py` extracts data from all XML files and saves as [HDF5 files](https://www.hdfgroup.org). Tensorization also removes duplicates that contain nearly the same information, except for minor differences, for example minor version changes in acquisition software. This duplicate detection is done by matching patient-date-time fields. 18 | 19 | This mode is called with the following arguments: 20 | `--xml_folder` to specify the directory containing ECG XMLs. 21 | `--tensors` to specify the directory where tensorized HD5 files should be saved. 22 | 23 | All the ECGs belonging to one patient, identified by medical record number (MRN), will be saved to one HD5, indexed by ECG acquisition date and time: 24 | ``` 25 | .hd5 26 | └--partners_ecg_rest 27 | | 28 | |--date_1 29 | | └--ECG Data 30 | | 31 | └--date_2 32 | └--ECG Data 33 | ``` 34 | 35 | ## ECG Data Structure 36 | Voltage is saved from XMLs as a dictionary of numpy arrays indexed by leads in the set `("I", "II", "V1", "V2", "V3", "V4", "V5", "V6")`, e.g.: 37 | 38 | ``` 39 | voltage = {'I': array([0, -4, -2, ..., 7]), 40 | {'II': array([2, -9, 0, ..., 5]), 41 | ... 42 | {'V6': array([1, -4, -3, ..., 4]), 43 | ``` 44 | 45 | Every other element extracted from the XML is returned as a string, even if the underlying primitive type is a number (e.g. age). 
Here are some of the more important elements: 46 | 47 | ``` 48 | acquisitiondate 49 | atrialrate 50 | dateofbirth 51 | diagnosis_computer 52 | diagnosis_md 53 | ecgsamplebase 54 | ecgsampleexponent 55 | gender 56 | heightin 57 | location 58 | locationname 59 | overreaderfirstname 60 | overreaderid 61 | overreaderlastname 62 | patientid 63 | paxis 64 | poffset 65 | ponset 66 | printerval 67 | qoffset 68 | qonset 69 | qrscount 70 | qrsduration 71 | qtcfrederica 72 | qtcorrected 73 | qtinterval 74 | race 75 | raxis 76 | taxis 77 | toffset 78 | ventricularrate 79 | weightlbs 80 | ``` 81 | 82 | ## Extracting ECG metadata 83 | 84 | `explore` mode in `recipes.py` extracts data specified by `--input_tensors` from all HD5 files given to `--tensors` and calculates summary statistics. Additionally, all metadata is saved to a large CSV file: 85 | 86 | This CSV file will be used to construct a performant, queryable database to identify future cohorts for research projects. 87 | 88 | ## Other documentation 89 | GE documentation is stored in a shared Partners Dropbox folder ([link](https://www.dropbox.com/sh/c5tgm0lory72ge0/AADqKvUicDdyWzHYhtad0lU4a?dl=0)), including 1. physician's guide to the Marquette 12SL ECG analysis program, 2. guide to MuseDB search, and 3. Muse v9 XML developer's guide. 90 | -------------------------------------------------------------------------------- /ml4h/tensorize/README.md: -------------------------------------------------------------------------------- 1 | # Run Dataflow 2 | The following steps will run a Dataflow pipeline remotely, which in turn will, tensorize fields of a type 3 | specified by the user (e.g. categorical, continuous) and write them onto a GCS bucket in the form of 4 | one `hd5` file per sample id. 5 | 6 | * Clone the repo and cd into it: 7 | ``` 8 | git clone git@github.com:broadinstitute/ml4h.git 9 | cd ml4h 10 | ``` 11 | 12 | * Create and activate the right Python environment: 13 | ``` 14 | conda env create -f ml4h/tensorize/dataflow/ml4h_dataflow.yml 15 | conda activate ml4h_dataflow 16 | ``` 17 | 18 | * Make sure you are authenticated by Google Cloud: 19 | ``` 20 | gcloud auth application-default login 21 | ``` 22 | 23 | * Re install ml4h if you have made any changes: 24 | ``` 25 | pip install . 26 | ``` 27 | 28 | * Run with the help option to see the list of command line arguments. 29 | ``` 30 | python ml4h/tensorize/tensorize_dataflow.py -h 31 | ``` 32 | 33 | * Comment out the requirements in setup.py. Because some dataflow requirements conflict with ml4h base requirements you must comment out the lines (currently lines 6 and 16) in setup.py in the repo root: 34 | ``` 35 | requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8') 36 | ... 37 | install_requires=requirements, 38 | ``` 39 | 40 | * **Note** that Google requires the `id` consist of only the 41 | characters `[-a-z0-9]`, i.e. starting with a letter and ending with a letter or number. 42 | 43 | * Run the application to submit the pipeline to Dataflow to be executed remotely provided the 44 | command line argument `--beam_runner` is set to `DataflowRunner`. Set it to `DirectRunner` for local execution. 
45 | For example: 46 | ``` 47 | python ml4h/tensorize/tensorize_dataflow.py \ 48 | --id categorical-v2023-01-16 \ 49 | --tensor_type categorical \ 50 | --bigquery_dataset ukbb_dev \ 51 | --beam_runner DataflowRunner \ 52 | --repo_root /Users/sam/Dropbox/Code/ml4h \ 53 | --gcs_output_path tensors/continuous_v2023_01_17 54 | ``` 55 | 56 | * The pipeline can be run multiple times to tensorize different types of fields. This will populate the per-sample tensors 57 | in specified GCS buckets. In order to unify them, they can be downloaded via `gsutil` as shown below 58 | and merged using `merge_hd5s.py` script. 59 | ``` 60 | gsutil -m cp -r 61 | ``` 62 | -------------------------------------------------------------------------------- /ml4h/tensorize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensorize/__init__.py -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensorize/dataflow/__init__.py -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/fieldids.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "fieldid", 5 | "type": "INTEGER" 6 | } 7 | ] 8 | -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/load_fieldids.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | 6 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 7 | #RAW DATA LOCATIONS 8 | #field file should already be in cloud before running this. looks like 9 | # field_id 10 | # val1 11 | # val2 12 | # etc 13 | # upload using gsutil cp fieldids.csv gs://ml4h/data/ 14 | # depending on access pattern, we can make this less one-offy. 
15 | FIELDID_FILE="gs://ml4cvd/data/fieldids.csv" 16 | 17 | #SHARED_DATASET -- should already be created, location of shared data across UKBB applications 18 | SHARED_DATA="shared_data" 19 | 20 | 21 | bq load \ 22 | --replace \ 23 | --source_format=CSV \ 24 | --skip_leading_rows 1 \ 25 | --schema ${__dir}/fieldids.json \ 26 | ${SHARED_DATA}.tensorization_fieldids ${FIELDID_FILE} 27 | -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/ml4h_dataflow.yml: -------------------------------------------------------------------------------- 1 | # Minimal set of packages to tensorize using Dataflow (tested on OSX-64) 2 | # To be used to create the Python env on which to pip freeze to create the requirements file for Dataflow 3 | 4 | name: ml4h_dataflow 5 | channels: 6 | - defaults 7 | - anaconda 8 | dependencies: 9 | - python==3.8.10 10 | # - pip==22.3.1 11 | # - pip: 12 | # - apache-beam[gcp]==2.12.0 13 | # - google-cloud-storage==1.13.0 14 | # - h5py==2.9.0 15 | -------------------------------------------------------------------------------- /ml4h/tensorize/tensorize_dataflow.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import logging 4 | 5 | import apache_beam as beam 6 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions 7 | 8 | from ml4h.defines import GCS_BUCKET 9 | from ml4h.tensorize.dataflow import bigquery_ukb_queries 10 | 11 | 12 | def parse_args(): 13 | now_string = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M') 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument( 18 | '--id', default=f"run_{now_string}", 19 | help='User-defined identifier for this pipeline run. ' 20 | 'Per Google: the name must consist of only the characters [-a-z0-9], ' 21 | 'starting with a letter and ending with a letter or number.', 22 | ) 23 | parser.add_argument( 24 | '--tensor_type', default="categorical", 25 | help='Type of data to be tensorized', 26 | choices=['categorical', 'continuous', 'icd', 'disease', 'death', 'phecode_disease'], 27 | ) 28 | parser.add_argument( 29 | '--bigquery_dataset', default='ukbb_dev', 30 | help='BigQuery dataset where the data will be drawn from', 31 | ) 32 | parser.add_argument( 33 | '--beam_runner', default='DirectRunner', 34 | help='Apache Beam runner that will execute the pipeline', 35 | choices=['DirectRunner', 'DataflowRunner'], 36 | ) 37 | parser.add_argument( 38 | '--repo_root', 39 | help='Root directory of the cloned ml repo', 40 | ) 41 | parser.add_argument( 42 | '--gcp_project', default='broad-ml4cvd', 43 | help='Name of the Google Cloud Platform project', 44 | ) 45 | parser.add_argument( 46 | '--gcp_region', default='us-central1', 47 | help='Google Cloud Platform region', 48 | ) 49 | # parser.add_argument('--gcs_bucket', default='ml4h', 50 | # help='Name of the Google Cloud Storage bucket where tensors will be written to') 51 | parser.add_argument( 52 | '--gcs_output_path', 53 | help='gs:// folder path excluding the bucket name where tensors will be written to ' 54 | 'e.g. 
specifying /path/to/folder will write to gs:///path/to/folder', 55 | ) 56 | parser.add_argument( 57 | "--logging_level", default='INFO', help="Logging level", 58 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 59 | ) 60 | 61 | return parser.parse_args() 62 | 63 | 64 | if __name__ == "__main__": 65 | args = parse_args() 66 | 67 | logging.getLogger().setLevel(args.logging_level) 68 | 69 | packaging_args = [ 70 | f'--requirements_file={args.repo_root}/ml4h/tensorize/dataflow/requirements_ml4h_dataflow.txt', 71 | f'--setup_file={args.repo_root}/setup.py', 72 | ] 73 | 74 | pipeline_opts = PipelineOptions(flags=packaging_args) 75 | google_cloud_options = pipeline_opts.view_as(GoogleCloudOptions) 76 | google_cloud_options.region = args.gcp_region 77 | google_cloud_options.project = args.gcp_project 78 | google_cloud_options.job_name = args.id 79 | google_cloud_options.staging_location = f"gs://{GCS_BUCKET}/dataflow/staging" 80 | google_cloud_options.temp_location = f"gs://{GCS_BUCKET}/dataflow/temp" 81 | pipeline_opts.view_as(StandardOptions).runner = args.beam_runner 82 | 83 | pipeline = beam.Pipeline(options=pipeline_opts) 84 | 85 | bigquery_ukb_queries.tensorize_sql_fields( 86 | pipeline, 87 | args.gcs_output_path, 88 | args.bigquery_dataset, 89 | args.tensor_type, 90 | ) 91 | -------------------------------------------------------------------------------- /ml4h/tensormap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensormap/__init__.py -------------------------------------------------------------------------------- /ml4h/tensormap/gatk.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import logging 3 | import numpy as np 4 | from typing import Dict 5 | 6 | from ml4h.TensorMap import TensorMap, Interpretation 7 | from ml4h.normalizer import Standardize 8 | 9 | DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3} 10 | VARIANT_LABELS = {'NOT_SNP': 0, 'NOT_INDEL': 1, 'SNP': 2, 'INDEL': 3} 11 | 12 | 13 | def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray: 14 | return np.array(hd5[tm.name]) 15 | 16 | 17 | reference = TensorMap('reference', shape=(128, len(DNA_SYMBOLS)), tensor_from_file=tensor_from_hd5) 18 | read_tensor = TensorMap('read_tensor', shape=(128, 128, 15), tensor_from_file=tensor_from_hd5) 19 | dp = TensorMap('dp', shape=(1,), normalization=Standardize(mean=34, std=8.6), tensor_from_file=tensor_from_hd5) 20 | fs = TensorMap('fs', shape=(1,), normalization=Standardize(mean=4.03, std=7.2), tensor_from_file=tensor_from_hd5) 21 | qd = TensorMap('qd', shape=(1,), normalization=Standardize(mean=12.8, std=6.1), tensor_from_file=tensor_from_hd5) 22 | mq = TensorMap('mq', shape=(1,), normalization=Standardize(mean=59.1, std=8.6), tensor_from_file=tensor_from_hd5) 23 | sor = TensorMap('sor', shape=(1,), normalization=Standardize(mean=1.03, std=0.8), tensor_from_file=tensor_from_hd5) 24 | mqranksum = TensorMap( 25 | 'mqranksum', shape=(1,), 26 | normalization=Standardize(mean=-0.23, std=1.1), tensor_from_file=tensor_from_hd5, 27 | ) 28 | readposranksum = TensorMap( 29 | 'readposranksum', shape=(1,), 30 | normalization=Standardize(mean=-0.04, std=1.2), tensor_from_file=tensor_from_hd5, 31 | ) 32 | 33 | 34 | def variant_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray: 35 | one_hot = np.zeros(tm.shape, dtype=np.float32) 
36 | variant_str = str(hd5['variant_label'][()], 'utf-8') 37 | for channel in tm.channel_map: 38 | if channel.lower() == variant_str.lower(): 39 | one_hot[tm.channel_map[channel]] = 1.0 40 | if one_hot.sum() != 1: 41 | raise ValueError(f'TensorMap {tm.name} missing or invalid label: {variant_str} one_hot: {one_hot}') 42 | return one_hot 43 | 44 | 45 | variant_label = TensorMap( 46 | 'variant_label', Interpretation.CATEGORICAL, 47 | shape=(len(VARIANT_LABELS),), 48 | tensor_from_file=variant_label_from_hd5, 49 | channel_map=VARIANT_LABELS, 50 | ) 51 | -------------------------------------------------------------------------------- /ml4h/tensormap/mgb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensormap/mgb/__init__.py -------------------------------------------------------------------------------- /ml4h/tensormap/mgb/xdl.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import h5py 4 | import numpy as np 5 | from ml4h.TensorMap import TensorMap, Interpretation 6 | 7 | ecg_5000_std = TensorMap('ecg_5000_std', Interpretation.CONTINUOUS, shape=(5000, 12)) 8 | ecg_single_lead_I = TensorMap(f'ecg_strip_I', Interpretation.CONTINUOUS, shape=(5000, 1)) 9 | 10 | hypertension_icd_only = TensorMap( 11 | name='hypertension_icd_only', interpretation=Interpretation.CATEGORICAL, 12 | channel_map={'no_hypertension_icd_only': 0, 'hypertension_icd_only': 1}, 13 | ) 14 | hypertension_icd_bp = TensorMap( 15 | name='hypertension_icd_bp', interpretation=Interpretation.CATEGORICAL, 16 | channel_map={'no_hypertension_icd_bp': 0, 'hypertension_icd_bp': 1}, 17 | ) 18 | hypertension_icd_bp_med = TensorMap( 19 | name='hypertension_icd_bp_med', interpretation=Interpretation.CATEGORICAL, 20 | channel_map={'no_hypertension_icd_bp_med': 0, 'hypertension_icd_bp_med': 1}, 21 | ) 22 | hypertension_med = TensorMap( 23 | name='start_fu_hypertension_med', interpretation=Interpretation.CATEGORICAL, 24 | channel_map={'no_hypertension_medication': 0, 'hypertension_medication': 1}, 25 | ) 26 | 27 | lvef = TensorMap(name='LVEF', interpretation=Interpretation.CONTINUOUS, channel_map={'LVEF': 0}) 28 | 29 | age = TensorMap(name='age_in_days', interpretation=Interpretation.CONTINUOUS, channel_map={'age_in_days': 0}) 30 | sex = TensorMap(name='sex', interpretation=Interpretation.CATEGORICAL, channel_map={'Female': 0, 'Male': 1}) 31 | 32 | cad = TensorMap(name='cad', interpretation=Interpretation.CATEGORICAL, channel_map={'no_cad': 0, 'cad': 1}) 33 | dm = TensorMap(name='dm', interpretation=Interpretation.CATEGORICAL, channel_map={'no_dm': 0, 'dm': 1}) 34 | hypercholesterolemia = TensorMap( 35 | name='hypercholesterolemia', interpretation=Interpretation.CATEGORICAL, 36 | channel_map={'no_hypercholesterolemia': 0, 'hypercholesterolemia': 1}, 37 | ) 38 | 39 | n_intervals = 25 40 | af_tmap = TensorMap('survival_curve_af', Interpretation.SURVIVAL_CURVE, shape=(n_intervals*2,)) 41 | death_tmap = TensorMap('death_event', Interpretation.SURVIVAL_CURVE, shape=(n_intervals*2,)) 42 | 43 | 44 | def ecg_median_biosppy(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray: 45 | tensor = np.zeros(tm.shape, dtype=np.float32) 46 | for lead in tm.channel_map: 47 | tensor[:, tm.channel_map[lead]] = hd5[f'{tm.path_prefix}{lead}'] 48 | tensor = np.nan_to_num(tensor) 49 | return tensor 50 | 51 | ecg_channel_map = { 52 | 'I': 0, 
'II': 1, 'III': 2, 'aVR': 3, 'aVL': 4, 'aVF': 5, 53 | 'V1': 6, 'V2': 7, 'V3': 8, 'V4': 9, 'V5': 10, 'V6': 11, 54 | } 55 | 56 | ecg_biosppy_median_60bpm = TensorMap( 57 | 'median', Interpretation.CONTINUOUS, path_prefix='median_60bpm_', shape=(600, 12), 58 | tensor_from_file=ecg_median_biosppy, 59 | channel_map=ecg_channel_map, 60 | ) 61 | -------------------------------------------------------------------------------- /ml4h/tensormap/ukb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensormap/ukb/__init__.py -------------------------------------------------------------------------------- /ml4h/tensormap/ukb/embedding.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | 3 | from ml4h.TensorMap import TensorMap, Interpretation 4 | from ml4h.models.model_factory import get_custom_objects 5 | from ml4h.tensormap.ukb.ecg import ecg_rest_median_raw_10 6 | 7 | custom_dict = get_custom_objects([]) 8 | ecg_model_file = '/home/sam/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/encoder_ecg_rest_median_raw_10.h5' 9 | ecg_median_autoencoder_256d = TensorMap( 10 | 'ecg_median_autoencoder_256d', Interpretation.EMBEDDING, shape=(256,), 11 | model=load_model(ecg_model_file, custom_objects=custom_dict), 12 | parents=[ecg_rest_median_raw_10], 13 | ) 14 | -------------------------------------------------------------------------------- /ml4h/visualization_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/visualization_tools/__init__.py -------------------------------------------------------------------------------- /ml4h/visualization_tools/annotations_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "sample_id", 4 | "type": "STRING", 5 | "mode": "REQUIRED" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "annotator", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "annotation_timestamp", 15 | "type": "TIMESTAMP" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "key", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "value_numeric", 25 | "type": "NUMERIC" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "value_string", 30 | "type": "STRING" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "comment", 35 | "type": "STRING" 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /ml4h/visualization_tools/ecg_static_plots.py: -------------------------------------------------------------------------------- 1 | """Methods for integration of static plots within notebooks.""" 2 | import os 3 | import tempfile 4 | from typing import List, Optional, Union 5 | 6 | from IPython.display import HTML 7 | from IPython.display import SVG 8 | import numpy as np 9 | from ml4h.plots import plot_ecg_rest 10 | from ml4h.runtime_data_defines import get_resting_ecg_hd5_folder 11 | from ml4h.runtime_data_defines import get_resting_ecg_svg_folder 12 | import tensorflow as tf 13 | 14 | 15 | def display_resting_ecg(sample_id: Union[int, str], folder: Optional[str] = None) -> Union[HTML, SVG]: 16 | """Retrieve (or render) and display the SVG of the resting ECG. 
17 | 18 | Args: 19 | sample_id: The id of the ECG SVG to retrieve. 20 | folder: The local or Cloud Storage path under which the files reside. 21 | 22 | Returns: 23 | An IPython SVG object or a notebook-friendly error. 24 | """ 25 | if folder is None: 26 | svg_folder = get_resting_ecg_svg_folder(sample_id) 27 | hd5_folder = get_resting_ecg_hd5_folder(sample_id) 28 | else: 29 | svg_folder = folder 30 | hd5_folder = folder 31 | 32 | with tempfile.TemporaryDirectory() as tmpdirname: 33 | # First, see if we already have one rendered. 34 | sample_svg = str(sample_id) + '.svg' 35 | local_path = os.path.join(tmpdirname, sample_svg) 36 | try: 37 | tf.io.gfile.copy(src=os.path.join(svg_folder, sample_svg), dst=local_path) 38 | return SVG(filename=local_path) 39 | except (tf.errors.NotFoundError, tf.errors.PermissionDeniedError) as e: 40 | pass 41 | # If not, dynamically render a SVG 42 | sample_hd5 = str(sample_id) + '.hd5' 43 | local_path = os.path.join(tmpdirname, sample_hd5) 44 | try: 45 | tf.io.gfile.copy(src=os.path.join(hd5_folder, sample_hd5), dst=local_path) 46 | except (tf.errors.NotFoundError, tf.errors.PermissionDeniedError) as e: 47 | return HTML(f''' 48 |
49 | Warning: Resting ECG not available for sample {sample_id} in {svg_folder} or {hd5_folder}: 50 | {e.message} 51 | Use the folder parameter to read from a different local directory or Cloud Storage bucket. 52 |
''') 53 | 54 | try: 55 | # We don't need the resulting SVG, so send it to a temporary directory. 56 | with tempfile.TemporaryDirectory() as tmpdirname: 57 | return plot_ecg_rest(tensor_paths=[local_path], rows=[0], out_folder=tmpdirname, is_blind=False) 58 | except Exception as e: # pylint: disable=broad-except 59 | return HTML(f''' 60 |
61 | Warning: Unable to render static plot of resting ECG for sample {sample_id} from {hd5_folder}: 62 | {e} 63 |
''') 64 | 65 | 66 | def major_breaks_x_resting_ecg(limits: List[float]) -> np.array: 67 | """Method to compute breaks for plotnine plots of ECG resting data. 68 | 69 | Args: 70 | limits: The approximate limits. 71 | 72 | Returns: 73 | The desired limits. 74 | """ 75 | step = 0.2 76 | if limits[0] <= 0: 77 | min_break = 0.0 78 | max_break = 2.5 79 | elif limits[0] <= 2.5: 80 | min_break = 2.5 81 | max_break = 5.0 82 | elif limits[0] <= 5.0: 83 | min_break = 5.0 84 | max_break = 7.5 85 | else: 86 | min_break = 7.5 87 | max_break = 10.0 88 | return np.arange(min_break, max_break + step, step) 89 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_checkpoint/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_checkpoint/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d2196f91e9ae5e932b9c3dd546e3ce31e1a509fae5c3c13291bb0e496fb4e33a 3 | size 34826365 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_checkpoint/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0af27bb4a0285fadf47c471f4f9047d86d767e6f697d9ec40cccd246b20f9cb7 3 | size 99789 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from droid_mvp_model_description import create_movinet_classifier, create_regressor_classifier 7 | import logging 8 | tf.get_logger().setLevel(logging.ERROR) 9 | 10 | pretrained_chkp_dir = "droid_mvp_checkpoint/chkp" 11 | movinet_chkp_dir = 'movinet_a2_base/' 12 | 13 | movinet_model, backbone = create_movinet_classifier( 14 | n_input_frames=16, 15 | batch_size=16, 16 | num_classes=600, 17 | checkpoint_dir=movinet_chkp_dir, 18 | ) 19 | 20 | backbone_output = backbone.layers[-1].output[0] 21 | flatten = tf.keras.layers.Flatten()(backbone_output) 22 | encoder = tf.keras.Model(inputs=[backbone.input], outputs=[flatten]) 23 | 24 | func_args = { 25 | 'input_shape': (16, 224, 224, 3), 26 | 'n_output_features': 0, # number of regression features 27 | 'categories': {"mvp_status_binary":2, "mvp_status_detailed":6}, 28 | 'category_order': ["mvp_status_binary", "mvp_status_detailed"], 29 | } 30 | 31 | model_plus_head = create_regressor_classifier(encoder, **func_args) 32 | 33 | model_plus_head.load_weights(pretrained_chkp_dir) 34 | 35 | random_video = np.random.random((1, 16, 224, 224, 3)) 36 | 37 | print(f""" 38 | DROID-MVP Predictions: 39 | {model_plus_head.predict(random_video)} 40 | """) -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_model_description.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from official.vision.beta.projects.movinet.modeling import movinet, 
movinet_model 4 | 5 | hidden_units = 256 6 | dropout_rate = 0.5 7 | 8 | def create_movinet_classifier( 9 | n_input_frames, 10 | batch_size, 11 | checkpoint_dir, 12 | num_classes, 13 | freeze_backbone=False 14 | ): 15 | backbone = movinet.Movinet(model_id='a2') 16 | model = movinet_model.MovinetClassifier(backbone=backbone, num_classes=600) 17 | model.build([1, 1, 1, 1, 3]) 18 | checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) 19 | checkpoint = tf.train.Checkpoint(model=model) 20 | status = checkpoint.restore(checkpoint_path) 21 | status.assert_existing_objects_matched() 22 | 23 | model = movinet_model.MovinetClassifier( 24 | backbone=backbone, 25 | num_classes=num_classes 26 | ) 27 | model.build([batch_size, n_input_frames, 224, 224, 3]) 28 | 29 | if freeze_backbone: 30 | for layer in model.layers[:-1]: 31 | layer.trainable = False 32 | model.layers[-1].trainable = True 33 | 34 | return model, backbone 35 | 36 | def create_regressor_classifier(encoder, trainable=True, input_shape=(224, 224, 3), n_output_features=0, categories={}, 37 | category_order=None, add_dense={'regressor': False, 'classifier': False}): 38 | for layer in encoder.layers: 39 | layer.trainable = trainable 40 | 41 | inputs = tf.keras.Input(shape=input_shape, name='image') 42 | features = encoder(inputs) 43 | features = tf.keras.layers.Dropout(dropout_rate)(features) 44 | features = tf.keras.layers.Dense(hidden_units, activation="relu")(features) 45 | features = tf.keras.layers.Dropout(dropout_rate)(features) 46 | 47 | outputs = [] 48 | if n_output_features > 0: 49 | if add_dense['regressor']: 50 | features_reg = tf.keras.layers.Dense(hidden_units, activation="relu")(features) 51 | features_reg = tf.keras.layers.Dropout(dropout_rate)(features_reg) 52 | outputs.append(tf.keras.layers.Dense(n_output_features, activation=None, name='echolab')(features_reg)) 53 | else: 54 | outputs.append(tf.keras.layers.Dense(n_output_features, activation=None, name='echolab')(features)) 55 | if len(categories.keys()) > 0: 56 | if add_dense['classifier']: 57 | features = tf.keras.layers.Dense(hidden_units, activation="relu")(features) 58 | features = tf.keras.layers.Dropout(dropout_rate)(features) 59 | for category in category_order: 60 | activation = 'softmax' 61 | n_classes = categories[category] 62 | outputs.append(tf.keras.layers.Dense(n_classes, name='cls_'+category, activation=activation)(features)) 63 | 64 | model = tf.keras.Model(inputs=inputs, outputs=outputs, name="regressor_classifier") 65 | 66 | return model 67 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/movinet_a2_base/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:78fdb1e081e9fc8d4e10e3bca4fe00117a236ddc4726bbf75594db19ae1be665 3 | size 69 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/movinet_a2_base/ckpt-1.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f393b7ef377ffaf59bd8bf081c72d05e74a576c5bba0d4bc180315432e49e557 3 | size 21240182 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/movinet_a2_base/ckpt-1.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:1d801f29eace2f39bcc7b268ecf0d1bd117d9b3881cbcd810721c8c1b1f6c161 3 | size 10102 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/readme.md: -------------------------------------------------------------------------------- 1 | # DROID-MVP Inference Example 2 | 3 | This is a simple example script demonstrating how to load and run the DROID-MVP model. Model training and inference were performed using the code provided in the ML4H [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). The example below was adapted from the DROID inference code. 4 | 5 | 1. Download the DROID docker image. Note: the docker image is not compatible with Apple Silicon. 6 | 7 | `docker pull alalusim/droid:latest` 8 | 9 | 2. Clone the GitHub repo, including the DROID-MVP model checkpoints stored using git lfs. 10 | 11 | ``` 12 | git clone https://github.com/broadinstitute/ml4h.git 13 | git lfs pull --include ml4h/model_zoo/DROID-MVP/droid_mvp_checkpoint/* 14 | git lfs pull --include ml4h/model_zoo/DROID-MVP/movinet_a2_base/* 15 | ``` 16 | 17 | 3. Run the docker image while mounting the ml4h directory, then run the example inference script. 18 | 19 | `docker run -it -v {PATH TO CLONED ML4H DIRECTORY}:/ml4h/ alalusim/droid:latest` 20 | 21 | ``` 22 | cd /ml4h/model_zoo/DROID-MVP/ 23 | python droid_mvp_inference.py 24 | ``` 25 | 26 | To use with your own data, format echocardiogram videos as tensors with shape (16, 224, 224, 3) before passing them to the model. Code for data preprocessing, storage, loading, training, and inference can be found in the ml4h [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). 27 | 28 | Model outputs for DROID-MVP take the form: 29 | ``` 30 | [ 31 | [["MVP", "Not MVP"]], 32 | [["Anterior ", "Bileaflet", "Not MVP", "Posterior", "Superior Displacement", "MVP not otherwise specified"]], 33 | ] 34 | ``` 35 | 36 | Note that the model was optimized for predicting binary MVP status (the primary task) and that detailed MVP status was used as an auxiliary task to improve performance on the primary classification task.
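If it helps to make the raw prediction arrays easier to read, the short sketch below (not part of the repository) shows one way to map the two heads returned by `model_plus_head.predict(...)` onto the label orderings listed above. The helper name `decode_droid_mvp` and the assumption that each head is a softmax over the classes in exactly the order shown are ours, so check them against your checkpoint before relying on the output.

```
import numpy as np

# Hypothetical helper (not in the repo): turn the two DROID-MVP heads into labels.
# Assumes `predictions` is the list returned for a single video, ordered
# [binary head, detailed head], each a softmax over the classes listed above.
BINARY_LABELS = ["MVP", "Not MVP"]
DETAILED_LABELS = [
    "Anterior", "Bileaflet", "Not MVP", "Posterior",
    "Superior Displacement", "MVP not otherwise specified",
]

def decode_droid_mvp(predictions):
    binary_probs = np.asarray(predictions[0])[0]
    detailed_probs = np.asarray(predictions[1])[0]
    return {
        "mvp_status_binary": BINARY_LABELS[int(np.argmax(binary_probs))],
        "mvp_status_detailed": DETAILED_LABELS[int(np.argmax(detailed_probs))],
    }
```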
37 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_checkpoint/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_checkpoint/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2af9efac9cec47cdd0bb4ca0c539b153657583fc6f261ac42bf5ef01031792f0 3 | size 34827706 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_checkpoint/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7a6c10fc7bfb8667a75ae8faeb19ed01278ca6d3fbf67488dd35834209921d17 3 | size 100586 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from droid_rv_model_description import create_movinet_classifier, create_regressor_classifier, rescale_droid_rv_outputs, rescale_droid_rvef_outputs 7 | import logging 8 | tf.get_logger().setLevel(logging.ERROR) 9 | 10 | droid_rv_checkpoint = "droid_rv_checkpoint/chkp" 11 | droid_rvef_checkpoint = "droid_rvef_checkpoint/chkp" 12 | movinet_chkp_dir = 'movinet_a2_base/' 13 | 14 | movinet_model, backbone = create_movinet_classifier( 15 | n_input_frames=16, 16 | batch_size=16, 17 | num_classes=600, 18 | checkpoint_dir=movinet_chkp_dir, 19 | ) 20 | 21 | backbone_output = backbone.layers[-1].output[0] 22 | flatten = tf.keras.layers.Flatten()(backbone_output) 23 | encoder = tf.keras.Model(inputs=[backbone.input], outputs=[flatten]) 24 | 25 | droid_rv_func_args = { 26 | 'input_shape': (16, 224, 224, 3), 27 | 'n_output_features': 2, # number of regression features 28 | 'categories': {"RV_size":2, "RV_function":2, "Sex":2}, 29 | 'category_order': ["RV_size", "RV_function", "Sex"], 30 | } 31 | 32 | droid_rvef_func_args = { 33 | 'input_shape': (16, 224, 224, 3), 34 | 'n_output_features': 4, # number of regression features 35 | 'categories': {"Sex":2}, 36 | 'category_order': ["Sex"], 37 | } 38 | 39 | droid_rv_model = create_regressor_classifier(encoder, **droid_rv_func_args) 40 | droid_rv_model.load_weights(droid_rv_checkpoint) 41 | 42 | droid_rvef_model = create_regressor_classifier(encoder, **droid_rvef_func_args) 43 | droid_rvef_model.load_weights(droid_rvef_checkpoint) 44 | 45 | random_video = np.random.random((1, 16, 224, 224, 3)) 46 | 47 | droid_rv_pred = droid_rv_model.predict(random_video) 48 | droid_rvef_pred = droid_rvef_model.predict(random_video) 49 | 50 | print(f""" 51 | 52 | DROID-RV Predictions: 53 | {rescale_droid_rv_outputs(droid_rv_pred)} 54 | 55 | DROID-RVEF Predictions: 56 | {rescale_droid_rvef_outputs(droid_rvef_pred)} 57 | 58 | """) -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rvef_checkpoint/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rvef_checkpoint/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b635997716fc47e4d7694ed4dffb1fb19d32a3e8731bb250ec9bca9dc3283eba 3 | size 34820197 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rvef_checkpoint/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b5cae8c8fadd814be6d5d2fd967d6479d08edcae3f40f842f107dda63effd935 3 | size 99789 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/movinet_a2_base/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:78fdb1e081e9fc8d4e10e3bca4fe00117a236ddc4726bbf75594db19ae1be665 3 | size 69 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/movinet_a2_base/ckpt-1.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f393b7ef377ffaf59bd8bf081c72d05e74a576c5bba0d4bc180315432e49e557 3 | size 21240182 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/movinet_a2_base/ckpt-1.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1d801f29eace2f39bcc7b268ecf0d1bd117d9b3881cbcd810721c8c1b1f6c161 3 | size 10102 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/readme.md: -------------------------------------------------------------------------------- 1 | # DROID-RV Inference Example 2 | 3 | This is a simple example script demonstrating how to load and run the DROID-RV and DROID-RVEF models. Model training and inference was performed using the code provided in the ML4H [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). The example below was adapted from the DROID inference code. 4 | 5 | 1. Download DROID docker image. Note: docker image is not compatible with Apple Silicon. 6 | 7 | `docker pull alalusim/droid:latest` 8 | 9 | 2. Pull github repo, including DROID-RV model checkpoints stored using git lfs. 10 | 11 | ``` 12 | github clone https://github.com/broadinstitute/ml4h.git 13 | git lfs pull --include ml4h/model_zoo/DROID-RV/droid_rv_checkpoint/* 14 | git lfs pull --include ml4h/model_zoo/DROID-RV/droid_rvef_checkpoint/* 15 | git lfs pull --include ml4h/model_zoo/DROID-RV/movinet_a2_base/* 16 | ``` 17 | 18 | 3. Run docker image while mounting ml4h directory and run example inference script. 19 | 20 | `docker run -it -v {PATH TO CLONED ML4H DIRECTORY}:/ml4h/ alalusim/droid:latest` 21 | 22 | ``` 23 | cd /ml4h/model_zoo/DROID-RV/ 24 | python droid_rv_inference.py 25 | ``` 26 | 27 | To use with your own data, format echocardiogram videos as tensors with shape (16, 224, 224, 3) before passing to the model. 
Code for data preprocessing, storage, loading, training, and inference can be found in the ML4H [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). 28 | 29 | Model outputs for DROID-RV take the form: 30 | ``` 31 | [ 32 | [["Age", "RVEDD"]], 33 | [["Dilated", "Not Dilated"]], 34 | [["Hypokinetic", "Not Hypokinetic"]], 35 | [["Female", "Male"]] 36 | ] 37 | ``` 38 | 39 | Model outputs for DROID-RVEF take the form: 40 | ``` 41 | [ 42 | [["RVEF", "RV End-Diastolic Volume, "RV End-Systolic Volume", "Age"]], 43 | [["Female", "Male"]] 44 | ] 45 | ``` 46 | -------------------------------------------------------------------------------- /model_zoo/DROID/README.md: -------------------------------------------------------------------------------- 1 | # DROID (Dimensional Reconstruction of Imaging Data) 2 | 3 | DROID is a 3-D convolutional neural network modeling approach for echocardiographic view 4 | classification and quantification of LA dimension, LV wall thickness, chamber diameter and 5 | ejection fraction. 6 | 7 | The DROID echo movie encoder is based on the 8 | [MoViNet-A2-Base](https://tfhub.dev/tensorflow/movinet/a2/base/kinetics-600/classification/3) 9 | video classification model. MoViNet was fine-tuned in a supervised fashion to produce two 10 | specialized encoders: 11 | - DROID-LA 12 | - input views: PLAX, A4C, A2C 13 | - output predictions: LA A/P 14 | - DROID-LV 15 | - input views: PLAX, A4C, A2C 16 | - output predictions: LVEF, LVEDD, LVESD, IVS, PWT 17 | 18 | Multi-instance attention heads were then trained to integrate up to 40 view encodings to predict 19 | a single measurement of each type per echo study. 20 | 21 | ## Requirements 22 | In addition to the `ml4h` repository, DROID also requires `ml4ht_data_source` plus other dependencies. First, clone the 23 | ml4h repositories: 24 | ```commandline 25 | git clone https://github.com/broadinstitute/ml4h.git 26 | git clone https://github.com/broadinstitute/ml4ht_data_source.git 27 | ``` 28 | 29 | For convenience, we provide a docker image containing additional dependencies: 30 | ```commandline 31 | docker run -it --gpus all --rm -v {PARENT_DIRECTORY_OF_REPOS} -v {OPTIONAL_DATA_DIRECTORY} \ 32 | us-central1-docker.pkg.dev/broad-ml4cvd/droid/droid:0.1 /bin/bash 33 | ``` 34 | 35 | Within the docker container, install `ml4ht`: 36 | ```commandline 37 | pip install --user ml4ht_data_source 38 | ``` 39 | 40 | ## Usage 41 | ### Preprocessing 42 | The following scripts are designed to handle echo movies that have been processed and stored in Lightning 43 | Memory-Mapped Database (lmdb) files. We create one lmdb per echo study in which the keys are the filenames of the dicoms and 44 | the values are echo movies that have been anonymized, cropped, and converted to avis. See `echo_to_lmdb.py` for an 45 | example. 46 | 47 | ### Inference 48 | `echo_supervised_inference_recipe.py` can be used to obtain predictions from echo movies given either the DROID-LA or 49 | DROID-LV specialized encoders. 
50 | 51 | An example of parameters to use when running this script are: 52 | ```commandline 53 | python echo_supervised_inference_recipe.py \ 54 | --n_input_frames 16 \ 55 | --output_labels LA_A_P \ 56 | --selected_views A4C --selected_views A2C --selected_views PLAX \ 57 | --selected_doppler standard \ 58 | --selected_quality good \ 59 | --selected_canonical on_axis \ 60 | --split_idx 0 \ 61 | --n_splits 1 \ 62 | --skip_modulo 4 \ 63 | --wide_file {WIDE_FILE_PATH} \ 64 | --splits_file {SPLITS_JSON} \ 65 | --lmdb_folder {LMDB_DIRECTORY_PATH} \ 66 | --pretrained_chkp_dir {SPECIALIZED_ENCODER_PATH} \ 67 | --movinet_chkp_dir {MoViNet-A2-Base_PATH} \ 68 | --output_dir {WHERE_TO_STORE_PREDICTIONS} 69 | ``` -------------------------------------------------------------------------------- /model_zoo/DROID/data_descriptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/DROID/data_descriptions/__init__.py -------------------------------------------------------------------------------- /model_zoo/DROID/echo_defines.py: -------------------------------------------------------------------------------- 1 | category_dictionaries = { 2 | 'view': { 3 | 'PLAX': 0, 4 | 'Ascending_aorta': 1, 5 | 'RV_inflow': 2, 6 | 'RV_focused': 3, 7 | 'Pulmonary_artery': 4, 8 | 'PSAX_AV': 5, 9 | 'PSAX_MV': 6, 10 | 'PSAX_papillary': 7, 11 | 'PSAX_apex': 8, 12 | 'A4C': 9, 13 | 'A5C': 10, 14 | 'A3C': 11, 15 | 'A2C': 12, 16 | 'Suprasternal': 13, 17 | 'Subcostal': 14, 18 | }, 19 | 'doppler': { 20 | 'standard': 0, 21 | 'doppler': 1, 22 | '3-D': 2, 23 | }, 24 | 25 | 'quality': { 26 | 'good': 0, 27 | 'unusable': 1, 28 | }, 29 | 'canonical': { 30 | 'on_axis': 0, 31 | 'off_axis': 1, 32 | }, 33 | 'LV_EjectionFraction': { 34 | 'N': { 35 | 'index': 0, 36 | 'weight': 0.259667, 37 | }, 38 | 'A': { 39 | 'index': 1, 40 | 'weight': 0.862008, 41 | }, 42 | 'I': { 43 | 'index': 2, 44 | 'weight': 0.916131, 45 | }, 46 | 'L': { 47 | 'index': 3, 48 | 'weight': 0.980843, 49 | }, 50 | 'H': { 51 | 'index': 0, 52 | 'weight': 0.981351, 53 | }, 54 | }, 55 | 'LV_FunctionDescription': { 56 | '4.0': { 57 | 'index': 0, 58 | 'weight': 0.520803, 59 | }, 60 | '2.0': { 61 | 'index': 1, 62 | 'weight': 0.662169, 63 | }, 64 | '3.0': { 65 | 'index': 2, 66 | 'weight': 0.817028, 67 | }, 68 | }, 69 | 'LV_CavitySize': { 70 | 'N': { 71 | 'index': 0, 72 | 'weight': 0.209487, 73 | }, 74 | 'D': { 75 | 'index': 1, 76 | 'weight': 0.833406, 77 | }, 78 | 'S': { 79 | 'index': 2, 80 | 'weight': 0.957354, 81 | }, 82 | 'P': { 83 | 'index': 3, 84 | 'weight': 1.0, 85 | }, 86 | }, 87 | 'RV_SystolicFunction': { 88 | 'N': { 89 | 'index': 0, 90 | 'weight': 0.19156206811684748, 91 | }, 92 | 'Y': { 93 | 'index': 1, 94 | 'weight': 2.5944871794871798, 95 | }, 96 | 'A': { 97 | 'index': 2, 98 | 'weight': 4.161422989923915, 99 | }, 100 | 'L': { 101 | 'index': 3, 102 | 'weight': 8.256629946960423, 103 | }, 104 | }, 105 | } 106 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LA_DROID/model/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LA_DROID/model/chkp.data-00000-of-00001: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:29c6b22e560ee68834704974407e5ed3bdd8c6ad5adc4c1064e4a0bc8f75a79f 3 | size 34804093 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LA_DROID/model/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2c25e8c208cc21d755e3be52d32c61994ecf04a27401cb16c17f42f18d9a4482 3 | size 99388 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LV_DROID/model/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LV_DROID/model/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb84c1c904df40ad73dc6048cca7a29a418be2dfcf52dcc0d1f9a6aaa4df4626 3 | size 34816429 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LV_DROID/model/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a92657cf830c64e12841d5cff5e13be6bec35ba1399501ed5cb7e913aa050a12 3 | size 99388 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/model_descriptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/DROID/model_descriptions/__init__.py -------------------------------------------------------------------------------- /model_zoo/ECG2AF/architecture.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:494e8f6bb6f032b21877e4b3fa69a6d9e00c3f5d9fd898c100943db84a2f911e 3 | size 1301867 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/ecg2af_infer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "987044ef-389c-4d67-aed1-ae420a92f35f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "from tensorflow.keras.models import load_model\n", 12 | "from ml4h.models.model_factory import get_custom_objects\n", 13 | "from ml4h.tensormap.ukb.demographics import age_in_days, af_dummy2, sex_dummy1\n", 14 | "from ml4h.tensormap.ukb.survival import mgb_afib_wrt_instance2, mgb_death_wrt_instance2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "abbbac4b-5b97-4e6d-a5ab-0f39df9e41b1", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "output_tensormaps = {tm.output_name(): tm for tm in [mgb_afib_wrt_instance2, mgb_death_wrt_instance2, age_in_days, af_dummy2, sex_dummy1]}\n", 25 | "model = load_model('./ecg2af_quintuplet_v2024_01_13.keras')\n", 26 | "ecg = np.random.random((1, 5000, 
12))\n", 27 | "prediction = model(ecg)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "1998e754-4f5f-4bb3-b611-ce06ad8fee36", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "for name, pred in zip(model.output_names, prediction):\n", 38 | " otm = output_tensormaps[name]\n", 39 | " if otm.is_survival_curve():\n", 40 | " intervals = otm.shape[-1] // 2\n", 41 | " days_per_bin = 1 + otm.days_window // intervals\n", 42 | " predicted_survivals = np.cumprod(pred[:, :intervals], axis=1)\n", 43 | " print(f'AF Risk {otm} prediction is: {str(1 - predicted_survivals[0, -1])}')\n", 44 | " else:\n", 45 | " print(f'{otm} prediction is {pred}')\n", 46 | " " 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3 (ipykernel)", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.11.12" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 5 71 | } 72 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/ecg2af_quintuplet_v2024_01_13.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:de38e009be5613ebb31ee1969bc1cab648d0f11c865e8afbd60e4bb7784f5627 3 | size 43797442 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/ecg_5000_survival_curve_af_quadruple_task_mgh_v2021_05_21.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ece878a5ed4f1523735a35648a8fa2a3086261d2bf38a5db16c8f3d0fc34c667 3 | size 220080440 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/km.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e1f87168d95c01f55f6b19dc460f5329002e4256bb33d0b16a8cd7fff4144969 3 | size 242929 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/salience.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3b4e44790f22eecd5fb972032ea13eac24828ed308e318f4188334d1c8675f97 3 | size 567352 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/strip_II_survival_curve_af_v2021_06_15.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ef58a4b7294d91cb4e4686e6fd4ee72098719d55c43b9b0d487b34066923e0da 3 | size 219947384 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/strip_I_survival_curve_af_v2021_06_15.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d4c6b591d12b8fa4c1c0115adab442c68c7c33ce98c9071e66f7dd4f33f03296 3 | size 219883672 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/study_design.jpg: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3a8f3b219ed14d2a542ae983e78a70e7ea8eaf036ede3ef0f388bcfeb3ccb790 3 | size 241160 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/README.md: -------------------------------------------------------------------------------- 1 | ## ECG PheWAS 2 | This directory contains python notebooks and instructions to create the models and results from 3 | [this NPJ Digital Medicine Paper](https://www.nature.com/articles/s41746-024-01418-9). 4 | 5 | The raw model files are stored using `git lfs` so you must have `git` and `git lfs` installed and localize the full ~135MB autoencoder as well as the component decoder and encoder: 6 | ```bash 7 | git lfs pull --include model_zoo/ECG_PheWAS/*.h5 8 | ``` 9 | 10 | Our model expects ECG median waveforms with 600 milliVolt voltages across 12 leads as input and produces 11 | a 256 dimensional latent space encoding, as well as a reconstructed ECG with the same shape as the input. 12 | The notebook [ecg_write_biosppy_medians.ipynb](./ecg_write_biosppy_medians.ipynb) provides an example of creating these median waveforms from 10 second 12 lead ECGs. 13 | 14 | The electrocardiogram (ECG) is an inexpensive and widely available diagnostic tool, and therefore has great potential 15 | to facilitate disease detection in large-scale populations. 16 | Both cardiac and noncardiac diseases may alter the appearance of the ECG, though the extent to which diseases across 17 | the human phenotypic landscape can be detected on the ECG remains unclear. 18 | We developed an autoencoder model that encodes and reconstructs ECG waveform data within a 19 | multidimensional latent space. 20 | The ECG latent space model demonstrated a greater number of associations than ECG models using standard ECG intervals 21 | alone, and generally resulted in improvements in discrimination of diseases compared to models comprising 22 | only age, sex, and race. 23 | We further demonstrate how a latent space model can be used to generate disease-specific ECG waveforms and facilitate 24 | disease profiling for individual patients. 
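If you only want latent embeddings from the released encoder rather than a full re-training, a minimal sketch follows: load the encoder, pass in a median waveform shaped (600, 12), and read off the 256-dimensional encoding. Treat it as a hedged example: the choice of `encoder_median.h5` and the use of ml4h's `get_custom_objects` when loading are our assumptions based on the files in this directory, and if the encoder was saved with named inputs you may need to pass a dict keyed by the input tensor name instead of a bare array.

```python
import numpy as np
from tensorflow.keras.models import load_model
from ml4h.models.model_factory import get_custom_objects

# Load the released median-waveform encoder (localized via `git lfs pull` above).
# Passing ml4h's custom objects is an assumption; it is harmless if none are needed.
encoder = load_model('./encoder_median.h5', custom_objects=get_custom_objects([]))

# Stand-in for one biosppy median waveform: 600 samples x 12 leads, batch of 1.
median_ecg = np.random.random((1, 600, 12))
latent = encoder.predict(median_ecg)
print(latent.shape)  # expected: (1, 256) if the encoder exposes a single latent output
```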
25 | 26 | To create a model from scratch run: 27 | ```bash 28 | python /path/to/ml4h/ml4h/recipes.py \ 29 | --mode train \ 30 | --tensors /path/to/hd5_tensors/ \ 31 | --output_folder /path/to/output/ \ 32 | --tensormap_prefix ml4h.tensormap.ukb \ 33 | --input_tensors ecg.ecg_biosppy_median_60bpm --output_tensors ecg.ecg_biosppy_median_60bpm \ 34 | --encoder_blocks conv_encode --decoder_blocks conv_decode --activation mish --conv_layers 23 23 \ 35 | --dense_blocks 46 --block_size 5 --dense_layers 256 --dense_normalize layer_norm \ 36 | --batch_size 2 --epochs 96 --training_steps 128 --validation_steps 36 --test_steps 32 --patience 64 \ 37 | --id ecg_median_autoencoder 38 | ``` 39 | 40 | Given this model, infer a latent space with: 41 | ```bash 42 | python /path/to/ml4h/ml4h/recipes.py \ 43 | --mode infer_encoders \ 44 | --tensors /path/to/hd5_tensors/ \ 45 | --output_folder /path/to/output/ \ 46 | --tensormap_prefix ml4h.tensormap.ukb \ 47 | --input_tensors ecg.ecg_biosppy_median_60bpm --output_tensors ecg.ecg_biosppy_median_60bpm \ 48 | --model_file /path/to/ml4h/model_zoo/ECG_PheWAS/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.h5 \ 49 | --id ecg_median_autoencoder 50 | ``` 51 | 52 | With this latent space and phecode diagnoses for the same cohort, the jupyter notebook 53 | [latent_space_phewas](./latent_space_phewas.ipynb) 54 | allows you to conduct the PheWAS analysis. 55 | 56 | ![UKB PheWAS Plot](./ukb_phewas.png) 57 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/decoder_median.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2d6fda0d012dd40f06c806a37d03620a29d9707f9e537b79f8c002cbef6e060c 3 | size 23259088 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/decoder_median.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4d397f589150bf51d96d3fc73cff3c586afa24a2373cb679033e17a5e7318062 3 | size 23270296 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/encoder_median.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f1a09f154272b4c45f75d6e8eb18eb1c36773fc3a9fb44ee693f4a7fd331e1dc 3 | size 21950032 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/encoder_median.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3fabb54ff60d3a7824a9824205a20f8545f0587aaecd43cf7caee2747eb68a7a 3 | size 21963836 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:526f990b8945224bd027ac1ad18365fed5409de3ea01c76aa91d1d5dbdeefa59 3 | size 135412328 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 
| oid sha256:26928fe27362a5cda700a97b89606f73ba2853f7f54f2a2ae0adccc4f35c0be8 3 | size 23270296 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/ukb_phewas.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c5b31985c77961e308c42fb02e99293834401271e34709c3f99e5dcf179ca08b 3 | size 309545 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:50ca4fe3dd60c83d92eb4cd96cab4204e48193d6d42ee1704b758ab58675c9be 3 | size 25690896 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR/saved_model.pb -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8340b2ad71f4afa0e7e19595a52e282f04d795ec1389b08f627bfaff04d68053 3 | size 25600481 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR/variables/variables.index -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_I/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_I/saved_model.pb -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_I/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed447ea195ee03ee40911130d0e304dfea0b1e361206d6394387c77068fb6dc3 3 | size 25555425 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_I/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_I/variables/variables.index -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_II/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_II/saved_model.pb -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_II/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | 
oid sha256:59347e3fb27c3fb98194e14b67ff2c2bb8288673425c6084ccc09441822b437a 3 | size 25553411 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_II/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_II/variables/variables.index -------------------------------------------------------------------------------- /model_zoo/PCLR/get_representations.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | import numpy as np 4 | from tensorflow.keras.models import load_model, Model 5 | 6 | from preprocess_ecg import process_ecg, LEADS 7 | 8 | 9 | def get_model() -> Model: 10 | """Get PCLR embedding model""" 11 | return load_model("./PCLR.h5") 12 | 13 | 14 | def get_representations(ecgs: List[Dict[str, np.ndarray]]) -> np.ndarray: 15 | """ 16 | Uses PCLR trained model to build representations of ECGs 17 | :param ecgs: A list of dictionaries mapping lead name to lead values. 18 | The lead values should be measured in milli-volts. 19 | Each lead should represent 10s of samples. 20 | :return: 21 | """ 22 | model = get_model() 23 | ecgs = np.stack(list(map(process_ecg, ecgs))) 24 | return model.predict(ecgs) 25 | 26 | 27 | def test_get_representations(): 28 | """Test to make sure get_representations works as expected""" 29 | fake_ecg = { 30 | lead: np.zeros(2500) 31 | for lead in LEADS 32 | } 33 | fake_ecgs = [fake_ecg for _ in range(10)] 34 | representations = get_representations(fake_ecgs) 35 | assert representations.shape == (len(fake_ecgs), 320) 36 | -------------------------------------------------------------------------------- /model_zoo/PCLR/preprocess_ecg.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import numpy as np 3 | 4 | 5 | LEADS = [ 6 | 'I', 'II', 'III', 'aVR', 'aVL', 'aVF', 7 | 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 8 | ] 9 | 10 | 11 | def process_ecg(ecg: Dict[str, np.ndarray], ecg_samples: int = 4096) -> np.ndarray: 12 | """ 13 | Prepares an ECG for use in a tensorflow model 14 | :param ecg: A dictionary mapping lead name to lead values. 15 | The lead values should be measured in milli-volts. 16 | Each lead should represent 10s of samples. 17 | :param ecg_samples: Length of each lead for input into the model. 
18 | :return: a numpy array of the ECG shaped (ecg_samples, 12) 19 | """ 20 | assert set(ecg.keys()) == set(LEADS) 21 | 22 | out = np.zeros((ecg_samples, 12)) 23 | for i, lead_name in enumerate(LEADS): 24 | lead = ecg[lead_name] 25 | interpolated_lead = np.interp( 26 | np.linspace(0, 1, ecg_samples), 27 | np.linspace(0, 1, lead.shape[0]), 28 | lead, 29 | ) 30 | out[:, i] = interpolated_lead 31 | return out 32 | -------------------------------------------------------------------------------- /model_zoo/PCLR/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.12.0 2 | astunparse==1.6.3 3 | cachetools==4.2.1 4 | certifi==2020.12.5 5 | chardet==4.0.0 6 | gast==0.3.3 7 | google-auth==1.28.0 8 | google-auth-oauthlib==0.4.4 9 | google-pasta==0.2.0 10 | grpcio==1.37.0 11 | h5py==2.10.0 12 | idna==2.10 13 | importlib-metadata==3.10.0 14 | Keras-Preprocessing==1.1.2 15 | Markdown==3.3.4 16 | numpy==1.22.0 17 | oauthlib==3.1.0 18 | opt-einsum==3.3.0 19 | protobuf==3.15.7 20 | pyasn1==0.4.8 21 | pyasn1-modules==0.2.8 22 | requests==2.25.1 23 | requests-oauthlib==1.3.0 24 | rsa==4.7.2 25 | scipy==1.4.1 26 | six==1.15.0 27 | tensorboard==2.4.1 28 | tensorboard-plugin-wit==1.8.0 29 | tensorflow==2.7.2 30 | tensorflow-estimator==2.3.0 31 | termcolor==1.1.0 32 | typing-extensions==3.7.4.3 33 | urllib3==1.26.5 34 | Werkzeug==1.0.1 35 | wrapt==1.12.1 36 | zipp==3.4.1 37 | -------------------------------------------------------------------------------- /model_zoo/README.md: -------------------------------------------------------------------------------- 1 | # How to Add an Animal to the Model Zoo: 2 | 3 | ## Create a new folder in this directory with your model name 4 | Create a `README.md` file in your model directory with documentation and code snippets. 5 | 6 | ## Document model inputs, outputs, and performance 7 | What shape data does the model expect? What data was used to train it? How does it perform? 8 | 9 | ## Add the model weights and architecture file(s) with `git lfs` 10 | Add `.h5` or `.keras` or `.pb` model files to the model_zoo directory using git Large File Storage. 11 | 12 | ## Add code snippets for model loading and inference 13 | 14 | ## Add code snippets or command lines for model training. 15 | 16 | ## Describe model architecture, interpretability and use-cases. 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /model_zoo/adiposity_mlandepi/README.md: -------------------------------------------------------------------------------- 1 | # Machine learning on >40,000 body MRIs and fat distribution 2 | 3 | 1. Ingest the provided whole-body MRI `zip` files using `ingest.py` 2. Compute the 2D projections using the ingested 3D volumes with `compute_projections.py` 3. Train deep learning-based model(s) using `train.py` 4.
Phenotype characterization and disease association with `downstream_associations_v3.ipynb` 7 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/Lreg.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:660c74be8d5128d33d305bc6dc7df89d8ccb1966f0e5118c26dbffaa65b950dc 3 | size 31427 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/Lseg.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c0e989cfebab9c505a8c7f76a3a92153583d63420cdc2a83e4722a4fc16ead02 3 | size 28767 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/calibrations_sax_all_diastole_segmented.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cc12e1edae86b3d6d75a48c7853dba94c8de1e561159678312f2decf02c072e1 3 | size 115323 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/metric_history_sax_diastole_segment_no_flat.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9f9e5414cb0fc28ec65062bda8e1d6e52ca587b311be05c2039a42cd96ad559a 3 | size 509238 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/per_class_roc_sax_all_diastole_segmented.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b2433bef8c35a7ab8eb7bf876d8a1e464a04b59bea195aa6757c86e3ea2c49f6 3 | size 44993 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/precision_recall_sax_all_diastole_segmented.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:698bbd48917a1f1d86b1be7d56cc0dae1d76ca1cdd082e86b2796e1c8d31bcce 3 | size 50560 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/sax_diastole_segment_no_flat.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:05f8fe4843d04cd526b8f2c2424ac98554238089e43a4376ffb2e12122206e4f 3 | size 15191464 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/decoder_ecg_rest_median_raw_10.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:256c617fe91863c4994b367f0e1efbcbf7ff17378c593ba7498ce3d96fd66c4e 3 | size 23230192 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/decoder_lax_4ch_heart_center.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:5fa8a2f7dde41668dfe52b4465e8d9a6972956e553486b8b76f9400ffc6f5b41 3 | size 7010232 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/dropout_pair_contrastive_lax_4ch_cycle_ecg_median_10_pretrained_256d_v2020_06_07.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fc7f8e9b5d5c1b17992d7c676b64f269f6f67248b07a68efe312e93e53934dff 3 | size 177298400 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/encoder_ecg_rest_median_raw_10.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:209f4d9a6ebcee02ced493a320d2116787b9cfaba2476a653e442581557475d0 3 | size 21916216 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/encoder_lax_4ch_heart_center.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:72d98c3ff072ea0fdac4753986ca913394bfff231dc9d9cc130bbe4fe276cdac 3 | size 6986488 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/overview.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d18e2285270165380316c9e8b5919a3cc84b211e018c1cb3ec8faaf56a6d79e8 3 | size 405082 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning to Predict Cardiac Magnetic Resonance-Derived Left Ventricular Mass and Hypertrophy from 12-Lead Electrocardiograms 2 | 3 | This folder contains models and code supporting the work described in [this paper](https://www.ahajournals.org/doi/10.1161/CIRCIMAGING.120.012281?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub%20%200pubmed) published in the journal Circulation: Cardiovascular Imaging. 4 | 5 | # LVM-AI 6 | Left Ventricular Mass-Artificial Intelligence (LVM-AI) is a one-dimensional convolutional neural network trained to predict CMR-derived LV mass using 12-lead ECGs. LVM-AI was trained in 32,239 individuals from the UK Biobank with paired CMR and 12-lead ECG. It was provided with the entire 10 seconds of the 12-lead ECG waveform as well as participant age, sex, and BMI. 7 | LVM-AI was evaluated in a UK Biobank test set as well as an external health care–based Mass General Brigham (MGB) dataset. In both test sets, LVM-AI was compared with traditional ECG-based rules for diagnosing CMR-derived left ventricular hypertrophy. Associations between LVM-AI-predicted LV mass index and incident cardiovascular events were tested in the UK Biobank and a separate MGB-based ambulatory cohort (MGB outcomes). 8 | ![Overview of the training and test samples](TrainingAndTestSets.jpg) 9 | When compared with any ECG rule, LVM-AI demonstrated similar LVH discrimination in the UK Biobank (LVM-AI c-statistic 0.653 [95% CI, 0.608-0.698] versus any ECG rule c-statistic 0.618 [95% CI, 0.574-0.663], P=0.11) and superior discrimination in MGB (0.621; 95% CI, 0.592-0.649 versus 0.588; 95% CI, 0.564-0.611, P=0.02).
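For readers who want to reproduce this kind of comparison on their own test set, the sketch below computes a c-statistic (AUROC) with a 95% bootstrap confidence interval from predicted probabilities and binary CMR-derived LVH labels. This is a minimal illustration, not the analysis code used in the paper; `lvh_true`, `lvm_ai_prob`, and `ecg_rule_pred` are hypothetical placeholder arrays.

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def cstat_with_ci(y_true, y_score, n_boot=1000, seed=7):
    """C-statistic (AUROC) with a 95% bootstrap confidence interval."""
    rng = np.random.default_rng(seed)
    point = roc_auc_score(y_true, y_score)
    boots = []
    for _ in range(n_boot):
        idx = rng.integers(0, len(y_true), len(y_true))
        if len(np.unique(y_true[idx])) < 2:  # resample must contain both classes
            continue
        boots.append(roc_auc_score(y_true[idx], y_score[idx]))
    lo, hi = np.percentile(boots, [2.5, 97.5])
    return point, lo, hi

# Hypothetical inputs: binary LVH labels, LVM-AI probabilities, and a 0/1 ECG rule.
lvh_true = np.random.randint(0, 2, 500)
lvm_ai_prob = np.random.rand(500)
ecg_rule_pred = np.random.randint(0, 2, 500)

print("LVM-AI c-statistic (95% CI):", cstat_with_ci(lvh_true, lvm_ai_prob))
print("ECG rule c-statistic (95% CI):", cstat_with_ci(lvh_true, ecg_rule_pred))
```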
10 | 11 | 12 | # Models 13 | Three pre-trained models are included here: 14 | The model `ecg_rest_raw_age_sex_bmi_lvm_asymmetric_loss.h5` takes as input a 12 Lead resting ECG, as well as age, sex and BMI and has two outputs: one which regresses the left ventricular mass, and a second which gives a probability of left ventricular hypertrophy. This model was trained with the asymmetric loss described in the paper. 15 | The model `ecg_rest_raw_lvm_asymmetric_loss.h5` takes only an ECG as input and regresses left ventricular mass. This model was also trained with the asymmetric loss. 16 | The third model, `ecg_rest_raw_lvm_symmetric_loss.h5` takes only an ECG as input and regresses left ventricular mass. This model was trained with the symmetric logcosh loss. The raw voltage values from the ECG are normalized by dividing by 2000 prior to being input to the model. -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/TrainingAndTestSets.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6bea557c5150adbec08f1e50410d7b61c5ce228752c4b9785f7243f5e88bcd69 3 | size 175820 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/ecg_rest_raw_age_sex_bmi_lvm_asymmetric_loss.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0764989632149590ff47f86e665cec31da592b7b6f2275f3c9ac9f949ccce017 3 | size 14870648 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/ecg_rest_raw_lvm_asymmetric_loss.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:236619e5a99e6e44bd37f35b88ff86a5d5ea4ca3f98dff437cdb0dd7f636015c 3 | size 12720392 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/ecg_rest_raw_lvm_symmetric_loss.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:370f9f672bdbd4e61ce1f439b9a58b3150ed7b8cc9a04f49e2923f297c3e1491 3 | size 12720392 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/README.md: -------------------------------------------------------------------------------- 1 | # Machine learning enables new insights into clinical significance of and genetic contributions to liver fat accumulation 2 | 3 | This folder contains models and code supporting the work described in [this paper](https://www.sciencedirect.com/science/article/pii/S2666979X21000823) published in Cell Genomics. 4 | 5 | Here we host two models for estimating liver fat from abdominal MRI. 6 | The liver fat percentage training data is from the returned liver fat values in the [UK Biobank field ID 22402](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=22402). These values were only calculated for the echo protocol, so to infer liver fat from the ideal protocol we used a teacher/student modeling approach. 
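The teacher/student step can be summarized with the short sketch below: the teacher's predictions on echo-protocol scans become the regression targets for a student model that only sees ideal-protocol scans from the same individuals. This is a hedged illustration of the idea rather than the actual training pipeline; the random arrays are hypothetical stand-ins for real MRI volumes, the tiny student network is purely illustrative, and the input shapes and student batch size follow the Teacher Model and Student Model sections below.

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model

teacher = load_model("liver_fat_from_echo.h5", compile=False)  # echo-protocol teacher

# Hypothetical stand-ins for paired data from subjects with both acquisitions.
echo_mris = np.random.rand(16, 160, 160, 10).astype(np.float32)
ideal_mris = np.random.rand(16, 232, 256, 36).astype(np.float32)

# 1) The teacher infers liver fat percentages; these become the student's labels.
teacher_liver_fat = teacher.predict(echo_mris).ravel()

# 2) An illustrative student regresses those labels from the ideal-protocol scans.
student = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(232, 256, 36)),
    tf.keras.layers.Conv2D(8, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(1),
])
student.compile(optimizer="adam", loss="mse")  # illustrative loss choice
student.fit(ideal_mris, teacher_liver_fat, batch_size=5, epochs=1)
```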
7 | 8 | ## Teacher Model 9 | The teacher model was trained with abdominal MRIs acquired using the [echo protocol, UK Biobank field ID 20203](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20203). 10 | This model takes input of shape 160 x 160 x 10 and emits a scalar representing estimated liver fat percentage. 11 | The input TensorMap is defined at `tensormap.ukb.mri.gre_mullti_echo_10_te_liver`. 12 | The output TensorMap associated with these values is defined at `tensormap.ukb.mri.liver_fat`. 13 | The keras model file is at [liver_fat_from_echo.h5](liver_fat_from_echo.h5) and the model architecture is shown below. The "?" in the input dimension represents the batch size of the input, which can be determined at runtime. When training the teacher model we used a batch size of 8. 14 | ![https://www.medrxiv.org/content/10.1101/2020.09.03.20187195v1](liver_fat_from_echo_teacher_model.png) 15 | 16 | 17 | ## Student Model 18 | The teacher model made inferences on all available MRIs acquired with the echo protocol, which includes some individuals who also had abdominal MRI with the [ideal protocol, UK Biobank field ID 20254](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20254). 19 | The student model was trained on these individuals, using the teacher model's inferences as truth data and abdominal MRIs acquired with the ideal protocol as input. 20 | This model takes input of shape 232 x 256 x 36 and also emits a scalar representing estimated liver fat percentage. 21 | The input TensorMap is defined at `tensormap.ukb.mri.lms_ideal_optimised_low_flip_6dyn`. 22 | The output TensorMap associated with these values is defined at `tensormap.ukb.mri.liver_fat_echo_predicted`. 23 | The keras model file is at [liver_fat_from_ideal.h5](liver_fat_from_ideal.h5) and the model architecture is shown below. The "?" in the input dimension represents the batch size of the input, which can be determined at runtime. When training the student model we used a batch size of 5. 
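Either model file can be loaded directly with Keras for quick experimentation. The snippet below is a minimal sketch for the student model; the random array merely stands in for a real ideal-protocol acquisition, and the leading dimension is the batch size. The teacher model can be used analogously with an input of shape (1, 160, 160, 10).

```python
import numpy as np
from tensorflow.keras.models import load_model

student = load_model("liver_fat_from_ideal.h5", compile=False)

volume = np.random.rand(1, 232, 256, 36).astype(np.float32)  # one fake ideal-protocol volume
estimated_liver_fat = float(student.predict(volume).squeeze())
print(f"Estimated liver fat: {estimated_liver_fat:.2f}%")
```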
24 | ![Architecture Diagram](liver_fat_from_ideal_student_model.png) 25 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_echo.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:624ae5c619c9831b822fe586f657677d4b9938b38da7148f459e5a429a801df4 3 | size 4926304 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_echo_teacher_model.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bf649630b906da34cd46647771986c3e7d0911e63093eb28d67a0983f8becda6 3 | size 380259 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_ideal.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:31c61e2fbe7eb685e7b86a069c97cbfaba8224ece0145522718640acfcb07017 3 | size 9896304 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_ideal_student_model.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2f15f673801826361835d0631f9de1aca36493e7ef695f78b13b11348551ff94 3 | size 433324 4 | -------------------------------------------------------------------------------- /model_zoo/mi_feature_selection/README.md: -------------------------------------------------------------------------------- 1 | # ML4HEN-COX 2 | 3 | ## Description 4 | 5 | This repo contains code to perform feature selection in very large datasets --- in both number of samples and number of covariates --- using survival data. 6 | 7 | ## Requirements 8 | 9 | This code was tested with Python 3.7. 10 | It can be run inside a virtual environment: 11 | 12 | ```bash 13 | python3 -m venv env 14 | source env/bin/activate 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## Usage 19 | 20 | Several files are provided: 21 | 22 | * [coxnet_training_testing_evaluating.py](./coxnet_training_testing_evaluating.py): fitting CoxNet models 23 | * [xgboost_training_testing_evaluating.py](./xgboost_training_testing_evaluating.py): fitting XgCox models 24 | * [2020.11.30_analysis_cleaned2.r](./2020.11.30_analysis_cleaned2.r): downstream R code 25 | * [2020.11.30_analysis_cleaned2.ipynb](./2020.11.30_analysis_cleaned2.ipynb): downstream notebook 26 | 27 | ### Model loading 28 | 29 | The provided XGBoost model can be loaded as follows: 30 | 31 | ```py 32 | import xgboost as xgb 33 | xgcox = xgb.Booster() 34 | xgcox.load_model("models/xgcox_model.json") 35 | ``` 36 | 37 | and the CoxNet model as: 38 | 39 | ```py 40 | import pickle 41 | from sksurv.linear_model import CoxnetSurvivalAnalysis 42 | coxnet = pickle.load(open('models/coxnet_survival_05_final.pickle', 'rb')) 43 | ``` 44 | 45 | ### Citation 46 | 47 | **[Selection of 51 predictors from 13,782 candidate multimodal features using machine learning improves coronary artery disease prediction](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8672148/)**, Saaket Agrawal, BS*, Marcus D. R. Klarqvist, PhD, MSc, MSc*, Connor Emdin, DPhil, MD, Aniruddh P. Patel, MD, Manish D. Paranjpe, BA, Patrick T.
Ellinor, MD, PhD, Anthony Philippakis, MD, PhD, Kenney Ng, PhD, Puneet Batra, PhD, Amit V. Khera, MD, MSc 48 | 49 | -------------------------------------------------------------------------------- /model_zoo/mi_feature_selection/models/coxnet_survival_05_final.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/mi_feature_selection/models/coxnet_survival_05_final.pickle -------------------------------------------------------------------------------- /model_zoo/mi_feature_selection/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit_survival==0.13.1 2 | fastparquet==0.5.0 3 | xgboost==1.1.0_SNAPSHOT 4 | numpy==1.18.5 5 | pandas==1.1.5 6 | pyarrow==0.16.0 7 | scikit_learn==0.24.2 8 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the code and notebooks used in our paper: ["Genetic Architectures of Medical Images Revealed by Registration of Multiple Modalities"](https://doi.org/10.1177/11779322241282489) 2 | 3 | We show the systematic importance of registration for finding genetic signals directly from medical imaging modalities. 4 | This is demonstrated across a wide range of registration techniques. 5 | Our multimodal autoencoder comparison framework allows us to learn representations of medical images before and after registration. 6 | The learned registration methods considered are graphically summarized here: 7 | ![Learned Registration Methods](./registration.png) 8 | 9 | For example, to train a uni-modal autoencoder for DXA 2 scans: 10 | ```bash 11 | python /path/to/ml4h/ml4h/recipes.py \ 12 | --mode train \ 13 | --tensors /path/to/hd5_tensors/ \ 14 | --output_folder /path/to/output/ \ 15 | --tensormap_prefix ml4h.tensormap.ukb \ 16 | --input_tensors dxa.dxa_2 --output_tensors dxa.dxa_2 \ 17 | --encoder_blocks conv_encode --merge_blocks --decoder_blocks conv_decode \ 18 | --activation swish --conv_layers 32 --conv_width 31 --dense_blocks 32 32 32 32 32 --dense_layers 256 --block_size 3 \ 19 | --inspect_model --learning_rate 0.0001 \ 20 | --batch_size 4 --epochs 216 --training_steps 128 --validation_steps 36 --test_steps 4 --patience 36 \ 21 | --id dxa_2_autoencoder_256d 22 | ``` 23 | 24 | To train the cross-modal (DXA 2 <-> DXA 5) registration with the DropFuse model, the command line is: 25 | ```bash 26 | python /path/to/ml4h/ml4h/recipes.py \ 27 | --mode train \ 28 | --tensors /path/to/hd5_tensors/ \ 29 | --output_folder /path/to/output/ \ 30 | --tensormap_prefix ml4h.tensormap.ukb \ 31 | --input_tensors dxa.dxa_2 dxa.dxa_5 --output_tensors dxa.dxa_2 dxa.dxa_5 \ 32 | --encoder_blocks conv_encode --merge_blocks pair --decoder_blocks conv_decode \ 33 | --pairs dxa.dxa_2 dxa.dxa_5 --pair_loss contrastive --pair_loss_weight 0.1 --pair_merge dropout \ 34 | --activation swish --conv_layers 32 --conv_width 31 --dense_blocks 32 32 32 32 32 --dense_layers 256 --block_size 3 \ 35 | --inspect_model --learning_rate 0.0001 \ 36 | --batch_size 4 --epochs 216 --training_steps 128 --validation_steps 36 --test_steps 4 --patience 36 \ 37 | --id dxa_2_5_dropfuse_256d 38 | ``` 39 | Similarly, autoencoders and cross-modal fusion for all the modalities considered in the paper can be trained by changing the
`--input_tensors` and `--output_tensors` arguments to point at the appropriate `TensorMap`, and if necessary updating the model architecture hyperparameters. 40 | Table 1 lists all the modalities included in the paper. 41 | ![Table of modalities](./table1.png) 42 | 43 | Then, by inferring latent spaces with the models before and after registration, we can evaluate their learned representations. 44 | ```bash 45 | python /home/sam/ml4h/ml4h/recipes.py \ 46 | --mode infer_encoders \ 47 | --tensors /path/to/hd5_tensors/ \ 48 | --output_folder /path/to/output/ \ 49 | --tensormap_prefix ml4h.tensormap.ukb \ 50 | --input_tensors dxa.dxa_2 --output_tensors dxa.dxa_2 \ 51 | --id dxa_2_autoencoder_256d \ 52 | --model_file /path/to/output/dxa_2_autoencoder_256d/dxa_2_autoencoder_256d.h5 53 | ``` 54 | 55 | We compare the strength and number of biological signals found with the [Latent Space Comparisons notebook](./latent_space_comparisons.ipynb). 56 | This notebook is used to populate the data summarized in Table 2 of the paper. 57 | ![Table of results](./table2.png) 58 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/registration.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a61ad1186235a5c24958202a73279d4a9b0f08a13c82b85e92f40d38e4c0119c 3 | size 529604 4 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/table1.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fa54d40f5143e9efad3ce5e01c53bc950bca9301a7ee6a7c8c0edcdcc3d7ea01 3 | size 259349 4 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/table2.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c0a056860884eafcd450fa22df45ab19b4513fa3e35572354187a9ed1d6b89f1 3 | size 344842 4 | -------------------------------------------------------------------------------- /model_zoo/silhouette_mri/README.md: -------------------------------------------------------------------------------- 1 | # Estimating body fat distribution from silhouette images 2 | 3 | ## Description 4 | 5 | This repo contains code to prepare silhouettes from UK Biobank whole-body magnetic resonance images and to train deep-learning models to estimate fat-depot volumes. 6 | 7 | ## Usage 8 | 9 | Several files are provided: 10 | 11 | * [ingest_mri.py](../../ml4h/applications/ingest/ingest_mri.py): ingesting UKB MRI data 12 | * [two_d_projection.py](../../ml4h/applications/ingest/two_d_projection.py): computing 2-dimensional projections 13 | * [ingest_autosegment.py](../../ml4h/applications/ingest/ingest_autosegment.py): autosegmenting axial slices 14 | * [train_models.py](./train_models.py): training deep-learning models 15 | * [callbacks.py](./callbacks.py): supporting callbacks required during training 16 | * [shrinkage_loss.py](./shrinkage_loss.py): supporting loss function required during training 17 | 18 | ### Citation 19 | 20 | **[Estimating body fat distribution - a driver of cardiometabolic health - from silhouette images](https://www.medrxiv.org/content/10.1101/2022.01.14.22269328v2)**, Marcus D. R. Klarqvist, PhD*, Saaket Agrawal, BS*, Nathaniel Diamant, BS, Patrick T.
Ellinor, MD, PhD, Anthony Philippakis, MD, PhD, Kenney Ng, PhD, Puneet Batra, PhD, Amit V. Khera, MD 21 | -------------------------------------------------------------------------------- /model_zoo/silhouette_mri/callbacks.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Marcus D. R. Klarqvist, PhD, MSc 4 | # https://github.com/mklarqvist/tf-computer-vision 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | import tensorflow as tf 24 | 25 | class LossHistory(tf.keras.callbacks.Callback): 26 | def __init__(self, decay_function): 27 | self.decay_function = decay_function 28 | self.async_safe = True 29 | 30 | def on_train_begin(self, logs={}): 31 | self.losses = [] 32 | self.lr = [] 33 | 34 | def on_epoch_end(self, batch, logs={}): 35 | self.losses.append(logs.get("loss")) 36 | self.lr.append(self.decay_function(len(self.losses))) 37 | 38 | 39 | class BatchMetricsLogger(tf.keras.callbacks.Callback): 40 | """Callback function used during `model.evaluate` calls with batch size set to 1. 41 | This approach stores `tf.keras.metrics` callback results for each test example. 42 | Only data generated by `tf.keras.metrics` functions will produce correct results! 43 | 44 | Example use: 45 | ```python 46 | metrics_dict = {m.name: m for m in model.metrics} 47 | logger = BatchMetricsLogger(metrics = metrics_dict) 48 | eval = model.evaluate(test_ds, callbacks=[logger], verbose=1) 49 | eval_batch = pd.DataFrame(logger.storage, index = test_data.index) 50 | ``` 51 | """ 52 | 53 | def __init__(self, metrics): 54 | super(BatchMetricsLogger, self).__init__() 55 | self.metrics = metrics 56 | self.storage = [] 57 | self.async_safe = True 58 | 59 | # 60 | def on_test_batch_end(self, batch, logs=None): 61 | self.storage.append(logs) 62 | -------------------------------------------------------------------------------- /phenotype_labels/disease/DATES.md: -------------------------------------------------------------------------------- 1 | # Dates in the UK Biobank 2 | 3 | ## Attended assessment center 4 | 5 | * Date FieldID 53 6 | 7 | Useful for: 8 | 9 | 1. Defining threshold date for incidence 10 | 1. 
Defining dates for things that don't otherwise have an associated date 11 | 12 | ## Birth 13 | 14 | * Date FieldID 34: Year of birth 15 | * Date FieldID 52: Month of birth 16 | * Date Field 33: birth date (*Note: this field is restricted due to its precision*) 17 | 18 | ## Lost to follow-up 19 | 20 | * Date FieldID 191 21 | 22 | ## Died 23 | 24 | * Date FieldID 40000 25 | 26 | ## ICD10 27 | 28 | * Date FieldID ==> derived from HESIN (Hospital Episode Statistics data in Showcase) 29 | * Main ICD10: 41202 30 | * Secondary ICD10: 41204 31 | * ICD10 Primary Cause of Death: 40001 32 | * ICD10 Secondary Cause of Death: 40002 33 | 34 | ## ICD9 35 | 36 | * Date FieldID ==> derived from HESIN 37 | * Main ICD9: 41203 38 | * Secondary ICD9: 41205 39 | 40 | ## Operation (OPCS4) 41 | 42 | * Date FieldID ==> derived from HESIN 43 | * Main OPCS4: 41200 44 | * Secondary OPCS4: 41210 45 | * Self-reported: 46 | * FieldID: 20004 47 | * *float32 Year*: 20010 (need to truncate and add month/day) 48 | 49 | ## Special cases 50 | 51 | * Subarachnoid hemorrhage 52 | * FieldID: 42013 53 | * Date: 42012 54 | * Intracerebral hemorrhage 55 | * FieldID: 42011 56 | * Date: 42010 57 | * Hemorrhagic stroke 58 | * FieldID: 42009 59 | * Date: 42008 60 | * Stroke 61 | * FieldID: 42007 62 | * Date: 42006 63 | * Myocardial infarction 64 | * FieldID: 42001 65 | * Date: 42000 66 | * Self-reported operation 67 | * FieldID: 20004 68 | * Date: 20010 69 | * Non-cancer illness 70 | * FieldID: 20002 71 | * Date: 20008 72 | * Cancer 73 | * FieldID: 20001 74 | * Date: 20006 75 | -------------------------------------------------------------------------------- /phenotype_labels/disease/README.md: -------------------------------------------------------------------------------- 1 | # CVDI/Disease 2 | CVDI/Disease computes derived phenotypes, with incidence, prevalence, death, and censoring, from tabfiles (Seung Hoan's phenotype definition format). As a brief review, tabfiles contain 3 columns: Field, Coding, and exclude. E.g.: 3 | 4 | ``` 5 | Field Coding exclude 6 | 20002 1076,1079 0 7 | ``` 8 | 9 | *Field* is the UK Biobank FieldID (e.g., `phenotype.FieldID`) 10 | 11 | *Coding* is the UK Biobank value (e.g., `phenotype.value` or `coding.coding`) 12 | 13 | *Exclude* is whether the row is an exclusion criterion (1) or not (0) 14 | 15 | # Install updated dependencies 16 | `go get -u` 17 | 18 | # Build (examples) 19 | `go build -o cvdidisease.osx *.go` 20 | 21 | `GOOS=linux go build -o cvdidisease.linux *.go` 22 | 23 | # Database dependencies 24 | This requires the materialized tables (defined in the SQL files in this directory) to exist in tables with the same name as their filename (except the suffix). 
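To make the tabfile semantics concrete, here is a minimal, hypothetical Python sketch of how such a file can be grouped into inclusion and exclusion criteria per FieldID. The production parser in this directory is the Go `ParseTabFile` used by `main.go`; this snippet only illustrates the file format.

```python
import csv
from collections import defaultdict

def parse_tabfile(path):
    """Group tab-separated tabfile rows into include/exclude codes keyed by UKBB FieldID."""
    include, exclude = defaultdict(set), defaultdict(set)
    with open(path) as f:
        for row in csv.DictReader(f, delimiter="\t"):
            codes = {c.strip() for c in row["Coding"].split(",") if c.strip()}
            target = exclude if row["exclude"].strip() == "1" else include
            target[int(row["Field"])] |= codes
    return include, exclude

# The example row "20002 <tab> 1076,1079 <tab> 0" contributes {20002: {"1076", "1079"}}
# to the inclusion criteria and nothing to the exclusions.
```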
-------------------------------------------------------------------------------- /phenotype_labels/disease/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "flag" 7 | "log" 8 | "os" 9 | "path" 10 | "strings" 11 | 12 | "cloud.google.com/go/bigquery" 13 | ) 14 | 15 | type WrappedBigQuery struct { 16 | Context context.Context 17 | Client *bigquery.Client 18 | Project string 19 | Database string 20 | } 21 | 22 | var ( 23 | BufferSize = 4096 * 8 24 | STDOUT = bufio.NewWriterSize(os.Stdout, BufferSize) 25 | ) 26 | 27 | var materializedDB string 28 | 29 | func main() { 30 | defer STDOUT.Flush() 31 | 32 | var BQ = &WrappedBigQuery{ 33 | Context: context.Background(), 34 | } 35 | var tabfile string 36 | var displayQuery bool 37 | var override bool 38 | var diseaseName string 39 | 40 | flag.StringVar(&BQ.Project, "project", "", "Google Cloud project you want to use for billing purposes only") 41 | flag.StringVar(&BQ.Database, "database", "", "BigQuery source database name (note: must be formatted as project.database, e.g., broad-ml4cvd.ukbb7089_201904)") 42 | flag.StringVar(&tabfile, "tabfile", "", "Tabfile-formatted phenotype definition") 43 | flag.StringVar(&materializedDB, "materialized", "broad-ml4cvd.ukbb7089_201904", "project.database storing materialized view tables") 44 | flag.BoolVar(&displayQuery, "display-query", false, "Display the constructed query and exit?") 45 | flag.BoolVar(&override, "override", false, "Force run, even if this tool thinks your tabfile is inadequate?") 46 | flag.StringVar(&diseaseName, "disease", "", "If not specified, the tabfile will be parsed and become the disease name.") 47 | flag.Parse() 48 | 49 | if BQ.Project == "" || BQ.Database == "" || tabfile == "" || materializedDB == "" { 50 | flag.PrintDefaults() 51 | os.Exit(1) 52 | } 53 | 54 | tabs, err := ParseTabFile(tabfile) 55 | if err != nil { 56 | log.Fatalln(err) 57 | } 58 | 59 | if diseaseName == "" { 60 | diseaseName = path.Base(tabfile) 61 | if parts := strings.Split(diseaseName, "."); len(parts) > 1 { 62 | diseaseName = strings.Join(parts[0:len(parts)-1], ".") 63 | } 64 | } 65 | 66 | log.Println("Processing disease", diseaseName) 67 | 68 | missingFields, err := tabs.CheckSensibility() 69 | if err != nil && !override { 70 | log.Println(err) 71 | log.Fatalf("%s: Add the missing fields to your tabfile, or re-run with the -override flag to process anyway.\n", diseaseName) 72 | } else if err != nil && override { 73 | log.Println(diseaseName, err) 74 | log.Printf("%s: Overriding error check for missing fields and continuing.\n", diseaseName) 75 | } 76 | 77 | BQ.Client, err = bigquery.NewClient(BQ.Context, BQ.Project) 78 | if err != nil { 79 | log.Fatalln("Connecting to BigQuery:", err) 80 | } 81 | defer BQ.Client.Close() 82 | 83 | query, err := BuildQuery(BQ, tabs, displayQuery) 84 | if err != nil { 85 | log.Fatalln(diseaseName, err) 86 | } 87 | 88 | if displayQuery { 89 | return 90 | } 91 | 92 | if err := ExecuteQuery(BQ, query, diseaseName, missingFields); err != nil { 93 | log.Fatalln(diseaseName, err) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /phenotype_labels/disease/materialized_hesin_dates.sql: -------------------------------------------------------------------------------- 1 | WITH oper4 AS ( 2 | SELECT 41200 FieldID, eid, oper4 code, 3 | CASE 4 | WHEN h.admidate IS NOT NULL THEN h.admidate 5 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL 
THEN h.opdate 6 | ELSE h.epistart 7 | END vdate 8 | FROM `broad-ml4cvd.ukbb7089_201904.hesin` h 9 | WHERE oper4 IS NOT NULL 10 | ), diag_icd10 AS ( 11 | SELECT 41202 FieldID, eid, diag_icd10 code, 12 | CASE 13 | WHEN h.admidate IS NOT NULL THEN h.admidate 14 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 15 | ELSE h.epistart 16 | END vdate 17 | FROM `broad-ml4cvd.ukbb7089_201904.hesin` h 18 | WHERE diag_icd10 IS NOT NULL 19 | ), diag_icd9 AS ( 20 | SELECT 41203 FieldID, eid, diag_icd9 code, 21 | CASE 22 | WHEN h.admidate IS NOT NULL THEN h.admidate 23 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 24 | ELSE h.epistart 25 | END vdate 26 | FROM `broad-ml4cvd.ukbb7089_201904.hesin` h 27 | WHERE diag_icd9 IS NOT NULL 28 | ), oper4secondary AS ( 29 | SELECT 41210 FieldID, h.eid, sec.oper4 code, 30 | CASE 31 | WHEN h.admidate IS NOT NULL THEN h.admidate 32 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 33 | ELSE h.epistart 34 | END vdate 35 | FROM `broad-ml4cvd.ukbb7089_201904.hesin_oper` sec 36 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id 37 | WHERE TRUE 38 | AND sec.oper4 IS NOT NULL 39 | ), diag_icd10_secondary AS ( 40 | SELECT 41204 FieldID, h.eid, sec.diag_icd10 code, 41 | CASE 42 | WHEN h.admidate IS NOT NULL THEN h.admidate 43 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 44 | ELSE h.epistart 45 | END vdate 46 | FROM `broad-ml4cvd.ukbb7089_201904.hesin_diag10` sec 47 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id 48 | WHERE TRUE 49 | AND sec.diag_icd10 IS NOT NULL 50 | ), diag_icd9_secondary AS ( 51 | SELECT 41205 FieldID, h.eid, sec.diag_icd9 code, 52 | CASE 53 | WHEN h.admidate IS NOT NULL THEN h.admidate 54 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 55 | ELSE h.epistart 56 | END vdate 57 | FROM `broad-ml4cvd.ukbb7089_201904.hesin_diag9` sec 58 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id 59 | WHERE TRUE 60 | AND sec.diag_icd9 IS NOT NULL 61 | ) 62 | 63 | SELECT 64 | diagnostics.eid sample_id, diagnostics.FieldID, diagnostics.code value, 65 | CASE 66 | WHEN MIN(PARSE_DATE("%E4Y-%m-%d", vdate)) IS NULL THEN MIN(PARSE_DATE("%E4Y-%m-%d", p.value)) 67 | ELSE MIN(PARSE_DATE("%E4Y-%m-%d", vdate)) 68 | END first_date 69 | FROM ( 70 | SELECT * FROM oper4 71 | UNION DISTINCT 72 | SELECT * FROM diag_icd10 73 | UNION DISTINCT 74 | SELECT * FROM diag_icd9 75 | UNION DISTINCT 76 | SELECT * FROM oper4secondary 77 | UNION DISTINCT 78 | SELECT * FROM diag_icd10_secondary 79 | UNION DISTINCT 80 | SELECT * FROM diag_icd9_secondary 81 | ) diagnostics 82 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` p ON p.sample_id = diagnostics.eid AND p.array_idx=0 AND p.instance=0 AND p.FieldID=53 83 | GROUP BY diagnostics.eid, diagnostics.FieldID, diagnostics.code 84 | ORDER BY first_date ASC 85 | ; -------------------------------------------------------------------------------- /phenotype_labels/disease/materialized_special_dates.sql: -------------------------------------------------------------------------------- 1 | WITH dated_fields AS ( 2 | SELECT p.FieldID, p.sample_id eid, p.value code, cod.meaning, 3 | CASE 4 | WHEN SAFE.PARSE_DATE("%E4Y-%m-%d", d.value) IS NULL THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 5 | WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 6 | ELSE SAFE.PARSE_DATE("%E4Y-%m-%d", d.value) 7 | END vdate 
8 | FROM `broad-ml4cvd.ukbb7089_201904.phenotype` p 9 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0 10 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx 11 | AND ( 12 | FALSE 13 | OR (p.FieldID=42013 AND d.FieldID=42012) 14 | OR (p.FieldID=42011 AND d.FieldID=42010) 15 | OR (p.FieldID=42009 AND d.FieldID=42008) 16 | OR (p.FieldID=42007 AND d.FieldID=42006) 17 | OR (p.FieldID=42001 AND d.FieldID=42000) 18 | ) 19 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value 20 | ), 21 | dated_fields_fractional AS ( 22 | SELECT p.FieldID, p.sample_id eid, p.value code, cod.meaning, 23 | CASE 24 | WHEN SAFE.PARSE_DATE("%Y", d.value) IS NULL THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 25 | WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 26 | ELSE SAFE.PARSE_DATE("%Y", d.value) 27 | END vdate 28 | FROM `broad-ml4cvd.ukbb7089_201904.phenotype` p 29 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0 30 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx 31 | AND ( 32 | FALSE 33 | OR (p.FieldID=20004 AND d.FieldID=20010) 34 | OR (p.FieldID=20002 AND d.FieldID=20008) 35 | OR (p.FieldID=20001 AND d.FieldID=20006) 36 | ) 37 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value 38 | ) 39 | 40 | SELECT 41 | diagnostics.eid sample_id, diagnostics.FieldID, diagnostics.code value, MIN(vdate) first_date 42 | FROM ( 43 | SELECT * FROM dated_fields 44 | UNION DISTINCT 45 | SELECT * FROM dated_fields_fractional 46 | ) diagnostics 47 | WHERE TRUE 48 | AND vdate IS NOT NULL 49 | GROUP BY diagnostics.eid, diagnostics.FieldID, diagnostics.code 50 | ; -------------------------------------------------------------------------------- /phenotype_labels/disease/special_fields.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | var ( 4 | // These identify which FieldIDs should be special-cased: either to the 5 | // HESIN table or the table with fields that have a specially-known date. 6 | // All other fields can be queried against the main phenotype table, with an 7 | // assumed date of whatever makes the most sense for your purposes 8 | // (generally, I use the enrollment date, but you could use birthdate, etc). 
9 | 10 | MaterializedHesin = map[int]struct{}{ 11 | 41210: struct{}{}, 12 | 41202: struct{}{}, 13 | 41204: struct{}{}, 14 | 40001: struct{}{}, 15 | 40002: struct{}{}, 16 | 41200: struct{}{}, 17 | 41203: struct{}{}, 18 | 41205: struct{}{}, 19 | } 20 | 21 | MaterializedSpecial = map[int]struct{}{ 22 | 42013: struct{}{}, 23 | 42011: struct{}{}, 24 | 42009: struct{}{}, 25 | 42007: struct{}{}, 26 | 42001: struct{}{}, 27 | 20004: struct{}{}, 28 | 20002: struct{}{}, 29 | 20001: struct{}{}, 30 | } 31 | ) 32 | 33 | var ( 34 | // These can be helpful to make sure that the user is including all fields 35 | // that use the same family of codes 36 | 37 | ICD9 = map[int]struct{}{ 38 | 41203: struct{}{}, 39 | 41205: struct{}{}, 40 | } 41 | 42 | ICD10 = map[int]struct{}{ 43 | 41202: struct{}{}, 44 | 41204: struct{}{}, 45 | 40001: struct{}{}, 46 | 40002: struct{}{}, 47 | } 48 | 49 | OPCS = map[int]struct{}{ 50 | 41200: struct{}{}, 51 | 41210: struct{}{}, 52 | } 53 | ) 54 | 55 | func IsHesin(fieldID int) bool { 56 | _, exists := MaterializedHesin[fieldID] 57 | 58 | return exists 59 | } 60 | 61 | func IsSpecial(fieldID int) bool { 62 | _, exists := MaterializedSpecial[fieldID] 63 | 64 | return exists 65 | } 66 | -------------------------------------------------------------------------------- /phenotype_labels/disease/time_handling.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | const NullMarker = "NULL" 9 | 10 | func TimeToUKBDate(t time.Time) string { 11 | if t.Equal(time.Time{}) { 12 | return NullMarker 13 | } 14 | 15 | return t.Format("2006-01-02") 16 | } 17 | 18 | func TimesToFractionalYears(earlier, later time.Time) string { 19 | if later.Before(earlier) { 20 | return NullMarker 21 | } 22 | y, m, d, h, min, sec := time_diff(earlier, later) 23 | 24 | return fmt.Sprintf("%.6f", float64(y)+float64(m)/12+float64(d)/(12*30)+float64(h)/(24*365)+float64(min)/(60*24*365)+float64(sec)/(60*60*24*365)) 25 | } 26 | 27 | // Taken directly from https://stackoverflow.com/a/36531443/199475 28 | func time_diff(a, b time.Time) (year, month, day, hour, min, sec int) { 29 | if a.Location() != b.Location() { 30 | b = b.In(a.Location()) 31 | } 32 | if a.After(b) { 33 | a, b = b, a 34 | } 35 | y1, M1, d1 := a.Date() 36 | y2, M2, d2 := b.Date() 37 | 38 | h1, m1, s1 := a.Clock() 39 | h2, m2, s2 := b.Clock() 40 | 41 | year = int(y2 - y1) 42 | month = int(M2 - M1) 43 | day = int(d2 - d1) 44 | hour = int(h2 - h1) 45 | min = int(m2 - m1) 46 | sec = int(s2 - s1) 47 | 48 | // Normalize negative values 49 | if sec < 0 { 50 | sec += 60 51 | min-- 52 | } 53 | if min < 0 { 54 | min += 60 55 | hour-- 56 | } 57 | if hour < 0 { 58 | hour += 24 59 | day-- 60 | } 61 | if day < 0 { 62 | // days in month: 63 | t := time.Date(y1, M1, 32, 0, 0, 0, 0, time.UTC) 64 | day += 32 - t.Day() 65 | month-- 66 | } 67 | if month < 0 { 68 | month += 12 69 | year-- 70 | } 71 | 72 | return 73 | } 74 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/README.md: -------------------------------------------------------------------------------- 1 | # Phecodes 2 | 3 | Phecodes are a mapping of ICD (9 and 10) codes to a hierarchy of diseases. 4 | 5 | To put Phecodes on top of a UKBB dataset (ingested into bigquery according to the ml4h ingest script) you have to 6 | 1. follow the instructions in load_phecodes.sh to load the raw phecode information (icd9 -> phenotype) into the database. 7 | 2. 
run map_phecodes.py to create a local csv file with mappings of the HESIN table to phecodes (the dataset is hardcoded for now) 8 | 3. load that csv file into bigquery manually, for example with 9 | 10 | ``` 11 | gsutil cp ukbb_dev_phecode_mapping.csv.gz gs://ml4cvd/projects/pbatra/ukbb_dev/ 12 | bq load \ 13 | --replace \ 14 | --source_format=CSV \ 15 | --schema phecode_mapping.json \ 16 | ukbb_dev.phecode_mapping gs://ml4cvd/projects/pbatra/ukbb_dev/ukbb_dev_phecode_mapping.csv.gz 17 | ``` 18 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/phenotype_labels/phecodes/__init__.py -------------------------------------------------------------------------------- /phenotype_labels/phecodes/icd10.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "icd10", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "phecode", 10 | "type": "STRING" 11 | }, 12 | { 13 | "name": "excluded_phecodes", 14 | "type": "STRING" 15 | }, 16 | { 17 | "name": "excluded_phenotypes", 18 | "type": "STRING" 19 | } 20 | ] 21 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/load_phecodes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | 6 | #RAW DATA LOCATOINS 7 | #beta version of ICD10 (WHO) -> phecodes mapping 8 | #https://phewascatalog.org/phecodes_icd10 9 | #There is an ICD10CM (US) version as well 10 | #Phecode definition map is here 11 | #https://phewascatalog.org/phecodes 12 | 13 | #SHARED_DATASET -- should already be created 14 | SHARED_DATA=shared_data 15 | #specific to this func 16 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 17 | 18 | #Phecode CSVs (beta 2.2 release) are located in 19 | ICD10_FILE="gs://ml4h/data/phecodes/Phecode_map_v1_2_icd10_beta.csv.gz" 20 | PHECODE_DEF_FILE="gs://ml4h/data/phecodes/phecode_definitions1.2.csv.gz" 21 | 22 | #phecode definitions 23 | bq load \ 24 | --replace \ 25 | --source_format=CSV \ 26 | --skip_leading_rows 1 \ 27 | --schema ${__dir}/phecode_dictionary.json \ 28 | ${SHARED_DATA}.phecode_dictionary ${PHECODE_DEF_FILE} 29 | 30 | #icd10 31 | bq load \ 32 | --replace \ 33 | --source_format=CSV \ 34 | --skip_leading_rows 1 \ 35 | --format=prettyjson \ 36 | --schema ${__dir}/icd10.json \ 37 | ${SHARED_DATA}.phecode_icd10 ${ICD10_FILE} 38 | 39 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/phecode_dictionary.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "phecode", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "phenotype", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "phecode_exclude_range", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "sex", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "REQUIRED", 24 | "name": "rollup", 25 | "type": "INTEGER" 26 | }, 27 | { 28 | "mode": "REQUIRED", 29 | "name": "leaf", 30 | "type": "INTEGER" 31 | }, 32 | { 33 | "mode": "REQUIRED", 34 | "name": "category_number", 35 | "type": 
"INTEGER" 36 | }, 37 | { 38 | "mode": "REQUIRED", 39 | "name": "category", 40 | "type": "STRING" 41 | } 42 | 43 | 44 | ] 45 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/phecode_mapping.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "admidate", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "diag_icd10", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "phecode", 25 | "type": "STRING" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /scripts/create_dev_dataset.py: -------------------------------------------------------------------------------- 1 | """ Creates a ukbb_dev dataset which is a miniature version of a typical ukbbs. 2 | 3 | NOTE: dataset names, tables to be copied are currently hardcoded, 4 | will fail if dev dataset already 5 | exists. 6 | """ 7 | from google.cloud import bigquery 8 | 9 | # CONSTANTS 10 | 11 | ORIGINAL_DATASET = 'ukbb7089_201904' 12 | DEV_DATASET = 'ukbb_dev' 13 | EXACT_TABLES = ['censor', 'coding', 'dictionary'] 14 | LIMITED_SAMPLE_TABLES = ['hesin', 'hesin_diag10', 'hesin_diag9', 'hesin_oper'] 15 | client = bigquery.Client() # should already be set to default project 16 | 17 | 18 | if __name__ == '__main__': 19 | PROJECT = client.project 20 | FULL_DEV_DATASET = f"{PROJECT}.{DEV_DATASET}" 21 | print(FULL_DEV_DATASET) 22 | dataset = bigquery.Dataset.from_string(FULL_DEV_DATASET) 23 | dataset = client.create_dataset(dataset) 24 | print('Dataset {} created.'.format(dataset.dataset_id)) 25 | 26 | print(f"working on {DEV_DATASET}.phenotype") 27 | # create 1/1000th size dataset 28 | query_job = client.query(f""" 29 | CREATE TABLE {DEV_DATASET}.phenotype 30 | AS SELECT * FROM {ORIGINAL_DATASET}.phenotype 31 | WHERE MOD(sample_id,1000)=4""") 32 | 33 | rows = query_job.result() 34 | 35 | # copy some tables exactly 36 | for table in EXACT_TABLES: 37 | print(f"working on {DEV_DATASET}.{table}") 38 | query_job = client.query(f""" 39 | CREATE TABLE {DEV_DATASET}.{table} 40 | AS SELECT * FROM {ORIGINAL_DATASET}.{table} 41 | """) 42 | rows = query_job.result() 43 | 44 | # copy hesin tables by limiting eids within sample_ids 45 | for table in LIMITED_SAMPLE_TABLES: 46 | print(f"working on {DEV_DATASET}.{table}") 47 | query_job = client.query(f""" 48 | CREATE TABLE {DEV_DATASET}.{table} 49 | AS SELECT * FROM {ORIGINAL_DATASET}.{table} 50 | where eid in (select sample_id from 51 | {DEV_DATASET}.phenotype) 52 | """) 53 | rows = query_job.result() 54 | -------------------------------------------------------------------------------- /scripts/detach_disk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | DISK=${1:-data} 3 | shift 1 4 | VMS=$(gcloud compute instances list | awk '{print $1}') 5 | ZONE=us-central1-a 6 | for VM in $VMS; 7 | do gcloud compute instances detach-disk $VM --zone $ZONE --disk=$DISK ; 8 | done -------------------------------------------------------------------------------- /scripts/train_subsets.sh: -------------------------------------------------------------------------------- 1 | ECHO= 2 | MODEL_FILES= 3 | TENSORS="/mnt/disks/annotated-cardiac-tensors-45k-2021-03-25/2020-09-21/" 4 
| TENSOR_MAPS="ecg.ecg_rest_median_raw_10 mri.lax_4ch_heart_center " 5 | #array=( "drop_fuse_unsupervised_train_64.csv" "drop_fuse_unsupervised_train_128.csv" "drop_fuse_unsupervised_train_256.csv" "drop_fuse_unsupervised_train_512.csv" "drop_fuse_unsupervised_train_1024.csv" "drop_fuse_unsupervised_train_2048.csv" "drop_fuse_unsupervised_train_4096.csv") 6 | array=("drop_fuse_unsupervised_train_8192.csv" "drop_fuse_unsupervised_train_16384.csv") 7 | 8 | for i in "${array[@]}" 9 | do 10 | $ECHO ./scripts/tf.sh /home/sam/ml4h/ml4h/recipes.py --mode train_block \ 11 | --tensors "$TENSORS" --input_tensors "$TENSOR_MAPS" --output_tensors "$TENSOR_MAPS" \ 12 | --encoder_blocks /home/sam/trained_models/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/encoder_ecg_rest_median_raw_10.h5 \ 13 | /home/sam/trained_models/hypertuned_64m_18e_lax_4ch_heart_center_autoencoder_256d/encoder_lax_4ch_heart_center.h5 \ 14 | --merge_blocks pair \ 15 | --decoder_blocks /home/sam/trained_models/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/decoder_ecg_rest_median_raw_10.h5 \ 16 | /home/sam/trained_models/hypertuned_64m_18e_lax_4ch_heart_center_autoencoder_256d/decoder_lax_4ch_heart_center.h5 \ 17 | --pairs "$TENSOR_MAPS" --pair_loss contrastive --pair_loss_weight 0.1 --pair_merge dropout \ 18 | --batch_size 4 --epochs 316 --training_steps 128 --validation_steps 32 --test_steps 1 \ 19 | --num_workers 4 --patience 16 --tensormap_prefix ml4h.tensormap.ukb \ 20 | --id "drop_fuse_early_stop_v3_${i%.*}" --output_folder /home/sam/trained_models/ \ 21 | --inspect_model --activation mish --dense_layers 256 \ 22 | --train_csv "/home/sam/csvs/${i}" \ 23 | --valid_csv /home/sam/csvs/drop_fuse_unsupervised_valid.csv \ 24 | --test_csv /home/sam/csvs/sample_id_returned_lv_mass.csv 25 | 26 | 27 | $ECHO ./scripts/tf.sh /home/sam/ml4h/ml4h/recipes.py --mode infer_encoders \ 28 | --tensors "$TENSORS" --input_tensors "$TENSOR_MAPS" --output_tensors "$TENSOR_MAPS" \ 29 | --model_file "/home/sam/trained_models/drop_fuse_early_stop_v3_${i%.*}/drop_fuse_early_stop_v3_${i%.*}.h5" \ 30 | --id "drop_fuse_early_stop_v3_${i%.*}" --output_folder /home/sam/trained_models/ \ 31 | --sample_csv /home/sam/csvs/sample_id_returned_lv_mass.csv \ 32 | --tensormap_prefix ml4h.tensormap.ukb \ 33 | --dense_layers 256 34 | done 35 | -------------------------------------------------------------------------------- /scripts/validate_tensors.sh: -------------------------------------------------------------------------------- 1 | # use this script to validate the tensors created by the tensorize.sh script 2 | # expects two positional arguments: directory containing the tensors and the number of threads to use 3 | # example: ./validate_tensors.sh /mnt/disks/tensors/ 20 | tee completed_tensors.txt 4 | # the output will be in the following form: 5 | # OK - /mnt/disks/tensors/ukb1234.hd5 6 | # BAD - /mnt/disks/tensors/ukb5678.hd5 7 | 8 | 9 | INPUT_TENSORS_DIR=$1 10 | NUMBER_OF_THREADS=$2 11 | 12 | 13 | find ${INPUT_TENSORS_DIR}/*.hd5 | \ 14 | xargs -P ${NUMBER_OF_THREADS} -I {} \ 15 | bash -c "h5dump -n {} | (grep -q 'HDF5 \"{}\"' && echo 'OK - {}' || echo 'BAD - {}')" -------------------------------------------------------------------------------- /scripts/vm_image/ml4cvd-image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # server-conf-scripts are for configuration of a *fresh* VM and should not be 4 | # treated as startup scripts. (They are not idempotent.) 
5 | 6 | GCP_BUCKET="ml4h-core" 7 | 8 | # We assume we are running as a regular user, not root. 9 | 10 | # Enable gcsfuse to allow mounting of the google storage bucket as if it were a drive 11 | export GCSFUSE_REPO=gcsfuse-`lsb_release -c -s` 12 | echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list 13 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - 14 | 15 | # Install frequently-used packages 16 | # First, update apt since we have added new repos (above) 17 | sudo apt-get update 18 | 19 | sudo apt install -y r-base r-base-core unzip wget bzip2 python sqlite3 gcsfuse 20 | 21 | # Make gcsfuse auto-mount to /mnt/${GCP_BUCKET} in the future. Modify fstab to 22 | # do this automatically. Via 23 | # https://github.com/GoogleCloudPlatform/gcsfuse/blob/master/docs/mounting.md 24 | # and https://serverfault.com/a/830726/118452 to enable easier mount with read and 25 | # write access by non-root users. 26 | echo "${GCP_BUCKET} /mnt/${GCP_BUCKET} gcsfuse rw,allow_other,implicit_dirs,default_permissions,file_mode=777,dir_mode=777" | sudo tee -a /etc/fstab 27 | echo "fc-9a7c5487-04c9-4182-b3ec-13de7f6b409b /mnt/imputed_v2 gcsfuse ro,allow_other,implicit_dirs,default_permissions,file_mode=777,dir_mode=777" | sudo tee -a /etc/fstab 28 | echo "fc-7d5088b4-7673-45b5-95c2-17ae00a04183 /mnt/imputed_v3 gcsfuse ro,allow_other,implicit_dirs,default_permissions,file_mode=777,dir_mode=777" | sudo tee -a /etc/fstab 29 | 30 | 31 | # Enable docker (assumes Ubuntu, of any supported version) 32 | # See https://docs.docker.com/install/linux/docker-ce/ubuntu/#set-up-the-repository 33 | sudo apt-get remove docker docker-engine docker.io containerd runc 34 | sudo apt-get install -y \ 35 | ca-certificates \ 36 | curl \ 37 | gnupg \ 38 | lsb-release 39 | 40 | sudo mkdir -p /etc/apt/keyrings 41 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg 42 | echo \ 43 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 44 | $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 45 | 46 | sudo apt-get update 47 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin 48 | sudo service docker start 49 | 50 | sudo systemctl enable docker 51 | sudo groupadd -f docker 52 | 53 | # Manually install gcr 54 | # Via https://cloud.google.com/container-registry/docs/advanced-authentication#standalone_docker_credential_helper 55 | VERSION=1.5.0 56 | OS=linux 57 | ARCH=amd64 58 | curl -fsSL "https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v${VERSION}/docker-credential-gcr_${OS}_${ARCH}-${VERSION}.tar.gz" \ 59 | | tar xz --to-stdout ./docker-credential-gcr | sudo tee -a /usr/bin/docker-credential-gcr 1>/dev/null && sudo chmod +x /usr/bin/docker-credential-gcr 60 | docker-credential-gcr configure-docker 61 | 62 | sudo apt-get install -y python-setuptools 63 | 64 | 65 | # 66 | # Do last 67 | # 68 | 69 | # Cleanup apt cache 70 | sudo apt autoremove -y 71 | -------------------------------------------------------------------------------- /scripts/vm_launch/launch_dl_instance.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME=${1:-sam-p4} 4 | shift 1 5 | INSTANCE_TYPE=${1:-n1-standard-4} 6 | shift 1 7 | DISK_SIZE=${1:-100GB} 8 | shift 1 9 | ACCEL=${1:-nvidia-tesla-t4} 
10 | shift 1 11 | 12 | echo "Creating GPU instance ${NAME} from family dl-image of type ${INSTANCE_TYPE} with GPU ${ACCEL}..." 13 | 14 | echo "$@" 15 | 16 | gcloud compute instances create ${NAME} \ 17 | --project broad-ml4cvd \ 18 | --zone us-central1-a \ 19 | --image-project broad-ml4cvd \ 20 | --image-family dl-image \ 21 | --accelerator=type=${ACCEL},count=1 \ 22 | --maintenance-policy=TERMINATE \ 23 | --boot-disk-type=pd-standard \ 24 | --boot-disk-size=${DISK_SIZE} \ 25 | --service-account 783282864357-compute@developer.gserviceaccount.com \ 26 | --scopes https://www.googleapis.com/auth/cloud-platform \ 27 | --machine-type ${INSTANCE_TYPE} \ 28 | --metadata startup-script-url=gs://ml4cvd/projects/jamesp/home/startup.sh \ 29 | "$@" 30 | 31 | # Previously used the base ubuntu: 32 | # --image-project ubuntu-os-cloud \ 33 | # --image-family ubuntu-1804-lts \ 34 | 35 | # You can choose whatever size you like for the boot disk: 36 | # --boot-disk-size 300GB 37 | -------------------------------------------------------------------------------- /scripts/vm_launch/launch_instance.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME=${1:-jpp-1} 4 | shift 1 5 | INSTANCE_TYPE=${1:-n1-standard-1} 6 | shift 1 7 | DISK_SIZE=${1:-100GB} 8 | shift 1 9 | 10 | echo "Creating instance ${NAME} from family ml4cvd-image of type ${INSTANCE_TYPE}..." 11 | 12 | echo "$@" 13 | 14 | gcloud compute instances create ${NAME} \ 15 | --project broad-ml4cvd \ 16 | --zone us-central1-a \ 17 | --image-project broad-ml4cvd \ 18 | --image-family ml4cvd-image \ 19 | --boot-disk-type=pd-standard \ 20 | --boot-disk-size=${DISK_SIZE} \ 21 | --service-account 783282864357-compute@developer.gserviceaccount.com \ 22 | --scopes https://www.googleapis.com/auth/cloud-platform \ 23 | --machine-type ${INSTANCE_TYPE} \ 24 | --metadata startup-script-url=gs://ml4cvd/projects/jamesp/home/startup.sh \ 25 | "$@" 26 | 27 | # Previously used the base ubuntu: 28 | # --image-project ubuntu-os-cloud \ 29 | # --image-family ubuntu-1804-lts \ 30 | 31 | # You can choose whatever size you like for the boot disk: 32 | # --boot-disk-size 300GB 33 | -------------------------------------------------------------------------------- /scripts/vm_launch/run_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Source this once 4 | # Exit and reconnect after adding any groups 5 | 6 | # Allow this user to run docker images without sudo 7 | sudo usermod -aG docker $(whoami) 8 | 9 | # Use the docker-credential-gcr that we installed on bootup 10 | docker-credential-gcr configure-docker 11 | 12 | # Install pre-commit 13 | sudo apt install python3-pip 14 | pip3 install pre-commit 15 | -------------------------------------------------------------------------------- /scripts/vm_start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | VM=${1:-sam-gpu2} 3 | shift 1 4 | ZONE=us-central1-a 5 | MAX_TRIES=1000 6 | COUNTER=0 7 | while [[ $COUNTER -lt $(( $MAX_TRIES )) ]]; do 8 | sleep 1s 9 | gcloud compute instances start $VM --zone $ZONE 10 | if [[ $? -eq 0 ]] 11 | then 12 | echo "Potentially started vm: ${VM} after ${COUNTER} attempts." 13 | gcloud compute ssh $VM --zone $ZONE 14 | if [[ $? -eq 0 ]] 15 | then 16 | break 17 | else 18 | let COUNTER=COUNTER+1 19 | echo "Actually, no. Could not start vm: ${VM}, unsuccessful attempt: ${COUNTER}." 
20 | sleep 1s 21 | fi 22 | else 23 | let COUNTER=COUNTER+1 24 | sleep 1s 25 | echo "Could not start vm: ${VM}, unsuccessful attempt: ${COUNTER}." 26 | sleep 1s 27 | fi 28 | done 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup, find_packages 3 | 4 | here = pathlib.Path(__file__).parent.resolve() 5 | # Get the requirements from the requirements file 6 | requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8') 7 | long_description = (here / 'README.md').read_text(encoding='utf-8') 8 | 9 | 10 | setup( 11 | name='ml4h', 12 | version='0.1.0', 13 | description='Machine Learning for Health python package', 14 | long_description=long_description, # Optional 15 | long_description_content_type='text/markdown', 16 | url='https://github.com/broadinstitute/ml4h', 17 | python_requires='>=3.6', 18 | install_requires=requirements, 19 | packages=find_packages(), 20 | ) 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | 4 | from ml4h.arguments import parse_args 5 | from ml4h.test_utils import TMAPS as MOCK_TMAPS 6 | from ml4h.test_utils import build_hdf5s 7 | 8 | 9 | def pytest_configure(config): 10 | pytest.N_TENSORS = 100 11 | config.addinivalue_line("markers", "slow: mark tests as slow") 12 | 13 | 14 | @pytest.fixture(scope='class') 15 | def default_arguments(tmpdir_factory): 16 | temp_dir = tmpdir_factory.mktemp('data') 17 | build_hdf5s(temp_dir, MOCK_TMAPS.values(), n=pytest.N_TENSORS) 18 | hdf5_dir = str(temp_dir) 19 | inp_key = '3d_cont' 20 | out_key = '1d_cat' 21 | sys.argv = [ 22 | '', 23 | '--output_folder', hdf5_dir, 24 | '--input_tensors', inp_key, 25 | '--output_tensors', out_key, 26 | '--tensors', hdf5_dir, 27 | '--pool_x', '1', 28 | '--pool_y', '1', 29 | '--pool_z', '1', 30 | '--training_steps', '2', 31 | '--test_steps', '10', 32 | '--test_ratio', '0.6', 33 | '--validation_steps', '2', 34 | '--valid_ratio', '0.2', 35 | '--epochs', '2', 36 | '--num_workers', '0', 37 | '--batch_size', '4', 38 | '--gcs_cloud_bucket','ml4h-core/anamika/gcs-test4/', 39 | 40 | ] 41 | args = parse_args() 42 | return args 43 | -------------------------------------------------------------------------------- /tests/test_arguments.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | from ml4h.arguments import parse_args 4 | from ml4h.test_utils import TMAPS as MOCK_TMAPS 5 | 6 | 7 | class TestUConnect: 8 | 9 | def test_no_u(self, tmpdir): 10 | sys.argv = [ 11 | 'train', 12 | '--output_folder', str(tmpdir), 13 | ] 14 | args = parse_args() 15 | assert len(args.u_connect) == 0 16 | 17 | def test_simple_u(self, tmpdir): 18 | inp_key = '3d_cont' 19 | sys.argv = [ 20 | 'train', 21 | '--output_folder', str(tmpdir), 22 | '--input_tensors', inp_key, 23 | '--output_tensors', inp_key, 24 | '--u_connect', inp_key, inp_key, 25 | ] 26 | args = parse_args() 27 | assert len(args.u_connect) == 1 28 | inp, out = list(args.u_connect.items())[0] 29 | tmap = MOCK_TMAPS[inp_key] 30 | assert inp == tmap 31 | assert out == {tmap} 32 | 33 | def test_many_to_one(self, tmpdir): 34 | inp_key1 = '3d_cont' 35 | inp_key2 = '3d_cat' 36 | sys.argv = [ 37 | 'train', 
38 | '--output_folder', str(tmpdir), 39 | '--input_tensors', inp_key1, inp_key2, 40 | '--output_tensors', inp_key1, 41 | '--u_connect', inp_key1, inp_key1, 42 | '--u_connect', inp_key2, inp_key1, 43 | ] 44 | args = parse_args() 45 | assert len(args.u_connect) == 2 46 | assert args.u_connect[MOCK_TMAPS[inp_key1]] == {MOCK_TMAPS[inp_key1]} 47 | assert args.u_connect[MOCK_TMAPS[inp_key2]] == {MOCK_TMAPS[inp_key1]} 48 | 49 | def test_one_to_many(self, tmpdir): 50 | key1 = '3d_cont' 51 | key2 = '3d_cat' 52 | sys.argv = [ 53 | 'train', 54 | '--output_folder', str(tmpdir), 55 | '--input_tensors', key1, key2, 56 | '--output_tensors', key1, key2, 57 | '--u_connect', key1, key1, 58 | '--u_connect', key1, key2, 59 | ] 60 | args = parse_args() 61 | assert len(args.u_connect) == 1 62 | assert args.u_connect[MOCK_TMAPS[key1]] == {MOCK_TMAPS[key1], MOCK_TMAPS[key2]} 63 | 64 | def test_multi_u(self, tmpdir): 65 | key1 = '3d_cont' 66 | key2 = '3d_cat' 67 | sys.argv = [ 68 | 'train', 69 | '--output_folder', str(tmpdir), 70 | '--input_tensors', key1, key2, 71 | '--output_tensors', key1, key2, 72 | '--u_connect', key1, key1, 73 | '--u_connect', key2, key2, 74 | ] 75 | args = parse_args() 76 | assert len(args.u_connect) == 2 77 | assert args.u_connect[MOCK_TMAPS[key1]] == {MOCK_TMAPS[key1]} 78 | assert args.u_connect[MOCK_TMAPS[key2]] == {MOCK_TMAPS[key2]} 79 | --------------------------------------------------------------------------------