├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ └── ml4cvd-issue-template.md └── workflows │ ├── RELEASE.md │ ├── increment-version.yml │ ├── publish-to-gcr-ghcr.yml │ ├── publish-to-pypi.yml │ ├── publish-to-terra.yml │ └── python-package.yml ├── .gitignore ├── .lfsconfig ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── NOTICE.txt ├── README.md ├── RECIPE_EXAMPLES.md ├── benchmarks ├── BENCHMARKS.md ├── benchmark.py └── data.py ├── docker ├── DOCKER.md ├── ml4h_deploy │ ├── Dockerfile │ ├── README.md │ └── process_files.py └── vm_boot_images │ ├── CREATE_DOCKER_IMAGES.md │ ├── Dockerfile │ ├── build.sh │ └── config │ ├── fastai-requirements.txt │ ├── fastai.sh │ ├── pyukbb.sh │ ├── tensorflow-requirements.txt │ └── ubuntu.sh ├── git_secrets_provider_ml4h.txt ├── go.mod ├── go.sum ├── ingest ├── bulkprocess │ ├── README.md │ ├── dicom-metadata.go │ ├── dicom-overlay.go │ ├── field-ids.go │ ├── output.go │ ├── renamer.go │ └── zip-metadata.go ├── cmd │ ├── batcher │ │ ├── batcher.linux │ │ ├── dicom.go │ │ ├── functions.go │ │ └── main.go │ ├── build_curl_command.py │ ├── dicom2jpeg │ │ ├── dicom2jpeg.linux │ │ └── main.go │ ├── downloader │ │ ├── downloader.linux │ │ └── main.go │ ├── gene2chrpos │ │ ├── gene2chrpos.osx │ │ ├── lookups │ │ │ ├── ensembl.grch37.p13.genes │ │ │ └── url.txt │ │ └── main.go │ ├── manifester │ │ ├── main.go │ │ └── manifester.linux │ └── merge-lvef │ │ ├── main.go │ │ └── merge-lvef.linux ├── partners_ecg │ ├── organize_xmls.py │ └── remove_xml_duplicates.py └── ukbb_csv_bigquery │ ├── README.md │ ├── censor │ ├── censor.go │ ├── censor_result.go │ ├── main.go │ ├── query_single.go │ └── time_handling.go │ ├── convertcoding │ ├── cc_test.go │ └── main.go │ ├── convertdict │ ├── cd_test.go │ └── main.go │ ├── convertpheno │ ├── flagslice.go │ └── main.go │ ├── convertsample │ └── main.go │ ├── decrypt_all.sh │ ├── do_all.sh │ ├── firstdate │ ├── README.md │ └── main.go │ ├── importcensor │ ├── censor.json │ └── import.sh │ ├── importcoding │ ├── coding.json │ └── import.sh │ ├── importdict │ ├── dictionary.json │ └── import.sh │ ├── importhesin │ ├── hesin.json │ ├── hesin_diag.json │ ├── hesin_diag10.json │ ├── hesin_diag9.json │ ├── hesin_lubitz.json │ ├── hesin_oper.json │ └── import.sh │ ├── importpheno │ ├── append.sh │ └── phenotype.json │ ├── importsample │ ├── import.sh │ └── sample.json │ ├── inspect_screenshot.png │ └── preprocessing_data.ipynb ├── ml4h ├── DATA_MODELING_TESTS.md ├── DatabaseClient.py ├── TensorMap.py ├── __init__.py ├── applications │ ├── feature_selection │ │ ├── 2020.11.30_analysis_cleaned2.r │ │ ├── coxnet_training_testing_evaluating.py │ │ └── xgboost_training_testing_evaluating.py │ ├── ingest │ │ ├── ingest_autosegment.py │ │ ├── ingest_mri.py │ │ ├── ingest_xml_metadata.py │ │ ├── requirements.txt │ │ └── two_d_projection.py │ └── jpp_inference_rv │ │ ├── README.md │ │ ├── infer_on_sax.py │ │ └── infer_to_hd5_local.sh ├── arguments.py ├── data_descriptions.py ├── defines.py ├── explorations.py ├── hypertuning.py ├── logger.py ├── make_tensor_maps_for_partners_ecg_labels.py ├── metrics.py ├── ml4ht_integration │ ├── __init__.py │ ├── tensor_generator.py │ └── tensor_map.py ├── models │ ├── Block.py │ ├── __init__.py │ ├── basic_blocks.py │ ├── conv_blocks.py │ ├── diffusion_blocks.py │ ├── inspect.py │ ├── layer_wrappers.py │ ├── legacy_models.py │ ├── merge_blocks.py │ ├── model_factory.py │ ├── perceiver_blocks.py │ ├── pretrained_blocks.py │ ├── train.py │ ├── train_diffusion.py │ ├── 
transformer_blocks.py │ └── transformer_blocks_embedding.py ├── normalizer.py ├── optimizers.py ├── plots.py ├── recipes.py ├── runtime_data_defines.py ├── tensor_generators.py ├── tensorize │ ├── PARTNERS.md │ ├── README.md │ ├── TENSORIZE.md │ ├── __init__.py │ ├── dataflow │ │ ├── __init__.py │ │ ├── bigquery_ukb_queries.py │ │ ├── fieldids.json │ │ ├── load_fieldids.sh │ │ ├── ml4h_dataflow.yml │ │ └── requirements_ml4h_dataflow.txt │ ├── merge_hd5s.py │ ├── tensor_writer_mgb.py │ ├── tensor_writer_ukbb.py │ └── tensorize_dataflow.py ├── tensormap │ ├── __init__.py │ ├── celeba.py │ ├── gatk.py │ ├── general.py │ ├── mgb │ │ ├── __init__.py │ │ ├── dynamic.py │ │ ├── ecg.py │ │ └── xdl.py │ ├── mnist.py │ ├── tensor_map_maker.py │ ├── text.py │ └── ukb │ │ ├── __init__.py │ │ ├── categorical.py │ │ ├── continuous.py │ │ ├── demographics.py │ │ ├── disease.py │ │ ├── dxa.py │ │ ├── ecg.py │ │ ├── embedding.py │ │ ├── genetics.py │ │ ├── mri.py │ │ ├── mri_brain.py │ │ ├── mri_ecg.py │ │ ├── mri_vtk.py │ │ └── survival.py ├── test_utils.py └── visualization_tools │ ├── __init__.py │ ├── annotation_storage.py │ ├── annotations.py │ ├── annotations_schema.json │ ├── batch_image_annotations.py │ ├── dicom_interactive_plots.py │ ├── dicom_plots.py │ ├── ecg_interactive_plots.py │ ├── ecg_reshape.py │ ├── ecg_static_plots.py │ ├── facets.py │ └── hd5_mri_plots.py ├── model_zoo ├── DROID-MVP │ ├── droid_mvp_checkpoint │ │ ├── checkpoint │ │ ├── chkp.data-00000-of-00001 │ │ └── chkp.index │ ├── droid_mvp_inference.py │ ├── droid_mvp_model_description.py │ ├── movinet_a2_base │ │ ├── checkpoint │ │ ├── ckpt-1.data-00000-of-00001 │ │ └── ckpt-1.index │ └── readme.md ├── DROID-RV │ ├── droid_rv_checkpoint │ │ ├── checkpoint │ │ ├── chkp.data-00000-of-00001 │ │ └── chkp.index │ ├── droid_rv_inference.py │ ├── droid_rv_model_description.py │ ├── droid_rvef_checkpoint │ │ ├── checkpoint │ │ ├── chkp.data-00000-of-00001 │ │ └── chkp.index │ ├── movinet_a2_base │ │ ├── checkpoint │ │ ├── ckpt-1.data-00000-of-00001 │ │ └── ckpt-1.index │ └── readme.md ├── DROID │ ├── README.md │ ├── data_descriptions │ │ ├── __init__.py │ │ ├── echo.py │ │ └── wide_file.py │ ├── echo_defines.py │ ├── echo_supervised_inference_recipe.py │ ├── echo_supervised_training_recipe.py │ ├── echo_to_lmdb.py │ ├── encoders │ │ ├── LA_DROID │ │ │ └── model │ │ │ │ ├── checkpoint │ │ │ │ ├── chkp.data-00000-of-00001 │ │ │ │ └── chkp.index │ │ └── LV_DROID │ │ │ └── model │ │ │ ├── checkpoint │ │ │ ├── chkp.data-00000-of-00001 │ │ │ └── chkp.index │ └── model_descriptions │ │ ├── __init__.py │ │ └── echo.py ├── ECG2AF │ ├── README.md │ ├── architecture.png │ ├── ecg2af_infer.ipynb │ ├── ecg2af_quintuplet_v2024_01_13.keras │ ├── ecg_5000_survival_curve_af_quadruple_task_mgh_v2021_05_21.h5 │ ├── km.jpg │ ├── salience.jpg │ ├── strip_II_survival_curve_af_v2021_06_15.h5 │ ├── strip_I_survival_curve_af_v2021_06_15.h5 │ └── study_design.jpg ├── ECG_PheWAS │ ├── README.md │ ├── decoder_median.h5 │ ├── decoder_median.keras │ ├── ecg_median_autoencoder_latent_space_infer.ipynb │ ├── ecg_median_autoencoder_reconstruct_phecode.ipynb │ ├── ecg_write_biosppy_medians.ipynb │ ├── encoder_median.h5 │ ├── encoder_median.keras │ ├── latent_space_phewas.ipynb │ ├── mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.h5 │ ├── mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.keras │ ├── pandas_boxplots_phewas.ipynb │ ├── phecode_projection.ipynb │ └── ukb_phewas.png ├── PCLR │ ├── PCLR.h5 │ ├── PCLR │ │ ├── saved_model.pb │ │ └── variables │ │ │ 
├── variables.data-00000-of-00001 │ │ │ └── variables.index │ ├── PCLR_lead_I │ │ ├── saved_model.pb │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ ├── PCLR_lead_II │ │ ├── saved_model.pb │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ ├── README.md │ ├── build_model.py │ ├── get_representations.py │ ├── preprocess_ecg.py │ └── requirements.txt ├── README.md ├── adiposity_mlandepi │ ├── README.md │ ├── compute_projections.py │ ├── downstream_associations_v3.ipynb │ ├── ingest.py │ ├── shrinkage_loss.py │ └── train.py ├── cardiac_mri_derived_left_ventricular_mass │ ├── Lreg.png │ ├── Lseg.png │ ├── README.md │ ├── arguments_2020-11-13_11-47.txt │ ├── calibrations_sax_all_diastole_segmented.png │ ├── metric_history_sax_diastole_segment_no_flat.png │ ├── per_class_roc_sax_all_diastole_segmented.png │ ├── precision_recall_sax_all_diastole_segmented.png │ └── sax_diastole_segment_no_flat.h5 ├── dropfuse │ ├── README.md │ ├── decoder_ecg_rest_median_raw_10.h5 │ ├── decoder_lax_4ch_heart_center.h5 │ ├── dropout_pair_contrastive_lax_4ch_cycle_ecg_median_10_pretrained_256d_v2020_06_07.h5 │ ├── encoder_ecg_rest_median_raw_10.h5 │ ├── encoder_lax_4ch_heart_center.h5 │ └── overview.png ├── left_ventricular_mass_from_ecg_student_and_mri_teacher │ ├── README.md │ ├── TrainingAndTestSets.jpg │ ├── ecg_rest_raw_age_sex_bmi_lvm_asymmetric_loss.h5 │ ├── ecg_rest_raw_lvm_asymmetric_loss.h5 │ └── ecg_rest_raw_lvm_symmetric_loss.h5 ├── liver_fat_from_mri_ukb │ ├── README.md │ ├── liver_fat_from_echo.h5 │ ├── liver_fat_from_echo_teacher_model.png │ ├── liver_fat_from_ideal.h5 │ └── liver_fat_from_ideal_student_model.png ├── mi_feature_selection │ ├── 2020.11.30_analysis_cleaned2.ipynb │ ├── 2020.11.30_analysis_cleaned2.r │ ├── README.md │ ├── coxnet_training_testing_evaluating.py │ ├── models │ │ ├── coxnet_survival_05_final.pickle │ │ └── xgcox_model.json │ ├── requirements.txt │ └── xgboost_training_testing_evaluating.py ├── registration_reveals_genetics │ ├── README.md │ ├── latent_space_comparisons.ipynb │ ├── registration.png │ ├── table1.png │ └── table2.png └── silhouette_mri │ ├── README.md │ ├── callbacks.py │ ├── shrinkage_loss.py │ └── train_models.py ├── notebooks ├── ML4H_ML_intro.ipynb ├── ML4H_Model_Factory_Intro.ipynb ├── latent_space_bias_detection.ipynb ├── mnist_survival_analysis_demo.ipynb ├── review_results │ ├── identify_a_sample_to_review.ipynb │ ├── image_annotations.ipynb │ ├── review_one_sample.ipynb │ └── test_error_handling_for_notebook_visualizations.ipynb ├── terra_featured_workspace │ ├── generate_synthetic_tabular_data.ipynb │ ├── image_annotations_demo.ipynb │ ├── ml4h_setup.ipynb │ ├── review_model_results_interactive.ipynb │ ├── review_one_sample_interactive.ipynb │ └── workspace_description.md └── typecast_column_for_hesin.ipynb ├── phenotype_labels ├── disease │ ├── DATES.md │ ├── README.md │ ├── main.go │ ├── materialized_hesin_dates.sql │ ├── materialized_special_dates.sql │ ├── result.go │ ├── special_fields.go │ ├── tabfile.go │ └── time_handling.go └── phecodes │ ├── README.md │ ├── __init__.py │ ├── icd10.json │ ├── load_phecodes.sh │ ├── map_phecodes.py │ ├── phecode_dictionary.json │ └── phecode_mapping.json ├── pylintrc ├── scripts ├── create_dev_dataset.py ├── detach_disk.sh ├── jupyter.sh ├── latent_space_gwas.py ├── merge_hd5s.sh ├── tensorize.sh ├── tf.sh ├── train_subsets.sh ├── validate_tensors.sh ├── vm_image │ └── ml4cvd-image.sh ├── vm_launch │ ├── launch_dl_instance.sh │ ├── 
launch_instance.sh │ └── run_once.sh └── vm_start.sh ├── setup.py └── tests ├── conftest.py ├── ml4ht_integration └── test_tensor_map.py ├── test_arguments.py ├── test_models.py ├── test_recipes.py └── test_tensor_generators.py /.github/ISSUE_TEMPLATE/ml4cvd-issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ml4h issue template 3 | about: default issue template 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **What** 11 | Summarize the issue in 1-2 sentences. 12 | 13 | **Why** 14 | Describe why this issue should be solved, the new feature implemented, etc. 15 | 16 | **How** 17 | Give a high-level overview of how you propose to address it. 18 | 19 | **Acceptance Criteria** 20 | Unambiguous milestones; if any are incomplete, the PR cannot be merged. 21 | -------------------------------------------------------------------------------- /.github/workflows/RELEASE.md: -------------------------------------------------------------------------------- 1 | Release process 2 | 3 | 4 | For PRs and after merge, testing is run with: 5 | [python-package.yml](python-package.yml) 6 | 7 | 8 | 9 | Manually navigate to GitHub's Releases page and select Draft a new release. 10 | https://github.com/broadinstitute/ml4h/releases 11 | 12 | This process should automatically kick off the following workflows: 13 | 14 | Creation of updated Docker images from the CPU and GPU base images, published to GCR and GHCR 15 | [publish-to-gcr-ghcr.yml](publish-to-gcr-ghcr.yml) 16 | 17 | Images are named: 18 | tf2.9-latest-cpu 19 | tf2.9-latest-gpu 20 | And can be found on [GitHub's Container Registry](https://github.com/broadinstitute/ml4h/pkgs/container/ml4h) 21 | 22 | Updating of the ml4h library and publishing it to PyPI 23 | [publish-to-pypi.yml](publish-to-pypi.yml) 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/increment-version.yml: -------------------------------------------------------------------------------- 1 | name: Increment version 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | update_version: 12 | if: ${{ github.event.release.tag_name != '' }} 13 | name: Update version 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Check out source code 17 | uses: actions/checkout@v4 18 | env: 19 | GIT_LFS_SKIP_SMUDGE: 1 20 | 21 | - name: Checkout main for version edit 22 | run: | 23 | export GIT_LFS_SKIP_SMUDGE=1 24 | # Note: the following account information will not work on GHES 25 | export GIT_LFS_SKIP_SMUDGE=1 26 | git config --global user.name "github-actions[bot]" 27 | git config --global user.email {user.id}+{user.login}@users.noreply.github.com 28 | git fetch 29 | git checkout main 30 | 31 | - name: Replace string in file 32 | run: | 33 | grep "version" setup.py 34 | if [[ ${{ github.event.release.tag_name }} =~ [v0-9.]* ]]; then 35 | sed -i "s/version='[v0-9.]*',/version='${{ github.event.release.tag_name }}',/g" setup.py 36 | else 37 | echo "Tag is an unexpected value and no version update will occur" 38 | fi 39 | 40 | - name: Check for version update 41 | run: cat setup.py 42 | 43 | - name: Push to git 44 | run: | 45 | git config lfs.https://github.com/broadinstitute/ml4h.locksverify false 46 | git config lfs.https://github.com/broadinstitute/ml4h.git.locksverify false 47 | git remote set-url origin https://${{ secrets.GHCR_USERNAME }}:${{ secrets.GHCR_TOKEN }}@github.com/${{ github.repository }} 48 | git add setup.py 49
| git commit -m "Version bump to ${{ github.event.release.tag_name }}" 50 | git push 51 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-gcr-ghcr.yml: -------------------------------------------------------------------------------- 1 | name: Push to GCR/GHCR GitHub Action 2 | on: 3 | push: 4 | tags: 5 | - '*' # Push events to every tag not containing / 6 | workflow_dispatch: 7 | 8 | 9 | jobs: 10 | build-and-push-to-gcr-service-account: 11 | name: Build & push to GCR/GHCR 12 | runs-on: self-hosted 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Authenticate to Google Cloud 16 | id: auth 17 | uses: google-github-actions/auth@v2 18 | with: 19 | credentials_json: '${{ secrets.B64_GCLOUD_SERVICE_ACCOUNT_JSON }}' 20 | - name: Building and pushing the image 21 | run: | 22 | echo "${{ secrets.GHCR_TOKEN }}" | docker login ghcr.io -u "${{ secrets.GHCR_USERNAME }}" --password-stdin 23 | yes | gcloud auth configure-docker gcr.io 24 | docker system prune --all --force 25 | ./docker/vm_boot_images/build.sh -P 26 | docker system prune --all --force 27 | ./docker/vm_boot_images/build.sh -c -P 28 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' # Push events to every tag not containing / 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Build distribution 📦 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.12] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | pip cache purge 26 | python -m pip install --upgrade pip 27 | # Install the ml4h Python package. 28 | pip install . 29 | pip install -r docker/vm_boot_images/config/tensorflow-requirements.txt 30 | - name: Install pypa/build 31 | run: >- 32 | python -m pip install build --user 33 | - name: Build a binary wheel and a source tarball 34 | run: python -m build 35 | - name: Store the distribution packages 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: python-package-distributions-${{ matrix.python-version }} 39 | path: dist/ 40 | 41 | publish-to-pypi: 42 | name: >- 43 | Publish Python 🐍 distribution 📦 to PyPI 44 | needs: 45 | - build 46 | runs-on: ubuntu-latest 47 | environment: 48 | name: pypi 49 | url: https://pypi.org/p/ml4h 50 | permissions: 51 | id-token: write # IMPORTANT: mandatory for trusted publishing 52 | 53 | steps: 54 | - name: Download all the dists 55 | uses: actions/download-artifact@v4 56 | with: 57 | name: python-package-distributions-3.12 58 | path: dist/ 59 | - name: Publish distribution 📦 to PyPI 60 | uses: pypa/gh-action-pypi-publish@release/v1 61 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install the ml4h Python package and run its tests. 
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Test ml4h Python package 5 | 6 | on: 7 | workflow_dispatch: 8 | # Allows manually triggering workflow in GitHub UI on selected branch. 9 | # GitHub doc: https://docs.github.com/en/free-pro-team@latest/actions/reference/events-that-trigger-workflows#workflow_dispatch. 10 | # GitHub blog demo: https://github.blog/changelog/2020-07-06-github-actions-manual-triggers-with-workflow_dispatch/. 11 | 12 | push: 13 | branches: [ master ] 14 | 15 | pull_request: 16 | branches: [ master ] 17 | 18 | jobs: 19 | build: 20 | 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: [3.11, 3.12] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | - name: Install dependencies 33 | run: | 34 | pip cache purge 35 | python -m pip install --upgrade pip 36 | # Install the ml4h Python package. 37 | pip install . 38 | - name: Test with pytest and pytest-xdist 39 | run: | 40 | pytest tests -m "not slow" -n auto 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .idea/* 4 | *.linux 5 | *.log 6 | **/__pycache__/* 7 | trained_models/* 8 | recipes_output/* 9 | .ipynb_checkpoints 10 | ml4h.egg-info/* 11 | .Rproj.user 12 | docker/terra_image/*/** 13 | build/ 14 | dist/ 15 | -------------------------------------------------------------------------------- /.lfsconfig: -------------------------------------------------------------------------------- 1 | [lfs] 2 | url = git@github.com:broadinstitute/ml4h.git 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/kynan/nbstripout 3 | rev: 0.6.1 4 | hooks: 5 | - id: nbstripout 6 | files: ".ipynb" 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.4.0 9 | hooks: 10 | - id: trailing-whitespace 11 | files: ".py" 12 | - id: end-of-file-fixer 13 | files: ".py" 14 | - id: debug-statements 15 | files: ".py" 16 | - repo: https://github.com/Lucas-C/pre-commit-hooks 17 | rev: v1.3.1 18 | hooks: 19 | - id: remove-tabs 20 | files: ".py" 21 | - repo: https://github.com/asottile/add-trailing-comma 22 | rev: v2.4.0 23 | hooks: 24 | - id: add-trailing-comma 25 | exclude: "ml4h/tensormap/ukb/by_script.py" 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | ML4H is released under the following BSD 3-Clause License: 2 | 3 | Copyright (c) 2025, Broad Institute, Inc. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 
14 | 15 | * Neither the name Broad Institute, Inc. 16 | nor the names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include docker/vm_boot_images/config/tensorflow-requirements.txt -------------------------------------------------------------------------------- /benchmarks/BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # Data generation benchmarks :straight_ruler: 2 | Benchmarks keep track of how quickly we can produce data for training models. 3 | To run all benchmarks: 4 | ```bash 5 | python benchmarks/benchmark.py 6 | ``` 7 | 8 | 9 | ## Index 10 | * [Running benchmarks](#running-benchmarks) 11 | * [Contributing benchmarks](#contributing-benchmarks) 12 | 13 | 14 | ## Running benchmarks 15 | Benchmarks are run using [benchmark.py](./benchmark.py). 16 | You can run specific benchmarks using 17 | ```bash 18 | python benchmarks/benchmark.py [name of benchmark] [name of another benchmark] ... 19 | ``` 20 | The names of the currently available benchmarks are the directory names under [benchmark_results](./benchmark_results). 21 | 22 | ## Contributing benchmarks 23 | 24 | There are two components to a benchmark: 25 | 1. [The type of data](#data-descriptions) 26 | 2. [The way the data is iterated over](#generatorfactories) 27 | 28 | Both can be tinkered with. 29 | 30 | ### Data descriptions 31 | Synthetic data is produced in [data.py](./data.py) using the function `data.build_example`. 32 | A synthetic datum is described by a 3-tuple 33 | ``` 34 | [name, shape, data type] 35 | ``` 36 | For example, the `ecg_single_task` benchmark simulates reading ECG-BMI pairs, 37 | so it has two data descriptions: 38 | ```python 39 | ('ecg', (5000, 12), StorageType.CONTINUOUS), 40 | ('bmi', (1,), StorageType.CONTINUOUS), 41 | ``` 42 | 43 | ### GeneratorFactories 44 | `GeneratorFactory`s prepare the data for a class of generators and produce data generators that use that data. 45 | For example, `benchmark.TensorGeneratorFactory` builds `ml4h.TensorGenerator`s with `hd5`s to store data. 46 | `GeneratorFactory`s must implement a `setup` function which prepares the synthetic data given a `DataDescription` 47 | and a number of synthetic samples to build. 48 | 49 | `GeneratorFactory`s must also implement `__call__(batch_size: int, num_workers: int)` 50 | which produces a data generator with that batch size and number of multiprocessing workers.
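To make the interface concrete, here is a minimal sketch of a factory that keeps its synthetic data in memory. The class name, the plain-dict batches, and the in-memory storage are illustrative assumptions for this sketch, not the actual implementations in [benchmark.py](./benchmark.py):
```python
from typing import Iterator, List, Tuple

import numpy as np

# A DataDescription here is the 3-tuple described above: (name, shape, data type).
# The data type is left as a plain object because this sketch never inspects it.
DataDescription = Tuple[str, tuple, object]


class InMemoryGeneratorFactory:
    """Illustrative factory: builds synthetic samples in setup, then yields random batches."""

    def setup(self, data_descriptions: List[DataDescription], num_samples: int) -> None:
        # Prepare `num_samples` synthetic examples, one array per data description.
        self.samples = [
            {name: np.random.random(shape) for name, shape, _ in data_descriptions}
            for _ in range(num_samples)
        ]

    def __call__(self, batch_size: int, num_workers: int) -> Iterator[list]:
        # num_workers is ignored in this single-process sketch; batch_size must
        # not exceed the number of samples prepared in setup.
        def generate():
            while True:
                idx = np.random.choice(len(self.samples), size=batch_size, replace=False)
                yield [self.samples[i] for i in idx]

        return generate()
```
A real factory would typically write its data to disk in `setup` (as `TensorGeneratorFactory` does with `hd5`s) and use `num_workers` to parallelize the generator it returns.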
51 | -------------------------------------------------------------------------------- /docker/DOCKER.md: -------------------------------------------------------------------------------- 1 | # ML4H Docker 2 | 3 | ## Editing and pushing the docker 4 | 5 | To edit the packages inside the ML4H docker container, first edit: 6 | ``` 7 | ml4h/docker/vm_boot_images/config/tensorflow-requirements.txt 8 | ``` 9 | Add a line for each package, with optional version numbers. 10 | 11 | Then, the Docker image should be pushed to both the [Google Container Registry](https://console.cloud.google.com/gcr/images/broad-ml4cvd/GLOBAL/deeplearning) and the [GitHub Container Registry (GHCR)](https://github.com/broadinstitute/ml4h/pkgs/container/ml4h). 12 | 13 | For GHCR, you will need to generate a [personal access token](https://github.com/settings/tokens) on GitHub and grant docker access: 14 | ``` 15 | docker login ghcr.io -u GITHUB_USERNAME -p ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 16 | ``` 17 | 18 | Finally, use the ```build.sh``` script to build, tag, and push an ML4H image. Use ```-c``` for the CPU-only image: 19 | ``` 20 | cd ml4h 21 | ./docker/vm_boot_images/build.sh -P 22 | ./docker/vm_boot_images/build.sh -c -P 23 | ``` 24 | Note that each image will have two tags: a short unique SHA1 tag from ```HEAD```, and either ```tf2.9-latest-gpu``` or ```tf2.9-latest-cpu```. 25 | -------------------------------------------------------------------------------- /docker/ml4h_deploy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM us-central1-docker.pkg.dev/broad-ml4cvd/deeplearning/ml4h:tf2.19-latest-cpu 2 | 3 | # Set the working directory 4 | WORKDIR /app 5 | 6 | # Install TensorFlow (or any other necessary libraries) 7 | RUN pip install tensorflow 8 | 9 | # Copy the Keras model file into the Docker image 10 | COPY ecg_5000_hf_quintuplet_dropout_v2023_04_17.keras /app/ecg_5000_hf_quintuplet_dropout_v2023_04_17.keras 11 | 12 | # Copy the Python script 13 | COPY process_files.py /app/process_files.py 14 | 15 | RUN pip install ml4h 16 | 17 | # Define the command to run the script 18 | CMD ["python", "process_files.py", "/data"] -------------------------------------------------------------------------------- /docker/ml4h_deploy/README.md: -------------------------------------------------------------------------------- 1 | # Make a deployment docker with a model from the Model Factory 2 | Edit `Dockerfile` and `process_files.py` to copy and load your `.keras` model file. 3 | Then build the docker image: 4 | ```bash 5 | docker build -t ecg2hf_finngen_deploy . 6 | ``` 7 | Then run the docker image: 8 | ```bash 9 | docker run -v /home/sam/ecg_xml:/data -v /home/sam:/output ecg2hf_finngen_deploy 10 | ``` 11 | If it works, you should see the output in `/home/sam`. Then save your docker image as a tarball: 12 | ```bash 13 | docker save ecg2hf_finngen_deploy:latest -o ecg2hf_finngen_deploy.tar 14 | ``` 15 | 16 | ## Deploy to FinnGEN 17 | Download the tarball (possibly a huge 20GB+ file). Then split it into smaller files, because FinnGEN has a limit of 5GB per file: 18 | ```bash 19 | split -b 2300M ecg2hf_finngen_deploy.tar ecg2hf_finngen_deploy_part_ 20 | ``` 21 | Log in to your FinnGEN account and navigate to the green bucket Google Console page. 22 | The address depends on the sandbox version.
Currently, it is at: [https://console.cloud.google.com/storage/browser/fg-production-sandbox-6_greenuploads/sam](https://console.cloud.google.com/storage/browser/fg-production-sandbox-6_greenuploads/sam). 23 | Upload all the parts here. Then after they pass the virus scan, which takes ~20 minutes, they will show up in your FinnGEN IVM at the path `/finngen/green/sam`. 24 | You can replace `sam` with any folder name you want, but must be consistent between the upload and the IVM path. 25 | 26 | 27 | More docs are here: [https://docs.finngen.fi/working-in-the-sandbox/quirks-and-features/how-to-upload-to-your-own-ivm-via-finngen-green](https://docs.finngen.fi/working-in-the-sandbox/quirks-and-features/how-to-upload-to-your-own-ivm-via-finngen-green) 28 | 29 | Once all the pieces have been uploaded, reassemble them in the sandbox: 30 | ```bash 31 | cd /finngen/green/sam 32 | cat ecg2hf_finngen_deploy_part_* > ~/ecg2hf_finngen_deploy.tar 33 | ``` 34 | 35 | Load the docker image: 36 | ```bash 37 | cd ~ 38 | docker load -i ecg2hf_finngen_deploy.tar 39 | ``` 40 | Then run the docker image: 41 | ``` 42 | docker run -v /finngen/library-red/EAS_HEART_FAILURE_1.0/data/ecg:/data -v /home/ivm/output:/output ecg2hf_finngen_deploy 43 | ``` -------------------------------------------------------------------------------- /docker/vm_boot_images/Dockerfile: -------------------------------------------------------------------------------- 1 | # The suggested base images are: 2 | # - ufoym/deepo:all-py36-jupyter for GPU-enabled machines 3 | # - ufoym/deepo:all-py36-jupyter-cpu for CPU-only (non-GPU-enabled) machines 4 | # BASE_IMAGE can be specified at build time by adding the following argument: 5 | # --build_arg BASE_IMAGE="some_other_image" 6 | 7 | ARG BASE_IMAGE 8 | FROM ${BASE_IMAGE} 9 | 10 | LABEL maintainer="Sam Freesun Friedman " 11 | 12 | # Setup time zone (or else docker build hangs) 13 | ENV TZ=America/New_York 14 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 15 | 16 | COPY ./config/* /app/ 17 | WORKDIR /app 18 | 19 | # Note that some layers are kept separate to encourage layer re-use and to try 20 | # to minimize full recompilation where possible. 21 | 22 | # Basic setup 23 | #RUN rm /etc/apt/sources.list.d/cuda.list 24 | ##RUN rm /etc/apt/sources.list.d/nvidia-ml.list 25 | #RUN apt-key del 7fa2af80 26 | #RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub 27 | #RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub 28 | RUN ./ubuntu.sh 29 | 30 | # Point any MLflow tracking hooks at the main MLflow instance on Cloud Run 31 | ENV MLFLOW_TRACKING_URI='https://mlflow-783282864357.us-central1.run.app' 32 | 33 | # FastAI. See the Developer Install under https://github.com/fastai/fastai/ to 34 | # understand this odd sequence of installing then uninstalling fastai before 35 | # installing it from github. (Basically, to get its deps.) 
36 | # RUN pip3 install -r fastai-requirements.txt 37 | # RUN pip3 uninstall -y fastai 38 | # RUN ./fastai.sh 39 | 40 | RUN apt-get update 41 | RUN apt-get upgrade -y 42 | RUN apt-get install python3 python3-pip python3-tk libgl1-mesa-glx libxt-dev -y 43 | RUN apt-get install -y wget unzip curl python3-pydot graphviz git ffmpeg 44 | 45 | # Requirements for the tensorflow project 46 | RUN pip3 install --upgrade pip 47 | #RUN pip3 install -r pre_requirements.txt 48 | RUN pip3 install -r tensorflow-requirements.txt 49 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/fastai-requirements.txt: -------------------------------------------------------------------------------- 1 | --find-links https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html 2 | torch_nightly 3 | fastai 4 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/fastai.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install the github repo version 4 | git clone https://github.com/fastai/fastai 5 | cd fastai 6 | 7 | # Peg our version to a known-working SHA, since they make 8 | # post-1.0 breaking changes literally every day... 9 | git reset --hard 14868ca69483afbaa8e28d4e281c148d1dad1c89 10 | 11 | tools/run-after-git-clone 12 | pip install -e .[dev] 13 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/pyukbb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fetch the latest available version 4 | wget https://storage.googleapis.com/ml4cvd/ml4cvd-master.zip 5 | unzip ml4cvd-master.zip 6 | cd ml4cvd-master/pyukbb 7 | 8 | tools/run-after-git-clone 9 | pip install -e .[dev] 10 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/tensorflow-requirements.txt: -------------------------------------------------------------------------------- 1 | pydot 2 | nibabel==4.0.2 3 | pydicom==1.2.2 4 | seaborn 5 | scikit-image 6 | peakutils 7 | biosppy 8 | imageio 9 | ipywidgets>=7.5.1 10 | bokeh 11 | pillow 12 | notebook 13 | pytest 14 | pytest-xdist 15 | pysam 16 | tensorflow==2.19.0 17 | tensorflow_hub 18 | tensorflow_probability 19 | tensorflow-text 20 | tf-models-official 21 | keras-tuner 22 | numcodecs 23 | beautifulsoup4 24 | lxml 25 | xmltodict 26 | google-cloud-bigquery 27 | google-cloud-bigquery-storage 28 | pandas_gbq 29 | pyarrow 30 | altair 31 | facets-overview 32 | plotnine 33 | vega 34 | ipycanvas>=0.7.0 35 | ipyannotations>=0.2.1 36 | torch==2.2.2 37 | opencv-python 38 | blosc 39 | boto3 40 | ml4ht==0.0.10 41 | google-cloud-storage 42 | umap-learn[plot] 43 | neurite 44 | voxelmorph 45 | pystrum 46 | av 47 | lmdb 48 | -------------------------------------------------------------------------------- /docker/vm_boot_images/config/ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Other necessities 4 | apt-get update 5 | 6 | echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections 7 | 8 | apt-get install -y wget unzip curl python3-pydot python3-pydot-ng graphviz ttf-mscorefonts-installer git pip ffmpeg 9 | 10 | wget https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb 11 | dpkg 
-i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb 12 | cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-local-8138232B-keyring.gpg /usr/share/keyrings/ 13 | apt-get update 14 | apt-get -y install cudnn 15 | -------------------------------------------------------------------------------- /git_secrets_provider_ml4h.txt: -------------------------------------------------------------------------------- 1 | private_key 2 | private_key_id 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module go_ml4h 2 | 3 | go 1.20 4 | 5 | require ( 6 | cloud.google.com/go v0.110.0 // indirect 7 | cloud.google.com/go/bigquery v1.51.2 // indirect 8 | cloud.google.com/go/compute v1.19.0 // indirect 9 | cloud.google.com/go/compute/metadata v0.2.3 // indirect 10 | cloud.google.com/go/iam v0.13.0 // indirect 11 | cloud.google.com/go/storage v1.29.0 // indirect 12 | github.com/andybalholm/brotli v1.0.4 // indirect 13 | github.com/apache/arrow/go/v12 v12.0.0 // indirect 14 | github.com/apache/thrift v0.16.0 // indirect 15 | github.com/carbocation/genomisc v0.0.0-20221110225648-66a475457014 // indirect 16 | github.com/carbocation/pfx v0.0.0-20210408121254-ad6c6d3ac2f0 // indirect 17 | github.com/csimplestring/go-csv v0.0.0-20180328183906-5b8b3cd94f2c // indirect 18 | github.com/goccy/go-json v0.9.11 // indirect 19 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 20 | github.com/golang/protobuf v1.5.3 // indirect 21 | github.com/golang/snappy v0.0.4 // indirect 22 | github.com/google/flatbuffers v2.0.8+incompatible // indirect 23 | github.com/google/go-cmp v0.5.9 // indirect 24 | github.com/google/s2a-go v0.1.0 // indirect 25 | github.com/google/uuid v1.3.0 // indirect 26 | github.com/googleapis/enterprise-certificate-proxy v0.2.3 // indirect 27 | github.com/googleapis/gax-go/v2 v2.8.0 // indirect 28 | github.com/klauspost/asmfmt v1.3.2 // indirect 29 | github.com/klauspost/compress v1.15.9 // indirect 30 | github.com/klauspost/cpuid/v2 v2.0.9 // indirect 31 | github.com/krolaw/zipstream v0.0.0-20180621105154-0a2661891f94 // indirect 32 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect 33 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect 34 | github.com/pierrec/lz4/v4 v4.1.15 // indirect 35 | github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect 36 | github.com/zeebo/xxh3 v1.0.2 // indirect 37 | go.opencensus.io v0.24.0 // indirect 38 | golang.org/x/crypto v0.7.0 // indirect 39 | golang.org/x/mod v0.8.0 // indirect 40 | golang.org/x/net v0.9.0 // indirect 41 | golang.org/x/oauth2 v0.7.0 // indirect 42 | golang.org/x/sync v0.1.0 // indirect 43 | golang.org/x/sys v0.7.0 // indirect 44 | golang.org/x/text v0.9.0 // indirect 45 | golang.org/x/tools v0.6.0 // indirect 46 | golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect 47 | google.golang.org/api v0.118.0 // indirect 48 | google.golang.org/appengine v1.6.7 // indirect 49 | google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect 50 | google.golang.org/grpc v1.55.0 // indirect 51 | google.golang.org/protobuf v1.30.0 // indirect 52 | gopkg.in/guregu/null.v3 v3.5.0 // indirect 53 | ) 54 | -------------------------------------------------------------------------------- /ingest/bulkprocess/README.md: -------------------------------------------------------------------------------- 1 | # Downloading UKBB bulk data (cardiac 
MRI) 2 | 3 | Prepare permissions 4 | 1. Make sure that you have created a `.ukbkey` file containing the application ID on line 1 and the private key on line 2 (directly downloadable as an attachment from the email that you received from the UKBB). This file should not be readable by anyone without proper UKBB permissions, so consider setting this to be user-readable only. 5 | 6 | ```bash 7 | ./ukbunpack 6764.enc .ukbkey 8 | ./ukbconv 6764.enc_ukb bulk -s20216 9 | mv ~/ml/ingest/cmd/downloader/main.go . 10 | go run main.go 11 | ``` 12 | Download data 13 | 1. Download the encrypted file (`ukb21481.enc`) and decrypt it to the encoded file (`ukb21481.enc_ukb`) 14 | 1. Extract the list of all samples with the field of interest. 20208 is Heart MRI Long Axis `ukbconv ukb21481.enc_ukb bulk -s20208` 15 | * Atttempt #2: Try to get all MRI fields at once. `ukbconv ukb21481.enc_ukb bulk -ifields.list` 16 | 1. Inspect: `wc -l ukb21481.bulk` and you can see that there is one entry per person for whom this data exists 17 | 1. You cannot download more than 1,000 samples' bulk files at a time. So, iteratively do it: 18 | * For now, just take 50 19 | * `head -n 50 ukb21481.bulk > heart.50` 20 | * `ukbfetch -bheart.50` *(Note: no space between `-b` and `heart.50`)* -------------------------------------------------------------------------------- /ingest/bulkprocess/field-ids.go: -------------------------------------------------------------------------------- 1 | package bulkprocess 2 | 3 | type FieldID string 4 | 5 | const ( 6 | AorticDistensibility FieldID = "20210" 7 | BloodFlow FieldID = "20213" 8 | CineTagging FieldID = "20211" 9 | SHMOLLI FieldID = "20214" 10 | LVOT FieldID = "20212" 11 | LAX FieldID = "20208" 12 | Scout FieldID = "20207" 13 | SAX FieldID = "20209" 14 | ) 15 | -------------------------------------------------------------------------------- /ingest/bulkprocess/output.go: -------------------------------------------------------------------------------- 1 | package bulkprocess 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/araddon/dateparse" 8 | ) 9 | 10 | type DicomOutput struct { 11 | SampleID string 12 | ZipFile string 13 | FieldID string 14 | Instance string 15 | Index string 16 | Dicom DicomRow 17 | DicomMeta DicomMeta 18 | } 19 | 20 | type DicomRow struct { 21 | Filename string 22 | PatientID string 23 | StudyID string 24 | StudyDescription string 25 | Date string 26 | SeriesID string 27 | SeriesDescription string 28 | Modality string // Not always present 29 | AET string 30 | Host string 31 | } 32 | 33 | func (d DicomRow) ParsedDate() (time.Time, error) { 34 | res, err := dateparse.ParseAny(d.Date) 35 | if err == nil { 36 | return res, nil 37 | } 38 | 39 | // Try some known values that dateparse fails to understand 40 | return time.Parse("02-Jan-2006 15:04:05", d.Date) 41 | } 42 | 43 | func stringSliceToDicomStruct(input []string) (out DicomOutput, err error) { 44 | if l := len(input); l < 9 || l > 10 { 45 | return out, fmt.Errorf("Expected 9 or 10 fields, found %d", l) 46 | } 47 | 48 | out.Dicom.Filename = input[0] 49 | out.Dicom.PatientID = input[1] 50 | out.Dicom.StudyID = input[2] 51 | out.Dicom.StudyDescription = input[3] 52 | out.Dicom.Date = input[4] 53 | out.Dicom.SeriesID = input[5] 54 | out.Dicom.SeriesDescription = input[6] 55 | 56 | if len(input) == 10 { 57 | out.Dicom.Modality = input[7] 58 | out.Dicom.AET = input[8] 59 | out.Dicom.Host = input[9] 60 | } else { 61 | out.Dicom.AET = input[7] 62 | out.Dicom.Host = input[8] 63 | } 64 | 65 | return 66 | } 67 | 
-------------------------------------------------------------------------------- /ingest/bulkprocess/zip-metadata.go: -------------------------------------------------------------------------------- 1 | package bulkprocess 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | "strings" 7 | ) 8 | 9 | type ZipMetadata struct { 10 | SampleID string 11 | FieldID string 12 | Instance string 13 | Index string 14 | } 15 | 16 | func zipPathToMetadata(path string) (ZipMetadata, error) { 17 | filename := filepath.Base(path) 18 | 19 | // Remove .zip 20 | name := strings.Split(filename, ".")[0] 21 | 22 | data := strings.Split(name, "_") 23 | 24 | if len(data) != 4 { 25 | return ZipMetadata{}, fmt.Errorf("Expected filename to be of format sampleID_fieldID_instance_index.zip, but found %d parts instead of 4", len(data)) 26 | } 27 | 28 | return ZipMetadata{data[0], data[1], data[2], data[3]}, nil 29 | } 30 | -------------------------------------------------------------------------------- /ingest/cmd/batcher/batcher.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6a3c75db2e09bbe690b9ed85525a5a71643d86da353cec65a73896fd0e2acc71 3 | size 18518072 4 | -------------------------------------------------------------------------------- /ingest/cmd/batcher/dicom.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "image" 7 | "image/color" 8 | "image/jpeg" 9 | "io" 10 | "io/ioutil" 11 | 12 | "github.com/gradienthealth/dicom" 13 | "github.com/gradienthealth/dicom/dicomtag" 14 | ) 15 | 16 | // Takes in a dicom file (in bytes), outputs one or more jpeg file equivalents 17 | // (in bytes) 18 | func DicomToJpeg(dicomReader io.Reader) ([][]byte, error) { 19 | dcm, err := ioutil.ReadAll(dicomReader) 20 | if err != nil { 21 | return nil, err 22 | } 23 | 24 | p, err := dicom.NewParserFromBytes(dcm, nil) 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | parsedData, err := p.Parse(dicom.ParseOptions{DropPixelData: false}) 30 | if parsedData == nil || err != nil { 31 | return nil, fmt.Errorf("Error reading dicom: %v", err) 32 | } 33 | 34 | var output [][]byte 35 | 36 | for _, elem := range parsedData.Elements { 37 | if elem.Tag != dicomtag.PixelData { 38 | continue 39 | } 40 | 41 | data := elem.Value[0].(dicom.PixelDataInfo) 42 | 43 | for _, frame := range data.Frames { 44 | 45 | // Encapsulated 46 | 47 | if frame.IsEncapsulated { 48 | output = append(output, frame.EncapsulatedData.Data) 49 | continue 50 | } 51 | 52 | // Unencapsulated 53 | 54 | img := image.NewGray16(image.Rect(0, 0, frame.NativeData.Cols, frame.NativeData.Rows)) 55 | for j := 0; j < len(frame.NativeData.Data); j++ { 56 | // for now, assume we're not overflowing uint16, assume gray image 57 | img.SetGray16(j%frame.NativeData.Cols, j/frame.NativeData.Rows, color.Gray16{Y: uint16(frame.NativeData.Data[j][0])}) 58 | } 59 | buf := new(bytes.Buffer) 60 | jpeg.Encode(buf, img, &jpeg.Options{Quality: 100}) 61 | output = append(output, buf.Bytes()) 62 | } 63 | } 64 | 65 | return output, nil 66 | } 67 | -------------------------------------------------------------------------------- /ingest/cmd/batcher/functions.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math/rand" 5 | "time" 6 | ) 7 | 8 | func init() { 9 | // Ensure different folder names on each run 10 | 
rand.Seed(time.Now().UTC().UnixNano()) 11 | } 12 | 13 | // RandOrthoglyphs produces a string of length n randomly. 14 | func RandOrthoglyphs(n int) string { 15 | var letters = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") 16 | lenLetters := len(letters) 17 | b := make([]rune, n) 18 | for i := range b { 19 | b[i] = letters[rand.Intn(lenLetters)] 20 | } 21 | return string(b) 22 | } 23 | -------------------------------------------------------------------------------- /ingest/cmd/build_curl_command.py: -------------------------------------------------------------------------------- 1 | """ 2 | To build curl commands from copy pasted forms from the biobank website 3 | """ 4 | 5 | import sys 6 | 7 | FORM_TEXT = """ 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 | """ 18 | 19 | 20 | NAME = "DOWNLOAD.enc" # Downloaded file's name 21 | 22 | 23 | test = """ 24 |
25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 | """ 33 | 34 | 35 | def get_fields(txt): 36 | i = txt.find('''name="fetch"''') 37 | if i == -1: 38 | print('Fetch form not in text') 39 | return 40 | action, i = get_field(txt, i, '''action="''') 41 | fields = {'action': action} 42 | for field in ['id', 's', 't', 'i', 'v']: 43 | fields[field], i = get_field(txt, i) 44 | return fields 45 | 46 | 47 | def get_field(txt, start, target='''value="'''): 48 | start = txt.find(target, start) 49 | end = txt.find('''"''', start + len(target)) 50 | return txt[start + len(target): end], end 51 | 52 | 53 | def fields_to_curl(name, action, id, s, t, i, v): 54 | return f""" 55 | curl -d "id={id}&s={s}&t={t}&i={i}&v={v}&submit=Fetch" \ 56 | -X POST {action} \ 57 | -o {name} 58 | """ 59 | 60 | 61 | def txt_to_curl(name, txt): 62 | return fields_to_curl(name, **get_fields(txt)) 63 | 64 | # check to see if an argument was provided (single argument with path to form text in a file) 65 | if len(sys.argv) > 1: 66 | try: 67 | with open (sys.argv[1], "r") as form_text_file: 68 | FORM_TEXT = form_text_file.read() 69 | except: 70 | print(f'This program expects the input argument, if provided, to be a path') 71 | print(f'to a file containing the form data from the ukbiobank website.') 72 | exit(1) 73 | 74 | print(txt_to_curl(NAME, FORM_TEXT)) 75 | -------------------------------------------------------------------------------- /ingest/cmd/dicom2jpeg/dicom2jpeg.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:54810ae1323a965d0c24db89967cfc23f7aca642e809f421090e5b27faa1409c 3 | size 17949402 4 | -------------------------------------------------------------------------------- /ingest/cmd/downloader/downloader.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4b997debdc843297b109d57d99ef18fd0e337aba677d94e6370597687d3c7c64 3 | size 2494206 4 | -------------------------------------------------------------------------------- /ingest/cmd/downloader/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/exec" 10 | "time" 11 | ) 12 | 13 | func main() { 14 | // Consume a .bulk file 15 | // Download all data with ukbfetch 16 | 17 | var bulkPath, ukbKey, ukbFetch string 18 | var concurrency int 19 | 20 | flag.StringVar(&bulkPath, "bulk", "", "Path to *.bulk file, as specified by UKBB.") 21 | flag.StringVar(&ukbFetch, "ukbfetch", "ukbfetch", "Path to the ukbfetch utility (if not already in your PATH as ukbfetch).") 22 | flag.StringVar(&ukbKey, "ukbkey", ".ukbkey", "Path to the .ukbkey file with the app ID and special key.") 23 | flag.IntVar(&concurrency, "concurrency", 10, "Number of simultaneous connections to UK Biobank servers.") 24 | 25 | flag.Parse() 26 | 27 | log.Println("Note: This tool only checks for pre-existing files in the order specified by the bulk file.") 28 | 29 | if bulkPath == "" { 30 | flag.PrintDefaults() 31 | os.Exit(1) 32 | } 33 | 34 | f, err := os.Open(bulkPath) 35 | if err != nil { 36 | log.Fatalln(err) 37 | } 38 | 39 | c := csv.NewReader(f) 40 | c.Comma = ' ' 41 | 42 | entries, err := c.ReadAll() 43 | if err != nil { 44 | log.Fatalln(err) 45 | } 46 | 47 | // Note: The UK Biobank updated their rules to permit only 10 simultaneous 48 | // downloads per application in 3/2019. 
49 | log.Println("Using up to", concurrency, "simultaneous downloads") 50 | 51 | // Make it 1-based 52 | concurrency = concurrency - 1 53 | 54 | sem := make(chan bool, concurrency) 55 | 56 | finishedCheckingExisting := false 57 | for i, row := range entries { 58 | exists := false 59 | zipFile := "" 60 | 61 | if !finishedCheckingExisting { 62 | // Since statting on a GCSFuse filesystem is slow, we assume sorted 63 | // order. If that is true, then once we stop finding files we have 64 | // already downloaded, we can stop checking. 65 | for _, suffix := range []string{"zip", "cram", "cram.crai"} { 66 | zipFile = fmt.Sprintf("%s_%s.%s", row[0], row[1], suffix) 67 | 68 | // If we already downloaded this file, skip it 69 | if _, err := os.Stat(zipFile); !os.IsNotExist(err) { 70 | log.Println(i, len(entries), "Already downloaded", zipFile) 71 | exists = true 72 | break 73 | } 74 | } 75 | } 76 | 77 | if exists { 78 | continue 79 | } 80 | 81 | finishedCheckingExisting = true 82 | 83 | log.Println(i, len(entries), "Downloading", zipFile) 84 | 85 | sem <- true 86 | go func(row []string) { 87 | defer func() { <-sem }() 88 | 89 | nErrors := 0 90 | for { 91 | if out, err := exec.Command(ukbFetch, fmt.Sprintf("-a%s", ukbKey), fmt.Sprintf("-e%s", row[0]), fmt.Sprintf("-d%s", row[1])).CombinedOutput(); err != nil && nErrors < 3 { 92 | nErrors++ 93 | log.Println(fmt.Errorf("Output: %s | Error: %s", string(out), err.Error())) 94 | log.Println("Sleeping 30 seconds and retrying") 95 | time.Sleep(30 * time.Second) 96 | continue 97 | } 98 | 99 | // If we already errored 3x or we had no error, break the loop 100 | break 101 | } 102 | }(append([]string{}, row...)) 103 | 104 | } 105 | 106 | for i := 0; i < cap(sem); i++ { 107 | sem <- true 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/gene2chrpos.osx: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5d83cce426c001f4e40e50ab096e0450b3ffc5e936bb240a0c04cb989b49f087 3 | size 12456236 4 | -------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/lookups/ensembl.grch37.p13.genes: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3415c81f7b51ccd8ead5928b06aece7e146c575560e30f420c59f93f7067e2a6 3 | size 20482401 4 | -------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/lookups/url.txt: -------------------------------------------------------------------------------- 1 | http://grch37.ensembl.org/biomart/martview/6f9488c78379ccab56985d13f802be0f?VIRTUALSCHEMANAME=default&ATTRIBUTES=hsapiens_gene_ensembl.default.feature_page.ensembl_gene_id|hsapiens_gene_ensembl.default.feature_page.ensembl_transcript_id|hsapiens_gene_ensembl.default.feature_page.ensembl_peptide_id|hsapiens_gene_ensembl.default.feature_page.chromosome_name|hsapiens_gene_ensembl.default.feature_page.start_position|hsapiens_gene_ensembl.default.feature_page.end_position|hsapiens_gene_ensembl.default.feature_page.strand|hsapiens_gene_ensembl.default.feature_page.transcript_start|hsapiens_gene_ensembl.default.feature_page.transcript_end|hsapiens_gene_ensembl.default.feature_page.transcript_length|hsapiens_gene_ensembl.default.feature_page.external_gene_name&FILTERS=&VISIBLEPANEL=attributepanel 
-------------------------------------------------------------------------------- /ingest/cmd/gene2chrpos/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/csv" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | "strings" 11 | 12 | "github.com/gobuffalo/packr" 13 | ) 14 | 15 | const ( 16 | GeneStableID int = iota 17 | TranscriptStableID 18 | ProteinStableID 19 | Chromosome 20 | GeneStartOneBased 21 | GeneEndOneBased 22 | Strand 23 | TranscriptStartOneBased 24 | TranscriptEndOneBased 25 | TranscriptLengthIncludingUTRAndCDS 26 | GeneName 27 | ) 28 | 29 | func main() { 30 | var geneName string 31 | 32 | flag.StringVar(&geneName, "gene", "", "Name of the gene whose GRCH37 transcript's chr:pos you would like to lookup.") 33 | flag.Parse() 34 | 35 | if geneName == "" { 36 | flag.PrintDefaults() 37 | return 38 | } 39 | 40 | if err := Lookup(geneName); err != nil { 41 | log.Fatalln(err) 42 | } 43 | } 44 | 45 | func Lookup(geneName string) error { 46 | lookups := packr.NewBox("./lookups") 47 | 48 | file := lookups.Bytes("ensembl.grch37.p13.genes") 49 | buf := bytes.NewBuffer(file) 50 | cr := csv.NewReader(buf) 51 | cr.Comma = '\t' 52 | 53 | results := make([][]string, 0) 54 | 55 | header := make([]string, 0) 56 | var i int64 57 | for { 58 | rec, err := cr.Read() 59 | if err != nil && err == io.EOF { 60 | break 61 | } else if err != nil { 62 | return err 63 | } 64 | 65 | i++ 66 | if i == 1 { 67 | header = append(header, rec...) 68 | 69 | continue 70 | } 71 | 72 | if rec[GeneName] != geneName { 73 | continue 74 | } 75 | 76 | strand := "-" 77 | if rec[Strand] == "1" { 78 | strand = "+" 79 | } 80 | 81 | results = append(results, []string{rec[GeneName], rec[Chromosome], rec[TranscriptStartOneBased], rec[TranscriptEndOneBased], rec[TranscriptLengthIncludingUTRAndCDS], strand}) 82 | } 83 | 84 | if len(results) < 1 { 85 | return fmt.Errorf("No results were found for %s. 
Were you using a transcript name instead of a gene name?", geneName) 86 | } 87 | 88 | fmt.Println("Gene\tChromosome\tTranscriptStart\tTranscriptEnd\tTranscriptLength\tStrand") 89 | for _, result := range results { 90 | fmt.Println(strings.Join(result, "\t")) 91 | } 92 | 93 | return nil 94 | } 95 | -------------------------------------------------------------------------------- /ingest/cmd/manifester/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "io/ioutil" 7 | "log" 8 | "os" 9 | "runtime" 10 | "strings" 11 | 12 | "github.com/broadinstitute/ml4h/go/bulkprocess" 13 | ) 14 | 15 | func main() { 16 | // Makes one big combined manifest 17 | // Emits to stdout 18 | 19 | var path string 20 | 21 | flag.StringVar(&path, "path", "./", "Path where the UKBB bulk .zip files are being held.") 22 | 23 | flag.Parse() 24 | 25 | files, err := ioutil.ReadDir(path) 26 | if err != nil { 27 | log.Fatalln(err) 28 | } 29 | 30 | // Read each zip (names are significant) 31 | fmt.Printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", 32 | "sample_id", 33 | "field_id", 34 | "instance", 35 | "index", 36 | "zip_file", 37 | "dicom_file", 38 | "series", 39 | "date", 40 | "instance_number", 41 | "overlay_text", 42 | "overlay_fraction", 43 | "overlay_rows", 44 | "overlay_cols", 45 | "image_x", 46 | "image_y", 47 | "image_z", 48 | ) 49 | 50 | concurrency := 4 * runtime.NumCPU() 51 | 52 | results := make(chan string, concurrency) 53 | doneListening := make(chan struct{}) 54 | go func() { 55 | defer func() { doneListening <- struct{}{} }() 56 | // Serialize results so you don't dump text haphazardly into os.Stdout 57 | // (which is not goroutine safe). 58 | for { 59 | select { 60 | case res, ok := <-results: 61 | if !ok { 62 | return 63 | } 64 | 65 | fmt.Println(res) 66 | } 67 | } 68 | 69 | }() 70 | 71 | semaphore := make(chan struct{}, concurrency) 72 | 73 | for _, file := range files { 74 | 75 | // Will block after `concurrency` simultaneous goroutines are running 76 | semaphore <- struct{}{} 77 | 78 | go func(file os.FileInfo) { 79 | 80 | // Be sure to permit unblocking once we finish 81 | defer func() { <-semaphore }() 82 | 83 | if !strings.HasSuffix(file.Name(), ".zip") { 84 | return 85 | } 86 | 87 | err := bulkprocess.CardiacMRIZipIterator(path+file.Name(), func(dcm bulkprocess.DicomOutput) error { 88 | if err := PrintCSVRow(dcm, results); err != nil { 89 | log.Printf("Error parsing %+v\n", dcm) 90 | return err 91 | } 92 | 93 | return nil 94 | }) 95 | if err != nil { 96 | log.Println("Error parsing", path+file.Name()) 97 | log.Fatalln(err) 98 | } 99 | }(file) 100 | } 101 | 102 | // Make sure we finish all the reads before we exit, otherwise we'll lose 103 | // the last `concurrency` lines. 
104 | for i := 0; i < cap(semaphore); i++ { 105 | semaphore <- struct{}{} 106 | } 107 | 108 | // Close the results channel and make sure we are done listening 109 | close(results) 110 | <-doneListening 111 | } 112 | 113 | func PrintCSVRow(row bulkprocess.DicomOutput, results chan<- string) error { 114 | studyDate, err := row.Dicom.ParsedDate() 115 | if err != nil { 116 | return err 117 | } 118 | 119 | overlayText := "NoOverlay" 120 | if row.DicomMeta.HasOverlay { 121 | overlayText = "HasOverlay" 122 | } 123 | 124 | results <- fmt.Sprintf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.8f\t%d\t%d\t%.2f\t%.2f\t%.2f", 125 | row.SampleID, row.FieldID, row.Instance, row.Index, row.ZipFile, 126 | row.Dicom.Filename, row.Dicom.SeriesDescription, studyDate.Format("2006-01-02"), 127 | row.DicomMeta.InstanceNumber, overlayText, row.DicomMeta.OverlayFraction, row.DicomMeta.OverlayRows, row.DicomMeta.OverlayCols, 128 | row.DicomMeta.PatientX, row.DicomMeta.PatientY, row.DicomMeta.PatientZ) 129 | return nil 130 | } 131 | -------------------------------------------------------------------------------- /ingest/cmd/manifester/manifester.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:15317181e392337e49d20a4e5bc04e96e19cd3b9880e0bed0daee4dedd8a1413 3 | size 18546323 4 | -------------------------------------------------------------------------------- /ingest/cmd/merge-lvef/merge-lvef.linux: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:520b3aab986215f83f816ff157e422662e94ced2d5311f2126ec3f4787fba6c3 3 | size 2390746 4 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/censor_result.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "time" 7 | 8 | "gopkg.in/guregu/null.v3" 9 | ) 10 | 11 | type CensorResult struct { 12 | SampleID int64 13 | 14 | // In the database, populate with a list of fields that we would like to 15 | // have (e.g., month of birth, lost to followup) but which were not present, 16 | // so we know if the table was constructed from incomplete data 17 | Missing []string 18 | 19 | // Guaranteed 20 | enrolled time.Time 21 | computed time.Time // Date this was computed 22 | 23 | // May be null appropriately 24 | died null.Time 25 | lost null.Time 26 | 27 | // Unsure 28 | phenoCensored time.Time 29 | deathCensored time.Time 30 | 31 | // convenience / not exported 32 | bornYear string 33 | bornMonth string 34 | } 35 | 36 | func (s CensorResult) Born() time.Time { 37 | // If we know year + month, then neutral assumption is that birthday is on 38 | // the middle day of the month. If we just know year, then assumption is 39 | // being born midway through the year (July 2). 
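// For example, a bornYear of "1948" with bornMonth "11" parses to 1948-11-15,
// while a missing month falls back to July 2 of that year (1948-07-02).
// If parsing fails, the zero time.Time is returned.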
40 | month := s.bornMonth 41 | day := "15" 42 | 43 | if month == "" { 44 | month = "7" 45 | day = "02" 46 | } 47 | 48 | dt, err := time.Parse("2006-01-02", fmt.Sprintf("%04s-%02s-%02s", s.bornYear, month, day)) 49 | if err != nil { 50 | return time.Time{} 51 | } 52 | 53 | return dt 54 | } 55 | 56 | func (s CensorResult) DiedString() string { 57 | if !s.died.Valid { 58 | return NullMarker 59 | } 60 | 61 | return TimeToUKBDate(s.died.Time) 62 | } 63 | 64 | func (s CensorResult) DeathCensored() time.Time { 65 | if s.died.Valid { 66 | return s.died.Time 67 | } 68 | 69 | if s.lost.Valid { 70 | return s.lost.Time 71 | } 72 | 73 | return s.deathCensored 74 | } 75 | 76 | func (s CensorResult) PhenoCensored() time.Time { 77 | if s.died.Valid { 78 | return s.died.Time 79 | } 80 | 81 | if s.lost.Valid { 82 | return s.lost.Time 83 | } 84 | 85 | return s.phenoCensored 86 | } 87 | 88 | func (s CensorResult) MissingToString() string { 89 | if res := strings.Join(s.Missing, "|"); len(res) > 0 { 90 | return res 91 | } 92 | 93 | return "NA" 94 | } 95 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "log" 7 | "os" 8 | "cloud.google.com/go/bigquery" 9 | ) 10 | 11 | const NullMarker = "NA" 12 | 13 | type SamplePheno struct { 14 | SampleID int64 `bigquery:"sample_id"` 15 | Value string `bigquery:"value"` 16 | FieldID int64 `bigquery:"FieldID"` 17 | Instance int64 `bigquery:"instance"` 18 | ArrayIDX int64 `bigquery:"array_idx"` 19 | CodingFileID bigquery.NullInt64 `bigquery:"coding_file_id"` 20 | } 21 | 22 | type WrappedBigQuery struct { 23 | Context context.Context 24 | Client *bigquery.Client 25 | Project string 26 | Database string 27 | } 28 | 29 | func main() { 30 | var ( 31 | phenoCensorDateString string 32 | deathCensorDateString string 33 | BQ = &WrappedBigQuery{} 34 | ) 35 | 36 | flag.StringVar(&phenoCensorDateString, "pheno_censor", "", "With format YYYY-MM-DD, please provide the Hospital Data censor date from https://biobank.ctsu.ox.ac.uk/crystal/exinfo.cgi?src=Data_providers_and_dates") 37 | flag.StringVar(&deathCensorDateString, "death_censor", "", "With format YYYY-MM-DD, please provide the Death censor date from https://biobank.ctsu.ox.ac.uk/crystal/exinfo.cgi?src=Data_providers_and_dates") 38 | flag.StringVar(&BQ.Project, "project", "broad-ml4cvd", "Name of the Google Cloud project that hosts your BigQuery database instance") 39 | flag.StringVar(&BQ.Database, "bigquery", "", "BigQuery source database name") 40 | flag.Parse() 41 | 42 | if phenoCensorDateString == "" || deathCensorDateString == "" || BQ.Project == "" || BQ.Database == "" { 43 | flag.PrintDefaults() 44 | os.Exit(1) 45 | } 46 | 47 | log.Println("Using bigquery database", BQ.Database) 48 | log.Println("Output uses", NullMarker, "in place of null values. 
Please specify this when loading data into bigquery.") 49 | 50 | log.Println("Producing censoring table") 51 | if err := Censor(BQ, deathCensorDateString, phenoCensorDateString); err != nil { 52 | log.Fatalln(err) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/query_single.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "cloud.google.com/go/bigquery" 7 | "google.golang.org/api/iterator" 8 | ) 9 | 10 | func BigQuerySingleFieldFirst(wbq *WrappedBigQuery, fieldID int64) (map[int64]string, error) { 11 | out := make(map[int64]string) 12 | 13 | query := wbq.Client.Query(fmt.Sprintf(`SELECT * 14 | FROM %s.phenotype 15 | WHERE 1=1 16 | AND FieldID=@FieldID 17 | ORDER BY instance ASC, array_idx ASC 18 | 19 | -- Uncomment for testing 20 | -- ORDER BY sample_id DESC 21 | -- LIMIT 10 22 | `, wbq.Database)) 23 | 24 | query.QueryConfig.Parameters = append(query.QueryConfig.Parameters, []bigquery.QueryParameter{ 25 | {Name: "FieldID", Value: fieldID}, 26 | }...) 27 | 28 | itr, err := query.Read(wbq.Context) 29 | if err != nil { 30 | return nil, err 31 | } 32 | for { 33 | var values SamplePheno 34 | err := itr.Next(&values) 35 | if err == iterator.Done { 36 | break 37 | } 38 | if err != nil { 39 | return nil, err 40 | } 41 | 42 | // Take only the first, since we use this for things like enrollment 43 | // date. If someone came to a follow-up visit, we don't want to say that 44 | // they "enrolled" at the time of their follow-up, for example. Relies 45 | // on sort order specified above in the query. 46 | if _, exists := out[values.SampleID]; exists { 47 | continue 48 | } 49 | out[values.SampleID] = values.Value 50 | } 51 | 52 | return out, nil 53 | } 54 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/censor/time_handling.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | func TimeToUKBDate(t time.Time) string { 9 | if t.Equal(time.Time{}) { 10 | return NullMarker 11 | } 12 | 13 | return t.Format("2006-01-02") 14 | } 15 | 16 | func TimesToFractionalYears(earlier, later time.Time) string { 17 | if later.Before(earlier) { 18 | return NullMarker 19 | } 20 | y, m, d, h, min, sec := time_diff(earlier, later) 21 | 22 | return fmt.Sprintf("%.6f", float64(y)+float64(m)/12+float64(d)/(12*30)+float64(h)/(24*365)+float64(min)/(60*24*365)+float64(sec)/(60*60*24*365)) 23 | } 24 | 25 | // Taken directly from https://stackoverflow.com/a/36531443/199475 26 | func time_diff(a, b time.Time) (year, month, day, hour, min, sec int) { 27 | if a.Location() != b.Location() { 28 | b = b.In(a.Location()) 29 | } 30 | if a.After(b) { 31 | a, b = b, a 32 | } 33 | y1, M1, d1 := a.Date() 34 | y2, M2, d2 := b.Date() 35 | 36 | h1, m1, s1 := a.Clock() 37 | h2, m2, s2 := b.Clock() 38 | 39 | year = int(y2 - y1) 40 | month = int(M2 - M1) 41 | day = int(d2 - d1) 42 | hour = int(h2 - h1) 43 | min = int(m2 - m1) 44 | sec = int(s2 - s1) 45 | 46 | // Normalize negative values 47 | if sec < 0 { 48 | sec += 60 49 | min-- 50 | } 51 | if min < 0 { 52 | min += 60 53 | hour-- 54 | } 55 | if hour < 0 { 56 | hour += 24 57 | day-- 58 | } 59 | if day < 0 { 60 | // days in month: 61 | t := time.Date(y1, M1, 32, 0, 0, 0, 0, time.UTC) 62 | day += 32 - t.Day() 63 | month-- 64 | } 65 | if month < 0 { 66 | month += 12 67 | year-- 
68 | } 69 | 70 | return 71 | } 72 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertcoding/cc_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "testing" 4 | 5 | func Test2(t *testing.T) { 6 | main() 7 | } 8 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertcoding/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/csv" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | "net/http" 11 | "strings" 12 | ) 13 | 14 | const ( 15 | ExpectedRows = 3 16 | ) 17 | 18 | func main() { 19 | var ( 20 | codingPath string 21 | ) 22 | 23 | flag.StringVar(&codingPath, "coding", "https://raw.githubusercontent.com/OxWearables/ukb_download_and_prep_template/main/Codings_Showcase.csv", "URL to CSV file with the UKBB data encodings") 24 | flag.Parse() 25 | 26 | if codingPath == "" { 27 | flag.PrintDefaults() 28 | log.Fatalln() 29 | } 30 | 31 | if err := ImportCoding(codingPath); err != nil { 32 | log.Fatalln(err) 33 | } 34 | } 35 | 36 | func ImportCoding(url string) error { 37 | log.Printf("Importing from %s\n", url) 38 | 39 | resp, err := http.Get(url) 40 | if err != nil { 41 | return err 42 | } 43 | reader := csv.NewReader(resp.Body) 44 | reader.Comma = ',' 45 | reader.LazyQuotes = true 46 | 47 | header := make([]string, 0) 48 | j := 0 49 | for ; ; j++ { 50 | row, err := reader.Read() 51 | if err != nil && err == io.EOF { 52 | resp.Body.Close() 53 | break 54 | } else if err != nil { 55 | buf := bytes.NewBuffer(nil) 56 | io.Copy(buf, resp.Body) 57 | if strings.Contains(buf.String(), "internal error") { 58 | log.Println("Coding File is not permitted to be downloaded from the UKBB") 59 | continue 60 | } 61 | } 62 | 63 | // Handle the header 64 | if j == 0 { 65 | log.Printf("Header (%d elements): %+v\n", len(row), row) 66 | header = append(header, row...) 
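// Rename the showcase header columns to the names expected by the BigQuery
// coding table schema (importcoding/coding.json):
// Coding -> coding_file_id, Value -> coding, Meaning -> meaning.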
67 | for k, v := range header { 68 | if v == "Coding" { 69 | header[k] = "coding_file_id" 70 | } else if v == "Value" { 71 | header[k] = "coding" 72 | } else if v == "Meaning" { 73 | header[k] = "meaning" 74 | } 75 | } 76 | 77 | if nCols := len(header); nCols != ExpectedRows { 78 | return fmt.Errorf("Expected a CSV with %d columns; got one with %d", ExpectedRows, nCols) 79 | } 80 | 81 | fmt.Println(strings.Join(header, "\t")) 82 | 83 | continue 84 | } 85 | 86 | // Handle the entries 87 | if len(row) == ExpectedRows { 88 | fmt.Println(strings.Join(row, "\t")) 89 | } 90 | } 91 | 92 | log.Println("Created coding file with", j, "entries") 93 | 94 | return nil 95 | } 96 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertdict/cd_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "testing" 4 | 5 | func Test4(t *testing.T) { 6 | main() 7 | } 8 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertdict/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/csv" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | 11 | // "os" 12 | 13 | "net/http" 14 | "strings" 15 | ) 16 | 17 | const ( 18 | ExpectedRows = 17 19 | ) 20 | 21 | func main() { 22 | var ( 23 | dictPath string 24 | ) 25 | 26 | flag.StringVar(&dictPath, "dict", "https://biobank.ndph.ox.ac.uk/~bbdatan/Data_Dictionary_Showcase.tsv", "URL to CSV file with the UKBB data dictionary") 27 | // flag.StringVar(&dictPath, "dict", "/home/anamika/ml4h/data_dictionary/Data_Dictionary_Showcase.tsv", "URL to CSV file with the UKBB data dictionary") 28 | flag.Parse() 29 | 30 | if dictPath == "" { 31 | flag.PrintDefaults() 32 | log.Fatalln() 33 | } 34 | 35 | if err := ImportDictionary(dictPath); err != nil { 36 | log.Fatalln(err) 37 | } 38 | } 39 | 40 | func ImportDictionary(url string) error { 41 | log.Printf("Importing from %s\n", url) 42 | 43 | resp, err := http.Get(url) 44 | // resp, err := os.Open(url) 45 | if err != nil { 46 | return err 47 | } 48 | reader := csv.NewReader(resp.Body) 49 | // reader := csv.NewReader(resp) 50 | reader.Comma = '\t' 51 | reader.LazyQuotes = true 52 | 53 | header := make([]string, 0) 54 | j := 0 55 | for ; ; j++ { 56 | // log.Printf("Count J %d\n", j) 57 | row, err := reader.Read() 58 | if err != nil && err == io.EOF { 59 | resp.Body.Close() 60 | // resp.Close() 61 | break 62 | } else if err != nil { 63 | buf := bytes.NewBuffer(nil) 64 | io.Copy(buf, resp.Body) 65 | // io.Copy(buf, resp) 66 | if strings.Contains(buf.String(), "internal error") { 67 | log.Println("Dictionary File is not permitted to be downloaded from the UKBB") 68 | continue 69 | } 70 | } 71 | 72 | // Handle the header 73 | if j == 0 { 74 | log.Printf("Header (%d elements): %+v\n", len(row), row) 75 | header = append(header, row...) 
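// Only the "Coding" column is renamed here; coding_file_id matches the column
// name used by the coding table, so the dictionary table can be joined against it.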
76 | for k, v := range header { 77 | if v == "Coding" { 78 | header[k] = "coding_file_id" 79 | break 80 | } 81 | } 82 | 83 | if nCols := len(header); nCols != ExpectedRows { 84 | return fmt.Errorf("Expected a CSV with %d columns; got one with %d", ExpectedRows, nCols) 85 | } 86 | 87 | fmt.Println(strings.Join(header, "\t")) 88 | 89 | continue 90 | } 91 | 92 | // Handle the entries 93 | if len(row) == ExpectedRows { 94 | fmt.Println(strings.Join(row, "\t")) 95 | } 96 | } 97 | 98 | log.Println("Created dictionary file with", j, "entries") 99 | 100 | return nil 101 | } 102 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertpheno/flagslice.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "strings" 4 | 5 | type flagSlice []string 6 | 7 | func (i *flagSlice) String() string { 8 | if i == nil { 9 | return "" 10 | } 11 | 12 | return strings.Join([]string(*i), "\t") 13 | } 14 | 15 | func (i *flagSlice) Set(value string) error { 16 | *i = append(*i, value) 17 | return nil 18 | } 19 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/convertsample/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "flag" 6 | "fmt" 7 | "io" 8 | "log" 9 | "os" 10 | 11 | "github.com/carbocation/genomisc" 12 | ) 13 | 14 | const ( 15 | // .sample file field columns 16 | ID_1 = iota 17 | ID_2 18 | missing 19 | sex 20 | ) 21 | 22 | func main() { 23 | var ( 24 | samplePath string 25 | ) 26 | 27 | flag.StringVar(&samplePath, "sample", "", "genotyping .sample file for the UKBB") 28 | flag.Parse() 29 | 30 | if samplePath == "" { 31 | flag.PrintDefaults() 32 | os.Exit(1) 33 | } 34 | 35 | samplePath = genomisc.ExpandHome(samplePath) 36 | log.Printf("Importing %s\n", samplePath) 37 | 38 | // .sample file 39 | 40 | f, err := os.Open(samplePath) 41 | if err != nil { 42 | log.Fatalln(err) 43 | } 44 | defer f.Close() 45 | 46 | delim := genomisc.DetermineDelimiter(f) 47 | 48 | f.Seek(0, 0) 49 | fileCSV := csv.NewReader(f) 50 | fileCSV.Comma = delim 51 | 52 | // .sample files have 2 header rows that we will discard 53 | fileCSV.Read() 54 | fileCSV.Read() 55 | 56 | i := 0 57 | fmt.Printf("sample_id\tfile_row\n") 58 | for ; ; i++ { 59 | row, err := fileCSV.Read() 60 | if err != nil && err == io.EOF { 61 | break 62 | } else if err != nil { 63 | log.Fatalln(err) 64 | } 65 | 66 | fmt.Printf("%s\t%d\n", row[ID_1], i) 67 | } 68 | 69 | log.Println("Extracted", i, "records from the .sample file") 70 | } 71 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/decrypt_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | 6 | #this script takes a folder of .enc files from UKBB, keys, and produces .csv.gz 7 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | enc_directory=${__dir}/uk_biobank_4_1_2019 9 | 10 | for file in 28112 23300 23301 23302 11 | do 12 | ${__dir}/ukbunpack ${enc_directory}/ukb${file}.enc ${__dir}/k17488_${file}.key 13 | ${__dir}/ukbconv ${enc_directory}/ukb${file}.enc_ukb csv 14 | gzip ${enc_directory}/ukb${file}.csv 15 | done 16 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/firstdate/README.md: 
-------------------------------------------------------------------------------- 1 | #THIS IS ARCHIVAL -- no code/notes here are meant to be used, but we want to keep the notes around 2 | # Dates in the UK Biobank 3 | 4 | ## Attended assessment center 5 | 6 | * Date FieldID 53 7 | 8 | Useful for: 9 | 10 | 1. Defining threshold date for incidence 11 | 1. Defining dates for things that don't otherwise have an associated date 12 | 13 | ## Birth 14 | 15 | * Date FieldID 34: Year of birth 16 | * Date FieldID 52: Month of birth 17 | * Date Field 33: birth date (*Note: this field is restricted due to its precision*) 18 | 19 | ## Lost to follow-up 20 | 21 | * Date FieldID 191 22 | 23 | ## Died 24 | 25 | * Date FieldID 40000 26 | 27 | ## ICD10 28 | 29 | * Date FieldID ==> derived from HESIN 30 | * Main ICD10: 41202 31 | * Secondary ICD10: 41204 32 | * ICD10 Primary Cause of Death: 40001 33 | * ICD10 Secondary Cause of Death: 40002 34 | 35 | ## ICD9 36 | 37 | * Date FieldID ==> derived from HESIN 38 | * Main ICD9: 41203 39 | * Secondary ICD9: 41205 40 | 41 | ## Operation (OPCS4) 42 | 43 | * Date FieldID ==> derived from HESIN 44 | * Main OPCS4: 41200 45 | * Secondary OPCS4: 41210 46 | * Self-reported: 47 | * FieldID: 20004 48 | * *float32 Year*: 20010 (need to truncate and add month/day) 49 | 50 | ## Special cases 51 | 52 | * Myocardial infarction: 53 | * FieldID: 42001 54 | * Date: 42000 55 | * Non-cancer illness: 56 | * FieldID: 20002 57 | * Date: 20008 58 | * Cancer: 59 | * FieldID: 20001 60 | * Date: 20006 61 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/firstdate/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func main() { 4 | // http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=41253 5 | 6 | // Run and merge from 3 separate queries: 7 | // 8 | // 1) for ICD/opcode fields, look into hesin 9 | // 2) for special fields with dates, look at their dates 10 | // 3) for all other fields, use the enrollment date based on their array_idx 11 | 12 | // assign ICDs to their transformed FieldIDs - different for main and secondary 13 | 14 | // Then, the output: 15 | // SampleID FieldID Value Date 16 | 17 | // Then left join on censor 18 | // GROUP BY sample_id 19 | 20 | // Downstream: 21 | // Pass 1: Fetch censor data for everyone 22 | // Pass 2: 23 | } 24 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcensor/censor.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "sample_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "NULLABLE", 9 | "name": "birthdate", 10 | "type": "DATE" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "enroll_date", 15 | "type": "DATE" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "enroll_age", 20 | "type": "FLOAT" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "death_date", 25 | "type": "DATE" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "death_age", 30 | "type": "FLOAT" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "death_censor_date", 35 | "type": "DATE" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "death_censor_age", 40 | "type": "FLOAT" 41 | }, 42 | { 43 | "mode": "NULLABLE", 44 | "name": "phenotype_censor_date", 45 | "type": "DATE" 46 | }, 47 | { 48 | "mode": "NULLABLE", 49 | "name": "phenotype_censor_age", 50 | "type": "FLOAT" 51 | }, 52 | { 53 | "mode": 
"NULLABLE", 54 | "name": "lost_to_followup_date", 55 | "type": "DATE" 56 | }, 57 | { 58 | "mode": "NULLABLE", 59 | "name": "lost_to_followup_age", 60 | "type": "FLOAT" 61 | }, 62 | { 63 | "mode": "NULLABLE", 64 | "name": "computed_date", 65 | "type": "DATE" 66 | }, 67 | { 68 | "mode": "NULLABLE", 69 | "name": "missing_fields", 70 | "type": "STRING" 71 | } 72 | ] 73 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcensor/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. "ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | bq --location=${GEO} load \ 14 | --field_delimiter "\t" \ 15 | --quote "" \ 16 | --replace \ 17 | --source_format=CSV \ 18 | --null_marker "NA" \ 19 | --skip_leading_rows 1 \ 20 | ${DATASET}.censor ${BUCKET}/censor.tsv.gz \ 21 | ${__dir}/censor.json 22 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcoding/coding.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "coding_file_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "coding", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "meaning", 15 | "type": "STRING" 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importcoding/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. 
"ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | bq --location=${GEO} load \ 14 | --field_delimiter "\t" \ 15 | --replace \ 16 | --quote "" \ 17 | --source_format=CSV \ 18 | --skip_leading_rows 1 \ 19 | ${DATASET}.coding ${BUCKET}/coding.tsv.gz \ 20 | ${__dir}/coding.json 21 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importdict/dictionary.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "Path", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "Category", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "FieldID", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "Field", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "REQUIRED", 24 | "name": "Participants", 25 | "type": "INTEGER" 26 | }, 27 | { 28 | "mode": "REQUIRED", 29 | "name": "Items", 30 | "type": "INTEGER" 31 | }, 32 | { 33 | "mode": "REQUIRED", 34 | "name": "Stability", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "REQUIRED", 39 | "name": "ValueType", 40 | "type": "STRING" 41 | }, 42 | { 43 | "name": "Units", 44 | "type": "STRING" 45 | }, 46 | { 47 | "mode": "REQUIRED", 48 | "name": "ItemType", 49 | "type": "STRING" 50 | }, 51 | { 52 | "mode": "REQUIRED", 53 | "name": "Strata", 54 | "type": "STRING" 55 | }, 56 | { 57 | "mode": "REQUIRED", 58 | "name": "Sexed", 59 | "type": "STRING" 60 | }, 61 | { 62 | "mode": "REQUIRED", 63 | "name": "Instances", 64 | "type": "INTEGER" 65 | }, 66 | { 67 | "mode": "REQUIRED", 68 | "name": "Array", 69 | "type": "INTEGER" 70 | }, 71 | { 72 | "name": "coding_file_id", 73 | "type": "INTEGER" 74 | }, 75 | { 76 | "name": "Notes", 77 | "type": "STRING" 78 | }, 79 | { 80 | "mode": "REQUIRED", 81 | "name": "Link", 82 | "type": "STRING" 83 | } 84 | ] 85 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importdict/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. 
"ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | #note hardcoded dictionary.json, dictionary.tsv.gz, dictionary table 14 | # Special for dict: need to disable quotes 15 | bq --location=${GEO} load \ 16 | --field_delimiter "\t" \ 17 | --replace \ 18 | --quote "" \ 19 | --source_format=CSV \ 20 | --skip_leading_rows 1 \ 21 | ${DATASET}.dictionary ${BUCKET}/dictionary.tsv.gz \ 22 | ${__dir}/dictionary.json 23 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_diag.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "ins_index", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "level", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "diag_icd9", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "diag_icd9_nb", 30 | "type": "INTEGER" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "diag_icd10", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "diag_icd10_nb", 40 | "type": "STRING" 41 | } 42 | ] 43 | 44 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_diag10.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "diag_icd10", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "diag_icd10_nb", 25 | "type": "STRING" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_diag9.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "diag_icd9", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "diag_icd9_nb", 25 | "type": "STRING" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_lubitz.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "admidate", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "cause_icd10", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "cause_icd10_nb", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "diag_icd10", 30 | "type": "STRING" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | 
"name": "diag_icd10_nb", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "diag_icd9", 40 | "type": "STRING" 41 | }, 42 | { 43 | "mode": "NULLABLE", 44 | "name": "diag_icd9_nb", 45 | "type": "STRING" 46 | }, 47 | { 48 | "mode": "NULLABLE", 49 | "name": "disdate", 50 | "type": "STRING" 51 | }, 52 | { 53 | "mode": "NULLABLE", 54 | "name": "epiend", 55 | "type": "STRING" 56 | }, 57 | { 58 | "mode": "NULLABLE", 59 | "name": "epistart", 60 | "type": "STRING" 61 | }, 62 | { 63 | "mode": "NULLABLE", 64 | "name": "opdate", 65 | "type": "STRING" 66 | }, 67 | { 68 | "mode": "NULLABLE", 69 | "name": "oper4", 70 | "type": "STRING" 71 | }, 72 | { 73 | "mode": "NULLABLE", 74 | "name": "oper4_nb", 75 | "type": "STRING" 76 | }, 77 | { 78 | "mode": "NULLABLE", 79 | "name": "operstat", 80 | "type": "INTEGER" 81 | }, 82 | { 83 | "mode": "NULLABLE", 84 | "name": "source", 85 | "type": "STRING" 86 | } 87 | ] 88 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/hesin_oper.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "NULLABLE", 9 | "name": "ins_index", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "arr_index", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "level", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "opdate", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "oper3", 30 | "type": "STRING" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "oper3_nb", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "oper4", 40 | "type": "STRING" 41 | }, 42 | { 43 | "mode": "NULLABLE", 44 | "name": "oper4_nb", 45 | "type": "STRING" 46 | }, 47 | { 48 | "mode": "NULLABLE", 49 | "name": "posopdur", 50 | "type": "INTEGER" 51 | }, 52 | { 53 | "mode": "NULLABLE", 54 | "name": "preopdur", 55 | "type": "INTEGER" 56 | } 57 | ] 58 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importhesin/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. "ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | 14 | #for lubitz, must replace hesin.json with hesin_lubitz.json 15 | #for NAME in hesin hesin_diag10 hesin_diag9 hesin_oper 16 | for NAME in hesin hesin_diag hesin_oper 17 | do 18 | bq --location=${GEO} load \ 19 | --field_delimiter "\t" \ 20 | --replace \ 21 | --source_format=CSV \ 22 | --skip_leading_rows 1 \ 23 | ${DATASET}.${NAME} ${BUCKET}/${NAME}.tsv.gz \ 24 | ${__dir}/${NAME}.json 25 | done 26 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importpheno/append.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. 
"ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | 13 | # For phenotypes, we expect to add repeatedly, so we don't replace here. Note: 14 | # if you run append.sh with the same data twice, you'll just duplicate the 15 | # contents of the table. 16 | bq --location=${GEO} load \ 17 | --field_delimiter "\t" \ 18 | --quote "" \ 19 | --null_marker "NULL" \ 20 | --source_format=CSV \ 21 | --skip_leading_rows 1 \ 22 | ${DATASET}.phenotype ${BUCKET}/phenotype.tsv \ 23 | ${__dir}/phenotype.json 24 | 25 | 26 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importpheno/phenotype.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "sample_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "FieldID", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "instance", 15 | "type": "INTEGER" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "array_idx", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | 24 | "name": "value", 25 | "type": "STRING" 26 | }, 27 | { 28 | "name": "coding_file_id", 29 | "type": "INTEGER" 30 | } 31 | ] 32 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importsample/import.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | #pass in 6 | BUCKET=$1 #e.g. "gs://ml4cvd/projects/jamesp/bigquery/201903" 7 | DATASET=$2 #e.g. "ukbb7089_201903" 8 | 9 | #specific to this func 10 | GEO="US" 11 | NAME="sample" 12 | 13 | 14 | bq --location=${GEO} load \ 15 | --field_delimiter "\t" \ 16 | --replace \ 17 | --quote "" \ 18 | --source_format=CSV \ 19 | --skip_leading_rows 1 \ 20 | ${DATASET}.${NAME} ${TABLE_LOC}/${NAME}.tsv.gz 21 | ${NAME}.json 22 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/importsample/sample.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "sample_id", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "file_row", 10 | "type": "INTEGER" 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /ingest/ukbb_csv_bigquery/inspect_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ingest/ukbb_csv_bigquery/inspect_screenshot.png -------------------------------------------------------------------------------- /ml4h/DATA_MODELING_TESTS.md: -------------------------------------------------------------------------------- 1 | # Data/Modeling/Tests 2 | ## Running tests 3 | Tests can be run in Docker with 4 | ``` 5 | ${HOME}/ml4h/scripts/tf.sh -T ${HOME}/ml4h/tests 6 | ``` 7 | Tests can be run locally in a conda environment with 8 | ``` 9 | python -m pytest ${HOME}/ml4h/tests 10 | ``` 11 | Some of the tests are slow due to creating, saving and loading `tensorflow` models. 12 | To skip those tests to move quickly, run 13 | ``` 14 | python -m pytest ${HOME}/ml4h/tests -m "not slow" 15 | ``` 16 | pytest can also run specific tests using `::`. 
For example 17 | ``` 18 | python -m pytest ${HOME}/ml4h/tests/test_models.py::TestMakeMultimodalMultitaskModel::test_u_connect_segment 19 | ``` 20 | For more pytest usage information, check out the [usage guide](https://docs.pytest.org/en/latest/usage.html). 21 | 22 | ### Phenotypic SQLite database 23 | Data for 500k people containing almost everything available in the UK Biobank Showcase. 24 | 25 | `/mnt/disks/data/raw/sql/ukbb7089.r10data.db` 26 | 27 | To access the data using `sqlite`: 28 | 29 | `sqlite3 /mnt/disks/data/raw/sql/ukbb7089.r10data.db` 30 | 31 | The data can also be accessed through [BigQuery](https://console.cloud.google.com/bigquery?project=broad-ml4cvd&p=broad-ml4cvd&page=project). 32 | 33 | 34 | ### Cardiac MRI 35 | 212,158 individual zip files from ~20k people. DICOM-formatted files inside: 36 | 37 | `/mnt/disks/data/raw/mris/cardiac/*.zip` 38 | 39 | ### Liver MRI 40 | 10,132 individual zip files from ~10k people. DICOM-formatted files inside: 41 | 42 | `/mnt/disks/data/raw/mris/liver/*.zip` 43 | 44 | ### ECG: XML 45 | 119,097 ECGs (12-lead resting and 3-lead exercise): 46 | 47 | `/mnt/disks/data/raw/ecgs/*.xml` 48 | 49 | ### Direct Genotypes 50 | ~800k variants/person: 51 | 52 | `/mnt/imputed_v2` 53 | 54 | ### Imputed Genotypes 55 | 90 million variants/person: 56 | 57 | `/mnt/imputed_v3` 58 | 59 | ## Modeling with TensorFlow 60 | Once you have a virtual machine and an environment set up, it is time to start learning. 61 | The first step is to create training data by writing tensors to the disk. 62 | 63 | To write tensors with default categorical and continuous phenotypes, and no MRI or ECG data 64 | ``` 65 | ${HOME}/ml/scripts/tf.sh ${HOME}/ml/ml4h/recipes.py --mode tensorize --tensors ${HOME}/my_tensors/ --max_sample_id 1003000 --mri_field_id --xml_field_id 66 | ``` 67 | This should take about a minute to run and will output the SQL queries as well as the counts for the phenotype categories and responses that it finds. Now let's train a model: 68 | ``` 69 | ${HOME}/ml/scripts/tf.sh ${HOME}/ml/ml4h/recipes.py --mode train --tensors ${HOME}/my_tensors/ --input_tensors categorical-phenotypes-94 --output_tensors coronary_artery_disease_soft --id my_first_mlp_for_cvd 70 | ``` 71 | This model should achieve about 75% validation set accuracy on predicting from the phenotypes whether this person was labelled with an ICD code corresponding to cardiovascular disease. 
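As a quick sanity check of the phenotypic SQLite database described above, you can also query it directly from the shell. This is only a sketch: it assumes the SQLite file mirrors the BigQuery `phenotype` schema (`sample_id`, `FieldID`, `instance`, `array_idx`, `value`), and FieldID 31 is used purely as an example, so run `.tables` and `.schema` first to confirm the actual table and column names.
```
sqlite3 /mnt/disks/data/raw/sql/ukbb7089.r10data.db '.tables'
sqlite3 /mnt/disks/data/raw/sql/ukbb7089.r10data.db 'SELECT COUNT(DISTINCT sample_id) FROM phenotype WHERE FieldID = 31;'
```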
72 | -------------------------------------------------------------------------------- /ml4h/DatabaseClient.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC 2 | 3 | import sqlite3 4 | from google.cloud.bigquery import Client 5 | 6 | 7 | class DatabaseClient(ABC): 8 | def __init__(self, client): 9 | self.client = client 10 | super(DatabaseClient, self).__init__() 11 | 12 | @abstractmethod 13 | def execute(self, query: str): 14 | pass 15 | 16 | 17 | class BigQueryDatabaseClient(DatabaseClient): 18 | """ If running locally, run the following commandline to authenticate yourself: 19 | gcloud auth application-default login 20 | """ 21 | 22 | def __init__(self, client=None, credentials_file=None): 23 | if client is not None: 24 | super(BigQueryDatabaseClient, self).__init__(client) 25 | else: 26 | if credentials_file is not None: 27 | bigquery_client = Client.from_service_account_json(credentials_file) 28 | else: 29 | raise ValueError("BigQueryDatabaseClient requires a client or a credentials_file.") 30 | super(BigQueryDatabaseClient, self).__init__(bigquery_client) 31 | 32 | def execute(self, query: str): 33 | query_job = self.client.query(query) # API request 34 | rows = query_job.result() # Waits for query to finish 35 | return rows 36 | 37 | 38 | class SqLiteDatabaseClient(DatabaseClient): 39 | def __init__(self, client=None, db_file=None): 40 | if client is not None: 41 | super(SqLiteDatabaseClient, self).__init__(client) 42 | else: 43 | if db_file is not None: 44 | super(SqLiteDatabaseClient, self).__init__(sqlite3.connect(db_file).cursor()) 45 | else: 46 | raise ValueError("SqLiteDatabaseClient requires a client or a db_file.") 47 | 48 | def execute(self, query: str): 49 | return self.client.execute(query) 50 | 51 | 52 | if '__main__' == __name__: 53 | credentials_file = '/Users/kyuksel/ml4h/bigquery-viewer-credentials.json' 54 | db_client = BigQueryDatabaseClient(credentials_file=credentials_file) 55 | 56 | dataset = 'broad-ml4cvd.ukbb7089_r10data' 57 | 58 | dictionary_table = f"`{dataset}.dictionary`" 59 | phenotype_table = f"`{dataset}.phenotype`" 60 | coding_table = f"`{dataset}.coding`" 61 | 62 | fid = 20001 63 | fids = [3143, 3144] 64 | sample_id = 2907043 65 | 66 | job_title_field_id = 22600 67 | icd10_field = 41202 68 | query = \ 69 | f"SELECT value FROM {phenotype_table} WHERE fieldid={icd10_field} AND sample_id={sample_id}" 70 | 71 | rows = db_client.execute(query.format()) 72 | for row in rows: 73 | print(row) 74 | -------------------------------------------------------------------------------- /ml4h/__init__.py: -------------------------------------------------------------------------------- 1 | from . import defines 2 | -------------------------------------------------------------------------------- /ml4h/applications/ingest/requirements.txt: -------------------------------------------------------------------------------- 1 | fastparquet 2 | blosc 3 | xxhash 4 | zstandard 5 | cv2 6 | scipy 7 | pandas 8 | numpy 9 | h5py 10 | -------------------------------------------------------------------------------- /ml4h/applications/jpp_inference_rv/README.md: -------------------------------------------------------------------------------- 1 | # Reproduction scripts: inference RV 2 | 3 | This folder contains scripts to reproduce the models used in: 4 | 5 | **Genetic Analysis of Right Heart Structure and Function in 45,000 People**. James P. Pirruccello*, Paolo Di Achille*, Victor Nauffal*, Mahan Nekoui, Samuel N. 
Friedman, Marcus D. R. Klarqvist, Mark D. Chaffin, Shaan Khurshid, Carolina Roselli, Puneet Batra, Kenney Ng, Steven A. Lubitz, Jennifer E. Ho, Mark E. Lindsay, Anthony Philippakis, Patrick T. Ellinor. [To appear] 6 | 7 | ## Example 8 | 9 | Given a pre-trained semantic segmentation model `sax_slices_jamesp_4b_hyperopted_dropout_pap_dupe.h5` and the `ml4h.tensormap` that was used to generate the data, we can proceed to run inference on new data. 10 | 11 | ```py 12 | import glob 13 | import ml4h.tensormap.ukb.mri 14 | from infer_on_sax import prepare_model, split_files_for_parallel_computing, jpp_infer_short_axis  # Local file 15 | # Load the pre-trained model with the TensorMap that was used to generate the data. 16 | model = prepare_model("/tf/sax_slices_jamesp_4b_hyperopted_dropout_pap_dupe.h5", ml4h.tensormap.ukb.mri.cine_segmented_sax_slice_jamesp) 17 | # Enumerate the target files of interest. 18 | files = glob.glob('/mnt/disks/annotated-cardiac-tensors-44k/2020-09-21/*.hd5') 19 | # Partition the files into buckets and retrieve the files corresponding to that bucket. For example, for 20 | # embarrassingly parallel computation across 50 GCP VMs with NVidia P4 GPUs using the provided shell script. 21 | files = split_files_for_parallel_computing(files, partition_number=0, total_partitions=50) 22 | jpp_infer_short_axis(files, model, output_path='/tf/') 23 | ``` 24 | 25 | The provided shell script `infer_to_hd5_local.sh` streamlines the procedure of spawning multiple GCP VMs with attached disks and GPUs for inference. Make sure you modify this file to execute the appropriate commands on the VMs. 26 | -------------------------------------------------------------------------------- /ml4h/applications/jpp_inference_rv/infer_to_hd5_local.sh: -------------------------------------------------------------------------------- 1 | # Snapshot to spawn. 2 | target_vm_snapshot=rv-parameterization 3 | # Disk(s) to mount. 4 | target_disk=annotated-cardiac-tensors-45k 5 | # Base prefix name for the VM. 6 | vm_base_name=pdiachil-rv-inference 7 | # Username. 8 | user=pdiachil 9 | # Branch of ML4H to use. 10 | ml4h_branch=pd_sf_blox 11 | 12 | for j in {0..4} 13 | do 14 | start=$((j*10)) 15 | end=$((start+10-1)) 16 | for k in {0..2} 17 | do 18 | for i in $(seq $start $end) 19 | do 20 | # Spawn a VM instance given prepared VM snapshot. 21 | gcloud compute instances create ${vm_base_name}-${i} \ 22 | --machine-type=n1-standard-8 \ 23 | --boot-disk-size=150 \ 24 | --image=${target_vm_snapshot} \ 25 | --maintenance-policy=TERMINATE \ 26 | --accelerator=type=nvidia-tesla-t4,count=1 & 27 | sleep 1 # Sleep for a second 28 | done # End loop i 29 | wait # Wait until completion 30 | done # End loop k 31 | wait # Wait until completion 32 | for i in $(seq $start $end) 33 | do 34 | # Attach disk(s) to the spawned VM instance. 
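# The disk is attached with --mode ro (read-only) below, so every VM can safely share the same source data disk.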
35 | gcloud compute instances attach-disk ${vm_base_name}-${i} --disk=${target_disk} --mode ro & 36 | sleep 1 # Sleep for a second 37 | done # End loop i 38 | wait # Wait until completeion 39 | sleep 25 # Sleep for 25 seconds 40 | for i in $(seq $start $end) 41 | do 42 | gcloud compute ssh ${vm_base_name}-${i} --command="cd /home/${user};cd ml;git pull;git checkout ${ml4h_branch};git pull;nohup bash /home/${user}/ml/scripts/infer_to_hd5.sh $i > /home/${user}/out_${i}.out 2> /home/${user}/out_${i}.err < /dev/null &" & 43 | sleep 1 # Sleep for a second 44 | done # End loop i 45 | done # End outer VM loop 46 | -------------------------------------------------------------------------------- /ml4h/logger.py: -------------------------------------------------------------------------------- 1 | """Provides config settings for the logger and a way to load them""" 2 | 3 | import sys 4 | import os 5 | import errno 6 | import logging 7 | 8 | 9 | def load_config(log_level, log_dir, log_file_basename, log_file_suffix): 10 | from logging import config as logging_config 11 | 12 | try: 13 | os.makedirs(log_dir) 14 | except OSError as e: 15 | if e.errno != errno.EEXIST: 16 | raise e 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | log_file = "{}/{}_{}.log".format(log_dir, log_file_basename, log_file_suffix) 21 | 22 | try: 23 | logging_config.dictConfig(_create_config(log_level, log_file)) 24 | success_msg = "Logging configuration was loaded. Log messages can be found at {}.".format(log_file) 25 | logger.info(success_msg) 26 | except Exception as e: 27 | logger.error("Failed to load logging config!") 28 | raise e 29 | 30 | 31 | def _create_config(log_level, log_file): 32 | return { 33 | 'version': 1, 34 | 'disable_existing_loggers': False, 35 | 'formatters': { 36 | 'simple': { 37 | 'format': '%(asctime)s - %(module)s:%(lineno)d - %(levelname)s - %(message)s', 38 | }, 39 | 'detailed': { 40 | 'format': '%(name)s:%(levelname)s %(module)s:%(lineno)d: %(message)s', 41 | }, 42 | }, 43 | 'handlers': { 44 | 'console': { 45 | 'level': log_level, 46 | 'class': 'logging.StreamHandler', 47 | 'formatter': 'simple', 48 | 'stream': sys.stdout, 49 | }, 50 | 'file': { 51 | 'level': log_level, 52 | 'class': 'logging.FileHandler', 53 | 'formatter': 'simple', 54 | 'filename': log_file, 55 | 'mode': 'w', 56 | }, 57 | }, 58 | 'loggers': { 59 | '': { 60 | 'handlers': ['console', 'file'], 61 | 'level': log_level, 62 | }, 63 | }, 64 | } 65 | -------------------------------------------------------------------------------- /ml4h/ml4ht_integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/ml4ht_integration/__init__.py -------------------------------------------------------------------------------- /ml4h/models/Block.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Callable 3 | 4 | import tensorflow as tf 5 | 6 | from ml4h.TensorMap import TensorMap 7 | 8 | Tensor = tf.Tensor 9 | Block = Callable[[Tensor, Dict[TensorMap, List[Tensor]]], Tensor] 10 | 11 | 12 | class Block(ABC): 13 | @abstractmethod 14 | def __call__(self, x: Tensor, intermediates: Dict[TensorMap, List[Tensor]] = None) -> Tensor: 15 | pass 16 | 17 | def can_apply(self): 18 | return True 19 | -------------------------------------------------------------------------------- /ml4h/models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/models/__init__.py -------------------------------------------------------------------------------- /ml4h/normalizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | from ml4h.defines import EPS 5 | from tensorflow.keras.applications import imagenet_utils 6 | 7 | 8 | class Normalizer(ABC): 9 | @abstractmethod 10 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 11 | """Shape preserving transformation""" 12 | pass 13 | 14 | def normalize_loading_option(self, tensor: np.ndarray, _) -> np.ndarray: 15 | """Shape preserving transformation for use with DataDescription. 16 | Defaults to the normalize function if not defined in the descendant""" 17 | return self.normalize(tensor) 18 | 19 | def un_normalize(self, tensor: np.ndarray) -> np.ndarray: 20 | """The inverse of normalize if possible. Otherwise identity.""" 21 | return tensor 22 | 23 | 24 | class Standardize(Normalizer): 25 | def __init__(self, mean: float, std: float): 26 | self.mean, self.std = mean, std 27 | 28 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 29 | return (tensor - self.mean) / self.std 30 | 31 | def un_normalize(self, tensor: np.ndarray) -> np.ndarray: 32 | return tensor * self.std + self.mean 33 | 34 | 35 | class ZeroMeanStd1(Normalizer): 36 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 37 | tensor -= np.mean(tensor) 38 | tensor /= np.std(tensor) + EPS 39 | return tensor 40 | 41 | 42 | class NonZeroNormalize(Normalizer): 43 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 44 | nonzero = tensor > 0 45 | tensor[nonzero] = (tensor[nonzero] - tensor[nonzero].mean() + 1e-9) / ( 46 | tensor[nonzero].std() + 1e-9 47 | ) 48 | return tensor 49 | 50 | 51 | class TopKNormalize(Normalizer): 52 | def __init__(self, n_top: int = 50): 53 | self.n_top = n_top 54 | 55 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 56 | """Find top K itensity voxels are set upper range to the mean of those""" 57 | upper = np.mean(sorted(np.max(tensor, axis=-1).flatten())[::-1][0:self.n_top]) 58 | tensor = np.where(tensor >= upper, upper, tensor) 59 | tensor /= tensor.max() 60 | return tensor 61 | 62 | 63 | class ImagenetNormalizeTorch(Normalizer): 64 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 65 | # This is equivalent to: 66 | # x /= 255. 
67 | # mean = [0.485, 0.456, 0.406] 68 | # std = [0.229, 0.224, 0.225] 69 | # when mode is torch 70 | return imagenet_utils.preprocess_input(tensor, data_format=None, mode="torch") 71 | 72 | 73 | class RandomStandardize(Normalizer): 74 | def __init__(self, mean: float, std: float, ratio: float = 0.5): 75 | self.mean, self.std, self.ratio = mean, std, ratio 76 | 77 | def normalize(self, tensor: np.ndarray) -> np.ndarray: 78 | if np.random.rand() > self.ratio: 79 | return (tensor - self.mean) / (self.std + EPS) 80 | else: 81 | return (tensor - np.mean(tensor)) / (np.std(tensor) + EPS) 82 | 83 | def un_normalize(self, tensor: np.ndarray) -> np.ndarray: 84 | return tensor * self.std + self.mean 85 | -------------------------------------------------------------------------------- /ml4h/tensorize/PARTNERS.md: -------------------------------------------------------------------------------- 1 | # Partners ECG 2 | Organizing and Tensorizing MUSE 12-lead ECGs 3 | 4 | ## Table of Contents 5 | 1. [Organizing XMLs and Removing Duplicates](#organizing-xmls-and-removing-duplicates) 6 | 2. [Tensorizing XMLs to HDF5](#tensorizing-xmls-to-hdf5) 7 | 3. [ECG Data Structure](#ecg-data-structure) 8 | 4. [Extracting ECG Metadata](#extracting-ecg-metadata) 9 | 5. [Other documentation](#other-documentation) 10 | 11 | ## Organizing XMLs and Removing Duplicates 12 | `ingest/partners_ecg/organize_xml.py` moves XML files from a single directory into the appropriate yyyy-mm directory. 13 | 14 | `ingest/partners_ecg/remove_xml_duplicates.py` finds and removes exact duplicate XML files, as defined by every bit of two files being identical, determined via SHA-256 hashing. 15 | 16 | ## Tensorizing XMLs to HDF5 17 | `tensorize_partners` mode in `recipes.py` extracts data from all XML files and saves as [HDF5 files](https://www.hdfgroup.org). Tensorization also removes duplicates that contain nearly the same information, except for minor differences, for example minor version changes in acquisition software. This duplicate detection is done by matching patient-date-time fields. 18 | 19 | This mode is called with the following arguments: 20 | `--xml_folder` to specify the directory containing ECG XMLs. 21 | `--tensors` to specify the directory where tensorized HD5 files should be saved. 22 | 23 | All the ECGs belonging to one patient, identified by medical record number (MRN), will be saved to one HD5, indexed by ECG acquisition date and time: 24 | ``` 25 | .hd5 26 | └--partners_ecg_rest 27 | | 28 | |--date_1 29 | | └--ECG Data 30 | | 31 | └--date_2 32 | └--ECG Data 33 | ``` 34 | 35 | ## ECG Data Structure 36 | Voltage is saved from XMLs as a dictionary of numpy arrays indexed by leads in the set `("I", "II", "V1", "V2", "V3", "V4", "V5", "V6")`, e.g.: 37 | 38 | ``` 39 | voltage = {'I': array([0, -4, -2, ..., 7]), 40 | {'II': array([2, -9, 0, ..., 5]), 41 | ... 42 | {'V6': array([1, -4, -3, ..., 4]), 43 | ``` 44 | 45 | Every other element extracted from the XML is returned as a string, even if the underlying primitive type is a number (e.g. age). 
Here are some of the more important elements: 46 | 47 | ``` 48 | acquisitiondate 49 | atrialrate 50 | dateofbirth 51 | diagnosis_computer 52 | diagnosis_md 53 | ecgsamplebase 54 | ecgsampleexponent 55 | gender 56 | heightin 57 | location 58 | locationname 59 | overreaderfirstname 60 | overreaderid 61 | overreaderlastname 62 | patientid 63 | paxis 64 | poffset 65 | ponset 66 | printerval 67 | qoffset 68 | qonset 69 | qrscount 70 | qrsduration 71 | qtcfrederica 72 | qtcorrected 73 | qtinterval 74 | race 75 | raxis 76 | taxis 77 | toffset 78 | ventricularrate 79 | weightlbs 80 | ``` 81 | 82 | ## Extracting ECG metadata 83 | 84 | `explore` mode in `recipes.py` extracts data specified by `--input_tensors` from all HD5 files given to `--tensors` and calculates summary statistics. Additionally, all metadata is saved to a large CSV file: 85 | 86 | This CSV file will be used to construct a performant, queryable database to identify future cohorts for research projects. 87 | 88 | ## Other documentation 89 | GE documentation is stored in a shared Partners Dropbox folder ([link](https://www.dropbox.com/sh/c5tgm0lory72ge0/AADqKvUicDdyWzHYhtad0lU4a?dl=0)), including 1. physician's guide to the Marquette 12SL ECG analysis program, 2. guide to MuseDB search, and 3. Muse v9 XML developer's guide. 90 | -------------------------------------------------------------------------------- /ml4h/tensorize/README.md: -------------------------------------------------------------------------------- 1 | # Run Dataflow 2 | The following steps will run a Dataflow pipeline remotely, which in turn will, tensorize fields of a type 3 | specified by the user (e.g. categorical, continuous) and write them onto a GCS bucket in the form of 4 | one `hd5` file per sample id. 5 | 6 | * Clone the repo and cd into it: 7 | ``` 8 | git clone git@github.com:broadinstitute/ml4h.git 9 | cd ml4h 10 | ``` 11 | 12 | * Create and activate the right Python environment: 13 | ``` 14 | conda env create -f ml4h/tensorize/dataflow/ml4h_dataflow.yml 15 | conda activate ml4h_dataflow 16 | ``` 17 | 18 | * Make sure you are authenticated by Google Cloud: 19 | ``` 20 | gcloud auth application-default login 21 | ``` 22 | 23 | * Re install ml4h if you have made any changes: 24 | ``` 25 | pip install . 26 | ``` 27 | 28 | * Run with the help option to see the list of command line arguments. 29 | ``` 30 | python ml4h/tensorize/tensorize_dataflow.py -h 31 | ``` 32 | 33 | * Comment out the requirements in setup.py. Because some dataflow requirements conflict with ml4h base requirements you must comment out the lines (currently lines 6 and 16) in setup.py in the repo root: 34 | ``` 35 | requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8') 36 | ... 37 | install_requires=requirements, 38 | ``` 39 | 40 | * **Note** that Google requires the `id` consist of only the 41 | characters `[-a-z0-9]`, i.e. starting with a letter and ending with a letter or number. 42 | 43 | * Run the application to submit the pipeline to Dataflow to be executed remotely provided the 44 | command line argument `--beam_runner` is set to `DataflowRunner`. Set it to `DirectRunner` for local execution. 
45 | For example: 46 | ``` 47 | python ml4h/tensorize/tensorize_dataflow.py \ 48 | --id categorical-v2023-01-16 \ 49 | --tensor_type categorical \ 50 | --bigquery_dataset ukbb_dev \ 51 | --beam_runner DataflowRunner \ 52 | --repo_root /Users/sam/Dropbox/Code/ml4h \ 53 | --gcs_output_path tensors/continuous_v2023_01_17 54 | ``` 55 | 56 | * The pipeline can be run multiple times to tensorize different types of fields. This will populate the per-sample tensors 57 | in specified GCS buckets. In order to unify them, they can be downloaded via `gsutil` as shown below 58 | and merged using `merge_hd5s.py` script. 59 | ``` 60 | gsutil -m cp -r 61 | ``` 62 | -------------------------------------------------------------------------------- /ml4h/tensorize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensorize/__init__.py -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensorize/dataflow/__init__.py -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/fieldids.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "fieldid", 5 | "type": "INTEGER" 6 | } 7 | ] 8 | -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/load_fieldids.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | 6 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 7 | #RAW DATA LOCATIONS 8 | #field file should already be in cloud before running this. looks like 9 | # field_id 10 | # val1 11 | # val2 12 | # etc 13 | # upload using gsutil cp fieldids.csv gs://ml4h/data/ 14 | # depending on access pattern, we can make this less one-offy. 
15 | FIELDID_FILE="gs://ml4cvd/data/fieldids.csv" 16 | 17 | #SHARED_DATASET -- should already be created, location of shared data across UKBB applications 18 | SHARED_DATA="shared_data" 19 | 20 | 21 | bq load \ 22 | --replace \ 23 | --source_format=CSV \ 24 | --skip_leading_rows 1 \ 25 | --schema ${__dir}/fieldids.json \ 26 | ${SHARED_DATA}.tensorization_fieldids ${FIELDID_FILE} 27 | -------------------------------------------------------------------------------- /ml4h/tensorize/dataflow/ml4h_dataflow.yml: -------------------------------------------------------------------------------- 1 | # Minimal set of packages to tensorize using Dataflow (tested on OSX-64) 2 | # To be used to create the Python env on which to pip freeze to create the requirements file for Dataflow 3 | 4 | name: ml4h_dataflow 5 | channels: 6 | - defaults 7 | - anaconda 8 | dependencies: 9 | - python==3.8.10 10 | # - pip==22.3.1 11 | # - pip: 12 | # - apache-beam[gcp]==2.12.0 13 | # - google-cloud-storage==1.13.0 14 | # - h5py==2.9.0 15 | -------------------------------------------------------------------------------- /ml4h/tensorize/tensorize_dataflow.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import logging 4 | 5 | import apache_beam as beam 6 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions 7 | 8 | from ml4h.defines import GCS_BUCKET 9 | from ml4h.tensorize.dataflow import bigquery_ukb_queries 10 | 11 | 12 | def parse_args(): 13 | now_string = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M') 14 | 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument( 18 | '--id', default=f"run_{now_string}", 19 | help='User-defined identifier for this pipeline run. ' 20 | 'Per Google: the name must consist of only the characters [-a-z0-9], ' 21 | 'starting with a letter and ending with a letter or number.', 22 | ) 23 | parser.add_argument( 24 | '--tensor_type', default="categorical", 25 | help='Type of data to be tensorized', 26 | choices=['categorical', 'continuous', 'icd', 'disease', 'death', 'phecode_disease'], 27 | ) 28 | parser.add_argument( 29 | '--bigquery_dataset', default='ukbb_dev', 30 | help='BigQuery dataset where the data will be drawn from', 31 | ) 32 | parser.add_argument( 33 | '--beam_runner', default='DirectRunner', 34 | help='Apache Beam runner that will execute the pipeline', 35 | choices=['DirectRunner', 'DataflowRunner'], 36 | ) 37 | parser.add_argument( 38 | '--repo_root', 39 | help='Root directory of the cloned ml repo', 40 | ) 41 | parser.add_argument( 42 | '--gcp_project', default='broad-ml4cvd', 43 | help='Name of the Google Cloud Platform project', 44 | ) 45 | parser.add_argument( 46 | '--gcp_region', default='us-central1', 47 | help='Google Cloud Platform region', 48 | ) 49 | # parser.add_argument('--gcs_bucket', default='ml4h', 50 | # help='Name of the Google Cloud Storage bucket where tensors will be written to') 51 | parser.add_argument( 52 | '--gcs_output_path', 53 | help='gs:// folder path excluding the bucket name where tensors will be written to ' 54 | 'e.g. 
specifying /path/to/folder will write to gs:///path/to/folder', 55 | ) 56 | parser.add_argument( 57 | "--logging_level", default='INFO', help="Logging level", 58 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 59 | ) 60 | 61 | return parser.parse_args() 62 | 63 | 64 | if __name__ == "__main__": 65 | args = parse_args() 66 | 67 | logging.getLogger().setLevel(args.logging_level) 68 | 69 | packaging_args = [ 70 | f'--requirements_file={args.repo_root}/ml4h/tensorize/dataflow/requirements_ml4h_dataflow.txt', 71 | f'--setup_file={args.repo_root}/setup.py', 72 | ] 73 | 74 | pipeline_opts = PipelineOptions(flags=packaging_args) 75 | google_cloud_options = pipeline_opts.view_as(GoogleCloudOptions) 76 | google_cloud_options.region = args.gcp_region 77 | google_cloud_options.project = args.gcp_project 78 | google_cloud_options.job_name = args.id 79 | google_cloud_options.staging_location = f"gs://{GCS_BUCKET}/dataflow/staging" 80 | google_cloud_options.temp_location = f"gs://{GCS_BUCKET}/dataflow/temp" 81 | pipeline_opts.view_as(StandardOptions).runner = args.beam_runner 82 | 83 | pipeline = beam.Pipeline(options=pipeline_opts) 84 | 85 | bigquery_ukb_queries.tensorize_sql_fields( 86 | pipeline, 87 | args.gcs_output_path, 88 | args.bigquery_dataset, 89 | args.tensor_type, 90 | ) 91 | -------------------------------------------------------------------------------- /ml4h/tensormap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensormap/__init__.py -------------------------------------------------------------------------------- /ml4h/tensormap/gatk.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import logging 3 | import numpy as np 4 | from typing import Dict 5 | 6 | from ml4h.TensorMap import TensorMap, Interpretation 7 | from ml4h.normalizer import Standardize 8 | 9 | DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3} 10 | VARIANT_LABELS = {'NOT_SNP': 0, 'NOT_INDEL': 1, 'SNP': 2, 'INDEL': 3} 11 | 12 | 13 | def tensor_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray: 14 | return np.array(hd5[tm.name]) 15 | 16 | 17 | reference = TensorMap('reference', shape=(128, len(DNA_SYMBOLS)), tensor_from_file=tensor_from_hd5) 18 | read_tensor = TensorMap('read_tensor', shape=(128, 128, 15), tensor_from_file=tensor_from_hd5) 19 | dp = TensorMap('dp', shape=(1,), normalization=Standardize(mean=34, std=8.6), tensor_from_file=tensor_from_hd5) 20 | fs = TensorMap('fs', shape=(1,), normalization=Standardize(mean=4.03, std=7.2), tensor_from_file=tensor_from_hd5) 21 | qd = TensorMap('qd', shape=(1,), normalization=Standardize(mean=12.8, std=6.1), tensor_from_file=tensor_from_hd5) 22 | mq = TensorMap('mq', shape=(1,), normalization=Standardize(mean=59.1, std=8.6), tensor_from_file=tensor_from_hd5) 23 | sor = TensorMap('sor', shape=(1,), normalization=Standardize(mean=1.03, std=0.8), tensor_from_file=tensor_from_hd5) 24 | mqranksum = TensorMap( 25 | 'mqranksum', shape=(1,), 26 | normalization=Standardize(mean=-0.23, std=1.1), tensor_from_file=tensor_from_hd5, 27 | ) 28 | readposranksum = TensorMap( 29 | 'readposranksum', shape=(1,), 30 | normalization=Standardize(mean=-0.04, std=1.2), tensor_from_file=tensor_from_hd5, 31 | ) 32 | 33 | 34 | def variant_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray: 35 | one_hot = np.zeros(tm.shape, dtype=np.float32) 
36 | variant_str = str(hd5['variant_label'][()], 'utf-8') 37 | for channel in tm.channel_map: 38 | if channel.lower() == variant_str.lower(): 39 | one_hot[tm.channel_map[channel]] = 1.0 40 | if one_hot.sum() != 1: 41 | raise ValueError(f'TensorMap {tm.name} missing or invalid label: {variant_str} one_hot: {one_hot}') 42 | return one_hot 43 | 44 | 45 | variant_label = TensorMap( 46 | 'variant_label', Interpretation.CATEGORICAL, 47 | shape=(len(VARIANT_LABELS),), 48 | tensor_from_file=variant_label_from_hd5, 49 | channel_map=VARIANT_LABELS, 50 | ) 51 | -------------------------------------------------------------------------------- /ml4h/tensormap/mgb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensormap/mgb/__init__.py -------------------------------------------------------------------------------- /ml4h/tensormap/mgb/xdl.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import h5py 4 | import numpy as np 5 | from ml4h.TensorMap import TensorMap, Interpretation 6 | 7 | ecg_5000_std = TensorMap('ecg_5000_std', Interpretation.CONTINUOUS, shape=(5000, 12)) 8 | ecg_single_lead_I = TensorMap(f'ecg_strip_I', Interpretation.CONTINUOUS, shape=(5000, 1)) 9 | 10 | hypertension_icd_only = TensorMap( 11 | name='hypertension_icd_only', interpretation=Interpretation.CATEGORICAL, 12 | channel_map={'no_hypertension_icd_only': 0, 'hypertension_icd_only': 1}, 13 | ) 14 | hypertension_icd_bp = TensorMap( 15 | name='hypertension_icd_bp', interpretation=Interpretation.CATEGORICAL, 16 | channel_map={'no_hypertension_icd_bp': 0, 'hypertension_icd_bp': 1}, 17 | ) 18 | hypertension_icd_bp_med = TensorMap( 19 | name='hypertension_icd_bp_med', interpretation=Interpretation.CATEGORICAL, 20 | channel_map={'no_hypertension_icd_bp_med': 0, 'hypertension_icd_bp_med': 1}, 21 | ) 22 | hypertension_med = TensorMap( 23 | name='start_fu_hypertension_med', interpretation=Interpretation.CATEGORICAL, 24 | channel_map={'no_hypertension_medication': 0, 'hypertension_medication': 1}, 25 | ) 26 | 27 | lvef = TensorMap(name='LVEF', interpretation=Interpretation.CONTINUOUS, channel_map={'LVEF': 0}) 28 | 29 | age = TensorMap(name='age_in_days', interpretation=Interpretation.CONTINUOUS, channel_map={'age_in_days': 0}) 30 | sex = TensorMap(name='sex', interpretation=Interpretation.CATEGORICAL, channel_map={'Female': 0, 'Male': 1}) 31 | 32 | cad = TensorMap(name='cad', interpretation=Interpretation.CATEGORICAL, channel_map={'no_cad': 0, 'cad': 1}) 33 | dm = TensorMap(name='dm', interpretation=Interpretation.CATEGORICAL, channel_map={'no_dm': 0, 'dm': 1}) 34 | hypercholesterolemia = TensorMap( 35 | name='hypercholesterolemia', interpretation=Interpretation.CATEGORICAL, 36 | channel_map={'no_hypercholesterolemia': 0, 'hypercholesterolemia': 1}, 37 | ) 38 | 39 | n_intervals = 25 40 | af_tmap = TensorMap('survival_curve_af', Interpretation.SURVIVAL_CURVE, shape=(n_intervals*2,)) 41 | death_tmap = TensorMap('death_event', Interpretation.SURVIVAL_CURVE, shape=(n_intervals*2,)) 42 | 43 | 44 | def ecg_median_biosppy(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray: 45 | tensor = np.zeros(tm.shape, dtype=np.float32) 46 | for lead in tm.channel_map: 47 | tensor[:, tm.channel_map[lead]] = hd5[f'{tm.path_prefix}{lead}'] 48 | tensor = np.nan_to_num(tensor) 49 | return tensor 50 | 51 | ecg_channel_map = { 52 | 'I': 0, 
'II': 1, 'III': 2, 'aVR': 3, 'aVL': 4, 'aVF': 5, 53 | 'V1': 6, 'V2': 7, 'V3': 8, 'V4': 9, 'V5': 10, 'V6': 11, 54 | } 55 | 56 | ecg_biosppy_median_60bpm = TensorMap( 57 | 'median', Interpretation.CONTINUOUS, path_prefix='median_60bpm_', shape=(600, 12), 58 | tensor_from_file=ecg_median_biosppy, 59 | channel_map=ecg_channel_map, 60 | ) 61 | -------------------------------------------------------------------------------- /ml4h/tensormap/ukb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/tensormap/ukb/__init__.py -------------------------------------------------------------------------------- /ml4h/tensormap/ukb/embedding.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import load_model 2 | 3 | from ml4h.TensorMap import TensorMap, Interpretation 4 | from ml4h.models.model_factory import get_custom_objects 5 | from ml4h.tensormap.ukb.ecg import ecg_rest_median_raw_10 6 | 7 | custom_dict = get_custom_objects([]) 8 | ecg_model_file = '/home/sam/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/encoder_ecg_rest_median_raw_10.h5' 9 | ecg_median_autoencoder_256d = TensorMap( 10 | 'ecg_median_autoencoder_256d', Interpretation.EMBEDDING, shape=(256,), 11 | model=load_model(ecg_model_file, custom_objects=custom_dict), 12 | parents=[ecg_rest_median_raw_10], 13 | ) 14 | -------------------------------------------------------------------------------- /ml4h/visualization_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/ml4h/visualization_tools/__init__.py -------------------------------------------------------------------------------- /ml4h/visualization_tools/annotations_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "sample_id", 4 | "type": "STRING", 5 | "mode": "REQUIRED" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "annotator", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "annotation_timestamp", 15 | "type": "TIMESTAMP" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "key", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "value_numeric", 25 | "type": "NUMERIC" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "value_string", 30 | "type": "STRING" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "comment", 35 | "type": "STRING" 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /ml4h/visualization_tools/ecg_static_plots.py: -------------------------------------------------------------------------------- 1 | """Methods for integration of static plots within notebooks.""" 2 | import os 3 | import tempfile 4 | from typing import List, Optional, Union 5 | 6 | from IPython.display import HTML 7 | from IPython.display import SVG 8 | import numpy as np 9 | from ml4h.plots import plot_ecg_rest 10 | from ml4h.runtime_data_defines import get_resting_ecg_hd5_folder 11 | from ml4h.runtime_data_defines import get_resting_ecg_svg_folder 12 | import tensorflow as tf 13 | 14 | 15 | def display_resting_ecg(sample_id: Union[int, str], folder: Optional[str] = None) -> Union[HTML, SVG]: 16 | """Retrieve (or render) and display the SVG of the resting ECG. 
17 | 18 | Args: 19 | sample_id: The id of the ECG SVG to retrieve. 20 | folder: The local or Cloud Storage path under which the files reside. 21 | 22 | Returns: 23 | An IPython SVG object or a notebook-friendly error. 24 | """ 25 | if folder is None: 26 | svg_folder = get_resting_ecg_svg_folder(sample_id) 27 | hd5_folder = get_resting_ecg_hd5_folder(sample_id) 28 | else: 29 | svg_folder = folder 30 | hd5_folder = folder 31 | 32 | with tempfile.TemporaryDirectory() as tmpdirname: 33 | # First, see if we already have one rendered. 34 | sample_svg = str(sample_id) + '.svg' 35 | local_path = os.path.join(tmpdirname, sample_svg) 36 | try: 37 | tf.io.gfile.copy(src=os.path.join(svg_folder, sample_svg), dst=local_path) 38 | return SVG(filename=local_path) 39 | except (tf.errors.NotFoundError, tf.errors.PermissionDeniedError) as e: 40 | pass 41 | # If not, dynamically render a SVG 42 | sample_hd5 = str(sample_id) + '.hd5' 43 | local_path = os.path.join(tmpdirname, sample_hd5) 44 | try: 45 | tf.io.gfile.copy(src=os.path.join(hd5_folder, sample_hd5), dst=local_path) 46 | except (tf.errors.NotFoundError, tf.errors.PermissionDeniedError) as e: 47 | return HTML(f''' 48 |
49 | Warning: Resting ECG not available for sample {sample_id} in {svg_folder} or {hd5_folder}: 50 | {e.message} 51 | Use the folder parameter to read from a different local directory or Cloud Storage bucket. 52 |
''') 53 | 54 | try: 55 | # We don't need the resulting SVG, so send it to a temporary directory. 56 | with tempfile.TemporaryDirectory() as tmpdirname: 57 | return plot_ecg_rest(tensor_paths=[local_path], rows=[0], out_folder=tmpdirname, is_blind=False) 58 | except Exception as e: # pylint: disable=broad-except 59 | return HTML(f''' 60 |
61 | Warning: Unable to render static plot of resting ECG for sample {sample_id} from {hd5_folder}: 62 | {e} 63 |
''') 64 | 65 | 66 | def major_breaks_x_resting_ecg(limits: List[float]) -> np.array: 67 | """Method to compute breaks for plotnine plots of ECG resting data. 68 | 69 | Args: 70 | limits: The approximate limits. 71 | 72 | Returns: 73 | The desired limits. 74 | """ 75 | step = 0.2 76 | if limits[0] <= 0: 77 | min_break = 0.0 78 | max_break = 2.5 79 | elif limits[0] <= 2.5: 80 | min_break = 2.5 81 | max_break = 5.0 82 | elif limits[0] <= 5.0: 83 | min_break = 5.0 84 | max_break = 7.5 85 | else: 86 | min_break = 7.5 87 | max_break = 10.0 88 | return np.arange(min_break, max_break + step, step) 89 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_checkpoint/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_checkpoint/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d2196f91e9ae5e932b9c3dd546e3ce31e1a509fae5c3c13291bb0e496fb4e33a 3 | size 34826365 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_checkpoint/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0af27bb4a0285fadf47c471f4f9047d86d767e6f697d9ec40cccd246b20f9cb7 3 | size 99789 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from droid_mvp_model_description import create_movinet_classifier, create_regressor_classifier 7 | import logging 8 | tf.get_logger().setLevel(logging.ERROR) 9 | 10 | pretrained_chkp_dir = "droid_mvp_checkpoint/chkp" 11 | movinet_chkp_dir = 'movinet_a2_base/' 12 | 13 | movinet_model, backbone = create_movinet_classifier( 14 | n_input_frames=16, 15 | batch_size=16, 16 | num_classes=600, 17 | checkpoint_dir=movinet_chkp_dir, 18 | ) 19 | 20 | backbone_output = backbone.layers[-1].output[0] 21 | flatten = tf.keras.layers.Flatten()(backbone_output) 22 | encoder = tf.keras.Model(inputs=[backbone.input], outputs=[flatten]) 23 | 24 | func_args = { 25 | 'input_shape': (16, 224, 224, 3), 26 | 'n_output_features': 0, # number of regression features 27 | 'categories': {"mvp_status_binary":2, "mvp_status_detailed":6}, 28 | 'category_order': ["mvp_status_binary", "mvp_status_detailed"], 29 | } 30 | 31 | model_plus_head = create_regressor_classifier(encoder, **func_args) 32 | 33 | model_plus_head.load_weights(pretrained_chkp_dir) 34 | 35 | random_video = np.random.random((1, 16, 224, 224, 3)) 36 | 37 | print(f""" 38 | DROID-MVP Predictions: 39 | {model_plus_head.predict(random_video)} 40 | """) -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/droid_mvp_model_description.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from official.vision.beta.projects.movinet.modeling import movinet, 
movinet_model 4 | 5 | hidden_units = 256 6 | dropout_rate = 0.5 7 | 8 | def create_movinet_classifier( 9 | n_input_frames, 10 | batch_size, 11 | checkpoint_dir, 12 | num_classes, 13 | freeze_backbone=False 14 | ): 15 | backbone = movinet.Movinet(model_id='a2') 16 | model = movinet_model.MovinetClassifier(backbone=backbone, num_classes=600) 17 | model.build([1, 1, 1, 1, 3]) 18 | checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) 19 | checkpoint = tf.train.Checkpoint(model=model) 20 | status = checkpoint.restore(checkpoint_path) 21 | status.assert_existing_objects_matched() 22 | 23 | model = movinet_model.MovinetClassifier( 24 | backbone=backbone, 25 | num_classes=num_classes 26 | ) 27 | model.build([batch_size, n_input_frames, 224, 224, 3]) 28 | 29 | if freeze_backbone: 30 | for layer in model.layers[:-1]: 31 | layer.trainable = False 32 | model.layers[-1].trainable = True 33 | 34 | return model, backbone 35 | 36 | def create_regressor_classifier(encoder, trainable=True, input_shape=(224, 224, 3), n_output_features=0, categories={}, 37 | category_order=None, add_dense={'regressor': False, 'classifier': False}): 38 | for layer in encoder.layers: 39 | layer.trainable = trainable 40 | 41 | inputs = tf.keras.Input(shape=input_shape, name='image') 42 | features = encoder(inputs) 43 | features = tf.keras.layers.Dropout(dropout_rate)(features) 44 | features = tf.keras.layers.Dense(hidden_units, activation="relu")(features) 45 | features = tf.keras.layers.Dropout(dropout_rate)(features) 46 | 47 | outputs = [] 48 | if n_output_features > 0: 49 | if add_dense['regressor']: 50 | features_reg = tf.keras.layers.Dense(hidden_units, activation="relu")(features) 51 | features_reg = tf.keras.layers.Dropout(dropout_rate)(features_reg) 52 | outputs.append(tf.keras.layers.Dense(n_output_features, activation=None, name='echolab')(features_reg)) 53 | else: 54 | outputs.append(tf.keras.layers.Dense(n_output_features, activation=None, name='echolab')(features)) 55 | if len(categories.keys()) > 0: 56 | if add_dense['classifier']: 57 | features = tf.keras.layers.Dense(hidden_units, activation="relu")(features) 58 | features = tf.keras.layers.Dropout(dropout_rate)(features) 59 | for category in category_order: 60 | activation = 'softmax' 61 | n_classes = categories[category] 62 | outputs.append(tf.keras.layers.Dense(n_classes, name='cls_'+category, activation=activation)(features)) 63 | 64 | model = tf.keras.Model(inputs=inputs, outputs=outputs, name="regressor_classifier") 65 | 66 | return model 67 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/movinet_a2_base/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:78fdb1e081e9fc8d4e10e3bca4fe00117a236ddc4726bbf75594db19ae1be665 3 | size 69 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/movinet_a2_base/ckpt-1.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f393b7ef377ffaf59bd8bf081c72d05e74a576c5bba0d4bc180315432e49e557 3 | size 21240182 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/movinet_a2_base/ckpt-1.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:1d801f29eace2f39bcc7b268ecf0d1bd117d9b3881cbcd810721c8c1b1f6c161 3 | size 10102 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-MVP/readme.md: -------------------------------------------------------------------------------- 1 | # DROID-MVP Inference Example 2 | 3 | This is a simple example script demonstrating how to load and run the DROID-MVP model. Model training and inference were performed using the code provided in the ML4H [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). The example below was adapted from the DROID inference code. 4 | 5 | 1. Download the DROID docker image. Note: the docker image is not compatible with Apple Silicon. 6 | 7 | `docker pull alalusim/droid:latest` 8 | 9 | 2. Clone the GitHub repo, including the DROID-MVP model checkpoints stored using git lfs. 10 | 11 | ``` 12 | git clone https://github.com/broadinstitute/ml4h.git 13 | git lfs pull --include ml4h/model_zoo/DROID-MVP/droid_mvp_checkpoint/* 14 | git lfs pull --include ml4h/model_zoo/DROID-MVP/movinet_a2_base/* 15 | ``` 16 | 17 | 3. Run the docker image while mounting the ml4h directory, then run the example inference script. 18 | 19 | `docker run -it -v {PATH TO CLONED ML4H DIRECTORY}:/ml4h/ alalusim/droid:latest` 20 | 21 | ``` 22 | cd /ml4h/model_zoo/DROID-MVP/ 23 | python droid_mvp_inference.py 24 | ``` 25 | 26 | To use with your own data, format echocardiogram videos as tensors with shape (16, 224, 224, 3) before passing them to the model. Code for data preprocessing, storage, loading, training, and inference can be found in the ml4h [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). 27 | 28 | Model outputs for DROID-MVP take the form: 29 | ``` 30 | [ 31 | [["MVP", "Not MVP"]], 32 | [["Anterior ", "Bileaflet", "Not MVP", "Posterior", "Superior Displacement", "MVP not otherwise specified"]], 33 | ] 34 | ``` 35 | 36 | Note that the model was optimized for predicting binary MVP status (the primary task) and that detailed MVP status was used as an auxiliary task to improve performance on the primary classification task.
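If it helps to make the raw prediction arrays easier to read, the short sketch below (not part of the repository) shows one way to map the two heads returned by `model_plus_head.predict(...)` onto the label orderings listed above. The helper name `decode_droid_mvp` and the assumption that each head is a softmax over the classes in exactly the order shown are ours, so check them against your checkpoint before relying on the output.

```
import numpy as np

# Hypothetical helper (not in the repo): turn the two DROID-MVP heads into labels.
# Assumes `predictions` is the list returned for a single video, ordered
# [binary head, detailed head], each a softmax over the classes listed above.
BINARY_LABELS = ["MVP", "Not MVP"]
DETAILED_LABELS = [
    "Anterior", "Bileaflet", "Not MVP", "Posterior",
    "Superior Displacement", "MVP not otherwise specified",
]

def decode_droid_mvp(predictions):
    binary_probs = np.asarray(predictions[0])[0]
    detailed_probs = np.asarray(predictions[1])[0]
    return {
        "mvp_status_binary": BINARY_LABELS[int(np.argmax(binary_probs))],
        "mvp_status_detailed": DETAILED_LABELS[int(np.argmax(detailed_probs))],
    }
```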
37 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_checkpoint/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_checkpoint/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2af9efac9cec47cdd0bb4ca0c539b153657583fc6f261ac42bf5ef01031792f0 3 | size 34827706 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_checkpoint/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7a6c10fc7bfb8667a75ae8faeb19ed01278ca6d3fbf67488dd35834209921d17 3 | size 100586 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rv_inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from droid_rv_model_description import create_movinet_classifier, create_regressor_classifier, rescale_droid_rv_outputs, rescale_droid_rvef_outputs 7 | import logging 8 | tf.get_logger().setLevel(logging.ERROR) 9 | 10 | droid_rv_checkpoint = "droid_rv_checkpoint/chkp" 11 | droid_rvef_checkpoint = "droid_rvef_checkpoint/chkp" 12 | movinet_chkp_dir = 'movinet_a2_base/' 13 | 14 | movinet_model, backbone = create_movinet_classifier( 15 | n_input_frames=16, 16 | batch_size=16, 17 | num_classes=600, 18 | checkpoint_dir=movinet_chkp_dir, 19 | ) 20 | 21 | backbone_output = backbone.layers[-1].output[0] 22 | flatten = tf.keras.layers.Flatten()(backbone_output) 23 | encoder = tf.keras.Model(inputs=[backbone.input], outputs=[flatten]) 24 | 25 | droid_rv_func_args = { 26 | 'input_shape': (16, 224, 224, 3), 27 | 'n_output_features': 2, # number of regression features 28 | 'categories': {"RV_size":2, "RV_function":2, "Sex":2}, 29 | 'category_order': ["RV_size", "RV_function", "Sex"], 30 | } 31 | 32 | droid_rvef_func_args = { 33 | 'input_shape': (16, 224, 224, 3), 34 | 'n_output_features': 4, # number of regression features 35 | 'categories': {"Sex":2}, 36 | 'category_order': ["Sex"], 37 | } 38 | 39 | droid_rv_model = create_regressor_classifier(encoder, **droid_rv_func_args) 40 | droid_rv_model.load_weights(droid_rv_checkpoint) 41 | 42 | droid_rvef_model = create_regressor_classifier(encoder, **droid_rvef_func_args) 43 | droid_rvef_model.load_weights(droid_rvef_checkpoint) 44 | 45 | random_video = np.random.random((1, 16, 224, 224, 3)) 46 | 47 | droid_rv_pred = droid_rv_model.predict(random_video) 48 | droid_rvef_pred = droid_rvef_model.predict(random_video) 49 | 50 | print(f""" 51 | 52 | DROID-RV Predictions: 53 | {rescale_droid_rv_outputs(droid_rv_pred)} 54 | 55 | DROID-RVEF Predictions: 56 | {rescale_droid_rvef_outputs(droid_rvef_pred)} 57 | 58 | """) -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rvef_checkpoint/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rvef_checkpoint/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b635997716fc47e4d7694ed4dffb1fb19d32a3e8731bb250ec9bca9dc3283eba 3 | size 34820197 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/droid_rvef_checkpoint/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b5cae8c8fadd814be6d5d2fd967d6479d08edcae3f40f842f107dda63effd935 3 | size 99789 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/movinet_a2_base/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:78fdb1e081e9fc8d4e10e3bca4fe00117a236ddc4726bbf75594db19ae1be665 3 | size 69 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/movinet_a2_base/ckpt-1.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f393b7ef377ffaf59bd8bf081c72d05e74a576c5bba0d4bc180315432e49e557 3 | size 21240182 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/movinet_a2_base/ckpt-1.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1d801f29eace2f39bcc7b268ecf0d1bd117d9b3881cbcd810721c8c1b1f6c161 3 | size 10102 4 | -------------------------------------------------------------------------------- /model_zoo/DROID-RV/readme.md: -------------------------------------------------------------------------------- 1 | # DROID-RV Inference Example 2 | 3 | This is a simple example script demonstrating how to load and run the DROID-RV and DROID-RVEF models. Model training and inference was performed using the code provided in the ML4H [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). The example below was adapted from the DROID inference code. 4 | 5 | 1. Download DROID docker image. Note: docker image is not compatible with Apple Silicon. 6 | 7 | `docker pull alalusim/droid:latest` 8 | 9 | 2. Pull github repo, including DROID-RV model checkpoints stored using git lfs. 10 | 11 | ``` 12 | github clone https://github.com/broadinstitute/ml4h.git 13 | git lfs pull --include ml4h/model_zoo/DROID-RV/droid_rv_checkpoint/* 14 | git lfs pull --include ml4h/model_zoo/DROID-RV/droid_rvef_checkpoint/* 15 | git lfs pull --include ml4h/model_zoo/DROID-RV/movinet_a2_base/* 16 | ``` 17 | 18 | 3. Run docker image while mounting ml4h directory and run example inference script. 19 | 20 | `docker run -it -v {PATH TO CLONED ML4H DIRECTORY}:/ml4h/ alalusim/droid:latest` 21 | 22 | ``` 23 | cd /ml4h/model_zoo/DROID-RV/ 24 | python droid_rv_inference.py 25 | ``` 26 | 27 | To use with your own data, format echocardiogram videos as tensors with shape (16, 224, 224, 3) before passing to the model. 
Code for data preprocessing, storage, loading, training, and inference can be found in the ML4H [model zoo](https://github.com/broadinstitute/ml4h/tree/master/model_zoo/DROID). 28 | 29 | Model outputs for DROID-RV take the form: 30 | ``` 31 | [ 32 | [["Age", "RVEDD"]], 33 | [["Dilated", "Not Dilated"]], 34 | [["Hypokinetic", "Not Hypokinetic"]], 35 | [["Female", "Male"]] 36 | ] 37 | ``` 38 | 39 | Model outputs for DROID-RVEF take the form: 40 | ``` 41 | [ 42 | [["RVEF", "RV End-Diastolic Volume, "RV End-Systolic Volume", "Age"]], 43 | [["Female", "Male"]] 44 | ] 45 | ``` 46 | -------------------------------------------------------------------------------- /model_zoo/DROID/README.md: -------------------------------------------------------------------------------- 1 | # DROID (Dimensional Reconstruction of Imaging Data) 2 | 3 | DROID is a 3-D convolutional neural network modeling approach for echocardiographic view 4 | classification and quantification of LA dimension, LV wall thickness, chamber diameter and 5 | ejection fraction. 6 | 7 | The DROID echo movie encoder is based on the 8 | [MoViNet-A2-Base](https://tfhub.dev/tensorflow/movinet/a2/base/kinetics-600/classification/3) 9 | video classification model. MoViNet was fine-tuned in a supervised fashion to produce two 10 | specialized encoders: 11 | - DROID-LA 12 | - input views: PLAX, A4C, A2C 13 | - output predictions: LA A/P 14 | - DROID-LV 15 | - input views: PLAX, A4C, A2C 16 | - output predictions: LVEF, LVEDD, LVESD, IVS, PWT 17 | 18 | Multi-instance attention heads were then trained to integrate up to 40 view encodings to predict 19 | a single measurement of each type per echo study. 20 | 21 | ## Requirements 22 | In addition to the `ml4h` repository, DROID also requires `ml4ht_data_source` plus other dependencies. First, clone the 23 | ml4h repositories: 24 | ```commandline 25 | git clone https://github.com/broadinstitute/ml4h.git 26 | git clone https://github.com/broadinstitute/ml4ht_data_source.git 27 | ``` 28 | 29 | For convenience, we provide a docker image containing additional dependencies: 30 | ```commandline 31 | docker run -it --gpus all --rm -v {PARENT_DIRECTORY_OF_REPOS} -v {OPTIONAL_DATA_DIRECTORY} \ 32 | us-central1-docker.pkg.dev/broad-ml4cvd/droid/droid:0.1 /bin/bash 33 | ``` 34 | 35 | Within the docker container, install `ml4ht`: 36 | ```commandline 37 | pip install --user ml4ht_data_source 38 | ``` 39 | 40 | ## Usage 41 | ### Preprocessing 42 | The following scripts are designed to handle echo movies that have been processed and stored in Lightning 43 | Memory-Mapped Database (lmdb) files. We create one lmdb per echo study in which the keys are the filenames of the dicoms and 44 | the values are echo movies that have been anonymized, cropped, and converted to avis. See `echo_to_lmdb.py` for an 45 | example. 46 | 47 | ### Inference 48 | `echo_supervised_inference_recipe.py` can be used to obtain predictions from echo movies given either the DROID-LA or 49 | DROID-LV specialized encoders. 
50 | 51 | An example of parameters to use when running this script are: 52 | ```commandline 53 | python echo_supervised_inference_recipe.py \ 54 | --n_input_frames 16 \ 55 | --output_labels LA_A_P \ 56 | --selected_views A4C --selected_views A2C --selected_views PLAX \ 57 | --selected_doppler standard \ 58 | --selected_quality good \ 59 | --selected_canonical on_axis \ 60 | --split_idx 0 \ 61 | --n_splits 1 \ 62 | --skip_modulo 4 \ 63 | --wide_file {WIDE_FILE_PATH} \ 64 | --splits_file {SPLITS_JSON} \ 65 | --lmdb_folder {LMDB_DIRECTORY_PATH} \ 66 | --pretrained_chkp_dir {SPECIALIZED_ENCODER_PATH} \ 67 | --movinet_chkp_dir {MoViNet-A2-Base_PATH} \ 68 | --output_dir {WHERE_TO_STORE_PREDICTIONS} 69 | ``` -------------------------------------------------------------------------------- /model_zoo/DROID/data_descriptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/DROID/data_descriptions/__init__.py -------------------------------------------------------------------------------- /model_zoo/DROID/echo_defines.py: -------------------------------------------------------------------------------- 1 | category_dictionaries = { 2 | 'view': { 3 | 'PLAX': 0, 4 | 'Ascending_aorta': 1, 5 | 'RV_inflow': 2, 6 | 'RV_focused': 3, 7 | 'Pulmonary_artery': 4, 8 | 'PSAX_AV': 5, 9 | 'PSAX_MV': 6, 10 | 'PSAX_papillary': 7, 11 | 'PSAX_apex': 8, 12 | 'A4C': 9, 13 | 'A5C': 10, 14 | 'A3C': 11, 15 | 'A2C': 12, 16 | 'Suprasternal': 13, 17 | 'Subcostal': 14, 18 | }, 19 | 'doppler': { 20 | 'standard': 0, 21 | 'doppler': 1, 22 | '3-D': 2, 23 | }, 24 | 25 | 'quality': { 26 | 'good': 0, 27 | 'unusable': 1, 28 | }, 29 | 'canonical': { 30 | 'on_axis': 0, 31 | 'off_axis': 1, 32 | }, 33 | 'LV_EjectionFraction': { 34 | 'N': { 35 | 'index': 0, 36 | 'weight': 0.259667, 37 | }, 38 | 'A': { 39 | 'index': 1, 40 | 'weight': 0.862008, 41 | }, 42 | 'I': { 43 | 'index': 2, 44 | 'weight': 0.916131, 45 | }, 46 | 'L': { 47 | 'index': 3, 48 | 'weight': 0.980843, 49 | }, 50 | 'H': { 51 | 'index': 0, 52 | 'weight': 0.981351, 53 | }, 54 | }, 55 | 'LV_FunctionDescription': { 56 | '4.0': { 57 | 'index': 0, 58 | 'weight': 0.520803, 59 | }, 60 | '2.0': { 61 | 'index': 1, 62 | 'weight': 0.662169, 63 | }, 64 | '3.0': { 65 | 'index': 2, 66 | 'weight': 0.817028, 67 | }, 68 | }, 69 | 'LV_CavitySize': { 70 | 'N': { 71 | 'index': 0, 72 | 'weight': 0.209487, 73 | }, 74 | 'D': { 75 | 'index': 1, 76 | 'weight': 0.833406, 77 | }, 78 | 'S': { 79 | 'index': 2, 80 | 'weight': 0.957354, 81 | }, 82 | 'P': { 83 | 'index': 3, 84 | 'weight': 1.0, 85 | }, 86 | }, 87 | 'RV_SystolicFunction': { 88 | 'N': { 89 | 'index': 0, 90 | 'weight': 0.19156206811684748, 91 | }, 92 | 'Y': { 93 | 'index': 1, 94 | 'weight': 2.5944871794871798, 95 | }, 96 | 'A': { 97 | 'index': 2, 98 | 'weight': 4.161422989923915, 99 | }, 100 | 'L': { 101 | 'index': 3, 102 | 'weight': 8.256629946960423, 103 | }, 104 | }, 105 | } 106 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LA_DROID/model/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LA_DROID/model/chkp.data-00000-of-00001: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:29c6b22e560ee68834704974407e5ed3bdd8c6ad5adc4c1064e4a0bc8f75a79f 3 | size 34804093 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LA_DROID/model/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2c25e8c208cc21d755e3be52d32c61994ecf04a27401cb16c17f42f18d9a4482 3 | size 99388 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LV_DROID/model/checkpoint: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb5a38d5763dc7f41acdb9468670ec94f1b31846285558ab69e302bd01917962 3 | size 65 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LV_DROID/model/chkp.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb84c1c904df40ad73dc6048cca7a29a418be2dfcf52dcc0d1f9a6aaa4df4626 3 | size 34816429 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/encoders/LV_DROID/model/chkp.index: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a92657cf830c64e12841d5cff5e13be6bec35ba1399501ed5cb7e913aa050a12 3 | size 99388 4 | -------------------------------------------------------------------------------- /model_zoo/DROID/model_descriptions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/DROID/model_descriptions/__init__.py -------------------------------------------------------------------------------- /model_zoo/ECG2AF/architecture.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:494e8f6bb6f032b21877e4b3fa69a6d9e00c3f5d9fd898c100943db84a2f911e 3 | size 1301867 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/ecg2af_infer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "987044ef-389c-4d67-aed1-ae420a92f35f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "from tensorflow.keras.models import load_model\n", 12 | "from ml4h.models.model_factory import get_custom_objects\n", 13 | "from ml4h.tensormap.ukb.demographics import age_in_days, af_dummy2, sex_dummy1\n", 14 | "from ml4h.tensormap.ukb.survival import mgb_afib_wrt_instance2, mgb_death_wrt_instance2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "abbbac4b-5b97-4e6d-a5ab-0f39df9e41b1", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "output_tensormaps = {tm.output_name(): tm for tm in [mgb_afib_wrt_instance2, mgb_death_wrt_instance2, age_in_days, af_dummy2, sex_dummy1]}\n", 25 | "model = load_model('./ecg2af_quintuplet_v2024_01_13.keras')\n", 26 | "ecg = np.random.random((1, 5000, 
12))\n", 27 | "prediction = model(ecg)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "1998e754-4f5f-4bb3-b611-ce06ad8fee36", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "for name, pred in zip(model.output_names, prediction):\n", 38 | " otm = output_tensormaps[name]\n", 39 | " if otm.is_survival_curve():\n", 40 | " intervals = otm.shape[-1] // 2\n", 41 | " days_per_bin = 1 + otm.days_window // intervals\n", 42 | " predicted_survivals = np.cumprod(pred[:, :intervals], axis=1)\n", 43 | " print(f'AF Risk {otm} prediction is: {str(1 - predicted_survivals[0, -1])}')\n", 44 | " else:\n", 45 | " print(f'{otm} prediction is {pred}')\n", 46 | " " 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3 (ipykernel)", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.11.12" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 5 71 | } 72 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/ecg2af_quintuplet_v2024_01_13.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:de38e009be5613ebb31ee1969bc1cab648d0f11c865e8afbd60e4bb7784f5627 3 | size 43797442 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/ecg_5000_survival_curve_af_quadruple_task_mgh_v2021_05_21.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ece878a5ed4f1523735a35648a8fa2a3086261d2bf38a5db16c8f3d0fc34c667 3 | size 220080440 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/km.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e1f87168d95c01f55f6b19dc460f5329002e4256bb33d0b16a8cd7fff4144969 3 | size 242929 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/salience.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3b4e44790f22eecd5fb972032ea13eac24828ed308e318f4188334d1c8675f97 3 | size 567352 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/strip_II_survival_curve_af_v2021_06_15.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ef58a4b7294d91cb4e4686e6fd4ee72098719d55c43b9b0d487b34066923e0da 3 | size 219947384 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/strip_I_survival_curve_af_v2021_06_15.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d4c6b591d12b8fa4c1c0115adab442c68c7c33ce98c9071e66f7dd4f33f03296 3 | size 219883672 4 | -------------------------------------------------------------------------------- /model_zoo/ECG2AF/study_design.jpg: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3a8f3b219ed14d2a542ae983e78a70e7ea8eaf036ede3ef0f388bcfeb3ccb790 3 | size 241160 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/README.md: -------------------------------------------------------------------------------- 1 | ## ECG PheWAS 2 | This directory contains python notebooks and instructions to create the models and results from 3 | [this NPJ Digital Medicine Paper](https://www.nature.com/articles/s41746-024-01418-9). 4 | 5 | The raw model files are stored using `git lfs` so you must have `git` and `git lfs` installed and localize the full ~135MB autoencoder as well as the component decoder and encoder: 6 | ```bash 7 | git lfs pull --include model_zoo/ECG_PheWAS/*.h5 8 | ``` 9 | 10 | Our model expects ECG median waveforms with 600 milliVolt voltages across 12 leads as input and produces 11 | a 256 dimensional latent space encoding, as well as a reconstructed ECG with the same shape as the input. 12 | The notebook [ecg_write_biosppy_medians.ipynb](./ecg_write_biosppy_medians.ipynb) provides an example of creating these median waveforms from 10 second 12 lead ECGs. 13 | 14 | The electrocardiogram (ECG) is an inexpensive and widely available diagnostic tool, and therefore has great potential 15 | to facilitate disease detection in large-scale populations. 16 | Both cardiac and noncardiac diseases may alter the appearance of the ECG, though the extent to which diseases across 17 | the human phenotypic landscape can be detected on the ECG remains unclear. 18 | We developed an autoencoder model that encodes and reconstructs ECG waveform data within a 19 | multidimensional latent space. 20 | The ECG latent space model demonstrated a greater number of associations than ECG models using standard ECG intervals 21 | alone, and generally resulted in improvements in discrimination of diseases compared to models comprising 22 | only age, sex, and race. 23 | We further demonstrate how a latent space model can be used to generate disease-specific ECG waveforms and facilitate 24 | disease profiling for individual patients. 
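If you only want latent embeddings from the released encoder rather than a full re-training, a minimal sketch follows: load the encoder, pass in a median waveform shaped (600, 12), and read off the 256-dimensional encoding. Treat it as a hedged example: the choice of `encoder_median.h5` and the use of ml4h's `get_custom_objects` when loading are our assumptions based on the files in this directory, and if the encoder was saved with named inputs you may need to pass a dict keyed by the input tensor name instead of a bare array.

```python
import numpy as np
from tensorflow.keras.models import load_model
from ml4h.models.model_factory import get_custom_objects

# Load the released median-waveform encoder (localized via `git lfs pull` above).
# Passing ml4h's custom objects is an assumption; it is harmless if none are needed.
encoder = load_model('./encoder_median.h5', custom_objects=get_custom_objects([]))

# Stand-in for one biosppy median waveform: 600 samples x 12 leads, batch of 1.
median_ecg = np.random.random((1, 600, 12))
latent = encoder.predict(median_ecg)
print(latent.shape)  # expected: (1, 256) if the encoder exposes a single latent output
```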
25 | 26 | To create a model from scratch run: 27 | ```bash 28 | python /path/to/ml4h/ml4h/recipes.py \ 29 | --mode train \ 30 | --tensors /path/to/hd5_tensors/ \ 31 | --output_folder /path/to/output/ \ 32 | --tensormap_prefix ml4h.tensormap.ukb \ 33 | --input_tensors ecg.ecg_biosppy_median_60bpm --output_tensors ecg.ecg_biosppy_median_60bpm \ 34 | --encoder_blocks conv_encode --decoder_blocks conv_decode --activation mish --conv_layers 23 23 \ 35 | --dense_blocks 46 --block_size 5 --dense_layers 256 --dense_normalize layer_norm \ 36 | --batch_size 2 --epochs 96 --training_steps 128 --validation_steps 36 --test_steps 32 --patience 64 \ 37 | --id ecg_median_autoencoder 38 | ``` 39 | 40 | Given this model, infer a latent space with: 41 | ```bash 42 | python /path/to/ml4h/ml4h/recipes.py \ 43 | --mode infer_encoders \ 44 | --tensors /path/to/hd5_tensors/ \ 45 | --output_folder /path/to/output/ \ 46 | --tensormap_prefix ml4h.tensormap.ukb \ 47 | --input_tensors ecg.ecg_biosppy_median_60bpm --output_tensors ecg.ecg_biosppy_median_60bpm \ 48 | --model_file /path/to/ml4h/model_zoo/ECG_PheWAS/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.h5 \ 49 | --id ecg_median_autoencoder 50 | ``` 51 | 52 | With this latent space and phecode diagnoses for the same cohort, the jupyter notebook 53 | [latent_space_phewas](./latent_space_phewas.ipynb) 54 | allows you to conduct the PheWAS analysis. 55 | 56 | ![UKB PheWAS Plot](./ukb_phewas.png) 57 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/decoder_median.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2d6fda0d012dd40f06c806a37d03620a29d9707f9e537b79f8c002cbef6e060c 3 | size 23259088 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/decoder_median.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4d397f589150bf51d96d3fc73cff3c586afa24a2373cb679033e17a5e7318062 3 | size 23270296 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/encoder_median.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f1a09f154272b4c45f75d6e8eb18eb1c36773fc3a9fb44ee693f4a7fd331e1dc 3 | size 21950032 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/encoder_median.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3fabb54ff60d3a7824a9824205a20f8545f0587aaecd43cf7caee2747eb68a7a 3 | size 21963836 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:526f990b8945224bd027ac1ad18365fed5409de3ea01c76aa91d1d5dbdeefa59 3 | size 135412328 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.keras: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 
| oid sha256:26928fe27362a5cda700a97b89606f73ba2853f7f54f2a2ae0adccc4f35c0be8 3 | size 23270296 4 | -------------------------------------------------------------------------------- /model_zoo/ECG_PheWAS/ukb_phewas.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c5b31985c77961e308c42fb02e99293834401271e34709c3f99e5dcf179ca08b 3 | size 309545 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:50ca4fe3dd60c83d92eb4cd96cab4204e48193d6d42ee1704b758ab58675c9be 3 | size 25690896 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR/saved_model.pb -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8340b2ad71f4afa0e7e19595a52e282f04d795ec1389b08f627bfaff04d68053 3 | size 25600481 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR/variables/variables.index -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_I/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_I/saved_model.pb -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_I/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed447ea195ee03ee40911130d0e304dfea0b1e361206d6394387c77068fb6dc3 3 | size 25555425 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_I/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_I/variables/variables.index -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_II/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_II/saved_model.pb -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_II/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | 
oid sha256:59347e3fb27c3fb98194e14b67ff2c2bb8288673425c6084ccc09441822b437a 3 | size 25553411 4 | -------------------------------------------------------------------------------- /model_zoo/PCLR/PCLR_lead_II/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/PCLR/PCLR_lead_II/variables/variables.index -------------------------------------------------------------------------------- /model_zoo/PCLR/get_representations.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | import numpy as np 4 | from tensorflow.keras.models import load_model, Model 5 | 6 | from preprocess_ecg import process_ecg, LEADS 7 | 8 | 9 | def get_model() -> Model: 10 | """Get PCLR embedding model""" 11 | return load_model("./PCLR.h5") 12 | 13 | 14 | def get_representations(ecgs: List[Dict[str, np.ndarray]]) -> np.ndarray: 15 | """ 16 | Uses PCLR trained model to build representations of ECGs 17 | :param ecgs: A list of dictionaries mapping lead name to lead values. 18 | The lead values should be measured in milli-volts. 19 | Each lead should represent 10s of samples. 20 | :return: 21 | """ 22 | model = get_model() 23 | ecgs = np.stack(list(map(process_ecg, ecgs))) 24 | return model.predict(ecgs) 25 | 26 | 27 | def test_get_representations(): 28 | """Test to make sure get_representations works as expected""" 29 | fake_ecg = { 30 | lead: np.zeros(2500) 31 | for lead in LEADS 32 | } 33 | fake_ecgs = [fake_ecg for _ in range(10)] 34 | representations = get_representations(fake_ecgs) 35 | assert representations.shape == (len(fake_ecgs), 320) 36 | -------------------------------------------------------------------------------- /model_zoo/PCLR/preprocess_ecg.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import numpy as np 3 | 4 | 5 | LEADS = [ 6 | 'I', 'II', 'III', 'aVR', 'aVL', 'aVF', 7 | 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 8 | ] 9 | 10 | 11 | def process_ecg(ecg: Dict[str, np.ndarray], ecg_samples: int = 4096) -> np.ndarray: 12 | """ 13 | Prepares an ECG for use in a tensorflow model 14 | :param ecg: A dictionary mapping lead name to lead values. 15 | The lead values should be measured in milli-volts. 16 | Each lead should represent 10s of samples. 17 | :param ecg_samples: Length of each lead for input into the model. 
18 | :return: a numpy array of the ECG shaped (ecg_samples, 12) 19 | """ 20 | assert set(ecg.keys()) == set(LEADS) 21 | 22 | out = np.zeros((ecg_samples, 12)) 23 | for i, lead_name in enumerate(LEADS): 24 | lead = ecg[lead_name] 25 | interpolated_lead = np.interp( 26 | np.linspace(0, 1, ecg_samples), 27 | np.linspace(0, 1, lead.shape[0]), 28 | lead, 29 | ) 30 | out[:, i] = interpolated_lead 31 | return out 32 | -------------------------------------------------------------------------------- /model_zoo/PCLR/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.12.0 2 | astunparse==1.6.3 3 | cachetools==4.2.1 4 | certifi==2020.12.5 5 | chardet==4.0.0 6 | gast==0.3.3 7 | google-auth==1.28.0 8 | google-auth-oauthlib==0.4.4 9 | google-pasta==0.2.0 10 | grpcio==1.37.0 11 | h5py==2.10.0 12 | idna==2.10 13 | importlib-metadata==3.10.0 14 | Keras-Preprocessing==1.1.2 15 | Markdown==3.3.4 16 | numpy==1.22.0 17 | oauthlib==3.1.0 18 | opt-einsum==3.3.0 19 | protobuf==3.15.7 20 | pyasn1==0.4.8 21 | pyasn1-modules==0.2.8 22 | requests==2.25.1 23 | requests-oauthlib==1.3.0 24 | rsa==4.7.2 25 | scipy==1.4.1 26 | six==1.15.0 27 | tensorboard==2.4.1 28 | tensorboard-plugin-wit==1.8.0 29 | tensorflow==2.7.2 30 | tensorflow-estimator==2.3.0 31 | termcolor==1.1.0 32 | typing-extensions==3.7.4.3 33 | urllib3==1.26.5 34 | Werkzeug==1.0.1 35 | wrapt==1.12.1 36 | zipp==3.4.1 37 | -------------------------------------------------------------------------------- /model_zoo/README.md: -------------------------------------------------------------------------------- 1 | # How to Add an Animal to the Model Zoo: 2 | 3 | ## Create a new folder in this directory with your model name 4 | Create a `README.md` file in your model directory with documentation and code snippets. 5 | 6 | ## Document model inputs, outputs, and performance 7 | What shape data does the model expect? What data was used to train it? How does it perform? 8 | 9 | ## Add the model weights and architecture file(s) with `git lfs` 10 | Add `.h5` or `.keras` or `.pb` model files to the model_zoo directory using git Large File Storage. 11 | 12 | ## Add code snippets for model loading and inference 13 | 14 | ## Add code snippets or command lines for model training. 15 | 16 | ## Describe model architecture, interpretability and use-cases. 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /model_zoo/adiposity_mlandepi/README.md: -------------------------------------------------------------------------------- 1 | # Machine learning on >40,000 body MRIs and fat distribution 2 | 3 | 1. Ingest the provided whole-body MRI `zip` files using `ingest.py` 2. Compute the 2D projections using the ingested 3D volumes with `compute_projections.py` 3. Train deep learning-based model(s) using `train.py` 4.
Phenotype characterization and disease association with `downstream_associations_v3.ipynb` 7 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/Lreg.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:660c74be8d5128d33d305bc6dc7df89d8ccb1966f0e5118c26dbffaa65b950dc 3 | size 31427 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/Lseg.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c0e989cfebab9c505a8c7f76a3a92153583d63420cdc2a83e4722a4fc16ead02 3 | size 28767 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/calibrations_sax_all_diastole_segmented.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cc12e1edae86b3d6d75a48c7853dba94c8de1e561159678312f2decf02c072e1 3 | size 115323 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/metric_history_sax_diastole_segment_no_flat.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9f9e5414cb0fc28ec65062bda8e1d6e52ca587b311be05c2039a42cd96ad559a 3 | size 509238 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/per_class_roc_sax_all_diastole_segmented.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b2433bef8c35a7ab8eb7bf876d8a1e464a04b59bea195aa6757c86e3ea2c49f6 3 | size 44993 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/precision_recall_sax_all_diastole_segmented.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:698bbd48917a1f1d86b1be7d56cc0dae1d76ca1cdd082e86b2796e1c8d31bcce 3 | size 50560 4 | -------------------------------------------------------------------------------- /model_zoo/cardiac_mri_derived_left_ventricular_mass/sax_diastole_segment_no_flat.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:05f8fe4843d04cd526b8f2c2424ac98554238089e43a4376ffb2e12122206e4f 3 | size 15191464 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/decoder_ecg_rest_median_raw_10.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:256c617fe91863c4994b367f0e1efbcbf7ff17378c593ba7498ce3d96fd66c4e 3 | size 23230192 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/decoder_lax_4ch_heart_center.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:5fa8a2f7dde41668dfe52b4465e8d9a6972956e553486b8b76f9400ffc6f5b41 3 | size 7010232 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/dropout_pair_contrastive_lax_4ch_cycle_ecg_median_10_pretrained_256d_v2020_06_07.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fc7f8e9b5d5c1b17992d7c676b64f269f6f67248b07a68efe312e93e53934dff 3 | size 177298400 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/encoder_ecg_rest_median_raw_10.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:209f4d9a6ebcee02ced493a320d2116787b9cfaba2476a653e442581557475d0 3 | size 21916216 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/encoder_lax_4ch_heart_center.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:72d98c3ff072ea0fdac4753986ca913394bfff231dc9d9cc130bbe4fe276cdac 3 | size 6986488 4 | -------------------------------------------------------------------------------- /model_zoo/dropfuse/overview.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d18e2285270165380316c9e8b5919a3cc84b211e018c1cb3ec8faaf56a6d79e8 3 | size 405082 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning to Predict Cardiac Magnetic Resonance-Derived Left Ventricular Mass and Hypertrophy from 12-Lead Electrocardiograms 2 | 3 | This folder contains models and code supporting the work described in [this paper](https://www.ahajournals.org/doi/10.1161/CIRCIMAGING.120.012281?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub%20%200pubmed) published in the journal Circulation: Cardiovascular Imaging. 4 | 5 | # LVM-AI 6 | Left Ventricular Mass-Artificial Intelligence (LVM-AI) is a one-dimensional convolutional neural network trained to predict CMR-derived LV mass using 12-lead ECGs. LVM-AI was trained in 32,239 individuals from the UK Biobank with paired CMR and 12-lead ECG. It was provided with the entire 10 seconds of the 12-lead ECG waveform as well as participant age, sex, and BMI. 7 | LVM-AI was evaluated in a UK Biobank test set as well as an external health care–based Mass General Brigham (MGB) dataset. In both test sets, LVM-AI was compared with traditional ECG-based rules for diagnosing CMR-derived left ventricular hypertrophy. Associations between LVM-AI-predicted LV mass index and incident cardiovascular events were tested in the UK Biobank and a separate MGB-based ambulatory cohort (MGB outcomes). 8 | ![Overview of the training and test samples](TrainingAndTestSets.jpg) 9 | When compared with any ECG rule, LVM-AI demonstrated similar LVH discrimination in the UK Biobank (LVM-AI c-statistic 0.653 [95% CI, 0.608-0.698] versus any ECG rule c-statistic 0.618 [95% CI, 0.574-0.663], P=0.11) and superior discrimination in MGB (0.621; 95% CI, 0.592-0.649 versus 0.588; 95% CI, 0.564-0.611, P=0.02).
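For readers who want to reproduce this kind of comparison on their own test set, the sketch below computes a c-statistic (AUROC) with a 95% bootstrap confidence interval from predicted probabilities and binary CMR-derived LVH labels. This is a minimal illustration, not the analysis code used in the paper; `lvh_true`, `lvm_ai_prob`, and `ecg_rule_pred` are hypothetical placeholder arrays.

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def cstat_with_ci(y_true, y_score, n_boot=1000, seed=7):
    """C-statistic (AUROC) with a 95% bootstrap confidence interval."""
    rng = np.random.default_rng(seed)
    point = roc_auc_score(y_true, y_score)
    boots = []
    for _ in range(n_boot):
        idx = rng.integers(0, len(y_true), len(y_true))
        if len(np.unique(y_true[idx])) < 2:  # resample must contain both classes
            continue
        boots.append(roc_auc_score(y_true[idx], y_score[idx]))
    lo, hi = np.percentile(boots, [2.5, 97.5])
    return point, lo, hi

# Hypothetical inputs: binary LVH labels, LVM-AI probabilities, and a 0/1 ECG rule.
lvh_true = np.random.randint(0, 2, 500)
lvm_ai_prob = np.random.rand(500)
ecg_rule_pred = np.random.randint(0, 2, 500)

print("LVM-AI c-statistic (95% CI):", cstat_with_ci(lvh_true, lvm_ai_prob))
print("ECG rule c-statistic (95% CI):", cstat_with_ci(lvh_true, ecg_rule_pred))
```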
10 | 11 | 12 | # Models 13 | Three pre-trained models are included here: 14 | The model `ecg_rest_raw_age_sex_bmi_lvm_asymmetric_loss.h5` takes as input a 12 Lead resting ECG, as well as age, sex and BMI and has two outputs: one which regresses the left ventricular mass, and a second which gives a probability of left ventricular hypertrophy. This model was trained with the asymmetric loss described in the paper. 15 | The model `ecg_rest_raw_lvm_asymmetric_loss.h5` takes only an ECG as input and regresses left ventricular mass. This model was also trained with the asymmetric loss. 16 | The third model, `ecg_rest_raw_lvm_symmetric_loss.h5` takes only an ECG as input and regresses left ventricular mass. This model was trained with the symmetric logcosh loss. The raw voltage values from the ECG are normalized by dividing by 2000 prior to being input to the model. -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/TrainingAndTestSets.jpg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6bea557c5150adbec08f1e50410d7b61c5ce228752c4b9785f7243f5e88bcd69 3 | size 175820 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/ecg_rest_raw_age_sex_bmi_lvm_asymmetric_loss.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0764989632149590ff47f86e665cec31da592b7b6f2275f3c9ac9f949ccce017 3 | size 14870648 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/ecg_rest_raw_lvm_asymmetric_loss.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:236619e5a99e6e44bd37f35b88ff86a5d5ea4ca3f98dff437cdb0dd7f636015c 3 | size 12720392 4 | -------------------------------------------------------------------------------- /model_zoo/left_ventricular_mass_from_ecg_student_and_mri_teacher/ecg_rest_raw_lvm_symmetric_loss.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:370f9f672bdbd4e61ce1f439b9a58b3150ed7b8cc9a04f49e2923f297c3e1491 3 | size 12720392 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/README.md: -------------------------------------------------------------------------------- 1 | # Machine learning enables new insights into clinical significance of and genetic contributions to liver fat accumulation 2 | 3 | This folder contains models and code supporting the work described in [this paper](https://www.sciencedirect.com/science/article/pii/S2666979X21000823) published in Cell Genomics. 4 | 5 | Here we host two models for estimating liver fat from abdominal MRI. 6 | The liver fat percentage training data is from the returned liver fat values in the [UK Biobank field ID 22402](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=22402). These values were only calculated for the echo protocol, so to infer liver fat from the ideal protocol we used a teacher/student modeling approach. 
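The teacher/student step can be summarized with the short sketch below: the teacher's predictions on echo-protocol scans become the regression targets for a student model that only sees ideal-protocol scans from the same individuals. This is a hedged illustration of the idea rather than the actual training pipeline; the random arrays are hypothetical stand-ins for real MRI volumes, the tiny student network is purely illustrative, and the input shapes and student batch size follow the Teacher Model and Student Model sections below.

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model

teacher = load_model("liver_fat_from_echo.h5", compile=False)  # echo-protocol teacher

# Hypothetical stand-ins for paired data from subjects with both acquisitions.
echo_mris = np.random.rand(16, 160, 160, 10).astype(np.float32)
ideal_mris = np.random.rand(16, 232, 256, 36).astype(np.float32)

# 1) The teacher infers liver fat percentages; these become the student's labels.
teacher_liver_fat = teacher.predict(echo_mris).ravel()

# 2) An illustrative student regresses those labels from the ideal-protocol scans.
student = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(232, 256, 36)),
    tf.keras.layers.Conv2D(8, 3, activation="relu"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(1),
])
student.compile(optimizer="adam", loss="mse")  # illustrative loss choice
student.fit(ideal_mris, teacher_liver_fat, batch_size=5, epochs=1)
```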
7 | 8 | ## Teacher Model 9 | The teacher model was trained with abdominal MRIs acquired using the [echo protocol, UK Biobank field ID 20203](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20203). 10 | This model takes input of shape 160 x 160 x 10 and emits a scalar representing estimated liver fat percentage. 11 | The input TensorMap is defined at `tensormap.ukb.mri.gre_mullti_echo_10_te_liver`. 12 | The output TensorMap associated with these values is defined at `tensormap.ukb.mri.liver_fat`. 13 | The keras model file is at [liver_fat_from_echo.h5](liver_fat_from_echo.h5) and the model architecture is shown below. The "?" in the input dimension represents the batch size of the input, which can be determined at runtime. When training the teacher model we used a batch size of 8. 14 | ![https://www.medrxiv.org/content/10.1101/2020.09.03.20187195v1](liver_fat_from_echo_teacher_model.png) 15 | 16 | 17 | ## Student Model 18 | The teacher model made inferences on all available MRIs acquired with the echo protocol, which includes some individuals who also had abdominal MRI with the [ideal protocol, UK Biobank field ID 20254](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=20254). 19 | The student model was trained on these individuals, using the teacher model's inferences as truth data and abdominal MRIs acquired with the ideal protocol as input. 20 | This model takes input of shape 232 x 256 x 36 and also emits a scalar representing estimated liver fat percentage. 21 | The input TensorMap is defined at `tensormap.ukb.mri.lms_ideal_optimised_low_flip_6dyn`. 22 | The output TensorMap associated with these values is defined at `tensormap.ukb.mri.liver_fat_echo_predicted`. 23 | The keras model file is at [liver_fat_from_ideal.h5](liver_fat_from_ideal.h5) and the model architecture is shown below. The "?" in the input dimension represents the batch size of the input, which can be determined at runtime. When training the student model we used a batch size of 5. 
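Either model file can be loaded directly with Keras for quick experimentation. The snippet below is a minimal sketch for the student model; the random array merely stands in for a real ideal-protocol acquisition, and the leading dimension is the batch size. The teacher model can be used analogously with an input of shape (1, 160, 160, 10).

```python
import numpy as np
from tensorflow.keras.models import load_model

student = load_model("liver_fat_from_ideal.h5", compile=False)

volume = np.random.rand(1, 232, 256, 36).astype(np.float32)  # one fake ideal-protocol volume
estimated_liver_fat = float(student.predict(volume).squeeze())
print(f"Estimated liver fat: {estimated_liver_fat:.2f}%")
```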
24 | ![Architecture Diagram](liver_fat_from_ideal_student_model.png) 25 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_echo.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:624ae5c619c9831b822fe586f657677d4b9938b38da7148f459e5a429a801df4 3 | size 4926304 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_echo_teacher_model.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bf649630b906da34cd46647771986c3e7d0911e63093eb28d67a0983f8becda6 3 | size 380259 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_ideal.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:31c61e2fbe7eb685e7b86a069c97cbfaba8224ece0145522718640acfcb07017 3 | size 9896304 4 | -------------------------------------------------------------------------------- /model_zoo/liver_fat_from_mri_ukb/liver_fat_from_ideal_student_model.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2f15f673801826361835d0631f9de1aca36493e7ef695f78b13b11348551ff94 3 | size 433324 4 | -------------------------------------------------------------------------------- /model_zoo/mi_feature_selection/README.md: -------------------------------------------------------------------------------- 1 | # ML4HEN-COX 2 | 3 | ## Description 4 | 5 | This repo contains code to perform feature selection in very large datasets --- in both number of samples and number of covariates --- using survival data. 6 | 7 | ## Requirements 8 | 9 | This code was tested with Python 3.7. 10 | It can be run inside a virtual environment: 11 | 12 | ```bash 13 | python3 -m venv env 14 | source env/bin/activate 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## Usage 19 | 20 | Several files are provided: 21 | 22 | * [coxnet_training_testing_evaluating.py](./coxnet_training_testing_evaluating.py): fitting CoxNet models 23 | * [xgboost_training_testing_evaluating.py](./xgboost_training_testing_evaluating.py): fitting XgCox models 24 | * [2020.11.30_analysis_cleaned2.r](./2020.11.30_analysis_cleaned2.r): downstream R code 25 | * [2020.11.30_analysis_cleaned2.ipynb](./2020.11.30_analysis_cleaned2.ipynb): downstream notebook 26 | 27 | ### Model loading 28 | 29 | The provided XGBoost model can be loaded as follows: 30 | 31 | ```py 32 | import xgboost as xgb 33 | xgcox = xgb.Booster() 34 | xgcox.load_model("models/xgcox_model.json") 35 | ``` 36 | 37 | and the CoxNet model as: 38 | 39 | ```py 40 | import pickle 41 | from sksurv.linear_model import CoxnetSurvivalAnalysis 42 | coxnet = pickle.load(open('models/coxnet_survival_05_final.pickle', 'rb')) 43 | ``` 44 | 45 | ### Citation 46 | 47 | **[Selection of 51 predictors from 13,782 candidate multimodal features using machine learning improves coronary artery disease prediction](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8672148/)**, Saaket Agrawal, BS*, Marcus D. R. Klarqvist, PhD, MSc, MSc*, Connor Emdin, DPhil, MD, Aniruddh P. Patel, MD, Manish D. Paranjpe, BA, Patrick T.
Ellinor, MD, PhD, Anthony Philippakis, MD, PhD, Kenney Ng, PhD, Puneet Batra, PhD, Amit V. Khera, MD, MSc 48 | 49 | -------------------------------------------------------------------------------- /model_zoo/mi_feature_selection/models/coxnet_survival_05_final.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/model_zoo/mi_feature_selection/models/coxnet_survival_05_final.pickle -------------------------------------------------------------------------------- /model_zoo/mi_feature_selection/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit_survival==0.13.1 2 | fastparquet==0.5.0 3 | xgboost==1.1.0_SNAPSHOT 4 | numpy==1.18.5 5 | pandas==1.1.5 6 | pyarrow==0.16.0 7 | scikit_learn==0.24.2 8 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the code and notebooks used in our paper: ["Genetic Architectures of Medical Images Revealed by Registration of Multiple Modalities"](https://doi.org/10.1177/11779322241282489) 2 | 3 | We show the systematic importance of registration for finding genetic signals directly from medical imaging modalities. 4 | This is demonstrated across a wide range of registration techniques. 5 | Our multimodal autoencoder comparison framework allows us to learn representations of medical images before and after registration. 6 | The learned registration methods considered are graphically summarized here: 7 | ![Learned Registration Methods](./registration.png) 8 | 9 | For example, to train a uni-modal autoencoder for DXA 2 scans: 10 | ```bash 11 | python /path/to/ml4h/ml4h/recipes.py \ 12 | --mode train \ 13 | --tensors /path/to/hd5_tensors/ \ 14 | --output_folder /path/to/output/ \ 15 | --tensormap_prefix ml4h.tensormap.ukb \ 16 | --input_tensors dxa.dxa_2 --output_tensors dxa.dxa_2 \ 17 | --encoder_blocks conv_encode --merge_blocks --decoder_blocks conv_decode \ 18 | --activation swish --conv_layers 32 --conv_width 31 --dense_blocks 32 32 32 32 32 --dense_layers 256 --block_size 3 \ 19 | --inspect_model --learning_rate 0.0001 \ 20 | --batch_size 4 --epochs 216 --training_steps 128 --validation_steps 36 --test_steps 4 --patience 36 \ 21 | --id dxa_2_autoencoder_256d 22 | ``` 23 | 24 | To train the cross-modal (DXA 2 <-> DXA 5) registration with the DropFuse model, the command line is: 25 | ```bash 26 | python /path/to/ml4h/ml4h/recipes.py \ 27 | --mode train \ 28 | --tensors /path/to/hd5_tensors/ \ 29 | --output_folder /path/to/output/ \ 30 | --tensormap_prefix ml4h.tensormap.ukb \ 31 | --input_tensors dxa.dxa_2 dxa.dxa_5 --output_tensors dxa.dxa_2 dxa.dxa_5 \ 32 | --encoder_blocks conv_encode --merge_blocks pair --decoder_blocks conv_decode \ 33 | --pairs dxa.dxa_2 dxa.dxa_5 --pair_loss contrastive --pair_loss_weight 0.1 --pair_merge dropout \ 34 | --activation swish --conv_layers 32 --conv_width 31 --dense_blocks 32 32 32 32 32 --dense_layers 256 --block_size 3 \ 35 | --inspect_model --learning_rate 0.0001 \ 36 | --batch_size 4 --epochs 216 --training_steps 128 --validation_steps 36 --test_steps 4 --patience 36 \ 37 | --id dxa_2_5_dropfuse_256d 38 | ``` 39 | Similarly, autoencoders and cross-modal fusion for all the modalities considered in the paper can be trained by changing the
`--input_tensors` and `--output_tensors` arguments to point at the appropriate `TensorMap`, and if necessary updating the model architecture hyperparameters. 40 | Table 1 lists all the modalities included in the paper. 41 | ![Table of modalities](./table1.png) 42 | 43 | Then, by inferring latent spaces with the models before and after registration, we can evaluate their learned representations. 44 | ```bash 45 | python /home/sam/ml4h/ml4h/recipes.py \ 46 | --mode infer_encoders \ 47 | --tensors /path/to/hd5_tensors/ \ 48 | --output_folder /path/to/output/ \ 49 | --tensormap_prefix ml4h.tensormap.ukb \ 50 | --input_tensors dxa.dxa_2 --output_tensors dxa.dxa_2 \ 51 | --id dxa_2_autoencoder_256d \ 52 | --model_file /path/to/output/dxa_2_autoencoder_256d/dxa_2_autoencoder_256d.h5 53 | ``` 54 | 55 | We compare the strength and number of biological signals found with the [Latent Space Comparisons notebook](./latent_space_comparisons.ipynb). 56 | This notebook is used to populate the data summarized in Table 2 of the paper. 57 | ![Table of results](./table2.png) 58 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/registration.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a61ad1186235a5c24958202a73279d4a9b0f08a13c82b85e92f40d38e4c0119c 3 | size 529604 4 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/table1.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fa54d40f5143e9efad3ce5e01c53bc950bca9301a7ee6a7c8c0edcdcc3d7ea01 3 | size 259349 4 | -------------------------------------------------------------------------------- /model_zoo/registration_reveals_genetics/table2.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c0a056860884eafcd450fa22df45ab19b4513fa3e35572354187a9ed1d6b89f1 3 | size 344842 4 | -------------------------------------------------------------------------------- /model_zoo/silhouette_mri/README.md: -------------------------------------------------------------------------------- 1 | # Estimating body fat distribution from silhouette images 2 | 3 | ## Description 4 | 5 | This repo contains code to prepare silhouettes from UK Biobank whole-body magnetic resonance images and to train deep-learning models to estimate fat-depot volumes. 6 | 7 | ## Usage 8 | 9 | Several files are provided: 10 | 11 | * [ingest_mri.py](../../ml4h/applications/ingest/ingest_mri.py): ingesting UKB MRI data 12 | * [two_d_projection.py](../../ml4h/applications/ingest/two_d_projection.py): computing 2-dimensional projections 13 | * [ingest_autosegment.py](../../ml4h/applications/ingest/ingest_autosegment.py): autosegmenting axial slices 14 | * [train_models.py](./train_models.py): training deep-learning models 15 | * [callbacks.py](./callbacks.py): supporting callbacks required during training 16 | * [shrinkage_loss.py](./shrinkage_loss.py): supporting loss function required during training 17 | 18 | ### Citation 19 | 20 | **[Estimating body fat distribution - a driver of cardiometabolic health - from silhouette images](https://www.medrxiv.org/content/10.1101/2022.01.14.22269328v2)**, Marcus D. R. Klarqvist, PhD*, Saaket Agrawal, BS*, Nathaniel Diamant, BS, Patrick T.
Ellinor, MD, PhD, Anthony Philippakis, MD, PhD, Kenney Ng, PhD, Puneet Batra, PhD, Amit V. Khera, MD 21 | -------------------------------------------------------------------------------- /model_zoo/silhouette_mri/callbacks.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Marcus D. R. Klarqvist, PhD, MSc 4 | # https://github.com/mklarqvist/tf-computer-vision 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | import tensorflow as tf 24 | 25 | class LossHistory(tf.keras.callbacks.Callback): 26 | def __init__(self, decay_function): 27 | self.decay_function = decay_function 28 | self.async_safe = True 29 | 30 | def on_train_begin(self, logs={}): 31 | self.losses = [] 32 | self.lr = [] 33 | 34 | def on_epoch_end(self, batch, logs={}): 35 | self.losses.append(logs.get("loss")) 36 | self.lr.append(self.decay_function(len(self.losses))) 37 | 38 | 39 | class BatchMetricsLogger(tf.keras.callbacks.Callback): 40 | """Callback function used during `model.evaluate` calls with batch size set to 1. 41 | This approach stores `tf.keras.metrics` callback results for each test example. 42 | Only data generated by `tf.keras.metrics` functions will produce correct results! 43 | 44 | Example use: 45 | ```python 46 | metrics_dict = {m.name: m for m in model.metrics} 47 | logger = BatchMetricsLogger(metrics = metrics_dict) 48 | eval = model.evaluate(test_ds, callbacks=[logger], verbose=1) 49 | eval_batch = pd.DataFrame(logger.storage, index = test_data.index) 50 | ``` 51 | """ 52 | 53 | def __init__(self, metrics): 54 | super(BatchMetricsLogger, self).__init__() 55 | self.metrics = metrics 56 | self.storage = [] 57 | self.async_safe = True 58 | 59 | # 60 | def on_test_batch_end(self, batch, logs=None): 61 | self.storage.append(logs) 62 | -------------------------------------------------------------------------------- /phenotype_labels/disease/DATES.md: -------------------------------------------------------------------------------- 1 | # Dates in the UK Biobank 2 | 3 | ## Attended assessment center 4 | 5 | * Date FieldID 53 6 | 7 | Useful for: 8 | 9 | 1. Defining threshold date for incidence 10 | 1. 
Defining dates for things that don't otherwise have an associated date 11 | 12 | ## Birth 13 | 14 | * Date FieldID 34: Year of birth 15 | * Date FieldID 52: Month of birth 16 | * Date Field 33: birth date (*Note: this field is restricted due to its precision*) 17 | 18 | ## Lost to follow-up 19 | 20 | * Date FieldID 191 21 | 22 | ## Died 23 | 24 | * Date FieldID 40000 25 | 26 | ## ICD10 27 | 28 | * Date FieldID ==> derived from HESIN (Hospital Episode Statistics data in Showcase) 29 | * Main ICD10: 41202 30 | * Secondary ICD10: 41204 31 | * ICD10 Primary Cause of Death: 40001 32 | * ICD10 Secondary Cause of Death: 40002 33 | 34 | ## ICD9 35 | 36 | * Date FieldID ==> derived from HESIN 37 | * Main ICD9: 41203 38 | * Secondary ICD9: 41205 39 | 40 | ## Operation (OPCS4) 41 | 42 | * Date FieldID ==> derived from HESIN 43 | * Main OPCS4: 41200 44 | * Secondary OPCS4: 41210 45 | * Self-reported: 46 | * FieldID: 20004 47 | * *float32 Year*: 20010 (need to truncate and add month/day) 48 | 49 | ## Special cases 50 | 51 | * Subarachnoid hemorrhage 52 | * FieldID: 42013 53 | * Date: 42012 54 | * Intracerebral hemorrhage 55 | * FieldID: 42011 56 | * Date: 42010 57 | * Hemorrhagic stroke 58 | * FieldID: 42009 59 | * Date: 42008 60 | * Stroke 61 | * FieldID: 42007 62 | * Date: 42006 63 | * Myocardial infarction 64 | * FieldID: 42001 65 | * Date: 42000 66 | * Self-reported operation 67 | * FieldID: 20004 68 | * Date: 20010 69 | * Non-cancer illness 70 | * FieldID: 20002 71 | * Date: 20008 72 | * Cancer 73 | * FieldID: 20001 74 | * Date: 20006 75 | -------------------------------------------------------------------------------- /phenotype_labels/disease/README.md: -------------------------------------------------------------------------------- 1 | # CVDI/Disease 2 | CVDI/Disease computes derived phenotypes, with incidence, prevalence, death, and censoring, from tabfiles (Seung Hoan's phenotype definition format). As a brief review, tabfiles contain 3 columns: Field, Coding, and exclude. E.g.: 3 | 4 | ``` 5 | Field Coding exclude 6 | 20002 1076,1079 0 7 | ``` 8 | 9 | *Field* is the UK Biobank FieldID (e.g., `phenotype.FieldID`) 10 | 11 | *Coding* is the UK Biobank value (e.g., `phenotype.value` or `coding.coding`) 12 | 13 | *Exclude* is whether the row is an exclusion criterion (1) or not (0) 14 | 15 | # Install updated dependencies 16 | `go get -u` 17 | 18 | # Build (examples) 19 | `go build -o cvdidisease.osx *.go` 20 | 21 | `GOOS=linux go build -o cvdidisease.linux *.go` 22 | 23 | # Database dependencies 24 | This requires the materialized tables (defined in the SQL files in this directory) to exist in tables with the same name as their filename (except the suffix). 
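To make the tabfile semantics concrete, here is a minimal, hypothetical Python sketch of how such a file can be grouped into inclusion and exclusion criteria per FieldID. The production parser in this directory is the Go `ParseTabFile` used by `main.go`; this snippet only illustrates the file format.

```python
import csv
from collections import defaultdict

def parse_tabfile(path):
    """Group tab-separated tabfile rows into include/exclude codes keyed by UKBB FieldID."""
    include, exclude = defaultdict(set), defaultdict(set)
    with open(path) as f:
        for row in csv.DictReader(f, delimiter="\t"):
            codes = {c.strip() for c in row["Coding"].split(",") if c.strip()}
            target = exclude if row["exclude"].strip() == "1" else include
            target[int(row["Field"])] |= codes
    return include, exclude

# The example row "20002 <tab> 1076,1079 <tab> 0" contributes {20002: {"1076", "1079"}}
# to the inclusion criteria and nothing to the exclusions.
```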
-------------------------------------------------------------------------------- /phenotype_labels/disease/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "flag" 7 | "log" 8 | "os" 9 | "path" 10 | "strings" 11 | 12 | "cloud.google.com/go/bigquery" 13 | ) 14 | 15 | type WrappedBigQuery struct { 16 | Context context.Context 17 | Client *bigquery.Client 18 | Project string 19 | Database string 20 | } 21 | 22 | var ( 23 | BufferSize = 4096 * 8 24 | STDOUT = bufio.NewWriterSize(os.Stdout, BufferSize) 25 | ) 26 | 27 | var materializedDB string 28 | 29 | func main() { 30 | defer STDOUT.Flush() 31 | 32 | var BQ = &WrappedBigQuery{ 33 | Context: context.Background(), 34 | } 35 | var tabfile string 36 | var displayQuery bool 37 | var override bool 38 | var diseaseName string 39 | 40 | flag.StringVar(&BQ.Project, "project", "", "Google Cloud project you want to use for billing purposes only") 41 | flag.StringVar(&BQ.Database, "database", "", "BigQuery source database name (note: must be formatted as project.database, e.g., broad-ml4cvd.ukbb7089_201904)") 42 | flag.StringVar(&tabfile, "tabfile", "", "Tabfile-formatted phenotype definition") 43 | flag.StringVar(&materializedDB, "materialized", "broad-ml4cvd.ukbb7089_201904", "project.database storing materialized view tables") 44 | flag.BoolVar(&displayQuery, "display-query", false, "Display the constructed query and exit?") 45 | flag.BoolVar(&override, "override", false, "Force run, even if this tool thinks your tabfile is inadequate?") 46 | flag.StringVar(&diseaseName, "disease", "", "If not specified, the tabfile will be parsed and become the disease name.") 47 | flag.Parse() 48 | 49 | if BQ.Project == "" || BQ.Database == "" || tabfile == "" || materializedDB == "" { 50 | flag.PrintDefaults() 51 | os.Exit(1) 52 | } 53 | 54 | tabs, err := ParseTabFile(tabfile) 55 | if err != nil { 56 | log.Fatalln(err) 57 | } 58 | 59 | if diseaseName == "" { 60 | diseaseName = path.Base(tabfile) 61 | if parts := strings.Split(diseaseName, "."); len(parts) > 1 { 62 | diseaseName = strings.Join(parts[0:len(parts)-1], ".") 63 | } 64 | } 65 | 66 | log.Println("Processing disease", diseaseName) 67 | 68 | missingFields, err := tabs.CheckSensibility() 69 | if err != nil && !override { 70 | log.Println(err) 71 | log.Fatalf("%s: Add the missing fields to your tabfile, or re-run with the -override flag to process anyway.\n", diseaseName) 72 | } else if err != nil && override { 73 | log.Println(diseaseName, err) 74 | log.Printf("%s: Overriding error check for missing fields and continuing.\n", diseaseName) 75 | } 76 | 77 | BQ.Client, err = bigquery.NewClient(BQ.Context, BQ.Project) 78 | if err != nil { 79 | log.Fatalln("Connecting to BigQuery:", err) 80 | } 81 | defer BQ.Client.Close() 82 | 83 | query, err := BuildQuery(BQ, tabs, displayQuery) 84 | if err != nil { 85 | log.Fatalln(diseaseName, err) 86 | } 87 | 88 | if displayQuery { 89 | return 90 | } 91 | 92 | if err := ExecuteQuery(BQ, query, diseaseName, missingFields); err != nil { 93 | log.Fatalln(diseaseName, err) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /phenotype_labels/disease/materialized_hesin_dates.sql: -------------------------------------------------------------------------------- 1 | WITH oper4 AS ( 2 | SELECT 41200 FieldID, eid, oper4 code, 3 | CASE 4 | WHEN h.admidate IS NOT NULL THEN h.admidate 5 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL 
THEN h.opdate 6 | ELSE h.epistart 7 | END vdate 8 | FROM `broad-ml4cvd.ukbb7089_201904.hesin` h 9 | WHERE oper4 IS NOT NULL 10 | ), diag_icd10 AS ( 11 | SELECT 41202 FieldID, eid, diag_icd10 code, 12 | CASE 13 | WHEN h.admidate IS NOT NULL THEN h.admidate 14 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 15 | ELSE h.epistart 16 | END vdate 17 | FROM `broad-ml4cvd.ukbb7089_201904.hesin` h 18 | WHERE diag_icd10 IS NOT NULL 19 | ), diag_icd9 AS ( 20 | SELECT 41203 FieldID, eid, diag_icd9 code, 21 | CASE 22 | WHEN h.admidate IS NOT NULL THEN h.admidate 23 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 24 | ELSE h.epistart 25 | END vdate 26 | FROM `broad-ml4cvd.ukbb7089_201904.hesin` h 27 | WHERE diag_icd9 IS NOT NULL 28 | ), oper4secondary AS ( 29 | SELECT 41210 FieldID, h.eid, sec.oper4 code, 30 | CASE 31 | WHEN h.admidate IS NOT NULL THEN h.admidate 32 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 33 | ELSE h.epistart 34 | END vdate 35 | FROM `broad-ml4cvd.ukbb7089_201904.hesin_oper` sec 36 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id 37 | WHERE TRUE 38 | AND sec.oper4 IS NOT NULL 39 | ), diag_icd10_secondary AS ( 40 | SELECT 41204 FieldID, h.eid, sec.diag_icd10 code, 41 | CASE 42 | WHEN h.admidate IS NOT NULL THEN h.admidate 43 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 44 | ELSE h.epistart 45 | END vdate 46 | FROM `broad-ml4cvd.ukbb7089_201904.hesin_diag10` sec 47 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id 48 | WHERE TRUE 49 | AND sec.diag_icd10 IS NOT NULL 50 | ), diag_icd9_secondary AS ( 51 | SELECT 41205 FieldID, h.eid, sec.diag_icd9 code, 52 | CASE 53 | WHEN h.admidate IS NOT NULL THEN h.admidate 54 | WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate 55 | ELSE h.epistart 56 | END vdate 57 | FROM `broad-ml4cvd.ukbb7089_201904.hesin_diag9` sec 58 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id 59 | WHERE TRUE 60 | AND sec.diag_icd9 IS NOT NULL 61 | ) 62 | 63 | SELECT 64 | diagnostics.eid sample_id, diagnostics.FieldID, diagnostics.code value, 65 | CASE 66 | WHEN MIN(PARSE_DATE("%E4Y-%m-%d", vdate)) IS NULL THEN MIN(PARSE_DATE("%E4Y-%m-%d", p.value)) 67 | ELSE MIN(PARSE_DATE("%E4Y-%m-%d", vdate)) 68 | END first_date 69 | FROM ( 70 | SELECT * FROM oper4 71 | UNION DISTINCT 72 | SELECT * FROM diag_icd10 73 | UNION DISTINCT 74 | SELECT * FROM diag_icd9 75 | UNION DISTINCT 76 | SELECT * FROM oper4secondary 77 | UNION DISTINCT 78 | SELECT * FROM diag_icd10_secondary 79 | UNION DISTINCT 80 | SELECT * FROM diag_icd9_secondary 81 | ) diagnostics 82 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` p ON p.sample_id = diagnostics.eid AND p.array_idx=0 AND p.instance=0 AND p.FieldID=53 83 | GROUP BY diagnostics.eid, diagnostics.FieldID, diagnostics.code 84 | ORDER BY first_date ASC 85 | ; -------------------------------------------------------------------------------- /phenotype_labels/disease/materialized_special_dates.sql: -------------------------------------------------------------------------------- 1 | WITH dated_fields AS ( 2 | SELECT p.FieldID, p.sample_id eid, p.value code, cod.meaning, 3 | CASE 4 | WHEN SAFE.PARSE_DATE("%E4Y-%m-%d", d.value) IS NULL THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 5 | WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 6 | ELSE SAFE.PARSE_DATE("%E4Y-%m-%d", d.value) 7 | END vdate 
8 | FROM `broad-ml4cvd.ukbb7089_201904.phenotype` p 9 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0 10 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx 11 | AND ( 12 | FALSE 13 | OR (p.FieldID=42013 AND d.FieldID=42012) 14 | OR (p.FieldID=42011 AND d.FieldID=42010) 15 | OR (p.FieldID=42009 AND d.FieldID=42008) 16 | OR (p.FieldID=42007 AND d.FieldID=42006) 17 | OR (p.FieldID=42001 AND d.FieldID=42000) 18 | ) 19 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value 20 | ), 21 | dated_fields_fractional AS ( 22 | SELECT p.FieldID, p.sample_id eid, p.value code, cod.meaning, 23 | CASE 24 | WHEN SAFE.PARSE_DATE("%Y", d.value) IS NULL THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 25 | WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value) 26 | ELSE SAFE.PARSE_DATE("%Y", d.value) 27 | END vdate 28 | FROM `broad-ml4cvd.ukbb7089_201904.phenotype` p 29 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0 30 | JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx 31 | AND ( 32 | FALSE 33 | OR (p.FieldID=20004 AND d.FieldID=20010) 34 | OR (p.FieldID=20002 AND d.FieldID=20008) 35 | OR (p.FieldID=20001 AND d.FieldID=20006) 36 | ) 37 | LEFT JOIN `broad-ml4cvd.ukbb7089_201904.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value 38 | ) 39 | 40 | SELECT 41 | diagnostics.eid sample_id, diagnostics.FieldID, diagnostics.code value, MIN(vdate) first_date 42 | FROM ( 43 | SELECT * FROM dated_fields 44 | UNION DISTINCT 45 | SELECT * FROM dated_fields_fractional 46 | ) diagnostics 47 | WHERE TRUE 48 | AND vdate IS NOT NULL 49 | GROUP BY diagnostics.eid, diagnostics.FieldID, diagnostics.code 50 | ; -------------------------------------------------------------------------------- /phenotype_labels/disease/special_fields.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | var ( 4 | // These identify which FieldIDs should be special-cased: either to the 5 | // HESIN table or the table with fields that have a specially-known date. 6 | // All other fields can be queried against the main phenotype table, with an 7 | // assumed date of whatever makes the most sense for your purposes 8 | // (generally, I use the enrollment date, but you could use birthdate, etc). 
9 | 10 | MaterializedHesin = map[int]struct{}{ 11 | 41210: struct{}{}, 12 | 41202: struct{}{}, 13 | 41204: struct{}{}, 14 | 40001: struct{}{}, 15 | 40002: struct{}{}, 16 | 41200: struct{}{}, 17 | 41203: struct{}{}, 18 | 41205: struct{}{}, 19 | } 20 | 21 | MaterializedSpecial = map[int]struct{}{ 22 | 42013: struct{}{}, 23 | 42011: struct{}{}, 24 | 42009: struct{}{}, 25 | 42007: struct{}{}, 26 | 42001: struct{}{}, 27 | 20004: struct{}{}, 28 | 20002: struct{}{}, 29 | 20001: struct{}{}, 30 | } 31 | ) 32 | 33 | var ( 34 | // These can be helpful to make sure that the user is including all fields 35 | // that use the same family of codes 36 | 37 | ICD9 = map[int]struct{}{ 38 | 41203: struct{}{}, 39 | 41205: struct{}{}, 40 | } 41 | 42 | ICD10 = map[int]struct{}{ 43 | 41202: struct{}{}, 44 | 41204: struct{}{}, 45 | 40001: struct{}{}, 46 | 40002: struct{}{}, 47 | } 48 | 49 | OPCS = map[int]struct{}{ 50 | 41200: struct{}{}, 51 | 41210: struct{}{}, 52 | } 53 | ) 54 | 55 | func IsHesin(fieldID int) bool { 56 | _, exists := MaterializedHesin[fieldID] 57 | 58 | return exists 59 | } 60 | 61 | func IsSpecial(fieldID int) bool { 62 | _, exists := MaterializedSpecial[fieldID] 63 | 64 | return exists 65 | } 66 | -------------------------------------------------------------------------------- /phenotype_labels/disease/time_handling.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | const NullMarker = "NULL" 9 | 10 | func TimeToUKBDate(t time.Time) string { 11 | if t.Equal(time.Time{}) { 12 | return NullMarker 13 | } 14 | 15 | return t.Format("2006-01-02") 16 | } 17 | 18 | func TimesToFractionalYears(earlier, later time.Time) string { 19 | if later.Before(earlier) { 20 | return NullMarker 21 | } 22 | y, m, d, h, min, sec := time_diff(earlier, later) 23 | 24 | return fmt.Sprintf("%.6f", float64(y)+float64(m)/12+float64(d)/(12*30)+float64(h)/(24*365)+float64(min)/(60*24*365)+float64(sec)/(60*60*24*365)) 25 | } 26 | 27 | // Taken directly from https://stackoverflow.com/a/36531443/199475 28 | func time_diff(a, b time.Time) (year, month, day, hour, min, sec int) { 29 | if a.Location() != b.Location() { 30 | b = b.In(a.Location()) 31 | } 32 | if a.After(b) { 33 | a, b = b, a 34 | } 35 | y1, M1, d1 := a.Date() 36 | y2, M2, d2 := b.Date() 37 | 38 | h1, m1, s1 := a.Clock() 39 | h2, m2, s2 := b.Clock() 40 | 41 | year = int(y2 - y1) 42 | month = int(M2 - M1) 43 | day = int(d2 - d1) 44 | hour = int(h2 - h1) 45 | min = int(m2 - m1) 46 | sec = int(s2 - s1) 47 | 48 | // Normalize negative values 49 | if sec < 0 { 50 | sec += 60 51 | min-- 52 | } 53 | if min < 0 { 54 | min += 60 55 | hour-- 56 | } 57 | if hour < 0 { 58 | hour += 24 59 | day-- 60 | } 61 | if day < 0 { 62 | // days in month: 63 | t := time.Date(y1, M1, 32, 0, 0, 0, 0, time.UTC) 64 | day += 32 - t.Day() 65 | month-- 66 | } 67 | if month < 0 { 68 | month += 12 69 | year-- 70 | } 71 | 72 | return 73 | } 74 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/README.md: -------------------------------------------------------------------------------- 1 | # Phecodes 2 | 3 | Phecodes are a mapping of ICD (9 and 10) codes to a hierarchy of diseases. 4 | 5 | To put Phecodes on top of a UKBB dataset (ingested into bigquery according to the ml4h ingest script) you have to 6 | 1. follow the instructions in load_phecodes.sh to load the raw phecode information (icd9 -> phenotype) into the database. 7 | 2. 
run map_phecodes.py to create a local csv file with mappings of the HESIN table to phecodes (the dataset is hardcoded for now) 8 | 3. load that csv file into bigquery manually, for example with 9 | 10 | ``` 11 | gsutil cp ukbb_dev_phecode_mapping.csv.gz gs://ml4cvd/projects/pbatra/ukbb_dev/ 12 | bq load \ 13 | --replace \ 14 | --source_format=CSV \ 15 | --schema phecode_mapping.json \ 16 | ukbb_dev.phecode_mapping gs://ml4cvd/projects/pbatra/ukbb_dev/ukbb_dev_phecode_mapping.csv.gz 17 | ``` 18 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/ml4h/99d04fa473eabb419a1d8d7559545cedcbd81f0c/phenotype_labels/phecodes/__init__.py -------------------------------------------------------------------------------- /phenotype_labels/phecodes/icd10.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "icd10", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "phecode", 10 | "type": "STRING" 11 | }, 12 | { 13 | "name": "excluded_phecodes", 14 | "type": "STRING" 15 | }, 16 | { 17 | "name": "excluded_phenotypes", 18 | "type": "STRING" 19 | } 20 | ] 21 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/load_phecodes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o pipefail 4 | set -o nounset 5 | 6 | #RAW DATA LOCATOINS 7 | #beta version of ICD10 (WHO) -> phecodes mapping 8 | #https://phewascatalog.org/phecodes_icd10 9 | #There is an ICD10CM (US) version as well 10 | #Phecode definition map is here 11 | #https://phewascatalog.org/phecodes 12 | 13 | #SHARED_DATASET -- should already be created 14 | SHARED_DATA=shared_data 15 | #specific to this func 16 | __dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 17 | 18 | #Phecode CSVs (beta 2.2 release) are located in 19 | ICD10_FILE="gs://ml4h/data/phecodes/Phecode_map_v1_2_icd10_beta.csv.gz" 20 | PHECODE_DEF_FILE="gs://ml4h/data/phecodes/phecode_definitions1.2.csv.gz" 21 | 22 | #phecode definitions 23 | bq load \ 24 | --replace \ 25 | --source_format=CSV \ 26 | --skip_leading_rows 1 \ 27 | --schema ${__dir}/phecode_dictionary.json \ 28 | ${SHARED_DATA}.phecode_dictionary ${PHECODE_DEF_FILE} 29 | 30 | #icd10 31 | bq load \ 32 | --replace \ 33 | --source_format=CSV \ 34 | --skip_leading_rows 1 \ 35 | --format=prettyjson \ 36 | --schema ${__dir}/icd10.json \ 37 | ${SHARED_DATA}.phecode_icd10 ${ICD10_FILE} 38 | 39 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/phecode_dictionary.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "phecode", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "phenotype", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "REQUIRED", 14 | "name": "phecode_exclude_range", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "REQUIRED", 19 | "name": "sex", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "REQUIRED", 24 | "name": "rollup", 25 | "type": "INTEGER" 26 | }, 27 | { 28 | "mode": "REQUIRED", 29 | "name": "leaf", 30 | "type": "INTEGER" 31 | }, 32 | { 33 | "mode": "REQUIRED", 34 | "name": "category_number", 35 | "type": 
"INTEGER" 36 | }, 37 | { 38 | "mode": "REQUIRED", 39 | "name": "category", 40 | "type": "STRING" 41 | } 42 | 43 | 44 | ] 45 | -------------------------------------------------------------------------------- /phenotype_labels/phecodes/phecode_mapping.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "REQUIRED", 4 | "name": "eid", 5 | "type": "INTEGER" 6 | }, 7 | { 8 | "mode": "REQUIRED", 9 | "name": "record_id", 10 | "type": "INTEGER" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "admidate", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "diag_icd10", 20 | "type": "STRING" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "phecode", 25 | "type": "STRING" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /scripts/create_dev_dataset.py: -------------------------------------------------------------------------------- 1 | """ Creates a ukbb_dev dataset which is a miniature version of a typical ukbbs. 2 | 3 | NOTE: dataset names, tables to be copied are currently hardcoded, 4 | will fail if dev dataset already 5 | exists. 6 | """ 7 | from google.cloud import bigquery 8 | 9 | # CONSTANTS 10 | 11 | ORIGINAL_DATASET = 'ukbb7089_201904' 12 | DEV_DATASET = 'ukbb_dev' 13 | EXACT_TABLES = ['censor', 'coding', 'dictionary'] 14 | LIMITED_SAMPLE_TABLES = ['hesin', 'hesin_diag10', 'hesin_diag9', 'hesin_oper'] 15 | client = bigquery.Client() # should already be set to default project 16 | 17 | 18 | if __name__ == '__main__': 19 | PROJECT = client.project 20 | FULL_DEV_DATASET = f"{PROJECT}.{DEV_DATASET}" 21 | print(FULL_DEV_DATASET) 22 | dataset = bigquery.Dataset.from_string(FULL_DEV_DATASET) 23 | dataset = client.create_dataset(dataset) 24 | print('Dataset {} created.'.format(dataset.dataset_id)) 25 | 26 | print(f"working on {DEV_DATASET}.phenotype") 27 | # create 1/1000th size dataset 28 | query_job = client.query(f""" 29 | CREATE TABLE {DEV_DATASET}.phenotype 30 | AS SELECT * FROM {ORIGINAL_DATASET}.phenotype 31 | WHERE MOD(sample_id,1000)=4""") 32 | 33 | rows = query_job.result() 34 | 35 | # copy some tables exactly 36 | for table in EXACT_TABLES: 37 | print(f"working on {DEV_DATASET}.{table}") 38 | query_job = client.query(f""" 39 | CREATE TABLE {DEV_DATASET}.{table} 40 | AS SELECT * FROM {ORIGINAL_DATASET}.{table} 41 | """) 42 | rows = query_job.result() 43 | 44 | # copy hesin tables by limiting eids within sample_ids 45 | for table in LIMITED_SAMPLE_TABLES: 46 | print(f"working on {DEV_DATASET}.{table}") 47 | query_job = client.query(f""" 48 | CREATE TABLE {DEV_DATASET}.{table} 49 | AS SELECT * FROM {ORIGINAL_DATASET}.{table} 50 | where eid in (select sample_id from 51 | {DEV_DATASET}.phenotype) 52 | """) 53 | rows = query_job.result() 54 | -------------------------------------------------------------------------------- /scripts/detach_disk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | DISK=${1:-data} 3 | shift 1 4 | VMS=$(gcloud compute instances list | awk '{print $1}') 5 | ZONE=us-central1-a 6 | for VM in $VMS; 7 | do gcloud compute instances detach-disk $VM --zone $ZONE --disk=$DISK ; 8 | done -------------------------------------------------------------------------------- /scripts/train_subsets.sh: -------------------------------------------------------------------------------- 1 | ECHO= 2 | MODEL_FILES= 3 | TENSORS="/mnt/disks/annotated-cardiac-tensors-45k-2021-03-25/2020-09-21/" 4 
| TENSOR_MAPS="ecg.ecg_rest_median_raw_10 mri.lax_4ch_heart_center " 5 | #array=( "drop_fuse_unsupervised_train_64.csv" "drop_fuse_unsupervised_train_128.csv" "drop_fuse_unsupervised_train_256.csv" "drop_fuse_unsupervised_train_512.csv" "drop_fuse_unsupervised_train_1024.csv" "drop_fuse_unsupervised_train_2048.csv" "drop_fuse_unsupervised_train_4096.csv") 6 | array=("drop_fuse_unsupervised_train_8192.csv" "drop_fuse_unsupervised_train_16384.csv") 7 | 8 | for i in "${array[@]}" 9 | do 10 | $ECHO ./scripts/tf.sh /home/sam/ml4h/ml4h/recipes.py --mode train_block \ 11 | --tensors "$TENSORS" --input_tensors "$TENSOR_MAPS" --output_tensors "$TENSOR_MAPS" \ 12 | --encoder_blocks /home/sam/trained_models/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/encoder_ecg_rest_median_raw_10.h5 \ 13 | /home/sam/trained_models/hypertuned_64m_18e_lax_4ch_heart_center_autoencoder_256d/encoder_lax_4ch_heart_center.h5 \ 14 | --merge_blocks pair \ 15 | --decoder_blocks /home/sam/trained_models/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/decoder_ecg_rest_median_raw_10.h5 \ 16 | /home/sam/trained_models/hypertuned_64m_18e_lax_4ch_heart_center_autoencoder_256d/decoder_lax_4ch_heart_center.h5 \ 17 | --pairs "$TENSOR_MAPS" --pair_loss contrastive --pair_loss_weight 0.1 --pair_merge dropout \ 18 | --batch_size 4 --epochs 316 --training_steps 128 --validation_steps 32 --test_steps 1 \ 19 | --num_workers 4 --patience 16 --tensormap_prefix ml4h.tensormap.ukb \ 20 | --id "drop_fuse_early_stop_v3_${i%.*}" --output_folder /home/sam/trained_models/ \ 21 | --inspect_model --activation mish --dense_layers 256 \ 22 | --train_csv "/home/sam/csvs/${i}" \ 23 | --valid_csv /home/sam/csvs/drop_fuse_unsupervised_valid.csv \ 24 | --test_csv /home/sam/csvs/sample_id_returned_lv_mass.csv 25 | 26 | 27 | $ECHO ./scripts/tf.sh /home/sam/ml4h/ml4h/recipes.py --mode infer_encoders \ 28 | --tensors "$TENSORS" --input_tensors "$TENSOR_MAPS" --output_tensors "$TENSOR_MAPS" \ 29 | --model_file "/home/sam/trained_models/drop_fuse_early_stop_v3_${i%.*}/drop_fuse_early_stop_v3_${i%.*}.h5" \ 30 | --id "drop_fuse_early_stop_v3_${i%.*}" --output_folder /home/sam/trained_models/ \ 31 | --sample_csv /home/sam/csvs/sample_id_returned_lv_mass.csv \ 32 | --tensormap_prefix ml4h.tensormap.ukb \ 33 | --dense_layers 256 34 | done 35 | -------------------------------------------------------------------------------- /scripts/validate_tensors.sh: -------------------------------------------------------------------------------- 1 | # use this script to validate the tensors created by the tensorize.sh script 2 | # expects two positional arguments: directory containing the tensors and the number of threads to use 3 | # example: ./validate_tensors.sh /mnt/disks/tensors/ 20 | tee completed_tensors.txt 4 | # the output will be in the following form: 5 | # OK - /mnt/disks/tensors/ukb1234.hd5 6 | # BAD - /mnt/disks/tensors/ukb5678.hd5 7 | 8 | 9 | INPUT_TENSORS_DIR=$1 10 | NUMBER_OF_THREADS=$2 11 | 12 | 13 | find ${INPUT_TENSORS_DIR}/*.hd5 | \ 14 | xargs -P ${NUMBER_OF_THREADS} -I {} \ 15 | bash -c "h5dump -n {} | (grep -q 'HDF5 \"{}\"' && echo 'OK - {}' || echo 'BAD - {}')" -------------------------------------------------------------------------------- /scripts/vm_image/ml4cvd-image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # server-conf-scripts are for configuration of a *fresh* VM and should not be 4 | # treated as startup scripts. (They are not idempotent.) 
5 | 6 | GCP_BUCKET="ml4h-core" 7 | 8 | # We assume we are running as a regular user, not root. 9 | 10 | # Enable gcsfuse to allow mounting of the google storage bucket as if it were a drive 11 | export GCSFUSE_REPO=gcsfuse-`lsb_release -c -s` 12 | echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list 13 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - 14 | 15 | # Install frequently-used packages 16 | # First, update apt since we have added new repos (above) 17 | sudo apt-get update 18 | 19 | sudo apt install -y r-base r-base-core unzip wget bzip2 python sqlite3 gcsfuse 20 | 21 | # Make gcsfuse auto-mount to /mnt/${GCP_BUCKET} in the future. Modify fstab to 22 | # do this automatically. Via 23 | # https://github.com/GoogleCloudPlatform/gcsfuse/blob/master/docs/mounting.md 24 | # and https://serverfault.com/a/830726/118452 to enable easier mount with read and 25 | # write access by non-root users. 26 | echo "${GCP_BUCKET} /mnt/${GCP_BUCKET} gcsfuse rw,allow_other,implicit_dirs,default_permissions,file_mode=777,dir_mode=777" | sudo tee -a /etc/fstab 27 | echo "fc-9a7c5487-04c9-4182-b3ec-13de7f6b409b /mnt/imputed_v2 gcsfuse ro,allow_other,implicit_dirs,default_permissions,file_mode=777,dir_mode=777" | sudo tee -a /etc/fstab 28 | echo "fc-7d5088b4-7673-45b5-95c2-17ae00a04183 /mnt/imputed_v3 gcsfuse ro,allow_other,implicit_dirs,default_permissions,file_mode=777,dir_mode=777" | sudo tee -a /etc/fstab 29 | 30 | 31 | # Enable docker (assumes Ubuntu, of any supported version) 32 | # See https://docs.docker.com/install/linux/docker-ce/ubuntu/#set-up-the-repository 33 | sudo apt-get remove docker docker-engine docker.io containerd runc 34 | sudo apt-get install -y \ 35 | ca-certificates \ 36 | curl \ 37 | gnupg \ 38 | lsb-release 39 | 40 | sudo mkdir -p /etc/apt/keyrings 41 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg 42 | echo \ 43 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ 44 | $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 45 | 46 | sudo apt-get update 47 | sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin 48 | sudo service docker start 49 | 50 | sudo systemctl enable docker 51 | sudo groupadd -f docker 52 | 53 | # Manually install gcr 54 | # Via https://cloud.google.com/container-registry/docs/advanced-authentication#standalone_docker_credential_helper 55 | VERSION=1.5.0 56 | OS=linux 57 | ARCH=amd64 58 | curl -fsSL "https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v${VERSION}/docker-credential-gcr_${OS}_${ARCH}-${VERSION}.tar.gz" \ 59 | | tar xz --to-stdout ./docker-credential-gcr | sudo tee -a /usr/bin/docker-credential-gcr 1>/dev/null && sudo chmod +x /usr/bin/docker-credential-gcr 60 | docker-credential-gcr configure-docker 61 | 62 | sudo apt-get install -y python-setuptools 63 | 64 | 65 | # 66 | # Do last 67 | # 68 | 69 | # Cleanup apt cache 70 | sudo apt autoremove -y 71 | -------------------------------------------------------------------------------- /scripts/vm_launch/launch_dl_instance.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME=${1:-sam-p4} 4 | shift 1 5 | INSTANCE_TYPE=${1:-n1-standard-4} 6 | shift 1 7 | DISK_SIZE=${1:-100GB} 8 | shift 1 9 | ACCEL=${1:-nvidia-tesla-t4} 
10 | shift 1 11 | 12 | echo "Creating GPU instance ${NAME} from family dl-image of type ${INSTANCE_TYPE} with GPU ${ACCEL}..." 13 | 14 | echo "$@" 15 | 16 | gcloud compute instances create ${NAME} \ 17 | --project broad-ml4cvd \ 18 | --zone us-central1-a \ 19 | --image-project broad-ml4cvd \ 20 | --image-family dl-image \ 21 | --accelerator=type=${ACCEL},count=1 \ 22 | --maintenance-policy=TERMINATE \ 23 | --boot-disk-type=pd-standard \ 24 | --boot-disk-size=${DISK_SIZE} \ 25 | --service-account 783282864357-compute@developer.gserviceaccount.com \ 26 | --scopes https://www.googleapis.com/auth/cloud-platform \ 27 | --machine-type ${INSTANCE_TYPE} \ 28 | --metadata startup-script-url=gs://ml4cvd/projects/jamesp/home/startup.sh \ 29 | "$@" 30 | 31 | # Previously used the base ubuntu: 32 | # --image-project ubuntu-os-cloud \ 33 | # --image-family ubuntu-1804-lts \ 34 | 35 | # You can choose whatever size you like for the boot disk: 36 | # --boot-disk-size 300GB 37 | -------------------------------------------------------------------------------- /scripts/vm_launch/launch_instance.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | NAME=${1:-jpp-1} 4 | shift 1 5 | INSTANCE_TYPE=${1:-n1-standard-1} 6 | shift 1 7 | DISK_SIZE=${1:-100GB} 8 | shift 1 9 | 10 | echo "Creating instance ${NAME} from family ml4cvd-image of type ${INSTANCE_TYPE}..." 11 | 12 | echo "$@" 13 | 14 | gcloud compute instances create ${NAME} \ 15 | --project broad-ml4cvd \ 16 | --zone us-central1-a \ 17 | --image-project broad-ml4cvd \ 18 | --image-family ml4cvd-image \ 19 | --boot-disk-type=pd-standard \ 20 | --boot-disk-size=${DISK_SIZE} \ 21 | --service-account 783282864357-compute@developer.gserviceaccount.com \ 22 | --scopes https://www.googleapis.com/auth/cloud-platform \ 23 | --machine-type ${INSTANCE_TYPE} \ 24 | --metadata startup-script-url=gs://ml4cvd/projects/jamesp/home/startup.sh \ 25 | "$@" 26 | 27 | # Previously used the base ubuntu: 28 | # --image-project ubuntu-os-cloud \ 29 | # --image-family ubuntu-1804-lts \ 30 | 31 | # You can choose whatever size you like for the boot disk: 32 | # --boot-disk-size 300GB 33 | -------------------------------------------------------------------------------- /scripts/vm_launch/run_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Source this once 4 | # Exit and reconnect after adding any groups 5 | 6 | # Allow this user to run docker images without sudo 7 | sudo usermod -aG docker $(whoami) 8 | 9 | # Use the docker-credential-gcr that we installed on bootup 10 | docker-credential-gcr configure-docker 11 | 12 | # Install pre-commit 13 | sudo apt install python3-pip 14 | pip3 install pre-commit 15 | -------------------------------------------------------------------------------- /scripts/vm_start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | VM=${1:-sam-gpu2} 3 | shift 1 4 | ZONE=us-central1-a 5 | MAX_TRIES=1000 6 | COUNTER=0 7 | while [[ $COUNTER -lt $(( $MAX_TRIES )) ]]; do 8 | sleep 1s 9 | gcloud compute instances start $VM --zone $ZONE 10 | if [[ $? -eq 0 ]] 11 | then 12 | echo "Potentially started vm: ${VM} after ${COUNTER} attempts." 13 | gcloud compute ssh $VM --zone $ZONE 14 | if [[ $? -eq 0 ]] 15 | then 16 | break 17 | else 18 | let COUNTER=COUNTER+1 19 | echo "Actually, no. Could not start vm: ${VM}, unsuccessful attempt: ${COUNTER}." 
20 | sleep 1s 21 | fi 22 | else 23 | let COUNTER=COUNTER+1 24 | sleep 1s 25 | echo "Could not start vm: ${VM}, unsuccessful attempt: ${COUNTER}." 26 | sleep 1s 27 | fi 28 | done 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup, find_packages 3 | 4 | here = pathlib.Path(__file__).parent.resolve() 5 | # Get the requirements from the requirements file 6 | requirements = (here / 'docker/vm_boot_images/config/tensorflow-requirements.txt').read_text(encoding='utf-8') 7 | long_description = (here / 'README.md').read_text(encoding='utf-8') 8 | 9 | 10 | setup( 11 | name='ml4h', 12 | version='0.1.0', 13 | description='Machine Learning for Health python package', 14 | long_description=long_description, # Optional 15 | long_description_content_type='text/markdown', 16 | url='https://github.com/broadinstitute/ml4h', 17 | python_requires='>=3.6', 18 | install_requires=requirements, 19 | packages=find_packages(), 20 | ) 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | 4 | from ml4h.arguments import parse_args 5 | from ml4h.test_utils import TMAPS as MOCK_TMAPS 6 | from ml4h.test_utils import build_hdf5s 7 | 8 | 9 | def pytest_configure(config): 10 | pytest.N_TENSORS = 100 11 | config.addinivalue_line("markers", "slow: mark tests as slow") 12 | 13 | 14 | @pytest.fixture(scope='class') 15 | def default_arguments(tmpdir_factory): 16 | temp_dir = tmpdir_factory.mktemp('data') 17 | build_hdf5s(temp_dir, MOCK_TMAPS.values(), n=pytest.N_TENSORS) 18 | hdf5_dir = str(temp_dir) 19 | inp_key = '3d_cont' 20 | out_key = '1d_cat' 21 | sys.argv = [ 22 | '', 23 | '--output_folder', hdf5_dir, 24 | '--input_tensors', inp_key, 25 | '--output_tensors', out_key, 26 | '--tensors', hdf5_dir, 27 | '--pool_x', '1', 28 | '--pool_y', '1', 29 | '--pool_z', '1', 30 | '--training_steps', '2', 31 | '--test_steps', '10', 32 | '--test_ratio', '0.6', 33 | '--validation_steps', '2', 34 | '--valid_ratio', '0.2', 35 | '--epochs', '2', 36 | '--num_workers', '0', 37 | '--batch_size', '4', 38 | '--gcs_cloud_bucket','ml4h-core/anamika/gcs-test4/', 39 | 40 | ] 41 | args = parse_args() 42 | return args 43 | -------------------------------------------------------------------------------- /tests/test_arguments.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | from ml4h.arguments import parse_args 4 | from ml4h.test_utils import TMAPS as MOCK_TMAPS 5 | 6 | 7 | class TestUConnect: 8 | 9 | def test_no_u(self, tmpdir): 10 | sys.argv = [ 11 | 'train', 12 | '--output_folder', str(tmpdir), 13 | ] 14 | args = parse_args() 15 | assert len(args.u_connect) == 0 16 | 17 | def test_simple_u(self, tmpdir): 18 | inp_key = '3d_cont' 19 | sys.argv = [ 20 | 'train', 21 | '--output_folder', str(tmpdir), 22 | '--input_tensors', inp_key, 23 | '--output_tensors', inp_key, 24 | '--u_connect', inp_key, inp_key, 25 | ] 26 | args = parse_args() 27 | assert len(args.u_connect) == 1 28 | inp, out = list(args.u_connect.items())[0] 29 | tmap = MOCK_TMAPS[inp_key] 30 | assert inp == tmap 31 | assert out == {tmap} 32 | 33 | def test_many_to_one(self, tmpdir): 34 | inp_key1 = '3d_cont' 35 | inp_key2 = '3d_cat' 36 | sys.argv = [ 37 | 'train', 
38 | '--output_folder', str(tmpdir), 39 | '--input_tensors', inp_key1, inp_key2, 40 | '--output_tensors', inp_key1, 41 | '--u_connect', inp_key1, inp_key1, 42 | '--u_connect', inp_key2, inp_key1, 43 | ] 44 | args = parse_args() 45 | assert len(args.u_connect) == 2 46 | assert args.u_connect[MOCK_TMAPS[inp_key1]] == {MOCK_TMAPS[inp_key1]} 47 | assert args.u_connect[MOCK_TMAPS[inp_key2]] == {MOCK_TMAPS[inp_key1]} 48 | 49 | def test_one_to_many(self, tmpdir): 50 | key1 = '3d_cont' 51 | key2 = '3d_cat' 52 | sys.argv = [ 53 | 'train', 54 | '--output_folder', str(tmpdir), 55 | '--input_tensors', key1, key2, 56 | '--output_tensors', key1, key2, 57 | '--u_connect', key1, key1, 58 | '--u_connect', key1, key2, 59 | ] 60 | args = parse_args() 61 | assert len(args.u_connect) == 1 62 | assert args.u_connect[MOCK_TMAPS[key1]] == {MOCK_TMAPS[key1], MOCK_TMAPS[key2]} 63 | 64 | def test_multi_u(self, tmpdir): 65 | key1 = '3d_cont' 66 | key2 = '3d_cat' 67 | sys.argv = [ 68 | 'train', 69 | '--output_folder', str(tmpdir), 70 | '--input_tensors', key1, key2, 71 | '--output_tensors', key1, key2, 72 | '--u_connect', key1, key1, 73 | '--u_connect', key2, key2, 74 | ] 75 | args = parse_args() 76 | assert len(args.u_connect) == 2 77 | assert args.u_connect[MOCK_TMAPS[key1]] == {MOCK_TMAPS[key1]} 78 | assert args.u_connect[MOCK_TMAPS[key2]] == {MOCK_TMAPS[key2]} 79 | --------------------------------------------------------------------------------