├── .DS_Store ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── clinical └── preprocess-clinical-data.ipynb ├── genomic └── preprocess-genomic-data.ipynb ├── images ├── .DS_Store └── architecture.png ├── imaging ├── preprocess-imaging-data.ipynb └── src │ ├── Dockerfile │ ├── dcm2nifti_processing.py │ ├── nsclc-radiogenomics-imaging-workflow.json │ ├── radiomics_utils.py │ └── requirements.txt └── model-train-test └── train-test-model.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/machine-learning-pipelines-for-multimodal-health-data/710eb405d2511394eefedbb07c5dd07d6ca96e48/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. 
If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. Our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), so looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Analyzing Multimodal Health Data on AWS 2 | 3 | ## Introduction 4 | This repository contains code samples related to the two-part blog series [Building Scalable Machine Learning Pipelines for Multimodal Health Data on AWS](https://aws.amazon.com/blogs/industries/building-scalable-machine-learning-pipelines-for-multimodal-health-data-on-aws/) and [Training Machine Learning Models on Multimodal Health Data with Amazon SageMaker](https://aws.amazon.com/blogs/industries/training-machine-learning-models-on-multimodal-health-data-with-amazon-sagemaker/). 5 | 6 | ## Machine Learning Pipelines for Multimodal Health Data 7 | 8 | You can use these artifacts to recreate the pipelines and analysis presented in the blog posts, as shown below. 9 | 10 | ![Architecture on AWS](./images/architecture.png) 11 | 12 | ## Project Structure 13 | 14 | Artifacts for processing each data modality are located in corresponding subdirectories of this repo. 15 | 16 | ``` 17 | ./ 18 | ./genomic/ <-- Artifacts for genomics pipeline 19 | ./clinical/ <-- Artifacts for clinical pipeline * 20 | ./imaging/ <-- Artifacts for medical imaging pipeline 21 | ./model-train-test/ <-- Artifacts for performing model training and testing 22 | ``` 23 | 24 | \* The clinical data can also be preprocessed with Amazon SageMaker Data Wrangler, as discussed in the blog. 25 | 26 | ## Security 27 | 28 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 29 | 30 | ## License 31 | 32 | This library is licensed under the MIT-0 License. See the LICENSE file. 33 | -------------------------------------------------------------------------------- /clinical/preprocess-clinical-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 259, 6 | "id": "95344a61", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "## This script is used to read and preprocess clinical data (in tabular format) from S3 and store features in SageMaker FeatureStore" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 260, 16 | "id": "efa5b476", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import boto3\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import io, os\n", 25 | "from time import gmtime, strftime, sleep\n", 26 | "import time\n", 27 | "import sagemaker\n", 28 | "from sagemaker.session import Session\n", 29 | "from sagemaker import get_execution_role\n", 30 | "from sagemaker.feature_store.feature_group import FeatureGroup" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "d3e657f8", 36 | "metadata": {}, 37 | "source": [ 38 | "## Set up SageMaker FeatureStore" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 261, 44 | "id": "da137d82", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "region = boto3.Session().region_name\n", 49 | "\n", 50 | "boto_session = boto3.Session(region_name=region)\n", 51 | "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", 52 | "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", 53 | "\n", 54 | "feature_store_session = Session(\n", 55 | "
boto_session=boto_session,\n", 56 | " sagemaker_client=sagemaker_client,\n", 57 | " sagemaker_featurestore_runtime_client=featurestore_runtime\n", 58 | ")\n", 59 | "\n", 60 | "role = get_execution_role()\n", 61 | "s3_client = boto3.client('s3', region_name=region)\n", 62 | "\n", 63 | "default_s3_bucket_name = feature_store_session.default_bucket()\n", 64 | "prefix = 'sagemaker-featurestore-demo'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "78ef8a21", 70 | "metadata": {}, 71 | "source": [ 72 | "## Get data from S3" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 262, 78 | "id": "04fa869d", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Get data from S3 \n", 83 | "bucket_clin = 'nsclc-clinical-genomic-data'\n", 84 | "#bucket_clin = \n", 85 | "\n", 86 | "# Clinical data \n", 87 | "#data_key_clin = 'Clinical-data-119patients.csv'\n", 88 | "data_key_clin = 'NSCLCR01Radiogenomic_DATA_LABELS_2018-05-22_1500-shifted.csv'\n", 89 | "#data_key_clin = \n", 90 | "\n", 91 | "data_location_clin = 's3://{}/{}'.format(bucket_clin, data_key_clin)\n", 92 | "data_clinical = pd.read_csv(data_location_clin)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "e80ed14b", 98 | "metadata": {}, 99 | "source": [ 100 | "## Preprocess Data" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 264, 106 | "id": "d6131af6", 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "(147, 89)" 113 | ] 114 | }, 115 | "execution_count": 264, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "# Keep samples starting with \"R01-*\" as these IDs have corresponding medical imaging data. Delete samples with Case IDs \"AMC-*\". 
\n", 122 | "data_clinical = data_clinical[~data_clinical[\"Case ID\"].str.contains(\"AMC\")]\n", 123 | "\n", 124 | "# Delete columns with ID and dates\n", 125 | "list_delete_cols = ['Quit Smoking Year', 'Date of Recurrence', 'Date of Last Known Alive', 'Date of Death', 'CT Date', 'PET Date']\n", 126 | "data_clinical.drop(list_delete_cols, axis=1, inplace=True)\n", 127 | "\n", 128 | "# List of features with catergorical value\n", 129 | "list_encode_cols = [\"Patient affiliation\", \"Gender\", \"Ethnicity\", \"Smoking status\", \"%GG\", \"Tumor Location (choice=RUL)\", \"Tumor Location (choice=RML)\", \"Tumor Location (choice=RLL)\", \"Tumor Location (choice=LUL)\", \"Tumor Location (choice=LLL)\", \"Tumor Location (choice=L Lingula)\", \"Tumor Location (choice=Unknown)\", \"Histology \", \"Pathological T stage\", \"Pathological N stage\", \"Pathological M stage\", \"Histopathological Grade\", \"Lymphovascular invasion\", \"Pleural invasion (elastic, visceral, or parietal)\", \"EGFR mutation status\", \"KRAS mutation status\", \"ALK translocation status\", \"Adjuvant Treatment\", \"Chemotherapy\", \"Radiation\", \"Recurrence\", \"Recurrence Location\"]\n", 130 | "\n", 131 | "# List of features with numeric value\n", 132 | "list_nonenc_cols = [\"Case ID\", \"Age at Histological Diagnosis\", \"Weight (lbs)\", \"Pack Years\", \"Time to Death (days)\", \"Days between CT and surgery\", \"Survival Status\"]\n", 133 | "\n", 134 | "# One-hot encoding of features with categorical value\n", 135 | "data_clinical_enc = pd.get_dummies(data_clinical[list_encode_cols])\n", 136 | "\n", 137 | "data_clinical_nonenc = data_clinical[list_nonenc_cols]\n", 138 | "\n", 139 | "# Combine all features\n", 140 | "data_clin = pd.concat([data_clinical_enc, data_clinical_nonenc], axis=1)\n", 141 | "\n", 142 | "# Feature names inside FeatureStore should not have special chars and should be < 64 chars long\n", 143 | "# Update feature names accordingly\n", 144 | "\n", 145 | "l_char = ['-',' ','%','/','<','>','(',')','=',',',':']\n", 146 | "\n", 147 | "for col in (data_clin.columns):\n", 148 | "\n", 149 | " if (col == \"Case ID\"):\n", 150 | " data_clin.rename(columns={col: col.replace(' ','_')}, inplace = True)\n", 151 | " continue\n", 152 | "\n", 153 | " for char in l_char:\n", 154 | " if char in col:\n", 155 | " data_clin.rename(columns={col: col.replace(char,'')}, inplace = True)\n", 156 | " col = col.replace(char,'')\n", 157 | " \n", 158 | " if (len(col)>=64):\n", 159 | " data_clin.rename(columns={col: col[:60]}, inplace = True)\n", 160 | " \n", 161 | "# Change label (survival status) \"Dead\"=1 and \"Alive\"=0 \n", 162 | "data_clin[\"SurvivalStatus\"].replace({\"Dead\": \"1\", \"Alive\": \"0\"}, inplace=True)\n", 163 | "\n", 164 | "\n", 165 | "# Drop samples with missing values. \n", 166 | "# Fill NaN with 0. For eg. PackYears for non-smokers is \"NA\". 
Change it to 0.\n", 167 | "data_clin = data_clin[data_clin['Weightlbs'] != \"Not Collected\"]\n", 168 | "data_clin = data_clin[data_clin['PackYears'] != \"Not Collected\"]\n", 169 | "data_clin.fillna(0)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "id": "33d68a8a", 175 | "metadata": {}, 176 | "source": [ 177 | "## Ingest data into FeatureStore" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 266, 183 | "id": "a03327c5", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "Patientaffiliation_Stanford\n", 191 | "Patientaffiliation_VA\n", 192 | "Gender_Female\n", 193 | "Gender_Male\n", 194 | "Ethnicity_AfricanAmerican\n", 195 | "Ethnicity_Asian\n", 196 | "Ethnicity_Caucasian\n", 197 | "Ethnicity_HispanicLatino\n", 198 | "Ethnicity_NativeHawaiianPacificIslander\n", 199 | "Smokingstatus_Current\n", 200 | "Smokingstatus_Former\n", 201 | "Smokingstatus_Nonsmoker\n", 202 | "GG_0\n", 203 | "GG_100\n", 204 | "GG_2550\n", 205 | "GG_5075\n", 206 | "GG_75100\n", 207 | "GG_025\n", 208 | "GG_NotAssessed\n", 209 | "TumorLocationchoiceRUL_Checked\n", 210 | "TumorLocationchoiceRUL_Unchecked\n", 211 | "TumorLocationchoiceRML_Checked\n", 212 | "TumorLocationchoiceRML_Unchecked\n", 213 | "TumorLocationchoiceRLL_Checked\n", 214 | "TumorLocationchoiceRLL_Unchecked\n", 215 | "TumorLocationchoiceLUL_Checked\n", 216 | "TumorLocationchoiceLUL_Unchecked\n", 217 | "TumorLocationchoiceLLL_Checked\n", 218 | "TumorLocationchoiceLLL_Unchecked\n", 219 | "TumorLocationchoiceLLingula_Checked\n", 220 | "TumorLocationchoiceLLingula_Unchecked\n", 221 | "TumorLocationchoiceUnknown_Unchecked\n", 222 | "Histology_Adenocarcinoma\n", 223 | "Histology_NSCLCNOSnototherwisespecified\n", 224 | "Histology_Squamouscellcarcinoma\n", 225 | "PathologicalTstage_T1a\n", 226 | "PathologicalTstage_T1b\n", 227 | "PathologicalTstage_T2a\n", 228 | "PathologicalTstage_T2b\n", 229 | "PathologicalTstage_T3\n", 230 | "PathologicalTstage_T4\n", 231 | "PathologicalTstage_Tis\n", 232 | "PathologicalNstage_N0\n", 233 | "PathologicalNstage_N1\n", 234 | "PathologicalNstage_N2\n", 235 | "PathologicalMstage_M0\n", 236 | "PathologicalMstage_M1a\n", 237 | "PathologicalMstage_M1b\n", 238 | "HistopathologicalGrade_G1Welldifferentiated\n", 239 | "HistopathologicalGrade_G2Moderatelydifferentiated\n", 240 | "HistopathologicalGrade_G3Poorlydifferentiated\n", 241 | "HistopathologicalGrade_OtherTypeIWelltomoderatelydifferentiated\n", 242 | "HistopathologicalGrade_OtherTypeIIModeratelytopoorlydifferen\n", 243 | "Lymphovascularinvasion_Absent\n", 244 | "Lymphovascularinvasion_NotCollected\n", 245 | "Lymphovascularinvasion_Present\n", 246 | "Pleuralinvasionelasticvisceralorparietal_No\n", 247 | "Pleuralinvasionelasticvisceralorparietal_Notcollected\n", 248 | "Pleuralinvasionelasticvisceralorparietal_Yes\n", 249 | "EGFRmutationstatus_Mutant\n", 250 | "EGFRmutationstatus_Notcollected\n", 251 | "EGFRmutationstatus_Unknown\n", 252 | "EGFRmutationstatus_Wildtype\n", 253 | "KRASmutationstatus_Mutant\n", 254 | "KRASmutationstatus_Notcollected\n", 255 | "KRASmutationstatus_Unknown\n", 256 | "KRASmutationstatus_Wildtype\n", 257 | "ALKtranslocationstatus_Notcollected\n", 258 | "ALKtranslocationstatus_Translocated\n", 259 | "ALKtranslocationstatus_Unknown\n", 260 | "ALKtranslocationstatus_Wildtype\n", 261 | "AdjuvantTreatment_No\n", 262 | "AdjuvantTreatment_Yes\n", 263 | "Chemotherapy_No\n", 264 | "Chemotherapy_Yes\n", 265 | "Radiation_No\n", 266 | 
"Radiation_Yes\n", 267 | "Recurrence_no\n", 268 | "Recurrence_yes\n", 269 | "RecurrenceLocation_distant\n", 270 | "RecurrenceLocation_local\n", 271 | "RecurrenceLocation_regional\n", 272 | "Case_ID\n", 273 | "AgeatHistologicalDiagnosis\n", 274 | "Weightlbs\n", 275 | "PackYears\n", 276 | "TimetoDeathdays\n", 277 | "DaysbetweenCTandsurgery\n", 278 | "SurvivalStatus\n", 279 | "Waiting for Feature Group Creation\n", 280 | "Waiting for Feature Group Creation\n", 281 | "FeatureGroup clinical-feature-group-29-00-11-19 successfully created.\n" 282 | ] 283 | }, 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "IngestionManagerPandas(feature_group_name='clinical-feature-group-29-00-11-19', sagemaker_fs_runtime_client_config=, max_workers=3, max_processes=1, _async_result=, _processing_pool=, _failed_indices=[])" 288 | ] 289 | }, 290 | "execution_count": 266, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "clinical_feature_group_name = 'clinical-feature-group-' + strftime('%d-%H-%M-%S', gmtime())\n", 297 | "clinical_feature_group = FeatureGroup(name=clinical_feature_group_name, sagemaker_session=feature_store_session)\n", 298 | "\n", 299 | "current_time_sec = int(round(time.time()))\n", 300 | "\n", 301 | "def cast_object_to_string(data_frame):\n", 302 | " for label in data_frame.columns:\n", 303 | " print (label)\n", 304 | " if data_frame.dtypes[label] == 'object':\n", 305 | " data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")\n", 306 | "\n", 307 | "# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.\n", 308 | "cast_object_to_string(data_clin)\n", 309 | "\n", 310 | "# Record identifier and event time feature names\n", 311 | "record_identifier_feature_name = \"Case_ID\"\n", 312 | "event_time_feature_name = \"EventTime\"\n", 313 | "\n", 314 | "# Append EventTime feature\n", 315 | "data_clin[event_time_feature_name] = pd.Series([current_time_sec]*len(data_clin), dtype=\"float64\")\n", 316 | "\n", 317 | "## If event time generates NaN\n", 318 | "data_clin[event_time_feature_name] = data_clin[event_time_feature_name].fillna(0)\n", 319 | "\n", 320 | "# Load feature definitions to the feature group. 
SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.\n", 321 | "clinical_feature_group.load_feature_definitions(data_frame=data_clin); # output is suppressed\n", 322 | "\n", 323 | "\n", 324 | "def wait_for_feature_group_creation_complete(feature_group):\n", 325 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 326 | " while status == \"Creating\":\n", 327 | " print(\"Waiting for Feature Group Creation\")\n", 328 | " time.sleep(5)\n", 329 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 330 | " if status != \"Created\":\n", 331 | " raise RuntimeError(f\"Failed to create feature group {feature_group.name}\")\n", 332 | " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", 333 | "\n", 334 | "clinical_feature_group.create(\n", 335 | " s3_uri=f\"s3://{default_s3_bucket_name}/{prefix}\",\n", 336 | " record_identifier_name=record_identifier_feature_name,\n", 337 | " event_time_feature_name=event_time_feature_name,\n", 338 | " role_arn=role,\n", 339 | " enable_online_store=True\n", 340 | ")\n", 341 | "\n", 342 | "wait_for_feature_group_creation_complete(feature_group=clinical_feature_group)\n", 343 | "\n", 344 | "clinical_feature_group.ingest(\n", 345 | " data_frame=data_clin, max_workers=3, wait=True\n", 346 | ")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "2ac04c7a", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "conda_python3", 361 | "language": "python", 362 | "name": "conda_python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.6.13" 375 | } 376 | }, 377 | "nbformat": 4, 378 | "nbformat_minor": 5 379 | } 380 | -------------------------------------------------------------------------------- /genomic/preprocess-genomic-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "possible-regular", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "## This script is used to read genomic data (in tabular format) from S3 and store features in SageMaker FeatureStore" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 7, 16 | "id": "nasty-consensus", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import boto3\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import io, os\n", 25 | "from time import gmtime, strftime, sleep\n", 26 | "import time\n", 27 | "import sagemaker\n", 28 | "from sagemaker.session import Session\n", 29 | "from sagemaker import get_execution_role\n", 30 | "from sagemaker.feature_store.feature_group import FeatureGroup" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "sound-venture", 36 | "metadata": {}, 37 | "source": [ 38 | "## Set up SageMaker FeatureStore" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 8, 44 | "id": "usual-today", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "region = boto3.Session().region_name\n", 49 | "\n", 50 | "boto_session = boto3.Session(region_name=region)\n", 51 | "sagemaker_client 
= boto_session.client(service_name='sagemaker', region_name=region)\n", 52 | "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", 53 | "\n", 54 | "feature_store_session = Session(\n", 55 | " boto_session=boto_session,\n", 56 | " sagemaker_client=sagemaker_client,\n", 57 | " sagemaker_featurestore_runtime_client=featurestore_runtime\n", 58 | ")\n", 59 | "\n", 60 | "role = get_execution_role()\n", 61 | "s3_client = boto3.client('s3', region_name=region)\n", 62 | "\n", 63 | "default_s3_bucket_name = feature_store_session.default_bucket()\n", 64 | "prefix = 'sagemaker-featurestore-demo'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "ranking-district", 70 | "metadata": {}, 71 | "source": [ 72 | "## Get data from S3" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 9, 78 | "id": "chicken-refrigerator", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Get data from S3 \n", 83 | "bucket_gen = 'nsclc-clinical-genomic-data'\n", 84 | "#bucket_gen = \n", 85 | "\n", 86 | "# Genomic data \n", 87 | "data_key_gen = 'Genomic-data-119patients.csv'\n", 88 | "#data_key_gen = \n", 89 | "\n", 90 | "data_location_gen = 's3://{}/{}'.format(bucket_gen, data_key_gen)\n", 91 | "data_gen = pd.read_csv(data_location_gen)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "introductory-brazil", 97 | "metadata": {}, 98 | "source": [ 99 | "## Ingest data into FeatureStore" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "operating-infrastructure", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "genomic_feature_group_name = 'genomic-feature-group-' + strftime('%d-%H-%M-%S', gmtime())\n", 110 | "genomic_feature_group = FeatureGroup(name=genomic_feature_group_name, sagemaker_session=feature_store_session)\n", 111 | "\n", 112 | "current_time_sec = int(round(time.time()))\n", 113 | "\n", 114 | "def cast_object_to_string(data_frame):\n", 115 | " for label in data_frame.columns:\n", 116 | " if data_frame.dtypes[label] == 'object':\n", 117 | " data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")\n", 118 | "\n", 119 | "# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.\n", 120 | "cast_object_to_string(data_gen)\n", 121 | "\n", 122 | "# Record identifier and event time feature names\n", 123 | "record_identifier_feature_name = \"Case_ID\"\n", 124 | "event_time_feature_name = \"EventTime\"\n", 125 | "\n", 126 | "# Append EventTime feature\n", 127 | "data_gen[event_time_feature_name] = pd.Series([current_time_sec]*len(data_gen), dtype=\"float64\")\n", 128 | "\n", 129 | "# Load feature definitions to the feature group. 
SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.\n", 130 | "genomic_feature_group.load_feature_definitions(data_frame=data_gen); # output is suppressed\n", 131 | "\n", 132 | "\n", 133 | "def wait_for_feature_group_creation_complete(feature_group):\n", 134 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 135 | " while status == \"Creating\":\n", 136 | " print(\"Waiting for Feature Group Creation\")\n", 137 | " time.sleep(5)\n", 138 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 139 | " if status != \"Created\":\n", 140 | " raise RuntimeError(f\"Failed to create feature group {feature_group.name}\")\n", 141 | " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", 142 | "\n", 143 | "genomic_feature_group.create(\n", 144 | " s3_uri=f\"s3://{default_s3_bucket_name}/{prefix}\",\n", 145 | " record_identifier_name=record_identifier_feature_name,\n", 146 | " event_time_feature_name=event_time_feature_name,\n", 147 | " role_arn=role,\n", 148 | " enable_online_store=True\n", 149 | ")\n", 150 | "\n", 151 | "wait_for_feature_group_creation_complete(feature_group=genomic_feature_group)\n", 152 | "\n", 153 | "genomic_feature_group.ingest(\n", 154 | " data_frame=data_gen, max_workers=3, wait=True\n", 155 | ")" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "conda_python3", 162 | "language": "python", 163 | "name": "conda_python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.6.13" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 5 180 | } 181 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/machine-learning-pipelines-for-multimodal-health-data/710eb405d2511394eefedbb07c5dd07d6ca96e48/images/.DS_Store -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/machine-learning-pipelines-for-multimodal-health-data/710eb405d2511394eefedbb07c5dd07d6ca96e48/images/architecture.png -------------------------------------------------------------------------------- /imaging/preprocess-imaging-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import uuid\n", 10 | "import json\n", 11 | "from time import gmtime, strftime\n", 12 | "import boto3\n", 13 | "import sagemaker\n", 14 | "from sagemaker.session import Session\n", 15 | "from sagemaker.feature_store.feature_group import FeatureGroup\n", 16 | "\n", 17 | "role = sagemaker.get_execution_role()\n", 18 | "sagemaker_session = sagemaker.Session()\n", 19 | "region = sagemaker_session.boto_region_name\n", 20 | "boto_session = boto3.Session(region_name=region)\n", 21 | "\n", 22 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 23 | "\n", 24 | "suffix=uuid.uuid1().hex # to be used in resource names" 25 | ] 26 | }, 27 
| { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "pwd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "cd src" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!sed -i \"s|##REGION##|{region}|g\" Dockerfile" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cat Dockerfile" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Build a container image from the Dockerfile" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "scrolled": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "!pip install -q sagemaker-studio-image-build" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "scrolled": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "!sm-docker build . --repository medical-image-processing-smstudio:1.0" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "Define the input and output data location. Please insert your bucket names to `input_data_bucket` and `output_data_bucket`." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "input_data_bucket=''\n", 106 | "input_data_prefix='nsclc_radiogenomics'\n", 107 | "input_data_uri='s3://%s/%s' % (input_data_bucket, input_data_prefix)\n", 108 | "print(input_data_uri)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "output_data_bucket=''\n", 118 | "output_data_prefix='nsclc_radiogenomics'\n", 119 | "output_data_uri='s3://%s/%s' % (output_data_bucket, output_data_prefix)\n", 120 | "print(output_data_uri)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Be sure to use the image and tag name defined in `!sm-docker build` command. We will be replacing the placeholders in the Stepfunctions state machine definition json file with your bucket and image uri." 
128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "ecr_image_uri='%s.dkr.ecr.%s.amazonaws.com/medical-image-processing-smstudio:1.0' % (account_id, region)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "!sed -i \"s|##INPUT_DATA_S3URI##|{input_data_uri}|g\" nsclc-radiogenomics-imaging-workflow.json\n", 146 | "!sed -i \"s|##OUTPUT_DATA_S3URI##|{output_data_uri}|g\" nsclc-radiogenomics-imaging-workflow.json\n", 147 | "!sed -i \"s|##ECR_IMAGE_URI##|{ecr_image_uri}|g\" nsclc-radiogenomics-imaging-workflow.json\n", 148 | "!sed -i \"s|##IAM_ROLE_ARN##|{role}|g\" nsclc-radiogenomics-imaging-workflow.json" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "with open('nsclc-radiogenomics-imaging-workflow.json') as f:\n", 158 | " state_machine_json = json.load(f)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "We need to create an IAM execution role for the Stepfunctions workflow." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "iam = boto3.client('iam')\n", 175 | "\n", 176 | "my_managed_policy = {\n", 177 | " \"Version\": \"2012-10-17\",\n", 178 | " \"Statement\": [\n", 179 | " {\n", 180 | " \"Effect\": \"Allow\",\n", 181 | " \"Action\": [\n", 182 | " \"events:PutTargets\",\n", 183 | " \"events:DescribeRule\",\n", 184 | " \"events:PutRule\"\n", 185 | " ],\n", 186 | " \"Resource\": [\n", 187 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule\",\n", 188 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule\",\n", 189 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule\",\n", 190 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule\",\n", 191 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule\"\n", 192 | " ]\n", 193 | " },\n", 194 | " {\n", 195 | " \"Effect\": \"Allow\",\n", 196 | " \"Action\": \"iam:PassRole\",\n", 197 | " \"Resource\": role,\n", 198 | " \"Condition\": {\n", 199 | " \"StringEquals\": {\n", 200 | " \"iam:PassedToService\": \"sagemaker.amazonaws.com\"\n", 201 | " }\n", 202 | " }\n", 203 | " },\n", 204 | " {\n", 205 | " \"Effect\": \"Allow\",\n", 206 | " \"Action\": [\n", 207 | " \"sagemaker:CreateEndpoint\",\n", 208 | " \"sagemaker:CreateEndpointConfig\",\n", 209 | " \"sagemaker:CreateHyperParameterTuningJob\",\n", 210 | " \"sagemaker:CreateModel\",\n", 211 | " \"sagemaker:CreateProcessingJob\",\n", 212 | " \"sagemaker:CreateTrainingJob\",\n", 213 | " \"sagemaker:CreateTransformJob\",\n", 214 | " \"sagemaker:DeleteEndpoint\",\n", 215 | " \"sagemaker:DeleteEndpointConfig\",\n", 216 | " \"sagemaker:DescribeHyperParameterTuningJob\",\n", 217 | " \"sagemaker:DescribeProcessingJob\",\n", 218 | " \"sagemaker:DescribeTrainingJob\",\n", 219 | " \"sagemaker:DescribeTransformJob\",\n", 220 | " \"sagemaker:ListProcessingJobs\",\n", 221 | " \"sagemaker:ListTags\",\n", 222 | " \"sagemaker:StopHyperParameterTuningJob\",\n", 223 | " \"sagemaker:StopProcessingJob\",\n", 224 | " \"sagemaker:StopTrainingJob\",\n", 225 | " \"sagemaker:StopTransformJob\",\n", 226 | " \"sagemaker:UpdateEndpoint\",\n", 227 | " ],\n", 228 | " \"Resource\": 
\"*\"\n", 229 | " }\n", 230 | " ]\n", 231 | "}\n", 232 | "\n", 233 | "trust_policy = {\n", 234 | " \"Version\": \"2012-10-17\",\n", 235 | " \"Statement\": [\n", 236 | " {\n", 237 | " \"Sid\": \"\",\n", 238 | " \"Effect\": \"Allow\",\n", 239 | " \"Principal\": {\n", 240 | " \"Service\": [\"states.amazonaws.com\", \"sagemaker.amazonaws.com\"]},\n", 241 | " \"Action\": \"sts:AssumeRole\"\n", 242 | " }\n", 243 | " ]\n", 244 | "}" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "policy_name = 'MyStepFunctionsWorkflowExecutionPolicy-%s' % suffix\n", 254 | "role_name = 'MyStepFunctionsWorkflowExecutionRole-%s' % suffix\n", 255 | "policy_response = iam.create_policy(\n", 256 | " PolicyName=policy_name,\n", 257 | " PolicyDocument=json.dumps(my_managed_policy)\n", 258 | ")\n", 259 | "\n", 260 | "role_response = iam.create_role(\n", 261 | " RoleName=role_name,\n", 262 | " AssumeRolePolicyDocument=json.dumps(trust_policy),\n", 263 | " Description='Role to execute StepFunctions workflow which submits SageMaker jobs',\n", 264 | " MaxSessionDuration=3600,\n", 265 | ")\n", 266 | "\n", 267 | "# Attach a policy to role\n", 268 | "iam.attach_role_policy(\n", 269 | " PolicyArn=policy_response['Policy']['Arn'],\n", 270 | " RoleName=role_name\n", 271 | ")\n", 272 | "iam.attach_role_policy(\n", 273 | " PolicyArn='arn:aws:iam::aws:policy/CloudWatchEventsFullAccess',\n", 274 | " RoleName=role_name\n", 275 | ")" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Create a Stepfunctions workflow, i.e. a state machine." 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "sfn = boto3.client('stepfunctions')\n", 292 | "sfn_execution_role = role_response['Role']['Arn']\n", 293 | "state_machine_name = 'nsclc-radiogenomics-imaging-workflow-%s' % suffix\n", 294 | "sfn_response = sfn.create_state_machine(name = state_machine_name,\n", 295 | " definition = json.dumps(state_machine_json),\n", 296 | " roleArn = sfn_execution_role,\n", 297 | " type = 'STANDARD')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "We will be running this workflow for all the `RO1` subjects." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "subject_list = ['R01-%03d'%i for i in range(1,163)]" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "Execute!" 
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "stateMachineArn=sfn_response['stateMachineArn']\n", 330 | "\n", 331 | "feature_store_name = 'imaging-feature-group-%s' % suffix\n", 332 | "processing_job_name = 'dcm-nifti-conversion-%s' % suffix\n", 333 | "offline_store_s3uri = '%s/multimodal-imaging-featurestore' % output_data_uri\n", 334 | "payload = {\n", 335 | " \"PreprocessingJobName\": processing_job_name,\n", 336 | " \"FeatureStoreName\": feature_store_name,\n", 337 | " \"OfflineStoreS3Uri\": offline_store_s3uri,\n", 338 | " \"Subject\": subject_list\n", 339 | "}\n", 340 | "exeution_response = sfn.start_execution(stateMachineArn=stateMachineArn,\n", 341 | " name=suffix,\n", 342 | " input=json.dumps(payload))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "print(exeution_response)" 352 | ] 353 | } 354 | ], 355 | "metadata": { 356 | "instance_type": "ml.t3.medium", 357 | "kernelspec": { 358 | "display_name": "Python 3 (Data Science)", 359 | "language": "python", 360 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.7.10" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 5 377 | } 378 | -------------------------------------------------------------------------------- /imaging/src/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim-buster 2 | 3 | COPY ./requirements.txt /opt/ 4 | RUN pip3 install --no-cache-dir -r /opt/requirements.txt 5 | ENV PYTHONUNBUFFERED=TRUE 6 | ENV AWS_DEFAULT_REGION=##REGION## 7 | 8 | COPY ./dcm2nifti_processing.py /opt/ 9 | COPY ./radiomics_utils.py /opt/ 10 | 11 | ENTRYPOINT ["python3", "/opt/dcm2nifti_processing.py"] 12 | -------------------------------------------------------------------------------- /imaging/src/dcm2nifti_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import dcmstack 4 | from glob import glob 5 | import pydicom 6 | import nibabel as nib 7 | import numpy as np 8 | import sys 9 | import os 10 | import json 11 | import time 12 | import logging 13 | from nilearn import plotting 14 | import matplotlib.pyplot as plt 15 | import radiomics_utils as utils 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--subject', type=str, default='R01-003', 23 | help='Subject ID (default: R01-003)') 24 | parser.add_argument('--feature_store_name', type=str, default='nsclc-radiogenomics-imaging-feature-group', 25 | help='SageMaker Feature Store Group Name (default: nsclc-radiogenomics-imaging-feature-group)') 26 | parser.add_argument('--offline_store_s3uri', type=str, 27 | help='SageMaker Feature Offline Store S3 URI Example: s3://multimodal-image-data-processed/nsclc-radiogenomics-multimodal-imaging-featurestore.') 28 | 29 | args = parser.parse_args() 30 | 31 | data_dir = '/opt/ml/processing/input/' 32 | 
output_dir = '/opt/ml/processing/output/' 33 | 34 | # assume one subject comes in, 35 | # we need to find out where the CT dicom files are 36 | # and segmentation file 37 | logger.info('Locating DICOM files') 38 | jsons = glob(os.path.join(data_dir, '*', '*', '*.json')) 39 | logger.info('%d jsons found.' % len(jsons)) 40 | print('%d jsons found.' % len(jsons)) 41 | 42 | valid_jsons = [] 43 | file_info = [] 44 | seg_string = ['3D Slicer segmentation result', 'ePAD Generated DSO'] 45 | for i, jsonfile in enumerate(jsons): 46 | with open(jsonfile) as f: 47 | aa = json.load(f) 48 | if seg_string[0].lower() in aa['Total'][-1].lower() or seg_string[1].lower() in aa['Total'][-1].lower(): 49 | logger.debug(aa['Total'][-3:]) 50 | print(aa['Total'][-3:]) 51 | valid_jsons.append(jsonfile) 52 | file_info.append([aa['StudyUID'], aa['Date'], aa['SeriesUID'], jsonfile]) 53 | 54 | if len(valid_jsons) > 1: 55 | raise Exception('there are more than one segmentation for this patient') 56 | 57 | print(file_info) 58 | 59 | src_dcms = glob(os.path.join(data_dir, file_info[0][0], file_info[0][1], '*', '*dcm')) 60 | src_seg_dcm = [i for i in src_dcms if file_info[0][2] in i] 61 | logging.info(src_seg_dcm) 62 | src_dcms.remove(src_seg_dcm[0]) 63 | #print(src_dcms) 64 | print('# of src_dcms: %d' % len(src_dcms)) 65 | print('# of src_seg_dcm: %d' % len(src_seg_dcm)) 66 | 67 | # work with CT scan and load as a Nifti image 68 | logger.info('Creating nifti images from DICOM files') 69 | stacks = dcmstack.parse_and_stack(src_dcms) 70 | stack = list(stacks.values())[0] 71 | nii = stack.to_nifti() 72 | img = nii.get_fdata() 73 | 74 | # work with CT segmentation file, load as a numpy array and create a Nifti image 75 | dcm = pydicom.dcmread(src_seg_dcm[0]) 76 | n_frames_seg = int(dcm.NumberOfFrames) 77 | seg = dcm.pixel_array 78 | # reorient the seg array 79 | seg = np.fliplr(seg.T) 80 | 81 | # if seg and img don't have the same dimension, pad the images 82 | if img.shape != seg.shape: 83 | # read all dicoms and parse out the instance number and slice location 84 | # assuming the files are from R01-098 onwards with ePAD Generated DSO 85 | d_sort_instance_number=[] 86 | for tmp_dcm_fname in src_dcms: 87 | tmp_dcm = pydicom.dcmread(tmp_dcm_fname) 88 | d_sort_instance_number.append((int(tmp_dcm[0x0020, 0x0013].value), tmp_dcm[0x0020, 0x0032].value)) 89 | d_sort_instance_number = sorted(d_sort_instance_number, key=lambda aa: aa[0]) 90 | 91 | patient_img_position_first = dcm[0x5200, 0x9230][0][0x0020, 0x9113][0]['ImagePositionPatient'].value 92 | patient_img_position_last = dcm[0x5200, 0x9230][-1][0x0020, 0x9113][0]['ImagePositionPatient'].value 93 | 94 | slice_instance_number_1 = [i for i, j in d_sort_instance_number if j == patient_img_position_first][0] 95 | slice_instance_number_2 = [i for i, j in d_sort_instance_number if j == patient_img_position_last][0] 96 | top_slice_instance_number = min(slice_instance_number_1, slice_instance_number_2) 97 | 98 | # logger.debug(np.nonzero(seg.sum(axis=1).sum(axis=1))[0]) 99 | tmp_seg = np.zeros_like(img) 100 | starting_index = img.shape[-1] - top_slice_instance_number - n_frames_seg # the seg and the image is flipped and need to locate from bottom. 
101 | ending_index = starting_index + n_frames_seg 102 | tmp_seg[:, :, starting_index:ending_index] = seg 103 | seg = tmp_seg 104 | seg_nii = nib.Nifti1Image(seg, nii.affine, header = nii.header) 105 | 106 | # save some viz 107 | logger.info('Saving files.') 108 | prefix = '%s_%s' % (args.subject, file_info[0][1]) 109 | f1 = plt.figure(figsize=(16,6)) 110 | g1 = plotting.plot_roi(seg_nii, bg_img = nii, figure = f1, alpha = 0.4, title = 'Lung CT with segmentation') 111 | g1.savefig(os.path.join(output_dir, 'PNG', '%s_ortho-view.png' % prefix), dpi = 150) 112 | 113 | f2 = plt.figure(figsize=(16,6)) 114 | g2 = plotting.plot_roi(seg_nii, bg_img = nii, figure = f2, alpha = 0.4, title = 'Lung CT with segmentation', 115 | display_mode='z', cut_coords=4) 116 | g2.savefig(os.path.join(output_dir, 'PNG', '%s_z-view.png' % prefix), dpi = 150) 117 | 118 | # save images 119 | imageName = os.path.join(output_dir, 'CT-Nifti', '%s.nii.gz' % prefix) 120 | maskName = os.path.join(output_dir, 'CT-SEG', '%s.nii.gz' % prefix) 121 | nii.to_filename(imageName) 122 | seg_nii.to_filename(maskName) 123 | 124 | # compute radiomic features 125 | logging.info('Computing radiomic features') 126 | df = utils.compute_features(imageName, maskName) 127 | 128 | # format dataframe for feature store 129 | record_id_column = 'Subject' 130 | event_time_column = 'EventTime' 131 | df[record_id_column] = args.subject 132 | current_time_sec = float(round(time.time())) 133 | df[event_time_column] = current_time_sec 134 | df['ScanDate'] = file_info[0][1] 135 | utils.cast_object_to_string(df) 136 | 137 | # check if feature store exists 138 | feature_group = utils.check_feature_group(args.feature_store_name) 139 | if not feature_group: 140 | feature_group = utils.create_feature_group(args.feature_store_name, df, args.offline_store_s3uri, 141 | record_id = record_id_column, event_time = event_time_column, 142 | enable_online_store = True) 143 | 144 | # ingest features into a FeatureStore 145 | feature_group.ingest(data_frame=df, max_workers=1, wait=True) 146 | 147 | print('Processing done for %s' % prefix) 148 | logging.info('Processing done for %s' % prefix) 149 | -------------------------------------------------------------------------------- /imaging/src/nsclc-radiogenomics-imaging-workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "StartAt": "iterate_over_subjects", 3 | "States": { 4 | "iterate_over_subjects": { 5 | "ItemsPath": "$.Subject", 6 | "MaxConcurrency": 50, 7 | "Type": "Map", 8 | "Next": "Finish", 9 | "Iterator": { 10 | "StartAt": "DICOM/NIfTI Conversion and Radiomic Feature Extraction", 11 | "States": { 12 | "Fallback": { 13 | "Type": "Pass", 14 | "Result": "This iteration failed for some reason", 15 | "End": true 16 | }, 17 | "DICOM/NIfTI Conversion and Radiomic Feature Extraction": { 18 | "Type": "Task", 19 | "OutputPath": "$.ProcessingJobArn", 20 | "Resource": "arn:aws:states:::sagemaker:createProcessingJob.sync", 21 | "Retry": [ 22 | { 23 | "ErrorEquals": [ 24 | "SageMaker.AmazonSageMakerException" 25 | ], 26 | "IntervalSeconds": 15, 27 | "MaxAttempts": 8, 28 | "BackoffRate": 1.5 29 | } 30 | ], 31 | "Catch": [ 32 | { 33 | "ErrorEquals": [ 34 | "States.TaskFailed" 35 | ], 36 | "Next": "Fallback" 37 | } 38 | ], 39 | "Parameters": { 40 | "ProcessingJobName.$": "States.Format('{}-{}', $$.Execution.Input['PreprocessingJobName'], $)", 41 | "ProcessingInputs": [ 42 | { 43 | "InputName": "DICOM", 44 | "AppManaged": false, 45 | "S3Input": { 46 | "S3Uri.$": 
"States.Format('##INPUT_DATA_S3URI##/{}' , $)", 47 | "LocalPath": "/opt/ml/processing/input", 48 | "S3DataType": "S3Prefix", 49 | "S3InputMode": "File", 50 | "S3DataDistributionType": "FullyReplicated", 51 | "S3CompressionType": "None" 52 | } 53 | } 54 | ], 55 | "ProcessingOutputConfig": { 56 | "Outputs": [ 57 | { 58 | "OutputName": "CT-Nifti", 59 | "AppManaged": false, 60 | "S3Output": { 61 | "S3Uri": "##OUTPUT_DATA_S3URI##/CT-Nifti", 62 | "LocalPath": "/opt/ml/processing/output/CT-Nifti", 63 | "S3UploadMode": "EndOfJob" 64 | } 65 | }, 66 | { 67 | "OutputName": "CT-SEG", 68 | "AppManaged": false, 69 | "S3Output": { 70 | "S3Uri": "##OUTPUT_DATA_S3URI##/CT-SEG", 71 | "LocalPath": "/opt/ml/processing/output/CT-SEG", 72 | "S3UploadMode": "EndOfJob" 73 | } 74 | }, 75 | { 76 | "OutputName": "PNG", 77 | "AppManaged": false, 78 | "S3Output": { 79 | "S3Uri": "##OUTPUT_DATA_S3URI##/PNG", 80 | "LocalPath": "/opt/ml/processing/output/PNG", 81 | "S3UploadMode": "EndOfJob" 82 | } 83 | } 84 | ] 85 | }, 86 | "AppSpecification": { 87 | "ImageUri": "##ECR_IMAGE_URI##", 88 | "ContainerArguments.$": "States.Array('--subject', $, '--feature_store_name', $$.Execution.Input['FeatureStoreName'], '--offline_store_s3uri', $$.Execution.Input['OfflineStoreS3Uri'])", 89 | "ContainerEntrypoint": [ 90 | "python3", 91 | "/opt/dcm2nifti_processing.py" 92 | ] 93 | }, 94 | "RoleArn": "##IAM_ROLE_ARN##", 95 | "ProcessingResources": { 96 | "ClusterConfig": { 97 | "InstanceCount": 1, 98 | "InstanceType": "ml.r5.large", 99 | "VolumeSizeInGB": 5 100 | } 101 | } 102 | }, 103 | "End": true 104 | } 105 | } 106 | } 107 | }, 108 | "Finish": { 109 | "Type": "Succeed" 110 | } 111 | } 112 | } -------------------------------------------------------------------------------- /imaging/src/radiomics_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import numpy as np 4 | from radiomics import featureextractor 5 | import boto3 6 | import sagemaker 7 | from sagemaker.session import Session 8 | from sagemaker.feature_store.feature_group import FeatureGroup 9 | from sagemaker import get_execution_role 10 | 11 | sagemaker_session = sagemaker.Session() 12 | region = sagemaker_session.boto_region_name 13 | print('region is %s' % region) 14 | 15 | boto_session = boto3.Session(region_name=region) 16 | role = get_execution_role() 17 | print('role is %s' % role) 18 | 19 | sagemaker_client = boto3.client(service_name='sagemaker', region_name=region) 20 | featurestore_runtime = boto3.client('sagemaker-featurestore-runtime', region_name=region) 21 | 22 | feature_store_session = Session( 23 | boto_session=boto_session, 24 | sagemaker_client=sagemaker_client, 25 | sagemaker_featurestore_runtime_client=featurestore_runtime 26 | ) 27 | 28 | 29 | def cast_object_to_string(data_frame): 30 | for label in data_frame.columns: 31 | if data_frame.dtypes[label] == 'object': 32 | data_frame[label] = data_frame[label].astype("str").astype("string") 33 | 34 | 35 | def compute_features(imageName, maskName): 36 | extractor = featureextractor.RadiomicsFeatureExtractor() 37 | featureVector = extractor.execute(imageName, maskName) 38 | 39 | new_dict={} 40 | for featureName in featureVector.keys(): 41 | print("Computed %s: %s" % (featureName, featureVector[featureName])) 42 | print(type(featureVector[featureName])) 43 | if isinstance(featureVector[featureName], np.ndarray): 44 | new_dict[featureName]=float(featureVector[featureName]) 45 | else: 46 | 
new_dict[featureName]=featureVector[featureName] 47 | 48 | df=pd.DataFrame.from_dict(new_dict, orient='index').T 49 | df=df.convert_dtypes(convert_integer=False) 50 | df['imageName']=imageName 51 | df['maskName']=maskName 52 | 53 | return df 54 | 55 | def check_feature_group(feature_group_name): 56 | feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) 57 | status = None 58 | try: 59 | status = feature_group.describe()['FeatureGroupStatus'] 60 | except Exception: # describe() fails when the feature group does not exist yet 61 | pass 62 | 63 | if status == 'Created': 64 | return feature_group 65 | elif status is None: 66 | return False 67 | else: 68 | wait_for_feature_group_creation_complete(feature_group) 69 | return feature_group 70 | 71 | 72 | 73 | def create_feature_group(feature_group_name, dataframe, s3uri, record_id = 'Subject', event_time = 'EventTime', 74 | enable_online_store = True): 75 | print(feature_group_name) 76 | print(feature_store_session) 77 | feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) 78 | feature_group.load_feature_definitions(data_frame=dataframe) 79 | feature_group.create(s3_uri=s3uri, 80 | record_identifier_name=record_id, 81 | event_time_feature_name=event_time, 82 | role_arn=role, 83 | enable_online_store=enable_online_store) 84 | wait_for_feature_group_creation_complete(feature_group) 85 | 86 | return feature_group 87 | 88 | 89 | def wait_for_feature_group_creation_complete(feature_group): 90 | status = feature_group.describe()['FeatureGroupStatus'] 91 | while status == "Creating": 92 | print("Waiting for Feature Group Creation") 93 | time.sleep(5) 94 | status = feature_group.describe()['FeatureGroupStatus'] 95 | if status != "Created": 96 | raise RuntimeError(f"Failed to create feature group {feature_group.name}") 97 | print(f"FeatureGroup {feature_group.name} successfully created.") -------------------------------------------------------------------------------- /imaging/src/requirements.txt: -------------------------------------------------------------------------------- 1 | pydicom == 2.1.2 2 | numpy == 1.19.5 3 | nibabel == 3.2.1 4 | nilearn == 0.7.0 5 | matplotlib == 3.1.3 6 | dcmstack == 0.8.0 7 | pandas == 1.1.5 8 | pyradiomics == 3.0.1 9 | sagemaker == 2.27.0 10 | boto3 == 1.17.18 --------------------------------------------------------------------------------
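
The model training step (`model-train-test/train-test-model.ipynb`, not reproduced above) consumes the clinical, genomic, and imaging feature groups created by the three preprocessing notebooks. As a hedged sketch of how those features can be read back from the offline Feature Store and joined across modalities with Athena, the snippet below uses the SageMaker Feature Store Athena integration. The feature group names are placeholders for the names printed by each notebook, the join keys follow the record identifiers used above (`Case_ID` for clinical and genomic, `Subject` for imaging), and it assumes the offline store has finished syncing; the actual notebook may do this differently.

```python
# Hypothetical sketch, not a file from this repository: join the three feature
# groups from the offline store to build a multimodal training dataframe.
# Replace the placeholder feature group names with the ones created by the
# preprocessing notebooks above.
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup

session = sagemaker.Session()
bucket = session.default_bucket()

clinical_fg = FeatureGroup(name='<clinical-feature-group-name>', sagemaker_session=session)
genomic_fg = FeatureGroup(name='<genomic-feature-group-name>', sagemaker_session=session)
imaging_fg = FeatureGroup(name='<imaging-feature-group-name>', sagemaker_session=session)

# Each feature group maps to a Glue table in the offline store;
# Athena lowercases the column names (Case_ID -> case_id, Subject -> subject).
query = clinical_fg.athena_query()
clinical_table = query.table_name
genomic_table = genomic_fg.athena_query().table_name
imaging_table = imaging_fg.athena_query().table_name

query_string = f'''
SELECT *
FROM "{clinical_table}" c
JOIN "{genomic_table}" g ON c.case_id = g.case_id
JOIN "{imaging_table}" i ON c.case_id = i.subject
'''

query.run(query_string=query_string,
          output_location=f's3://{bucket}/athena-query-results/')
query.wait()
multimodal_df = query.as_dataframe()
print(multimodal_df.shape)
```

Querying the offline store is what makes this cross-modality join possible; the online store only supports single-record lookups by the record identifier.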