├── .DS_Store ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── clinical └── preprocess-clinical-data.ipynb ├── genomic └── preprocess-genomic-data.ipynb ├── images ├── .DS_Store └── architecture.png ├── imaging ├── preprocess-imaging-data.ipynb └── src │ ├── Dockerfile │ ├── dcm2nifti_processing.py │ ├── nsclc-radiogenomics-imaging-workflow.json │ ├── radiomics_utils.py │ └── requirements.txt └── model-train-test └── train-test-model.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/machine-learning-pipelines-for-multimodal-health-data/710eb405d2511394eefedbb07c5dd07d6ca96e48/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. 
If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. Our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), so looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Analyzing Multimodal Health Data on AWS 2 | 3 | ## Introduction 4 | This repository contains code samples related to the two-part blog series [Building Scalable Machine Learning Pipelines for Multimodal Health Data on AWS](https://aws.amazon.com/blogs/industries/building-scalable-machine-learning-pipelines-for-multimodal-health-data-on-aws/) and [Training Machine Learning Models on Multimodal Health Data with Amazon SageMaker](https://aws.amazon.com/blogs/industries/training-machine-learning-models-on-multimodal-health-data-with-amazon-sagemaker/). 5 | 6 | ## Machine Learning Pipelines for Multimodal Health Data 7 | 8 | You can use these artifacts to recreate the pipelines and analysis presented in the blog posts, as shown below. 9 | 10 | ![Architecture on AWS](./images/architecture.png) 11 | 12 | ## Project Structure 13 | 14 | Artifacts for processing each data modality are located in corresponding subdirectories of this repo. 15 | 16 | ``` 17 | ./ 18 | ./genomic/ <-- Artifacts for genomics pipeline 19 | ./clinical/ <-- Artifacts for clinical pipeline * 20 | ./imaging/ <-- Artifacts for medical imaging pipeline 21 | ./model-train-test/ <-- Artifacts for performing model training and testing 22 | ``` 23 | 24 | \* The clinical data can also be preprocessed with Amazon SageMaker Data Wrangler, as discussed in the blog. 25 | 26 | ## Security 27 | 28 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 29 | 30 | ## License 31 | 32 | This library is licensed under the MIT-0 License. See the LICENSE file. 33 | -------------------------------------------------------------------------------- /clinical/preprocess-clinical-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 259, 6 | "id": "95344a61", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "## This script is used to read and preprocess clinical data (in tabular format) from S3 and store features in SageMaker FeatureStore" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 260, 16 | "id": "efa5b476", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import boto3\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import io, os\n", 25 | "from time import gmtime, strftime, sleep\n", 26 | "import time\n", 27 | "import sagemaker\n", 28 | "from sagemaker.session import Session\n", 29 | "from sagemaker import get_execution_role\n", 30 | "from sagemaker.feature_store.feature_group import FeatureGroup" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "d3e657f8", 36 | "metadata": {}, 37 | "source": [ 38 | "## Set up SageMaker FeatureStore" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 261, 44 | "id": "da137d82", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "region = boto3.Session().region_name\n", 49 | "\n", 50 | "boto_session = boto3.Session(region_name=region)\n", 51 | "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", 52 | "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", 53 | "\n", 54 | "feature_store_session = Session(\n", 55 | "
boto_session=boto_session,\n", 56 | " sagemaker_client=sagemaker_client,\n", 57 | " sagemaker_featurestore_runtime_client=featurestore_runtime\n", 58 | ")\n", 59 | "\n", 60 | "role = get_execution_role()\n", 61 | "s3_client = boto3.client('s3', region_name=region)\n", 62 | "\n", 63 | "default_s3_bucket_name = feature_store_session.default_bucket()\n", 64 | "prefix = 'sagemaker-featurestore-demo'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "78ef8a21", 70 | "metadata": {}, 71 | "source": [ 72 | "## Get data from S3" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 262, 78 | "id": "04fa869d", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Get data from S3 \n", 83 | "bucket_clin = 'nsclc-clinical-genomic-data'\n", 84 | "#bucket_clin = \n", 85 | "\n", 86 | "# Clinical data \n", 87 | "#data_key_clin = 'Clinical-data-119patients.csv'\n", 88 | "data_key_clin = 'NSCLCR01Radiogenomic_DATA_LABELS_2018-05-22_1500-shifted.csv'\n", 89 | "#data_key_clin = \n", 90 | "\n", 91 | "data_location_clin = 's3://{}/{}'.format(bucket_clin, data_key_clin)\n", 92 | "data_clinical = pd.read_csv(data_location_clin)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "e80ed14b", 98 | "metadata": {}, 99 | "source": [ 100 | "## Preprocess Data" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 264, 106 | "id": "d6131af6", 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "(147, 89)" 113 | ] 114 | }, 115 | "execution_count": 264, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "# Keep samples starting with \"R01-*\" as these IDs have corresponding medical imaging data. Delete samples with Case IDs \"AMC-*\". 
\n", 122 | "data_clinical = data_clinical[~data_clinical[\"Case ID\"].str.contains(\"AMC\")]\n", 123 | "\n", 124 | "# Delete columns with ID and dates\n", 125 | "list_delete_cols = ['Quit Smoking Year', 'Date of Recurrence', 'Date of Last Known Alive', 'Date of Death', 'CT Date', 'PET Date']\n", 126 | "data_clinical.drop(list_delete_cols, axis=1, inplace=True)\n", 127 | "\n", 128 | "# List of features with catergorical value\n", 129 | "list_encode_cols = [\"Patient affiliation\", \"Gender\", \"Ethnicity\", \"Smoking status\", \"%GG\", \"Tumor Location (choice=RUL)\", \"Tumor Location (choice=RML)\", \"Tumor Location (choice=RLL)\", \"Tumor Location (choice=LUL)\", \"Tumor Location (choice=LLL)\", \"Tumor Location (choice=L Lingula)\", \"Tumor Location (choice=Unknown)\", \"Histology \", \"Pathological T stage\", \"Pathological N stage\", \"Pathological M stage\", \"Histopathological Grade\", \"Lymphovascular invasion\", \"Pleural invasion (elastic, visceral, or parietal)\", \"EGFR mutation status\", \"KRAS mutation status\", \"ALK translocation status\", \"Adjuvant Treatment\", \"Chemotherapy\", \"Radiation\", \"Recurrence\", \"Recurrence Location\"]\n", 130 | "\n", 131 | "# List of features with numeric value\n", 132 | "list_nonenc_cols = [\"Case ID\", \"Age at Histological Diagnosis\", \"Weight (lbs)\", \"Pack Years\", \"Time to Death (days)\", \"Days between CT and surgery\", \"Survival Status\"]\n", 133 | "\n", 134 | "# One-hot encoding of features with categorical value\n", 135 | "data_clinical_enc = pd.get_dummies(data_clinical[list_encode_cols])\n", 136 | "\n", 137 | "data_clinical_nonenc = data_clinical[list_nonenc_cols]\n", 138 | "\n", 139 | "# Combine all features\n", 140 | "data_clin = pd.concat([data_clinical_enc, data_clinical_nonenc], axis=1)\n", 141 | "\n", 142 | "# Feature names inside FeatureStore should not have special chars and should be < 64 chars long\n", 143 | "# Update feature names accordingly\n", 144 | "\n", 145 | "l_char = ['-',' ','%','/','<','>','(',')','=',',',':']\n", 146 | "\n", 147 | "for col in (data_clin.columns):\n", 148 | "\n", 149 | " if (col == \"Case ID\"):\n", 150 | " data_clin.rename(columns={col: col.replace(' ','_')}, inplace = True)\n", 151 | " continue\n", 152 | "\n", 153 | " for char in l_char:\n", 154 | " if char in col:\n", 155 | " data_clin.rename(columns={col: col.replace(char,'')}, inplace = True)\n", 156 | " col = col.replace(char,'')\n", 157 | " \n", 158 | " if (len(col)>=64):\n", 159 | " data_clin.rename(columns={col: col[:60]}, inplace = True)\n", 160 | " \n", 161 | "# Change label (survival status) \"Dead\"=1 and \"Alive\"=0 \n", 162 | "data_clin[\"SurvivalStatus\"].replace({\"Dead\": \"1\", \"Alive\": \"0\"}, inplace=True)\n", 163 | "\n", 164 | "\n", 165 | "# Drop samples with missing values. \n", 166 | "# Fill NaN with 0. For eg. PackYears for non-smokers is \"NA\". 
Change it to 0.\n", 167 | "data_clin = data_clin[data_clin['Weightlbs'] != \"Not Collected\"]\n", 168 | "data_clin = data_clin[data_clin['PackYears'] != \"Not Collected\"]\n", 169 | "data_clin.fillna(0)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "id": "33d68a8a", 175 | "metadata": {}, 176 | "source": [ 177 | "## Ingest data into FeatureStore" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 266, 183 | "id": "a03327c5", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "Patientaffiliation_Stanford\n", 191 | "Patientaffiliation_VA\n", 192 | "Gender_Female\n", 193 | "Gender_Male\n", 194 | "Ethnicity_AfricanAmerican\n", 195 | "Ethnicity_Asian\n", 196 | "Ethnicity_Caucasian\n", 197 | "Ethnicity_HispanicLatino\n", 198 | "Ethnicity_NativeHawaiianPacificIslander\n", 199 | "Smokingstatus_Current\n", 200 | "Smokingstatus_Former\n", 201 | "Smokingstatus_Nonsmoker\n", 202 | "GG_0\n", 203 | "GG_100\n", 204 | "GG_2550\n", 205 | "GG_5075\n", 206 | "GG_75100\n", 207 | "GG_025\n", 208 | "GG_NotAssessed\n", 209 | "TumorLocationchoiceRUL_Checked\n", 210 | "TumorLocationchoiceRUL_Unchecked\n", 211 | "TumorLocationchoiceRML_Checked\n", 212 | "TumorLocationchoiceRML_Unchecked\n", 213 | "TumorLocationchoiceRLL_Checked\n", 214 | "TumorLocationchoiceRLL_Unchecked\n", 215 | "TumorLocationchoiceLUL_Checked\n", 216 | "TumorLocationchoiceLUL_Unchecked\n", 217 | "TumorLocationchoiceLLL_Checked\n", 218 | "TumorLocationchoiceLLL_Unchecked\n", 219 | "TumorLocationchoiceLLingula_Checked\n", 220 | "TumorLocationchoiceLLingula_Unchecked\n", 221 | "TumorLocationchoiceUnknown_Unchecked\n", 222 | "Histology_Adenocarcinoma\n", 223 | "Histology_NSCLCNOSnototherwisespecified\n", 224 | "Histology_Squamouscellcarcinoma\n", 225 | "PathologicalTstage_T1a\n", 226 | "PathologicalTstage_T1b\n", 227 | "PathologicalTstage_T2a\n", 228 | "PathologicalTstage_T2b\n", 229 | "PathologicalTstage_T3\n", 230 | "PathologicalTstage_T4\n", 231 | "PathologicalTstage_Tis\n", 232 | "PathologicalNstage_N0\n", 233 | "PathologicalNstage_N1\n", 234 | "PathologicalNstage_N2\n", 235 | "PathologicalMstage_M0\n", 236 | "PathologicalMstage_M1a\n", 237 | "PathologicalMstage_M1b\n", 238 | "HistopathologicalGrade_G1Welldifferentiated\n", 239 | "HistopathologicalGrade_G2Moderatelydifferentiated\n", 240 | "HistopathologicalGrade_G3Poorlydifferentiated\n", 241 | "HistopathologicalGrade_OtherTypeIWelltomoderatelydifferentiated\n", 242 | "HistopathologicalGrade_OtherTypeIIModeratelytopoorlydifferen\n", 243 | "Lymphovascularinvasion_Absent\n", 244 | "Lymphovascularinvasion_NotCollected\n", 245 | "Lymphovascularinvasion_Present\n", 246 | "Pleuralinvasionelasticvisceralorparietal_No\n", 247 | "Pleuralinvasionelasticvisceralorparietal_Notcollected\n", 248 | "Pleuralinvasionelasticvisceralorparietal_Yes\n", 249 | "EGFRmutationstatus_Mutant\n", 250 | "EGFRmutationstatus_Notcollected\n", 251 | "EGFRmutationstatus_Unknown\n", 252 | "EGFRmutationstatus_Wildtype\n", 253 | "KRASmutationstatus_Mutant\n", 254 | "KRASmutationstatus_Notcollected\n", 255 | "KRASmutationstatus_Unknown\n", 256 | "KRASmutationstatus_Wildtype\n", 257 | "ALKtranslocationstatus_Notcollected\n", 258 | "ALKtranslocationstatus_Translocated\n", 259 | "ALKtranslocationstatus_Unknown\n", 260 | "ALKtranslocationstatus_Wildtype\n", 261 | "AdjuvantTreatment_No\n", 262 | "AdjuvantTreatment_Yes\n", 263 | "Chemotherapy_No\n", 264 | "Chemotherapy_Yes\n", 265 | "Radiation_No\n", 266 | 
"Radiation_Yes\n", 267 | "Recurrence_no\n", 268 | "Recurrence_yes\n", 269 | "RecurrenceLocation_distant\n", 270 | "RecurrenceLocation_local\n", 271 | "RecurrenceLocation_regional\n", 272 | "Case_ID\n", 273 | "AgeatHistologicalDiagnosis\n", 274 | "Weightlbs\n", 275 | "PackYears\n", 276 | "TimetoDeathdays\n", 277 | "DaysbetweenCTandsurgery\n", 278 | "SurvivalStatus\n", 279 | "Waiting for Feature Group Creation\n", 280 | "Waiting for Feature Group Creation\n", 281 | "FeatureGroup clinical-feature-group-29-00-11-19 successfully created.\n" 282 | ] 283 | }, 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "IngestionManagerPandas(feature_group_name='clinical-feature-group-29-00-11-19', sagemaker_fs_runtime_client_config=, max_workers=3, max_processes=1, _async_result=, _processing_pool=, _failed_indices=[])" 288 | ] 289 | }, 290 | "execution_count": 266, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "clinical_feature_group_name = 'clinical-feature-group-' + strftime('%d-%H-%M-%S', gmtime())\n", 297 | "clinical_feature_group = FeatureGroup(name=clinical_feature_group_name, sagemaker_session=feature_store_session)\n", 298 | "\n", 299 | "current_time_sec = int(round(time.time()))\n", 300 | "\n", 301 | "def cast_object_to_string(data_frame):\n", 302 | " for label in data_frame.columns:\n", 303 | " print (label)\n", 304 | " if data_frame.dtypes[label] == 'object':\n", 305 | " data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")\n", 306 | "\n", 307 | "# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.\n", 308 | "cast_object_to_string(data_clin)\n", 309 | "\n", 310 | "# Record identifier and event time feature names\n", 311 | "record_identifier_feature_name = \"Case_ID\"\n", 312 | "event_time_feature_name = \"EventTime\"\n", 313 | "\n", 314 | "# Append EventTime feature\n", 315 | "data_clin[event_time_feature_name] = pd.Series([current_time_sec]*len(data_clin), dtype=\"float64\")\n", 316 | "\n", 317 | "## If event time generates NaN\n", 318 | "data_clin[event_time_feature_name] = data_clin[event_time_feature_name].fillna(0)\n", 319 | "\n", 320 | "# Load feature definitions to the feature group. 
SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.\n", 321 | "clinical_feature_group.load_feature_definitions(data_frame=data_clin); # output is suppressed\n", 322 | "\n", 323 | "\n", 324 | "def wait_for_feature_group_creation_complete(feature_group):\n", 325 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 326 | " while status == \"Creating\":\n", 327 | " print(\"Waiting for Feature Group Creation\")\n", 328 | " time.sleep(5)\n", 329 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 330 | " if status != \"Created\":\n", 331 | " raise RuntimeError(f\"Failed to create feature group {feature_group.name}\")\n", 332 | " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", 333 | "\n", 334 | "clinical_feature_group.create(\n", 335 | " s3_uri=f\"s3://{default_s3_bucket_name}/{prefix}\",\n", 336 | " record_identifier_name=record_identifier_feature_name,\n", 337 | " event_time_feature_name=event_time_feature_name,\n", 338 | " role_arn=role,\n", 339 | " enable_online_store=True\n", 340 | ")\n", 341 | "\n", 342 | "wait_for_feature_group_creation_complete(feature_group=clinical_feature_group)\n", 343 | "\n", 344 | "clinical_feature_group.ingest(\n", 345 | " data_frame=data_clin, max_workers=3, wait=True\n", 346 | ")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "2ac04c7a", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "conda_python3", 361 | "language": "python", 362 | "name": "conda_python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.6.13" 375 | } 376 | }, 377 | "nbformat": 4, 378 | "nbformat_minor": 5 379 | } 380 | -------------------------------------------------------------------------------- /genomic/preprocess-genomic-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "possible-regular", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "## This script is used to read genomic data (in tabular format) from S3 and store features in SageMaker FeatureStore" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 7, 16 | "id": "nasty-consensus", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import boto3\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import io, os\n", 25 | "from time import gmtime, strftime, sleep\n", 26 | "import time\n", 27 | "import sagemaker\n", 28 | "from sagemaker.session import Session\n", 29 | "from sagemaker import get_execution_role\n", 30 | "from sagemaker.feature_store.feature_group import FeatureGroup" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "sound-venture", 36 | "metadata": {}, 37 | "source": [ 38 | "## Set up SageMaker FeatureStore" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 8, 44 | "id": "usual-today", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "region = boto3.Session().region_name\n", 49 | "\n", 50 | "boto_session = boto3.Session(region_name=region)\n", 51 | "sagemaker_client 
= boto_session.client(service_name='sagemaker', region_name=region)\n", 52 | "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", 53 | "\n", 54 | "feature_store_session = Session(\n", 55 | " boto_session=boto_session,\n", 56 | " sagemaker_client=sagemaker_client,\n", 57 | " sagemaker_featurestore_runtime_client=featurestore_runtime\n", 58 | ")\n", 59 | "\n", 60 | "role = get_execution_role()\n", 61 | "s3_client = boto3.client('s3', region_name=region)\n", 62 | "\n", 63 | "default_s3_bucket_name = feature_store_session.default_bucket()\n", 64 | "prefix = 'sagemaker-featurestore-demo'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "ranking-district", 70 | "metadata": {}, 71 | "source": [ 72 | "## Get data from S3" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 9, 78 | "id": "chicken-refrigerator", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Get data from S3 \n", 83 | "bucket_gen = 'nsclc-clinical-genomic-data'\n", 84 | "#bucket_gen = \n", 85 | "\n", 86 | "# Genomic data \n", 87 | "data_key_gen = 'Genomic-data-119patients.csv'\n", 88 | "#data_key_gen = \n", 89 | "\n", 90 | "data_location_gen = 's3://{}/{}'.format(bucket_gen, data_key_gen)\n", 91 | "data_gen = pd.read_csv(data_location_gen)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "introductory-brazil", 97 | "metadata": {}, 98 | "source": [ 99 | "## Ingest data into FeatureStore" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "operating-infrastructure", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "genomic_feature_group_name = 'genomic-feature-group-' + strftime('%d-%H-%M-%S', gmtime())\n", 110 | "genomic_feature_group = FeatureGroup(name=genomic_feature_group_name, sagemaker_session=feature_store_session)\n", 111 | "\n", 112 | "current_time_sec = int(round(time.time()))\n", 113 | "\n", 114 | "def cast_object_to_string(data_frame):\n", 115 | " for label in data_frame.columns:\n", 116 | " if data_frame.dtypes[label] == 'object':\n", 117 | " data_frame[label] = data_frame[label].astype(\"str\").astype(\"string\")\n", 118 | "\n", 119 | "# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.\n", 120 | "cast_object_to_string(data_gen)\n", 121 | "\n", 122 | "# Record identifier and event time feature names\n", 123 | "record_identifier_feature_name = \"Case_ID\"\n", 124 | "event_time_feature_name = \"EventTime\"\n", 125 | "\n", 126 | "# Append EventTime feature\n", 127 | "data_gen[event_time_feature_name] = pd.Series([current_time_sec]*len(data_gen), dtype=\"float64\")\n", 128 | "\n", 129 | "# Load feature definitions to the feature group. 
SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.\n", 130 | "genomic_feature_group.load_feature_definitions(data_frame=data_gen); # output is suppressed\n", 131 | "\n", 132 | "\n", 133 | "def wait_for_feature_group_creation_complete(feature_group):\n", 134 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 135 | " while status == \"Creating\":\n", 136 | " print(\"Waiting for Feature Group Creation\")\n", 137 | " time.sleep(5)\n", 138 | " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", 139 | " if status != \"Created\":\n", 140 | " raise RuntimeError(f\"Failed to create feature group {feature_group.name}\")\n", 141 | " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", 142 | "\n", 143 | "genomic_feature_group.create(\n", 144 | " s3_uri=f\"s3://{default_s3_bucket_name}/{prefix}\",\n", 145 | " record_identifier_name=record_identifier_feature_name,\n", 146 | " event_time_feature_name=event_time_feature_name,\n", 147 | " role_arn=role,\n", 148 | " enable_online_store=True\n", 149 | ")\n", 150 | "\n", 151 | "wait_for_feature_group_creation_complete(feature_group=genomic_feature_group)\n", 152 | "\n", 153 | "genomic_feature_group.ingest(\n", 154 | " data_frame=data_gen, max_workers=3, wait=True\n", 155 | ")" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "conda_python3", 162 | "language": "python", 163 | "name": "conda_python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.6.13" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 5 180 | } 181 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/machine-learning-pipelines-for-multimodal-health-data/710eb405d2511394eefedbb07c5dd07d6ca96e48/images/.DS_Store -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/machine-learning-pipelines-for-multimodal-health-data/710eb405d2511394eefedbb07c5dd07d6ca96e48/images/architecture.png -------------------------------------------------------------------------------- /imaging/preprocess-imaging-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import uuid\n", 10 | "import json\n", 11 | "from time import gmtime, strftime\n", 12 | "import boto3\n", 13 | "import sagemaker\n", 14 | "from sagemaker.session import Session\n", 15 | "from sagemaker.feature_store.feature_group import FeatureGroup\n", 16 | "\n", 17 | "role = sagemaker.get_execution_role()\n", 18 | "sagemaker_session = sagemaker.Session()\n", 19 | "region = sagemaker_session.boto_region_name\n", 20 | "boto_session = boto3.Session(region_name=region)\n", 21 | "\n", 22 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 23 | "\n", 24 | "suffix=uuid.uuid1().hex # to be used in resource names" 25 | ] 26 | }, 27 
| { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "pwd" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "cd src" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!sed -i \"s|##REGION##|{region}|g\" Dockerfile" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!cat Dockerfile" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Build a container image from the Dockerfile" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "scrolled": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "!pip install -q sagemaker-studio-image-build" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "scrolled": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "!sm-docker build . --repository medical-image-processing-smstudio:1.0" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "Define the input and output data location. Please insert your bucket names to `input_data_bucket` and `output_data_bucket`." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "input_data_bucket=''\n", 106 | "input_data_prefix='nsclc_radiogenomics'\n", 107 | "input_data_uri='s3://%s/%s' % (input_data_bucket, input_data_prefix)\n", 108 | "print(input_data_uri)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "output_data_bucket=''\n", 118 | "output_data_prefix='nsclc_radiogenomics'\n", 119 | "output_data_uri='s3://%s/%s' % (output_data_bucket, output_data_prefix)\n", 120 | "print(output_data_uri)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Be sure to use the image and tag name defined in `!sm-docker build` command. We will be replacing the placeholders in the Stepfunctions state machine definition json file with your bucket and image uri." 
128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "ecr_image_uri='%s.dkr.ecr.%s.amazonaws.com/medical-image-processing-smstudio:1.0' % (account_id, region)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "!sed -i \"s|##INPUT_DATA_S3URI##|{input_data_uri}|g\" nsclc-radiogenomics-imaging-workflow.json\n", 146 | "!sed -i \"s|##OUTPUT_DATA_S3URI##|{output_data_uri}|g\" nsclc-radiogenomics-imaging-workflow.json\n", 147 | "!sed -i \"s|##ECR_IMAGE_URI##|{ecr_image_uri}|g\" nsclc-radiogenomics-imaging-workflow.json\n", 148 | "!sed -i \"s|##IAM_ROLE_ARN##|{role}|g\" nsclc-radiogenomics-imaging-workflow.json" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "with open('nsclc-radiogenomics-imaging-workflow.json') as f:\n", 158 | " state_machine_json = json.load(f)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "We need to create an IAM execution role for the Stepfunctions workflow." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "iam = boto3.client('iam')\n", 175 | "\n", 176 | "my_managed_policy = {\n", 177 | " \"Version\": \"2012-10-17\",\n", 178 | " \"Statement\": [\n", 179 | " {\n", 180 | " \"Effect\": \"Allow\",\n", 181 | " \"Action\": [\n", 182 | " \"events:PutTargets\",\n", 183 | " \"events:DescribeRule\",\n", 184 | " \"events:PutRule\"\n", 185 | " ],\n", 186 | " \"Resource\": [\n", 187 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule\",\n", 188 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule\",\n", 189 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule\",\n", 190 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule\",\n", 191 | " \"arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule\"\n", 192 | " ]\n", 193 | " },\n", 194 | " {\n", 195 | " \"Effect\": \"Allow\",\n", 196 | " \"Action\": \"iam:PassRole\",\n", 197 | " \"Resource\": role,\n", 198 | " \"Condition\": {\n", 199 | " \"StringEquals\": {\n", 200 | " \"iam:PassedToService\": \"sagemaker.amazonaws.com\"\n", 201 | " }\n", 202 | " }\n", 203 | " },\n", 204 | " {\n", 205 | " \"Effect\": \"Allow\",\n", 206 | " \"Action\": [\n", 207 | " \"sagemaker:CreateEndpoint\",\n", 208 | " \"sagemaker:CreateEndpointConfig\",\n", 209 | " \"sagemaker:CreateHyperParameterTuningJob\",\n", 210 | " \"sagemaker:CreateModel\",\n", 211 | " \"sagemaker:CreateProcessingJob\",\n", 212 | " \"sagemaker:CreateTrainingJob\",\n", 213 | " \"sagemaker:CreateTransformJob\",\n", 214 | " \"sagemaker:DeleteEndpoint\",\n", 215 | " \"sagemaker:DeleteEndpointConfig\",\n", 216 | " \"sagemaker:DescribeHyperParameterTuningJob\",\n", 217 | " \"sagemaker:DescribeProcessingJob\",\n", 218 | " \"sagemaker:DescribeTrainingJob\",\n", 219 | " \"sagemaker:DescribeTransformJob\",\n", 220 | " \"sagemaker:ListProcessingJobs\",\n", 221 | " \"sagemaker:ListTags\",\n", 222 | " \"sagemaker:StopHyperParameterTuningJob\",\n", 223 | " \"sagemaker:StopProcessingJob\",\n", 224 | " \"sagemaker:StopTrainingJob\",\n", 225 | " \"sagemaker:StopTransformJob\",\n", 226 | " \"sagemaker:UpdateEndpoint\",\n", 227 | " ],\n", 228 | " \"Resource\": 
\"*\"\n", 229 | " }\n", 230 | " ]\n", 231 | "}\n", 232 | "\n", 233 | "trust_policy = {\n", 234 | " \"Version\": \"2012-10-17\",\n", 235 | " \"Statement\": [\n", 236 | " {\n", 237 | " \"Sid\": \"\",\n", 238 | " \"Effect\": \"Allow\",\n", 239 | " \"Principal\": {\n", 240 | " \"Service\": [\"states.amazonaws.com\", \"sagemaker.amazonaws.com\"]},\n", 241 | " \"Action\": \"sts:AssumeRole\"\n", 242 | " }\n", 243 | " ]\n", 244 | "}" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "policy_name = 'MyStepFunctionsWorkflowExecutionPolicy-%s' % suffix\n", 254 | "role_name = 'MyStepFunctionsWorkflowExecutionRole-%s' % suffix\n", 255 | "policy_response = iam.create_policy(\n", 256 | " PolicyName=policy_name,\n", 257 | " PolicyDocument=json.dumps(my_managed_policy)\n", 258 | ")\n", 259 | "\n", 260 | "role_response = iam.create_role(\n", 261 | " RoleName=role_name,\n", 262 | " AssumeRolePolicyDocument=json.dumps(trust_policy),\n", 263 | " Description='Role to execute StepFunctions workflow which submits SageMaker jobs',\n", 264 | " MaxSessionDuration=3600,\n", 265 | ")\n", 266 | "\n", 267 | "# Attach a policy to role\n", 268 | "iam.attach_role_policy(\n", 269 | " PolicyArn=policy_response['Policy']['Arn'],\n", 270 | " RoleName=role_name\n", 271 | ")\n", 272 | "iam.attach_role_policy(\n", 273 | " PolicyArn='arn:aws:iam::aws:policy/CloudWatchEventsFullAccess',\n", 274 | " RoleName=role_name\n", 275 | ")" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Create a Stepfunctions workflow, i.e. a state machine." 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "sfn = boto3.client('stepfunctions')\n", 292 | "sfn_execution_role = role_response['Role']['Arn']\n", 293 | "state_machine_name = 'nsclc-radiogenomics-imaging-workflow-%s' % suffix\n", 294 | "sfn_response = sfn.create_state_machine(name = state_machine_name,\n", 295 | " definition = json.dumps(state_machine_json),\n", 296 | " roleArn = sfn_execution_role,\n", 297 | " type = 'STANDARD')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "We will be running this workflow for all the `RO1` subjects." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "subject_list = ['R01-%03d'%i for i in range(1,163)]" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "Execute!" 
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "stateMachineArn=sfn_response['stateMachineArn']\n", 330 | "\n", 331 | "feature_store_name = 'imaging-feature-group-%s' % suffix\n", 332 | "processing_job_name = 'dcm-nifti-conversion-%s' % suffix\n", 333 | "offline_store_s3uri = '%s/multimodal-imaging-featurestore' % output_data_uri\n", 334 | "payload = {\n", 335 | " \"PreprocessingJobName\": processing_job_name,\n", 336 | " \"FeatureStoreName\": feature_store_name,\n", 337 | " \"OfflineStoreS3Uri\": offline_store_s3uri,\n", 338 | " \"Subject\": subject_list\n", 339 | "}\n", 340 | "exeution_response = sfn.start_execution(stateMachineArn=stateMachineArn,\n", 341 | " name=suffix,\n", 342 | " input=json.dumps(payload))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "print(exeution_response)" 352 | ] 353 | } 354 | ], 355 | "metadata": { 356 | "instance_type": "ml.t3.medium", 357 | "kernelspec": { 358 | "display_name": "Python 3 (Data Science)", 359 | "language": "python", 360 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.7.10" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 5 377 | } 378 | -------------------------------------------------------------------------------- /imaging/src/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim-buster 2 | 3 | COPY ./requirements.txt /opt/ 4 | RUN pip3 install --no-cache-dir -r /opt/requirements.txt 5 | ENV PYTHONUNBUFFERED=TRUE 6 | ENV AWS_DEFAULT_REGION=##REGION## 7 | 8 | COPY ./dcm2nifti_processing.py /opt/ 9 | COPY ./radiomics_utils.py /opt/ 10 | 11 | ENTRYPOINT ["python3", "/opt/dcm2nifti_processing.py"] 12 | -------------------------------------------------------------------------------- /imaging/src/dcm2nifti_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import dcmstack 4 | from glob import glob 5 | import pydicom 6 | import nibabel as nib 7 | import numpy as np 8 | import sys 9 | import os 10 | import json 11 | import time 12 | import logging 13 | from nilearn import plotting 14 | import matplotlib.pyplot as plt 15 | import radiomics_utils as utils 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | 20 | if __name__ == '__main__': 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--subject', type=str, default='R01-003', 23 | help='Subject ID (default: R01-003)') 24 | parser.add_argument('--feature_store_name', type=str, default='nsclc-radiogenomics-imaging-feature-group', 25 | help='SageMaker Feature Store Group Name (default: nsclc-radiogenomics-imaging-feature-group)') 26 | parser.add_argument('--offline_store_s3uri', type=str, 27 | help='SageMaker Feature Offline Store S3 URI Example: s3://multimodal-image-data-processed/nsclc-radiogenomics-multimodal-imaging-featurestore.') 28 | 29 | args = parser.parse_args() 30 | 31 | data_dir = '/opt/ml/processing/input/' 32 | 
output_dir = '/opt/ml/processing/output/' 33 | 34 | # assume one subject comes in, 35 | # we need to find out where the CT dicom files are 36 | # and segmentation file 37 | logger.info('Locating DICOM files') 38 | jsons = glob(os.path.join(data_dir, '*', '*', '*.json')) 39 | logger.info('%d jsons found.' % len(jsons)) 40 | print('%d jsons found.' % len(jsons)) 41 | 42 | valid_jsons = [] 43 | file_info = [] 44 | seg_string = ['3D Slicer segmentation result', 'ePAD Generated DSO'] 45 | for i, jsonfile in enumerate(jsons): 46 | with open(jsonfile) as f: 47 | aa = json.load(f) 48 | if seg_string[0].lower() in aa['Total'][-1].lower() or seg_string[1].lower() in aa['Total'][-1].lower(): 49 | logger.debug(aa['Total'][-3:]) 50 | print(aa['Total'][-3:]) 51 | valid_jsons.append(jsonfile) 52 | file_info.append([aa['StudyUID'], aa['Date'], aa['SeriesUID'], jsonfile]) 53 | 54 | if len(valid_jsons) > 1: 55 | raise Exception('there are more than one segmentation for this patient') 56 | 57 | print(file_info) 58 | 59 | src_dcms = glob(os.path.join(data_dir, file_info[0][0], file_info[0][1], '*', '*dcm')) 60 | src_seg_dcm = [i for i in src_dcms if file_info[0][2] in i] 61 | logging.info(src_seg_dcm) 62 | src_dcms.remove(src_seg_dcm[0]) 63 | #print(src_dcms) 64 | print('# of src_dcms: %d' % len(src_dcms)) 65 | print('# of src_seg_dcm: %d' % len(src_seg_dcm)) 66 | 67 | # work with CT scan and load as a Nifti image 68 | logger.info('Creating nifti images from DICOM files') 69 | stacks = dcmstack.parse_and_stack(src_dcms) 70 | stack = list(stacks.values())[0] 71 | nii = stack.to_nifti() 72 | img = nii.get_fdata() 73 | 74 | # work with CT segmentation file, load as a numpy array and create a Nifti image 75 | dcm = pydicom.dcmread(src_seg_dcm[0]) 76 | n_frames_seg = int(dcm.NumberOfFrames) 77 | seg = dcm.pixel_array 78 | # reorient the seg array 79 | seg = np.fliplr(seg.T) 80 | 81 | # if seg and img don't have the same dimension, pad the images 82 | if img.shape != seg.shape: 83 | # read all dicoms and parse out the instance number and slice location 84 | # assuming the files are from R01-098 onwards with ePAD Generated DSO 85 | d_sort_instance_number=[] 86 | for tmp_dcm_fname in src_dcms: 87 | tmp_dcm = pydicom.dcmread(tmp_dcm_fname) 88 | d_sort_instance_number.append((int(tmp_dcm[0x0020, 0x0013].value), tmp_dcm[0x0020, 0x0032].value)) 89 | d_sort_instance_number = sorted(d_sort_instance_number, key=lambda aa: aa[0]) 90 | 91 | patient_img_position_first = dcm[0x5200, 0x9230][0][0x0020, 0x9113][0]['ImagePositionPatient'].value 92 | patient_img_position_last = dcm[0x5200, 0x9230][-1][0x0020, 0x9113][0]['ImagePositionPatient'].value 93 | 94 | slice_instance_number_1 = [i for i, j in d_sort_instance_number if j == patient_img_position_first][0] 95 | slice_instance_number_2 = [i for i, j in d_sort_instance_number if j == patient_img_position_last][0] 96 | top_slice_instance_number = min(slice_instance_number_1, slice_instance_number_2) 97 | 98 | # logger.debug(np.nonzero(seg.sum(axis=1).sum(axis=1))[0]) 99 | tmp_seg = np.zeros_like(img) 100 | starting_index = img.shape[-1] - top_slice_instance_number - n_frames_seg # the seg and the image is flipped and need to locate from bottom. 
101 | ending_index = starting_index + n_frames_seg 102 | tmp_seg[:, :, starting_index:ending_index] = seg 103 | seg = tmp_seg 104 | seg_nii = nib.Nifti1Image(seg, nii.affine, header = nii.header) 105 | 106 | # save some viz 107 | logger.info('Saving files.') 108 | prefix = '%s_%s' % (args.subject, file_info[0][1]) 109 | f1 = plt.figure(figsize=(16,6)) 110 | g1 = plotting.plot_roi(seg_nii, bg_img = nii, figure = f1, alpha = 0.4, title = 'Lung CT with segmentation') 111 | g1.savefig(os.path.join(output_dir, 'PNG', '%s_ortho-view.png' % prefix), dpi = 150) 112 | 113 | f2 = plt.figure(figsize=(16,6)) 114 | g2 = plotting.plot_roi(seg_nii, bg_img = nii, figure = f2, alpha = 0.4, title = 'Lung CT with segmentation', 115 | display_mode='z', cut_coords=4) 116 | g2.savefig(os.path.join(output_dir, 'PNG', '%s_z-view.png' % prefix), dpi = 150) 117 | 118 | # save images 119 | imageName = os.path.join(output_dir, 'CT-Nifti', '%s.nii.gz' % prefix) 120 | maskName = os.path.join(output_dir, 'CT-SEG', '%s.nii.gz' % prefix) 121 | nii.to_filename(imageName) 122 | seg_nii.to_filename(maskName) 123 | 124 | # compute radiomic features 125 | logging.info('Computing radiomic features') 126 | df = utils.compute_features(imageName, maskName) 127 | 128 | # format dataframe for feature store 129 | record_id_column = 'Subject' 130 | event_time_column = 'EventTime' 131 | df[record_id_column] = args.subject 132 | current_time_sec = float(round(time.time())) 133 | df[event_time_column] = current_time_sec 134 | df['ScanDate'] = file_info[0][1] 135 | utils.cast_object_to_string(df) 136 | 137 | # check if feature store exists 138 | feature_group = utils.check_feature_group(args.feature_store_name) 139 | if not feature_group: 140 | feature_group = utils.create_feature_group(args.feature_store_name, df, args.offline_store_s3uri, 141 | record_id = record_id_column, event_time = event_time_column, 142 | enable_online_store = True) 143 | 144 | # ingest features into a FeatureStore 145 | feature_group.ingest(data_frame=df, max_workers=1, wait=True) 146 | 147 | print('Processing done for %s' % prefix) 148 | logging.info('Processing done for %s' % prefix) 149 | -------------------------------------------------------------------------------- /imaging/src/nsclc-radiogenomics-imaging-workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "StartAt": "iterate_over_subjects", 3 | "States": { 4 | "iterate_over_subjects": { 5 | "ItemsPath": "$.Subject", 6 | "MaxConcurrency": 50, 7 | "Type": "Map", 8 | "Next": "Finish", 9 | "Iterator": { 10 | "StartAt": "DICOM/NIfTI Conversion and Radiomic Feature Extraction", 11 | "States": { 12 | "Fallback": { 13 | "Type": "Pass", 14 | "Result": "This iteration failed for some reason", 15 | "End": true 16 | }, 17 | "DICOM/NIfTI Conversion and Radiomic Feature Extraction": { 18 | "Type": "Task", 19 | "OutputPath": "$.ProcessingJobArn", 20 | "Resource": "arn:aws:states:::sagemaker:createProcessingJob.sync", 21 | "Retry": [ 22 | { 23 | "ErrorEquals": [ 24 | "SageMaker.AmazonSageMakerException" 25 | ], 26 | "IntervalSeconds": 15, 27 | "MaxAttempts": 8, 28 | "BackoffRate": 1.5 29 | } 30 | ], 31 | "Catch": [ 32 | { 33 | "ErrorEquals": [ 34 | "States.TaskFailed" 35 | ], 36 | "Next": "Fallback" 37 | } 38 | ], 39 | "Parameters": { 40 | "ProcessingJobName.$": "States.Format('{}-{}', $$.Execution.Input['PreprocessingJobName'], $)", 41 | "ProcessingInputs": [ 42 | { 43 | "InputName": "DICOM", 44 | "AppManaged": false, 45 | "S3Input": { 46 | "S3Uri.$": 
"States.Format('##INPUT_DATA_S3URI##/{}' , $)", 47 | "LocalPath": "/opt/ml/processing/input", 48 | "S3DataType": "S3Prefix", 49 | "S3InputMode": "File", 50 | "S3DataDistributionType": "FullyReplicated", 51 | "S3CompressionType": "None" 52 | } 53 | } 54 | ], 55 | "ProcessingOutputConfig": { 56 | "Outputs": [ 57 | { 58 | "OutputName": "CT-Nifti", 59 | "AppManaged": false, 60 | "S3Output": { 61 | "S3Uri": "##OUTPUT_DATA_S3URI##/CT-Nifti", 62 | "LocalPath": "/opt/ml/processing/output/CT-Nifti", 63 | "S3UploadMode": "EndOfJob" 64 | } 65 | }, 66 | { 67 | "OutputName": "CT-SEG", 68 | "AppManaged": false, 69 | "S3Output": { 70 | "S3Uri": "##OUTPUT_DATA_S3URI##/CT-SEG", 71 | "LocalPath": "/opt/ml/processing/output/CT-SEG", 72 | "S3UploadMode": "EndOfJob" 73 | } 74 | }, 75 | { 76 | "OutputName": "PNG", 77 | "AppManaged": false, 78 | "S3Output": { 79 | "S3Uri": "##OUTPUT_DATA_S3URI##/PNG", 80 | "LocalPath": "/opt/ml/processing/output/PNG", 81 | "S3UploadMode": "EndOfJob" 82 | } 83 | } 84 | ] 85 | }, 86 | "AppSpecification": { 87 | "ImageUri": "##ECR_IMAGE_URI##", 88 | "ContainerArguments.$": "States.Array('--subject', $, '--feature_store_name', $$.Execution.Input['FeatureStoreName'], '--offline_store_s3uri', $$.Execution.Input['OfflineStoreS3Uri'])", 89 | "ContainerEntrypoint": [ 90 | "python3", 91 | "/opt/dcm2nifti_processing.py" 92 | ] 93 | }, 94 | "RoleArn": "##IAM_ROLE_ARN##", 95 | "ProcessingResources": { 96 | "ClusterConfig": { 97 | "InstanceCount": 1, 98 | "InstanceType": "ml.r5.large", 99 | "VolumeSizeInGB": 5 100 | } 101 | } 102 | }, 103 | "End": true 104 | } 105 | } 106 | } 107 | }, 108 | "Finish": { 109 | "Type": "Succeed" 110 | } 111 | } 112 | } -------------------------------------------------------------------------------- /imaging/src/radiomics_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import numpy as np 4 | from radiomics import featureextractor 5 | import boto3 6 | import sagemaker 7 | from sagemaker.session import Session 8 | from sagemaker.feature_store.feature_group import FeatureGroup 9 | from sagemaker import get_execution_role 10 | 11 | sagemaker_session = sagemaker.Session() 12 | region = sagemaker_session.boto_region_name 13 | print('region is %s' % region) 14 | 15 | boto_session = boto3.Session(region_name=region) 16 | role = get_execution_role() 17 | print('role is %s' % role) 18 | 19 | sagemaker_client = boto3.client(service_name='sagemaker', region_name=region) 20 | featurestore_runtime = boto3.client('sagemaker-featurestore-runtime', region_name=region) 21 | 22 | feature_store_session = Session( 23 | boto_session=boto_session, 24 | sagemaker_client=sagemaker_client, 25 | sagemaker_featurestore_runtime_client=featurestore_runtime 26 | ) 27 | 28 | 29 | def cast_object_to_string(data_frame): 30 | for label in data_frame.columns: 31 | if data_frame.dtypes[label] == 'object': 32 | data_frame[label] = data_frame[label].astype("str").astype("string") 33 | 34 | 35 | def compute_features(imageName, maskName): 36 | extractor = featureextractor.RadiomicsFeatureExtractor() 37 | featureVector = extractor.execute(imageName, maskName) 38 | 39 | new_dict={} 40 | for featureName in featureVector.keys(): 41 | print("Computed %s: %s" % (featureName, featureVector[featureName])) 42 | print(type(featureVector[featureName])) 43 | if isinstance(featureVector[featureName], np.ndarray): 44 | new_dict[featureName]=float(featureVector[featureName]) 45 | else: 46 | 
new_dict[featureName]=featureVector[featureName] 47 | 48 | df=pd.DataFrame.from_dict(new_dict, orient='index').T 49 | df=df.convert_dtypes(convert_integer=False) 50 | df['imageName']=imageName 51 | df['maskName']=maskName 52 | 53 | return df 54 | 55 | def check_feature_group(feature_group_name): 56 | feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) 57 | status = None 58 | try: 59 | status = feature_group.describe()['FeatureGroupStatus'] 60 | except Exception: # describe() fails when the feature group does not exist yet 61 | pass 62 | 63 | if status == 'Created': 64 | return feature_group 65 | elif status is None: 66 | return False 67 | else: 68 | wait_for_feature_group_creation_complete(feature_group) 69 | return feature_group 70 | 71 | 72 | 73 | def create_feature_group(feature_group_name, dataframe, s3uri, record_id = 'Subject', event_time = 'EventTime', 74 | enable_online_store = True): 75 | print(feature_group_name) 76 | print(feature_store_session) 77 | feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) 78 | feature_group.load_feature_definitions(data_frame=dataframe) 79 | feature_group.create(s3_uri=s3uri, 80 | record_identifier_name=record_id, 81 | event_time_feature_name=event_time, 82 | role_arn=role, 83 | enable_online_store=enable_online_store) 84 | wait_for_feature_group_creation_complete(feature_group) 85 | 86 | return feature_group 87 | 88 | 89 | def wait_for_feature_group_creation_complete(feature_group): 90 | status = feature_group.describe()['FeatureGroupStatus'] 91 | while status == "Creating": 92 | print("Waiting for Feature Group Creation") 93 | time.sleep(5) 94 | status = feature_group.describe()['FeatureGroupStatus'] 95 | if status != "Created": 96 | raise RuntimeError(f"Failed to create feature group {feature_group.name}") 97 | print(f"FeatureGroup {feature_group.name} successfully created.") -------------------------------------------------------------------------------- /imaging/src/requirements.txt: -------------------------------------------------------------------------------- 1 | pydicom == 2.1.2 2 | numpy == 1.19.5 3 | nibabel == 3.2.1 4 | nilearn == 0.7.0 5 | matplotlib == 3.1.3 6 | dcmstack == 0.8.0 7 | pandas == 1.1.5 8 | pyradiomics == 3.0.1 9 | sagemaker == 2.27.0 10 | boto3 == 1.17.18 --------------------------------------------------------------------------------
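
The model training step (`model-train-test/train-test-model.ipynb`, not reproduced above) consumes the clinical, genomic, and imaging feature groups created by the three preprocessing notebooks. As a hedged sketch of how those features can be read back from the offline Feature Store and joined across modalities with Athena, the snippet below uses the SageMaker Feature Store Athena integration. The feature group names are placeholders for the names printed by each notebook, the join keys follow the record identifiers used above (`Case_ID` for clinical and genomic, `Subject` for imaging), and it assumes the offline store has finished syncing; the actual notebook may do this differently.

```python
# Hypothetical sketch, not a file from this repository: join the three feature
# groups from the offline store to build a multimodal training dataframe.
# Replace the placeholder feature group names with the ones created by the
# preprocessing notebooks above.
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup

session = sagemaker.Session()
bucket = session.default_bucket()

clinical_fg = FeatureGroup(name='<clinical-feature-group-name>', sagemaker_session=session)
genomic_fg = FeatureGroup(name='<genomic-feature-group-name>', sagemaker_session=session)
imaging_fg = FeatureGroup(name='<imaging-feature-group-name>', sagemaker_session=session)

# Each feature group maps to a Glue table in the offline store;
# Athena lowercases the column names (Case_ID -> case_id, Subject -> subject).
query = clinical_fg.athena_query()
clinical_table = query.table_name
genomic_table = genomic_fg.athena_query().table_name
imaging_table = imaging_fg.athena_query().table_name

query_string = f'''
SELECT *
FROM "{clinical_table}" c
JOIN "{genomic_table}" g ON c.case_id = g.case_id
JOIN "{imaging_table}" i ON c.case_id = i.subject
'''

query.run(query_string=query_string,
          output_location=f's3://{bucket}/athena-query-results/')
query.wait()
multimodal_df = query.as_dataframe()
print(multimodal_df.shape)
```

Querying the offline store is what makes this cross-modality join possible; the online store only supports single-record lookups by the record identifier.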