├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── AWS-RoseTTAFold.ipynb
├── CASP14-Analysis.ipynb
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── config
│   ├── Dockerfile
│   ├── cfn.yaml
│   ├── container_buildspec.yml
│   ├── download_ref_data.sh
│   ├── run_aws_data_prep_ver.sh
│   └── run_aws_predict_ver.sh
├── data
│   ├── T1028.fa
│   ├── T1036s1.fa
│   └── T1078.fa
├── img
│   ├── AWS-RoseTTAFold-arch.png
│   ├── AWS-RoseTTAFold-deploy.png
│   ├── LaunchStack.jpg
│   └── RF_workflow.png
├── requirements.txt
└── rfutils
    ├── __init__.py
    └── rfutils.py

/.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '29 9 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode* 2 | .vscode/* 3 | venv* 4 | venv/* 5 | load_testing.ipynb 6 | plotting.ipynb 7 | job_names.txt 8 | data/*.csv 9 | data/*.yaml -------------------------------------------------------------------------------- /AWS-RoseTTAFold.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AWS-RoseTTAFold" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## I. Introduction" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "This notebook runs the [RoseTTAFold](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/) algorithm developed by Minkyung Baek et al. and described in [M. Baek et al., Science \n", 22 | "10.1126/science.abj8754 2021](https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf) on AWS." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "\"RoseTTAFold" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "The AWS workflow depends on a Batch compute environment." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "\"AWS-RoseTTAFold" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## II. Environment setup" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "## Install dependencies\n", 60 | "%pip install -q -q -r requirements.txt" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "## Import helper functions at rfutils/rfutils.py\n", 70 | "from rfutils import rfutils\n", 71 | "\n", 72 | "## Load additional dependencies\n", 73 | "from Bio import SeqIO\n", 74 | "from Bio.Seq import Seq\n", 75 | "from Bio.SeqRecord import SeqRecord\n", 76 | "import boto3\n", 77 | "import glob\n", 78 | "import json\n", 79 | "import pandas as pd\n", 80 | "import sagemaker\n", 81 | "\n", 82 | "pd.set_option(\"max_colwidth\", None)\n", 83 | "\n", 84 | "# Get service clients\n", 85 | "session = boto3.session.Session()\n", 86 | "sm_session = sagemaker.session.Session()\n", 87 | "region = session.region_name\n", 88 | "role = sagemaker.get_execution_role()\n", 89 | "s3 = boto3.client(\"s3\", region_name=region)\n", 90 | "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", 91 | "\n", 92 | "bucket = sm_session.default_bucket()\n", 93 | "print(f\"S3 bucket name is {bucket}\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## III. 
Input Protein Sequence" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Enter a protein sequence manually" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "seq = SeqRecord(\n", 117 | " Seq(\"MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\"),\n", 118 | " id=\"YP_025292.1\",\n", 119 | " name=\"HokC\",\n", 120 | " description=\"toxic membrane protein, small\",\n", 121 | ")" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Or provide the path to a fasta file" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "seq = SeqIO.read(\"data/T1078.fa\", \"fasta\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "print(f\"Protein sequence for analysis is \\n{seq}\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## IV. Submit RoseTTAFold Jobs" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Generate Job Name" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "job_name = rfutils.create_job_name(seq.id)\n", 170 | "print(f\"Automatically-generated job name is: {job_name}\")" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### Upload fasta file to S3" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "input_uri = rfutils.upload_fasta_to_s3(seq, bucket, job_name)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Submit jobs to AWS Batch queues" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Select the job definitions and Batch queues for your job." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "batch_resources = rfutils.get_rosettafold_batch_resources(region=region)\n", 210 | "\n", 211 | "cpu_queue = batch_resources[\"CPUJobQueue\"][0]\n", 212 | "gpu_queue = batch_resources[\"GPUJobQueue\"][0]\n", 213 | "cpu_data_prep_job_def = batch_resources[\"CPUDataPrepJobDefinition\"][0]\n", 214 | "cpu_predict_job_def = batch_resources[\"CPUPredictJobDefinition\"][0]\n", 215 | "gpu_predict_job_def = batch_resources[\"GPUPredictJobDefinition\"][0]\n", 216 | "\n", 217 | "batch_resources" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "Because our test sequence is small (less than 400 residues) we will run the prediction step on a GPU to decrease the job duration from hours to minutes." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "two_step_response = rfutils.submit_2_step_job(\n", 234 | " bucket=bucket,\n", 235 | " job_name=job_name,\n", 236 | " data_prep_job_definition=cpu_data_prep_job_def,\n", 237 | " data_prep_queue=cpu_queue,\n", 238 | " data_prep_cpu=8,\n", 239 | " data_prep_mem=32,\n", 240 | " predict_job_definition=gpu_predict_job_def, # Change this to the cpu_predict_job_def for large proteins\n", 241 | " predict_queue=gpu_queue, # Change this to the cpu_queue for large proteins\n", 242 | " predict_cpu=4,\n", 243 | " predict_mem=16,\n", 244 | " predict_gpu=True, # Change this to False for large proteins\n", 245 | ")\n", 246 | "data_prep_jobId = two_step_response[0][\"jobId\"]\n", 247 | "predict_jobId = two_step_response[1][\"jobId\"]" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## V. Check Status of Data Prep and Prediction Jobs" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "rfutils.get_rf_job_info(\n", 264 | " cpu_queue,\n", 265 | " gpu_queue,\n", 266 | " hrs_in_past=1,\n", 267 | ")" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## VI. View Data Prep Results" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Pause while the data prep job starts up" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "rfutils.wait_for_job_start(data_prep_jobId)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "Get logs for data prep job (Run this multiple times to see how the job progresses)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "data_prep_logStreamName = rfutils.get_batch_job_info(data_prep_jobId)[\"logStreamName\"]\n", 307 | "rfutils.get_batch_logs(data_prep_logStreamName).tail(n=5)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Retrieve and Display Multiple Sequence Alignment (MSA) Results" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "rfutils.display_msa(data_prep_jobId, bucket)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "## VII. 
View Prediction Results" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "Pause while the predict job starts up" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "rfutils.wait_for_job_start(predict_jobId)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "Get logs for prediction job (Run this multiple times to see how the job progresses)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "predict_logStreamName = rfutils.get_batch_job_info(predict_jobId)[\"logStreamName\"]\n", 363 | "rfutils.get_batch_logs(predict_logStreamName).tail(n=5)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## VIII. View Job Metrics" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)\n", 380 | "\n", 381 | "print(f'Number of sequences in MSA: {metrics[\"DATA_PREP\"][\"MSA_COUNT\"]}')\n", 382 | "print(f'Number of templates: {metrics[\"DATA_PREP\"][\"TEMPLATE_COUNT\"]}')\n", 383 | "print(f'MSA duration (sec): {metrics[\"DATA_PREP\"][\"MSA_DURATION\"]}')\n", 384 | "print(f'SS duration (sec): {metrics[\"DATA_PREP\"][\"SS_DURATION\"]}')\n", 385 | "print(f'Template search duration (sec): {metrics[\"DATA_PREP\"][\"TEMPLATE_DURATION\"]}')\n", 386 | "print(\n", 387 | " f'Total data prep duration (sec): {metrics[\"DATA_PREP\"][\"TOTAL_DATA_PREP_DURATION\"]}'\n", 388 | ")\n", 389 | "print(f'Total predict duration (sec): {metrics[\"PREDICT\"][\"TOTAL_PREDICT_DURATION\"]}')" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "## IX. Retrieve and Display Predicted Structure" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "rfutils.display_structure(predict_jobId, bucket, vmin=0.5, vmax=0.9)" 406 | ] 407 | } 408 | ], 409 | "metadata": { 410 | "instance_type": "ml.t3.medium", 411 | "interpreter": { 412 | "hash": "8ad3a54da4d511af1a5c2549d8f1b22d83bfd1079fb699a3f5552b91d143b102" 413 | }, 414 | "kernelspec": { 415 | "display_name": "Python 3 (Data Science)", 416 | "language": "python", 417 | "name": "python3" 418 | }, 419 | "language_info": { 420 | "codemirror_mode": { 421 | "name": "ipython", 422 | "version": 3 423 | }, 424 | "file_extension": ".py", 425 | "mimetype": "text/x-python", 426 | "name": "python", 427 | "nbconvert_exporter": "python", 428 | "pygments_lexer": "ipython3", 429 | "version": "3.8.9" 430 | } 431 | }, 432 | "nbformat": 4, 433 | "nbformat_minor": 4 434 | } 435 | -------------------------------------------------------------------------------- /CASP14-Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AWS-RoseTTAFold: Bulk Job Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## I. 
Introduction" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "This notebook demonstrates how to analyze multiple proteins simultaneously, in this case a subset of the CASP14 target set." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## II. Environment setup" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "## Install dependencies\n", 38 | "%pip install -q -q -r requirements.txt" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "## Import helper functions at rfutils/rfutils.py\n", 48 | "from rfutils import rfutils\n", 49 | "\n", 50 | "## Load additional dependencies\n", 51 | "from Bio import SeqIO\n", 52 | "from Bio.Seq import Seq\n", 53 | "from Bio.SeqRecord import SeqRecord\n", 54 | "import boto3\n", 55 | "import glob\n", 56 | "import json\n", 57 | "from IPython.display import display\n", 58 | "import pandas as pd\n", 59 | "import sagemaker\n", 60 | "\n", 61 | "pd.set_option(\"max_colwidth\", None)\n", 62 | "\n", 63 | "# Get service clients\n", 64 | "session = boto3.session.Session()\n", 65 | "sm_session = sagemaker.session.Session()\n", 66 | "region = session.region_name\n", 67 | "role = sagemaker.get_execution_role()\n", 68 | "s3 = boto3.client(\"s3\", region_name=region)\n", 69 | "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", 70 | "\n", 71 | "bucket = sm_session.default_bucket()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## III. Input Protein Sequence" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Download and process CASP14 sequences" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "!wget \"https://predictioncenter.org/download_area/CASP14/sequences/casp14.seq.txt\" -O \"data/casp14.fa\"\n", 95 | "!sed '137,138d' \"data/casp14.fa\" > \"data/casp14_dedup.fa\" # Remove duplicate entry for T1085\n", 96 | "\n", 97 | "casp14_iterator = SeqIO.parse(\"data/casp14_dedup.fa\", \"fasta\")\n", 98 | "casp14_df = pd.DataFrame(\n", 99 | " (\n", 100 | " (record.id, record.description, len(record), record.seq)\n", 101 | " for record in casp14_iterator\n", 102 | " ),\n", 103 | " columns=[\"id\", \"description\", \"length\", \"seq\"],\n", 104 | ").sort_values(by=\"length\")\n", 105 | "!rm data/casp14*" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Display information about CASP14 proteins" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "with pd.option_context(\"display.max_rows\", None):\n", 122 | " display(casp14_df.loc[:, (\"id\", \"description\")])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Plot distribution of the protein lengths" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import matplotlib.pyplot as plt\n", 139 | "\n", 140 | "fig, ax = plt.subplots()\n", 141 | "plt.hist(casp14_df.length, bins=50)\n", 142 | "plt.ylabel(\"Sample count\")\n", 143 | "plt.xlabel(\"Residue 
count\")\n", 144 | "plt.title(\"CASP-14 Protein Length Distribution\")\n", 145 | "plt.show()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Get the names of the AWS Batch resources deployed in your account." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "batch_resources = rfutils.get_rosettafold_batch_resources(region=region)\n", 162 | "\n", 163 | "cpu_queue = batch_resources[\"CPUJobQueue\"][0]\n", 164 | "gpu_queue = batch_resources[\"GPUJobQueue\"][0]\n", 165 | "cpu_data_prep_job_def = batch_resources[\"CPUDataPrepJobDefinition\"][0]\n", 166 | "cpu_predict_job_def = batch_resources[\"CPUPredictJobDefinition\"][0]\n", 167 | "gpu_predict_job_def = batch_resources[\"GPUPredictJobDefinition\"][0]\n", 168 | "\n", 169 | "batch_resources" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Submit analysis jobs for a subset of CASP14 proteins" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "protein_count = 84 # Change this to analyze a smaller number of CASP14 targets\n", 186 | "job_name_list = []\n", 187 | "\n", 188 | "for row in casp14_df[:protein_count].itertuples(index=False):\n", 189 | " record = SeqRecord(row.seq, id=row.id, description=row.description)\n", 190 | " print(f\"Protein sequence for analysis is \\n{record.description}\")\n", 191 | " sequence_length = len(record.seq)\n", 192 | " print(f\"Sequence length is {sequence_length}\")\n", 193 | "\n", 194 | " if sequence_length < 400:\n", 195 | " prep_cpu = 8\n", 196 | " prep_mem = 32\n", 197 | " predict_cpu = 4\n", 198 | " predict_mem = 16\n", 199 | " predict_gpu = True\n", 200 | " predict_job_definition = gpu_predict_job_def\n", 201 | " predict_queue = gpu_queue\n", 202 | " else:\n", 203 | " prep_cpu = 8\n", 204 | " prep_mem = 64\n", 205 | " predict_cpu = 4\n", 206 | " predict_mem = 32\n", 207 | " predict_gpu = False\n", 208 | " predict_job_definition = cpu_predict_job_def\n", 209 | " predict_queue = cpu_queue\n", 210 | "\n", 211 | " job_name = rfutils.create_job_name(record.id)\n", 212 | " print(f\"Automatically-generated job name is: {job_name}\")\n", 213 | " job_name_list.append(job_name)\n", 214 | " input_uri = rfutils.upload_fasta_to_s3(record, bucket, job_name)\n", 215 | " two_step_response = rfutils.submit_2_step_job(\n", 216 | " bucket=bucket,\n", 217 | " job_name=job_name,\n", 218 | " data_prep_input_file=\"input.fa\",\n", 219 | " data_prep_job_definition=cpu_data_prep_job_def,\n", 220 | " data_prep_queue=cpu_queue,\n", 221 | " data_prep_cpu=prep_cpu,\n", 222 | " data_prep_mem=prep_mem,\n", 223 | " predict_job_definition=predict_job_definition,\n", 224 | " predict_queue=predict_queue,\n", 225 | " predict_cpu=predict_cpu,\n", 226 | " predict_mem=predict_mem,\n", 227 | " predict_gpu=predict_gpu,\n", 228 | " )" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## IV. 
Check Status of Data Prep and Prediction Jobs" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "rfutils.get_rf_job_info(\n", 245 | " cpu_queue,\n", 246 | " gpu_queue,\n", 247 | " hrs_in_past=1,\n", 248 | ")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "jobs = []\n", 258 | "for job_name in job_name_list:\n", 259 | " metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)\n", 260 | " row = [\n", 261 | " job_name,\n", 262 | " metrics[\"DATA_PREP\"][\"JOB_ID\"],\n", 263 | " metrics[\"DATA_PREP\"][\"CPU\"],\n", 264 | " metrics[\"DATA_PREP\"][\"MEM\"],\n", 265 | " metrics[\"DATA_PREP\"][\"LENGTH\"],\n", 266 | " metrics[\"DATA_PREP\"][\"MSA_COUNT\"],\n", 267 | " metrics[\"DATA_PREP\"][\"TEMPLATE_COUNT\"],\n", 268 | " metrics[\"DATA_PREP\"][\"MSA_DURATION\"],\n", 269 | " metrics[\"DATA_PREP\"][\"SS_DURATION\"],\n", 270 | " metrics[\"DATA_PREP\"][\"TEMPLATE_DURATION\"],\n", 271 | " metrics[\"DATA_PREP\"][\"TOTAL_DATA_PREP_DURATION\"],\n", 272 | " metrics[\"PREDICT\"][\"JOB_ID\"],\n", 273 | " metrics[\"PREDICT\"][\"CPU\"],\n", 274 | " metrics[\"PREDICT\"][\"MEM\"],\n", 275 | " metrics[\"PREDICT\"][\"TOTAL_PREDICT_DURATION\"],\n", 276 | " ]\n", 277 | " jobs.append(row)\n", 278 | "metrics_df = pd.DataFrame(\n", 279 | " jobs,\n", 280 | " columns=[\n", 281 | " \"jobName\",\n", 282 | " \"dataPrepJobID\",\n", 283 | " \"dataPrepCPU\",\n", 284 | " \"dataPrepMEM\",\n", 285 | " \"sequenceLength\",\n", 286 | " \"MSACount\",\n", 287 | " \"templateCount\",\n", 288 | " \"MSADuration\",\n", 289 | " \"SSDuration\",\n", 290 | " \"templateDuration\",\n", 291 | " \"dataPrepDuration\",\n", 292 | " \"predictJobId\",\n", 293 | " \"predictCPU\",\n", 294 | " \"predictMEM\",\n", 295 | " \"predictDuration\",\n", 296 | " ],\n", 297 | ")\n", 298 | "metrics_df.sort_values(by=[\"dataPrepCPU\", \"dataPrepMEM\", \"predictCPU\", \"predictMEM\"])" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "metrics_df.to_csv(\"results.csv\")" 308 | ] 309 | } 310 | ], 311 | "metadata": { 312 | "instance_type": "ml.t3.medium", 313 | "interpreter": { 314 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 315 | }, 316 | "kernelspec": { 317 | "display_name": "Python 3 (Data Science)", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.8.9" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 4 336 | } 337 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing.
We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS RoseTTAFold 2 | Infrastructure template and Jupyter notebooks for running RoseTTAFold on AWS Batch. 3 | 4 | ## Overview 5 | Proteins are large biomolecules that play an important role in the body. Knowing the physical structure of proteins is key to understanding their function. However, it can be difficult and expensive to determine the structure of many proteins experimentally. One alternative is to predict these structures using machine learning algorithms. Several high-profile research teams have released such algorithms, including [AlphaFold 2](https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology), [RoseTTAFold](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/), and others. Their work was important enough for Science magazine to name it the ["2021 Breakthrough of the Year"](https://www.science.org/content/article/breakthrough-2021). 6 | 7 | Both AlphaFold 2 and RoseTTAFold use a multi-track transformer architecture trained on known protein templates to predict the structure of unknown peptide sequences. These predictions are heavily GPU-dependent and take anywhere from minutes to days to complete. The input features for these predictions include multiple sequence alignment (MSA) data. MSA algorithms are CPU-dependent and can themselves require several hours of processing time. 8 | 9 | Running both the MSA and structure prediction steps in the same computing environment can be cost inefficient, because the expensive GPU resources required for the prediction sit unused while the MSA step runs. Instead, using a high performance computing (HPC) service like [AWS Batch](https://aws.amazon.com/batch/) allows us to run each step as a containerized job with the best fit of CPU, memory, and GPU resources. 10 | 11 | In this post, we demonstrate how to provision and use AWS Batch and other services to run AI-driven protein folding algorithms like RoseTTAFold. 12 | 13 | ## Setup 14 | ### Deploy the infrastructure stack
15 | 1. Choose *Launch Stack*: 16 | 17 | [![Launch Stack](img/LaunchStack.jpg)](https://console.aws.amazon.com/cloudformation/home#/stacks/create/review?templateURL=https://aws-hcls-ml.s3.amazonaws.com/blog_post_support_materials/aws-RoseTTAFold/cfn.yaml) 18 | 19 | 2. For *Stack Name*, enter a value unique to your account and region. 20 | 3. For *StackAvailabilityZone* choose an availability zone. 21 | 4. Select *I acknowledge that AWS CloudFormation might create IAM resources with custom names*. 22 | 5. Choose *Create stack*. 23 | 6. Wait approximately 30 minutes for AWS CloudFormation to create the infrastructure stack and AWS CodeBuild to build and publish the AWS-RoseTTAFold container to Amazon Elastic Container Registry (Amazon ECR). 24 | 25 | ### Load model weights and sequence database files 26 | 27 | *Option 1: Mount the FSx for Lustre file system to an EC2 instance* 28 | 29 | 1. Sign in to the AWS Management Console and open the Amazon EC2 console at [https://console.aws.amazon.com/ec2](https://console.aws.amazon.com/ec2). 30 | 2. In the navigation pane, under *Instances,* select *Launch Templates*. 31 | 3. Choose the *Launch template ID* for your stack, such as `aws-rosettafold-launch-template-stack-id-suffix`. 32 | 4. Choose *Actions, Launch instance from template.* 33 | 5. Launch a new EC2 instance and connect using either SSH or SSM. 34 | 6. Download and extract the network weights and sequence database files to the attached volume at `/fsx/aws-rosettafold-ref-data` according to installation steps 3 and 5 from the [RoseTTAFold public repository](https://github.com/RosettaCommons/RoseTTAFold). 35 | 36 | *Option 2: Lazy-load the data from an S3 data repository* 37 | 38 | 1. Create a new S3 bucket in your region of interest. 39 | 2. Download and extract the network weights and sequence database files as described above and transfer them to your S3 bucket. 40 | 3. Sign in to the AWS Management Console and open the Amazon FSx for Lustre console at [https://console.aws.amazon.com/fsx](https://console.aws.amazon.com/fsx/home). 41 | 4. Choose the *File System name* for your stack, such as `aws-rosettafold-fsx-lustre-stack-id-suffix`. 42 | 5. On the file system details page, choose *Data repository*, *Create data repository association*. 43 | 6. For *File system path* enter `/aws-rosettafold-ref-data`. 44 | 7. For *Data repository path* enter the S3 URL for your new S3 bucket. 45 | 8. Choose *Create*. 46 | 47 | Creating the data repository association will immediately load the file metadata to the file system. However, the data itself will not be available until requested by a job. This will add several hours to the duration of the first job you submit; subsequent jobs will complete much faster.
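If you prefer to script Option 2 rather than use the console, the same association can be created through the FSx API. The following is a minimal boto3 sketch, not part of this repository; the file system ID and bucket name are placeholders to replace with your own values.

```python
import boto3

fsx = boto3.client("fsx")

# Link the stack's Lustre file system to the S3 bucket that holds the
# extracted reference data. BatchImportMetaDataOnCreate loads the file
# metadata immediately; file contents are still lazy-loaded on first access.
response = fsx.create_data_repository_association(
    FileSystemId="fs-0123456789abcdef0",  # placeholder: your FSx file system ID
    FileSystemPath="/aws-rosettafold-ref-data",
    DataRepositoryPath="s3://your-ref-data-bucket",  # placeholder: your S3 bucket
    BatchImportMetaDataOnCreate=True,
)
print(response["Association"]["AssociationId"])
```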
48 | 49 | Once you have finished loading the model weights and sequence database files, the FSx for Lustre file system will include the following files: 50 | 51 | ```
52 | /fsx
53 | └── /aws-rosettafold-ref-data
54 |     ├── /bfd
55 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata (1.4 TB)
56 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex (1.7 GB)
57 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata (15.7 GB)
58 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex (1.6 GB)
59 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata (304.4 GB)
60 |     │   └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex (123.6 MB)
61 |     ├── /pdb100_2021Mar03
62 |     │   ├── LICENSE (20.4 KB)
63 |     │   ├── pdb100_2021Mar03_a3m.ffdata (633.9 GB)
64 |     │   ├── pdb100_2021Mar03_a3m.ffindex (3.9 MB)
65 |     │   ├── pdb100_2021Mar03_cs219.ffdata (41.8 MB)
66 |     │   ├── pdb100_2021Mar03_cs219.ffindex (2.8 MB)
67 |     │   ├── pdb100_2021Mar03_hhm.ffdata (6.8 GB)
68 |     │   ├── pdb100_2021Mar03_hhm.ffindex (3.4 GB)
69 |     │   ├── pdb100_2021Mar03_pdb.ffdata (26.2 GB)
70 |     │   └── pdb100_2021Mar03_pdb.ffindex (3.7 MB)
71 |     ├── /UniRef30_2020_06
72 |     │   ├── UniRef30_2020_06_a3m.ffdata (139.6 GB)
73 |     │   ├── UniRef30_2020_06_a3m.ffindex (671.0 MB)
74 |     │   ├── UniRef30_2020_06_cs219.ffdata (6.0 GB)
75 |     │   ├── UniRef30_2020_06_cs219.ffindex (605.0 MB)
76 |     │   ├── UniRef30_2020_06_hhm.ffdata (34.1 GB)
77 |     │   ├── UniRef30_2020_06_hhm.ffindex (19.4 MB)
78 |     │   └── UniRef30_2020_06.md5sums (379.0 B)
79 |     └── /weights
80 |         ├── RF2t.pt (126 MB)
81 |         ├── Rosetta-DL_LICENSE.txt (3.1 KB)
82 |         ├── RoseTTAFold_e2e.pt (533 MB)
83 |         └── RoseTTAFold_pyrosetta.pt (506 MB)
84 | 
85 | ``` 86 | 87 | ### Submit structure prediction jobs from Jupyter 88 | 89 | 1. [Clone the CodeCommit repository](https://docs.aws.amazon.com/codecommit/latest/userguide/how-to-connect.html#how-to-connect-http) created by CloudFormation to a Jupyter Notebook environment of your choice. 90 | 2. Use the `AWS-RoseTTAFold.ipynb` and `CASP14-Analysis.ipynb` notebooks to submit protein sequences for analysis. 91 | 92 | ## Architecture 93 | 94 | ![AWS-RoseTTAFold Architecture](img/AWS-RoseTTAFold-arch.png) 95 | 96 | This project creates two computing environments in AWS Batch to run the "end-to-end" protein folding workflow in RoseTTAFold. The first of these uses the optimal mix of `c4`, `m4`, and `r4` instance types based on the vCPU and memory requirements specified in the Batch job. The second environment uses `g4dn` on-demand instances to balance performance, availability, and cost. 97 | 98 | A scientist can create structure prediction jobs using one of the two included Jupyter notebooks. `AWS-RoseTTAFold.ipynb` demonstrates how to submit a single analysis job and view the results. `CASP14-Analysis.ipynb` demonstrates how to submit multiple jobs at once using the CASP14 target list. In both of these cases, submitting a sequence for analysis creates two Batch jobs, one for data preparation (using the CPU computing environment) and a second, dependent job for structure prediction (using the GPU computing environment), as sketched below. 99 | 
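The notebooks submit this job pair through `rfutils.submit_2_step_job`, which wraps the AWS Batch API. As a rough illustration of the underlying pattern (the queue and job definition names below are placeholders; the deployed names come from your stack), the dependency between the two jobs is expressed with the `dependsOn` parameter of `submit_job`:

```python
import boto3

batch = boto3.client("batch")

# Step 1: CPU job for data preparation (MSA and template search).
prep = batch.submit_job(
    jobName="t1078-data-prep",  # placeholder names throughout
    jobQueue="aws-rosettafold-cpu-queue",
    jobDefinition="aws-rosettafold-cpu-data-prep",
    containerOverrides={
        "resourceRequirements": [
            {"type": "VCPU", "value": "8"},
            {"type": "MEMORY", "value": "32768"},  # MiB
        ]
    },
)

# Step 2: GPU prediction job that Batch releases only after step 1 succeeds.
predict = batch.submit_job(
    jobName="t1078-predict",
    jobQueue="aws-rosettafold-gpu-queue",
    jobDefinition="aws-rosettafold-gpu-predict",
    dependsOn=[{"jobId": prep["jobId"]}],
    containerOverrides={
        "resourceRequirements": [
            {"type": "VCPU", "value": "4"},
            {"type": "MEMORY", "value": "16384"},
            {"type": "GPU", "value": "1"},
        ]
    },
)
print(prep["jobId"], predict["jobId"])
```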
100 | Both the data preparation and structure prediction steps use the same Docker image for execution. This image, based on the public Nvidia CUDA image for Ubuntu 20, includes the v1.1 release of the public [RoseTTAFold repository](https://github.com/RosettaCommons/RoseTTAFold), as well as additional scripts for integrating with AWS services. CodeBuild will automatically download this container definition and build the required image during stack creation. However, end users can make changes to this image by pushing to the CodeCommit repository included in the stack. For example, users could replace the included MSA algorithm ([hhblits](https://github.com/soedinglab/hh-suite)) with an alternative like [MMseqs2](https://github.com/soedinglab/MMseqs2) or replace the RoseTTAFold network with an alternative like AlphaFold 2 or [Uni-Fold](https://github.com/dptech-corp/Uni-Fold). 101 | 102 | ## Costs 103 | This workload costs approximately $760 per month to maintain, plus another $0.50 per job. For example, running 100 jobs in a month would cost roughly $760 + (100 × $0.50) = $810. 104 | 105 | ## Deployment 106 | 107 | ![AWS-RoseTTAFold Deployment](img/AWS-RoseTTAFold-deploy.png) 108 | 109 | Running the CloudFormation template at `config/cfn.yaml` creates the following resources in the specified availability zone: 110 | 1. A new VPC with a private subnet, public subnet, NAT gateway, internet gateway, elastic IP, route tables, and S3 gateway endpoint. 111 | 2. An FSx for Lustre file system with 1.2 TiB of storage and 1,200 MB/s throughput capacity. This file system can be linked to an S3 bucket for loading the required reference data when the first job executes. 112 | 3. An EC2 launch template for mounting the FSx file system to Batch compute instances. 113 | 4. AWS Batch compute environments, job queues, and job definitions: one set for the CPU-dependent data prep job and a second for the GPU-dependent prediction job. 114 | 5. CodeCommit, CodeBuild, CodePipeline, and ECR resources for building and publishing the Batch container image. When CloudFormation creates the CodeCommit repository, it populates it with a zipped version of this repository stored in a public S3 bucket. CodeBuild uses this repository as its source and adds additional code from release 1.1 of the public [RoseTTAFold repository](https://github.com/RosettaCommons/RoseTTAFold). CodeBuild then publishes the resulting container image to ECR, where Batch jobs can use it as needed. The names of these Batch resources include a unique stack suffix; a sketch for looking them up programmatically appears after the Security section below. 115 | 116 | ## Licensing 117 | This library is licensed under the MIT-0 License. See the LICENSE file for more information. 118 | 119 | The University of Washington has made the code and data in the [RoseTTAFold public repository](https://github.com/RosettaCommons) available under an [MIT license](https://github.com/RosettaCommons/RoseTTAFold/blob/main/LICENSE). However, the model weights used for prediction are only available for internal, non-profit, non-commercial research use. For more information, please see the [full license agreement](https://files.ipd.uw.edu/pub/RoseTTAFold/Rosetta-DL_LICENSE.txt) and contact the University of Washington for details. 120 | 121 | ## Security 122 | 123 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
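As noted in the Deployment section, the stack generates unique names for its Batch resources, and the notebooks resolve those names at run time via `rfutils.get_rosettafold_batch_resources`. A minimal sketch of one way to do the same outside the notebooks (assuming only boto3 and the stack name you chose at deployment; the actual helper in `rfutils` may work differently) is to ask CloudFormation which Batch resources the stack created:

```python
import boto3

cfn = boto3.client("cloudformation")

def list_batch_resources(stack_name: str) -> dict:
    """Group the stack's Batch resources by type, keyed to their physical IDs."""
    resources = cfn.describe_stack_resources(StackName=stack_name)["StackResources"]
    batch_types = (
        "AWS::Batch::ComputeEnvironment",
        "AWS::Batch::JobQueue",
        "AWS::Batch::JobDefinition",
    )
    found = {}
    for resource in resources:
        if resource["ResourceType"] in batch_types:
            found.setdefault(resource["ResourceType"], []).append(
                resource["PhysicalResourceId"]
            )
    return found

# Example (stack name is whatever you entered at deployment):
# print(list_batch_resources("my-rosettafold-stack"))
```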
124 | 125 | ## More Information 126 | - [University of Washington Institute for Protein Design](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/) 127 | - [RoseTTAFold Paper](https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf) 128 | - [AWS Batch Documentation](https://docs.aws.amazon.com/batch/) 129 | - [CloudFormation Documentation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/Welcome.html) 130 | - [Explanation of the RoseTTAFold and AlphaFold 2 architectures](https://www.youtube.com/watch?v=Rfw7thgGTwI) 131 | - [David Baker's TED talk on protein design](https://www.ted.com/talks/david_baker_5_challenges_we_could_solve_by_designing_new_proteins) 132 | - [AWS ML Blog Post on running AlphaFold 2 on Amazon EC2](https://aws.amazon.com/blogs/machine-learning/run-alphafold-v2-0-on-amazon-ec2/) -------------------------------------------------------------------------------- /config/Dockerfile: -------------------------------------------------------------------------------- 1 | # Start with a copy of the cuda image maintained by Nvidia to avoid installing CUDA manually 2 | FROM nvcr.io/nvidia/cuda:11.4.2-base-ubuntu20.04 3 | 4 | # Install basic tools 5 | RUN apt-get update && apt-get install -y \ 6 | wget \ 7 | curl \ 8 | unzip 9 | 10 | # Install miniconda and awscli 11 | RUN curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 12 | && chmod +x ~/miniconda.sh \ 13 | && ~/miniconda.sh -b -p /opt/conda \ 14 | && rm ~/miniconda.sh \ 15 | && /opt/conda/bin/conda update conda \ 16 | && /opt/conda/bin/conda install -c conda-forge awscli 17 | 18 | # Download and unzip v1.1 of the RoseTTAFold repository, available at 19 | # https://github.com/RosettaCommons/RoseTTAFold 20 | RUN wget https://github.com/RosettaCommons/RoseTTAFold/archive/refs/tags/v1.1.0.zip \ 21 | && unzip v1.1.0.zip \ 22 | && mv RoseTTAFold-1.1.0 /RoseTTAFold \ 23 | && rm v1.1.0.zip 24 | WORKDIR /RoseTTAFold 25 | 26 | # Install lddt, cs-blast, and libgomp1 27 | RUN ./install_dependencies.sh 28 | RUN /opt/conda/bin/conda env create -f RoseTTAFold-linux.yml \ 29 | && /opt/conda/bin/conda clean -ya 30 | RUN apt-get install -y libgomp1 31 | 32 | # Add the AWS-RoseTTAFold scripts 33 | COPY run_aws_data_prep_ver.sh . 34 | COPY run_aws_predict_ver.sh . 35 | COPY download_ref_data.sh . 36 | 37 | # Clean up unnecessary files to save space 38 | RUN rm -rf \ 39 | example \ 40 | folding \ 41 | *.gz \ 42 | *.zip \ 43 | *.yml \ 44 | install_dependencies.sh 45 | 46 | # Create a directory to mount the FSx Lustre file system with ref data 47 | VOLUME /fsx 48 | 49 | # Activate conda 50 | RUN ["/bin/bash", "-c", \ 51 | "/opt/conda/bin/activate", \ 52 | "/opt/conda/bin/conda init bash", \ 53 | "source $HOME/.bashrc"] 54 | ENV PATH /opt/conda/bin:$PATH 55 | 56 | # Define the default run command. Batch will overwrite this at run time. 57 | CMD ["/bin/bash"] 58 | -------------------------------------------------------------------------------- /config/cfn.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: >- 3 | Creates a stack for running RoseTTAFold on AWS Batch.
4 | 5 | Parameters: 6 | StackAvailabilityZone: 7 | Description: Availability zone to deploy stack resources 8 | Type: "AWS::EC2::AvailabilityZone::Name" 9 | 10 | Resources: 11 | ################################################## 12 | # Network Configuration 13 | ################################################## 14 | VPC: 15 | Type: "AWS::EC2::VPC" 16 | Properties: 17 | EnableDnsSupport: "true" 18 | EnableDnsHostnames: "true" 19 | CidrBlock: "10.0.0.0/16" 20 | Tags: 21 | - Key: Application 22 | Value: AWS-RoseTTAFold 23 | - Key: Network 24 | Value: Public 25 | - Key: Name 26 | Value: 27 | !Join [ 28 | "-", 29 | [ 30 | "aws-rosettafold", 31 | "VPC", 32 | !Select [ 33 | 4, 34 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 35 | ], 36 | ], 37 | ] 38 | 39 | PublicSubnet0: 40 | Type: "AWS::EC2::Subnet" 41 | Properties: 42 | VpcId: !Ref VPC 43 | AvailabilityZone: !Ref StackAvailabilityZone 44 | CidrBlock: 45 | Fn::Select: 46 | - 0 47 | - Fn::Cidr: [!GetAtt VPC.CidrBlock, 6, 8] 48 | Tags: 49 | - Key: Application 50 | Value: AWS-RoseTTAFold 51 | - Key: Network 52 | Value: Public 53 | - Key: Name 54 | Value: 55 | !Join [ 56 | "-", 57 | [ 58 | "aws-rosettafold", 59 | "public-subnet", 60 | !Select [ 61 | 4, 62 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 63 | ], 64 | ], 65 | ] 66 | 67 | PrivateSubnet0: 68 | Type: "AWS::EC2::Subnet" 69 | Properties: 70 | VpcId: 71 | Ref: VPC 72 | AvailabilityZone: !Ref StackAvailabilityZone 73 | CidrBlock: 74 | Fn::Select: 75 | - 3 76 | - Fn::Cidr: [!GetAtt VPC.CidrBlock, 6, 8] 77 | Tags: 78 | - Key: Application 79 | Value: AWS-RoseTTAFold 80 | - Key: Network 81 | Value: Private 82 | - Key: Name 83 | Value: 84 | !Join [ 85 | "-", 86 | [ 87 | "aws-rosettafold", 88 | "private-subnet", 89 | !Select [ 90 | 4, 91 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 92 | ], 93 | ], 94 | ] 95 | 96 | InternetGateway: 97 | Type: "AWS::EC2::InternetGateway" 98 | Properties: 99 | Tags: 100 | - Key: Application 101 | Value: AWS-RoseTTAFold 102 | - Key: Network 103 | Value: Public 104 | - Key: Name 105 | Value: 106 | !Join [ 107 | "-", 108 | [ 109 | "aws-rosettafold", 110 | "igw", 111 | !Select [ 112 | 4, 113 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 114 | ], 115 | ], 116 | ] 117 | 118 | GatewayToInternet: 119 | Type: "AWS::EC2::VPCGatewayAttachment" 120 | Properties: 121 | VpcId: 122 | Ref: VPC 123 | InternetGatewayId: 124 | Ref: InternetGateway 125 | 126 | PublicRouteTable: 127 | Type: "AWS::EC2::RouteTable" 128 | Properties: 129 | VpcId: 130 | Ref: VPC 131 | Tags: 132 | - Key: Application 133 | Value: AWS-RoseTTAFold 134 | - Key: Network 135 | Value: Public 136 | - Key: Name 137 | Value: 138 | !Join [ 139 | "-", 140 | [ 141 | "aws-rosettafold", 142 | "public-route-table", 143 | !Select [ 144 | 4, 145 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 146 | ], 147 | ], 148 | ] 149 | 150 | PublicRoute: 151 | Type: "AWS::EC2::Route" 152 | DependsOn: GatewayToInternet 153 | Properties: 154 | RouteTableId: 155 | Ref: PublicRouteTable 156 | DestinationCidrBlock: 0.0.0.0/0 157 | GatewayId: 158 | Ref: InternetGateway 159 | 160 | PublicSubnetRouteTableAssociation0: 161 | Type: "AWS::EC2::SubnetRouteTableAssociation" 162 | Properties: 163 | SubnetId: 164 | Ref: PublicSubnet0 165 | RouteTableId: 166 | Ref: PublicRouteTable 167 | 168 | ElasticIP0: 169 | Type: "AWS::EC2::EIP" 170 | Properties: 171 | Domain: vpc 172 | Tags: 173 | - Key: Application 174 | Value: AWS-RoseTTAFold 175 | - Key: Name 176 | Value: 177 | !Join [ 
178 | "-", 179 | [ 180 | "aws-rosettafold", 181 | "eip", 182 | !Select [ 183 | 4, 184 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 185 | ], 186 | ], 187 | ] 188 | 189 | NATGateway0: 190 | Type: "AWS::EC2::NatGateway" 191 | Properties: 192 | AllocationId: 193 | "Fn::GetAtt": 194 | - ElasticIP0 195 | - AllocationId 196 | SubnetId: 197 | Ref: PublicSubnet0 198 | Tags: 199 | - Key: Application 200 | Value: AWS-RoseTTAFold 201 | - Key: Name 202 | Value: 203 | !Join [ 204 | "-", 205 | [ 206 | "aws-rosettafold", 207 | "nat-gateway", 208 | !Select [ 209 | 4, 210 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 211 | ], 212 | ], 213 | ] 214 | 215 | PrivateRouteTable0: 216 | Type: "AWS::EC2::RouteTable" 217 | Properties: 218 | VpcId: 219 | Ref: VPC 220 | Tags: 221 | - Key: Application 222 | Value: AWS-RoseTTAFold 223 | - Key: Name 224 | Value: 225 | !Join [ 226 | "-", 227 | [ 228 | "aws-rosettafold", 229 | "private-route-table", 230 | !Select [ 231 | 4, 232 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 233 | ], 234 | ], 235 | ] 236 | 237 | PrivateRouteToInternet0: 238 | Type: "AWS::EC2::Route" 239 | Properties: 240 | RouteTableId: 241 | Ref: PrivateRouteTable0 242 | DestinationCidrBlock: 0.0.0.0/0 243 | NatGatewayId: 244 | Ref: NATGateway0 245 | 246 | PrivateSubnetRouteTableAssociation0: 247 | Type: "AWS::EC2::SubnetRouteTableAssociation" 248 | Properties: 249 | SubnetId: 250 | Ref: PrivateSubnet0 251 | RouteTableId: 252 | Ref: PrivateRouteTable0 253 | 254 | ################################################## 255 | # S3 256 | ################################################## 257 | 258 | ResultsS3: 259 | Type: "AWS::S3::Bucket" 260 | Properties: 261 | BucketEncryption: 262 | ServerSideEncryptionConfiguration: 263 | - ServerSideEncryptionByDefault: 264 | SSEAlgorithm: AES256 265 | Tags: 266 | - Key: Application 267 | Value: AWS-RoseTTAFold 268 | - Key: Name 269 | Value: 270 | !Join [ 271 | "-", 272 | [ 273 | "aws-rosettafold", 274 | "s3", 275 | !Select [ 276 | 4, 277 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 278 | ], 279 | ], 280 | ] 281 | DeletionPolicy: Retain 282 | UpdateReplacePolicy : Retain 283 | 284 | S3Endpoint: 285 | Type: "AWS::EC2::VPCEndpoint" 286 | Properties: 287 | RouteTableIds: 288 | - !Ref PublicRouteTable 289 | - !Ref PrivateRouteTable0 290 | ServiceName: !Sub "com.amazonaws.${AWS::Region}.s3" 291 | VpcId: !Ref VPC 292 | 293 | ################################################## 294 | # FSx File System 295 | ################################################## 296 | FSX: 297 | Type: AWS::FSx::FileSystem 298 | Properties: 299 | FileSystemType: "LUSTRE" 300 | FileSystemTypeVersion: "2.12" 301 | LustreConfiguration: 302 | DataCompressionType: "LZ4" 303 | DeploymentType: "PERSISTENT_2" 304 | PerUnitStorageThroughput: 1000 305 | SecurityGroupIds: 306 | - !GetAtt VPC.DefaultSecurityGroup 307 | StorageCapacity: 1200 308 | StorageType: "SSD" 309 | SubnetIds: 310 | - !Ref PrivateSubnet0 311 | Tags: 312 | - Key: Application 313 | Value: AWS-RoseTTAFold 314 | - Key: Name 315 | Value: 316 | !Join [ 317 | "-", 318 | [ 319 | "aws-rosettafold", 320 | "fsx-lustre", 321 | !Select [ 322 | 4, 323 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 324 | ], 325 | ], 326 | ] 327 | 328 | ################################################## 329 | # EC2 Launch Template 330 | ################################################## 331 | 332 | RFInstanceRole: 333 | Type: AWS::IAM::Role 334 | Properties: 335 | RoleName: 336 | !Join [ 337 | "-", 
338 | [ 339 | "aws-rosettafold", 340 | "instance-role", 341 | !Select [ 342 | 4, 343 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 344 | ], 345 | ], 346 | ] 347 | Description: "Required service policies to support running RoseTTAFold on AWS Batch" 348 | AssumeRolePolicyDocument: 349 | Version: "2012-10-17" 350 | Statement: 351 | - Effect: Allow 352 | Principal: 353 | Service: 354 | - ec2.amazonaws.com 355 | Action: 356 | - "sts:AssumeRole" 357 | ManagedPolicyArns: 358 | - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly 359 | - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role 360 | - arn:aws:iam::aws:policy/AmazonS3FullAccess 361 | Path: / 362 | Tags: 363 | - Key: Application 364 | Value: AWS-RoseTTAFold 365 | - Key: Name 366 | Value: 367 | !Join [ 368 | "-", 369 | [ 370 | "aws-rosettafold", 371 | "instance-role", 372 | !Select [ 373 | 4, 374 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 375 | ], 376 | ], 377 | ] 378 | 379 | InstanceProfile: 380 | Type: "AWS::IAM::InstanceProfile" 381 | Properties: 382 | InstanceProfileName: 383 | !Join [ 384 | "-", 385 | [ 386 | "aws-rosettafold", 387 | "instance-profile", 388 | !Select [ 389 | 4, 390 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 391 | ], 392 | ], 393 | ] 394 | Path: / 395 | Roles: 396 | - !Ref RFInstanceRole 397 | 398 | InstanceLaunchTemplate: 399 | Type: AWS::EC2::LaunchTemplate 400 | Properties: 401 | LaunchTemplateName: 402 | !Join [ 403 | "-", 404 | [ 405 | "aws-rosettafold", 406 | "launch-template", 407 | !Select [ 408 | 4, 409 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 410 | ], 411 | ], 412 | ] 413 | LaunchTemplateData: 414 | BlockDeviceMappings: 415 | - DeviceName: "/dev/xvda" 416 | Ebs: 417 | DeleteOnTermination: true 418 | Encrypted: true 419 | VolumeSize: 50 420 | VolumeType: "gp2" 421 | IamInstanceProfile: 422 | Name: !Ref InstanceProfile 423 | TagSpecifications: 424 | - ResourceType: "instance" 425 | Tags: 426 | - Key: Application 427 | Value: AWS-RoseTTAFold 428 | - Key: Name 429 | Value: 430 | !Join [ 431 | "-", 432 | [ 433 | "aws-rosettafold", 434 | "launch-template", 435 | !Select [ 436 | 4, 437 | !Split [ 438 | "-", 439 | !Select [2, !Split ["/", !Ref AWS::StackId]], 440 | ], 441 | ], 442 | ], 443 | ] 444 | UserData: 445 | Fn::Base64: 446 | Fn::Join: 447 | [ 448 | "", 449 | [ 450 | "MIME-Version: 1.0\n", 451 | "Content-Type: multipart/mixed; boundary=\"==MYBOUNDARY==\"\n", 452 | "\n", 453 | "--==MYBOUNDARY==\n", 454 | "Content-Type: text/cloud-config; charset=\"us-ascii\"\n", 455 | "\n", 456 | "runcmd:\n", 457 | "- file_system_id_01=", 458 | !Ref FSX, 459 | "\n", 460 | "- region=", 461 | !Ref AWS::Region, 462 | "\n", 463 | "- fsx_directory=/fsx\n", 464 | "- fsx_mount_name=", 465 | !GetAtt FSX.LustreMountName, 466 | "\n", 467 | "- amazon-linux-extras install -y lustre2.10\n", 468 | "- mkdir -p ${fsx_directory}\n", 469 | "- mount -t lustre ${file_system_id_01}.fsx.${region}.amazonaws.com@tcp:/${fsx_mount_name} ${fsx_directory}\n", 470 | "\n", 471 | "--==MYBOUNDARY==--", 472 | ], 473 | ] 474 | 475 | ################################################## 476 | # Container Services 477 | ################################################## 478 | RFCodeRepository: 479 | Type: AWS::CodeCommit::Repository 480 | Properties: 481 | Code: 482 | BranchName: "main" 483 | S3: 484 | Bucket: "aws-hcls-ml" 485 | Key: "blog_post_support_materials/aws-RoseTTAFold/aws-rosettafold.zip" 486 | RepositoryDescription: Code for running RoseTTAFold on AWS 
487 | RepositoryName: 488 | !Join [ 489 | "-", 490 | [ 491 | "aws-rosettafold", 492 | "code-repo", 493 | !Select [ 494 | 4, 495 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 496 | ], 497 | ], 498 | ] 499 | Tags: 500 | - Key: Application 501 | Value: AWS-RoseTTAFold 502 | - Key: Name 503 | Value: 504 | !Join [ 505 | "-", 506 | [ 507 | "aws-rosettafold", 508 | "code-repo", 509 | !Select [ 510 | 4, 511 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 512 | ], 513 | ], 514 | ] 515 | 516 | RFContainerRegistry: 517 | Type: AWS::ECR::Repository 518 | Properties: 519 | EncryptionConfiguration: 520 | EncryptionType: AES256 521 | ImageScanningConfiguration: 522 | ScanOnPush: true 523 | RepositoryName: 524 | !Join [ 525 | "-", 526 | [ 527 | "aws-rosettafold", 528 | "container-repo", 529 | !Select [ 530 | 4, 531 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 532 | ], 533 | ], 534 | ] 535 | Tags: 536 | - Key: Application 537 | Value: AWS-RoseTTAFold 538 | - Key: Name 539 | Value: 540 | !Join [ 541 | "-", 542 | [ 543 | "aws-rosettafold", 544 | "container-repo", 545 | !Select [ 546 | 4, 547 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 548 | ], 549 | ], 550 | ] 551 | DeletionPolicy: Retain 552 | UpdateReplacePolicy : Retain 553 | 554 | CodeBuildRole: 555 | Type: AWS::IAM::Role 556 | Properties: 557 | RoleName: 558 | !Join [ 559 | "-", 560 | [ 561 | "aws-rosettafold", 562 | "codebuild-role", 563 | !Select [ 564 | 4, 565 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 566 | ], 567 | ], 568 | ] 569 | Description: "Required service policies to support building AWS-RoseTTAFold container" 570 | AssumeRolePolicyDocument: 571 | Version: "2012-10-17" 572 | Statement: 573 | - Effect: Allow 574 | Principal: 575 | Service: 576 | - codebuild.amazonaws.com 577 | Action: 578 | - "sts:AssumeRole" 579 | ManagedPolicyArns: 580 | - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess 581 | Path: / 582 | Policies: 583 | - PolicyName: RFCodeBuildPolicy 584 | PolicyDocument: 585 | Version: "2012-10-17" 586 | Statement: 587 | - Effect: Allow 588 | Action: 589 | - logs:CreateLogGroup 590 | - logs:CreateLogStream 591 | - logs:PutLogEvents 592 | Resource: 593 | - Fn::Join: 594 | [ 595 | ":", 596 | [ 597 | "arn:aws:logs", 598 | !Ref AWS::Region, 599 | !Ref AWS::AccountId, 600 | "log-group:/aws/codebuild/aws-rosettafold*", 601 | ], 602 | ] 603 | - Effect: Allow 604 | Action: 605 | - s3:PutObject 606 | - s3:GetObject 607 | - s3:GetObjectVersion 608 | - s3:GetBucketAcl 609 | - s3:GetBucketLocation 610 | Resource: 611 | - !Join [ 612 | "-", 613 | ["arn:aws:s3:::codepipeline", !Ref AWS::Region, "*"], 614 | ] 615 | - !Join ["", [!GetAtt ResultsS3.Arn, "*"]] 616 | - Effect: Allow 617 | Action: 618 | - codecommit:GitPull 619 | Resource: 620 | - Fn::Join: 621 | [ 622 | ":", 623 | [ 624 | "arn:aws:codecommit", 625 | !Ref AWS::Region, 626 | !Ref AWS::AccountId, 627 | !GetAtt RFCodeRepository.Name, 628 | ], 629 | ] 630 | - Effect: Allow 631 | Action: 632 | - codebuild:CreateReportGroup 633 | - codebuild:CreateReport 634 | - codebuild:UpdateReport 635 | - codebuild:BatchPutTestCases 636 | - codebuild:BatchPutCodeCoverages 637 | Resource: 638 | - Fn::Join: 639 | [ 640 | ":", 641 | [ 642 | "arn:aws:s3:::codebuild", 643 | !Ref AWS::Region, 644 | !Ref AWS::AccountId, 645 | "report-group/aws-rosettafold*", 646 | ], 647 | ] 648 | Tags: 649 | - Key: Application 650 | Value: AWS-RoseTTAFold 651 | - Key: Name 652 | Value: 653 | !Join [ 654 | "-", 655 | [ 656 | 
"aws-rosettafold", 657 | "codebuild-role", 658 | !Select [ 659 | 4, 660 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 661 | ], 662 | ], 663 | ] 664 | 665 | CodeBuildEncryptionKey: 666 | Type: "AWS::KMS::Key" 667 | Properties: 668 | KeyPolicy: 669 | Version: 2012-10-17 670 | Id: key-default-1 671 | Statement: 672 | - Sid: Enable IAM User Permissions 673 | Effect: Allow 674 | Principal: 675 | AWS: 676 | Fn::Join: [":", ["arn:aws:iam:", !Ref AWS::AccountId, "root"]] 677 | Action: "kms:*" 678 | Resource: "*" 679 | - Sid: Enable CodeBuild Encryption 680 | Effect: Allow 681 | Principal: 682 | AWS: !GetAtt CodeBuildRole.Arn 683 | Action: 684 | [ 685 | "kms:Encrypt", 686 | "kms:Decrypt", 687 | "kms:ReEncrypt*", 688 | "kms:GenerateDataKey*", 689 | "kms:DescribeKey", 690 | ] 691 | Resource: "*" 692 | Tags: 693 | - Key: Application 694 | Value: AWS-RoseTTAFold 695 | - Key: Name 696 | Value: 697 | !Join [ 698 | "-", 699 | [ 700 | "aws-rosettafold", 701 | "kms", 702 | !Select [ 703 | 4, 704 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 705 | ], 706 | ], 707 | ] 708 | 709 | RFCodeBuildProject: 710 | Type: AWS::CodeBuild::Project 711 | Properties: 712 | Artifacts: 713 | Type: NO_ARTIFACTS 714 | Description: Build Docker container for RoseTTAFold execution on AWS Batch 715 | EncryptionKey: !Ref CodeBuildEncryptionKey 716 | Environment: 717 | ComputeType: BUILD_GENERAL1_MEDIUM 718 | EnvironmentVariables: 719 | - Name: IMAGE_TAG 720 | Value: latest 721 | - Name: IMAGE_REPO_NAME 722 | Value: !Ref RFContainerRegistry 723 | - Name: ACCOUNT_ID 724 | Value: !Ref AWS::AccountId 725 | Image: aws/codebuild/standard:4.0 726 | ImagePullCredentialsType: CODEBUILD 727 | PrivilegedMode: true 728 | Type: LINUX_CONTAINER 729 | Name: 730 | !Join [ 731 | "-", 732 | [ 733 | "aws-rosettafold", 734 | "codebuild-project", 735 | !Select [ 736 | 4, 737 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 738 | ], 739 | ], 740 | ] 741 | ResourceAccessRole: !GetAtt CodeBuildRole.Arn 742 | ServiceRole: !GetAtt CodeBuildRole.Arn 743 | Source: 744 | BuildSpec: config/container_buildspec.yml 745 | GitCloneDepth: 1 746 | Location: !GetAtt RFCodeRepository.CloneUrlHttp 747 | Type: CODECOMMIT 748 | SourceVersion: refs/heads/main 749 | Tags: 750 | - Key: Application 751 | Value: AWS-RoseTTAFold 752 | - Key: Name 753 | Value: 754 | !Join [ 755 | "-", 756 | [ 757 | "aws-rosettafold", 758 | "codebuild-project", 759 | !Select [ 760 | 4, 761 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 762 | ], 763 | ], 764 | ] 765 | 766 | CodePipelineRole: 767 | Type: AWS::IAM::Role 768 | Properties: 769 | RoleName: 770 | !Join [ 771 | "-", 772 | [ 773 | "aws-rosettafold", 774 | "codepipeline-role", 775 | !Select [ 776 | 4, 777 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 778 | ], 779 | ], 780 | ] 781 | Description: "Required service policies to support running AWS-RoseTTAFold build pipeline" 782 | AssumeRolePolicyDocument: 783 | Version: "2012-10-17" 784 | Statement: 785 | - Effect: Allow 786 | Principal: 787 | Service: 788 | - codepipeline.amazonaws.com 789 | Action: 790 | - "sts:AssumeRole" 791 | Path: / 792 | Policies: 793 | - PolicyName: codePipelineDefault 794 | PolicyDocument: 795 | Version: "2012-10-17" 796 | Statement: 797 | - Action: 798 | - iam:PassRole 799 | Resource: "*" 800 | Effect: Allow 801 | Condition: 802 | StringEqualsIfExists: 803 | iam:PassedToService: 804 | - cloudformation.amazonaws.com 805 | - elasticbeanstalk.amazonaws.com 806 | - ec2.amazonaws.com 807 | - 
ecs-tasks.amazonaws.com 808 | - Action: 809 | - codecommit:CancelUploadArchive 810 | - codecommit:GetBranch 811 | - codecommit:GetCommit 812 | - codecommit:GetRepository 813 | - codecommit:GetUploadArchiveStatus 814 | - codecommit:UploadArchive 815 | Resource: "*" 816 | Effect: Allow 817 | - Action: 818 | - codedeploy:CreateDeployment 819 | - codedeploy:GetApplication 820 | - codedeploy:GetApplicationRevision 821 | - codedeploy:GetDeployment 822 | - codedeploy:GetDeploymentConfig 823 | - codedeploy:RegisterApplicationRevision 824 | Resource: "*" 825 | Effect: Allow 826 | - Action: 827 | - codestar-connections:UseConnection 828 | Resource: "*" 829 | Effect: Allow 830 | - Action: 831 | - elasticbeanstalk:* 832 | - ec2:* 833 | - elasticloadbalancing:* 834 | - autoscaling:* 835 | - cloudwatch:* 836 | - s3:* 837 | - sns:* 838 | - cloudformation:* 839 | - rds:* 840 | - sqs:* 841 | - ecs:* 842 | Resource: "*" 843 | Effect: Allow 844 | - Action: 845 | - lambda:InvokeFunction 846 | - lambda:ListFunctions 847 | Resource: "*" 848 | Effect: Allow 849 | - Action: 850 | - opsworks:CreateDeployment 851 | - opsworks:DescribeApps 852 | - opsworks:DescribeCommands 853 | - opsworks:DescribeDeployments 854 | - opsworks:DescribeInstances 855 | - opsworks:DescribeStacks 856 | - opsworks:UpdateApp 857 | - opsworks:UpdateStack 858 | Resource: "*" 859 | Effect: Allow 860 | - Action: 861 | - cloudformation:CreateStack 862 | - cloudformation:DeleteStack 863 | - cloudformation:DescribeStacks 864 | - cloudformation:UpdateStack 865 | - cloudformation:CreateChangeSet 866 | - cloudformation:DeleteChangeSet 867 | - cloudformation:DescribeChangeSet 868 | - cloudformation:ExecuteChangeSet 869 | - cloudformation:SetStackPolicy 870 | - cloudformation:ValidateTemplate 871 | Resource: "*" 872 | Effect: Allow 873 | - Action: 874 | - codebuild:BatchGetBuilds 875 | - codebuild:StartBuild 876 | - codebuild:BatchGetBuildBatches 877 | - codebuild:StartBuildBatch 878 | Resource: "*" 879 | Effect: Allow 880 | - Effect: Allow 881 | Action: 882 | - devicefarm:ListProjects 883 | - devicefarm:ListDevicePools 884 | - devicefarm:GetRun 885 | - devicefarm:GetUpload 886 | - devicefarm:CreateUpload 887 | - devicefarm:ScheduleRun 888 | Resource: "*" 889 | - Effect: Allow 890 | Action: 891 | - servicecatalog:ListProvisioningArtifacts 892 | - servicecatalog:CreateProvisioningArtifact 893 | - servicecatalog:DescribeProvisioningArtifact 894 | - servicecatalog:DeleteProvisioningArtifact 895 | - servicecatalog:UpdateProduct 896 | Resource: "*" 897 | - Effect: Allow 898 | Action: 899 | - cloudformation:ValidateTemplate 900 | Resource: "*" 901 | - Effect: Allow 902 | Action: 903 | - ecr:DescribeImages 904 | Resource: "*" 905 | - Effect: Allow 906 | Action: 907 | - states:DescribeExecution 908 | - states:DescribeStateMachine 909 | - states:StartExecution 910 | Resource: "*" 911 | - Effect: Allow 912 | Action: 913 | - appconfig:StartDeployment 914 | - appconfig:StopDeployment 915 | - appconfig:GetDeployment 916 | Resource: "*" 917 | Tags: 918 | - Key: Application 919 | Value: AWS-RoseTTAFold 920 | - Key: Name 921 | Value: 922 | !Join [ 923 | "-", 924 | [ 925 | "aws-rosettafold", 926 | "codepipeline-role", 927 | !Select [ 928 | 4, 929 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 930 | ], 931 | ], 932 | ] 933 | 934 | RFCodePipeline: 935 | Type: AWS::CodePipeline::Pipeline 936 | Properties: 937 | ArtifactStore: 938 | Location: !Ref ResultsS3 939 | Type: S3 940 | Name: 941 | !Join [ 942 | "-", 943 | [ 944 | "aws-rosettafold", 945 | 
"codepipeline", 946 | !Select [ 947 | 4, 948 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 949 | ], 950 | ], 951 | ] 952 | RestartExecutionOnUpdate: true 953 | RoleArn: !GetAtt CodePipelineRole.Arn 954 | Stages: 955 | - Name: Source 956 | Actions: 957 | - Name: Source 958 | ActionTypeId: 959 | Category: Source 960 | Owner: AWS 961 | Provider: CodeCommit 962 | Version: 1 963 | Configuration: 964 | RepositoryName: !GetAtt RFCodeRepository.Name 965 | BranchName: main 966 | PollForSourceChanges: "false" 967 | Namespace: SourceVariables 968 | OutputArtifacts: 969 | - Name: SourceArtifact 970 | Region: !Ref AWS::Region 971 | RunOrder: 1 972 | - Name: Build 973 | Actions: 974 | - Name: Build 975 | ActionTypeId: 976 | Category: Build 977 | Owner: AWS 978 | Provider: CodeBuild 979 | Version: 1 980 | Configuration: 981 | ProjectName: !Ref RFCodeBuildProject 982 | InputArtifacts: 983 | - Name: SourceArtifact 984 | Namespace: BuildVariables 985 | OutputArtifacts: 986 | - Name: BuildArtifact 987 | Region: !Ref AWS::Region 988 | RunOrder: 2 989 | Tags: 990 | - Key: Application 991 | Value: AWS-RoseTTAFold 992 | - Key: Name 993 | Value: 994 | !Join [ 995 | "-", 996 | [ 997 | "aws-rosettafold", 998 | "codepipeline", 999 | !Select [ 1000 | 4, 1001 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1002 | ], 1003 | ], 1004 | ] 1005 | 1006 | ################################################## 1007 | # Batch Environment 1008 | ################################################## 1009 | 1010 | CPUComputeEnvironment: 1011 | Type: AWS::Batch::ComputeEnvironment 1012 | Properties: 1013 | ComputeEnvironmentName: 1014 | !Join [ 1015 | "-", 1016 | [ 1017 | "aws-rosettafold", 1018 | "ce-cpu", 1019 | !Select [ 1020 | 4, 1021 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1022 | ], 1023 | ], 1024 | ] 1025 | ComputeResources: 1026 | AllocationStrategy: BEST_FIT_PROGRESSIVE 1027 | InstanceRole: !Ref InstanceProfile 1028 | InstanceTypes: 1029 | - optimal 1030 | LaunchTemplate: 1031 | LaunchTemplateId: !Ref InstanceLaunchTemplate 1032 | Version: $Latest 1033 | MaxvCpus: 256 1034 | MinvCpus: 0 1035 | SecurityGroupIds: 1036 | - !GetAtt VPC.DefaultSecurityGroup 1037 | Subnets: 1038 | - Ref: PrivateSubnet0 1039 | Type: EC2 1040 | State: ENABLED 1041 | Type: MANAGED 1042 | Tags: 1043 | Application: AWS-RoseTTAFold 1044 | Name: 1045 | !Join [ 1046 | "-", 1047 | [ 1048 | "aws-rosettafold", 1049 | "ce-cpu", 1050 | !Select [ 1051 | 4, 1052 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1053 | ], 1054 | ], 1055 | ] 1056 | 1057 | GPUComputeEnvironment: 1058 | Type: AWS::Batch::ComputeEnvironment 1059 | Properties: 1060 | ComputeEnvironmentName: 1061 | !Join [ 1062 | "-", 1063 | [ 1064 | "aws-rosettafold", 1065 | "ce-gpu", 1066 | !Select [ 1067 | 4, 1068 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1069 | ], 1070 | ], 1071 | ] 1072 | ComputeResources: 1073 | AllocationStrategy: BEST_FIT_PROGRESSIVE 1074 | InstanceRole: !Ref InstanceProfile 1075 | InstanceTypes: 1076 | - g4dn 1077 | LaunchTemplate: 1078 | LaunchTemplateId: !Ref InstanceLaunchTemplate 1079 | Version: $Latest 1080 | MaxvCpus: 256 1081 | MinvCpus: 0 1082 | SecurityGroupIds: 1083 | - !GetAtt VPC.DefaultSecurityGroup 1084 | Subnets: 1085 | - Ref: PrivateSubnet0 1086 | Type: EC2 1087 | State: ENABLED 1088 | Type: MANAGED 1089 | Tags: 1090 | Application: AWS-RoseTTAFold 1091 | Name: 1092 | !Join [ 1093 | "-", 1094 | [ 1095 | "aws-rosettafold", 1096 | "ce-gpu", 1097 | !Select [ 1098 | 4, 1099 | !Split ["-", 
!Select [2, !Split ["/", !Ref AWS::StackId]]], 1100 | ], 1101 | ], 1102 | ] 1103 | 1104 | CPUJobQueue: 1105 | Type: AWS::Batch::JobQueue 1106 | Properties: 1107 | ComputeEnvironmentOrder: 1108 | - ComputeEnvironment: !Ref CPUComputeEnvironment 1109 | Order: 1 1110 | JobQueueName: 1111 | !Join [ 1112 | "-", 1113 | [ 1114 | "aws-rosettafold", 1115 | "queue-cpu", 1116 | !Select [ 1117 | 4, 1118 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1119 | ], 1120 | ], 1121 | ] 1122 | Priority: 10 1123 | State: ENABLED 1124 | Tags: 1125 | Application: AWS-RoseTTAFold 1126 | Name: 1127 | !Join [ 1128 | "-", 1129 | [ 1130 | "aws-rosettafold", 1131 | "queue-cpu", 1132 | !Select [ 1133 | 4, 1134 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1135 | ], 1136 | ], 1137 | ] 1138 | 1139 | GPUJobQueue: 1140 | Type: AWS::Batch::JobQueue 1141 | Properties: 1142 | ComputeEnvironmentOrder: 1143 | - ComputeEnvironment: !Ref GPUComputeEnvironment 1144 | Order: 1 1145 | JobQueueName: 1146 | !Join [ 1147 | "-", 1148 | [ 1149 | "aws-rosettafold", 1150 | "queue-gpu", 1151 | !Select [ 1152 | 4, 1153 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1154 | ], 1155 | ], 1156 | ] 1157 | Priority: 10 1158 | State: ENABLED 1159 | Tags: 1160 | Application: AWS-RoseTTAFold 1161 | Name: 1162 | !Join [ 1163 | "-", 1164 | [ 1165 | "aws-rosettafold", 1166 | "queue-gpu", 1167 | !Select [ 1168 | 4, 1169 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1170 | ], 1171 | ], 1172 | ] 1173 | 1174 | CPUDataPrepJobDefinition: 1175 | Type: AWS::Batch::JobDefinition 1176 | Properties: 1177 | ContainerProperties: 1178 | Command: 1179 | - "/bin/bash" 1180 | - "run_aws_data_prep_ver.sh" 1181 | - "-i" 1182 | - !Join ["", ["s3://", !Ref ResultsS3]] 1183 | - "-o" 1184 | - !Join ["", ["s3://", !Ref ResultsS3]] 1185 | - "-n" 1186 | - "input.fa" 1187 | - "-w" 1188 | - "/work" 1189 | - "-d" 1190 | - "/fsx/aws-rosettafold-ref-data" 1191 | - "-c" 1192 | - "8" 1193 | - "-m" 1194 | - "32" 1195 | Image: 1196 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]] 1197 | LogConfiguration: 1198 | LogDriver: awslogs 1199 | MountPoints: 1200 | - ContainerPath: /fsx 1201 | ReadOnly: False 1202 | SourceVolume: fsx 1203 | ResourceRequirements: 1204 | - Type: VCPU 1205 | Value: 8 1206 | - Type: MEMORY 1207 | Value: 32000 1208 | Volumes: 1209 | - Name: fsx 1210 | Host: 1211 | SourcePath: /fsx 1212 | JobDefinitionName: 1213 | !Join [ 1214 | "-", 1215 | [ 1216 | "aws-rosettafold", 1217 | "job-def-cpudataprep", 1218 | !Select [ 1219 | 4, 1220 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1221 | ], 1222 | ], 1223 | ] 1224 | PlatformCapabilities: 1225 | - EC2 1226 | PropagateTags: true 1227 | RetryStrategy: 1228 | Attempts: 3 1229 | Tags: 1230 | Application: AWS-RoseTTAFold 1231 | Name: 1232 | !Join [ 1233 | "-", 1234 | [ 1235 | "aws-rosettafold", 1236 | "job-def-cpudataprep", 1237 | !Select [ 1238 | 4, 1239 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1240 | ], 1241 | ], 1242 | ] 1243 | Type: container 1244 | 1245 | GPUPredictJobDefinition: 1246 | Type: AWS::Batch::JobDefinition 1247 | Properties: 1248 | ContainerProperties: 1249 | Command: 1250 | - "/bin/bash" 1251 | - "run_aws_predict_ver.sh" 1252 | - "-i" 1253 | - !Join ["", ["s3://", !Ref ResultsS3]] 1254 | - "-o" 1255 | - !Join ["", ["s3://", !Ref ResultsS3]] 1256 | - "-w" 1257 | - "/work" 1258 | - "-d" 1259 | - "/fsx/aws-rosettafold-ref-data" 1260 | - "-x" 1261 | - "/fsx/aws-rosettafold-ref-data" 1262 | - "-c" 1263 
| - "4" 1264 | - "-m" 1265 | - "16" 1266 | Image: 1267 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]] 1268 | LogConfiguration: 1269 | LogDriver: awslogs 1270 | MountPoints: 1271 | - ContainerPath: /fsx 1272 | ReadOnly: False 1273 | SourceVolume: fsx 1274 | ResourceRequirements: 1275 | - Type: VCPU 1276 | Value: 4 1277 | - Type: MEMORY 1278 | Value: 16000 1279 | - Type: GPU 1280 | Value: 1 1281 | Volumes: 1282 | - Name: fsx 1283 | Host: 1284 | SourcePath: /fsx 1285 | JobDefinitionName: 1286 | !Join [ 1287 | "-", 1288 | [ 1289 | "aws-rosettafold", 1290 | "job-def-gpupredict", 1291 | !Select [ 1292 | 4, 1293 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1294 | ], 1295 | ], 1296 | ] 1297 | PlatformCapabilities: 1298 | - EC2 1299 | PropagateTags: true 1300 | RetryStrategy: 1301 | Attempts: 3 1302 | Tags: 1303 | Application: AWS-RoseTTAFold 1304 | Name: 1305 | !Join [ 1306 | "-", 1307 | [ 1308 | "aws-rosettafold", 1309 | "job-def-gpupredict", 1310 | !Select [ 1311 | 4, 1312 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1313 | ], 1314 | ], 1315 | ] 1316 | Type: container 1317 | 1318 | CPUPredictJobDefinition: 1319 | Type: AWS::Batch::JobDefinition 1320 | Properties: 1321 | ContainerProperties: 1322 | Command: 1323 | - "/bin/bash" 1324 | - "run_aws_predict_ver.sh" 1325 | - "-i" 1326 | - !Join ["", ["s3://", !Ref ResultsS3]] 1327 | - "-o" 1328 | - !Join ["", ["s3://", !Ref ResultsS3]] 1329 | - "-w" 1330 | - "/work" 1331 | - "-d" 1332 | - "/fsx/aws-rosettafold-ref-data" 1333 | - "-x" 1334 | - "/fsx/aws-rosettafold-ref-data" 1335 | - "-c" 1336 | - "4" 1337 | - "-m" 1338 | - "64" 1339 | Image: 1340 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]] 1341 | LogConfiguration: 1342 | LogDriver: awslogs 1343 | MountPoints: 1344 | - ContainerPath: /fsx 1345 | ReadOnly: False 1346 | SourceVolume: fsx 1347 | ResourceRequirements: 1348 | - Type: VCPU 1349 | Value: 4 1350 | - Type: MEMORY 1351 | Value: 64000 1352 | Volumes: 1353 | - Name: fsx 1354 | Host: 1355 | SourcePath: /fsx 1356 | JobDefinitionName: 1357 | !Join [ 1358 | "-", 1359 | [ 1360 | "aws-rosettafold", 1361 | "job-def-cpupredict", 1362 | !Select [ 1363 | 4, 1364 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1365 | ], 1366 | ], 1367 | ] 1368 | PlatformCapabilities: 1369 | - EC2 1370 | PropagateTags: true 1371 | RetryStrategy: 1372 | Attempts: 3 1373 | Tags: 1374 | Application: AWS-RoseTTAFold 1375 | Name: 1376 | !Join [ 1377 | "-", 1378 | [ 1379 | "aws-rosettafold", 1380 | "job-def-cpupredict", 1381 | !Select [ 1382 | 4, 1383 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1384 | ], 1385 | ], 1386 | ] 1387 | Type: container 1388 | 1389 | Outputs: 1390 | CodeRepoUri: 1391 | Description: URI for cloning the CodeCommit repository over HTTPS 1392 | Value: !GetAtt RFCodeRepository.CloneUrlHttp 1393 | Export: 1394 | Name: !Join [":", [!Ref "AWS::StackName", CodeRepoUri]] 1395 | CPUJobQueueName: 1396 | Description: Name of the CPU job queue. 1397 | Value: !Select [5, !Split [":", !Ref CPUJobQueue]] 1398 | Export: 1399 | Name: !Join [":", [!Ref "AWS::StackName", CPUJobQueueName]] 1400 | GPUJobQueueName: 1401 | Description: Name of the GPU job queue. 1402 | Value: !Select [5, !Split [":", !Ref GPUJobQueue]] 1403 | Export: 1404 | Name: !Join [":", [!Ref "AWS::StackName", GPUJobQueueName]] 1405 | CPUDataPrepJobDefinition: 1406 | Description: Name of the data prep CPU job definition. 
    Value: !Select [5, !Split [":", !Ref CPUDataPrepJobDefinition]]
    Export:
      Name: !Join [":", [!Ref "AWS::StackName", CPUDataPrepJobDefinition]]
  GPUPredictJobDefinition:
    Description: Name of the predict GPU job definition.
    Value: !Select [5, !Split [":", !Ref GPUPredictJobDefinition]]
    Export:
      Name: !Join [":", [!Ref "AWS::StackName", GPUPredictJobDefinition]]
  CPUPredictJobDefinition:
    Description: Name of the predict CPU job definition.
    Value: !Select [5, !Split [":", !Ref CPUPredictJobDefinition]]
    Export:
      Name: !Join [":", [!Ref "AWS::StackName", CPUPredictJobDefinition]]
--------------------------------------------------------------------------------
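The Outputs block above exports the generated Batch queue and job definition names so client code can discover them. A minimal sketch of reading them back with boto3; the stack name "aws-rosettafold" is a placeholder for whatever name was chosen at deployment:

import boto3

# "aws-rosettafold" is a placeholder; substitute the deployed stack name.
cfn = boto3.client("cloudformation")
stack = cfn.describe_stacks(StackName="aws-rosettafold")["Stacks"][0]
outputs = {o["OutputKey"]: o["OutputValue"] for o in stack["Outputs"]}
print(outputs["CPUJobQueueName"], outputs["GPUJobQueueName"])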
/config/container_buildspec.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  pre_build:
    commands:
      - echo Logging in to Amazon ECR...
      - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com
  build:
    commands:
      - echo Build started on `date`
      - echo Building the Docker image...
      - docker build -t $IMAGE_REPO_NAME:$IMAGE_TAG config
      - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
  post_build:
    commands:
      - echo Build completed on `date`
      - echo Pushing the Docker image...
      - docker push $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
--------------------------------------------------------------------------------
/config/download_ref_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash
yum install wget tar -y
cd /fsx

# NOTE: The RoseTTAFold network weights are covered under the Rosetta-DL software license.
# Please see https://files.ipd.uw.edu/pub/RoseTTAFold/Rosetta-DL_LICENSE.txt for more
# information.
wget https://files.ipd.uw.edu/pub/RoseTTAFold/weights.tar.gz
tar xfz weights.tar.gz
rm weights.tar.gz

# uniref30 [46G]
wget http://wwwuser.gwdg.de/~compbiol/uniclust/2020_06/UniRef30_2020_06_hhsuite.tar.gz
mkdir -p UniRef30_2020_06
tar xfz UniRef30_2020_06_hhsuite.tar.gz -C ./UniRef30_2020_06
rm UniRef30_2020_06_hhsuite.tar.gz

# structure templates (including *_a3m.ffdata, *_a3m.ffindex) [over 100G]
wget https://files.ipd.uw.edu/pub/RoseTTAFold/pdb100_2021Mar03.tar.gz
tar xfz pdb100_2021Mar03.tar.gz
rm pdb100_2021Mar03.tar.gz

# BFD [272G]
wget https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz
mkdir -p bfd
tar xfz bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz -C ./bfd
rm bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz
--------------------------------------------------------------------------------
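Extracted, the reference data above comes to roughly half a terabyte (UniRef30 ~46 GB, structure templates over 100 GB, BFD ~272 GB, plus the network weights), so it is worth confirming that the FSx for Lustre volume has room before starting the downloads. A quick sanity check, assuming the same /fsx mount point used by the script:

import shutil

# /fsx is the Lustre mount point targeted by download_ref_data.sh above.
total, used, free = shutil.disk_usage("/fsx")
print(f"/fsx: {free / 1e12:.2f} TB free of {total / 1e12:.2f} TB")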
/config/run_aws_data_prep_ver.sh:
--------------------------------------------------------------------------------
#!/bin/bash

############################################################
# Run the RoseTTAFold E2E data prep stage (MSA generation, secondary
# structure prediction, and template search) on AWS
## Options
# -i (Required) S3 path to input folder
# -o (Required) S3 path to output folder
# -n Input file name (e.g. input.fa)
# -p Prefix to use for output files
# -w Path to working folder on run environment file system
# -d Path to database folder on run environment file system
# -c Max CPU count
# -m Max memory amount (GB)
#
# Example CMD
# ./AWS-RoseTTAFold/run_aws_data_prep_ver.sh \
# -i s3://032243382548-rf-run-data/input \
# -o s3://032243382548-rf-run-data/output \
# -n input.fa \
# -w ~/work \
# -d /fsx \
# -c 16 \
# -m 64

# make the script stop when an error (non-true exit code) occurs
set -e
START="$(date +%s)"
############################################################
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('conda' 'shell.bash' 'hook' 2> /dev/null)"
eval "$__conda_setup"
unset __conda_setup
# <<< conda initialize <<<
############################################################

unset -v SCRIPT PIPEDIR UUID INPUT_S3_FOLDER OUTPUT_S3_FOLDER \
    INPUT_FILE WDIR DBDIR CPU MEM

SCRIPT=`realpath -s $0`
SCRIPTDIR=`dirname $SCRIPT`

while getopts "i:o:n:p:w:d:c:m:" option
do
    case $option in
    i) INPUT_S3_FOLDER=$OPTARG ;; # s3 URI to input folder
    o) OUTPUT_S3_FOLDER=$OPTARG ;; # s3 URI to output folder
    n) INPUT_FILE=$OPTARG ;; # input file name, e.g. input.fa
    p) UUID=$OPTARG ;; # File prefix
    w) WDIR=$OPTARG ;; # path to local working folder
    d) DBDIR=$OPTARG ;; # path to local sequence databases
    c) CPU=$OPTARG ;; # vCPU
    m) MEM=$OPTARG ;; # MEM (GB)
    *) exit 1 ;;
    esac
done

[ -z "$INPUT_S3_FOLDER" ] && { echo "\$INPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$OUTPUT_S3_FOLDER" ] && { echo "\$OUTPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$INPUT_FILE" ] && { INPUT_FILE="input.fa"; }
[ -z "$WDIR" ] && { WDIR=$SCRIPTDIR; }
[ -z "$DBDIR" ] && { DBDIR=$WDIR; }
[ -z "$CPU" ] && { CPU="16"; }
[ -z "$MEM" ] && { MEM="64"; }

if [ -z "$UUID" ]
then
    if [ -z "$AWS_BATCH_JOB_ID" ]
    then
        UUID=`date "+%Y%m%d%H%M%S"`;
    else
        UUID=$AWS_BATCH_JOB_ID;
    fi
fi

IN=$WDIR/input.fa
aws s3 cp $INPUT_S3_FOLDER/$INPUT_FILE $IN

ls $WDIR
#LENGTH=`tail -n1 $IN | wc -m`
LENGTH=`grep -v -e "^>" $IN | tr -d "\n" | wc -m`

conda activate RoseTTAFold

############################################################
# 1. generate MSAs
############################################################
MSA_START="$(date +%s)"

if [ ! -s $WDIR/t000_.msa0.a3m ]
then
    export PIPEDIR=$DBDIR
    echo "Running HHblits"
    $SCRIPTDIR/input_prep/make_msa.sh $IN $WDIR $CPU $MEM $DBDIR
fi

MSA_COUNT=`grep "^>" $WDIR/t000_.msa0.a3m -c`

aws s3 cp $WDIR/t000_.msa0.a3m $OUTPUT_S3_FOLDER/$UUID.msa0.a3m

MSA_DURATION=$[ $(date +%s) - ${MSA_START} ]
echo "${UUID} MSA duration: ${MSA_DURATION} sec"

############################################################
# 2. predict secondary structure for HHsearch run
############################################################
SS_START="$(date +%s)"
if [ ! -s $WDIR/t000_.ss2 ]
then
    export PIPEDIR=$SCRIPTDIR
    echo "Running PSIPRED"
    $SCRIPTDIR/input_prep/make_ss.sh $WDIR/t000_.msa0.a3m $WDIR/t000_.ss2
fi

aws s3 cp $WDIR/t000_.ss2 $OUTPUT_S3_FOLDER/$UUID.ss2

SS_DURATION=$[ $(date +%s) - ${SS_START} ]
echo "${UUID} SS duration: ${SS_DURATION} sec"

############################################################
# 3. search for templates
############################################################
TEMPLATE_START="$(date +%s)"
DB="$DBDIR/pdb100_2021Mar03/pdb100_2021Mar03"
if [ ! -s $WDIR/t000_.hhr ]
then
    echo "Running hhsearch"
    HH="hhsearch -b 50 -B 500 -z 50 -Z 500 -mact 0.05 -cpu $CPU -maxmem $MEM -aliw 100000 -e 100 -p 5.0 -d $DB"
    cat $WDIR/t000_.ss2 $WDIR/t000_.msa0.a3m > $WDIR/t000_.msa0.ss2.a3m
    $HH -i $WDIR/t000_.msa0.ss2.a3m -o $WDIR/t000_.hhr -atab $WDIR/t000_.atab -v 2
fi

TEMPLATE_COUNT=`grep "^No [[:digit:]]*$" $WDIR/t000_.hhr -c`

aws s3 cp $WDIR/t000_.msa0.ss2.a3m $OUTPUT_S3_FOLDER/$UUID.msa0.ss2.a3m
aws s3 cp $WDIR/t000_.hhr $OUTPUT_S3_FOLDER/$UUID.hhr
aws s3 cp $WDIR/t000_.atab $OUTPUT_S3_FOLDER/$UUID.atab

TEMPLATE_DURATION=$[ $(date +%s) - ${TEMPLATE_START} ]
echo "${UUID} template search duration: ${TEMPLATE_DURATION} sec"

TOTAL_DATA_PREP_DURATION=$[ $(date +%s) - ${START} ]
echo "${UUID} total data prep duration: ${TOTAL_DATA_PREP_DURATION} sec"

# Collect metrics
echo "DATA_PREP:" >> $WDIR/metrics.yaml
echo " JOB_ID: ${UUID}" >> $WDIR/metrics.yaml
echo " INPUT_S3_FOLDER: ${INPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " INPUT_FILE: ${INPUT_FILE}" >> $WDIR/metrics.yaml
echo " OUTPUT_S3_FOLDER: ${OUTPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " WDIR: ${WDIR}" >> $WDIR/metrics.yaml
echo " DBDIR: ${DBDIR}" >> $WDIR/metrics.yaml
echo " CPU: ${CPU}" >> $WDIR/metrics.yaml
echo " MEM: ${MEM}" >> $WDIR/metrics.yaml
echo " LENGTH: ${LENGTH}" >> $WDIR/metrics.yaml
echo " MSA_COUNT: ${MSA_COUNT}" >> $WDIR/metrics.yaml
echo " TEMPLATE_COUNT: ${TEMPLATE_COUNT}" >> $WDIR/metrics.yaml
echo " START_TIME: ${START}" >> $WDIR/metrics.yaml
echo " MSA_DURATION: ${MSA_DURATION}" >> $WDIR/metrics.yaml
echo " SS_DURATION: ${SS_DURATION}" >> $WDIR/metrics.yaml
echo " TEMPLATE_DURATION: ${TEMPLATE_DURATION}" >> $WDIR/metrics.yaml
echo " TOTAL_DATA_PREP_DURATION: ${TOTAL_DATA_PREP_DURATION}" >> $WDIR/metrics.yaml

aws s3 cp $WDIR/metrics.yaml $OUTPUT_S3_FOLDER/metrics.yaml

echo "Done"
--------------------------------------------------------------------------------
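The script above is what the CPU data prep job definition in the CloudFormation template wraps. For reference, a minimal boto3 sketch of submitting it directly; the queue and job definition names in angle brackets are placeholders whose real values come from the stack outputs:

import boto3

batch = boto3.client("batch")

response = batch.submit_job(
    jobName="rf-data-prep-example",
    jobQueue="<CPUJobQueueName>",  # placeholder
    jobDefinition="<CPUDataPrepJobDefinition>",  # placeholder
    containerOverrides={
        "command": [
            "/bin/bash", "run_aws_data_prep_ver.sh",
            "-i", "s3://my-bucket/my-job",  # placeholder S3 prefix
            "-o", "s3://my-bucket/my-job",
            "-n", "input.fa",
            "-p", "my-job",
            "-w", "/work",
            "-d", "/fsx/aws-rosettafold-ref-data",
            "-c", "8",
            "-m", "32",
        ],
        "resourceRequirements": [
            {"type": "VCPU", "value": "8"},
            {"type": "MEMORY", "value": "32000"},
        ],
    },
)
print(f"Data prep job {response['jobId']} submitted")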
/config/run_aws_predict_ver.sh:
--------------------------------------------------------------------------------
#!/bin/bash

############################################################
# Run the RoseTTAFold end-to-end prediction stage on AWS
## Options
# -i (Required) S3 path to input folder
# -o (Required) S3 path to output folder
# -p Prefix to use for output files
# -w Path to working folder on run environment file system
# -d Path to database folder on run environment file system
# -x Path to model weights folder on run environment
# -c Max CPU count
# -m Max memory amount (GB)
#
# Example CMD
# ./AWS-RoseTTAFold/run_aws_predict_ver.sh \
# -i s3://032243382548-rf-run-data/input \
# -o s3://032243382548-rf-run-data/output \
# -w ~/work \
# -d /fsx/RoseTTAFold \
# -x /fsx/RoseTTAFold \
# -c 16 \
# -m 64

# make the script stop when an error (non-true exit code) occurs
set -e
START="$(date +%s)"
############################################################
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('conda' 'shell.bash' 'hook' 2> /dev/null)"
eval "$__conda_setup"
unset __conda_setup
# <<< conda initialize <<<
############################################################

unset -v SCRIPT PIPEDIR UUID INPUT_S3_FOLDER OUTPUT_S3_FOLDER \
    INPUT_FILE WDIR DBDIR MODEL_WEIGHTS_DIR CPU MEM

SCRIPT=`realpath -s $0`
SCRIPTDIR=`dirname $SCRIPT`

while getopts "i:o:p:w:d:x:c:m:" option
do
    case $option in
    i) INPUT_S3_FOLDER=$OPTARG ;; # s3 URI to input folder
    o) OUTPUT_S3_FOLDER=$OPTARG ;; # s3 URI to output folder
    p) UUID=$OPTARG ;; # File prefix
    w) WDIR=$OPTARG ;; # path to local working folder
    d) DBDIR=$OPTARG ;; # path to local sequence databases
    x) MODEL_WEIGHTS_DIR=$OPTARG ;; # path to local weights
    c) CPU=$OPTARG ;; # vCPU
    m) MEM=$OPTARG ;; # MEM (GB)
    *) exit 1 ;;
    esac
done

[ -z "$INPUT_S3_FOLDER" ] && { echo "\$INPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$OUTPUT_S3_FOLDER" ] && { echo "\$OUTPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$WDIR" ] && { WDIR=$SCRIPTDIR; }
[ -z "$DBDIR" ] && { DBDIR=$WDIR; }
[ -z "$MODEL_WEIGHTS_DIR" ] && { MODEL_WEIGHTS_DIR=$WDIR; }
[ -z "$CPU" ] && { CPU="16"; }
[ -z "$MEM" ] && { MEM="64"; }
[ -z "$CUDA_VISIBLE_DEVICES" ] && { CUDA_VISIBLE_DEVICES="99"; }

if [ -z "$UUID" ]
then
    if [ -z "$AWS_BATCH_JOB_ID" ]
    then
        UUID=`date "+%Y%m%d%H%M%S"`;
    else
        UUID=$AWS_BATCH_JOB_ID;
    fi
fi

IN=$WDIR/input.fa

conda activate RoseTTAFold

aws s3 cp $INPUT_S3_FOLDER/$UUID.msa0.a3m $WDIR/t000_.msa0.a3m
aws s3 cp $INPUT_S3_FOLDER/$UUID.hhr $WDIR/t000_.hhr
aws s3 cp $INPUT_S3_FOLDER/$UUID.atab $WDIR/t000_.atab
aws s3 cp $INPUT_S3_FOLDER/metrics.yaml $WDIR/metrics.yaml

############################################################
# End-to-end prediction
############################################################
PREDICT_START="$(date +%s)"
if [ ! -s $WDIR/t000_.3track.npz ]
then
    echo "Running end-to-end prediction"
    DB="$DBDIR/pdb100_2021Mar03/pdb100_2021Mar03"

    python $SCRIPTDIR/network/predict_e2e.py \
        -m $MODEL_WEIGHTS_DIR/weights \
        -i $WDIR/t000_.msa0.a3m \
        -o $WDIR/t000_.e2e \
        --hhr $WDIR/t000_.hhr \
        --atab $WDIR/t000_.atab \
        --db $DB
fi

aws s3 cp $WDIR/t000_.e2e.pdb $OUTPUT_S3_FOLDER/$UUID.e2e.pdb
aws s3 cp $WDIR/t000_.e2e_init.pdb $OUTPUT_S3_FOLDER/$UUID.e2e_init.pdb
aws s3 cp $WDIR/t000_.e2e.npz $OUTPUT_S3_FOLDER/$UUID.e2e.npz

TOTAL_PREDICT_DURATION=$[ $(date +%s) - ${PREDICT_START} ]
echo "${UUID} prediction duration: ${TOTAL_PREDICT_DURATION} sec"

# Collect metrics
echo "PREDICT:" >> $WDIR/metrics.yaml
echo " JOB_ID: ${UUID}" >> $WDIR/metrics.yaml
echo " INPUT_S3_FOLDER: ${INPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " OUTPUT_S3_FOLDER: ${OUTPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " WDIR: ${WDIR}" >> $WDIR/metrics.yaml
echo " DBDIR: ${DBDIR}" >> $WDIR/metrics.yaml
echo " MODEL_WEIGHTS_DIR: ${MODEL_WEIGHTS_DIR}" >> $WDIR/metrics.yaml
echo " CPU: ${CPU}" >> $WDIR/metrics.yaml
echo " MEM: ${MEM}" >> $WDIR/metrics.yaml
echo " GPU: ${CUDA_VISIBLE_DEVICES}" >> $WDIR/metrics.yaml
echo " START_TIME: ${PREDICT_START}" >> $WDIR/metrics.yaml
echo " TOTAL_PREDICT_DURATION: ${TOTAL_PREDICT_DURATION}" >> $WDIR/metrics.yaml

aws s3 cp $WDIR/metrics.yaml $OUTPUT_S3_FOLDER/metrics.yaml

echo "Done"
--------------------------------------------------------------------------------
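The prediction stage is normally queued with a dependency on the data prep job, so it starts only after the MSA, secondary structure, and template files have landed in S3. A minimal boto3 sketch of that submission; names in angle brackets are placeholders from the stack outputs:

import boto3

batch = boto3.client("batch")
data_prep_job_id = "<jobId returned by the data prep submission>"  # placeholder

response = batch.submit_job(
    jobName="rf-predict-example",
    jobQueue="<GPUJobQueueName>",  # placeholder
    jobDefinition="<GPUPredictJobDefinition>",  # placeholder
    # Start only after the data prep job has succeeded.
    dependsOn=[{"jobId": data_prep_job_id, "type": "SEQUENTIAL"}],
    containerOverrides={
        "command": [
            "/bin/bash", "run_aws_predict_ver.sh",
            "-i", "s3://my-bucket/my-job",  # placeholder S3 prefix
            "-o", "s3://my-bucket/my-job",
            "-p", "my-job",
            "-w", "/work",
            "-d", "/fsx/aws-rosettafold-ref-data",
            "-x", "/fsx/aws-rosettafold-ref-data",
            "-c", "4",
            "-m", "16",
        ],
        "resourceRequirements": [
            {"type": "VCPU", "value": "4"},
            {"type": "MEMORY", "value": "16000"},
            {"type": "GPU", "value": "1"},
        ],
    },
)
print(f"Predict job {response['jobId']} submitted")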
/data/T1028.fa:
--------------------------------------------------------------------------------
>T1028 CalU17, Micromonospora echinospora, 316 residues|
MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQAFPRDWFTDETPQIPVTLPDYLIDRHQVTVAQFAAFVSRTGYVTSAERAGGSMVYGEQYWEIREGACWHRPAGYGSGIRGRDDHPVVHISFADAEAYARWAGRRLPTESEWERAATGPSYRLWPWGDTWDSRNANTAEHTAGALGDLDAWRTWWGAIHAVQGPMPQTTPVGAFSPRGDSVDGCADMTGNVYEWTSTLAHLYSPATRCDPTIHLVMGRSRVIRGGSWMNFRYQVRCAERLYGDPTGWSNFALGFRCARDVTAVPHVDDNGR
--------------------------------------------------------------------------------
/data/T1036s1.fa:
--------------------------------------------------------------------------------
>T1036s1 Monoclonal antibody 93k, Varicella-zoster virus, strain pOka, subunit 1, 622 residues|
TKPTFYVCPPPTGSTIVRLEPPRTCPDYHLGKNFTEGIAVVYKENIAAYKFKATVYYKDVIVSTAWAGSSYTQITNRYADRVPIPVSEITDTIDKFGKCSSKATYVRNNHKVEAFNEDKNPQDMPLIASKYNSVGSKAWHTTNDTYMVAGTPGTYRTGTSVNCIIEEVEARSIFPYDSFGLSTGDIIYMSPFFGLRDGAYREHSNYAMDRFHQFEGYRQRDLDTRALLEPAARNFLVTPHLTVGWNWKPKRTEVCSLVKWREVEDVVRDEYAHNFRFTMKTLSTTFISETNEFNLNQIHLSQCVKEEARAIINRIYTTRYNSSHVRTGDIQTYLARGGFVVVFQPLLSNSLARLYLQELVRENTNHSPQKHPTRNTRSRRSVPVELRANRTITTTSSVEFAMLQFTYDHIQEHVNEMLARISSSWCQLQNRERALWSGLFPINPSALASTILDQRVKARILGDVISVSNCPELGSDTRIILQNSMRVSGSTTRCYSRPLISIVSLNGSGTVEGQLGTDNELIMSRDLLEPCVANHKRYFLFGHHYVYYEDYRYVREIAVHDVGMISTYVDLNLTLLKDREFMPLRVYTRDELRDTGLLDYSEIQRRNQMHSLRFYDIDKVVQ
--------------------------------------------------------------------------------
/data/T1078.fa:
--------------------------------------------------------------------------------
>T1078 Tsp1, Trichoderma virens, 138 residues|
MAAPTPADKSMMAAVPEWTITNLKRVCNAGNTSCTWTFGVDTHLATATSCTYVVKANANASQASGGPVTCGPYTITSSWSGQFGPNNGFTTFAVTDFSKKLIVWPAYTDVQVQAGKVVSPNQSYAPANLPLEHHHHHH
--------------------------------------------------------------------------------
/img/AWS-RoseTTAFold-arch.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/AWS-RoseTTAFold-arch.png -------------------------------------------------------------------------------- /img/AWS-RoseTTAFold-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/AWS-RoseTTAFold-deploy.png -------------------------------------------------------------------------------- /img/LaunchStack.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/LaunchStack.jpg -------------------------------------------------------------------------------- /img/RF_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/RF_workflow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | py3Dmol 3 | boto3 4 | sagemaker 5 | matplotlib 6 | pyyaml -------------------------------------------------------------------------------- /rfutils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/rfutils/__init__.py -------------------------------------------------------------------------------- /rfutils/rfutils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for the AWS-RoseTTAFold notebook. 
3 | """ 4 | 5 | ## Load dependencies 6 | from Bio import SeqIO 7 | import boto3 8 | from datetime import datetime 9 | import json 10 | import matplotlib.pyplot as plt 11 | from matplotlib import colors 12 | import numpy as np 13 | import os 14 | import pandas as pd 15 | import py3Dmol 16 | import yaml 17 | from re import sub 18 | import sagemaker 19 | import string 20 | from string import ascii_uppercase, ascii_lowercase 21 | from time import sleep 22 | import uuid 23 | 24 | # Get service clients 25 | session = boto3.session.Session() 26 | sm_session = sagemaker.session.Session() 27 | region = session.region_name 28 | role = sagemaker.get_execution_role() 29 | s3 = boto3.client("s3", region_name=region) 30 | 31 | pymol_color_list = [ 32 | "#33ff33", 33 | "#00ffff", 34 | "#ff33cc", 35 | "#ffff00", 36 | "#ff9999", 37 | "#e5e5e5", 38 | "#7f7fff", 39 | "#ff7f00", 40 | "#7fff7f", 41 | "#199999", 42 | "#ff007f", 43 | "#ffdd5e", 44 | "#8c3f99", 45 | "#b2b2b2", 46 | "#007fff", 47 | "#c4b200", 48 | "#8cb266", 49 | "#00bfbf", 50 | "#b27f7f", 51 | "#fcd1a5", 52 | "#ff7f7f", 53 | "#ffbfdd", 54 | "#7fffff", 55 | "#ffff7f", 56 | "#00ff7f", 57 | "#337fcc", 58 | "#d8337f", 59 | "#bfff3f", 60 | "#ff7fff", 61 | "#d8d8ff", 62 | "#3fffbf", 63 | "#b78c4c", 64 | "#339933", 65 | "#66b2b2", 66 | "#ba8c84", 67 | "#84bf00", 68 | "#b24c66", 69 | "#7f7f7f", 70 | "#3f3fa5", 71 | "#a5512b", 72 | ] 73 | 74 | pymol_cmap = colors.ListedColormap(pymol_color_list) 75 | alphabet_list = list(ascii_uppercase + ascii_lowercase) 76 | 77 | aatypes = set("ACDEFGHIKLMNPQRSTVWY") 78 | 79 | 80 | def create_job_name(suffix=None): 81 | 82 | """ 83 | Define a simple job identifier 84 | """ 85 | 86 | if suffix == None: 87 | return datetime.utcnow().strftime("%Y%m%dT%H%M%S") 88 | else: 89 | ## Ensure that the suffix conforms to the Batch requirements, (only letters, 90 | ## numbers, hyphens, and underscores are allowed). 91 | suffix = sub("\W", "_", suffix) 92 | return datetime.utcnow().strftime("%Y%m%dT%H%M%S") + "_" + suffix 93 | 94 | 95 | def display_msa(jobId, bucket): 96 | """ 97 | Display the MSA plot in a Jupyter notebook cell 98 | """ 99 | 100 | info = get_batch_job_info(jobId) 101 | 102 | if info["status"] == "SUCCEEDED": 103 | print( 104 | f"Downloading MSA file from s3://{bucket}/{info['jobName']}/{info['jobName']}.msa0.a3m" 105 | ) 106 | s3.download_file( 107 | bucket, 108 | f"{info['jobName']}/{info['jobName']}.msa0.a3m", 109 | "data/alignment.msa", 110 | ) 111 | msa_all = parse_a3m("data/alignment.msa") 112 | plot_msa_info(msa_all) 113 | else: 114 | print( 115 | f"Data prep job {info['jobId']} is in {info['status']} status. Please try again once the job has completed." 
        )


def display_structure(
    jobId,
    bucket,
    color="lDDT",
    show_sidechains=False,
    show_mainchains=False,
    chains=1,
    vmin=0.5,
    vmax=0.9,
):
    """
    Display the predicted structure in a Jupyter notebook cell
    """
    if color not in ["chain", "lDDT", "rainbow"]:
        raise ValueError("Color must be 'lDDT' (default), 'chain', or 'rainbow'")

    info = get_batch_job_info(jobId)

    if info["status"] == "SUCCEEDED":
        print(
            f"Downloading PDB file from s3://{bucket}/{info['jobName']}/{info['jobName']}.e2e.pdb"
        )
        s3.download_file(
            bucket, f"{info['jobName']}/{info['jobName']}.e2e.pdb", "data/e2e.pdb"
        )
        plot_pdb(
            "data/e2e.pdb",
            show_sidechains=show_sidechains,
            show_mainchains=show_mainchains,
            color=color,
            chains=chains,
            vmin=vmin,
            vmax=vmax,
        ).show()
        if color == "lDDT":
            plot_plddt_legend().show()
    else:
        print(
            f"{info['jobId']} is in {info['status']} status. Please try again once the job has completed."
        )


def get_batch_job_info(jobId):

    """
    Retrieve and format information about a batch job.
    """

    client = boto3.client("batch")
    job_description = client.describe_jobs(jobs=[jobId])

    output = {
        "jobArn": job_description["jobs"][0]["jobArn"],
        "jobName": job_description["jobs"][0]["jobName"],
        "jobId": job_description["jobs"][0]["jobId"],
        "status": job_description["jobs"][0]["status"],
        "createdAt": datetime.utcfromtimestamp(
            job_description["jobs"][0]["createdAt"] / 1000
        ).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "dependsOn": job_description["jobs"][0]["dependsOn"],
        "tags": job_description["jobs"][0]["tags"],
    }

    if output["status"] in ["STARTING", "RUNNING", "SUCCEEDED", "FAILED"]:
        output["logStreamName"] = job_description["jobs"][0]["container"][
            "logStreamName"
        ]
    return output


def get_batch_logs(logStreamName):

    """
    Retrieve and format logs for batch job.
    """

    client = boto3.client("logs")
    try:
        response = client.get_log_events(
            logGroupName="/aws/batch/job", logStreamName=logStreamName
        )
    except client.exceptions.ResourceNotFoundException:
        return f"Log stream {logStreamName} does not exist. 
Please try again in a few minutes" 202 | 203 | logs = pd.DataFrame.from_dict(response["events"]) 204 | logs.timestamp = logs.timestamp.transform( 205 | lambda x: datetime.fromtimestamp(x / 1000) 206 | ) 207 | logs.drop("ingestionTime", axis=1, inplace=True) 208 | return logs 209 | 210 | 211 | def get_rf_job_info( 212 | cpu_queue="AWS-RoseTTAFold-CPU", gpu_queue="AWS-RoseTTAFold-GPU", hrs_in_past=1 213 | ): 214 | 215 | """ 216 | Display information about recent AWS-RoseTTAFold jobs 217 | """ 218 | from datetime import datetime 219 | 220 | batch_client = boto3.client("batch") 221 | recent_jobs = list_recent_jobs([cpu_queue, gpu_queue], hrs_in_past) 222 | recent_job_df = pd.DataFrame.from_dict(recent_jobs) 223 | list_of_lists = [] 224 | if len(recent_job_df) > 0: 225 | detail_list = batch_client.describe_jobs(jobs=recent_job_df.jobId.to_list()) 226 | for job in detail_list["jobs"]: 227 | resource_dict = {} 228 | for resource in job["container"]["resourceRequirements"]: 229 | resource_dict[resource["type"]] = resource["value"] 230 | row = [ 231 | job["jobName"], 232 | job["jobId"], 233 | job["jobQueue"], 234 | job["status"], 235 | datetime.fromtimestamp(job["createdAt"] / 1000), 236 | datetime.fromtimestamp(job["startedAt"] / 1000) 237 | if "startedAt" in job 238 | else "NaT", 239 | datetime.fromtimestamp(job["stoppedAt"] / 1000) 240 | if "stoppedAt" in job 241 | else "NaT", 242 | str( 243 | datetime.fromtimestamp(job["stoppedAt"] / 1000) 244 | - datetime.fromtimestamp(job["startedAt"] / 1000) 245 | ) 246 | if "startedAt" in job and "stoppedAt" in job 247 | else "NaN", 248 | (job["stoppedAt"] / 1000) - (job["startedAt"] / 1000) 249 | if "startedAt" in job and "stoppedAt" in job 250 | else "NaN", 251 | job["jobDefinition"], 252 | job["container"]["logStreamName"] 253 | if "logStreamName" in job["container"] 254 | else "", 255 | int(resource_dict["VCPU"]), 256 | int(float(resource_dict["MEMORY"]) / 1000), 257 | int(resource_dict["GPU"]) if "GPU" in resource_dict else 0, 258 | ] 259 | list_of_lists.append(row) 260 | 261 | return pd.DataFrame( 262 | list_of_lists, 263 | columns=[ 264 | "jobName", 265 | "jobId", 266 | "jobQueue", 267 | "status", 268 | "createdAt", 269 | "startedAt", 270 | "stoppedAt", 271 | "duration", 272 | "duration_sec", 273 | "jobDefinition", 274 | "logStreamName", 275 | "vCPUs", 276 | "mem_GB", 277 | "GPUs", 278 | ], 279 | ).sort_values(by="jobName", ascending=False) 280 | 281 | 282 | def get_rf_job_metrics(job_name, bucket, region="us-east-1"): 283 | """ 284 | Retrieve RF job metrics from the metrics.yaml file 285 | """ 286 | 287 | s3.download_file( 288 | bucket, 289 | f"{job_name}/metrics.yaml", 290 | "data/metrics.yaml", 291 | ) 292 | 293 | with open("data/metrics.yaml", "r") as stream: 294 | try: 295 | metrics = yaml.safe_load(stream) 296 | except yaml.YAMLError as exc: 297 | print(exc) 298 | 299 | return metrics 300 | 301 | 302 | def get_rosettafold_batch_resources(region="us-east-1"): 303 | """ 304 | Retrieve a list of batch job definitions and queues created as part of an 305 | AWS-RoseTTAFold stack. 
306 | """ 307 | batch = boto3.client("batch", region_name=region) 308 | 309 | job_definition_response = batch.describe_job_definitions() 310 | list_of_lists = [] 311 | 312 | job_list = [] 313 | for jd in job_definition_response["jobDefinitions"]: 314 | if jd["status"] == "ACTIVE" and "aws-rosettafold" in jd["jobDefinitionName"]: 315 | name_split = jd["jobDefinitionName"].split("-") 316 | entry = { 317 | "stackId": name_split[5], 318 | "dataPrepJobDefinition": jd["jobDefinitionName"], 319 | } 320 | row = [ 321 | name_split[5], 322 | name_split[4], 323 | "Job Definition", 324 | jd["jobDefinitionName"], 325 | ] 326 | job_list.append(row) 327 | 328 | job_queue_response = batch.describe_job_queues() 329 | jq_list = [] 330 | for jq in job_queue_response["jobQueues"]: 331 | if ( 332 | jq["state"] == "ENABLED" 333 | and jq["status"] == "VALID" 334 | and "aws-rosettafold-queue" in jq["jobQueueName"] 335 | ): 336 | name_split = jq["jobQueueName"].split("-") 337 | row = [name_split[4], name_split[3], "Job Queue", jq["jobQueueName"]] 338 | job_list.append(row) 339 | 340 | df = pd.DataFrame( 341 | job_list, 342 | columns=["stackId", "instanceType", "resourceType", "resourceName"], 343 | ).sort_values(by=["stackId", "instanceType"], ascending=False) 344 | df["type"] = df["instanceType"] + df["resourceType"] 345 | df = df.pivot(index="stackId", columns="type", values=["resourceName"]) 346 | df.columns = df.columns.get_level_values(1) 347 | df = df.rename( 348 | columns={ 349 | "cpudataprepJob Definition": "CPUDataPrepJobDefinition", 350 | "cpuJob Queue": "CPUJobQueue", 351 | "cpupredictJob Definition": "CPUPredictJobDefinition", 352 | "gpupredictJob Definition": "GPUPredictJobDefinition", 353 | "gpuJob Queue": "GPUJobQueue", 354 | } 355 | ) 356 | return df 357 | 358 | 359 | def list_recent_jobs(job_queues, hrs_in_past=1): 360 | 361 | """ 362 | Display recently-submitted jobs. 363 | """ 364 | 365 | batch_client = boto3.client("batch") 366 | result = [] 367 | for queue in job_queues: 368 | recent_queue_jobs = batch_client.list_jobs( 369 | jobQueue=queue, 370 | filters=[ 371 | { 372 | "name": "AFTER_CREATED_AT", 373 | "values": [ 374 | str(round(datetime.now().timestamp()) - (hrs_in_past * 3600)) 375 | ], 376 | } 377 | ], 378 | ) 379 | result = result + recent_queue_jobs["jobSummaryList"] 380 | 381 | return result 382 | 383 | 384 | def parse_a3m(filename): 385 | 386 | """ 387 | Read A3M and convert letters into integers in the 0..20 range, 388 | Copied from https://github.com/RosettaCommons/RoseTTAFold/blob/main/network/parsers.py 389 | """ 390 | 391 | msa = [] 392 | table = str.maketrans(dict.fromkeys(string.ascii_lowercase)) 393 | # read file line by line 394 | for line in open(filename, "r"): 395 | # skip labels 396 | if line[0] == ">": 397 | continue 398 | # remove right whitespaces 399 | line = line.rstrip() 400 | # remove lowercase letters and append to MSA 401 | msa.append(line.translate(table)) 402 | # convert letters into numbers 403 | alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype="|S1").view(np.uint8) 404 | msa = np.array([list(s) for s in msa], dtype="|S1").view(np.uint8) 405 | for i in range(alphabet.shape[0]): 406 | msa[msa == alphabet[i]] = i 407 | # treat all unknown characters as gaps 408 | msa[msa > 20] = 20 409 | return msa 410 | 411 | 412 | def read_pdb_renum(pdb_filename, Ls=None): 413 | 414 | """ 415 | Process pdb file. 
416 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 417 | """ 418 | 419 | if Ls is not None: 420 | L_init = 0 421 | new_chain = {} 422 | for L, c in zip(Ls, alphabet_list): 423 | new_chain.update({i: c for i in range(L_init, L_init + L)}) 424 | L_init += L 425 | n, pdb_out = 1, [] 426 | resnum_, chain_ = 1, "A" 427 | for line in open(pdb_filename, "r"): 428 | if line[:4] == "ATOM": 429 | chain = line[21:22] 430 | resnum = int(line[22 : 22 + 5]) 431 | if resnum != resnum_ or chain != chain_: 432 | resnum_, chain_ = resnum, chain 433 | n += 1 434 | if Ls is None: 435 | pdb_out.append("%s%4i%s" % (line[:22], n, line[26:])) 436 | else: 437 | pdb_out.append( 438 | "%s%s%4i%s" % (line[:21], new_chain[n - 1], n, line[26:]) 439 | ) 440 | return "".join(pdb_out) 441 | 442 | 443 | def plot_msa_info(msa): 444 | 445 | """ 446 | Plot a representation of the MSA coverage. 447 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 448 | """ 449 | 450 | msa_arr = np.unique(msa, axis=0) 451 | total_msa_size = len(msa_arr) 452 | print(f"\n{total_msa_size} Sequences Found in Total\n") 453 | 454 | if total_msa_size > 1: 455 | plt.figure(figsize=(8, 5), dpi=100) 456 | plt.title("Sequence coverage") 457 | seqid = (msa[0] == msa_arr).mean(-1) 458 | seqid_sort = seqid.argsort() 459 | non_gaps = (msa_arr != 20).astype(float) 460 | non_gaps[non_gaps == 0] = np.nan 461 | plt.imshow( 462 | non_gaps[seqid_sort] * seqid[seqid_sort, None], 463 | interpolation="nearest", 464 | aspect="auto", 465 | cmap="rainbow_r", 466 | vmin=0, 467 | vmax=1, 468 | origin="lower", 469 | extent=(0, msa_arr.shape[1], 0, msa_arr.shape[0]), 470 | ) 471 | plt.plot((msa_arr != 20).sum(0), color="black") 472 | plt.xlim(0, msa_arr.shape[1]) 473 | plt.ylim(0, msa_arr.shape[0]) 474 | plt.colorbar( 475 | label="Sequence identity to query", 476 | ) 477 | plt.xlabel("Positions") 478 | plt.ylabel("Sequences") 479 | plt.show() 480 | else: 481 | print("Unable to display MSA of length 1") 482 | 483 | 484 | def plot_pdb( 485 | pred_output_path, 486 | show_sidechains=False, 487 | show_mainchains=False, 488 | color="lDDT", 489 | chains=None, 490 | Ls=None, 491 | vmin=0.5, 492 | vmax=0.9, 493 | color_HP=False, 494 | size=(800, 480), 495 | ): 496 | 497 | """ 498 | Create a 3D view of a pdb structure 499 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 500 | """ 501 | 502 | if chains is None: 503 | chains = 1 if Ls is None else len(Ls) 504 | 505 | view = py3Dmol.view( 506 | js="https://3dmol.org/build/3Dmol.js", width=size[0], height=size[1] 507 | ) 508 | view.addModel(read_pdb_renum(pred_output_path, Ls), "pdb") 509 | if color == "lDDT": 510 | view.setStyle( 511 | { 512 | "cartoon": { 513 | "colorscheme": { 514 | "prop": "b", 515 | "gradient": "roygb", 516 | "min": vmin, 517 | "max": vmax, 518 | } 519 | } 520 | } 521 | ) 522 | elif color == "rainbow": 523 | view.setStyle({"cartoon": {"color": "spectrum"}}) 524 | elif color == "chain": 525 | for n, chain, color in zip(range(chains), alphabet_list, pymol_color_list): 526 | view.setStyle({"chain": chain}, {"cartoon": {"color": color}}) 527 | if show_sidechains: 528 | BB = ["C", "O", "N"] 529 | HP = [ 530 | "ALA", 531 | "GLY", 532 | "VAL", 533 | "ILE", 534 | "LEU", 535 | "PHE", 536 | "MET", 537 | "PRO", 538 | "TRP", 539 | "CYS", 540 | "TYR", 541 | ] 542 | if color_HP: 543 | view.addStyle( 544 | {"and": [{"resn": HP}, {"atom": BB, "invert": True}]}, 545 | {"stick": {"colorscheme": "yellowCarbon", "radius": 0.3}}, 546 | ) 547 
| view.addStyle( 548 | {"and": [{"resn": HP, "invert": True}, {"atom": BB, "invert": True}]}, 549 | {"stick": {"colorscheme": "whiteCarbon", "radius": 0.3}}, 550 | ) 551 | view.addStyle( 552 | {"and": [{"resn": "GLY"}, {"atom": "CA"}]}, 553 | {"sphere": {"colorscheme": "yellowCarbon", "radius": 0.3}}, 554 | ) 555 | view.addStyle( 556 | {"and": [{"resn": "PRO"}, {"atom": ["C", "O"], "invert": True}]}, 557 | {"stick": {"colorscheme": "yellowCarbon", "radius": 0.3}}, 558 | ) 559 | else: 560 | view.addStyle( 561 | { 562 | "and": [ 563 | {"resn": ["GLY", "PRO"], "invert": True}, 564 | {"atom": BB, "invert": True}, 565 | ] 566 | }, 567 | {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}, 568 | ) 569 | view.addStyle( 570 | {"and": [{"resn": "GLY"}, {"atom": "CA"}]}, 571 | {"sphere": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}, 572 | ) 573 | view.addStyle( 574 | {"and": [{"resn": "PRO"}, {"atom": ["C", "O"], "invert": True}]}, 575 | {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}, 576 | ) 577 | if show_mainchains: 578 | BB = ["C", "O", "N", "CA"] 579 | view.addStyle( 580 | {"atom": BB}, {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}} 581 | ) 582 | view.zoomTo() 583 | return view 584 | 585 | 586 | def plot_plddt_legend(dpi=100): 587 | 588 | """ 589 | Create 3D Plot legend 590 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 591 | """ 592 | 593 | thresh = [ 594 | "plDDT:", 595 | "Very low (<50)", 596 | "Low (60)", 597 | "OK (70)", 598 | "Confident (80)", 599 | "Very high (>90)", 600 | ] 601 | plt.figure(figsize=(1, 0.1), dpi=dpi) 602 | ######################################## 603 | for c in ["#FFFFFF", "#FF0000", "#FFFF00", "#00FF00", "#00FFFF", "#0000FF"]: 604 | plt.bar(0, 0, color=c) 605 | plt.legend( 606 | thresh, 607 | frameon=False, 608 | loc="center", 609 | ncol=6, 610 | handletextpad=1, 611 | columnspacing=1, 612 | markerscale=0.5, 613 | ) 614 | plt.axis(False) 615 | return plt 616 | 617 | 618 | def submit_2_step_job( 619 | bucket=sm_session.default_bucket(), 620 | job_name=uuid.uuid4(), 621 | data_prep_input_file="input.fa", 622 | data_prep_job_definition="AWS-RoseTTAFold-CPU", 623 | data_prep_queue="AWS-RoseTTAFold-CPU", 624 | data_prep_cpu=8, 625 | data_prep_mem=32, 626 | predict_job_definition="AWS-RoseTTAFold-GPU", 627 | predict_queue="AWS-RoseTTAFold-GPU", 628 | predict_cpu=4, 629 | predict_mem=16, 630 | predict_gpu=True, 631 | db_path="/fsx/aws-rosettafold-ref-data", 632 | weights_path="/fsx/aws-rosettafold-ref-data", 633 | ): 634 | 635 | """ 636 | Submit a 2-step RoseTTAFold prediction job to AWS Batch. 
637 | """ 638 | 639 | working_folder = f"s3://{bucket}/{job_name}" 640 | batch_client = boto3.client("batch") 641 | output_pdb_uri = f"{working_folder}/{job_name}.e2e.pdb" 642 | 643 | data_prep_response = submit_rf_data_prep_job( 644 | bucket=bucket, 645 | job_name=job_name, 646 | input_file=data_prep_input_file, 647 | job_definition=data_prep_job_definition, 648 | job_queue=data_prep_queue, 649 | cpu=data_prep_cpu, 650 | mem=data_prep_mem, 651 | db_path=db_path, 652 | ) 653 | 654 | predict_response = submit_rf_predict_job( 655 | bucket=bucket, 656 | job_name=job_name, 657 | job_definition=predict_job_definition, 658 | job_queue=predict_queue, 659 | cpu=predict_cpu, 660 | mem=predict_mem, 661 | gpu=predict_gpu, 662 | db_path=db_path, 663 | weights_path=weights_path, 664 | depends_on=data_prep_response["jobId"], 665 | ) 666 | 667 | print( 668 | f"Data prep job ID {data_prep_response['jobId']} and predict job ID {predict_response['jobId']} submitted" 669 | ) 670 | return [data_prep_response, predict_response] 671 | 672 | 673 | def submit_rf_data_prep_job( 674 | bucket=sm_session.default_bucket(), 675 | job_name=uuid.uuid4(), 676 | input_file="input.fa", 677 | job_definition="AWS-RoseTTAFold-CPU", 678 | job_queue="AWS-RoseTTAFold-CPU", 679 | cpu=8, 680 | mem=32, 681 | db_path="/fsx/aws-rosettafold-ref-data", 682 | ): 683 | 684 | """ 685 | Submit a RoseTTAFold data prep job (i.e. the first half of the e2e workflow) to AWS Batch. 686 | """ 687 | 688 | working_folder = f"s3://{bucket}/{job_name}" 689 | batch_client = boto3.client("batch") 690 | output_msa_uri = f"{working_folder}/{job_name}.msa0.a3m" 691 | output_hhr_uri = f"{working_folder}/{job_name}.hhr" 692 | output_atab_uri = f"{working_folder}/{job_name}.atab" 693 | 694 | response = batch_client.submit_job( 695 | jobDefinition=job_definition, 696 | jobName=str(job_name), 697 | jobQueue=job_queue, 698 | containerOverrides={ 699 | "command": [ 700 | "/bin/bash", 701 | "run_aws_data_prep_ver.sh", 702 | "-i", 703 | working_folder, 704 | "-n", 705 | input_file, 706 | "-o", 707 | working_folder, 708 | "-p", 709 | job_name, 710 | "-w", 711 | "/work", 712 | "-d", 713 | db_path, 714 | "-c", 715 | str(cpu), 716 | "-m", 717 | str(mem), 718 | ], 719 | "resourceRequirements": [ 720 | {"value": str(cpu), "type": "VCPU"}, 721 | {"value": str(mem * 1000), "type": "MEMORY"}, 722 | ], 723 | }, 724 | tags={ 725 | "output_msa_uri": output_msa_uri, 726 | "output_hhr_uri": output_hhr_uri, 727 | "output_atab_uri": output_atab_uri, 728 | }, 729 | ) 730 | print(f"Job ID {response['jobId']} submitted") 731 | return response 732 | 733 | 734 | def submit_rf_predict_job( 735 | bucket=sm_session.default_bucket(), 736 | job_name=uuid.uuid4(), 737 | job_definition="AWS-RoseTTAFold-GPU", 738 | job_queue="AWS-RoseTTAFold-GPU", 739 | cpu=4, 740 | mem=16, 741 | gpu=True, 742 | db_path="/fsx/aws-rosettafold-ref-data", 743 | weights_path="/fsx/aws-rosettafold-ref-data", 744 | depends_on="", 745 | ): 746 | 747 | """ 748 | Submit a RoseTTAFold prediction job (i.e. the second half of the e2e workflow) to AWS Batch. 
    """

    working_folder = f"s3://{bucket}/{job_name}"
    batch_client = boto3.client("batch")
    output_pdb_uri = f"{working_folder}/{job_name}.e2e.pdb"

    container_overrides = {
        "command": [
            "/bin/bash",
            "run_aws_predict_ver.sh",
            "-i",
            working_folder,
            "-o",
            working_folder,
            "-p",
            job_name,
            "-w",
            "/work",
            "-d",
            db_path,
            "-x",
            weights_path,
            "-c",
            str(cpu),
            "-m",
            str(mem),
        ],
        "resourceRequirements": [
            {"value": str(cpu), "type": "VCPU"},
            {"value": str(mem * 1000), "type": "MEMORY"},
        ],
    }

    if gpu:
        container_overrides["resourceRequirements"].append(
            {"value": "1", "type": "GPU"}
        )

    response = batch_client.submit_job(
        jobDefinition=job_definition,
        jobName=str(job_name),
        jobQueue=job_queue,
        # Skip the dependency when none is supplied.
        dependsOn=[{"jobId": depends_on, "type": "SEQUENTIAL"}] if depends_on else [],
        containerOverrides=container_overrides,
        tags={"output_pdb_uri": output_pdb_uri},
    )
    print(f"Job ID {response['jobId']} submitted")
    return response


def upload_fasta_to_s3(
    record, bucket=sm_session.default_bucket(), job_name=uuid.uuid4()
):

    """
    Create a fasta file and upload it to S3.
    """

    s3 = boto3.client("s3", region_name=region)
    file_out = "_tmp.fasta"
    with open(file_out, "w") as f_out:
        SeqIO.write(record, f_out, "fasta")
    object_name = f"{job_name}/input.fa"
    response = s3.upload_file(file_out, bucket, object_name)
    os.remove(file_out)
    s3_uri = f"s3://{bucket}/{object_name}"
    print(f"Sequence file uploaded to {s3_uri}")
    return s3_uri


def wait_for_job_start(jobId, pause=30):

    """
    Pause while a job transitions into a running state.
    """

    status = get_batch_job_info(jobId)["status"]
    print(status)
    while get_batch_job_info(jobId)["status"] in [
        "SUBMITTED",
        "PENDING",
        "RUNNABLE",
        "STARTING",
    ]:
        sleep(pause)
        new_status = get_batch_job_info(jobId)["status"]
        if new_status != status:
            print("\n" + new_status)
        else:
            print(".", end="")
        status = new_status
--------------------------------------------------------------------------------
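Taken together, the helpers above implement the notebook workflow end to end. A minimal usage sketch; the queue and job definition names in angle brackets are placeholders, with the real values available from the CloudFormation stack outputs or get_rosettafold_batch_resources():

from Bio import SeqIO
from rfutils import rfutils

record = next(SeqIO.parse("data/T1078.fa", "fasta"))
job_name = rfutils.create_job_name("T1078")
rfutils.upload_fasta_to_s3(record, job_name=job_name)

# Names in angle brackets are placeholders from the CloudFormation outputs.
data_prep, predict = rfutils.submit_2_step_job(
    job_name=job_name,
    data_prep_job_definition="<CPUDataPrepJobDefinition>",
    data_prep_queue="<CPUJobQueueName>",
    predict_job_definition="<GPUPredictJobDefinition>",
    predict_queue="<GPUJobQueueName>",
)
rfutils.wait_for_job_start(data_prep["jobId"])
# Once the predict job reports SUCCEEDED:
# rfutils.display_structure(predict["jobId"], bucket=rfutils.sm_session.default_bucket())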