├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── AWS-RoseTTAFold.ipynb
├── CASP14-Analysis.ipynb
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── config
│   ├── Dockerfile
│   ├── cfn.yaml
│   ├── container_buildspec.yml
│   ├── download_ref_data.sh
│   ├── run_aws_data_prep_ver.sh
│   └── run_aws_predict_ver.sh
├── data
│   ├── T1028.fa
│   ├── T1036s1.fa
│   └── T1078.fa
├── img
│   ├── AWS-RoseTTAFold-arch.png
│   ├── AWS-RoseTTAFold-deploy.png
│   ├── LaunchStack.jpg
│   └── RF_workflow.png
├── requirements.txt
└── rfutils
    ├── __init__.py
    └── rfutils.py

/.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '29 9 * * 2' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode* 2 | .vscode/* 3 | venv* 4 | venv/* 5 | load_testing.ipynb 6 | plotting.ipynb 7 | job_names.txt 8 | data/*.csv 9 | data/*.yaml -------------------------------------------------------------------------------- /AWS-RoseTTAFold.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AWS-RoseTTAFold" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## I. Introduction" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "This notebook runs the [RoseTTAFold](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/) algorithm developed by Minkyung Baek et al. and described in [M. Baek et al., Science \n", 22 | "10.1126/science.abj8754 2021](https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf) on AWS." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "\"RoseTTAFold" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "The AWS workflow depends on a Batch compute environment." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "\"AWS-RoseTTAFold" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## II. Environment setup" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "## Install dependencies\n", 60 | "%pip install -q -q -r requirements.txt" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "## Import helper functions at rfutils/rfutils.py\n", 70 | "from rfutils import rfutils\n", 71 | "\n", 72 | "## Load additional dependencies\n", 73 | "from Bio import SeqIO\n", 74 | "from Bio.Seq import Seq\n", 75 | "from Bio.SeqRecord import SeqRecord\n", 76 | "import boto3\n", 77 | "import glob\n", 78 | "import json\n", 79 | "import pandas as pd\n", 80 | "import sagemaker\n", 81 | "\n", 82 | "pd.set_option(\"max_colwidth\", None)\n", 83 | "\n", 84 | "# Get service clients\n", 85 | "session = boto3.session.Session()\n", 86 | "sm_session = sagemaker.session.Session()\n", 87 | "region = session.region_name\n", 88 | "role = sagemaker.get_execution_role()\n", 89 | "s3 = boto3.client(\"s3\", region_name=region)\n", 90 | "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", 91 | "\n", 92 | "bucket = sm_session.default_bucket()\n", 93 | "print(f\"S3 bucket name is {bucket}\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## III. 
Input Protein Sequence" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Enter a protein sequence manually" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "seq = SeqRecord(\n", 117 | " Seq(\"MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\"),\n", 118 | " id=\"YP_025292.1\",\n", 119 | " name=\"HokC\",\n", 120 | " description=\"toxic membrane protein, small\",\n", 121 | ")" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "Or provide the path to a fasta file" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "seq = SeqIO.read(\"data/T1078.fa\", \"fasta\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "print(f\"Protein sequence for analysis is \\n{seq}\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## IV. Submit RoseTTAFold Jobs" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Generate Job Name" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "job_name = rfutils.create_job_name(seq.id)\n", 170 | "print(f\"Automatically-generated job name is: {job_name}\")" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### Upload fasta file to S3" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "input_uri = rfutils.upload_fasta_to_s3(seq, bucket, job_name)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Submit jobs to AWS Batch queues" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Select the job definitions and Batch queues for your job." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "batch_resources = rfutils.get_rosettafold_batch_resources(region=region)\n", 210 | "\n", 211 | "cpu_queue = batch_resources[\"CPUJobQueue\"][0]\n", 212 | "gpu_queue = batch_resources[\"GPUJobQueue\"][0]\n", 213 | "cpu_data_prep_job_def = batch_resources[\"CPUDataPrepJobDefinition\"][0]\n", 214 | "cpu_predict_job_def = batch_resources[\"CPUPredictJobDefinition\"][0]\n", 215 | "gpu_predict_job_def = batch_resources[\"GPUPredictJobDefinition\"][0]\n", 216 | "\n", 217 | "batch_resources" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "Because our test sequence is small (less than 400 residues) we will run the prediction step on a GPU to decrease the job duration from hours to minutes." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "two_step_response = rfutils.submit_2_step_job(\n", 234 | " bucket=bucket,\n", 235 | " job_name=job_name,\n", 236 | " data_prep_job_definition=cpu_data_prep_job_def,\n", 237 | " data_prep_queue=cpu_queue,\n", 238 | " data_prep_cpu=8,\n", 239 | " data_prep_mem=32,\n", 240 | " predict_job_definition=gpu_predict_job_def, # Change this to the cpu_predict_job_def for large proteins\n", 241 | " predict_queue=gpu_queue, # Change this to the cpu_queue for large proteins\n", 242 | " predict_cpu=4,\n", 243 | " predict_mem=16,\n", 244 | " predict_gpu=True, # Change this to False for large proteins\n", 245 | ")\n", 246 | "data_prep_jobId = two_step_response[0][\"jobId\"]\n", 247 | "predict_jobId = two_step_response[1][\"jobId\"]" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## V. Check Status of Data Prep and Prediction Jobs" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "rfutils.get_rf_job_info(\n", 264 | " cpu_queue,\n", 265 | " gpu_queue,\n", 266 | " hrs_in_past=1,\n", 267 | ")" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## VI. View Data Prep Results" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Pause while the data prep job starts up" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "rfutils.wait_for_job_start(data_prep_jobId)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "Get logs for data prep job (Run this multiple times to see how the job progresses)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "data_prep_logStreamName = rfutils.get_batch_job_info(data_prep_jobId)[\"logStreamName\"]\n", 307 | "rfutils.get_batch_logs(data_prep_logStreamName).tail(n=5)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Retrieve and Display Multiple Sequence Alignment (MSA) Results" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "rfutils.display_msa(data_prep_jobId, bucket)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "## VII. 
View Prediction Results" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "Pause while the predict job starts up" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "rfutils.wait_for_job_start(predict_jobId)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "Get logs for prediction job (Run this multiple times to see how the job progresses)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "predict_logStreamName = rfutils.get_batch_job_info(predict_jobId)[\"logStreamName\"]\n", 363 | "rfutils.get_batch_logs(predict_logStreamName).tail(n=5)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## VIII. View Job Metrics" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)\n", 380 | "\n", 381 | "print(f'Number of sequences in MSA: {metrics[\"DATA_PREP\"][\"MSA_COUNT\"]}')\n", 382 | "print(f'Number of templates: {metrics[\"DATA_PREP\"][\"TEMPLATE_COUNT\"]}')\n", 383 | "print(f'MSA duration (sec): {metrics[\"DATA_PREP\"][\"MSA_DURATION\"]}')\n", 384 | "print(f'SS duration (sec): {metrics[\"DATA_PREP\"][\"SS_DURATION\"]}')\n", 385 | "print(f'Template search duration (sec): {metrics[\"DATA_PREP\"][\"TEMPLATE_DURATION\"]}')\n", 386 | "print(\n", 387 | " f'Total data prep duration (sec): {metrics[\"DATA_PREP\"][\"TOTAL_DATA_PREP_DURATION\"]}'\n", 388 | ")\n", 389 | "print(f'Total predict duration (sec): {metrics[\"PREDICT\"][\"TOTAL_PREDICT_DURATION\"]}')" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "## IX. Retrieve and Display Predicted Structure" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "rfutils.display_structure(predict_jobId, bucket, vmin=0.5, vmax=0.9)" 406 | ] 407 | } 408 | ], 409 | "metadata": { 410 | "instance_type": "ml.t3.medium", 411 | "interpreter": { 412 | "hash": "8ad3a54da4d511af1a5c2549d8f1b22d83bfd1079fb699a3f5552b91d143b102" 413 | }, 414 | "kernelspec": { 415 | "display_name": "Python 3 (Data Science)", 416 | "language": "python", 417 | "name": "python3" 418 | }, 419 | "language_info": { 420 | "codemirror_mode": { 421 | "name": "ipython", 422 | "version": 3 423 | }, 424 | "file_extension": ".py", 425 | "mimetype": "text/x-python", 426 | "name": "python", 427 | "nbconvert_exporter": "python", 428 | "pygments_lexer": "ipython3", 429 | "version": "3.8.9" 430 | } 431 | }, 432 | "nbformat": 4, 433 | "nbformat_minor": 4 434 | } 435 | -------------------------------------------------------------------------------- /CASP14-Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AWS-RoseTTAFold: Bulk Job Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## I. 
Introduction" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "This notebook demonstrates how to analyze multiple proteins simultaneously, in this case a subset of the CASP14 target set." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## II. Environment setup" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "## Install dependencies\n", 38 | "%pip install -q -q -r requirements.txt" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "## Import helper functions at rfutils/rfutils.py\n", 48 | "from rfutils import rfutils\n", 49 | "\n", 50 | "## Load additional dependencies\n", 51 | "from Bio import SeqIO\n", 52 | "from Bio.Seq import Seq\n", 53 | "from Bio.SeqRecord import SeqRecord\n", 54 | "import boto3\n", 55 | "import glob\n", 56 | "import json\n", 57 | "from IPython.display import display\n", 58 | "import pandas as pd\n", 59 | "import sagemaker\n", 60 | "\n", 61 | "pd.set_option(\"max_colwidth\", None)\n", 62 | "\n", 63 | "# Get service clients\n", 64 | "session = boto3.session.Session()\n", 65 | "sm_session = sagemaker.session.Session()\n", 66 | "region = session.region_name\n", 67 | "role = sagemaker.get_execution_role()\n", 68 | "s3 = boto3.client(\"s3\", region_name=region)\n", 69 | "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n", 70 | "\n", 71 | "bucket = sm_session.default_bucket()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## III. Input Protein Sequence" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Download and process CASP14 sequences" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "!wget \"https://predictioncenter.org/download_area/CASP14/sequences/casp14.seq.txt\" -O \"data/casp14.fa\"\n", 95 | "!sed '137,138d' \"data/casp14.fa\" > \"data/casp14_dedup.fa\" # Remove duplicate entry for T1085\n", 96 | "\n", 97 | "casp14_iterator = SeqIO.parse(\"data/casp14_dedup.fa\", \"fasta\")\n", 98 | "casp14_df = pd.DataFrame(\n", 99 | " (\n", 100 | " (record.id, record.description, len(record), record.seq)\n", 101 | " for record in casp14_iterator\n", 102 | " ),\n", 103 | " columns=[\"id\", \"description\", \"length\", \"seq\"],\n", 104 | ").sort_values(by=\"length\")\n", 105 | "!rm data/casp14*" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Display information about CASP14 proteins" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "with pd.option_context(\"display.max_rows\", None):\n", 122 | " display(casp14_df.loc[:, (\"id\", \"description\")])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Plot distribution of the protein lengths" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import matplotlib.pyplot as plt\n", 139 | "\n", 140 | "fig, ax = plt.subplots()\n", 141 | "plt.hist(casp14_df.length, bins=50)\n", 142 | "plt.ylabel(\"Sample count\")\n", 143 | "plt.xlabel(\"Residue 
count\")\n", 144 | "plt.title(\"CASP-14 Protein Length Distribution\")\n", 145 | "plt.show()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Get the names of the AWS Batch resources deployed in your account." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "batch_resources = rfutils.get_rosettafold_batch_resources(region=region)\n", 162 | "\n", 163 | "cpu_queue = batch_resources[\"CPUJobQueue\"][0]\n", 164 | "gpu_queue = batch_resources[\"GPUJobQueue\"][0]\n", 165 | "cpu_data_prep_job_def = batch_resources[\"CPUDataPrepJobDefinition\"][0]\n", 166 | "cpu_predict_job_def = batch_resources[\"CPUPredictJobDefinition\"][0]\n", 167 | "gpu_predict_job_def = batch_resources[\"GPUPredictJobDefinition\"][0]\n", 168 | "\n", 169 | "batch_resources" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Submit analysis jobs for a subset of CASP14 proteins" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "protein_count = 84 # Change this to analyze a smaller number of CASP14 targets\n", 186 | "job_name_list = []\n", 187 | "\n", 188 | "for row in casp14_df[:protein_count].itertuples(index=False):\n", 189 | " record = SeqRecord(row.seq, id=row.id, description=row.description)\n", 190 | " print(f\"Protein sequence for analysis is \\n{record.description}\")\n", 191 | " sequence_length = len(record.seq)\n", 192 | " print(f\"Sequence length is {sequence_length}\")\n", 193 | "\n", 194 | " if sequence_length < 400:\n", 195 | " prep_cpu = 8\n", 196 | " prep_mem = 32\n", 197 | " predict_cpu = 4\n", 198 | " predict_mem = 16\n", 199 | " predict_gpu = True\n", 200 | " predict_job_definition = gpu_predict_job_def\n", 201 | " predict_queue = gpu_queue\n", 202 | " else:\n", 203 | " prep_cpu = 8\n", 204 | " prep_mem = 64\n", 205 | " predict_cpu = 4\n", 206 | " predict_mem = 32\n", 207 | " predict_gpu = False\n", 208 | " predict_job_definition = cpu_predict_job_def\n", 209 | " predict_queue = cpu_queue\n", 210 | "\n", 211 | " job_name = rfutils.create_job_name(record.id)\n", 212 | " print(f\"Automatically-generated job name is: {job_name}\")\n", 213 | " job_name_list.append(job_name)\n", 214 | " input_uri = rfutils.upload_fasta_to_s3(record, bucket, job_name)\n", 215 | " two_step_response = rfutils.submit_2_step_job(\n", 216 | " bucket=bucket,\n", 217 | " job_name=job_name,\n", 218 | " data_prep_input_file=\"input.fa\",\n", 219 | " data_prep_job_definition=cpu_data_prep_job_def,\n", 220 | " data_prep_queue=cpu_queue,\n", 221 | " data_prep_cpu=prep_cpu,\n", 222 | " data_prep_mem=prep_mem,\n", 223 | " predict_job_definition=predict_job_definition,\n", 224 | " predict_queue=predict_queue,\n", 225 | " predict_cpu=predict_cpu,\n", 226 | " predict_mem=predict_mem,\n", 227 | " predict_gpu=predict_gpu,\n", 228 | " )" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## IV. 
Check Status of Data Prep and Prediction Jobs" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "rfutils.get_rf_job_info(\n", 245 | " cpu_queue,\n", 246 | " gpu_queue,\n", 247 | " hrs_in_past=1,\n", 248 | ")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "jobs = []\n", 258 | "for job_name in job_name_list:\n", 259 | " metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)\n", 260 | " row = [\n", 261 | " job_name,\n", 262 | " metrics[\"DATA_PREP\"][\"JOB_ID\"],\n", 263 | " metrics[\"DATA_PREP\"][\"CPU\"],\n", 264 | " metrics[\"DATA_PREP\"][\"MEM\"],\n", 265 | " metrics[\"DATA_PREP\"][\"LENGTH\"],\n", 266 | " metrics[\"DATA_PREP\"][\"MSA_COUNT\"],\n", 267 | " metrics[\"DATA_PREP\"][\"TEMPLATE_COUNT\"],\n", 268 | " metrics[\"DATA_PREP\"][\"MSA_DURATION\"],\n", 269 | " metrics[\"DATA_PREP\"][\"SS_DURATION\"],\n", 270 | " metrics[\"DATA_PREP\"][\"TEMPLATE_DURATION\"],\n", 271 | " metrics[\"DATA_PREP\"][\"TOTAL_DATA_PREP_DURATION\"],\n", 272 | " metrics[\"PREDICT\"][\"JOB_ID\"],\n", 273 | " metrics[\"PREDICT\"][\"CPU\"],\n", 274 | " metrics[\"PREDICT\"][\"MEM\"],\n", 275 | " metrics[\"PREDICT\"][\"TOTAL_PREDICT_DURATION\"],\n", 276 | " ]\n", 277 | " jobs.append(row)\n", 278 | "metrics_df = pd.DataFrame(\n", 279 | " jobs,\n", 280 | " columns=[\n", 281 | " \"jobName\",\n", 282 | " \"dataPrepJobID\",\n", 283 | " \"dataPrepCPU\",\n", 284 | " \"dataPrepMEM\",\n", 285 | " \"sequenceLength\",\n", 286 | " \"MSACount\",\n", 287 | " \"templateCount\",\n", 288 | " \"MSADuration\",\n", 289 | " \"SSDuration\",\n", 290 | " \"templateDuration\",\n", 291 | " \"dataPrepDuration\",\n", 292 | " \"predictJobId\",\n", 293 | " \"predictCPU\",\n", 294 | " \"predictMEM\",\n", 295 | " \"predictDuration\",\n", 296 | " ],\n", 297 | ")\n", 298 | "metrics_df.sort_values(by=[\"dataPrepCPU\", \"dataPrepMEM\", \"predictCPU\", \"predictMEM\"])" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "metrics_df.to_csv(\"results.csv\")" 308 | ] 309 | } 310 | ], 311 | "metadata": { 312 | "instance_type": "ml.t3.medium", 313 | "interpreter": { 314 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" 315 | }, 316 | "kernelspec": { 317 | "display_name": "Python 3 (Data Science)", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.8.9" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 4 336 | } 337 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing.
We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS RoseTTAFold 2 | Infrastructure template and Jupyter notebooks for running RoseTTAFold on AWS Batch. 3 | 4 | ## Overview 5 | Proteins are large biomolecules that play an important role in the body. Knowing the physical structure of proteins is key to understanding their function. However, it can be difficult and expensive to determine the structure of many proteins experimentally. One alternative is to predict these structures using machine learning algorithms. Several high-profile research teams have released such algorithms, including [AlphaFold 2](https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology), [RoseTTAFold](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/), and others. Their work was important enough for Science magazine to name it the ["2021 Breakthrough of the Year"](https://www.science.org/content/article/breakthrough-2021). 6 | 7 | Both AlphaFold 2 and RoseTTAFold use a multi-track transformer architecture trained on known protein templates to predict the structure of unknown peptide sequences. These predictions are heavily GPU-dependent and take anywhere from minutes to days to complete. The input features for these predictions include multiple sequence alignment (MSA) data. MSA algorithms are CPU-dependent and can themselves require several hours of processing time. 8 | 9 | Running both the MSA and structure prediction steps in the same computing environment can be cost inefficient, because the expensive GPU resources required for the prediction sit unused while the MSA step runs. Instead, using a high performance computing (HPC) service like [AWS Batch](https://aws.amazon.com/batch/) allows us to run each step as a containerized job with the best fit of CPU, memory, and GPU resources. 10 | 11 | In this post, we demonstrate how to provision and use AWS Batch and other services to run AI-driven protein folding algorithms like RoseTTAFold. 12 | 13 | ## Setup 14 | ### Deploy the infrastructure stack
15 | 1. Choose *Launch Stack*: 16 | 17 | [![Launch Stack](img/LaunchStack.jpg)](https://console.aws.amazon.com/cloudformation/home#/stacks/create/review?templateURL=https://aws-hcls-ml.s3.amazonaws.com/blog_post_support_materials/aws-RoseTTAFold/cfn.yaml) 18 | 19 | 2. For *Stack Name*, enter a value unique to your account and region. 20 | 3. For *StackAvailabilityZone* choose an availability zone. 21 | 4. Select *I acknowledge that AWS CloudFormation might create IAM resources with custom names*. 22 | 5. Choose *Create stack*. 23 | 6. Wait approximately 30 minutes for AWS CloudFormation to create the infrastructure stack and AWS CodeBuild to build and publish the AWS-RoseTTAFold container to Amazon Elastic Container Registry (Amazon ECR). 24 | 25 | ### Load model weights and sequence database files 26 | 27 | *Option 1: Mount the FSx for Lustre file system to an EC2 instance* 28 | 29 | 1. Sign in to the AWS Management Console and open the Amazon EC2 console at [https://console.aws.amazon.com/ec2](https://console.aws.amazon.com/ec2). 30 | 2. In the navigation pane, under *Instances,* select *Launch Templates*. 31 | 3. Choose the *Launch template ID* for your stack, such as `aws-rosettafold-launch-template-stack-id-suffix`. 32 | 4. Choose *Actions, Launch instance from template.* 33 | 5. Launch a new EC2 instance and connect using either SSH or SSM. 34 | 6. Download and extract the network weights and sequence database files to the attached volume at `/fsx/aws-rosettafold-ref-data` according to installation steps 3 and 5 from the [RoseTTAFold public repository](https://github.com/RosettaCommons/RoseTTAFold). 35 | 36 | *Option 2: Lazy-load the data from an S3 data repository* 37 | 38 | 1. Create a new S3 bucket in your region of interest. 39 | 2. Download and extract the network weights and sequence database files as described above and transfer them to your S3 bucket. 40 | 3. Sign in to the AWS Management Console and open the Amazon FSx for Lustre console at [https://console.aws.amazon.com/fsx](https://console.aws.amazon.com/fsx/home). 41 | 4. Choose the *File System name* for your stack, such as `aws-rosettafold-fsx-lustre-stack-id-suffix`. 42 | 5. On the file system details page, choose *Data repository*, *Create data repository association*. 43 | 6. For *File system path* enter `/aws-rosettafold-ref-data`. 44 | 7. For *Data repository path* enter the S3 URL for your new S3 bucket. 45 | 8. Choose *Create*. 46 | 47 | Creating the data repository association will immediately load the file metadata to the file system. However, the data itself will not be available until requested by a job. This will add several hours to the duration of the first job you submit; subsequent jobs will complete much faster.
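If you prefer to script Option 2 rather than use the console, the same association can be created through the FSx API. The following is a minimal boto3 sketch, not part of this repository; the file system ID and bucket name are placeholders to replace with your own values.

```python
import boto3

fsx = boto3.client("fsx")

# Link the stack's Lustre file system to the S3 bucket that holds the
# extracted reference data. BatchImportMetaDataOnCreate loads the file
# metadata immediately; file contents are still lazy-loaded on first access.
response = fsx.create_data_repository_association(
    FileSystemId="fs-0123456789abcdef0",  # placeholder: your FSx file system ID
    FileSystemPath="/aws-rosettafold-ref-data",
    DataRepositoryPath="s3://your-ref-data-bucket",  # placeholder: your S3 bucket
    BatchImportMetaDataOnCreate=True,
)
print(response["Association"]["AssociationId"])
```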
48 | 49 | Once you have finished loading the model weights and sequence database files, the FSx for Lustre file system will include the following files: 50 | 51 | ```
52 | /fsx
53 | └── /aws-rosettafold-ref-data
54 |     ├── /bfd
55 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata (1.4 TB)
56 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex (1.7 GB)
57 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata (15.7 GB)
58 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex (1.6 GB)
59 |     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata (304.4 GB)
60 |     │   └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex (123.6 MB)
61 |     ├── /pdb100_2021Mar03
62 |     │   ├── LICENSE (20.4 KB)
63 |     │   ├── pdb100_2021Mar03_a3m.ffdata (633.9 GB)
64 |     │   ├── pdb100_2021Mar03_a3m.ffindex (3.9 MB)
65 |     │   ├── pdb100_2021Mar03_cs219.ffdata (41.8 MB)
66 |     │   ├── pdb100_2021Mar03_cs219.ffindex (2.8 MB)
67 |     │   ├── pdb100_2021Mar03_hhm.ffdata (6.8 GB)
68 |     │   ├── pdb100_2021Mar03_hhm.ffindex (3.4 GB)
69 |     │   ├── pdb100_2021Mar03_pdb.ffdata (26.2 GB)
70 |     │   └── pdb100_2021Mar03_pdb.ffindex (3.7 MB)
71 |     ├── /UniRef30_2020_06
72 |     │   ├── UniRef30_2020_06_a3m.ffdata (139.6 GB)
73 |     │   ├── UniRef30_2020_06_a3m.ffindex (671.0 MB)
74 |     │   ├── UniRef30_2020_06_cs219.ffdata (6.0 GB)
75 |     │   ├── UniRef30_2020_06_cs219.ffindex (605.0 MB)
76 |     │   ├── UniRef30_2020_06_hhm.ffdata (34.1 GB)
77 |     │   ├── UniRef30_2020_06_hhm.ffindex (19.4 MB)
78 |     │   └── UniRef30_2020_06.md5sums (379.0 B)
79 |     └── /weights
80 |         ├── RF2t.pt (126 MB)
81 |         ├── Rosetta-DL_LICENSE.txt (3.1 KB)
82 |         ├── RoseTTAFold_e2e.pt (533 MB)
83 |         └── RoseTTAFold_pyrosetta.pt (506 MB)
84 | 
85 | ``` 86 | 87 | ### Submit structure prediction jobs from Jupyter 88 | 89 | 1. [Clone the CodeCommit repository](https://docs.aws.amazon.com/codecommit/latest/userguide/how-to-connect.html#how-to-connect-http) created by CloudFormation to a Jupyter Notebook environment of your choice. 90 | 2. Use the `AWS-RoseTTAFold.ipynb` and `CASP14-Analysis.ipynb` notebooks to submit protein sequences for analysis. 91 | 92 | ## Architecture 93 | 94 | ![AWS-RoseTTAFold Architecture](img/AWS-RoseTTAFold-arch.png) 95 | 96 | This project creates two computing environments in AWS Batch to run the "end-to-end" protein folding workflow in RoseTTAFold. The first of these uses the optimal mix of `c4`, `m4`, and `r4` instance types based on the vCPU and memory requirements specified in the Batch job. The second environment uses `g4dn` on-demand instances to balance performance, availability, and cost. 97 | 98 | A scientist can create structure prediction jobs using one of the two included Jupyter notebooks. `AWS-RoseTTAFold.ipynb` demonstrates how to submit a single analysis job and view the results. `CASP14-Analysis.ipynb` demonstrates how to submit multiple jobs at once using the CASP14 target list. In both of these cases, submitting a sequence for analysis creates two Batch jobs, one for data preparation (using the CPU computing environment) and a second, dependent job for structure prediction (using the GPU computing environment), as sketched below. 99 | 
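The notebooks submit this job pair through `rfutils.submit_2_step_job`, which wraps the AWS Batch API. As a rough illustration of the underlying pattern (the queue and job definition names below are placeholders; the deployed names come from your stack), the dependency between the two jobs is expressed with the `dependsOn` parameter of `submit_job`:

```python
import boto3

batch = boto3.client("batch")

# Step 1: CPU job for data preparation (MSA and template search).
prep = batch.submit_job(
    jobName="t1078-data-prep",  # placeholder names throughout
    jobQueue="aws-rosettafold-cpu-queue",
    jobDefinition="aws-rosettafold-cpu-data-prep",
    containerOverrides={
        "resourceRequirements": [
            {"type": "VCPU", "value": "8"},
            {"type": "MEMORY", "value": "32768"},  # MiB
        ]
    },
)

# Step 2: GPU prediction job that Batch releases only after step 1 succeeds.
predict = batch.submit_job(
    jobName="t1078-predict",
    jobQueue="aws-rosettafold-gpu-queue",
    jobDefinition="aws-rosettafold-gpu-predict",
    dependsOn=[{"jobId": prep["jobId"]}],
    containerOverrides={
        "resourceRequirements": [
            {"type": "VCPU", "value": "4"},
            {"type": "MEMORY", "value": "16384"},
            {"type": "GPU", "value": "1"},
        ]
    },
)
print(prep["jobId"], predict["jobId"])
```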
100 | Both the data preparation and structure prediction steps use the same Docker image for execution. This image, based on the public Nvidia CUDA image for Ubuntu 20, includes the v1.1 release of the public [RoseTTAFold repository](https://github.com/RosettaCommons/RoseTTAFold), as well as additional scripts for integrating with AWS services. CodeBuild will automatically download this container definition and build the required image during stack creation. However, end users can make changes to this image by pushing to the CodeCommit repository included in the stack. For example, users could replace the included MSA algorithm ([hhblits](https://github.com/soedinglab/hh-suite)) with an alternative like [MMseqs2](https://github.com/soedinglab/MMseqs2) or replace the RoseTTAFold network with an alternative like AlphaFold 2 or [Uni-Fold](https://github.com/dptech-corp/Uni-Fold). 101 | 102 | ## Costs 103 | This workload costs approximately $760 per month to maintain, plus another $0.50 per job. For example, running 100 jobs in a month would cost roughly $760 + (100 × $0.50) = $810. 104 | 105 | ## Deployment 106 | 107 | ![AWS-RoseTTAFold Deployment](img/AWS-RoseTTAFold-deploy.png) 108 | 109 | Running the CloudFormation template at `config/cfn.yaml` creates the following resources in the specified availability zone: 110 | 1. A new VPC with a private subnet, public subnet, NAT gateway, internet gateway, elastic IP, route tables, and S3 gateway endpoint. 111 | 2. An FSx for Lustre file system with 1.2 TiB of storage and 1,200 MB/s throughput capacity. This file system can be linked to an S3 bucket for loading the required reference data when the first job executes. 112 | 3. An EC2 launch template for mounting the FSx file system to Batch compute instances. 113 | 4. AWS Batch compute environments, job queues, and job definitions: one set for the CPU-dependent data prep job and a second for the GPU-dependent prediction job. 114 | 5. CodeCommit, CodeBuild, CodePipeline, and ECR resources for building and publishing the Batch container image. When CloudFormation creates the CodeCommit repository, it populates it with a zipped version of this repository stored in a public S3 bucket. CodeBuild uses this repository as its source and adds additional code from release 1.1 of the public [RoseTTAFold repository](https://github.com/RosettaCommons/RoseTTAFold). CodeBuild then publishes the resulting container image to ECR, where Batch jobs can use it as needed. The names of these Batch resources include a unique stack suffix; a sketch for looking them up programmatically appears after the Security section below. 115 | 116 | ## Licensing 117 | This library is licensed under the MIT-0 License. See the LICENSE file for more information. 118 | 119 | The University of Washington has made the code and data in the [RoseTTAFold public repository](https://github.com/RosettaCommons) available under an [MIT license](https://github.com/RosettaCommons/RoseTTAFold/blob/main/LICENSE). However, the model weights used for prediction are only available for internal, non-profit, non-commercial research use. For more information, please see the [full license agreement](https://files.ipd.uw.edu/pub/RoseTTAFold/Rosetta-DL_LICENSE.txt) and contact the University of Washington for details. 120 | 121 | ## Security 122 | 123 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
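As noted in the Deployment section, the stack generates unique names for its Batch resources, and the notebooks resolve those names at run time via `rfutils.get_rosettafold_batch_resources`. A minimal sketch of one way to do the same outside the notebooks (assuming only boto3 and the stack name you chose at deployment; the actual helper in `rfutils` may work differently) is to ask CloudFormation which Batch resources the stack created:

```python
import boto3

cfn = boto3.client("cloudformation")

def list_batch_resources(stack_name: str) -> dict:
    """Group the stack's Batch resources by type, keyed to their physical IDs."""
    resources = cfn.describe_stack_resources(StackName=stack_name)["StackResources"]
    batch_types = (
        "AWS::Batch::ComputeEnvironment",
        "AWS::Batch::JobQueue",
        "AWS::Batch::JobDefinition",
    )
    found = {}
    for resource in resources:
        if resource["ResourceType"] in batch_types:
            found.setdefault(resource["ResourceType"], []).append(
                resource["PhysicalResourceId"]
            )
    return found

# Example (stack name is whatever you entered at deployment):
# print(list_batch_resources("my-rosettafold-stack"))
```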
124 | 125 | ## More Information 126 | - [University of Washington Institute for Protein Design](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/) 127 | - [RoseTTAFold Paper](https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf) 128 | - [AWS Batch Documentation](https://docs.aws.amazon.com/batch/) 129 | - [CloudFormation Documentation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/Welcome.html) 130 | - [Explanation of the RoseTTAFold and AlphaFold 2 architectures](https://www.youtube.com/watch?v=Rfw7thgGTwI) 131 | - [David Baker's TED talk on protein design](https://www.ted.com/talks/david_baker_5_challenges_we_could_solve_by_designing_new_proteins) 132 | - [AWS ML Blog Post on running AlphaFold 2 on Amazon EC2](https://aws.amazon.com/blogs/machine-learning/run-alphafold-v2-0-on-amazon-ec2/) -------------------------------------------------------------------------------- /config/Dockerfile: -------------------------------------------------------------------------------- 1 | # Start with a copy of the cuda image maintained by Nvidia to avoid installing CUDA manually 2 | FROM nvcr.io/nvidia/cuda:11.4.2-base-ubuntu20.04 3 | 4 | # Install basic tools 5 | RUN apt-get update && apt-get install -y \ 6 | wget \ 7 | curl \ 8 | unzip 9 | 10 | # Install miniconda and awscli 11 | RUN curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 12 | && chmod +x ~/miniconda.sh \ 13 | && ~/miniconda.sh -b -p /opt/conda \ 14 | && rm ~/miniconda.sh \ 15 | && /opt/conda/bin/conda update conda \ 16 | && /opt/conda/bin/conda install -c conda-forge awscli 17 | 18 | # Download and unzip v1.1 of the RoseTTAFold repository, available at 19 | # https://github.com/RosettaCommons/RoseTTAFold 20 | RUN wget https://github.com/RosettaCommons/RoseTTAFold/archive/refs/tags/v1.1.0.zip \ 21 | && unzip v1.1.0.zip \ 22 | && mv RoseTTAFold-1.1.0 /RoseTTAFold \ 23 | && rm v1.1.0.zip 24 | WORKDIR /RoseTTAFold 25 | 26 | # Install lddt, cs-blast, and libgomp1 27 | RUN ./install_dependencies.sh 28 | RUN /opt/conda/bin/conda env create -f RoseTTAFold-linux.yml \ 29 | && /opt/conda/bin/conda clean -ya 30 | RUN apt-get install -y libgomp1 31 | 32 | # Add the AWS-RoseTTAFold scripts 33 | COPY run_aws_data_prep_ver.sh . 34 | COPY run_aws_predict_ver.sh . 35 | COPY download_ref_data.sh . 36 | 37 | # Clean up unnecessary files to save space 38 | RUN rm -rf \ 39 | example \ 40 | folding \ 41 | *.gz \ 42 | *.zip \ 43 | *.yml \ 44 | install_dependencies.sh 45 | 46 | # Create a directory to mount the FSx Lustre file system with ref data 47 | VOLUME /fsx 48 | 49 | # Activate conda 50 | RUN ["/bin/bash", "-c", \ 51 | "/opt/conda/bin/activate", \ 52 | "/opt/conda/bin/conda init bash", \ 53 | "source $HOME/.bashrc"] 54 | ENV PATH /opt/conda/bin:$PATH 55 | 56 | # Define the default run command. Batch will overwrite this at run time. 57 | CMD ["/bin/bash"] 58 | -------------------------------------------------------------------------------- /config/cfn.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: >- 3 | Creates a stack for running RoseTTAFold on AWS Batch.
4 | 5 | Parameters: 6 | StackAvailabilityZone: 7 | Description: Availability zone to deploy stack resources 8 | Type: "AWS::EC2::AvailabilityZone::Name" 9 | 10 | Resources: 11 | ################################################## 12 | # Network Configuration 13 | ################################################## 14 | VPC: 15 | Type: "AWS::EC2::VPC" 16 | Properties: 17 | EnableDnsSupport: "true" 18 | EnableDnsHostnames: "true" 19 | CidrBlock: "10.0.0.0/16" 20 | Tags: 21 | - Key: Application 22 | Value: AWS-RoseTTAFold 23 | - Key: Network 24 | Value: Public 25 | - Key: Name 26 | Value: 27 | !Join [ 28 | "-", 29 | [ 30 | "aws-rosettafold", 31 | "VPC", 32 | !Select [ 33 | 4, 34 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 35 | ], 36 | ], 37 | ] 38 | 39 | PublicSubnet0: 40 | Type: "AWS::EC2::Subnet" 41 | Properties: 42 | VpcId: !Ref VPC 43 | AvailabilityZone: !Ref StackAvailabilityZone 44 | CidrBlock: 45 | Fn::Select: 46 | - 0 47 | - Fn::Cidr: [!GetAtt VPC.CidrBlock, 6, 8] 48 | Tags: 49 | - Key: Application 50 | Value: AWS-RoseTTAFold 51 | - Key: Network 52 | Value: Public 53 | - Key: Name 54 | Value: 55 | !Join [ 56 | "-", 57 | [ 58 | "aws-rosettafold", 59 | "public-subnet", 60 | !Select [ 61 | 4, 62 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 63 | ], 64 | ], 65 | ] 66 | 67 | PrivateSubnet0: 68 | Type: "AWS::EC2::Subnet" 69 | Properties: 70 | VpcId: 71 | Ref: VPC 72 | AvailabilityZone: !Ref StackAvailabilityZone 73 | CidrBlock: 74 | Fn::Select: 75 | - 3 76 | - Fn::Cidr: [!GetAtt VPC.CidrBlock, 6, 8] 77 | Tags: 78 | - Key: Application 79 | Value: AWS-RoseTTAFold 80 | - Key: Network 81 | Value: Private 82 | - Key: Name 83 | Value: 84 | !Join [ 85 | "-", 86 | [ 87 | "aws-rosettafold", 88 | "private-subnet", 89 | !Select [ 90 | 4, 91 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 92 | ], 93 | ], 94 | ] 95 | 96 | InternetGateway: 97 | Type: "AWS::EC2::InternetGateway" 98 | Properties: 99 | Tags: 100 | - Key: Application 101 | Value: AWS-RoseTTAFold 102 | - Key: Network 103 | Value: Public 104 | - Key: Name 105 | Value: 106 | !Join [ 107 | "-", 108 | [ 109 | "aws-rosettafold", 110 | "igw", 111 | !Select [ 112 | 4, 113 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 114 | ], 115 | ], 116 | ] 117 | 118 | GatewayToInternet: 119 | Type: "AWS::EC2::VPCGatewayAttachment" 120 | Properties: 121 | VpcId: 122 | Ref: VPC 123 | InternetGatewayId: 124 | Ref: InternetGateway 125 | 126 | PublicRouteTable: 127 | Type: "AWS::EC2::RouteTable" 128 | Properties: 129 | VpcId: 130 | Ref: VPC 131 | Tags: 132 | - Key: Application 133 | Value: AWS-RoseTTAFold 134 | - Key: Network 135 | Value: Public 136 | - Key: Name 137 | Value: 138 | !Join [ 139 | "-", 140 | [ 141 | "aws-rosettafold", 142 | "public-route-table", 143 | !Select [ 144 | 4, 145 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 146 | ], 147 | ], 148 | ] 149 | 150 | PublicRoute: 151 | Type: "AWS::EC2::Route" 152 | DependsOn: GatewayToInternet 153 | Properties: 154 | RouteTableId: 155 | Ref: PublicRouteTable 156 | DestinationCidrBlock: 0.0.0.0/0 157 | GatewayId: 158 | Ref: InternetGateway 159 | 160 | PublicSubnetRouteTableAssociation0: 161 | Type: "AWS::EC2::SubnetRouteTableAssociation" 162 | Properties: 163 | SubnetId: 164 | Ref: PublicSubnet0 165 | RouteTableId: 166 | Ref: PublicRouteTable 167 | 168 | ElasticIP0: 169 | Type: "AWS::EC2::EIP" 170 | Properties: 171 | Domain: vpc 172 | Tags: 173 | - Key: Application 174 | Value: AWS-RoseTTAFold 175 | - Key: Name 176 | Value: 177 | !Join [ 
178 | "-", 179 | [ 180 | "aws-rosettafold", 181 | "eip", 182 | !Select [ 183 | 4, 184 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 185 | ], 186 | ], 187 | ] 188 | 189 | NATGateway0: 190 | Type: "AWS::EC2::NatGateway" 191 | Properties: 192 | AllocationId: 193 | "Fn::GetAtt": 194 | - ElasticIP0 195 | - AllocationId 196 | SubnetId: 197 | Ref: PublicSubnet0 198 | Tags: 199 | - Key: Application 200 | Value: AWS-RoseTTAFold 201 | - Key: Name 202 | Value: 203 | !Join [ 204 | "-", 205 | [ 206 | "aws-rosettafold", 207 | "nat-gateway", 208 | !Select [ 209 | 4, 210 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 211 | ], 212 | ], 213 | ] 214 | 215 | PrivateRouteTable0: 216 | Type: "AWS::EC2::RouteTable" 217 | Properties: 218 | VpcId: 219 | Ref: VPC 220 | Tags: 221 | - Key: Application 222 | Value: AWS-RoseTTAFold 223 | - Key: Name 224 | Value: 225 | !Join [ 226 | "-", 227 | [ 228 | "aws-rosettafold", 229 | "private-route-table", 230 | !Select [ 231 | 4, 232 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 233 | ], 234 | ], 235 | ] 236 | 237 | PrivateRouteToInternet0: 238 | Type: "AWS::EC2::Route" 239 | Properties: 240 | RouteTableId: 241 | Ref: PrivateRouteTable0 242 | DestinationCidrBlock: 0.0.0.0/0 243 | NatGatewayId: 244 | Ref: NATGateway0 245 | 246 | PrivateSubnetRouteTableAssociation0: 247 | Type: "AWS::EC2::SubnetRouteTableAssociation" 248 | Properties: 249 | SubnetId: 250 | Ref: PrivateSubnet0 251 | RouteTableId: 252 | Ref: PrivateRouteTable0 253 | 254 | ################################################## 255 | # S3 256 | ################################################## 257 | 258 | ResultsS3: 259 | Type: "AWS::S3::Bucket" 260 | Properties: 261 | BucketEncryption: 262 | ServerSideEncryptionConfiguration: 263 | - ServerSideEncryptionByDefault: 264 | SSEAlgorithm: AES256 265 | Tags: 266 | - Key: Application 267 | Value: AWS-RoseTTAFold 268 | - Key: Name 269 | Value: 270 | !Join [ 271 | "-", 272 | [ 273 | "aws-rosettafold", 274 | "s3", 275 | !Select [ 276 | 4, 277 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 278 | ], 279 | ], 280 | ] 281 | DeletionPolicy: Retain 282 | UpdateReplacePolicy : Retain 283 | 284 | S3Endpoint: 285 | Type: "AWS::EC2::VPCEndpoint" 286 | Properties: 287 | RouteTableIds: 288 | - !Ref PublicRouteTable 289 | - !Ref PrivateRouteTable0 290 | ServiceName: !Sub "com.amazonaws.${AWS::Region}.s3" 291 | VpcId: !Ref VPC 292 | 293 | ################################################## 294 | # FSx File System 295 | ################################################## 296 | FSX: 297 | Type: AWS::FSx::FileSystem 298 | Properties: 299 | FileSystemType: "LUSTRE" 300 | FileSystemTypeVersion: "2.12" 301 | LustreConfiguration: 302 | DataCompressionType: "LZ4" 303 | DeploymentType: "PERSISTENT_2" 304 | PerUnitStorageThroughput: 1000 305 | SecurityGroupIds: 306 | - !GetAtt VPC.DefaultSecurityGroup 307 | StorageCapacity: 1200 308 | StorageType: "SSD" 309 | SubnetIds: 310 | - !Ref PrivateSubnet0 311 | Tags: 312 | - Key: Application 313 | Value: AWS-RoseTTAFold 314 | - Key: Name 315 | Value: 316 | !Join [ 317 | "-", 318 | [ 319 | "aws-rosettafold", 320 | "fsx-lustre", 321 | !Select [ 322 | 4, 323 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 324 | ], 325 | ], 326 | ] 327 | 328 | ################################################## 329 | # EC2 Launch Template 330 | ################################################## 331 | 332 | RFInstanceRole: 333 | Type: AWS::IAM::Role 334 | Properties: 335 | RoleName: 336 | !Join [ 337 | "-", 
338 | [ 339 | "aws-rosettafold", 340 | "instance-role", 341 | !Select [ 342 | 4, 343 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 344 | ], 345 | ], 346 | ] 347 | Description: "Required service policies to support running RoseTTAFold on AWS Batch" 348 | AssumeRolePolicyDocument: 349 | Version: "2012-10-17" 350 | Statement: 351 | - Effect: Allow 352 | Principal: 353 | Service: 354 | - ec2.amazonaws.com 355 | Action: 356 | - "sts:AssumeRole" 357 | ManagedPolicyArns: 358 | - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly 359 | - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role 360 | - arn:aws:iam::aws:policy/AmazonS3FullAccess 361 | Path: / 362 | Tags: 363 | - Key: Application 364 | Value: AWS-RoseTTAFold 365 | - Key: Name 366 | Value: 367 | !Join [ 368 | "-", 369 | [ 370 | "aws-rosettafold", 371 | "instance-role", 372 | !Select [ 373 | 4, 374 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 375 | ], 376 | ], 377 | ] 378 | 379 | InstanceProfile: 380 | Type: "AWS::IAM::InstanceProfile" 381 | Properties: 382 | InstanceProfileName: 383 | !Join [ 384 | "-", 385 | [ 386 | "aws-rosettafold", 387 | "instance-profile", 388 | !Select [ 389 | 4, 390 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 391 | ], 392 | ], 393 | ] 394 | Path: / 395 | Roles: 396 | - !Ref RFInstanceRole 397 | 398 | InstanceLaunchTemplate: 399 | Type: AWS::EC2::LaunchTemplate 400 | Properties: 401 | LaunchTemplateName: 402 | !Join [ 403 | "-", 404 | [ 405 | "aws-rosettafold", 406 | "launch-template", 407 | !Select [ 408 | 4, 409 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 410 | ], 411 | ], 412 | ] 413 | LaunchTemplateData: 414 | BlockDeviceMappings: 415 | - DeviceName: "/dev/xvda" 416 | Ebs: 417 | DeleteOnTermination: true 418 | Encrypted: true 419 | VolumeSize: 50 420 | VolumeType: "gp2" 421 | IamInstanceProfile: 422 | Name: !Ref InstanceProfile 423 | TagSpecifications: 424 | - ResourceType: "instance" 425 | Tags: 426 | - Key: Application 427 | Value: AWS-RoseTTAFold 428 | - Key: Name 429 | Value: 430 | !Join [ 431 | "-", 432 | [ 433 | "aws-rosettafold", 434 | "launch-template", 435 | !Select [ 436 | 4, 437 | !Split [ 438 | "-", 439 | !Select [2, !Split ["/", !Ref AWS::StackId]], 440 | ], 441 | ], 442 | ], 443 | ] 444 | UserData: 445 | Fn::Base64: 446 | Fn::Join: 447 | [ 448 | "", 449 | [ 450 | "MIME-Version: 1.0\n", 451 | "Content-Type: multipart/mixed; boundary=\"==MYBOUNDARY==\"\n", 452 | "\n", 453 | "--==MYBOUNDARY==\n", 454 | "Content-Type: text/cloud-config; charset=\"us-ascii\"\n", 455 | "\n", 456 | "runcmd:\n", 457 | "- file_system_id_01=", 458 | !Ref FSX, 459 | "\n", 460 | "- region=", 461 | !Ref AWS::Region, 462 | "\n", 463 | "- fsx_directory=/fsx\n", 464 | "- fsx_mount_name=", 465 | !GetAtt FSX.LustreMountName, 466 | "\n", 467 | "- amazon-linux-extras install -y lustre2.10\n", 468 | "- mkdir -p ${fsx_directory}\n", 469 | "- mount -t lustre ${file_system_id_01}.fsx.${region}.amazonaws.com@tcp:/${fsx_mount_name} ${fsx_directory}\n", 470 | "\n", 471 | "--==MYBOUNDARY==--", 472 | ], 473 | ] 474 | 475 | ################################################## 476 | # Container Services 477 | ################################################## 478 | RFCodeRepository: 479 | Type: AWS::CodeCommit::Repository 480 | Properties: 481 | Code: 482 | BranchName: "main" 483 | S3: 484 | Bucket: "aws-hcls-ml" 485 | Key: "blog_post_support_materials/aws-RoseTTAFold/aws-rosettafold.zip" 486 | RepositoryDescription: Code for running RoseTTAFold on AWS 
487 | RepositoryName: 488 | !Join [ 489 | "-", 490 | [ 491 | "aws-rosettafold", 492 | "code-repo", 493 | !Select [ 494 | 4, 495 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 496 | ], 497 | ], 498 | ] 499 | Tags: 500 | - Key: Application 501 | Value: AWS-RoseTTAFold 502 | - Key: Name 503 | Value: 504 | !Join [ 505 | "-", 506 | [ 507 | "aws-rosettafold", 508 | "code-repo", 509 | !Select [ 510 | 4, 511 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 512 | ], 513 | ], 514 | ] 515 | 516 | RFContainerRegistry: 517 | Type: AWS::ECR::Repository 518 | Properties: 519 | EncryptionConfiguration: 520 | EncryptionType: AES256 521 | ImageScanningConfiguration: 522 | ScanOnPush: true 523 | RepositoryName: 524 | !Join [ 525 | "-", 526 | [ 527 | "aws-rosettafold", 528 | "container-repo", 529 | !Select [ 530 | 4, 531 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 532 | ], 533 | ], 534 | ] 535 | Tags: 536 | - Key: Application 537 | Value: AWS-RoseTTAFold 538 | - Key: Name 539 | Value: 540 | !Join [ 541 | "-", 542 | [ 543 | "aws-rosettafold", 544 | "container-repo", 545 | !Select [ 546 | 4, 547 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 548 | ], 549 | ], 550 | ] 551 | DeletionPolicy: Retain 552 | UpdateReplacePolicy : Retain 553 | 554 | CodeBuildRole: 555 | Type: AWS::IAM::Role 556 | Properties: 557 | RoleName: 558 | !Join [ 559 | "-", 560 | [ 561 | "aws-rosettafold", 562 | "codebuild-role", 563 | !Select [ 564 | 4, 565 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 566 | ], 567 | ], 568 | ] 569 | Description: "Required service policies to support building AWS-RoseTTAFold container" 570 | AssumeRolePolicyDocument: 571 | Version: "2012-10-17" 572 | Statement: 573 | - Effect: Allow 574 | Principal: 575 | Service: 576 | - codebuild.amazonaws.com 577 | Action: 578 | - "sts:AssumeRole" 579 | ManagedPolicyArns: 580 | - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess 581 | Path: / 582 | Policies: 583 | - PolicyName: RFCodeBuildPolicy 584 | PolicyDocument: 585 | Version: "2012-10-17" 586 | Statement: 587 | - Effect: Allow 588 | Action: 589 | - logs:CreateLogGroup 590 | - logs:CreateLogStream 591 | - logs:PutLogEvents 592 | Resource: 593 | - Fn::Join: 594 | [ 595 | ":", 596 | [ 597 | "arn:aws:logs", 598 | !Ref AWS::Region, 599 | !Ref AWS::AccountId, 600 | "log-group:/aws/codebuild/aws-rosettafold*", 601 | ], 602 | ] 603 | - Effect: Allow 604 | Action: 605 | - s3:PutObject 606 | - s3:GetObject 607 | - s3:GetObjectVersion 608 | - s3:GetBucketAcl 609 | - s3:GetBucketLocation 610 | Resource: 611 | - !Join [ 612 | "-", 613 | ["arn:aws:s3:::codepipeline", !Ref AWS::Region, "*"], 614 | ] 615 | - !Join ["", [!GetAtt ResultsS3.Arn, "*"]] 616 | - Effect: Allow 617 | Action: 618 | - codecommit:GitPull 619 | Resource: 620 | - Fn::Join: 621 | [ 622 | ":", 623 | [ 624 | "arn:aws:codecommit", 625 | !Ref AWS::Region, 626 | !Ref AWS::AccountId, 627 | !GetAtt RFCodeRepository.Name, 628 | ], 629 | ] 630 | - Effect: Allow 631 | Action: 632 | - codebuild:CreateReportGroup 633 | - codebuild:CreateReport 634 | - codebuild:UpdateReport 635 | - codebuild:BatchPutTestCases 636 | - codebuild:BatchPutCodeCoverages 637 | Resource: 638 | - Fn::Join: 639 | [ 640 | ":", 641 | [ 642 | "arn:aws:s3:::codebuild", 643 | !Ref AWS::Region, 644 | !Ref AWS::AccountId, 645 | "report-group/aws-rosettafold*", 646 | ], 647 | ] 648 | Tags: 649 | - Key: Application 650 | Value: AWS-RoseTTAFold 651 | - Key: Name 652 | Value: 653 | !Join [ 654 | "-", 655 | [ 656 | 
"aws-rosettafold", 657 | "codebuild-role", 658 | !Select [ 659 | 4, 660 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 661 | ], 662 | ], 663 | ] 664 | 665 | CodeBuildEncryptionKey: 666 | Type: "AWS::KMS::Key" 667 | Properties: 668 | KeyPolicy: 669 | Version: 2012-10-17 670 | Id: key-default-1 671 | Statement: 672 | - Sid: Enable IAM User Permissions 673 | Effect: Allow 674 | Principal: 675 | AWS: 676 | Fn::Join: [":", ["arn:aws:iam:", !Ref AWS::AccountId, "root"]] 677 | Action: "kms:*" 678 | Resource: "*" 679 | - Sid: Enable CodeBuild Encryption 680 | Effect: Allow 681 | Principal: 682 | AWS: !GetAtt CodeBuildRole.Arn 683 | Action: 684 | [ 685 | "kms:Encrypt", 686 | "kms:Decrypt", 687 | "kms:ReEncrypt*", 688 | "kms:GenerateDataKey*", 689 | "kms:DescribeKey", 690 | ] 691 | Resource: "*" 692 | Tags: 693 | - Key: Application 694 | Value: AWS-RoseTTAFold 695 | - Key: Name 696 | Value: 697 | !Join [ 698 | "-", 699 | [ 700 | "aws-rosettafold", 701 | "kms", 702 | !Select [ 703 | 4, 704 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 705 | ], 706 | ], 707 | ] 708 | 709 | RFCodeBuildProject: 710 | Type: AWS::CodeBuild::Project 711 | Properties: 712 | Artifacts: 713 | Type: NO_ARTIFACTS 714 | Description: Build Docker container for RoseTTAFold execution on AWS Batch 715 | EncryptionKey: !Ref CodeBuildEncryptionKey 716 | Environment: 717 | ComputeType: BUILD_GENERAL1_MEDIUM 718 | EnvironmentVariables: 719 | - Name: IMAGE_TAG 720 | Value: latest 721 | - Name: IMAGE_REPO_NAME 722 | Value: !Ref RFContainerRegistry 723 | - Name: ACCOUNT_ID 724 | Value: !Ref AWS::AccountId 725 | Image: aws/codebuild/standard:4.0 726 | ImagePullCredentialsType: CODEBUILD 727 | PrivilegedMode: true 728 | Type: LINUX_CONTAINER 729 | Name: 730 | !Join [ 731 | "-", 732 | [ 733 | "aws-rosettafold", 734 | "codebuild-project", 735 | !Select [ 736 | 4, 737 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 738 | ], 739 | ], 740 | ] 741 | ResourceAccessRole: !GetAtt CodeBuildRole.Arn 742 | ServiceRole: !GetAtt CodeBuildRole.Arn 743 | Source: 744 | BuildSpec: config/container_buildspec.yml 745 | GitCloneDepth: 1 746 | Location: !GetAtt RFCodeRepository.CloneUrlHttp 747 | Type: CODECOMMIT 748 | SourceVersion: refs/heads/main 749 | Tags: 750 | - Key: Application 751 | Value: AWS-RoseTTAFold 752 | - Key: Name 753 | Value: 754 | !Join [ 755 | "-", 756 | [ 757 | "aws-rosettafold", 758 | "codebuild-project", 759 | !Select [ 760 | 4, 761 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 762 | ], 763 | ], 764 | ] 765 | 766 | CodePipelineRole: 767 | Type: AWS::IAM::Role 768 | Properties: 769 | RoleName: 770 | !Join [ 771 | "-", 772 | [ 773 | "aws-rosettafold", 774 | "codepipeline-role", 775 | !Select [ 776 | 4, 777 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 778 | ], 779 | ], 780 | ] 781 | Description: "Required service policies to support running AWS-RoseTTAFold build pipeline" 782 | AssumeRolePolicyDocument: 783 | Version: "2012-10-17" 784 | Statement: 785 | - Effect: Allow 786 | Principal: 787 | Service: 788 | - codepipeline.amazonaws.com 789 | Action: 790 | - "sts:AssumeRole" 791 | Path: / 792 | Policies: 793 | - PolicyName: codePipelineDefault 794 | PolicyDocument: 795 | Version: "2012-10-17" 796 | Statement: 797 | - Action: 798 | - iam:PassRole 799 | Resource: "*" 800 | Effect: Allow 801 | Condition: 802 | StringEqualsIfExists: 803 | iam:PassedToService: 804 | - cloudformation.amazonaws.com 805 | - elasticbeanstalk.amazonaws.com 806 | - ec2.amazonaws.com 807 | - 
ecs-tasks.amazonaws.com 808 | - Action: 809 | - codecommit:CancelUploadArchive 810 | - codecommit:GetBranch 811 | - codecommit:GetCommit 812 | - codecommit:GetRepository 813 | - codecommit:GetUploadArchiveStatus 814 | - codecommit:UploadArchive 815 | Resource: "*" 816 | Effect: Allow 817 | - Action: 818 | - codedeploy:CreateDeployment 819 | - codedeploy:GetApplication 820 | - codedeploy:GetApplicationRevision 821 | - codedeploy:GetDeployment 822 | - codedeploy:GetDeploymentConfig 823 | - codedeploy:RegisterApplicationRevision 824 | Resource: "*" 825 | Effect: Allow 826 | - Action: 827 | - codestar-connections:UseConnection 828 | Resource: "*" 829 | Effect: Allow 830 | - Action: 831 | - elasticbeanstalk:* 832 | - ec2:* 833 | - elasticloadbalancing:* 834 | - autoscaling:* 835 | - cloudwatch:* 836 | - s3:* 837 | - sns:* 838 | - cloudformation:* 839 | - rds:* 840 | - sqs:* 841 | - ecs:* 842 | Resource: "*" 843 | Effect: Allow 844 | - Action: 845 | - lambda:InvokeFunction 846 | - lambda:ListFunctions 847 | Resource: "*" 848 | Effect: Allow 849 | - Action: 850 | - opsworks:CreateDeployment 851 | - opsworks:DescribeApps 852 | - opsworks:DescribeCommands 853 | - opsworks:DescribeDeployments 854 | - opsworks:DescribeInstances 855 | - opsworks:DescribeStacks 856 | - opsworks:UpdateApp 857 | - opsworks:UpdateStack 858 | Resource: "*" 859 | Effect: Allow 860 | - Action: 861 | - cloudformation:CreateStack 862 | - cloudformation:DeleteStack 863 | - cloudformation:DescribeStacks 864 | - cloudformation:UpdateStack 865 | - cloudformation:CreateChangeSet 866 | - cloudformation:DeleteChangeSet 867 | - cloudformation:DescribeChangeSet 868 | - cloudformation:ExecuteChangeSet 869 | - cloudformation:SetStackPolicy 870 | - cloudformation:ValidateTemplate 871 | Resource: "*" 872 | Effect: Allow 873 | - Action: 874 | - codebuild:BatchGetBuilds 875 | - codebuild:StartBuild 876 | - codebuild:BatchGetBuildBatches 877 | - codebuild:StartBuildBatch 878 | Resource: "*" 879 | Effect: Allow 880 | - Effect: Allow 881 | Action: 882 | - devicefarm:ListProjects 883 | - devicefarm:ListDevicePools 884 | - devicefarm:GetRun 885 | - devicefarm:GetUpload 886 | - devicefarm:CreateUpload 887 | - devicefarm:ScheduleRun 888 | Resource: "*" 889 | - Effect: Allow 890 | Action: 891 | - servicecatalog:ListProvisioningArtifacts 892 | - servicecatalog:CreateProvisioningArtifact 893 | - servicecatalog:DescribeProvisioningArtifact 894 | - servicecatalog:DeleteProvisioningArtifact 895 | - servicecatalog:UpdateProduct 896 | Resource: "*" 897 | - Effect: Allow 898 | Action: 899 | - cloudformation:ValidateTemplate 900 | Resource: "*" 901 | - Effect: Allow 902 | Action: 903 | - ecr:DescribeImages 904 | Resource: "*" 905 | - Effect: Allow 906 | Action: 907 | - states:DescribeExecution 908 | - states:DescribeStateMachine 909 | - states:StartExecution 910 | Resource: "*" 911 | - Effect: Allow 912 | Action: 913 | - appconfig:StartDeployment 914 | - appconfig:StopDeployment 915 | - appconfig:GetDeployment 916 | Resource: "*" 917 | Tags: 918 | - Key: Application 919 | Value: AWS-RoseTTAFold 920 | - Key: Name 921 | Value: 922 | !Join [ 923 | "-", 924 | [ 925 | "aws-rosettafold", 926 | "codepipeline-role", 927 | !Select [ 928 | 4, 929 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 930 | ], 931 | ], 932 | ] 933 | 934 | RFCodePipeline: 935 | Type: AWS::CodePipeline::Pipeline 936 | Properties: 937 | ArtifactStore: 938 | Location: !Ref ResultsS3 939 | Type: S3 940 | Name: 941 | !Join [ 942 | "-", 943 | [ 944 | "aws-rosettafold", 945 | 
"codepipeline", 946 | !Select [ 947 | 4, 948 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 949 | ], 950 | ], 951 | ] 952 | RestartExecutionOnUpdate: true 953 | RoleArn: !GetAtt CodePipelineRole.Arn 954 | Stages: 955 | - Name: Source 956 | Actions: 957 | - Name: Source 958 | ActionTypeId: 959 | Category: Source 960 | Owner: AWS 961 | Provider: CodeCommit 962 | Version: 1 963 | Configuration: 964 | RepositoryName: !GetAtt RFCodeRepository.Name 965 | BranchName: main 966 | PollForSourceChanges: "false" 967 | Namespace: SourceVariables 968 | OutputArtifacts: 969 | - Name: SourceArtifact 970 | Region: !Ref AWS::Region 971 | RunOrder: 1 972 | - Name: Build 973 | Actions: 974 | - Name: Build 975 | ActionTypeId: 976 | Category: Build 977 | Owner: AWS 978 | Provider: CodeBuild 979 | Version: 1 980 | Configuration: 981 | ProjectName: !Ref RFCodeBuildProject 982 | InputArtifacts: 983 | - Name: SourceArtifact 984 | Namespace: BuildVariables 985 | OutputArtifacts: 986 | - Name: BuildArtifact 987 | Region: !Ref AWS::Region 988 | RunOrder: 2 989 | Tags: 990 | - Key: Application 991 | Value: AWS-RoseTTAFold 992 | - Key: Name 993 | Value: 994 | !Join [ 995 | "-", 996 | [ 997 | "aws-rosettafold", 998 | "codepipeline", 999 | !Select [ 1000 | 4, 1001 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1002 | ], 1003 | ], 1004 | ] 1005 | 1006 | ################################################## 1007 | # Batch Environment 1008 | ################################################## 1009 | 1010 | CPUComputeEnvironment: 1011 | Type: AWS::Batch::ComputeEnvironment 1012 | Properties: 1013 | ComputeEnvironmentName: 1014 | !Join [ 1015 | "-", 1016 | [ 1017 | "aws-rosettafold", 1018 | "ce-cpu", 1019 | !Select [ 1020 | 4, 1021 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1022 | ], 1023 | ], 1024 | ] 1025 | ComputeResources: 1026 | AllocationStrategy: BEST_FIT_PROGRESSIVE 1027 | InstanceRole: !Ref InstanceProfile 1028 | InstanceTypes: 1029 | - optimal 1030 | LaunchTemplate: 1031 | LaunchTemplateId: !Ref InstanceLaunchTemplate 1032 | Version: $Latest 1033 | MaxvCpus: 256 1034 | MinvCpus: 0 1035 | SecurityGroupIds: 1036 | - !GetAtt VPC.DefaultSecurityGroup 1037 | Subnets: 1038 | - Ref: PrivateSubnet0 1039 | Type: EC2 1040 | State: ENABLED 1041 | Type: MANAGED 1042 | Tags: 1043 | Application: AWS-RoseTTAFold 1044 | Name: 1045 | !Join [ 1046 | "-", 1047 | [ 1048 | "aws-rosettafold", 1049 | "ce-cpu", 1050 | !Select [ 1051 | 4, 1052 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1053 | ], 1054 | ], 1055 | ] 1056 | 1057 | GPUComputeEnvironment: 1058 | Type: AWS::Batch::ComputeEnvironment 1059 | Properties: 1060 | ComputeEnvironmentName: 1061 | !Join [ 1062 | "-", 1063 | [ 1064 | "aws-rosettafold", 1065 | "ce-gpu", 1066 | !Select [ 1067 | 4, 1068 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1069 | ], 1070 | ], 1071 | ] 1072 | ComputeResources: 1073 | AllocationStrategy: BEST_FIT_PROGRESSIVE 1074 | InstanceRole: !Ref InstanceProfile 1075 | InstanceTypes: 1076 | - g4dn 1077 | LaunchTemplate: 1078 | LaunchTemplateId: !Ref InstanceLaunchTemplate 1079 | Version: $Latest 1080 | MaxvCpus: 256 1081 | MinvCpus: 0 1082 | SecurityGroupIds: 1083 | - !GetAtt VPC.DefaultSecurityGroup 1084 | Subnets: 1085 | - Ref: PrivateSubnet0 1086 | Type: EC2 1087 | State: ENABLED 1088 | Type: MANAGED 1089 | Tags: 1090 | Application: AWS-RoseTTAFold 1091 | Name: 1092 | !Join [ 1093 | "-", 1094 | [ 1095 | "aws-rosettafold", 1096 | "ce-gpu", 1097 | !Select [ 1098 | 4, 1099 | !Split ["-", 
!Select [2, !Split ["/", !Ref AWS::StackId]]], 1100 | ], 1101 | ], 1102 | ] 1103 | 1104 | CPUJobQueue: 1105 | Type: AWS::Batch::JobQueue 1106 | Properties: 1107 | ComputeEnvironmentOrder: 1108 | - ComputeEnvironment: !Ref CPUComputeEnvironment 1109 | Order: 1 1110 | JobQueueName: 1111 | !Join [ 1112 | "-", 1113 | [ 1114 | "aws-rosettafold", 1115 | "queue-cpu", 1116 | !Select [ 1117 | 4, 1118 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1119 | ], 1120 | ], 1121 | ] 1122 | Priority: 10 1123 | State: ENABLED 1124 | Tags: 1125 | Application: AWS-RoseTTAFold 1126 | Name: 1127 | !Join [ 1128 | "-", 1129 | [ 1130 | "aws-rosettafold", 1131 | "queue-cpu", 1132 | !Select [ 1133 | 4, 1134 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1135 | ], 1136 | ], 1137 | ] 1138 | 1139 | GPUJobQueue: 1140 | Type: AWS::Batch::JobQueue 1141 | Properties: 1142 | ComputeEnvironmentOrder: 1143 | - ComputeEnvironment: !Ref GPUComputeEnvironment 1144 | Order: 1 1145 | JobQueueName: 1146 | !Join [ 1147 | "-", 1148 | [ 1149 | "aws-rosettafold", 1150 | "queue-gpu", 1151 | !Select [ 1152 | 4, 1153 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1154 | ], 1155 | ], 1156 | ] 1157 | Priority: 10 1158 | State: ENABLED 1159 | Tags: 1160 | Application: AWS-RoseTTAFold 1161 | Name: 1162 | !Join [ 1163 | "-", 1164 | [ 1165 | "aws-rosettafold", 1166 | "queue-gpu", 1167 | !Select [ 1168 | 4, 1169 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1170 | ], 1171 | ], 1172 | ] 1173 | 1174 | CPUDataPrepJobDefinition: 1175 | Type: AWS::Batch::JobDefinition 1176 | Properties: 1177 | ContainerProperties: 1178 | Command: 1179 | - "/bin/bash" 1180 | - "run_aws_data_prep_ver.sh" 1181 | - "-i" 1182 | - !Join ["", ["s3://", !Ref ResultsS3]] 1183 | - "-o" 1184 | - !Join ["", ["s3://", !Ref ResultsS3]] 1185 | - "-n" 1186 | - "input.fa" 1187 | - "-w" 1188 | - "/work" 1189 | - "-d" 1190 | - "/fsx/aws-rosettafold-ref-data" 1191 | - "-c" 1192 | - "8" 1193 | - "-m" 1194 | - "32" 1195 | Image: 1196 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]] 1197 | LogConfiguration: 1198 | LogDriver: awslogs 1199 | MountPoints: 1200 | - ContainerPath: /fsx 1201 | ReadOnly: False 1202 | SourceVolume: fsx 1203 | ResourceRequirements: 1204 | - Type: VCPU 1205 | Value: 8 1206 | - Type: MEMORY 1207 | Value: 32000 1208 | Volumes: 1209 | - Name: fsx 1210 | Host: 1211 | SourcePath: /fsx 1212 | JobDefinitionName: 1213 | !Join [ 1214 | "-", 1215 | [ 1216 | "aws-rosettafold", 1217 | "job-def-cpudataprep", 1218 | !Select [ 1219 | 4, 1220 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1221 | ], 1222 | ], 1223 | ] 1224 | PlatformCapabilities: 1225 | - EC2 1226 | PropagateTags: true 1227 | RetryStrategy: 1228 | Attempts: 3 1229 | Tags: 1230 | Application: AWS-RoseTTAFold 1231 | Name: 1232 | !Join [ 1233 | "-", 1234 | [ 1235 | "aws-rosettafold", 1236 | "job-def-cpudataprep", 1237 | !Select [ 1238 | 4, 1239 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1240 | ], 1241 | ], 1242 | ] 1243 | Type: container 1244 | 1245 | GPUPredictJobDefinition: 1246 | Type: AWS::Batch::JobDefinition 1247 | Properties: 1248 | ContainerProperties: 1249 | Command: 1250 | - "/bin/bash" 1251 | - "run_aws_predict_ver.sh" 1252 | - "-i" 1253 | - !Join ["", ["s3://", !Ref ResultsS3]] 1254 | - "-o" 1255 | - !Join ["", ["s3://", !Ref ResultsS3]] 1256 | - "-w" 1257 | - "/work" 1258 | - "-d" 1259 | - "/fsx/aws-rosettafold-ref-data" 1260 | - "-x" 1261 | - "/fsx/aws-rosettafold-ref-data" 1262 | - "-c" 1263 
| - "4" 1264 | - "-m" 1265 | - "16" 1266 | Image: 1267 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]] 1268 | LogConfiguration: 1269 | LogDriver: awslogs 1270 | MountPoints: 1271 | - ContainerPath: /fsx 1272 | ReadOnly: False 1273 | SourceVolume: fsx 1274 | ResourceRequirements: 1275 | - Type: VCPU 1276 | Value: 4 1277 | - Type: MEMORY 1278 | Value: 16000 1279 | - Type: GPU 1280 | Value: 1 1281 | Volumes: 1282 | - Name: fsx 1283 | Host: 1284 | SourcePath: /fsx 1285 | JobDefinitionName: 1286 | !Join [ 1287 | "-", 1288 | [ 1289 | "aws-rosettafold", 1290 | "job-def-gpupredict", 1291 | !Select [ 1292 | 4, 1293 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1294 | ], 1295 | ], 1296 | ] 1297 | PlatformCapabilities: 1298 | - EC2 1299 | PropagateTags: true 1300 | RetryStrategy: 1301 | Attempts: 3 1302 | Tags: 1303 | Application: AWS-RoseTTAFold 1304 | Name: 1305 | !Join [ 1306 | "-", 1307 | [ 1308 | "aws-rosettafold", 1309 | "job-def-gpupredict", 1310 | !Select [ 1311 | 4, 1312 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1313 | ], 1314 | ], 1315 | ] 1316 | Type: container 1317 | 1318 | CPUPredictJobDefinition: 1319 | Type: AWS::Batch::JobDefinition 1320 | Properties: 1321 | ContainerProperties: 1322 | Command: 1323 | - "/bin/bash" 1324 | - "run_aws_predict_ver.sh" 1325 | - "-i" 1326 | - !Join ["", ["s3://", !Ref ResultsS3]] 1327 | - "-o" 1328 | - !Join ["", ["s3://", !Ref ResultsS3]] 1329 | - "-w" 1330 | - "/work" 1331 | - "-d" 1332 | - "/fsx/aws-rosettafold-ref-data" 1333 | - "-x" 1334 | - "/fsx/aws-rosettafold-ref-data" 1335 | - "-c" 1336 | - "4" 1337 | - "-m" 1338 | - "64" 1339 | Image: 1340 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]] 1341 | LogConfiguration: 1342 | LogDriver: awslogs 1343 | MountPoints: 1344 | - ContainerPath: /fsx 1345 | ReadOnly: False 1346 | SourceVolume: fsx 1347 | ResourceRequirements: 1348 | - Type: VCPU 1349 | Value: 4 1350 | - Type: MEMORY 1351 | Value: 64000 1352 | Volumes: 1353 | - Name: fsx 1354 | Host: 1355 | SourcePath: /fsx 1356 | JobDefinitionName: 1357 | !Join [ 1358 | "-", 1359 | [ 1360 | "aws-rosettafold", 1361 | "job-def-cpupredict", 1362 | !Select [ 1363 | 4, 1364 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1365 | ], 1366 | ], 1367 | ] 1368 | PlatformCapabilities: 1369 | - EC2 1370 | PropagateTags: true 1371 | RetryStrategy: 1372 | Attempts: 3 1373 | Tags: 1374 | Application: AWS-RoseTTAFold 1375 | Name: 1376 | !Join [ 1377 | "-", 1378 | [ 1379 | "aws-rosettafold", 1380 | "job-def-cpupredict", 1381 | !Select [ 1382 | 4, 1383 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]], 1384 | ], 1385 | ], 1386 | ] 1387 | Type: container 1388 | 1389 | Outputs: 1390 | CodeRepoUri: 1391 | Description: URI for cloning the CodeCommit repository over HTTPS 1392 | Value: !GetAtt RFCodeRepository.CloneUrlHttp 1393 | Export: 1394 | Name: !Join [":", [!Ref "AWS::StackName", CodeRepoUri]] 1395 | CPUJobQueueName: 1396 | Description: Name of the CPU job queue. 1397 | Value: !Select [5, !Split [":", !Ref CPUJobQueue]] 1398 | Export: 1399 | Name: !Join [":", [!Ref "AWS::StackName", CPUJobQueueName]] 1400 | GPUJobQueueName: 1401 | Description: Name of the GPU job queue. 1402 | Value: !Select [5, !Split [":", !Ref GPUJobQueue]] 1403 | Export: 1404 | Name: !Join [":", [!Ref "AWS::StackName", GPUJobQueueName]] 1405 | CPUDataPrepJobDefinition: 1406 | Description: Name of the data prep CPU job definition. 
    Value: !Select [5, !Split [":", !Ref CPUDataPrepJobDefinition]]
    Export:
      Name: !Join [":", [!Ref "AWS::StackName", CPUDataPrepJobDefinition]]
  GPUPredictJobDefinition:
    Description: Name of the predict GPU job definition.
    Value: !Select [5, !Split [":", !Ref GPUPredictJobDefinition]]
    Export:
      Name: !Join [":", [!Ref "AWS::StackName", GPUPredictJobDefinition]]
  CPUPredictJobDefinition:
    Description: Name of the predict CPU job definition.
    Value: !Select [5, !Split [":", !Ref CPUPredictJobDefinition]]
    Export:
      Name: !Join [":", [!Ref "AWS::StackName", CPUPredictJobDefinition]]
--------------------------------------------------------------------------------
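The Outputs block above exports the generated Batch queue and job definition names so client code can discover them. A minimal sketch of reading them back with boto3; the stack name "aws-rosettafold" is a placeholder for whatever name was chosen at deployment:

import boto3

# "aws-rosettafold" is a placeholder; substitute the deployed stack name.
cfn = boto3.client("cloudformation")
stack = cfn.describe_stacks(StackName="aws-rosettafold")["Stacks"][0]
outputs = {o["OutputKey"]: o["OutputValue"] for o in stack["Outputs"]}
print(outputs["CPUJobQueueName"], outputs["GPUJobQueueName"])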
/config/container_buildspec.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  pre_build:
    commands:
      - echo Logging in to Amazon ECR...
      - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com
  build:
    commands:
      - echo Build started on `date`
      - echo Building the Docker image...
      - docker build -t $IMAGE_REPO_NAME:$IMAGE_TAG config
      - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
  post_build:
    commands:
      - echo Build completed on `date`
      - echo Pushing the Docker image...
      - docker push $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
--------------------------------------------------------------------------------
/config/download_ref_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash
yum install wget tar -y
cd /fsx

# NOTE: The RoseTTAFold network weights are covered under the Rosetta-DL software license.
# Please see https://files.ipd.uw.edu/pub/RoseTTAFold/Rosetta-DL_LICENSE.txt for more
# information.
wget https://files.ipd.uw.edu/pub/RoseTTAFold/weights.tar.gz
tar xfz weights.tar.gz
rm weights.tar.gz

# uniref30 [46G]
wget http://wwwuser.gwdg.de/~compbiol/uniclust/2020_06/UniRef30_2020_06_hhsuite.tar.gz
mkdir -p UniRef30_2020_06
tar xfz UniRef30_2020_06_hhsuite.tar.gz -C ./UniRef30_2020_06
rm UniRef30_2020_06_hhsuite.tar.gz

# structure templates (including *_a3m.ffdata, *_a3m.ffindex) [over 100G]
wget https://files.ipd.uw.edu/pub/RoseTTAFold/pdb100_2021Mar03.tar.gz
tar xfz pdb100_2021Mar03.tar.gz
rm pdb100_2021Mar03.tar.gz

# BFD [272G]
wget https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz
mkdir -p bfd
tar xfz bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz -C ./bfd
rm bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz
--------------------------------------------------------------------------------
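Extracted, the reference data above comes to roughly half a terabyte (UniRef30 ~46 GB, structure templates over 100 GB, BFD ~272 GB, plus the network weights), so it is worth confirming that the FSx for Lustre volume has room before starting the downloads. A quick sanity check, assuming the same /fsx mount point used by the script:

import shutil

# /fsx is the Lustre mount point targeted by download_ref_data.sh above.
total, used, free = shutil.disk_usage("/fsx")
print(f"/fsx: {free / 1e12:.2f} TB free of {total / 1e12:.2f} TB")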
/config/run_aws_data_prep_ver.sh:
--------------------------------------------------------------------------------
#!/bin/bash

############################################################
# Run the RoseTTAFold E2E data prep stage (MSA generation, secondary
# structure prediction, and template search) on AWS
## Options
# -i (Required) S3 path to input folder
# -o (Required) S3 path to output folder
# -n Input file name (e.g. input.fa)
# -p Prefix to use for output files
# -w Path to working folder on run environment file system
# -d Path to database folder on run environment file system
# -c Max CPU count
# -m Max memory amount (GB)
#
# Example CMD
# ./AWS-RoseTTAFold/run_aws_data_prep_ver.sh \
# -i s3://032243382548-rf-run-data/input \
# -o s3://032243382548-rf-run-data/output \
# -n input.fa \
# -w ~/work \
# -d /fsx \
# -c 16 \
# -m 64

# make the script stop when an error (non-true exit code) occurs
set -e
START="$(date +%s)"
############################################################
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('conda' 'shell.bash' 'hook' 2> /dev/null)"
eval "$__conda_setup"
unset __conda_setup
# <<< conda initialize <<<
############################################################

unset -v SCRIPT PIPEDIR UUID INPUT_S3_FOLDER OUTPUT_S3_FOLDER \
    INPUT_FILE WDIR DBDIR CPU MEM

SCRIPT=`realpath -s $0`
SCRIPTDIR=`dirname $SCRIPT`

while getopts "i:o:n:p:w:d:c:m:" option
do
    case $option in
    i) INPUT_S3_FOLDER=$OPTARG ;; # s3 URI to input folder
    o) OUTPUT_S3_FOLDER=$OPTARG ;; # s3 URI to output folder
    n) INPUT_FILE=$OPTARG ;; # input file name, e.g. input.fa
    p) UUID=$OPTARG ;; # File prefix
    w) WDIR=$OPTARG ;; # path to local working folder
    d) DBDIR=$OPTARG ;; # path to local sequence databases
    c) CPU=$OPTARG ;; # vCPU
    m) MEM=$OPTARG ;; # MEM (GB)
    *) exit 1 ;;
    esac
done

[ -z "$INPUT_S3_FOLDER" ] && { echo "\$INPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$OUTPUT_S3_FOLDER" ] && { echo "\$OUTPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$INPUT_FILE" ] && { INPUT_FILE="input.fa"; }
[ -z "$WDIR" ] && { WDIR=$SCRIPTDIR; }
[ -z "$DBDIR" ] && { DBDIR=$WDIR; }
[ -z "$CPU" ] && { CPU="16"; }
[ -z "$MEM" ] && { MEM="64"; }

if [ -z "$UUID" ]
then
    if [ -z "$AWS_BATCH_JOB_ID" ]
    then
        UUID=`date "+%Y%m%d%H%M%S"`;
    else
        UUID=$AWS_BATCH_JOB_ID;
    fi
fi

IN=$WDIR/input.fa
aws s3 cp $INPUT_S3_FOLDER/$INPUT_FILE $IN

ls $WDIR
#LENGTH=`tail -n1 $IN | wc -m`
LENGTH=`grep -v -e "^>" $IN | tr -d "\n" | wc -m`

conda activate RoseTTAFold

############################################################
# 1. generate MSAs
############################################################
MSA_START="$(date +%s)"

if [ ! -s $WDIR/t000_.msa0.a3m ]
then
    export PIPEDIR=$DBDIR
    echo "Running HHblits"
    $SCRIPTDIR/input_prep/make_msa.sh $IN $WDIR $CPU $MEM $DBDIR
fi

MSA_COUNT=`grep "^>" $WDIR/t000_.msa0.a3m -c`

aws s3 cp $WDIR/t000_.msa0.a3m $OUTPUT_S3_FOLDER/$UUID.msa0.a3m

MSA_DURATION=$[ $(date +%s) - ${MSA_START} ]
echo "${UUID} MSA duration: ${MSA_DURATION} sec"

############################################################
# 2. predict secondary structure for HHsearch run
############################################################
SS_START="$(date +%s)"
if [ ! -s $WDIR/t000_.ss2 ]
then
    export PIPEDIR=$SCRIPTDIR
    echo "Running PSIPRED"
    $SCRIPTDIR/input_prep/make_ss.sh $WDIR/t000_.msa0.a3m $WDIR/t000_.ss2
fi

aws s3 cp $WDIR/t000_.ss2 $OUTPUT_S3_FOLDER/$UUID.ss2

SS_DURATION=$[ $(date +%s) - ${SS_START} ]
echo "${UUID} SS duration: ${SS_DURATION} sec"

############################################################
# 3. search for templates
############################################################
TEMPLATE_START="$(date +%s)"
DB="$DBDIR/pdb100_2021Mar03/pdb100_2021Mar03"
if [ ! -s $WDIR/t000_.hhr ]
then
    echo "Running hhsearch"
    HH="hhsearch -b 50 -B 500 -z 50 -Z 500 -mact 0.05 -cpu $CPU -maxmem $MEM -aliw 100000 -e 100 -p 5.0 -d $DB"
    cat $WDIR/t000_.ss2 $WDIR/t000_.msa0.a3m > $WDIR/t000_.msa0.ss2.a3m
    $HH -i $WDIR/t000_.msa0.ss2.a3m -o $WDIR/t000_.hhr -atab $WDIR/t000_.atab -v 2
fi

TEMPLATE_COUNT=`grep "^No [[:digit:]]*$" $WDIR/t000_.hhr -c`

aws s3 cp $WDIR/t000_.msa0.ss2.a3m $OUTPUT_S3_FOLDER/$UUID.msa0.ss2.a3m
aws s3 cp $WDIR/t000_.hhr $OUTPUT_S3_FOLDER/$UUID.hhr
aws s3 cp $WDIR/t000_.atab $OUTPUT_S3_FOLDER/$UUID.atab

TEMPLATE_DURATION=$[ $(date +%s) - ${TEMPLATE_START} ]
echo "${UUID} template search duration: ${TEMPLATE_DURATION} sec"

TOTAL_DATA_PREP_DURATION=$[ $(date +%s) - ${START} ]
echo "${UUID} total data prep duration: ${TOTAL_DATA_PREP_DURATION} sec"

# Collect metrics
echo "DATA_PREP:" >> $WDIR/metrics.yaml
echo " JOB_ID: ${UUID}" >> $WDIR/metrics.yaml
echo " INPUT_S3_FOLDER: ${INPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " INPUT_FILE: ${INPUT_FILE}" >> $WDIR/metrics.yaml
echo " OUTPUT_S3_FOLDER: ${OUTPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " WDIR: ${WDIR}" >> $WDIR/metrics.yaml
echo " DBDIR: ${DBDIR}" >> $WDIR/metrics.yaml
echo " CPU: ${CPU}" >> $WDIR/metrics.yaml
echo " MEM: ${MEM}" >> $WDIR/metrics.yaml
echo " LENGTH: ${LENGTH}" >> $WDIR/metrics.yaml
echo " MSA_COUNT: ${MSA_COUNT}" >> $WDIR/metrics.yaml
echo " TEMPLATE_COUNT: ${TEMPLATE_COUNT}" >> $WDIR/metrics.yaml
echo " START_TIME: ${START}" >> $WDIR/metrics.yaml
echo " MSA_DURATION: ${MSA_DURATION}" >> $WDIR/metrics.yaml
echo " SS_DURATION: ${SS_DURATION}" >> $WDIR/metrics.yaml
echo " TEMPLATE_DURATION: ${TEMPLATE_DURATION}" >> $WDIR/metrics.yaml
echo " TOTAL_DATA_PREP_DURATION: ${TOTAL_DATA_PREP_DURATION}" >> $WDIR/metrics.yaml

aws s3 cp $WDIR/metrics.yaml $OUTPUT_S3_FOLDER/metrics.yaml

echo "Done"
--------------------------------------------------------------------------------
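The script above is what the CPU data prep job definition in the CloudFormation template wraps. For reference, a minimal boto3 sketch of submitting it directly; the queue and job definition names in angle brackets are placeholders whose real values come from the stack outputs:

import boto3

batch = boto3.client("batch")

response = batch.submit_job(
    jobName="rf-data-prep-example",
    jobQueue="<CPUJobQueueName>",  # placeholder
    jobDefinition="<CPUDataPrepJobDefinition>",  # placeholder
    containerOverrides={
        "command": [
            "/bin/bash", "run_aws_data_prep_ver.sh",
            "-i", "s3://my-bucket/my-job",  # placeholder S3 prefix
            "-o", "s3://my-bucket/my-job",
            "-n", "input.fa",
            "-p", "my-job",
            "-w", "/work",
            "-d", "/fsx/aws-rosettafold-ref-data",
            "-c", "8",
            "-m", "32",
        ],
        "resourceRequirements": [
            {"type": "VCPU", "value": "8"},
            {"type": "MEMORY", "value": "32000"},
        ],
    },
)
print(f"Data prep job {response['jobId']} submitted")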
/config/run_aws_predict_ver.sh:
--------------------------------------------------------------------------------
#!/bin/bash

############################################################
# Run the RoseTTAFold end-to-end prediction stage on AWS
## Options
# -i (Required) S3 path to input folder
# -o (Required) S3 path to output folder
# -p Prefix to use for output files
# -w Path to working folder on run environment file system
# -d Path to database folder on run environment file system
# -x Path to model weights folder on run environment
# -c Max CPU count
# -m Max memory amount (GB)
#
# Example CMD
# ./AWS-RoseTTAFold/run_aws_predict_ver.sh \
# -i s3://032243382548-rf-run-data/input \
# -o s3://032243382548-rf-run-data/output \
# -w ~/work \
# -d /fsx/RoseTTAFold \
# -x /fsx/RoseTTAFold \
# -c 16 \
# -m 64

# make the script stop when an error (non-true exit code) occurs
set -e
START="$(date +%s)"
############################################################
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('conda' 'shell.bash' 'hook' 2> /dev/null)"
eval "$__conda_setup"
unset __conda_setup
# <<< conda initialize <<<
############################################################

unset -v SCRIPT PIPEDIR UUID INPUT_S3_FOLDER OUTPUT_S3_FOLDER \
    INPUT_FILE WDIR DBDIR MODEL_WEIGHTS_DIR CPU MEM

SCRIPT=`realpath -s $0`
SCRIPTDIR=`dirname $SCRIPT`

while getopts "i:o:p:w:d:x:c:m:" option
do
    case $option in
    i) INPUT_S3_FOLDER=$OPTARG ;; # s3 URI to input folder
    o) OUTPUT_S3_FOLDER=$OPTARG ;; # s3 URI to output folder
    p) UUID=$OPTARG ;; # File prefix
    w) WDIR=$OPTARG ;; # path to local working folder
    d) DBDIR=$OPTARG ;; # path to local sequence databases
    x) MODEL_WEIGHTS_DIR=$OPTARG ;; # path to local weights
    c) CPU=$OPTARG ;; # vCPU
    m) MEM=$OPTARG ;; # MEM (GB)
    *) exit 1 ;;
    esac
done

[ -z "$INPUT_S3_FOLDER" ] && { echo "\$INPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$OUTPUT_S3_FOLDER" ] && { echo "\$OUTPUT_S3_FOLDER undefined"; exit 1; }
[ -z "$WDIR" ] && { WDIR=$SCRIPTDIR; }
[ -z "$DBDIR" ] && { DBDIR=$WDIR; }
[ -z "$MODEL_WEIGHTS_DIR" ] && { MODEL_WEIGHTS_DIR=$WDIR; }
[ -z "$CPU" ] && { CPU="16"; }
[ -z "$MEM" ] && { MEM="64"; }
[ -z "$CUDA_VISIBLE_DEVICES" ] && { CUDA_VISIBLE_DEVICES="99"; }

if [ -z "$UUID" ]
then
    if [ -z "$AWS_BATCH_JOB_ID" ]
    then
        UUID=`date "+%Y%m%d%H%M%S"`;
    else
        UUID=$AWS_BATCH_JOB_ID;
    fi
fi

IN=$WDIR/input.fa

conda activate RoseTTAFold

aws s3 cp $INPUT_S3_FOLDER/$UUID.msa0.a3m $WDIR/t000_.msa0.a3m
aws s3 cp $INPUT_S3_FOLDER/$UUID.hhr $WDIR/t000_.hhr
aws s3 cp $INPUT_S3_FOLDER/$UUID.atab $WDIR/t000_.atab
aws s3 cp $INPUT_S3_FOLDER/metrics.yaml $WDIR/metrics.yaml

############################################################
# End-to-end prediction
############################################################
PREDICT_START="$(date +%s)"
if [ ! -s $WDIR/t000_.3track.npz ]
then
    echo "Running end-to-end prediction"
    DB="$DBDIR/pdb100_2021Mar03/pdb100_2021Mar03"

    python $SCRIPTDIR/network/predict_e2e.py \
        -m $MODEL_WEIGHTS_DIR/weights \
        -i $WDIR/t000_.msa0.a3m \
        -o $WDIR/t000_.e2e \
        --hhr $WDIR/t000_.hhr \
        --atab $WDIR/t000_.atab \
        --db $DB
fi

aws s3 cp $WDIR/t000_.e2e.pdb $OUTPUT_S3_FOLDER/$UUID.e2e.pdb
aws s3 cp $WDIR/t000_.e2e_init.pdb $OUTPUT_S3_FOLDER/$UUID.e2e_init.pdb
aws s3 cp $WDIR/t000_.e2e.npz $OUTPUT_S3_FOLDER/$UUID.e2e.npz

TOTAL_PREDICT_DURATION=$[ $(date +%s) - ${PREDICT_START} ]
echo "${UUID} prediction duration: ${TOTAL_PREDICT_DURATION} sec"

# Collect metrics
echo "PREDICT:" >> $WDIR/metrics.yaml
echo " JOB_ID: ${UUID}" >> $WDIR/metrics.yaml
echo " INPUT_S3_FOLDER: ${INPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " OUTPUT_S3_FOLDER: ${OUTPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
echo " WDIR: ${WDIR}" >> $WDIR/metrics.yaml
echo " DBDIR: ${DBDIR}" >> $WDIR/metrics.yaml
echo " MODEL_WEIGHTS_DIR: ${MODEL_WEIGHTS_DIR}" >> $WDIR/metrics.yaml
echo " CPU: ${CPU}" >> $WDIR/metrics.yaml
echo " MEM: ${MEM}" >> $WDIR/metrics.yaml
echo " GPU: ${CUDA_VISIBLE_DEVICES}" >> $WDIR/metrics.yaml
echo " START_TIME: ${PREDICT_START}" >> $WDIR/metrics.yaml
echo " TOTAL_PREDICT_DURATION: ${TOTAL_PREDICT_DURATION}" >> $WDIR/metrics.yaml

aws s3 cp $WDIR/metrics.yaml $OUTPUT_S3_FOLDER/metrics.yaml

echo "Done"
--------------------------------------------------------------------------------
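The prediction stage is normally queued with a dependency on the data prep job, so it starts only after the MSA, secondary structure, and template files have landed in S3. A minimal boto3 sketch of that submission; names in angle brackets are placeholders from the stack outputs:

import boto3

batch = boto3.client("batch")
data_prep_job_id = "<jobId returned by the data prep submission>"  # placeholder

response = batch.submit_job(
    jobName="rf-predict-example",
    jobQueue="<GPUJobQueueName>",  # placeholder
    jobDefinition="<GPUPredictJobDefinition>",  # placeholder
    # Start only after the data prep job has succeeded.
    dependsOn=[{"jobId": data_prep_job_id, "type": "SEQUENTIAL"}],
    containerOverrides={
        "command": [
            "/bin/bash", "run_aws_predict_ver.sh",
            "-i", "s3://my-bucket/my-job",  # placeholder S3 prefix
            "-o", "s3://my-bucket/my-job",
            "-p", "my-job",
            "-w", "/work",
            "-d", "/fsx/aws-rosettafold-ref-data",
            "-x", "/fsx/aws-rosettafold-ref-data",
            "-c", "4",
            "-m", "16",
        ],
        "resourceRequirements": [
            {"type": "VCPU", "value": "4"},
            {"type": "MEMORY", "value": "16000"},
            {"type": "GPU", "value": "1"},
        ],
    },
)
print(f"Predict job {response['jobId']} submitted")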
/data/T1028.fa:
--------------------------------------------------------------------------------
>T1028 CalU17, Micromonospora echinospora, 316 residues|
MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQAFPRDWFTDETPQIPVTLPDYLIDRHQVTVAQFAAFVSRTGYVTSAERAGGSMVYGEQYWEIREGACWHRPAGYGSGIRGRDDHPVVHISFADAEAYARWAGRRLPTESEWERAATGPSYRLWPWGDTWDSRNANTAEHTAGALGDLDAWRTWWGAIHAVQGPMPQTTPVGAFSPRGDSVDGCADMTGNVYEWTSTLAHLYSPATRCDPTIHLVMGRSRVIRGGSWMNFRYQVRCAERLYGDPTGWSNFALGFRCARDVTAVPHVDDNGR
--------------------------------------------------------------------------------
/data/T1036s1.fa:
--------------------------------------------------------------------------------
>T1036s1 Monoclonal antibody 93k, Varicella-zoster virus, strain pOka, subunit 1, 622 residues|
TKPTFYVCPPPTGSTIVRLEPPRTCPDYHLGKNFTEGIAVVYKENIAAYKFKATVYYKDVIVSTAWAGSSYTQITNRYADRVPIPVSEITDTIDKFGKCSSKATYVRNNHKVEAFNEDKNPQDMPLIASKYNSVGSKAWHTTNDTYMVAGTPGTYRTGTSVNCIIEEVEARSIFPYDSFGLSTGDIIYMSPFFGLRDGAYREHSNYAMDRFHQFEGYRQRDLDTRALLEPAARNFLVTPHLTVGWNWKPKRTEVCSLVKWREVEDVVRDEYAHNFRFTMKTLSTTFISETNEFNLNQIHLSQCVKEEARAIINRIYTTRYNSSHVRTGDIQTYLARGGFVVVFQPLLSNSLARLYLQELVRENTNHSPQKHPTRNTRSRRSVPVELRANRTITTTSSVEFAMLQFTYDHIQEHVNEMLARISSSWCQLQNRERALWSGLFPINPSALASTILDQRVKARILGDVISVSNCPELGSDTRIILQNSMRVSGSTTRCYSRPLISIVSLNGSGTVEGQLGTDNELIMSRDLLEPCVANHKRYFLFGHHYVYYEDYRYVREIAVHDVGMISTYVDLNLTLLKDREFMPLRVYTRDELRDTGLLDYSEIQRRNQMHSLRFYDIDKVVQ
--------------------------------------------------------------------------------
/data/T1078.fa:
--------------------------------------------------------------------------------
>T1078 Tsp1, Trichoderma virens, 138 residues|
MAAPTPADKSMMAAVPEWTITNLKRVCNAGNTSCTWTFGVDTHLATATSCTYVVKANANASQASGGPVTCGPYTITSSWSGQFGPNNGFTTFAVTDFSKKLIVWPAYTDVQVQAGKVVSPNQSYAPANLPLEHHHHHH
--------------------------------------------------------------------------------
/img/AWS-RoseTTAFold-arch.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/AWS-RoseTTAFold-arch.png -------------------------------------------------------------------------------- /img/AWS-RoseTTAFold-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/AWS-RoseTTAFold-deploy.png -------------------------------------------------------------------------------- /img/LaunchStack.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/LaunchStack.jpg -------------------------------------------------------------------------------- /img/RF_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/RF_workflow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | py3Dmol 3 | boto3 4 | sagemaker 5 | matplotlib 6 | pyyaml -------------------------------------------------------------------------------- /rfutils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/rfutils/__init__.py -------------------------------------------------------------------------------- /rfutils/rfutils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for the AWS-RoseTTAFold notebook. 
3 | """ 4 | 5 | ## Load dependencies 6 | from Bio import SeqIO 7 | import boto3 8 | from datetime import datetime 9 | import json 10 | import matplotlib.pyplot as plt 11 | from matplotlib import colors 12 | import numpy as np 13 | import os 14 | import pandas as pd 15 | import py3Dmol 16 | import yaml 17 | from re import sub 18 | import sagemaker 19 | import string 20 | from string import ascii_uppercase, ascii_lowercase 21 | from time import sleep 22 | import uuid 23 | 24 | # Get service clients 25 | session = boto3.session.Session() 26 | sm_session = sagemaker.session.Session() 27 | region = session.region_name 28 | role = sagemaker.get_execution_role() 29 | s3 = boto3.client("s3", region_name=region) 30 | 31 | pymol_color_list = [ 32 | "#33ff33", 33 | "#00ffff", 34 | "#ff33cc", 35 | "#ffff00", 36 | "#ff9999", 37 | "#e5e5e5", 38 | "#7f7fff", 39 | "#ff7f00", 40 | "#7fff7f", 41 | "#199999", 42 | "#ff007f", 43 | "#ffdd5e", 44 | "#8c3f99", 45 | "#b2b2b2", 46 | "#007fff", 47 | "#c4b200", 48 | "#8cb266", 49 | "#00bfbf", 50 | "#b27f7f", 51 | "#fcd1a5", 52 | "#ff7f7f", 53 | "#ffbfdd", 54 | "#7fffff", 55 | "#ffff7f", 56 | "#00ff7f", 57 | "#337fcc", 58 | "#d8337f", 59 | "#bfff3f", 60 | "#ff7fff", 61 | "#d8d8ff", 62 | "#3fffbf", 63 | "#b78c4c", 64 | "#339933", 65 | "#66b2b2", 66 | "#ba8c84", 67 | "#84bf00", 68 | "#b24c66", 69 | "#7f7f7f", 70 | "#3f3fa5", 71 | "#a5512b", 72 | ] 73 | 74 | pymol_cmap = colors.ListedColormap(pymol_color_list) 75 | alphabet_list = list(ascii_uppercase + ascii_lowercase) 76 | 77 | aatypes = set("ACDEFGHIKLMNPQRSTVWY") 78 | 79 | 80 | def create_job_name(suffix=None): 81 | 82 | """ 83 | Define a simple job identifier 84 | """ 85 | 86 | if suffix == None: 87 | return datetime.utcnow().strftime("%Y%m%dT%H%M%S") 88 | else: 89 | ## Ensure that the suffix conforms to the Batch requirements, (only letters, 90 | ## numbers, hyphens, and underscores are allowed). 91 | suffix = sub("\W", "_", suffix) 92 | return datetime.utcnow().strftime("%Y%m%dT%H%M%S") + "_" + suffix 93 | 94 | 95 | def display_msa(jobId, bucket): 96 | """ 97 | Display the MSA plot in a Jupyter notebook cell 98 | """ 99 | 100 | info = get_batch_job_info(jobId) 101 | 102 | if info["status"] == "SUCCEEDED": 103 | print( 104 | f"Downloading MSA file from s3://{bucket}/{info['jobName']}/{info['jobName']}.msa0.a3m" 105 | ) 106 | s3.download_file( 107 | bucket, 108 | f"{info['jobName']}/{info['jobName']}.msa0.a3m", 109 | "data/alignment.msa", 110 | ) 111 | msa_all = parse_a3m("data/alignment.msa") 112 | plot_msa_info(msa_all) 113 | else: 114 | print( 115 | f"Data prep job {info['jobId']} is in {info['status']} status. Please try again once the job has completed." 
        )


def display_structure(
    jobId,
    bucket,
    color="lDDT",
    show_sidechains=False,
    show_mainchains=False,
    chains=1,
    vmin=0.5,
    vmax=0.9,
):
    """
    Display the predicted structure in a Jupyter notebook cell
    """
    if color not in ["chain", "lDDT", "rainbow"]:
        raise ValueError("Color must be 'lDDT' (default), 'chain', or 'rainbow'")

    info = get_batch_job_info(jobId)

    if info["status"] == "SUCCEEDED":
        print(
            f"Downloading PDB file from s3://{bucket}/{info['jobName']}/{info['jobName']}.e2e.pdb"
        )
        s3.download_file(
            bucket, f"{info['jobName']}/{info['jobName']}.e2e.pdb", "data/e2e.pdb"
        )
        plot_pdb(
            "data/e2e.pdb",
            show_sidechains=show_sidechains,
            show_mainchains=show_mainchains,
            color=color,
            chains=chains,
            vmin=vmin,
            vmax=vmax,
        ).show()
        if color == "lDDT":
            plot_plddt_legend().show()
    else:
        print(
            f"{info['jobId']} is in {info['status']} status. Please try again once the job has completed."
        )


def get_batch_job_info(jobId):

    """
    Retrieve and format information about a batch job.
    """

    client = boto3.client("batch")
    job_description = client.describe_jobs(jobs=[jobId])

    output = {
        "jobArn": job_description["jobs"][0]["jobArn"],
        "jobName": job_description["jobs"][0]["jobName"],
        "jobId": job_description["jobs"][0]["jobId"],
        "status": job_description["jobs"][0]["status"],
        "createdAt": datetime.utcfromtimestamp(
            job_description["jobs"][0]["createdAt"] / 1000
        ).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "dependsOn": job_description["jobs"][0]["dependsOn"],
        "tags": job_description["jobs"][0]["tags"],
    }

    if output["status"] in ["STARTING", "RUNNING", "SUCCEEDED", "FAILED"]:
        output["logStreamName"] = job_description["jobs"][0]["container"][
            "logStreamName"
        ]
    return output


def get_batch_logs(logStreamName):

    """
    Retrieve and format logs for batch job.
    """

    client = boto3.client("logs")
    try:
        response = client.get_log_events(
            logGroupName="/aws/batch/job", logStreamName=logStreamName
        )
    except client.exceptions.ResourceNotFoundException:
        return f"Log stream {logStreamName} does not exist. 
Please try again in a few minutes" 202 | 203 | logs = pd.DataFrame.from_dict(response["events"]) 204 | logs.timestamp = logs.timestamp.transform( 205 | lambda x: datetime.fromtimestamp(x / 1000) 206 | ) 207 | logs.drop("ingestionTime", axis=1, inplace=True) 208 | return logs 209 | 210 | 211 | def get_rf_job_info( 212 | cpu_queue="AWS-RoseTTAFold-CPU", gpu_queue="AWS-RoseTTAFold-GPU", hrs_in_past=1 213 | ): 214 | 215 | """ 216 | Display information about recent AWS-RoseTTAFold jobs 217 | """ 218 | from datetime import datetime 219 | 220 | batch_client = boto3.client("batch") 221 | recent_jobs = list_recent_jobs([cpu_queue, gpu_queue], hrs_in_past) 222 | recent_job_df = pd.DataFrame.from_dict(recent_jobs) 223 | list_of_lists = [] 224 | if len(recent_job_df) > 0: 225 | detail_list = batch_client.describe_jobs(jobs=recent_job_df.jobId.to_list()) 226 | for job in detail_list["jobs"]: 227 | resource_dict = {} 228 | for resource in job["container"]["resourceRequirements"]: 229 | resource_dict[resource["type"]] = resource["value"] 230 | row = [ 231 | job["jobName"], 232 | job["jobId"], 233 | job["jobQueue"], 234 | job["status"], 235 | datetime.fromtimestamp(job["createdAt"] / 1000), 236 | datetime.fromtimestamp(job["startedAt"] / 1000) 237 | if "startedAt" in job 238 | else "NaT", 239 | datetime.fromtimestamp(job["stoppedAt"] / 1000) 240 | if "stoppedAt" in job 241 | else "NaT", 242 | str( 243 | datetime.fromtimestamp(job["stoppedAt"] / 1000) 244 | - datetime.fromtimestamp(job["startedAt"] / 1000) 245 | ) 246 | if "startedAt" in job and "stoppedAt" in job 247 | else "NaN", 248 | (job["stoppedAt"] / 1000) - (job["startedAt"] / 1000) 249 | if "startedAt" in job and "stoppedAt" in job 250 | else "NaN", 251 | job["jobDefinition"], 252 | job["container"]["logStreamName"] 253 | if "logStreamName" in job["container"] 254 | else "", 255 | int(resource_dict["VCPU"]), 256 | int(float(resource_dict["MEMORY"]) / 1000), 257 | int(resource_dict["GPU"]) if "GPU" in resource_dict else 0, 258 | ] 259 | list_of_lists.append(row) 260 | 261 | return pd.DataFrame( 262 | list_of_lists, 263 | columns=[ 264 | "jobName", 265 | "jobId", 266 | "jobQueue", 267 | "status", 268 | "createdAt", 269 | "startedAt", 270 | "stoppedAt", 271 | "duration", 272 | "duration_sec", 273 | "jobDefinition", 274 | "logStreamName", 275 | "vCPUs", 276 | "mem_GB", 277 | "GPUs", 278 | ], 279 | ).sort_values(by="jobName", ascending=False) 280 | 281 | 282 | def get_rf_job_metrics(job_name, bucket, region="us-east-1"): 283 | """ 284 | Retrieve RF job metrics from the metrics.yaml file 285 | """ 286 | 287 | s3.download_file( 288 | bucket, 289 | f"{job_name}/metrics.yaml", 290 | "data/metrics.yaml", 291 | ) 292 | 293 | with open("data/metrics.yaml", "r") as stream: 294 | try: 295 | metrics = yaml.safe_load(stream) 296 | except yaml.YAMLError as exc: 297 | print(exc) 298 | 299 | return metrics 300 | 301 | 302 | def get_rosettafold_batch_resources(region="us-east-1"): 303 | """ 304 | Retrieve a list of batch job definitions and queues created as part of an 305 | AWS-RoseTTAFold stack. 
306 | """ 307 | batch = boto3.client("batch", region_name=region) 308 | 309 | job_definition_response = batch.describe_job_definitions() 310 | list_of_lists = [] 311 | 312 | job_list = [] 313 | for jd in job_definition_response["jobDefinitions"]: 314 | if jd["status"] == "ACTIVE" and "aws-rosettafold" in jd["jobDefinitionName"]: 315 | name_split = jd["jobDefinitionName"].split("-") 316 | entry = { 317 | "stackId": name_split[5], 318 | "dataPrepJobDefinition": jd["jobDefinitionName"], 319 | } 320 | row = [ 321 | name_split[5], 322 | name_split[4], 323 | "Job Definition", 324 | jd["jobDefinitionName"], 325 | ] 326 | job_list.append(row) 327 | 328 | job_queue_response = batch.describe_job_queues() 329 | jq_list = [] 330 | for jq in job_queue_response["jobQueues"]: 331 | if ( 332 | jq["state"] == "ENABLED" 333 | and jq["status"] == "VALID" 334 | and "aws-rosettafold-queue" in jq["jobQueueName"] 335 | ): 336 | name_split = jq["jobQueueName"].split("-") 337 | row = [name_split[4], name_split[3], "Job Queue", jq["jobQueueName"]] 338 | job_list.append(row) 339 | 340 | df = pd.DataFrame( 341 | job_list, 342 | columns=["stackId", "instanceType", "resourceType", "resourceName"], 343 | ).sort_values(by=["stackId", "instanceType"], ascending=False) 344 | df["type"] = df["instanceType"] + df["resourceType"] 345 | df = df.pivot(index="stackId", columns="type", values=["resourceName"]) 346 | df.columns = df.columns.get_level_values(1) 347 | df = df.rename( 348 | columns={ 349 | "cpudataprepJob Definition": "CPUDataPrepJobDefinition", 350 | "cpuJob Queue": "CPUJobQueue", 351 | "cpupredictJob Definition": "CPUPredictJobDefinition", 352 | "gpupredictJob Definition": "GPUPredictJobDefinition", 353 | "gpuJob Queue": "GPUJobQueue", 354 | } 355 | ) 356 | return df 357 | 358 | 359 | def list_recent_jobs(job_queues, hrs_in_past=1): 360 | 361 | """ 362 | Display recently-submitted jobs. 363 | """ 364 | 365 | batch_client = boto3.client("batch") 366 | result = [] 367 | for queue in job_queues: 368 | recent_queue_jobs = batch_client.list_jobs( 369 | jobQueue=queue, 370 | filters=[ 371 | { 372 | "name": "AFTER_CREATED_AT", 373 | "values": [ 374 | str(round(datetime.now().timestamp()) - (hrs_in_past * 3600)) 375 | ], 376 | } 377 | ], 378 | ) 379 | result = result + recent_queue_jobs["jobSummaryList"] 380 | 381 | return result 382 | 383 | 384 | def parse_a3m(filename): 385 | 386 | """ 387 | Read A3M and convert letters into integers in the 0..20 range, 388 | Copied from https://github.com/RosettaCommons/RoseTTAFold/blob/main/network/parsers.py 389 | """ 390 | 391 | msa = [] 392 | table = str.maketrans(dict.fromkeys(string.ascii_lowercase)) 393 | # read file line by line 394 | for line in open(filename, "r"): 395 | # skip labels 396 | if line[0] == ">": 397 | continue 398 | # remove right whitespaces 399 | line = line.rstrip() 400 | # remove lowercase letters and append to MSA 401 | msa.append(line.translate(table)) 402 | # convert letters into numbers 403 | alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype="|S1").view(np.uint8) 404 | msa = np.array([list(s) for s in msa], dtype="|S1").view(np.uint8) 405 | for i in range(alphabet.shape[0]): 406 | msa[msa == alphabet[i]] = i 407 | # treat all unknown characters as gaps 408 | msa[msa > 20] = 20 409 | return msa 410 | 411 | 412 | def read_pdb_renum(pdb_filename, Ls=None): 413 | 414 | """ 415 | Process pdb file. 
416 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 417 | """ 418 | 419 | if Ls is not None: 420 | L_init = 0 421 | new_chain = {} 422 | for L, c in zip(Ls, alphabet_list): 423 | new_chain.update({i: c for i in range(L_init, L_init + L)}) 424 | L_init += L 425 | n, pdb_out = 1, [] 426 | resnum_, chain_ = 1, "A" 427 | for line in open(pdb_filename, "r"): 428 | if line[:4] == "ATOM": 429 | chain = line[21:22] 430 | resnum = int(line[22 : 22 + 5]) 431 | if resnum != resnum_ or chain != chain_: 432 | resnum_, chain_ = resnum, chain 433 | n += 1 434 | if Ls is None: 435 | pdb_out.append("%s%4i%s" % (line[:22], n, line[26:])) 436 | else: 437 | pdb_out.append( 438 | "%s%s%4i%s" % (line[:21], new_chain[n - 1], n, line[26:]) 439 | ) 440 | return "".join(pdb_out) 441 | 442 | 443 | def plot_msa_info(msa): 444 | 445 | """ 446 | Plot a representation of the MSA coverage. 447 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 448 | """ 449 | 450 | msa_arr = np.unique(msa, axis=0) 451 | total_msa_size = len(msa_arr) 452 | print(f"\n{total_msa_size} Sequences Found in Total\n") 453 | 454 | if total_msa_size > 1: 455 | plt.figure(figsize=(8, 5), dpi=100) 456 | plt.title("Sequence coverage") 457 | seqid = (msa[0] == msa_arr).mean(-1) 458 | seqid_sort = seqid.argsort() 459 | non_gaps = (msa_arr != 20).astype(float) 460 | non_gaps[non_gaps == 0] = np.nan 461 | plt.imshow( 462 | non_gaps[seqid_sort] * seqid[seqid_sort, None], 463 | interpolation="nearest", 464 | aspect="auto", 465 | cmap="rainbow_r", 466 | vmin=0, 467 | vmax=1, 468 | origin="lower", 469 | extent=(0, msa_arr.shape[1], 0, msa_arr.shape[0]), 470 | ) 471 | plt.plot((msa_arr != 20).sum(0), color="black") 472 | plt.xlim(0, msa_arr.shape[1]) 473 | plt.ylim(0, msa_arr.shape[0]) 474 | plt.colorbar( 475 | label="Sequence identity to query", 476 | ) 477 | plt.xlabel("Positions") 478 | plt.ylabel("Sequences") 479 | plt.show() 480 | else: 481 | print("Unable to display MSA of length 1") 482 | 483 | 484 | def plot_pdb( 485 | pred_output_path, 486 | show_sidechains=False, 487 | show_mainchains=False, 488 | color="lDDT", 489 | chains=None, 490 | Ls=None, 491 | vmin=0.5, 492 | vmax=0.9, 493 | color_HP=False, 494 | size=(800, 480), 495 | ): 496 | 497 | """ 498 | Create a 3D view of a pdb structure 499 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 500 | """ 501 | 502 | if chains is None: 503 | chains = 1 if Ls is None else len(Ls) 504 | 505 | view = py3Dmol.view( 506 | js="https://3dmol.org/build/3Dmol.js", width=size[0], height=size[1] 507 | ) 508 | view.addModel(read_pdb_renum(pred_output_path, Ls), "pdb") 509 | if color == "lDDT": 510 | view.setStyle( 511 | { 512 | "cartoon": { 513 | "colorscheme": { 514 | "prop": "b", 515 | "gradient": "roygb", 516 | "min": vmin, 517 | "max": vmax, 518 | } 519 | } 520 | } 521 | ) 522 | elif color == "rainbow": 523 | view.setStyle({"cartoon": {"color": "spectrum"}}) 524 | elif color == "chain": 525 | for n, chain, color in zip(range(chains), alphabet_list, pymol_color_list): 526 | view.setStyle({"chain": chain}, {"cartoon": {"color": color}}) 527 | if show_sidechains: 528 | BB = ["C", "O", "N"] 529 | HP = [ 530 | "ALA", 531 | "GLY", 532 | "VAL", 533 | "ILE", 534 | "LEU", 535 | "PHE", 536 | "MET", 537 | "PRO", 538 | "TRP", 539 | "CYS", 540 | "TYR", 541 | ] 542 | if color_HP: 543 | view.addStyle( 544 | {"and": [{"resn": HP}, {"atom": BB, "invert": True}]}, 545 | {"stick": {"colorscheme": "yellowCarbon", "radius": 0.3}}, 546 | ) 547 
| view.addStyle( 548 | {"and": [{"resn": HP, "invert": True}, {"atom": BB, "invert": True}]}, 549 | {"stick": {"colorscheme": "whiteCarbon", "radius": 0.3}}, 550 | ) 551 | view.addStyle( 552 | {"and": [{"resn": "GLY"}, {"atom": "CA"}]}, 553 | {"sphere": {"colorscheme": "yellowCarbon", "radius": 0.3}}, 554 | ) 555 | view.addStyle( 556 | {"and": [{"resn": "PRO"}, {"atom": ["C", "O"], "invert": True}]}, 557 | {"stick": {"colorscheme": "yellowCarbon", "radius": 0.3}}, 558 | ) 559 | else: 560 | view.addStyle( 561 | { 562 | "and": [ 563 | {"resn": ["GLY", "PRO"], "invert": True}, 564 | {"atom": BB, "invert": True}, 565 | ] 566 | }, 567 | {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}, 568 | ) 569 | view.addStyle( 570 | {"and": [{"resn": "GLY"}, {"atom": "CA"}]}, 571 | {"sphere": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}, 572 | ) 573 | view.addStyle( 574 | {"and": [{"resn": "PRO"}, {"atom": ["C", "O"], "invert": True}]}, 575 | {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}, 576 | ) 577 | if show_mainchains: 578 | BB = ["C", "O", "N", "CA"] 579 | view.addStyle( 580 | {"atom": BB}, {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}} 581 | ) 582 | view.zoomTo() 583 | return view 584 | 585 | 586 | def plot_plddt_legend(dpi=100): 587 | 588 | """ 589 | Create 3D Plot legend 590 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py 591 | """ 592 | 593 | thresh = [ 594 | "plDDT:", 595 | "Very low (<50)", 596 | "Low (60)", 597 | "OK (70)", 598 | "Confident (80)", 599 | "Very high (>90)", 600 | ] 601 | plt.figure(figsize=(1, 0.1), dpi=dpi) 602 | ######################################## 603 | for c in ["#FFFFFF", "#FF0000", "#FFFF00", "#00FF00", "#00FFFF", "#0000FF"]: 604 | plt.bar(0, 0, color=c) 605 | plt.legend( 606 | thresh, 607 | frameon=False, 608 | loc="center", 609 | ncol=6, 610 | handletextpad=1, 611 | columnspacing=1, 612 | markerscale=0.5, 613 | ) 614 | plt.axis(False) 615 | return plt 616 | 617 | 618 | def submit_2_step_job( 619 | bucket=sm_session.default_bucket(), 620 | job_name=uuid.uuid4(), 621 | data_prep_input_file="input.fa", 622 | data_prep_job_definition="AWS-RoseTTAFold-CPU", 623 | data_prep_queue="AWS-RoseTTAFold-CPU", 624 | data_prep_cpu=8, 625 | data_prep_mem=32, 626 | predict_job_definition="AWS-RoseTTAFold-GPU", 627 | predict_queue="AWS-RoseTTAFold-GPU", 628 | predict_cpu=4, 629 | predict_mem=16, 630 | predict_gpu=True, 631 | db_path="/fsx/aws-rosettafold-ref-data", 632 | weights_path="/fsx/aws-rosettafold-ref-data", 633 | ): 634 | 635 | """ 636 | Submit a 2-step RoseTTAFold prediction job to AWS Batch. 
637 | """ 638 | 639 | working_folder = f"s3://{bucket}/{job_name}" 640 | batch_client = boto3.client("batch") 641 | output_pdb_uri = f"{working_folder}/{job_name}.e2e.pdb" 642 | 643 | data_prep_response = submit_rf_data_prep_job( 644 | bucket=bucket, 645 | job_name=job_name, 646 | input_file=data_prep_input_file, 647 | job_definition=data_prep_job_definition, 648 | job_queue=data_prep_queue, 649 | cpu=data_prep_cpu, 650 | mem=data_prep_mem, 651 | db_path=db_path, 652 | ) 653 | 654 | predict_response = submit_rf_predict_job( 655 | bucket=bucket, 656 | job_name=job_name, 657 | job_definition=predict_job_definition, 658 | job_queue=predict_queue, 659 | cpu=predict_cpu, 660 | mem=predict_mem, 661 | gpu=predict_gpu, 662 | db_path=db_path, 663 | weights_path=weights_path, 664 | depends_on=data_prep_response["jobId"], 665 | ) 666 | 667 | print( 668 | f"Data prep job ID {data_prep_response['jobId']} and predict job ID {predict_response['jobId']} submitted" 669 | ) 670 | return [data_prep_response, predict_response] 671 | 672 | 673 | def submit_rf_data_prep_job( 674 | bucket=sm_session.default_bucket(), 675 | job_name=uuid.uuid4(), 676 | input_file="input.fa", 677 | job_definition="AWS-RoseTTAFold-CPU", 678 | job_queue="AWS-RoseTTAFold-CPU", 679 | cpu=8, 680 | mem=32, 681 | db_path="/fsx/aws-rosettafold-ref-data", 682 | ): 683 | 684 | """ 685 | Submit a RoseTTAFold data prep job (i.e. the first half of the e2e workflow) to AWS Batch. 686 | """ 687 | 688 | working_folder = f"s3://{bucket}/{job_name}" 689 | batch_client = boto3.client("batch") 690 | output_msa_uri = f"{working_folder}/{job_name}.msa0.a3m" 691 | output_hhr_uri = f"{working_folder}/{job_name}.hhr" 692 | output_atab_uri = f"{working_folder}/{job_name}.atab" 693 | 694 | response = batch_client.submit_job( 695 | jobDefinition=job_definition, 696 | jobName=str(job_name), 697 | jobQueue=job_queue, 698 | containerOverrides={ 699 | "command": [ 700 | "/bin/bash", 701 | "run_aws_data_prep_ver.sh", 702 | "-i", 703 | working_folder, 704 | "-n", 705 | input_file, 706 | "-o", 707 | working_folder, 708 | "-p", 709 | job_name, 710 | "-w", 711 | "/work", 712 | "-d", 713 | db_path, 714 | "-c", 715 | str(cpu), 716 | "-m", 717 | str(mem), 718 | ], 719 | "resourceRequirements": [ 720 | {"value": str(cpu), "type": "VCPU"}, 721 | {"value": str(mem * 1000), "type": "MEMORY"}, 722 | ], 723 | }, 724 | tags={ 725 | "output_msa_uri": output_msa_uri, 726 | "output_hhr_uri": output_hhr_uri, 727 | "output_atab_uri": output_atab_uri, 728 | }, 729 | ) 730 | print(f"Job ID {response['jobId']} submitted") 731 | return response 732 | 733 | 734 | def submit_rf_predict_job( 735 | bucket=sm_session.default_bucket(), 736 | job_name=uuid.uuid4(), 737 | job_definition="AWS-RoseTTAFold-GPU", 738 | job_queue="AWS-RoseTTAFold-GPU", 739 | cpu=4, 740 | mem=16, 741 | gpu=True, 742 | db_path="/fsx/aws-rosettafold-ref-data", 743 | weights_path="/fsx/aws-rosettafold-ref-data", 744 | depends_on="", 745 | ): 746 | 747 | """ 748 | Submit a RoseTTAFold prediction job (i.e. the second half of the e2e workflow) to AWS Batch. 
    """

    working_folder = f"s3://{bucket}/{job_name}"
    batch_client = boto3.client("batch")
    output_pdb_uri = f"{working_folder}/{job_name}.e2e.pdb"

    container_overrides = {
        "command": [
            "/bin/bash",
            "run_aws_predict_ver.sh",
            "-i",
            working_folder,
            "-o",
            working_folder,
            "-p",
            job_name,
            "-w",
            "/work",
            "-d",
            db_path,
            "-x",
            weights_path,
            "-c",
            str(cpu),
            "-m",
            str(mem),
        ],
        "resourceRequirements": [
            {"value": str(cpu), "type": "VCPU"},
            {"value": str(mem * 1000), "type": "MEMORY"},
        ],
    }

    if gpu:
        container_overrides["resourceRequirements"].append(
            {"value": "1", "type": "GPU"}
        )

    response = batch_client.submit_job(
        jobDefinition=job_definition,
        jobName=str(job_name),
        jobQueue=job_queue,
        # Skip the dependency when none is supplied.
        dependsOn=[{"jobId": depends_on, "type": "SEQUENTIAL"}] if depends_on else [],
        containerOverrides=container_overrides,
        tags={"output_pdb_uri": output_pdb_uri},
    )
    print(f"Job ID {response['jobId']} submitted")
    return response


def upload_fasta_to_s3(
    record, bucket=sm_session.default_bucket(), job_name=uuid.uuid4()
):

    """
    Create a fasta file and upload it to S3.
    """

    s3 = boto3.client("s3", region_name=region)
    file_out = "_tmp.fasta"
    with open(file_out, "w") as f_out:
        SeqIO.write(record, f_out, "fasta")
    object_name = f"{job_name}/input.fa"
    response = s3.upload_file(file_out, bucket, object_name)
    os.remove(file_out)
    s3_uri = f"s3://{bucket}/{object_name}"
    print(f"Sequence file uploaded to {s3_uri}")
    return s3_uri


def wait_for_job_start(jobId, pause=30):

    """
    Pause while a job transitions into a running state.
    """

    status = get_batch_job_info(jobId)["status"]
    print(status)
    while get_batch_job_info(jobId)["status"] in [
        "SUBMITTED",
        "PENDING",
        "RUNNABLE",
        "STARTING",
    ]:
        sleep(pause)
        new_status = get_batch_job_info(jobId)["status"]
        if new_status != status:
            print("\n" + new_status)
        else:
            print(".", end="")
        status = new_status
--------------------------------------------------------------------------------
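Taken together, the helpers above implement the notebook workflow end to end. A minimal usage sketch; the queue and job definition names in angle brackets are placeholders, with the real values available from the CloudFormation stack outputs or get_rosettafold_batch_resources():

from Bio import SeqIO
from rfutils import rfutils

record = next(SeqIO.parse("data/T1078.fa", "fasta"))
job_name = rfutils.create_job_name("T1078")
rfutils.upload_fasta_to_s3(record, job_name=job_name)

# Names in angle brackets are placeholders from the CloudFormation outputs.
data_prep, predict = rfutils.submit_2_step_job(
    job_name=job_name,
    data_prep_job_definition="<CPUDataPrepJobDefinition>",
    data_prep_queue="<CPUJobQueueName>",
    predict_job_definition="<GPUPredictJobDefinition>",
    predict_queue="<GPUJobQueueName>",
)
rfutils.wait_for_job_start(data_prep["jobId"])
# Once the predict job reports SUCCEEDED:
# rfutils.display_structure(predict["jobId"], bucket=rfutils.sm_session.default_bucket())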