├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── AWS-RoseTTAFold.ipynb
├── CASP14-Analysis.ipynb
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── config
│   ├── Dockerfile
│   ├── cfn.yaml
│   ├── container_buildspec.yml
│   ├── download_ref_data.sh
│   ├── run_aws_data_prep_ver.sh
│   └── run_aws_predict_ver.sh
├── data
│   ├── T1028.fa
│   ├── T1036s1.fa
│   └── T1078.fa
├── img
│   ├── AWS-RoseTTAFold-arch.png
│   ├── AWS-RoseTTAFold-deploy.png
│   ├── LaunchStack.jpg
│   └── RF_workflow.png
├── requirements.txt
└── rfutils
    ├── __init__.py
    └── rfutils.py
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ main ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ main ]
20 | schedule:
21 | - cron: '29 9 * * 2'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v2
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v1
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v1
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v1
71 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode*
2 | .vscode/*
3 | venv*
4 | venv/*
5 | load_testing.ipynb
6 | plotting.ipynb
7 | job_names.txt
8 | data/*.csv
9 | data/*.yaml
--------------------------------------------------------------------------------
/AWS-RoseTTAFold.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# AWS-RoseTTAFold"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## I. Introduction"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "This notebook runs the [RoseTTAFold](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/) algorithm developed by Minkyung Baek et al. and described in [M. Baek et al., Science \n",
22 | "10.1126/science.abj8754 2021](https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf) on AWS."
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "
"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "The AWS workflow depends on a Batch compute environment."
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "
"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## II. Environment setup"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "## Install dependencies\n",
60 | "%pip install -q -q -r requirements.txt"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "## Import helper functions at rfutils/rfutils.py\n",
70 | "from rfutils import rfutils\n",
71 | "\n",
72 | "## Load additional dependencies\n",
73 | "from Bio import SeqIO\n",
74 | "from Bio.Seq import Seq\n",
75 | "from Bio.SeqRecord import SeqRecord\n",
76 | "import boto3\n",
77 | "import glob\n",
78 | "import json\n",
79 | "import pandas as pd\n",
80 | "import sagemaker\n",
81 | "\n",
82 | "pd.set_option(\"max_colwidth\", None)\n",
83 | "\n",
84 | "# Get service clients\n",
85 | "session = boto3.session.Session()\n",
86 | "sm_session = sagemaker.session.Session()\n",
87 | "region = session.region_name\n",
88 | "role = sagemaker.get_execution_role()\n",
89 | "s3 = boto3.client(\"s3\", region_name=region)\n",
90 | "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
91 | "\n",
92 | "bucket = sm_session.default_bucket()\n",
93 | "print(f\"S3 bucket name is {bucket}\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "## III. Input Protein Sequence"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "Enter a protein sequence manually"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "seq = SeqRecord(\n",
117 | " Seq(\"MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\"),\n",
118 | " id=\"YP_025292.1\",\n",
119 | " name=\"HokC\",\n",
120 | " description=\"toxic membrane protein, small\",\n",
121 | ")"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Or provide the path to a fasta file"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "seq = SeqIO.read(\"data/T1078.fa\", \"fasta\")"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "print(f\"Protein sequence for analysis is \\n{seq}\")"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## IV. Submit RoseTTAFold Jobs"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "### Generate Job Name"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "job_name = rfutils.create_job_name(seq.id)\n",
170 | "print(f\"Automatically-generated job name is: {job_name}\")"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### Upload fasta file to S3"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "input_uri = rfutils.upload_fasta_to_s3(seq, bucket, job_name)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Submit jobs to AWS Batch queues"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "Select the job definitions and Batch queues for your job."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "batch_resources = rfutils.get_rosettafold_batch_resources(region=region)\n",
210 | "\n",
211 | "cpu_queue = batch_resources[\"CPUJobQueue\"][0]\n",
212 | "gpu_queue = batch_resources[\"GPUJobQueue\"][0]\n",
213 | "cpu_data_prep_job_def = batch_resources[\"CPUDataPrepJobDefinition\"][0]\n",
214 | "cpu_predict_job_def = batch_resources[\"CPUPredictJobDefinition\"][0]\n",
215 | "gpu_predict_job_def = batch_resources[\"GPUPredictJobDefinition\"][0]\n",
216 | "\n",
217 | "batch_resources"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "Because our test sequence is small (less than 400 residues) we will run the prediction step on a GPU to decrease the job duration from hours to minutes."
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "two_step_response = rfutils.submit_2_step_job(\n",
234 | " bucket=bucket,\n",
235 | " job_name=job_name,\n",
236 | " data_prep_job_definition=cpu_data_prep_job_def,\n",
237 | " data_prep_queue=cpu_queue,\n",
238 | " data_prep_cpu=8,\n",
239 | " data_prep_mem=32,\n",
240 | " predict_job_definition=gpu_predict_job_def, # Change this to the cpu_predict_job_def for large proteins\n",
241 | " predict_queue=gpu_queue, # Change this to the cpu_queue for large proteins\n",
242 | " predict_cpu=4,\n",
243 | " predict_mem=16,\n",
244 | " predict_gpu=True, # Change this to False for large proteins\n",
245 | ")\n",
246 | "data_prep_jobId = two_step_response[0][\"jobId\"]\n",
247 | "predict_jobId = two_step_response[1][\"jobId\"]"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "## V. Check Status of Data Prep and Prediction Jobs"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "rfutils.get_rf_job_info(\n",
264 | " cpu_queue,\n",
265 | " gpu_queue,\n",
266 | " hrs_in_past=1,\n",
267 | ")"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "## VI. View Data Prep Results"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "Pause while the data prep job starts up"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "rfutils.wait_for_job_start(data_prep_jobId)"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "Get logs for data prep job (Run this multiple times to see how the job progresses)"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "data_prep_logStreamName = rfutils.get_batch_job_info(data_prep_jobId)[\"logStreamName\"]\n",
307 | "rfutils.get_batch_logs(data_prep_logStreamName).tail(n=5)"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "Retrieve and Display Multiple Sequence Alignment (MSA) Results"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "rfutils.display_msa(data_prep_jobId, bucket)"
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {},
329 | "source": [
330 | "## VII. View Prediction Results"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "Pause while the predict job starts up"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "rfutils.wait_for_job_start(predict_jobId)"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {},
352 | "source": [
353 | "Get logs for prediction job (Run this multiple times to see how the job progresses)"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "data_prep_logStreamName = rfutils.get_batch_job_info(data_prep_jobId)[\"logStreamName\"]\n",
363 | "rfutils.get_batch_logs(data_prep_logStreamName).tail(n=5)"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "## VIII. View Job Metrics"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)\n",
380 | "\n",
381 | "print(f'Number of sequences in MSA: {metrics[\"DATA_PREP\"][\"MSA_COUNT\"]}')\n",
382 | "print(f'Number of templates: {metrics[\"DATA_PREP\"][\"TEMPLATE_COUNT\"]}')\n",
383 | "print(f'MSA duration (sec): {metrics[\"DATA_PREP\"][\"MSA_DURATION\"]}')\n",
384 | "print(f'SS duration (sec): {metrics[\"DATA_PREP\"][\"SS_DURATION\"]}')\n",
385 | "print(f'Template search duration (sec): {metrics[\"DATA_PREP\"][\"TEMPLATE_DURATION\"]}')\n",
386 | "print(\n",
387 | " f'Total data prep duration (sec): {metrics[\"DATA_PREP\"][\"TOTAL_DATA_PREP_DURATION\"]}'\n",
388 | ")\n",
389 | "print(f'Total predict duration (sec): {metrics[\"PREDICT\"][\"TOTAL_PREDICT_DURATION\"]}')"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "## IX. Retrieve and Display Predicted Structure"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "rfutils.display_structure(predict_jobId, bucket, vmin=0.5, vmax=0.9)"
406 | ]
407 | }
408 | ],
409 | "metadata": {
410 | "instance_type": "ml.t3.medium",
411 | "interpreter": {
412 | "hash": "8ad3a54da4d511af1a5c2549d8f1b22d83bfd1079fb699a3f5552b91d143b102"
413 | },
414 | "kernelspec": {
415 | "display_name": "Python 3 (Data Science)",
416 | "language": "python",
417 | "name": "python3"
418 | },
419 | "language_info": {
420 | "codemirror_mode": {
421 | "name": "ipython",
422 | "version": 3
423 | },
424 | "file_extension": ".py",
425 | "mimetype": "text/x-python",
426 | "name": "python",
427 | "nbconvert_exporter": "python",
428 | "pygments_lexer": "ipython3",
429 | "version": "3.8.9"
430 | }
431 | },
432 | "nbformat": 4,
433 | "nbformat_minor": 4
434 | }
435 |
--------------------------------------------------------------------------------
/CASP14-Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# AWS-RoseTTAFold: Bulk Job Analysis"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## I. Introduction"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "This notebook demonstrates how to analyze multiple protein simultaneously, in this case a subset of the CASP14 target set."
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## II. Environment setup"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "## Install dependencies\n",
38 | "%pip install -q -q -r requirements.txt"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "## Import helper functions at rfutils/rfutils.py\n",
48 | "from rfutils import rfutils\n",
49 | "\n",
50 | "## Load additional dependencies\n",
51 | "from Bio import SeqIO\n",
52 | "from Bio.Seq import Seq\n",
53 | "from Bio.SeqRecord import SeqRecord\n",
54 | "import boto3\n",
55 | "import glob\n",
56 | "import json\n",
57 | "from IPython.display import display\n",
58 | "import pandas as pd\n",
59 | "import sagemaker\n",
60 | "\n",
61 | "pd.set_option(\"max_colwidth\", None)\n",
62 | "\n",
63 | "# Get service clients\n",
64 | "session = boto3.session.Session()\n",
65 | "sm_session = sagemaker.session.Session()\n",
66 | "region = session.region_name\n",
67 | "role = sagemaker.get_execution_role()\n",
68 | "s3 = boto3.client(\"s3\", region_name=region)\n",
69 | "account_id = boto3.client(\"sts\").get_caller_identity().get(\"Account\")\n",
70 | "\n",
71 | "bucket = sm_session.default_bucket()"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## III. Input Protein Sequence"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "Download and process CASP14 sequences"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "!wget \"https://predictioncenter.org/download_area/CASP14/sequences/casp14.seq.txt\" -O \"data/casp14.fa\"\n",
95 | "!sed '137,138d' \"data/casp14.fa\" > \"data/casp14_dedup.fa\" # Remove duplicate entry for T1085\n",
96 | "\n",
97 | "casp14_iterator = SeqIO.parse(\"data/casp14_dedup.fa\", \"fasta\")\n",
98 | "casp14_df = pd.DataFrame(\n",
99 | " (\n",
100 | " (record.id, record.description, len(record), record.seq)\n",
101 | " for record in casp14_iterator\n",
102 | " ),\n",
103 | " columns=[\"id\", \"description\", \"length\", \"seq\"],\n",
104 | ").sort_values(by=\"length\")\n",
105 | "!rm data/casp14*"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Display information about CASP14 proteins"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "with pd.option_context(\"display.max_rows\", None):\n",
122 | " display(casp14_df.loc[:, (\"id\", \"description\")])"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "Plot distribution of the protein lengths"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "import matplotlib.pyplot as plt\n",
139 | "\n",
140 | "fig, ax = plt.subplots()\n",
141 | "plt.hist(casp14_df.length, bins=50)\n",
142 | "plt.ylabel(\"Sample count\")\n",
143 | "plt.xlabel(\"Residue count\")\n",
144 | "plt.title(\"CASP-14 Protein Length Distribution\")\n",
145 | "plt.show()"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "Get the names of the AWS Batch resources deployed in your account."
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "batch_resources = rfutils.get_rosettafold_batch_resources(region=region)\n",
162 | "\n",
163 | "cpu_queue = batch_resources[\"CPUJobQueue\"][0]\n",
164 | "gpu_queue = batch_resources[\"GPUJobQueue\"][0]\n",
165 | "cpu_data_prep_job_def = batch_resources[\"CPUDataPrepJobDefinition\"][0]\n",
166 | "cpu_predict_job_def = batch_resources[\"CPUPredictJobDefinition\"][0]\n",
167 | "gpu_predict_job_def = batch_resources[\"GPUPredictJobDefinition\"][0]\n",
168 | "\n",
169 | "batch_resources"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "Submit analysis jobs for a subset of CASP14 proteins"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "protein_count = 84 # Change this to analyze a smaller number of CASP14 targets\n",
186 | "job_name_list = []\n",
187 | "\n",
188 | "for row in casp14_df[:protein_count].itertuples(index=False):\n",
189 | " record = SeqRecord(row.seq, id=row.id, description=row.description)\n",
190 | " print(f\"Protein sequence for analysis is \\n{record.description}\")\n",
191 | " sequence_length = len(record.seq)\n",
192 | " print(f\"Sequence length is {sequence_length}\")\n",
193 | "\n",
194 | " if sequence_length < 400:\n",
195 | " prep_cpu = 8\n",
196 | " prep_mem = 32\n",
197 | " predict_cpu = 4\n",
198 | " predict_mem = 16\n",
199 | " predict_gpu = True\n",
200 | " predict_job_definition = gpu_predict_job_def\n",
201 | " predict_queue = gpu_queue\n",
202 | " else:\n",
203 | " prep_cpu = 8\n",
204 | " prep_mem = 64\n",
205 | " predict_cpu = 4\n",
206 | " predict_mem = 32\n",
207 | " predict_gpu = False\n",
208 | " predict_job_definition = cpu_predict_job_def\n",
209 | " predict_queue = cpu_queue\n",
210 | "\n",
211 | " job_name = rfutils.create_job_name(record.id)\n",
212 | " print(f\"Automatically-generated job name is: {job_name}\")\n",
213 | " job_name_list.append(job_name)\n",
214 | " input_uri = rfutils.upload_fasta_to_s3(record, bucket, job_name)\n",
215 | " two_step_response = rfutils.submit_2_step_job(\n",
216 | " bucket=bucket,\n",
217 | " job_name=job_name,\n",
218 | " data_prep_input_file=\"input.fa\",\n",
219 | " data_prep_job_definition=cpu_data_prep_job_def,\n",
220 | " data_prep_queue=cpu_queue,\n",
221 | " data_prep_cpu=prep_cpu,\n",
222 | " data_prep_mem=prep_mem,\n",
223 | " predict_job_definition=predict_job_definition,\n",
224 | " predict_queue=predict_queue,\n",
225 | " predict_cpu=predict_cpu,\n",
226 | " predict_mem=predict_mem,\n",
227 | " predict_gpu=predict_gpu,\n",
228 | " )"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "## IV. Check Status of Data Prep and Prediction Jobs"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "rfutils.get_rf_job_info(\n",
245 | " cpu_queue,\n",
246 | " gpu_queue,\n",
247 | " hrs_in_past=1,\n",
248 | ")"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "jobs = []\n",
258 | "for job_name in job_name_list:\n",
259 | " metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)\n",
260 | " row = [\n",
261 | " job_name,\n",
262 | " metrics[\"DATA_PREP\"][\"JOB_ID\"],\n",
263 | " metrics[\"DATA_PREP\"][\"CPU\"],\n",
264 | " metrics[\"DATA_PREP\"][\"MEM\"],\n",
265 | " metrics[\"DATA_PREP\"][\"LENGTH\"],\n",
266 | " metrics[\"DATA_PREP\"][\"MSA_COUNT\"],\n",
267 | " metrics[\"DATA_PREP\"][\"TEMPLATE_COUNT\"],\n",
268 | " metrics[\"DATA_PREP\"][\"MSA_DURATION\"],\n",
269 | " metrics[\"DATA_PREP\"][\"SS_DURATION\"],\n",
270 | " metrics[\"DATA_PREP\"][\"TEMPLATE_DURATION\"],\n",
271 | " metrics[\"DATA_PREP\"][\"TOTAL_DATA_PREP_DURATION\"],\n",
272 | " metrics[\"PREDICT\"][\"JOB_ID\"],\n",
273 | " metrics[\"PREDICT\"][\"CPU\"],\n",
274 | " metrics[\"PREDICT\"][\"MEM\"],\n",
275 | " metrics[\"PREDICT\"][\"TOTAL_PREDICT_DURATION\"],\n",
276 | " ]\n",
277 | " jobs.append(row)\n",
278 | "metrics_df = pd.DataFrame(\n",
279 | " jobs,\n",
280 | " columns=[\n",
281 | " \"jobName\",\n",
282 | " \"dataPrepJobID\",\n",
283 | " \"dataPrepCPU\",\n",
284 | " \"dataPrepMEM\",\n",
285 | " \"sequenceLength\",\n",
286 | " \"MSACount\",\n",
287 | " \"templateCount\",\n",
288 | " \"MSADuration\",\n",
289 | " \"SSDuration\",\n",
290 | " \"templateDuration\",\n",
291 | " \"dataPrepDuration\",\n",
292 | " \"predictJobId\",\n",
293 | " \"predictCPU\",\n",
294 | " \"predictMEM\",\n",
295 | " \"predictDuration\",\n",
296 | " ],\n",
297 | ")\n",
298 | "metrics_df.sort_values(by=[\"dataPrepCPU\", \"dataPrepMEM\", \"predictCPU\", \"predictMEM\"])"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "metrics_df.to_csv(\"results.csv\")"
308 | ]
309 | }
310 | ],
311 | "metadata": {
312 | "instance_type": "ml.t3.medium",
313 | "interpreter": {
314 | "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
315 | },
316 | "kernelspec": {
317 | "display_name": "Python 3 (Data Science)",
318 | "language": "python",
319 | "name": "python3"
320 | },
321 | "language_info": {
322 | "codemirror_mode": {
323 | "name": "ipython",
324 | "version": 3
325 | },
326 | "file_extension": ".py",
327 | "mimetype": "text/x-python",
328 | "name": "python",
329 | "nbconvert_exporter": "python",
330 | "pygments_lexer": "ipython3",
331 | "version": "3.8.9"
332 | }
333 | },
334 | "nbformat": 4,
335 | "nbformat_minor": 4
336 | }
337 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to work on. Since our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AWS RoseTTAFold
2 | Infrastructure template and Jupyter notebooks for running RoseTTAFold on AWS Batch.
3 |
4 | ## Overview
5 | Proteins are large biomolecules that play an important role in the body. Knowing the physical structure of proteins is key to understanding their function. However, it can be difficult and expensive to determine the structure of many proteins experimentally. One alternative is to predict these structures using machine learning algorithms. Several high-profile research teams have released such algorithms, including [AlphaFold 2](https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology), [RoseTTAFold](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/), and others. Their work was important enough for Science magazine to name it the ["2021 Breakthrough of the Year"](https://www.science.org/content/article/breakthrough-2021).
6 |
7 | Both AlphaFold 2 and RoseTTAFold use a multi-track transformer architecture trained on known protein templates to predict the structure of unknown peptide sequences. These predictions are heavily GPU-dependent and take anywhere from minutes to days to complete. The input features for these predictions include multiple sequence alignment (MSA) data. MSA algorithms are CPU-dependent and can themselves require several hours of processing time.
8 |
9 | Running both the MSA and structure prediction steps in the same computing environment can be cost-inefficient, because the expensive GPU resources required for the prediction sit unused while the MSA step runs. Instead, using a high-performance computing (HPC) service like [AWS Batch](https://aws.amazon.com/batch/) allows us to run each step as a containerized job with the best fit of CPU, memory, and GPU resources.
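
As a sketch of that idea, each Batch job can request exactly the vCPU and memory it needs at submission time via boto3. The queue and job definition names below are placeholders, not resources this project creates under these names:

```python
import boto3

batch = boto3.client("batch")

# Request 8 vCPUs and 32 GiB of memory for a CPU-only data prep job.
# Queue and job definition names are placeholders.
batch.submit_job(
    jobName="example-data-prep",
    jobQueue="my-cpu-queue",
    jobDefinition="my-data-prep-job-def",
    containerOverrides={
        "resourceRequirements": [
            {"type": "VCPU", "value": "8"},
            {"type": "MEMORY", "value": "32768"},  # MiB
        ]
    },
)
```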
10 |
11 | In this post, we demonstrate how to provision and use AWS Batch and other services to run AI-driven protein folding algorithms like RoseTTAFold.
12 |
13 | ## Setup
14 | ### Deploy the infrastructure stack
15 | 1. Choose *Launch Stack*:
16 |
17 | [](https://console.aws.amazon.com/cloudformation/home#/stacks/create/review?templateURL=https://aws-hcls-ml.s3.amazonaws.com/blog_post_support_materials/aws-RoseTTAFold/cfn.yaml)
18 |
19 | 2. For *Stack Name*, enter a value unique to your account and region.
20 | 3. For *StackAvailabilityZone* choose an availability zone.
21 | 4. Select *I acknowledge that AWS CloudFormation might create IAM resources with custom names*.
22 | 5. Choose *Create stack*.
23 | 6. Wait approximately 30 minutes for AWS CloudFormation to create the infrastructure stack and AWS CodeBuild to build and publish the AWS-RoseTTAFold container to Amazon Elastic Container Registry (Amazon ECR).
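
If you prefer to script the deployment, the same stack can be created with boto3. A minimal sketch; the stack name and availability zone are examples, and `CAPABILITY_NAMED_IAM` is required because the template creates IAM resources with custom names:

```python
import boto3

cfn = boto3.client("cloudformation")

# Same template that backs the Launch Stack button above.
cfn.create_stack(
    StackName="aws-rosettafold",  # example name
    TemplateURL="https://aws-hcls-ml.s3.amazonaws.com/blog_post_support_materials/aws-RoseTTAFold/cfn.yaml",
    Parameters=[
        {"ParameterKey": "StackAvailabilityZone", "ParameterValue": "us-east-1a"}
    ],
    Capabilities=["CAPABILITY_NAMED_IAM"],
)
```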
24 |
25 | ### Load model weights and sequence database files
26 |
27 | *Option 1: Mount the FSx for Lustre file system to an EC2 instance*
28 |
29 | 1. Sign in to the AWS Management Console and open the Amazon EC2 console at [https://console.aws.amazon.com/ec2](https://console.aws.amazon.com/ec2).
30 | 2. In the navigation pane, under *Instances,* select *Launch Templates*.
31 | 3. Choose the *Launch template ID* for your stack, such as `aws-rosettafold-launch-template-stack-id-suffix`.
32 | 4. Choose *Actions, Launch instance from template.*
33 | 5. Launch a new EC2 instance and connect using either SSH or SSM.
34 | 6. Download and extract the network weights and sequence database files to the attached volume at `/fsx/aws-rosettafold-ref-data` according to installation steps 3 and 5 from the [RoseTTAFold public repository](https://github.com/RosettaCommons/RoseTTAFold).
35 |
36 | *Option 2: Lazy-load the data from an S3 data repository*
37 |
38 | 1. Create a new S3 bucket in your region of interest.
39 | 2. Download and extract the network weights and sequence database files as described above and transfer them to your S3 bucket.
40 | 3. Sign in to the AWS Management Console and open the Amazon FSx for Lustre console at [https://console.aws.amazon.com/fsx](https://console.aws.amazon.com/fsx/home).
41 | 4. Choose the *File System name* for your stack, such as `aws-rosettafold-fsx-lustre-stack-id-suffix`.
42 | 5. On the file system details page, choose *Data repository*, *Create data repository association*.
43 | 6. For *File system path* enter `/aws-rosettafold-ref-data`.
44 | 7. For *Data repository path* enter the S3 URL for your new S3 bucket.
45 | 8. Choose *Create*.
46 |
47 | Creating the data repository association immediately loads the file metadata to the file system. However, the data itself is not available until requested by a job, which adds several hours to the duration of the first job you submit. Subsequent jobs will complete much faster.
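
Steps 3-8 of Option 2 can also be scripted with the FSx API. A minimal sketch, assuming hypothetical file system and bucket identifiers:

```python
import boto3

fsx = boto3.client("fsx")

# File system ID and bucket name below are placeholders.
fsx.create_data_repository_association(
    FileSystemId="fs-0123456789abcdef0",
    FileSystemPath="/aws-rosettafold-ref-data",
    DataRepositoryPath="s3://my-rosettafold-ref-data",
    BatchImportMetaDataOnCreation=True,  # import file metadata right away
)
```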
48 |
49 | Once you have finished loading the model weights and sequence database files, the FSx for Lustre file system will include the following files:
50 |
51 | ```
52 | /fsx
53 | └── /aws-rosettafold-ref-data
54 | ├── /bfd
55 | │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata (1.4 TB)
56 | │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex (1.7 GB)
57 | │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata (15.7 GB)
58 | │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex (1.6 GB)
59 | │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata (304.4 GB)
60 | │ └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex (123.6 MB)
61 | ├── /pdb100_2021Mar03
62 | │ ├── LICENSE (20.4 KB)
63 | │ ├── pdb100_2021Mar03_a3m.ffdata (633.9 GB)
64 | │ ├── pdb100_2021Mar03_a3m.ffindex (3.9 MB)
65 | │ ├── pdb100_2021Mar03_cs219.ffdata (41.8 MB)
66 | │ ├── pdb100_2021Mar03_cs219.ffindex (2.8 MB)
67 | │ ├── pdb100_2021Mar03_hhm.ffdata (6.8 GB)
68 | │ ├── pdb100_2021Mar03_hhm.ffindex (3.4 GB)
69 | │ ├── pdb100_2021Mar03_pdb.ffdata (26.2 GB)
70 | │ └── pdb100_2021Mar03_pdb.ffindex (3.7 MB)
71 | ├── /UniRef30_2020_06
72 | │ ├── UniRef30_2020_06_a3m.ffdata (139.6 GB)
73 | │ ├── UniRef30_2020_06_a3m.ffindex (671.0 MB)
74 | │ ├── UniRef30_2020_06_cs219.ffdata (6.0 GB)
75 | │ ├── UniRef30_2020_06_cs219.ffindex (605.0 MB)
76 | │ ├── UniRef30_2020_06_hhm.ffdata (34.1 GB)
77 | │ ├── UniRef30_2020_06_hhm.ffindex (19.4 MB)
78 | │ └── UniRef30_2020_06.md5sums (379.0 B)
79 | └── /weights
80 | ├── RF2t.pt (126 MB)
81 | ├── Rosetta-DL_LICENSE.txt (3.1 KB)
82 | ├── RoseTTAFold_e2e.pt (533 MB)
83 | └── RoseTTAFold_pyrosetta.pt (506 MB)
84 |
85 | ```
86 |
87 | ### Submit structure prediction jobs from Jupyter
88 |
89 | 1. [Clone the CodeCommit repository](https://docs.aws.amazon.com/codecommit/latest/userguide/how-to-connect.html#how-to-connect-http) created by CloudFormation to a Jupyter Notebook environment of your choice.
90 | 2. Use the `AWS-RoseTTAFold.ipynb` and `CASP14-Analysis.ipynb` notebooks to submit protein sequences for analysis.
91 |
92 | ## Architecture
93 |
94 | 
95 |
96 | This project creates two computing environments in AWS Batch to run the "end-to-end" protein folding workflow in RoseTTAFold. The first of these uses the optimal mix of `c4`, `m4`, and `r4` instance types based on the vCPU and memory requirements specified in the Batch job. The second environment uses `g4dn` on-demand instances to balance performance, availability, and cost.
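
The CPU environment corresponds to Batch's "optimal" instance-type setting, which selects from the `c4`, `m4`, and `r4` families. A minimal boto3 sketch of such an environment; the name, subnet, security group, and role ARNs below are placeholders for the values the CloudFormation template fills in:

```python
import boto3

batch = boto3.client("batch")

# Sketch of the CPU compute environment. All identifiers are placeholders.
batch.create_compute_environment(
    computeEnvironmentName="aws-rosettafold-cpu",
    type="MANAGED",
    computeResources={
        "type": "EC2",
        "instanceTypes": ["optimal"],  # Batch picks from c4, m4, and r4
        "minvCpus": 0,
        "maxvCpus": 256,
        "subnets": ["subnet-0123456789abcdef0"],
        "securityGroupIds": ["sg-0123456789abcdef0"],
        "instanceRole": "arn:aws:iam::123456789012:instance-profile/example",
    },
    serviceRole="arn:aws:iam::123456789012:role/AWSBatchServiceRole",
)
```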
97 |
98 | A scientist can create structure prediction jobs using one of the two included Jupyter notebooks. `AWS-RoseTTAFold.ipynb` demonstrates how to submit a single analysis job and view the results. `CASP14-Analysis.ipynb` demonstrates how to submit multiple jobs at once using the CASP14 target list. In both of these cases, submitting a sequence for analysis creates two Batch jobs, one for data preparation (using the CPU computing environment) and a second, dependent job for structure prediction (using the GPU computing environment).
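
The dependency between the two jobs is expressed with Batch's `dependsOn` parameter. A minimal sketch of the pattern the notebooks wrap in `rfutils.submit_2_step_job`; queue and job definition names are placeholders:

```python
import boto3

batch = boto3.client("batch")

# Step 1: CPU-based data prep (MSA and template search).
prep = batch.submit_job(
    jobName="example-data-prep",
    jobQueue="aws-rosettafold-cpu-queue",       # placeholder
    jobDefinition="aws-rosettafold-data-prep",  # placeholder
)

# Step 2: GPU-based structure prediction, gated on step 1 succeeding.
predict = batch.submit_job(
    jobName="example-predict",
    jobQueue="aws-rosettafold-gpu-queue",       # placeholder
    jobDefinition="aws-rosettafold-predict",    # placeholder
    dependsOn=[{"jobId": prep["jobId"]}],
)
```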
99 |
100 | Both the data preparation and structure prediction use the same Docker image for execution. This image, based on the public Nvidia CUDA image for Ubuntu 20, includes the v1.1 release of the public [RoseTTAFold repository](https://github.com/RosettaCommons/RoseTTAFold), as well as additional scripts for integrating with AWS services. CodeBuild will automatically download this container definition and build the required image during stack creation. However, end users can make changes to this image by pushing to the CodeCommit repository included in the stack. For example, users could replace the included MSA algorithm ([hhblits](https://github.com/soedinglab/hh-suite)) with an alternative like [MMseqs2](https://github.com/soedinglab/MMseqs2) or replace the RoseTTAFold network with an alternative like AlphaFold 2 or [Uni-Fold](https://github.com/dptech-corp/Uni-Fold).
101 |
102 | ## Costs
103 | This workload costs approximately $760 per month to maintain, plus another $0.50 per job.
104 |
105 | ## Deployment
106 |
107 | 
108 |
109 | Running the CloudFormation template at `config/cfn.yaml` creates the following resources in the specified availability zone:
110 | 1. A new VPC with a private subnet, public subnet, NAT gateway, internet gateway, elastic IP, route tables, and S3 gateway endpoint.
111 | 2. An FSx for Lustre file system with 1.2 TiB of storage and 1,200 MB/s throughput capacity. This file system can be linked to an S3 bucket for loading the required reference data when the first job executes.
112 | 3. An EC2 launch template for mounting the FSx file system to Batch compute instances.
113 | 4. Two sets of AWS Batch compute environments, job queues, and job definitions: one for the CPU-dependent data prep job and a second for the GPU-dependent prediction job.
114 | 5. CodeCommit, CodeBuild, CodePipeline, and ECR resources for building and publishing the Batch container image. When CloudFormation creates the CodeCommit repository, it populates it with a zipped version of this repository stored in a public S3 bucket. CodeBuild uses this repository as its source and adds additional code from release 1.1 of the public [RoseTTAFold repository](https://github.com/RosettaCommons/RoseTTAFold). CodeBuild then publishes the resulting container image to ECR, where Batch jobs can use it as needed.
115 |
116 | ## Licensing
117 | This library is licensed under the MIT-0 License. See the LICENSE file for more information.
118 |
119 | The University of Washington has made the code and data in the [RoseTTAFold public repository](https://github.com/RosettaCommons/RoseTTAFold) available under an [MIT license](https://github.com/RosettaCommons/RoseTTAFold/blob/main/LICENSE). However, the model weights used for prediction are only available for internal, non-profit, non-commercial research use. For more information, please see the [full license agreement](https://files.ipd.uw.edu/pub/RoseTTAFold/Rosetta-DL_LICENSE.txt) and contact the University of Washington for details.
120 |
121 | ## Security
122 |
123 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
124 |
125 | ## More Information
126 | - [University of Washington Institute for Protein Design](https://www.ipd.uw.edu/2021/07/rosettafold-accurate-protein-structure-prediction-accessible-to-all/)
127 | - [RoseTTAFold Paper](https://www.ipd.uw.edu/wp-content/uploads/2021/07/Baek_etal_Science2021_RoseTTAFold.pdf)
128 | - [AWS Batch Documentation](https://docs.aws.amazon.com/batch/)
129 | - [CloudFormation Documentation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/Welcome.html)
130 | - [Explanation of the RoseTTAFold and AlphaFold 2 architectures](https://www.youtube.com/watch?v=Rfw7thgGTwI)
131 | - [David Baker's TED talk on protein design](https://www.ted.com/talks/david_baker_5_challenges_we_could_solve_by_designing_new_proteins)
132 | - [AWS ML Blog Post on running AlphaFold 2 on Amazon EC2](https://aws.amazon.com/blogs/machine-learning/run-alphafold-v2-0-on-amazon-ec2/)
--------------------------------------------------------------------------------
/config/Dockerfile:
--------------------------------------------------------------------------------
1 | # Start with a copy of the CUDA image maintained by Nvidia to avoid installing CUDA manually
2 | FROM nvcr.io/nvidia/cuda:11.4.2-base-ubuntu20.04
3 |
4 | # Install basic tools
5 | RUN apt-get update && apt-get install -y \
6 | wget \
7 | curl \
8 | unzip
9 |
10 | # Install miniconda and awscli
11 | RUN curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
12 | && chmod +x ~/miniconda.sh \
13 | && ~/miniconda.sh -b -p /opt/conda \
14 | && rm ~/miniconda.sh \
15 | && /opt/conda/bin/conda update conda \
16 | && /opt/conda/bin/conda install -c conda-forge awscli
17 |
18 | # Download and unzip v1.1 of the RoseTTAFold repository, available at
19 | # https://github.com/RosettaCommons/RoseTTAFold
20 | RUN wget https://github.com/RosettaCommons/RoseTTAFold/archive/refs/tags/v1.1.0.zip \
21 | && unzip v1.1.0.zip \
22 | && mv RoseTTAFold-1.1.0 /RoseTTAFold \
23 | && rm v1.1.0.zip
24 | WORKDIR /RoseTTAFold
25 |
26 | # Install lddt, cs-blast, and libgomp1
27 | RUN ./install_dependencies.sh
28 | RUN /opt/conda/bin/conda env create -f RoseTTAFold-linux.yml \
29 | && /opt/conda/bin/conda clean -ya
30 | RUN apt-get install -y libgomp1
31 |
32 | # Add the AWS-RoseTTAFold scripts
33 | COPY run_aws_data_prep_ver.sh .
34 | COPY run_aws_predict_ver.sh .
35 | COPY download_ref_data.sh .
36 |
37 | # Clean up unnecessary files to save space
38 | RUN rm -rf \
39 | example \
40 | folding \
41 | *.gz \
42 | *.zip \
43 | *.yml \
44 | install_dependencies.sh
45 |
46 | # Create a directory to mount the FSx Lustre file system with ref data
47 | VOLUME /fsx
48 |
49 | # Activate conda and initialize it for bash shells
50 | RUN ["/bin/bash", "-c", \
51 |     "source /opt/conda/bin/activate && /opt/conda/bin/conda init bash"]
52 |
53 | # Make the conda binaries available on the default PATH
54 | ENV PATH /opt/conda/bin:$PATH
55 |
56 | # Define the default run command. Batch will overwrite this at run time.
57 | CMD ["/bin/bash"]
58 |
--------------------------------------------------------------------------------
/config/cfn.yaml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: 2010-09-09
2 | Description: >-
3 | Creates a stack for running RoseTTAFold on AWS Batch.
4 |
5 | Parameters:
6 | StackAvailabilityZone:
7 | Description: Availability zone to deploy stack resources
8 | Type: "AWS::EC2::AvailabilityZone::Name"
9 |
10 | Resources:
11 | ##################################################
12 | # Network Configuration
13 | ##################################################
14 | VPC:
15 | Type: "AWS::EC2::VPC"
16 | Properties:
17 | EnableDnsSupport: "true"
18 | EnableDnsHostnames: "true"
19 | CidrBlock: "10.0.0.0/16"
20 | Tags:
21 | - Key: Application
22 | Value: AWS-RoseTTAFold
23 | - Key: Network
24 | Value: Public
25 | - Key: Name
26 | Value:
27 | !Join [
28 | "-",
29 | [
30 | "aws-rosettafold",
31 | "VPC",
32 | !Select [
33 | 4,
34 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
35 | ],
36 | ],
37 | ]
38 |
39 | PublicSubnet0:
40 | Type: "AWS::EC2::Subnet"
41 | Properties:
42 | VpcId: !Ref VPC
43 | AvailabilityZone: !Ref StackAvailabilityZone
44 | CidrBlock:
45 | Fn::Select:
46 | - 0
47 | - Fn::Cidr: [!GetAtt VPC.CidrBlock, 6, 8]
48 | Tags:
49 | - Key: Application
50 | Value: AWS-RoseTTAFold
51 | - Key: Network
52 | Value: Public
53 | - Key: Name
54 | Value:
55 | !Join [
56 | "-",
57 | [
58 | "aws-rosettafold",
59 | "public-subnet",
60 | !Select [
61 | 4,
62 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
63 | ],
64 | ],
65 | ]
66 |
67 | PrivateSubnet0:
68 | Type: "AWS::EC2::Subnet"
69 | Properties:
70 | VpcId:
71 | Ref: VPC
72 | AvailabilityZone: !Ref StackAvailabilityZone
73 | CidrBlock:
74 | Fn::Select:
75 | - 3
76 | - Fn::Cidr: [!GetAtt VPC.CidrBlock, 6, 8]
77 | Tags:
78 | - Key: Application
79 | Value: AWS-RoseTTAFold
80 | - Key: Network
81 | Value: Private
82 | - Key: Name
83 | Value:
84 | !Join [
85 | "-",
86 | [
87 | "aws-rosettafold",
88 | "private-subnet",
89 | !Select [
90 | 4,
91 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
92 | ],
93 | ],
94 | ]
95 |
96 | InternetGateway:
97 | Type: "AWS::EC2::InternetGateway"
98 | Properties:
99 | Tags:
100 | - Key: Application
101 | Value: AWS-RoseTTAFold
102 | - Key: Network
103 | Value: Public
104 | - Key: Name
105 | Value:
106 | !Join [
107 | "-",
108 | [
109 | "aws-rosettafold",
110 | "igw",
111 | !Select [
112 | 4,
113 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
114 | ],
115 | ],
116 | ]
117 |
118 | GatewayToInternet:
119 | Type: "AWS::EC2::VPCGatewayAttachment"
120 | Properties:
121 | VpcId:
122 | Ref: VPC
123 | InternetGatewayId:
124 | Ref: InternetGateway
125 |
126 | PublicRouteTable:
127 | Type: "AWS::EC2::RouteTable"
128 | Properties:
129 | VpcId:
130 | Ref: VPC
131 | Tags:
132 | - Key: Application
133 | Value: AWS-RoseTTAFold
134 | - Key: Network
135 | Value: Public
136 | - Key: Name
137 | Value:
138 | !Join [
139 | "-",
140 | [
141 | "aws-rosettafold",
142 | "public-route-table",
143 | !Select [
144 | 4,
145 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
146 | ],
147 | ],
148 | ]
149 |
150 | PublicRoute:
151 | Type: "AWS::EC2::Route"
152 | DependsOn: GatewayToInternet
153 | Properties:
154 | RouteTableId:
155 | Ref: PublicRouteTable
156 | DestinationCidrBlock: 0.0.0.0/0
157 | GatewayId:
158 | Ref: InternetGateway
159 |
160 | PublicSubnetRouteTableAssociation0:
161 | Type: "AWS::EC2::SubnetRouteTableAssociation"
162 | Properties:
163 | SubnetId:
164 | Ref: PublicSubnet0
165 | RouteTableId:
166 | Ref: PublicRouteTable
167 |
168 | ElasticIP0:
169 | Type: "AWS::EC2::EIP"
170 | Properties:
171 | Domain: vpc
172 | Tags:
173 | - Key: Application
174 | Value: AWS-RoseTTAFold
175 | - Key: Name
176 | Value:
177 | !Join [
178 | "-",
179 | [
180 | "aws-rosettafold",
181 | "eip",
182 | !Select [
183 | 4,
184 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
185 | ],
186 | ],
187 | ]
188 |
189 | NATGateway0:
190 | Type: "AWS::EC2::NatGateway"
191 | Properties:
192 | AllocationId:
193 | "Fn::GetAtt":
194 | - ElasticIP0
195 | - AllocationId
196 | SubnetId:
197 | Ref: PublicSubnet0
198 | Tags:
199 | - Key: Application
200 | Value: AWS-RoseTTAFold
201 | - Key: Name
202 | Value:
203 | !Join [
204 | "-",
205 | [
206 | "aws-rosettafold",
207 | "nat-gateway",
208 | !Select [
209 | 4,
210 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
211 | ],
212 | ],
213 | ]
214 |
215 | PrivateRouteTable0:
216 | Type: "AWS::EC2::RouteTable"
217 | Properties:
218 | VpcId:
219 | Ref: VPC
220 | Tags:
221 | - Key: Application
222 | Value: AWS-RoseTTAFold
223 | - Key: Name
224 | Value:
225 | !Join [
226 | "-",
227 | [
228 | "aws-rosettafold",
229 | "private-route-table",
230 | !Select [
231 | 4,
232 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
233 | ],
234 | ],
235 | ]
236 |
237 | PrivateRouteToInternet0:
238 | Type: "AWS::EC2::Route"
239 | Properties:
240 | RouteTableId:
241 | Ref: PrivateRouteTable0
242 | DestinationCidrBlock: 0.0.0.0/0
243 | NatGatewayId:
244 | Ref: NATGateway0
245 |
246 | PrivateSubnetRouteTableAssociation0:
247 | Type: "AWS::EC2::SubnetRouteTableAssociation"
248 | Properties:
249 | SubnetId:
250 | Ref: PrivateSubnet0
251 | RouteTableId:
252 | Ref: PrivateRouteTable0
253 |
254 | ##################################################
255 | # S3
256 | ##################################################
257 |
258 | ResultsS3:
259 | Type: "AWS::S3::Bucket"
260 | Properties:
261 | BucketEncryption:
262 | ServerSideEncryptionConfiguration:
263 | - ServerSideEncryptionByDefault:
264 | SSEAlgorithm: AES256
265 | Tags:
266 | - Key: Application
267 | Value: AWS-RoseTTAFold
268 | - Key: Name
269 | Value:
270 | !Join [
271 | "-",
272 | [
273 | "aws-rosettafold",
274 | "s3",
275 | !Select [
276 | 4,
277 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
278 | ],
279 | ],
280 | ]
281 | DeletionPolicy: Retain
282 | UpdateReplacePolicy: Retain
283 |
284 | S3Endpoint:
285 | Type: "AWS::EC2::VPCEndpoint"
286 | Properties:
287 | RouteTableIds:
288 | - !Ref PublicRouteTable
289 | - !Ref PrivateRouteTable0
290 | ServiceName: !Sub "com.amazonaws.${AWS::Region}.s3"
291 | VpcId: !Ref VPC
292 |
293 | ##################################################
294 | # FSx File System
295 | ##################################################
296 | FSX:
297 | Type: AWS::FSx::FileSystem
298 | Properties:
299 | FileSystemType: "LUSTRE"
300 | FileSystemTypeVersion: "2.12"
301 | LustreConfiguration:
302 | DataCompressionType: "LZ4"
303 | DeploymentType: "PERSISTENT_2"
304 | PerUnitStorageThroughput: 1000
305 | SecurityGroupIds:
306 | - !GetAtt VPC.DefaultSecurityGroup
307 | StorageCapacity: 1200
308 | StorageType: "SSD"
309 | SubnetIds:
310 | - !Ref PrivateSubnet0
311 | Tags:
312 | - Key: Application
313 | Value: AWS-RoseTTAFold
314 | - Key: Name
315 | Value:
316 | !Join [
317 | "-",
318 | [
319 | "aws-rosettafold",
320 | "fsx-lustre",
321 | !Select [
322 | 4,
323 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
324 | ],
325 | ],
326 | ]
327 |
328 | ##################################################
329 | # EC2 Launch Template
330 | ##################################################
331 |
332 | RFInstanceRole:
333 | Type: AWS::IAM::Role
334 | Properties:
335 | RoleName:
336 | !Join [
337 | "-",
338 | [
339 | "aws-rosettafold",
340 | "instance-role",
341 | !Select [
342 | 4,
343 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
344 | ],
345 | ],
346 | ]
347 | Description: "Required service policies to support running RoseTTAFold on AWS Batch"
348 | AssumeRolePolicyDocument:
349 | Version: "2012-10-17"
350 | Statement:
351 | - Effect: Allow
352 | Principal:
353 | Service:
354 | - ec2.amazonaws.com
355 | Action:
356 | - "sts:AssumeRole"
357 | ManagedPolicyArns:
358 | - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
359 | - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role
360 | - arn:aws:iam::aws:policy/AmazonS3FullAccess
361 | Path: /
362 | Tags:
363 | - Key: Application
364 | Value: AWS-RoseTTAFold
365 | - Key: Name
366 | Value:
367 | !Join [
368 | "-",
369 | [
370 | "aws-rosettafold",
371 | "instance-role",
372 | !Select [
373 | 4,
374 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
375 | ],
376 | ],
377 | ]
378 |
379 | InstanceProfile:
380 | Type: "AWS::IAM::InstanceProfile"
381 | Properties:
382 | InstanceProfileName:
383 | !Join [
384 | "-",
385 | [
386 | "aws-rosettafold",
387 | "instance-profile",
388 | !Select [
389 | 4,
390 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
391 | ],
392 | ],
393 | ]
394 | Path: /
395 | Roles:
396 | - !Ref RFInstanceRole
397 |
398 | InstanceLaunchTemplate:
399 | Type: AWS::EC2::LaunchTemplate
400 | Properties:
401 | LaunchTemplateName:
402 | !Join [
403 | "-",
404 | [
405 | "aws-rosettafold",
406 | "launch-template",
407 | !Select [
408 | 4,
409 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
410 | ],
411 | ],
412 | ]
413 | LaunchTemplateData:
414 | BlockDeviceMappings:
415 | - DeviceName: "/dev/xvda"
416 | Ebs:
417 | DeleteOnTermination: true
418 | Encrypted: true
419 | VolumeSize: 50
420 | VolumeType: "gp2"
421 | IamInstanceProfile:
422 | Name: !Ref InstanceProfile
423 | TagSpecifications:
424 | - ResourceType: "instance"
425 | Tags:
426 | - Key: Application
427 | Value: AWS-RoseTTAFold
428 | - Key: Name
429 | Value:
430 | !Join [
431 | "-",
432 | [
433 | "aws-rosettafold",
434 | "launch-template",
435 | !Select [
436 | 4,
437 | !Split [
438 | "-",
439 | !Select [2, !Split ["/", !Ref AWS::StackId]],
440 | ],
441 | ],
442 | ],
443 | ]
444 | UserData:
445 | Fn::Base64:
446 | Fn::Join:
447 | [
448 | "",
449 | [
450 | "MIME-Version: 1.0\n",
451 | "Content-Type: multipart/mixed; boundary=\"==MYBOUNDARY==\"\n",
452 | "\n",
453 | "--==MYBOUNDARY==\n",
454 | "Content-Type: text/cloud-config; charset=\"us-ascii\"\n",
455 | "\n",
456 | "runcmd:\n",
457 | "- file_system_id_01=",
458 | !Ref FSX,
459 | "\n",
460 | "- region=",
461 | !Ref AWS::Region,
462 | "\n",
463 | "- fsx_directory=/fsx\n",
464 | "- fsx_mount_name=",
465 | !GetAtt FSX.LustreMountName,
466 | "\n",
467 | "- amazon-linux-extras install -y lustre2.10\n",
468 | "- mkdir -p ${fsx_directory}\n",
469 | "- mount -t lustre ${file_system_id_01}.fsx.${region}.amazonaws.com@tcp:/${fsx_mount_name} ${fsx_directory}\n",
470 | "\n",
471 | "--==MYBOUNDARY==--",
472 | ],
473 | ]
474 |
475 | ##################################################
476 | # Container Services
477 | ##################################################
478 | RFCodeRepository:
479 | Type: AWS::CodeCommit::Repository
480 | Properties:
481 | Code:
482 | BranchName: "main"
483 | S3:
484 | Bucket: "aws-hcls-ml"
485 | Key: "blog_post_support_materials/aws-RoseTTAFold/aws-rosettafold.zip"
486 | RepositoryDescription: Code for running RoseTTAFold on AWS
487 | RepositoryName:
488 | !Join [
489 | "-",
490 | [
491 | "aws-rosettafold",
492 | "code-repo",
493 | !Select [
494 | 4,
495 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
496 | ],
497 | ],
498 | ]
499 | Tags:
500 | - Key: Application
501 | Value: AWS-RoseTTAFold
502 | - Key: Name
503 | Value:
504 | !Join [
505 | "-",
506 | [
507 | "aws-rosettafold",
508 | "code-repo",
509 | !Select [
510 | 4,
511 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
512 | ],
513 | ],
514 | ]
515 |
516 | RFContainerRegistry:
517 | Type: AWS::ECR::Repository
518 | Properties:
519 | EncryptionConfiguration:
520 | EncryptionType: AES256
521 | ImageScanningConfiguration:
522 | ScanOnPush: true
523 | RepositoryName:
524 | !Join [
525 | "-",
526 | [
527 | "aws-rosettafold",
528 | "container-repo",
529 | !Select [
530 | 4,
531 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
532 | ],
533 | ],
534 | ]
535 | Tags:
536 | - Key: Application
537 | Value: AWS-RoseTTAFold
538 | - Key: Name
539 | Value:
540 | !Join [
541 | "-",
542 | [
543 | "aws-rosettafold",
544 | "container-repo",
545 | !Select [
546 | 4,
547 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
548 | ],
549 | ],
550 | ]
551 | DeletionPolicy: Retain
552 | UpdateReplacePolicy: Retain
553 |
554 | CodeBuildRole:
555 | Type: AWS::IAM::Role
556 | Properties:
557 | RoleName:
558 | !Join [
559 | "-",
560 | [
561 | "aws-rosettafold",
562 | "codebuild-role",
563 | !Select [
564 | 4,
565 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
566 | ],
567 | ],
568 | ]
569 | Description: "Required service policies to support building AWS-RoseTTAFold container"
570 | AssumeRolePolicyDocument:
571 | Version: "2012-10-17"
572 | Statement:
573 | - Effect: Allow
574 | Principal:
575 | Service:
576 | - codebuild.amazonaws.com
577 | Action:
578 | - "sts:AssumeRole"
579 | ManagedPolicyArns:
580 | - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess
581 | Path: /
582 | Policies:
583 | - PolicyName: RFCodeBuildPolicy
584 | PolicyDocument:
585 | Version: "2012-10-17"
586 | Statement:
587 | - Effect: Allow
588 | Action:
589 | - logs:CreateLogGroup
590 | - logs:CreateLogStream
591 | - logs:PutLogEvents
592 | Resource:
593 | - Fn::Join:
594 | [
595 | ":",
596 | [
597 | "arn:aws:logs",
598 | !Ref AWS::Region,
599 | !Ref AWS::AccountId,
600 | "log-group:/aws/codebuild/aws-rosettafold*",
601 | ],
602 | ]
603 | - Effect: Allow
604 | Action:
605 | - s3:PutObject
606 | - s3:GetObject
607 | - s3:GetObjectVersion
608 | - s3:GetBucketAcl
609 | - s3:GetBucketLocation
610 | Resource:
611 | - !Join [
612 | "-",
613 | ["arn:aws:s3:::codepipeline", !Ref AWS::Region, "*"],
614 | ]
615 | - !Join ["", [!GetAtt ResultsS3.Arn, "*"]]
616 | - Effect: Allow
617 | Action:
618 | - codecommit:GitPull
619 | Resource:
620 | - Fn::Join:
621 | [
622 | ":",
623 | [
624 | "arn:aws:codecommit",
625 | !Ref AWS::Region,
626 | !Ref AWS::AccountId,
627 | !GetAtt RFCodeRepository.Name,
628 | ],
629 | ]
630 | - Effect: Allow
631 | Action:
632 | - codebuild:CreateReportGroup
633 | - codebuild:CreateReport
634 | - codebuild:UpdateReport
635 | - codebuild:BatchPutTestCases
636 | - codebuild:BatchPutCodeCoverages
637 | Resource:
638 | - Fn::Join:
639 | [
640 | ":",
641 | [
642 |                       "arn:aws:codebuild",
643 | !Ref AWS::Region,
644 | !Ref AWS::AccountId,
645 | "report-group/aws-rosettafold*",
646 | ],
647 | ]
648 | Tags:
649 | - Key: Application
650 | Value: AWS-RoseTTAFold
651 | - Key: Name
652 | Value:
653 | !Join [
654 | "-",
655 | [
656 | "aws-rosettafold",
657 | "codebuild-role",
658 | !Select [
659 | 4,
660 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
661 | ],
662 | ],
663 | ]
664 |
665 | CodeBuildEncryptionKey:
666 | Type: "AWS::KMS::Key"
667 | Properties:
668 | KeyPolicy:
669 |         Version: "2012-10-17"
670 | Id: key-default-1
671 | Statement:
672 | - Sid: Enable IAM User Permissions
673 | Effect: Allow
674 | Principal:
675 | AWS:
676 | Fn::Join: [":", ["arn:aws:iam:", !Ref AWS::AccountId, "root"]]
677 | Action: "kms:*"
678 | Resource: "*"
679 | - Sid: Enable CodeBuild Encryption
680 | Effect: Allow
681 | Principal:
682 | AWS: !GetAtt CodeBuildRole.Arn
683 | Action:
684 | [
685 | "kms:Encrypt",
686 | "kms:Decrypt",
687 | "kms:ReEncrypt*",
688 | "kms:GenerateDataKey*",
689 | "kms:DescribeKey",
690 | ]
691 | Resource: "*"
692 | Tags:
693 | - Key: Application
694 | Value: AWS-RoseTTAFold
695 | - Key: Name
696 | Value:
697 | !Join [
698 | "-",
699 | [
700 | "aws-rosettafold",
701 | "kms",
702 | !Select [
703 | 4,
704 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
705 | ],
706 | ],
707 | ]
708 |
709 | RFCodeBuildProject:
710 | Type: AWS::CodeBuild::Project
711 | Properties:
712 | Artifacts:
713 | Type: NO_ARTIFACTS
714 | Description: Build Docker container for RoseTTAFold execution on AWS Batch
715 | EncryptionKey: !Ref CodeBuildEncryptionKey
716 | Environment:
717 | ComputeType: BUILD_GENERAL1_MEDIUM
718 | EnvironmentVariables:
719 | - Name: IMAGE_TAG
720 | Value: latest
721 | - Name: IMAGE_REPO_NAME
722 | Value: !Ref RFContainerRegistry
723 | - Name: ACCOUNT_ID
724 | Value: !Ref AWS::AccountId
725 | Image: aws/codebuild/standard:4.0
726 | ImagePullCredentialsType: CODEBUILD
727 | PrivilegedMode: true
728 | Type: LINUX_CONTAINER
729 | Name:
730 | !Join [
731 | "-",
732 | [
733 | "aws-rosettafold",
734 | "codebuild-project",
735 | !Select [
736 | 4,
737 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
738 | ],
739 | ],
740 | ]
741 | ResourceAccessRole: !GetAtt CodeBuildRole.Arn
742 | ServiceRole: !GetAtt CodeBuildRole.Arn
743 | Source:
744 | BuildSpec: config/container_buildspec.yml
745 | GitCloneDepth: 1
746 | Location: !GetAtt RFCodeRepository.CloneUrlHttp
747 | Type: CODECOMMIT
748 | SourceVersion: refs/heads/main
749 | Tags:
750 | - Key: Application
751 | Value: AWS-RoseTTAFold
752 | - Key: Name
753 | Value:
754 | !Join [
755 | "-",
756 | [
757 | "aws-rosettafold",
758 | "codebuild-project",
759 | !Select [
760 | 4,
761 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
762 | ],
763 | ],
764 | ]
765 |
766 | CodePipelineRole:
767 | Type: AWS::IAM::Role
768 | Properties:
769 | RoleName:
770 | !Join [
771 | "-",
772 | [
773 | "aws-rosettafold",
774 | "codepipeline-role",
775 | !Select [
776 | 4,
777 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
778 | ],
779 | ],
780 | ]
781 | Description: "Required service policies to support running AWS-RoseTTAFold build pipeline"
782 | AssumeRolePolicyDocument:
783 | Version: "2012-10-17"
784 | Statement:
785 | - Effect: Allow
786 | Principal:
787 | Service:
788 | - codepipeline.amazonaws.com
789 | Action:
790 | - "sts:AssumeRole"
791 | Path: /
792 | Policies:
793 | - PolicyName: codePipelineDefault
794 | PolicyDocument:
795 | Version: "2012-10-17"
796 | Statement:
797 | - Action:
798 | - iam:PassRole
799 | Resource: "*"
800 | Effect: Allow
801 | Condition:
802 | StringEqualsIfExists:
803 | iam:PassedToService:
804 | - cloudformation.amazonaws.com
805 | - elasticbeanstalk.amazonaws.com
806 | - ec2.amazonaws.com
807 | - ecs-tasks.amazonaws.com
808 | - Action:
809 | - codecommit:CancelUploadArchive
810 | - codecommit:GetBranch
811 | - codecommit:GetCommit
812 | - codecommit:GetRepository
813 | - codecommit:GetUploadArchiveStatus
814 | - codecommit:UploadArchive
815 | Resource: "*"
816 | Effect: Allow
817 | - Action:
818 | - codedeploy:CreateDeployment
819 | - codedeploy:GetApplication
820 | - codedeploy:GetApplicationRevision
821 | - codedeploy:GetDeployment
822 | - codedeploy:GetDeploymentConfig
823 | - codedeploy:RegisterApplicationRevision
824 | Resource: "*"
825 | Effect: Allow
826 | - Action:
827 | - codestar-connections:UseConnection
828 | Resource: "*"
829 | Effect: Allow
830 | - Action:
831 | - elasticbeanstalk:*
832 | - ec2:*
833 | - elasticloadbalancing:*
834 | - autoscaling:*
835 | - cloudwatch:*
836 | - s3:*
837 | - sns:*
838 | - cloudformation:*
839 | - rds:*
840 | - sqs:*
841 | - ecs:*
842 | Resource: "*"
843 | Effect: Allow
844 | - Action:
845 | - lambda:InvokeFunction
846 | - lambda:ListFunctions
847 | Resource: "*"
848 | Effect: Allow
849 | - Action:
850 | - opsworks:CreateDeployment
851 | - opsworks:DescribeApps
852 | - opsworks:DescribeCommands
853 | - opsworks:DescribeDeployments
854 | - opsworks:DescribeInstances
855 | - opsworks:DescribeStacks
856 | - opsworks:UpdateApp
857 | - opsworks:UpdateStack
858 | Resource: "*"
859 | Effect: Allow
860 | - Action:
861 | - cloudformation:CreateStack
862 | - cloudformation:DeleteStack
863 | - cloudformation:DescribeStacks
864 | - cloudformation:UpdateStack
865 | - cloudformation:CreateChangeSet
866 | - cloudformation:DeleteChangeSet
867 | - cloudformation:DescribeChangeSet
868 | - cloudformation:ExecuteChangeSet
869 | - cloudformation:SetStackPolicy
870 | - cloudformation:ValidateTemplate
871 | Resource: "*"
872 | Effect: Allow
873 | - Action:
874 | - codebuild:BatchGetBuilds
875 | - codebuild:StartBuild
876 | - codebuild:BatchGetBuildBatches
877 | - codebuild:StartBuildBatch
878 | Resource: "*"
879 | Effect: Allow
880 | - Effect: Allow
881 | Action:
882 | - devicefarm:ListProjects
883 | - devicefarm:ListDevicePools
884 | - devicefarm:GetRun
885 | - devicefarm:GetUpload
886 | - devicefarm:CreateUpload
887 | - devicefarm:ScheduleRun
888 | Resource: "*"
889 | - Effect: Allow
890 | Action:
891 | - servicecatalog:ListProvisioningArtifacts
892 | - servicecatalog:CreateProvisioningArtifact
893 | - servicecatalog:DescribeProvisioningArtifact
894 | - servicecatalog:DeleteProvisioningArtifact
895 | - servicecatalog:UpdateProduct
896 | Resource: "*"
897 | - Effect: Allow
898 | Action:
899 | - cloudformation:ValidateTemplate
900 | Resource: "*"
901 | - Effect: Allow
902 | Action:
903 | - ecr:DescribeImages
904 | Resource: "*"
905 | - Effect: Allow
906 | Action:
907 | - states:DescribeExecution
908 | - states:DescribeStateMachine
909 | - states:StartExecution
910 | Resource: "*"
911 | - Effect: Allow
912 | Action:
913 | - appconfig:StartDeployment
914 | - appconfig:StopDeployment
915 | - appconfig:GetDeployment
916 | Resource: "*"
917 | Tags:
918 | - Key: Application
919 | Value: AWS-RoseTTAFold
920 | - Key: Name
921 | Value:
922 | !Join [
923 | "-",
924 | [
925 | "aws-rosettafold",
926 | "codepipeline-role",
927 | !Select [
928 | 4,
929 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
930 | ],
931 | ],
932 | ]
933 |
934 | RFCodePipeline:
935 | Type: AWS::CodePipeline::Pipeline
936 | Properties:
937 | ArtifactStore:
938 | Location: !Ref ResultsS3
939 | Type: S3
940 | Name:
941 | !Join [
942 | "-",
943 | [
944 | "aws-rosettafold",
945 | "codepipeline",
946 | !Select [
947 | 4,
948 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
949 | ],
950 | ],
951 | ]
952 | RestartExecutionOnUpdate: true
953 | RoleArn: !GetAtt CodePipelineRole.Arn
954 | Stages:
955 | - Name: Source
956 | Actions:
957 | - Name: Source
958 | ActionTypeId:
959 | Category: Source
960 | Owner: AWS
961 | Provider: CodeCommit
962 | Version: 1
963 | Configuration:
964 | RepositoryName: !GetAtt RFCodeRepository.Name
965 | BranchName: main
966 | PollForSourceChanges: "false"
967 | Namespace: SourceVariables
968 | OutputArtifacts:
969 | - Name: SourceArtifact
970 | Region: !Ref AWS::Region
971 | RunOrder: 1
972 | - Name: Build
973 | Actions:
974 | - Name: Build
975 | ActionTypeId:
976 | Category: Build
977 | Owner: AWS
978 | Provider: CodeBuild
979 | Version: 1
980 | Configuration:
981 | ProjectName: !Ref RFCodeBuildProject
982 | InputArtifacts:
983 | - Name: SourceArtifact
984 | Namespace: BuildVariables
985 | OutputArtifacts:
986 | - Name: BuildArtifact
987 | Region: !Ref AWS::Region
988 | RunOrder: 2
989 | Tags:
990 | - Key: Application
991 | Value: AWS-RoseTTAFold
992 | - Key: Name
993 | Value:
994 | !Join [
995 | "-",
996 | [
997 | "aws-rosettafold",
998 | "codepipeline",
999 | !Select [
1000 | 4,
1001 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1002 | ],
1003 | ],
1004 | ]
1005 |
1006 | ##################################################
1007 | # Batch Environment
1008 | ##################################################
1009 |
1010 | CPUComputeEnvironment:
1011 | Type: AWS::Batch::ComputeEnvironment
1012 | Properties:
1013 | ComputeEnvironmentName:
1014 | !Join [
1015 | "-",
1016 | [
1017 | "aws-rosettafold",
1018 | "ce-cpu",
1019 | !Select [
1020 | 4,
1021 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1022 | ],
1023 | ],
1024 | ]
1025 | ComputeResources:
1026 | AllocationStrategy: BEST_FIT_PROGRESSIVE
1027 | InstanceRole: !Ref InstanceProfile
1028 | InstanceTypes:
1029 | - optimal
1030 | LaunchTemplate:
1031 | LaunchTemplateId: !Ref InstanceLaunchTemplate
1032 | Version: $Latest
1033 | MaxvCpus: 256
1034 | MinvCpus: 0
1035 | SecurityGroupIds:
1036 | - !GetAtt VPC.DefaultSecurityGroup
1037 | Subnets:
1038 | - Ref: PrivateSubnet0
1039 | Type: EC2
1040 | State: ENABLED
1041 | Type: MANAGED
1042 | Tags:
1043 | Application: AWS-RoseTTAFold
1044 | Name:
1045 | !Join [
1046 | "-",
1047 | [
1048 | "aws-rosettafold",
1049 | "ce-cpu",
1050 | !Select [
1051 | 4,
1052 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1053 | ],
1054 | ],
1055 | ]
1056 |
1057 | GPUComputeEnvironment:
1058 | Type: AWS::Batch::ComputeEnvironment
1059 | Properties:
1060 | ComputeEnvironmentName:
1061 | !Join [
1062 | "-",
1063 | [
1064 | "aws-rosettafold",
1065 | "ce-gpu",
1066 | !Select [
1067 | 4,
1068 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1069 | ],
1070 | ],
1071 | ]
1072 | ComputeResources:
1073 | AllocationStrategy: BEST_FIT_PROGRESSIVE
1074 | InstanceRole: !Ref InstanceProfile
1075 | InstanceTypes:
1076 | - g4dn
1077 | LaunchTemplate:
1078 | LaunchTemplateId: !Ref InstanceLaunchTemplate
1079 | Version: $Latest
1080 | MaxvCpus: 256
1081 | MinvCpus: 0
1082 | SecurityGroupIds:
1083 | - !GetAtt VPC.DefaultSecurityGroup
1084 | Subnets:
1085 | - Ref: PrivateSubnet0
1086 | Type: EC2
1087 | State: ENABLED
1088 | Type: MANAGED
1089 | Tags:
1090 | Application: AWS-RoseTTAFold
1091 | Name:
1092 | !Join [
1093 | "-",
1094 | [
1095 | "aws-rosettafold",
1096 | "ce-gpu",
1097 | !Select [
1098 | 4,
1099 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1100 | ],
1101 | ],
1102 | ]
1103 |
1104 | CPUJobQueue:
1105 | Type: AWS::Batch::JobQueue
1106 | Properties:
1107 | ComputeEnvironmentOrder:
1108 | - ComputeEnvironment: !Ref CPUComputeEnvironment
1109 | Order: 1
1110 | JobQueueName:
1111 | !Join [
1112 | "-",
1113 | [
1114 | "aws-rosettafold",
1115 | "queue-cpu",
1116 | !Select [
1117 | 4,
1118 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1119 | ],
1120 | ],
1121 | ]
1122 | Priority: 10
1123 | State: ENABLED
1124 | Tags:
1125 | Application: AWS-RoseTTAFold
1126 | Name:
1127 | !Join [
1128 | "-",
1129 | [
1130 | "aws-rosettafold",
1131 | "queue-cpu",
1132 | !Select [
1133 | 4,
1134 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1135 | ],
1136 | ],
1137 | ]
1138 |
1139 | GPUJobQueue:
1140 | Type: AWS::Batch::JobQueue
1141 | Properties:
1142 | ComputeEnvironmentOrder:
1143 | - ComputeEnvironment: !Ref GPUComputeEnvironment
1144 | Order: 1
1145 | JobQueueName:
1146 | !Join [
1147 | "-",
1148 | [
1149 | "aws-rosettafold",
1150 | "queue-gpu",
1151 | !Select [
1152 | 4,
1153 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1154 | ],
1155 | ],
1156 | ]
1157 | Priority: 10
1158 | State: ENABLED
1159 | Tags:
1160 | Application: AWS-RoseTTAFold
1161 | Name:
1162 | !Join [
1163 | "-",
1164 | [
1165 | "aws-rosettafold",
1166 | "queue-gpu",
1167 | !Select [
1168 | 4,
1169 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1170 | ],
1171 | ],
1172 | ]
1173 |
1174 | CPUDataPrepJobDefinition:
1175 | Type: AWS::Batch::JobDefinition
1176 | Properties:
1177 | ContainerProperties:
1178 | Command:
1179 | - "/bin/bash"
1180 | - "run_aws_data_prep_ver.sh"
1181 | - "-i"
1182 | - !Join ["", ["s3://", !Ref ResultsS3]]
1183 | - "-o"
1184 | - !Join ["", ["s3://", !Ref ResultsS3]]
1185 | - "-n"
1186 | - "input.fa"
1187 | - "-w"
1188 | - "/work"
1189 | - "-d"
1190 | - "/fsx/aws-rosettafold-ref-data"
1191 | - "-c"
1192 | - "8"
1193 | - "-m"
1194 | - "32"
1195 | Image:
1196 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]]
1197 | LogConfiguration:
1198 | LogDriver: awslogs
1199 | MountPoints:
1200 | - ContainerPath: /fsx
1201 | ReadOnly: False
1202 | SourceVolume: fsx
1203 | ResourceRequirements:
1204 | - Type: VCPU
1205 | Value: 8
1206 | - Type: MEMORY
1207 | Value: 32000
1208 | Volumes:
1209 | - Name: fsx
1210 | Host:
1211 | SourcePath: /fsx
1212 | JobDefinitionName:
1213 | !Join [
1214 | "-",
1215 | [
1216 | "aws-rosettafold",
1217 | "job-def-cpudataprep",
1218 | !Select [
1219 | 4,
1220 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1221 | ],
1222 | ],
1223 | ]
1224 | PlatformCapabilities:
1225 | - EC2
1226 | PropagateTags: true
1227 | RetryStrategy:
1228 | Attempts: 3
1229 | Tags:
1230 | Application: AWS-RoseTTAFold
1231 | Name:
1232 | !Join [
1233 | "-",
1234 | [
1235 | "aws-rosettafold",
1236 | "job-def-cpudataprep",
1237 | !Select [
1238 | 4,
1239 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1240 | ],
1241 | ],
1242 | ]
1243 | Type: container
1244 |
1245 | GPUPredictJobDefinition:
1246 | Type: AWS::Batch::JobDefinition
1247 | Properties:
1248 | ContainerProperties:
1249 | Command:
1250 | - "/bin/bash"
1251 | - "run_aws_predict_ver.sh"
1252 | - "-i"
1253 | - !Join ["", ["s3://", !Ref ResultsS3]]
1254 | - "-o"
1255 | - !Join ["", ["s3://", !Ref ResultsS3]]
1256 | - "-w"
1257 | - "/work"
1258 | - "-d"
1259 | - "/fsx/aws-rosettafold-ref-data"
1260 | - "-x"
1261 | - "/fsx/aws-rosettafold-ref-data"
1262 | - "-c"
1263 | - "4"
1264 | - "-m"
1265 | - "16"
1266 | Image:
1267 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]]
1268 | LogConfiguration:
1269 | LogDriver: awslogs
1270 | MountPoints:
1271 | - ContainerPath: /fsx
1272 | ReadOnly: False
1273 | SourceVolume: fsx
1274 | ResourceRequirements:
1275 | - Type: VCPU
1276 | Value: 4
1277 | - Type: MEMORY
1278 | Value: 16000
1279 | - Type: GPU
1280 | Value: 1
1281 | Volumes:
1282 | - Name: fsx
1283 | Host:
1284 | SourcePath: /fsx
1285 | JobDefinitionName:
1286 | !Join [
1287 | "-",
1288 | [
1289 | "aws-rosettafold",
1290 | "job-def-gpupredict",
1291 | !Select [
1292 | 4,
1293 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1294 | ],
1295 | ],
1296 | ]
1297 | PlatformCapabilities:
1298 | - EC2
1299 | PropagateTags: true
1300 | RetryStrategy:
1301 | Attempts: 3
1302 | Tags:
1303 | Application: AWS-RoseTTAFold
1304 | Name:
1305 | !Join [
1306 | "-",
1307 | [
1308 | "aws-rosettafold",
1309 | "job-def-gpupredict",
1310 | !Select [
1311 | 4,
1312 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1313 | ],
1314 | ],
1315 | ]
1316 | Type: container
1317 |
1318 | CPUPredictJobDefinition:
1319 | Type: AWS::Batch::JobDefinition
1320 | Properties:
1321 | ContainerProperties:
1322 | Command:
1323 | - "/bin/bash"
1324 | - "run_aws_predict_ver.sh"
1325 | - "-i"
1326 | - !Join ["", ["s3://", !Ref ResultsS3]]
1327 | - "-o"
1328 | - !Join ["", ["s3://", !Ref ResultsS3]]
1329 | - "-w"
1330 | - "/work"
1331 | - "-d"
1332 | - "/fsx/aws-rosettafold-ref-data"
1333 | - "-x"
1334 | - "/fsx/aws-rosettafold-ref-data"
1335 | - "-c"
1336 | - "4"
1337 | - "-m"
1338 | - "64"
1339 | Image:
1340 | !Join [":", [!GetAtt RFContainerRegistry.RepositoryUri, "latest"]]
1341 | LogConfiguration:
1342 | LogDriver: awslogs
1343 | MountPoints:
1344 | - ContainerPath: /fsx
1345 | ReadOnly: False
1346 | SourceVolume: fsx
1347 | ResourceRequirements:
1348 | - Type: VCPU
1349 | Value: 4
1350 | - Type: MEMORY
1351 | Value: 64000
1352 | Volumes:
1353 | - Name: fsx
1354 | Host:
1355 | SourcePath: /fsx
1356 | JobDefinitionName:
1357 | !Join [
1358 | "-",
1359 | [
1360 | "aws-rosettafold",
1361 | "job-def-cpupredict",
1362 | !Select [
1363 | 4,
1364 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1365 | ],
1366 | ],
1367 | ]
1368 | PlatformCapabilities:
1369 | - EC2
1370 | PropagateTags: true
1371 | RetryStrategy:
1372 | Attempts: 3
1373 | Tags:
1374 | Application: AWS-RoseTTAFold
1375 | Name:
1376 | !Join [
1377 | "-",
1378 | [
1379 | "aws-rosettafold",
1380 | "job-def-cpupredict",
1381 | !Select [
1382 | 4,
1383 | !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId]]],
1384 | ],
1385 | ],
1386 | ]
1387 | Type: container
1388 |
1389 | Outputs:
1390 | CodeRepoUri:
1391 | Description: URI for cloning the CodeCommit repository over HTTPS
1392 | Value: !GetAtt RFCodeRepository.CloneUrlHttp
1393 | Export:
1394 | Name: !Join [":", [!Ref "AWS::StackName", CodeRepoUri]]
1395 | CPUJobQueueName:
1396 | Description: Name of the CPU job queue.
1397 | Value: !Select [5, !Split [":", !Ref CPUJobQueue]]
1398 | Export:
1399 | Name: !Join [":", [!Ref "AWS::StackName", CPUJobQueueName]]
1400 | GPUJobQueueName:
1401 | Description: Name of the GPU job queue.
1402 | Value: !Select [5, !Split [":", !Ref GPUJobQueue]]
1403 | Export:
1404 | Name: !Join [":", [!Ref "AWS::StackName", GPUJobQueueName]]
1405 | CPUDataPrepJobDefinition:
1406 | Description: Name of the data prep CPU job definition.
1407 | Value: !Select [5, !Split [":", !Ref CPUDataPrepJobDefinition]]
1408 | Export:
1409 | Name: !Join [":", [!Ref "AWS::StackName", CPUDataPrepJobDefinition]]
1410 | GPUPredictJobDefinition:
1411 | Description: Name of the predict GPU job definition.
1412 | Value: !Select [5, !Split [":", !Ref GPUPredictJobDefinition]]
1413 | Export:
1414 | Name: !Join [":", [!Ref "AWS::StackName", GPUPredictJobDefinition]]
1415 | CPUPredictJobDefinition:
1416 | Description: Name of the predict CPU job definition.
1417 | Value: !Select [5, !Split [":", !Ref CPUPredictJobDefinition]]
1418 | Export:
1419 | Name: !Join [":", [!Ref "AWS::StackName", CPUPredictJobDefinition]]
1420 |
--------------------------------------------------------------------------------
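Note: every resource name in the template above embeds the last segment of the stack ID GUID (via the nested !Select / !Split on AWS::StackId) as a unique suffix, so multiple stacks can coexist in one account. A minimal sketch of reading the stack outputs back with boto3 follows; the stack name "aws-rosettafold" is a hypothetical placeholder, while the output keys match the Outputs block above.

import boto3

cfn = boto3.client("cloudformation")
stack = cfn.describe_stacks(StackName="aws-rosettafold")["Stacks"][0]  # hypothetical stack name
outputs = {o["OutputKey"]: o["OutputValue"] for o in stack["Outputs"]}

print(outputs["CPUJobQueueName"])           # Batch queue for data prep jobs
print(outputs["GPUJobQueueName"])           # Batch queue for prediction jobs
print(outputs["CPUDataPrepJobDefinition"])  # job definition for the data prep stage

--------------------------------------------------------------------------------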
/config/container_buildspec.yml:
--------------------------------------------------------------------------------
1 | version: 0.2
2 |
3 | phases:
4 | pre_build:
5 | commands:
6 | - echo Logging in to Amazon ECR...
7 | - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com
8 | build:
9 | commands:
10 | - echo Build started on `date`
11 | - echo Building the Docker image...
12 | - docker build -t $IMAGE_REPO_NAME:$IMAGE_TAG config
13 | - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
14 | post_build:
15 | commands:
16 | - echo Build completed on `date`
17 | - echo Pushing the Docker image...
18 | - docker push $ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG
19 |
--------------------------------------------------------------------------------
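The buildspec above reads ACCOUNT_ID, IMAGE_REPO_NAME, and IMAGE_TAG from the environment variables defined on the RFCodeBuildProject resource (AWS_DEFAULT_REGION is set by CodeBuild itself), so a manual rebuild needs no overrides. A minimal sketch with boto3, assuming the project name below is replaced with the real suffixed name from your stack:

import boto3

codebuild = boto3.client("codebuild")
# The project name here is a hypothetical placeholder.
response = codebuild.start_build(projectName="aws-rosettafold-codebuild-project-abcd1234")
print(response["build"]["id"], response["build"]["buildStatus"])

--------------------------------------------------------------------------------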
/config/download_ref_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | yum install wget tar -y
3 | cd /fsx
4 |
5 | # NOTE: The RoseTTAFold network weights are covered under the Rosetta-DL software license.
6 | # Please see https://files.ipd.uw.edu/pub/RoseTTAFold/Rosetta-DL_LICENSE.txt for more
7 | # information.
8 | wget https://files.ipd.uw.edu/pub/RoseTTAFold/weights.tar.gz
9 | tar xfz weights.tar.gz
10 | rm weights.tar.gz
11 |
12 | # uniref30 [46G]
13 | wget http://wwwuser.gwdg.de/~compbiol/uniclust/2020_06/UniRef30_2020_06_hhsuite.tar.gz
14 | mkdir -p UniRef30_2020_06
15 | tar xfz UniRef30_2020_06_hhsuite.tar.gz -C ./UniRef30_2020_06
16 | rm UniRef30_2020_06_hhsuite.tar.gz
17 |
18 | # structure templates (including *_a3m.ffdata, *_a3m.ffindex) [over 100G]
19 | wget https://files.ipd.uw.edu/pub/RoseTTAFold/pdb100_2021Mar03.tar.gz
20 | tar xfz pdb100_2021Mar03.tar.gz
21 | rm pdb100_2021Mar03.tar.gz
22 |
23 | # BFD [272G]
24 | wget https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz
25 | mkdir -p bfd
26 | tar xfz bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz -C ./bfd
27 | rm bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz
28 |
--------------------------------------------------------------------------------
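After the script finishes, the file system should contain the four directories referenced by the run scripts. A minimal sanity check, assuming the Lustre volume is mounted at /fsx as in the script (directory names are taken from the mkdir commands and archive contents above):

import os

# Directories produced by download_ref_data.sh
expected = ["weights", "UniRef30_2020_06", "pdb100_2021Mar03", "bfd"]
for name in expected:
    path = os.path.join("/fsx", name)
    print(path, "ok" if os.path.isdir(path) else "MISSING")

--------------------------------------------------------------------------------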
/config/run_aws_data_prep_ver.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ############################################################
4 | # Run the data prep stage of the RoseTTAFold E2E workflow on AWS
5 | ## Options
6 | # -i (Required) S3 path to input folder
7 | # -o (Required) S3 path to output folder
8 | # -n Input file name (e.g. input.fa)
9 | # -p Prefix to use for output files
10 | # -w Path to working folder on run environment file system
11 | # -d Path to database folder on run environment file system
12 | # -c Max CPU count
13 | # -m Max memory amount (GB)
14 | #
15 | # Example CMD
16 | # ./AWS-RoseTTAFold/run_aws_data_prep_ver.sh \
17 | #   -i s3://032243382548-rf-run-data/input \
18 | #   -o s3://032243382548-rf-run-data/output \
19 | #   -n input.fa \
20 | #   -w ~/work \
21 | #   -d /fsx \
22 | #   -c 16 \
23 | #   -m 64
24 |
25 | # Stop the script when an error (non-zero exit code) occurs
26 | set -e
27 | START="$(date +%s)"
28 | ############################################################
29 | # >>> conda initialize >>>
30 | # !! Contents within this block are managed by 'conda init' !!
31 | __conda_setup="$('conda' 'shell.bash' 'hook' 2> /dev/null)"
32 | eval "$__conda_setup"
33 | unset __conda_setup
34 | # <<< conda initialize <<<
35 | ############################################################
36 |
37 | unset -v SCRIPT PIPEDIR UUID INPUT_S3_FOLDER OUTPUT_S3_FOLDER \
38 | INPUT_FILE WDIR DBDIR CPU MEM
39 |
40 | SCRIPT=`realpath -s $0`
41 | SCRIPTDIR=`dirname $SCRIPT`
42 |
43 | while getopts "i:o:n:p:w:d:c:m:" option
44 | do
45 | case $option in
46 | i) INPUT_S3_FOLDER=$OPTARG ;; # s3 URI to input folder
47 | o) OUTPUT_S3_FOLDER=$OPTARG ;; # s3 URI to output folder
48 | n) INPUT_FILE=$OPTARG ;; # input file name, e.g. input.fa
49 | p) UUID=$OPTARG ;; # File prefix
50 | w) WDIR=$OPTARG ;; # path to local working folder
51 | d) DBDIR=$OPTARG ;; # path to local sequence databases
52 | c) CPU=$OPTARG ;; # vCPU
53 | m) MEM=$OPTARG ;; # MEM (GB)
54 | *) exit 1 ;;
55 | esac
56 | done
57 |
58 | [ -z "$INPUT_S3_FOLDER" ] && { echo "\$INPUT_S3_FOLDER undefined"; exit 1; }
59 | [ -z "$OUTPUT_S3_FOLDER" ] && { echo "\$OUTPUT_S3_FOLDER undefined"; exit 1; }
60 | [ -z "$INPUT_FILE" ] && { INPUT_FILE="input.fa"; }
61 | [ -z "$WDIR" ] && { WDIR=$SCRIPTDIR; }
62 | [ -z "$DBDIR" ] && { DBDIR=$WDIR; }
63 | [ -z "$CPU" ] && { CPU="16"; }
64 | [ -z "$MEM" ] && { MEM="64"; }
65 |
66 | if [ -z "$UUID" ]
67 | then
68 | if [ -z "$AWS_BATCH_JOB_ID" ]
69 | then
70 | UUID=`date "+%Y%m%d%H%M%S"`;
71 | else
72 | UUID=$AWS_BATCH_JOB_ID;
73 | fi
74 | fi
75 |
76 | IN=$WDIR/input.fa
77 | aws s3 cp $INPUT_S3_FOLDER/$INPUT_FILE $IN
78 |
79 | ls $WDIR
80 | #LENGTH=`tail -n1 $IN | wc -m`
81 | LENGTH=`grep -v -e "^>" $IN | tr -d "\n" | wc -m`
82 |
83 | conda activate RoseTTAFold
84 |
85 | ############################################################
86 | # 1. generate MSAs
87 | ############################################################
88 | MSA_START="$(date +%s)"
89 |
90 | if [ ! -s $WDIR/t000_.msa0.a3m ]
91 | then
92 | export PIPEDIR=$DBDIR
93 | echo "Running HHblits"
94 | $SCRIPTDIR/input_prep/make_msa.sh $IN $WDIR $CPU $MEM $DBDIR
95 | fi
96 |
97 | MSA_COUNT=`grep "^>" $WDIR/t000_.msa0.a3m -c`
98 |
99 | aws s3 cp $WDIR/t000_.msa0.a3m $OUTPUT_S3_FOLDER/$UUID.msa0.a3m
100 |
101 | MSA_DURATION=$[ $(date +%s) - ${MSA_START} ]
102 | echo "${UUID} MSA duration: ${MSA_DURATION} sec"
103 |
104 | ############################################################
105 | # 2. predict secondary structure for HHsearch run
106 | ############################################################
107 | SS_START="$(date +%s)"
108 | if [ ! -s $WDIR/t000_.ss2 ]
109 | then
110 | export PIPEDIR=$SCRIPTDIR
111 | echo "Running PSIPRED"
112 | $SCRIPTDIR/input_prep/make_ss.sh $WDIR/t000_.msa0.a3m $WDIR/t000_.ss2
113 | fi
114 |
115 | aws s3 cp $WDIR/t000_.ss2 $OUTPUT_S3_FOLDER/$UUID.ss2
116 |
117 | SS_DURATION=$[ $(date +%s) - ${SS_START} ]
118 | echo "${UUID} SS duration: ${SS_DURATION} sec"
119 |
120 | ############################################################
121 | # 3. search for templates
122 | ############################################################
123 | TEMPLATE_START="$(date +%s)"
124 | DB="$DBDIR/pdb100_2021Mar03/pdb100_2021Mar03"
125 | if [ ! -s $WDIR/t000_.hhr ]
126 | then
127 | echo "Running hhsearch"
128 | HH="hhsearch -b 50 -B 500 -z 50 -Z 500 -mact 0.05 -cpu $CPU -maxmem $MEM -aliw 100000 -e 100 -p 5.0 -d $DB"
129 | cat $WDIR/t000_.ss2 $WDIR/t000_.msa0.a3m > $WDIR/t000_.msa0.ss2.a3m
130 | $HH -i $WDIR/t000_.msa0.ss2.a3m -o $WDIR/t000_.hhr -atab $WDIR/t000_.atab -v 2
131 | fi
132 |
133 | TEMPLATE_COUNT=`grep "^No [[:digit:]]*$" $WDIR/t000_.hhr -c`
134 |
135 | aws s3 cp $WDIR/t000_.msa0.ss2.a3m $OUTPUT_S3_FOLDER/$UUID.msa0.ss2.a3m
136 | aws s3 cp $WDIR/t000_.hhr $OUTPUT_S3_FOLDER/$UUID.hhr
137 | aws s3 cp $WDIR/t000_.atab $OUTPUT_S3_FOLDER/$UUID.atab
138 |
139 | TEMPLATE_DURATION=$[ $(date +%s) - ${TEMPLATE_START} ]
140 | echo "${UUID} template search duration: ${TEMPLATE_DURATION} sec"
141 |
142 | TOTAL_DATA_PREP_DURATION=$[ $(date +%s) - ${START} ]
143 | echo "${UUID} total data prep duration: ${TOTAL_DATA_PREP_DURATION} sec"
144 |
145 | # Collect metrics
146 | echo "DATA_PREP:" >> $WDIR/metrics.yaml
147 | echo " JOB_ID: ${UUID}" >> $WDIR/metrics.yaml
148 | echo " INPUT_S3_FOLDER: ${INPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
149 | echo "  INPUT_FILE: ${INPUT_FILE}" >> $WDIR/metrics.yaml
150 | echo " OUTPUT_S3_FOLDER: ${OUTPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
151 | echo " WDIR: ${WDIR}" >> $WDIR/metrics.yaml
152 | echo " DBDIR: ${DBDIR}" >> $WDIR/metrics.yaml
153 | echo " CPU: ${CPU}" >> $WDIR/metrics.yaml
154 | echo " MEM: ${MEM}" >> $WDIR/metrics.yaml
155 | echo " LENGTH: ${LENGTH}" >> $WDIR/metrics.yaml
156 | echo " MSA_COUNT: ${MSA_COUNT}" >> $WDIR/metrics.yaml
157 | echo " TEMPLATE_COUNT: ${TEMPLATE_COUNT}" >> $WDIR/metrics.yaml
158 | echo " START_TIME: ${START}" >> $WDIR/metrics.yaml
159 | echo " MSA_DURATION: ${MSA_DURATION}" >> $WDIR/metrics.yaml
160 | echo " SS_DURATION: ${SS_DURATION}" >> $WDIR/metrics.yaml
161 | echo " TEMPLATE_DURATION: ${TEMPLATE_DURATION}" >> $WDIR/metrics.yaml
162 | echo " TOTAL_DATA_PREP_DURATION: ${TOTAL_DATA_PREP_DURATION}" >> $WDIR/metrics.yaml
163 |
164 | aws s3 cp $WDIR/metrics.yaml $OUTPUT_S3_FOLDER/metrics.yaml
165 |
166 | echo "Done"
--------------------------------------------------------------------------------
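The metrics.yaml file written above is a nested mapping under the DATA_PREP key. A minimal sketch of reading it back, assuming the file has been copied locally; the key names match the echo statements in the script:

import yaml

with open("metrics.yaml") as stream:
    metrics = yaml.safe_load(stream)

prep = metrics["DATA_PREP"]
print(prep["JOB_ID"], "residues:", prep["LENGTH"], "MSA sequences:", prep["MSA_COUNT"])
print("total data prep:", prep["TOTAL_DATA_PREP_DURATION"], "sec")

--------------------------------------------------------------------------------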
/config/run_aws_predict_ver.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ############################################################
4 | # Run the RoseTTAFold prediction stage on AWS
5 | ## Options
6 | # -i (Required) S3 path to input folder
7 | # -o (Required) S3 path to output folder
8 | # -p Prefix to use for output files
9 | # -w Path to working folder on run environment file system
10 | # -d Path to database folder on run environment file system
11 | # -x Path to model weights folder on run environment file system
12 | # -c Max CPU count
13 | # -m Max memory amount (GB)
14 | #
15 | # Example CMD
16 | # ./AWS-RoseTTAFold/run_aws_predict_ver.sh \
17 | #   -i s3://032243382548-rf-run-data/input \
18 | #   -o s3://032243382548-rf-run-data/output \
19 | #   -w ~/work \
20 | #   -d /fsx/RoseTTAFold \
21 | #   -x /fsx/RoseTTAFold \
22 | #   -c 16 \
23 | #   -m 64
24 |
25 | # Stop the script when an error (non-zero exit code) occurs
26 | set -e
27 | START="$(date +%s)"
28 | ############################################################
29 | # >>> conda initialize >>>
30 | # !! Contents within this block are managed by 'conda init' !!
31 | __conda_setup="$('conda' 'shell.bash' 'hook' 2> /dev/null)"
32 | eval "$__conda_setup"
33 | unset __conda_setup
34 | # <<< conda initialize <<<
35 | ############################################################
36 |
37 | unset -v SCRIPT PIPEDIR UUID INPUT_S3_FOLDER OUTPUT_S3_FOLDER \
38 | INPUT_FILE WDIR DBDIR MODEL_WEIGHTS_DIR CPU MEM
39 |
40 | SCRIPT=`realpath -s $0`
41 | SCRIPTDIR=`dirname $SCRIPT`
42 |
43 | while getopts "i:o:p:w:d:x:c:m:" option
44 | do
45 | case $option in
46 | i) INPUT_S3_FOLDER=$OPTARG ;; # s3 URI to input folder
47 | o) OUTPUT_S3_FOLDER=$OPTARG ;; # s3 URI to output folder
48 | p) UUID=$OPTARG ;; # File prefix
49 | w) WDIR=$OPTARG ;; # path to local working folder
50 | d) DBDIR=$OPTARG ;; # path to local sequence databases
51 | x) MODEL_WEIGHTS_DIR=$OPTARG ;; # path to local weights
52 | c) CPU=$OPTARG ;; # vCPU
53 | m) MEM=$OPTARG ;; # MEM (GB)
54 | *) exit 1 ;;
55 | esac
56 | done
57 |
58 | [ -z "$INPUT_S3_FOLDER" ] && { echo "\$INPUT_S3_FOLDER undefined"; exit 1; }
59 | [ -z "$OUTPUT_S3_FOLDER" ] && { echo "\$OUTPUT_S3_FOLDER undefined"; exit 1; }
60 | [ -z "$WDIR" ] && { WDIR=$SCRIPTDIR; }
61 | [ -z "$DBDIR" ] && { DBDIR=$WDIR; }
62 | [ -z "$MODEL_WEIGHTS_DIR" ] && { MODEL_WEIGHTS_DIR=$WDIR; }
63 | [ -z "$CPU" ] && { CPU="16"; }
64 | [ -z "$MEM" ] && { MEM="64"; }
65 | [ -z "$CUDA_VISIBLE_DEVICES" ] && { CUDA_VISIBLE_DEVICES="99"; }
66 |
67 | if [ -z "$UUID" ]
68 | then
69 | if [ -z "$AWS_BATCH_JOB_ID" ]
70 | then
71 | UUID=`date "+%Y%m%d%H%M%S"`;
72 | else
73 | UUID=$AWS_BATCH_JOB_ID;
74 | fi
75 | fi
76 |
77 | IN=$WDIR/input.fa
78 |
79 | conda activate RoseTTAFold
80 |
81 | aws s3 cp $INPUT_S3_FOLDER/$UUID.msa0.a3m $WDIR/t000_.msa0.a3m
82 | aws s3 cp $INPUT_S3_FOLDER/$UUID.hhr $WDIR/t000_.hhr
83 | aws s3 cp $INPUT_S3_FOLDER/$UUID.atab $WDIR/t000_.atab
84 | aws s3 cp $INPUT_S3_FOLDER/metrics.yaml $WDIR/metrics.yaml
85 |
86 | ############################################################
87 | # End-to-end prediction
88 | ############################################################
89 | PREDICT_START="$(date +%s)"
90 | if [ ! -s $WDIR/t000_.e2e.npz ]
91 | then
92 | echo "Running end-to-end prediction"
93 | DB="$DBDIR/pdb100_2021Mar03/pdb100_2021Mar03"
94 |
95 | python $SCRIPTDIR/network/predict_e2e.py \
96 | -m $MODEL_WEIGHTS_DIR/weights \
97 | -i $WDIR/t000_.msa0.a3m \
98 | -o $WDIR/t000_.e2e \
99 | --hhr $WDIR/t000_.hhr \
100 | --atab $WDIR/t000_.atab \
101 | --db $DB
102 | fi
103 |
104 | aws s3 cp $WDIR/t000_.e2e.pdb $OUTPUT_S3_FOLDER/$UUID.e2e.pdb
105 | aws s3 cp $WDIR/t000_.e2e_init.pdb $OUTPUT_S3_FOLDER/$UUID.e2e_init.pdb
106 | aws s3 cp $WDIR/t000_.e2e.npz $OUTPUT_S3_FOLDER/$UUID.e2e.npz
107 |
108 | TOTAL_PREDICT_DURATION=$[ $(date +%s) - ${PREDICT_START} ]
109 | echo "${UUID} prediction duration: ${TOTAL_PREDICT_DURATION} sec"
110 |
111 | # Collect metrics
112 | echo "PREDICT:" >> $WDIR/metrics.yaml
113 | echo " JOB_ID: ${UUID}" >> $WDIR/metrics.yaml
114 | echo " INPUT_S3_FOLDER: ${INPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
115 | echo " OUTPUT_S3_FOLDER: ${OUTPUT_S3_FOLDER}" >> $WDIR/metrics.yaml
116 | echo "  WDIR: ${WDIR}" >> $WDIR/metrics.yaml
117 | echo " DBDIR: ${DBDIR}" >> $WDIR/metrics.yaml
118 | echo " MODEL_WEIGHTS_DIR: ${MODEL_WEIGHTS_DIR}" >> $WDIR/metrics.yaml
119 | echo " CPU: ${CPU}" >> $WDIR/metrics.yaml
120 | echo " MEM: ${MEM}" >> $WDIR/metrics.yaml
121 | echo " GPU: ${CUDA_VISIBLE_DEVICES}" >> $WDIR/metrics.yaml
122 | echo " START_TIME: ${PREDICT_START}" >> $WDIR/metrics.yaml
123 | echo " TOTAL_PREDICT_DURATION: ${TOTAL_PREDICT_DURATION}" >> $WDIR/metrics.yaml
124 |
125 | aws s3 cp $WDIR/metrics.yaml $OUTPUT_S3_FOLDER/metrics.yaml
126 |
127 | echo "Done"
--------------------------------------------------------------------------------
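The prediction stage uploads three artifacts per job ($UUID.e2e.pdb, $UUID.e2e_init.pdb, $UUID.e2e.npz). A minimal sketch of fetching them with boto3; the bucket and job name are hypothetical placeholders, and the key layout follows the <job_name>/<job_name>.* convention used by rfutils/rfutils.py:

import boto3

s3 = boto3.client("s3")
bucket, job_name = "my-results-bucket", "my-job-name"  # hypothetical values
for suffix in ("e2e.pdb", "e2e_init.pdb", "e2e.npz"):
    key = f"{job_name}/{job_name}.{suffix}"
    s3.download_file(bucket, key, f"{job_name}.{suffix}")

--------------------------------------------------------------------------------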
/data/T1028.fa:
--------------------------------------------------------------------------------
1 | >T1028 CalU17, Micromonospora echinospora, 316 residues|
2 | MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQAFPRDWFTDETPQIPVTLPDYLIDRHQVTVAQFAAFVSRTGYVTSAERAGGSMVYGEQYWEIREGACWHRPAGYGSGIRGRDDHPVVHISFADAEAYARWAGRRLPTESEWERAATGPSYRLWPWGDTWDSRNANTAEHTAGALGDLDAWRTWWGAIHAVQGPMPQTTPVGAFSPRGDSVDGCADMTGNVYEWTSTLAHLYSPATRCDPTIHLVMGRSRVIRGGSWMNFRYQVRCAERLYGDPTGWSNFALGFRCARDVTAVPHVDDNGR
--------------------------------------------------------------------------------
/data/T1036s1.fa:
--------------------------------------------------------------------------------
1 | >T1036s1 Monoclonal antibody 93k, Varicella-zoster virus, strain pOka, subunit 1, 622 residues|
2 | TKPTFYVCPPPTGSTIVRLEPPRTCPDYHLGKNFTEGIAVVYKENIAAYKFKATVYYKDVIVSTAWAGSSYTQITNRYADRVPIPVSEITDTIDKFGKCSSKATYVRNNHKVEAFNEDKNPQDMPLIASKYNSVGSKAWHTTNDTYMVAGTPGTYRTGTSVNCIIEEVEARSIFPYDSFGLSTGDIIYMSPFFGLRDGAYREHSNYAMDRFHQFEGYRQRDLDTRALLEPAARNFLVTPHLTVGWNWKPKRTEVCSLVKWREVEDVVRDEYAHNFRFTMKTLSTTFISETNEFNLNQIHLSQCVKEEARAIINRIYTTRYNSSHVRTGDIQTYLARGGFVVVFQPLLSNSLARLYLQELVRENTNHSPQKHPTRNTRSRRSVPVELRANRTITTTSSVEFAMLQFTYDHIQEHVNEMLARISSSWCQLQNRERALWSGLFPINPSALASTILDQRVKARILGDVISVSNCPELGSDTRIILQNSMRVSGSTTRCYSRPLISIVSLNGSGTVEGQLGTDNELIMSRDLLEPCVANHKRYFLFGHHYVYYEDYRYVREIAVHDVGMISTYVDLNLTLLKDREFMPLRVYTRDELRDTGLLDYSEIQRRNQMHSLRFYDIDKVVQ
--------------------------------------------------------------------------------
/data/T1078.fa:
--------------------------------------------------------------------------------
1 | >T1078 Tsp1, Trichoderma virens, 138 residues|
2 | MAAPTPADKSMMAAVPEWTITNLKRVCNAGNTSCTWTFGVDTHLATATSCTYVVKANANASQASGGPVTCGPYTITSSWSGQFGPNNGFTTFAVTDFSKKLIVWPAYTDVQVQAGKVVSPNQSYAPANLPLEHHHHHH
--------------------------------------------------------------------------------
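These CASP14 targets can be submitted as-is. A minimal sketch for validating any FASTA input before submission, assuming Biopython is installed (it is listed in requirements.txt); the residue alphabet matches the aatypes set defined in rfutils/rfutils.py:

from Bio import SeqIO

aatypes = set("ACDEFGHIKLMNPQRSTVWY")  # the 20 standard amino acids
record = next(SeqIO.parse("data/T1078.fa", "fasta"))
unexpected = set(str(record.seq).upper()) - aatypes
print(record.id, len(record.seq), "residues,",
      "ok" if not unexpected else f"unexpected characters: {unexpected}")

--------------------------------------------------------------------------------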
/img/AWS-RoseTTAFold-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/AWS-RoseTTAFold-arch.png
--------------------------------------------------------------------------------
/img/AWS-RoseTTAFold-deploy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/AWS-RoseTTAFold-deploy.png
--------------------------------------------------------------------------------
/img/LaunchStack.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/LaunchStack.jpg
--------------------------------------------------------------------------------
/img/RF_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/img/RF_workflow.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython
2 | py3Dmol
3 | boto3
4 | sagemaker
5 | matplotlib
6 | pyyaml
--------------------------------------------------------------------------------
/rfutils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-rosettafold/9a4e3fddbc07543bb53a026dfedfe37686b63e60/rfutils/__init__.py
--------------------------------------------------------------------------------
/rfutils/rfutils.py:
--------------------------------------------------------------------------------
1 | """
2 | Helper functions for the AWS-RoseTTAFold notebook.
3 | """
4 |
5 | ## Load dependencies
6 | from Bio import SeqIO
7 | import boto3
8 | from datetime import datetime
9 | import json
10 | import matplotlib.pyplot as plt
11 | from matplotlib import colors
12 | import numpy as np
13 | import os
14 | import pandas as pd
15 | import py3Dmol
16 | import yaml
17 | from re import sub
18 | import sagemaker
19 | import string
20 | from string import ascii_uppercase, ascii_lowercase
21 | from time import sleep
22 | import uuid
23 |
24 | # Get service clients
25 | session = boto3.session.Session()
26 | sm_session = sagemaker.session.Session()
27 | region = session.region_name
28 | role = sagemaker.get_execution_role()
29 | s3 = boto3.client("s3", region_name=region)
30 |
31 | pymol_color_list = [
32 | "#33ff33",
33 | "#00ffff",
34 | "#ff33cc",
35 | "#ffff00",
36 | "#ff9999",
37 | "#e5e5e5",
38 | "#7f7fff",
39 | "#ff7f00",
40 | "#7fff7f",
41 | "#199999",
42 | "#ff007f",
43 | "#ffdd5e",
44 | "#8c3f99",
45 | "#b2b2b2",
46 | "#007fff",
47 | "#c4b200",
48 | "#8cb266",
49 | "#00bfbf",
50 | "#b27f7f",
51 | "#fcd1a5",
52 | "#ff7f7f",
53 | "#ffbfdd",
54 | "#7fffff",
55 | "#ffff7f",
56 | "#00ff7f",
57 | "#337fcc",
58 | "#d8337f",
59 | "#bfff3f",
60 | "#ff7fff",
61 | "#d8d8ff",
62 | "#3fffbf",
63 | "#b78c4c",
64 | "#339933",
65 | "#66b2b2",
66 | "#ba8c84",
67 | "#84bf00",
68 | "#b24c66",
69 | "#7f7f7f",
70 | "#3f3fa5",
71 | "#a5512b",
72 | ]
73 |
74 | pymol_cmap = colors.ListedColormap(pymol_color_list)
75 | alphabet_list = list(ascii_uppercase + ascii_lowercase)
76 |
77 | aatypes = set("ACDEFGHIKLMNPQRSTVWY")
78 |
79 |
80 | def create_job_name(suffix=None):
81 |
82 | """
83 | Define a simple job identifier
84 | """
85 |
86 |     if suffix is None:
87 | return datetime.utcnow().strftime("%Y%m%dT%H%M%S")
88 | else:
89 |         ## Ensure that the suffix conforms to the Batch requirements (only letters,
90 |         ## numbers, hyphens, and underscores are allowed).
91 |         suffix = sub(r"\W", "_", suffix)
92 | return datetime.utcnow().strftime("%Y%m%dT%H%M%S") + "_" + suffix
93 |
94 |
95 | def display_msa(jobId, bucket):
96 | """
97 | Display the MSA plot in a Jupyter notebook cell
98 | """
99 |
100 | info = get_batch_job_info(jobId)
101 |
102 | if info["status"] == "SUCCEEDED":
103 | print(
104 | f"Downloading MSA file from s3://{bucket}/{info['jobName']}/{info['jobName']}.msa0.a3m"
105 | )
106 | s3.download_file(
107 | bucket,
108 | f"{info['jobName']}/{info['jobName']}.msa0.a3m",
109 | "data/alignment.msa",
110 | )
111 | msa_all = parse_a3m("data/alignment.msa")
112 | plot_msa_info(msa_all)
113 | else:
114 | print(
115 | f"Data prep job {info['jobId']} is in {info['status']} status. Please try again once the job has completed."
116 | )
117 |
118 |
119 | def display_structure(
120 | jobId,
121 | bucket,
122 | color="lDDT",
123 | show_sidechains=False,
124 | show_mainchains=False,
125 | chains=1,
126 | vmin=0.5,
127 | vmax=0.9,
128 | ):
129 | """
130 | Display the predicted structure in a Jupyter notebook cell
131 | """
132 | if color not in ["chain", "lDDT", "rainbow"]:
133 |         raise ValueError("Color must be 'lDDT' (default), 'chain', or 'rainbow'")
134 |
135 | info = get_batch_job_info(jobId)
136 |
137 | if info["status"] == "SUCCEEDED":
138 | print(
139 | f"Downloading PDB file from s3://{bucket}/{info['jobName']}/{info['jobName']}.e2e.pdb"
140 | )
141 | s3.download_file(
142 | bucket, f"{info['jobName']}/{info['jobName']}.e2e.pdb", "data/e2e.pdb"
143 | )
144 | plot_pdb(
145 | "data/e2e.pdb",
146 | show_sidechains=show_sidechains,
147 | show_mainchains=show_mainchains,
148 | color=color,
149 | chains=chains,
150 | vmin=vmin,
151 | vmax=vmax,
152 | ).show()
153 | if color == "lDDT":
154 | plot_plddt_legend().show()
155 | else:
156 | print(
157 | f"{info['jobId']} is in {info['status']} status. Please try again once the job has completed."
158 | )
159 |
160 |
161 | def get_batch_job_info(jobId):
162 |
163 | """
164 | Retrieve and format information about a batch job.
165 | """
166 |
167 | client = boto3.client("batch")
168 | job_description = client.describe_jobs(jobs=[jobId])
169 |
170 | output = {
171 | "jobArn": job_description["jobs"][0]["jobArn"],
172 | "jobName": job_description["jobs"][0]["jobName"],
173 | "jobId": job_description["jobs"][0]["jobId"],
174 | "status": job_description["jobs"][0]["status"],
175 | "createdAt": datetime.utcfromtimestamp(
176 | job_description["jobs"][0]["createdAt"] / 1000
177 | ).strftime("%Y-%m-%dT%H:%M:%SZ"),
178 | "dependsOn": job_description["jobs"][0]["dependsOn"],
179 | "tags": job_description["jobs"][0]["tags"],
180 | }
181 |
182 | if output["status"] in ["STARTING", "RUNNING", "SUCCEEDED", "FAILED"]:
183 | output["logStreamName"] = job_description["jobs"][0]["container"][
184 | "logStreamName"
185 | ]
186 | return output
187 |
188 |
189 | def get_batch_logs(logStreamName):
190 |
191 | """
192 | Retrieve and format logs for batch job.
193 | """
194 |
195 | client = boto3.client("logs")
196 | try:
197 | response = client.get_log_events(
198 | logGroupName="/aws/batch/job", logStreamName=logStreamName
199 | )
200 |     except client.exceptions.ResourceNotFoundException:
201 | return f"Log stream {logStreamName} does not exist. Please try again in a few minutes"
202 |
203 | logs = pd.DataFrame.from_dict(response["events"])
204 | logs.timestamp = logs.timestamp.transform(
205 | lambda x: datetime.fromtimestamp(x / 1000)
206 | )
207 | logs.drop("ingestionTime", axis=1, inplace=True)
208 | return logs
209 |
210 |
211 | def get_rf_job_info(
212 | cpu_queue="AWS-RoseTTAFold-CPU", gpu_queue="AWS-RoseTTAFold-GPU", hrs_in_past=1
213 | ):
214 |
215 | """
216 | Display information about recent AWS-RoseTTAFold jobs
217 | """
218 | from datetime import datetime
219 |
220 | batch_client = boto3.client("batch")
221 | recent_jobs = list_recent_jobs([cpu_queue, gpu_queue], hrs_in_past)
222 | recent_job_df = pd.DataFrame.from_dict(recent_jobs)
223 | list_of_lists = []
224 | if len(recent_job_df) > 0:
225 | detail_list = batch_client.describe_jobs(jobs=recent_job_df.jobId.to_list())
226 | for job in detail_list["jobs"]:
227 | resource_dict = {}
228 | for resource in job["container"]["resourceRequirements"]:
229 | resource_dict[resource["type"]] = resource["value"]
230 | row = [
231 | job["jobName"],
232 | job["jobId"],
233 | job["jobQueue"],
234 | job["status"],
235 | datetime.fromtimestamp(job["createdAt"] / 1000),
236 | datetime.fromtimestamp(job["startedAt"] / 1000)
237 | if "startedAt" in job
238 | else "NaT",
239 | datetime.fromtimestamp(job["stoppedAt"] / 1000)
240 | if "stoppedAt" in job
241 | else "NaT",
242 | str(
243 | datetime.fromtimestamp(job["stoppedAt"] / 1000)
244 | - datetime.fromtimestamp(job["startedAt"] / 1000)
245 | )
246 | if "startedAt" in job and "stoppedAt" in job
247 | else "NaN",
248 | (job["stoppedAt"] / 1000) - (job["startedAt"] / 1000)
249 | if "startedAt" in job and "stoppedAt" in job
250 | else "NaN",
251 | job["jobDefinition"],
252 | job["container"]["logStreamName"]
253 | if "logStreamName" in job["container"]
254 | else "",
255 | int(resource_dict["VCPU"]),
256 | int(float(resource_dict["MEMORY"]) / 1000),
257 | int(resource_dict["GPU"]) if "GPU" in resource_dict else 0,
258 | ]
259 | list_of_lists.append(row)
260 |
261 | return pd.DataFrame(
262 | list_of_lists,
263 | columns=[
264 | "jobName",
265 | "jobId",
266 | "jobQueue",
267 | "status",
268 | "createdAt",
269 | "startedAt",
270 | "stoppedAt",
271 | "duration",
272 | "duration_sec",
273 | "jobDefinition",
274 | "logStreamName",
275 | "vCPUs",
276 | "mem_GB",
277 | "GPUs",
278 | ],
279 | ).sort_values(by="jobName", ascending=False)
280 |
281 |
282 | def get_rf_job_metrics(job_name, bucket, region="us-east-1"):
283 | """
284 | Retrieve RF job metrics from the metrics.yaml file
285 | """
286 |
287 | s3.download_file(
288 | bucket,
289 | f"{job_name}/metrics.yaml",
290 | "data/metrics.yaml",
291 | )
292 |
293 | with open("data/metrics.yaml", "r") as stream:
294 | try:
295 | metrics = yaml.safe_load(stream)
296 | except yaml.YAMLError as exc:
297 |             print(exc); metrics = None  # ensure the return below cannot raise NameError
298 |
299 | return metrics
300 |
301 |
302 | def get_rosettafold_batch_resources(region="us-east-1"):
303 | """
304 | Retrieve a list of batch job definitions and queues created as part of an
305 | AWS-RoseTTAFold stack.
306 | """
307 | batch = boto3.client("batch", region_name=region)
308 |
309 | job_definition_response = batch.describe_job_definitions()
310 | list_of_lists = []
311 |
312 | job_list = []
313 | for jd in job_definition_response["jobDefinitions"]:
314 | if jd["status"] == "ACTIVE" and "aws-rosettafold" in jd["jobDefinitionName"]:
315 | name_split = jd["jobDefinitionName"].split("-")
316 | entry = {
317 | "stackId": name_split[5],
318 | "dataPrepJobDefinition": jd["jobDefinitionName"],
319 | }
320 | row = [
321 | name_split[5],
322 | name_split[4],
323 | "Job Definition",
324 | jd["jobDefinitionName"],
325 | ]
326 | job_list.append(row)
327 |
328 | job_queue_response = batch.describe_job_queues()
329 | jq_list = []
330 | for jq in job_queue_response["jobQueues"]:
331 | if (
332 | jq["state"] == "ENABLED"
333 | and jq["status"] == "VALID"
334 | and "aws-rosettafold-queue" in jq["jobQueueName"]
335 | ):
336 | name_split = jq["jobQueueName"].split("-")
337 | row = [name_split[4], name_split[3], "Job Queue", jq["jobQueueName"]]
338 | job_list.append(row)
339 |
340 | df = pd.DataFrame(
341 | job_list,
342 | columns=["stackId", "instanceType", "resourceType", "resourceName"],
343 | ).sort_values(by=["stackId", "instanceType"], ascending=False)
344 | df["type"] = df["instanceType"] + df["resourceType"]
345 | df = df.pivot(index="stackId", columns="type", values=["resourceName"])
346 | df.columns = df.columns.get_level_values(1)
347 | df = df.rename(
348 | columns={
349 | "cpudataprepJob Definition": "CPUDataPrepJobDefinition",
350 | "cpuJob Queue": "CPUJobQueue",
351 | "cpupredictJob Definition": "CPUPredictJobDefinition",
352 | "gpupredictJob Definition": "GPUPredictJobDefinition",
353 | "gpuJob Queue": "GPUJobQueue",
354 | }
355 | )
356 | return df
357 |
358 |
359 | def list_recent_jobs(job_queues, hrs_in_past=1):
360 |
361 | """
362 | Display recently-submitted jobs.
363 | """
364 |
365 | batch_client = boto3.client("batch")
366 | result = []
367 | for queue in job_queues:
368 | recent_queue_jobs = batch_client.list_jobs(
369 | jobQueue=queue,
370 | filters=[
371 | {
372 | "name": "AFTER_CREATED_AT",
373 | "values": [
374 | str(round(datetime.now().timestamp()) - (hrs_in_past * 3600))
375 | ],
376 | }
377 | ],
378 | )
379 | result = result + recent_queue_jobs["jobSummaryList"]
380 |
381 | return result
382 |
383 |
384 | def parse_a3m(filename):
385 |
386 | """
387 | Read A3M and convert letters into integers in the 0..20 range,
388 | Copied from https://github.com/RosettaCommons/RoseTTAFold/blob/main/network/parsers.py
389 | """
390 |
391 | msa = []
392 | table = str.maketrans(dict.fromkeys(string.ascii_lowercase))
393 | # read file line by line
394 | for line in open(filename, "r"):
395 | # skip labels
396 | if line[0] == ">":
397 | continue
398 | # remove right whitespaces
399 | line = line.rstrip()
400 | # remove lowercase letters and append to MSA
401 | msa.append(line.translate(table))
402 | # convert letters into numbers
403 | alphabet = np.array(list("ARNDCQEGHILKMFPSTWYV-"), dtype="|S1").view(np.uint8)
404 | msa = np.array([list(s) for s in msa], dtype="|S1").view(np.uint8)
405 | for i in range(alphabet.shape[0]):
406 | msa[msa == alphabet[i]] = i
407 | # treat all unknown characters as gaps
408 | msa[msa > 20] = 20
409 | return msa
410 |
411 |
412 | def read_pdb_renum(pdb_filename, Ls=None):
413 |
414 | """
415 | Process pdb file.
416 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py
417 | """
418 |
419 | if Ls is not None:
420 | L_init = 0
421 | new_chain = {}
422 | for L, c in zip(Ls, alphabet_list):
423 | new_chain.update({i: c for i in range(L_init, L_init + L)})
424 | L_init += L
425 | n, pdb_out = 1, []
426 | resnum_, chain_ = 1, "A"
427 | for line in open(pdb_filename, "r"):
428 | if line[:4] == "ATOM":
429 | chain = line[21:22]
430 | resnum = int(line[22 : 22 + 5])
431 | if resnum != resnum_ or chain != chain_:
432 | resnum_, chain_ = resnum, chain
433 | n += 1
434 | if Ls is None:
435 | pdb_out.append("%s%4i%s" % (line[:22], n, line[26:]))
436 | else:
437 | pdb_out.append(
438 | "%s%s%4i%s" % (line[:21], new_chain[n - 1], n, line[26:])
439 | )
440 | return "".join(pdb_out)
441 |
442 |
443 | def plot_msa_info(msa):
444 |
445 | """
446 | Plot a representation of the MSA coverage.
447 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py
448 | """
449 |
450 | msa_arr = np.unique(msa, axis=0)
451 | total_msa_size = len(msa_arr)
452 | print(f"\n{total_msa_size} Sequences Found in Total\n")
453 |
454 | if total_msa_size > 1:
455 | plt.figure(figsize=(8, 5), dpi=100)
456 | plt.title("Sequence coverage")
457 | seqid = (msa[0] == msa_arr).mean(-1)
458 | seqid_sort = seqid.argsort()
459 | non_gaps = (msa_arr != 20).astype(float)
460 | non_gaps[non_gaps == 0] = np.nan
461 | plt.imshow(
462 | non_gaps[seqid_sort] * seqid[seqid_sort, None],
463 | interpolation="nearest",
464 | aspect="auto",
465 | cmap="rainbow_r",
466 | vmin=0,
467 | vmax=1,
468 | origin="lower",
469 | extent=(0, msa_arr.shape[1], 0, msa_arr.shape[0]),
470 | )
471 | plt.plot((msa_arr != 20).sum(0), color="black")
472 | plt.xlim(0, msa_arr.shape[1])
473 | plt.ylim(0, msa_arr.shape[0])
474 | plt.colorbar(
475 | label="Sequence identity to query",
476 | )
477 | plt.xlabel("Positions")
478 | plt.ylabel("Sequences")
479 | plt.show()
480 | else:
481 | print("Unable to display MSA of length 1")
482 |
483 |
484 | def plot_pdb(
485 | pred_output_path,
486 | show_sidechains=False,
487 | show_mainchains=False,
488 | color="lDDT",
489 | chains=None,
490 | Ls=None,
491 | vmin=0.5,
492 | vmax=0.9,
493 | color_HP=False,
494 | size=(800, 480),
495 | ):
496 |
497 | """
498 | Create a 3D view of a pdb structure
499 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py
500 | """
501 |
502 | if chains is None:
503 | chains = 1 if Ls is None else len(Ls)
504 |
505 | view = py3Dmol.view(
506 | js="https://3dmol.org/build/3Dmol.js", width=size[0], height=size[1]
507 | )
508 | view.addModel(read_pdb_renum(pred_output_path, Ls), "pdb")
509 | if color == "lDDT":
510 | view.setStyle(
511 | {
512 | "cartoon": {
513 | "colorscheme": {
514 | "prop": "b",
515 | "gradient": "roygb",
516 | "min": vmin,
517 | "max": vmax,
518 | }
519 | }
520 | }
521 | )
522 | elif color == "rainbow":
523 | view.setStyle({"cartoon": {"color": "spectrum"}})
524 | elif color == "chain":
525 | for n, chain, color in zip(range(chains), alphabet_list, pymol_color_list):
526 | view.setStyle({"chain": chain}, {"cartoon": {"color": color}})
527 | if show_sidechains:
528 | BB = ["C", "O", "N"]
529 | HP = [
530 | "ALA",
531 | "GLY",
532 | "VAL",
533 | "ILE",
534 | "LEU",
535 | "PHE",
536 | "MET",
537 | "PRO",
538 | "TRP",
539 | "CYS",
540 | "TYR",
541 | ]
542 | if color_HP:
543 | view.addStyle(
544 | {"and": [{"resn": HP}, {"atom": BB, "invert": True}]},
545 | {"stick": {"colorscheme": "yellowCarbon", "radius": 0.3}},
546 | )
547 | view.addStyle(
548 | {"and": [{"resn": HP, "invert": True}, {"atom": BB, "invert": True}]},
549 | {"stick": {"colorscheme": "whiteCarbon", "radius": 0.3}},
550 | )
551 | view.addStyle(
552 | {"and": [{"resn": "GLY"}, {"atom": "CA"}]},
553 | {"sphere": {"colorscheme": "yellowCarbon", "radius": 0.3}},
554 | )
555 | view.addStyle(
556 | {"and": [{"resn": "PRO"}, {"atom": ["C", "O"], "invert": True}]},
557 | {"stick": {"colorscheme": "yellowCarbon", "radius": 0.3}},
558 | )
559 | else:
560 | view.addStyle(
561 | {
562 | "and": [
563 | {"resn": ["GLY", "PRO"], "invert": True},
564 | {"atom": BB, "invert": True},
565 | ]
566 | },
567 | {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}},
568 | )
569 | view.addStyle(
570 | {"and": [{"resn": "GLY"}, {"atom": "CA"}]},
571 | {"sphere": {"colorscheme": f"WhiteCarbon", "radius": 0.3}},
572 | )
573 | view.addStyle(
574 | {"and": [{"resn": "PRO"}, {"atom": ["C", "O"], "invert": True}]},
575 | {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}},
576 | )
577 | if show_mainchains:
578 | BB = ["C", "O", "N", "CA"]
579 | view.addStyle(
580 | {"atom": BB}, {"stick": {"colorscheme": f"WhiteCarbon", "radius": 0.3}}
581 | )
582 | view.zoomTo()
583 | return view
584 |
585 |
586 | def plot_plddt_legend(dpi=100):
587 |
588 | """
589 | Create 3D Plot legend
590 | Copied from https://github.com/sokrypton/ColabFold/blob/main/beta/colabfold.py
591 | """
592 |
593 | thresh = [
594 | "plDDT:",
595 | "Very low (<50)",
596 | "Low (60)",
597 | "OK (70)",
598 | "Confident (80)",
599 | "Very high (>90)",
600 | ]
601 | plt.figure(figsize=(1, 0.1), dpi=dpi)
602 | ########################################
603 | for c in ["#FFFFFF", "#FF0000", "#FFFF00", "#00FF00", "#00FFFF", "#0000FF"]:
604 | plt.bar(0, 0, color=c)
605 | plt.legend(
606 | thresh,
607 | frameon=False,
608 | loc="center",
609 | ncol=6,
610 | handletextpad=1,
611 | columnspacing=1,
612 | markerscale=0.5,
613 | )
614 | plt.axis(False)
615 | return plt
616 |
617 |
618 | def submit_2_step_job(
619 |     bucket=None,
620 |     job_name=None,
621 |     data_prep_input_file="input.fa",
622 |     data_prep_job_definition="AWS-RoseTTAFold-CPU",
623 |     data_prep_queue="AWS-RoseTTAFold-CPU",
624 |     data_prep_cpu=8,
625 |     data_prep_mem=32,
626 |     predict_job_definition="AWS-RoseTTAFold-GPU",
627 |     predict_queue="AWS-RoseTTAFold-GPU",
628 |     predict_cpu=4,
629 |     predict_mem=16,
630 |     predict_gpu=True,
631 |     db_path="/fsx/aws-rosettafold-ref-data",
632 |     weights_path="/fsx/aws-rosettafold-ref-data",
633 | ):
634 |
635 |     """
636 |     Submit a 2-step RoseTTAFold prediction job to AWS Batch.
637 |     """
638 |
639 |     # Resolve defaults at call time; a def-time uuid4() would reuse one job name
640 |     bucket = bucket or sm_session.default_bucket()
641 |     job_name = job_name or uuid.uuid4()
642 |
643 |     data_prep_response = submit_rf_data_prep_job(
644 |         bucket=bucket,
645 |         job_name=job_name,
646 |         input_file=data_prep_input_file,
647 |         job_definition=data_prep_job_definition,
648 |         job_queue=data_prep_queue,
649 |         cpu=data_prep_cpu,
650 |         mem=data_prep_mem,
651 |         db_path=db_path,
652 |     )
653 |
654 |     predict_response = submit_rf_predict_job(
655 |         bucket=bucket,
656 |         job_name=job_name,
657 |         job_definition=predict_job_definition,
658 |         job_queue=predict_queue,
659 |         cpu=predict_cpu,
660 |         mem=predict_mem,
661 |         gpu=predict_gpu,
662 |         db_path=db_path,
663 |         weights_path=weights_path,
664 |         depends_on=data_prep_response["jobId"],
665 |     )
666 |
667 |     print(
668 |         f"Data prep job ID {data_prep_response['jobId']} and predict job ID {predict_response['jobId']} submitted"
669 |     )
670 |     return [data_prep_response, predict_response]
671 |
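    | # Example (illustrative sketch, not executed on import): submit both steps for
    | # a FASTA already staged at s3://<bucket>/<job-name>/input.fa; the job name is
    | # an arbitrary example value.
    | #
    | #     prep_response, predict_response = submit_2_step_job(
    | #         job_name="T1078-demo",
    | #         data_prep_cpu=8,
    | #         data_prep_mem=32,
    | #     )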
672 |
673 | def submit_rf_data_prep_job(
674 |     bucket=None,
675 |     job_name=None,
676 |     input_file="input.fa",
677 |     job_definition="AWS-RoseTTAFold-CPU",
678 |     job_queue="AWS-RoseTTAFold-CPU",
679 |     cpu=8,
680 |     mem=32,
681 |     db_path="/fsx/aws-rosettafold-ref-data",
682 | ):
683 |
684 |     """
685 |     Submit a RoseTTAFold data prep job (i.e. the first half of the e2e workflow) to AWS Batch.
686 |     """
687 |
688 |     # Resolve defaults at call time so repeated calls don't share one job name
689 |     bucket = bucket or sm_session.default_bucket()
690 |     job_name = job_name or uuid.uuid4()
691 |     working_folder = f"s3://{bucket}/{job_name}"
692 |     batch_client = boto3.client("batch")
693 |     output_msa_uri = f"{working_folder}/{job_name}.msa0.a3m"
694 |     output_hhr_uri = f"{working_folder}/{job_name}.hhr"
695 |     output_atab_uri = f"{working_folder}/{job_name}.atab"
696 |
697 |     response = batch_client.submit_job(
698 |         jobDefinition=job_definition,
699 |         jobName=str(job_name),
700 |         jobQueue=job_queue,
701 |         containerOverrides={
702 |             "command": [
703 |                 "/bin/bash",
704 |                 "run_aws_data_prep_ver.sh",
705 |                 "-i",
706 |                 working_folder,
707 |                 "-n",
708 |                 input_file,
709 |                 "-o",
710 |                 working_folder,
711 |                 "-p",
712 |                 str(job_name),
713 |                 "-w",
714 |                 "/work",
715 |                 "-d",
716 |                 db_path,
717 |                 "-c",
718 |                 str(cpu),
719 |                 "-m",
720 |                 str(mem),
721 |             ],
722 |             "resourceRequirements": [
723 |                 {"value": str(cpu), "type": "VCPU"},
724 |                 {"value": str(mem * 1000), "type": "MEMORY"},  # MiB
725 |             ],
726 |         },
727 |         tags={
728 |             "output_msa_uri": output_msa_uri,
729 |             "output_hhr_uri": output_hhr_uri,
730 |             "output_atab_uri": output_atab_uri,
731 |         },
    |     )
    |     print(f"Job ID {response['jobId']} submitted")
    |     return response
732 |
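    | # Example (sketch): run only the data prep half. The MSA/template outputs
    | # (.msa0.a3m, .hhr, .atab) land under s3://<bucket>/<job-name>/ and their URIs
    | # are attached to the Batch job as tags.
    | #
    | #     prep = submit_rf_data_prep_job(job_name="T1078-demo", cpu=16, mem=64)
    | #     print(prep["jobId"])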
733 |
734 | def submit_rf_predict_job(
735 |     bucket=None,
736 |     job_name=None,
737 |     job_definition="AWS-RoseTTAFold-GPU",
738 |     job_queue="AWS-RoseTTAFold-GPU",
739 |     cpu=4,
740 |     mem=16,
741 |     gpu=True,
742 |     db_path="/fsx/aws-rosettafold-ref-data",
743 |     weights_path="/fsx/aws-rosettafold-ref-data",
744 |     depends_on=None,
745 | ):
746 |
747 |     """
748 |     Submit a RoseTTAFold prediction job (i.e. the second half of the e2e workflow) to AWS Batch.
749 |     """
750 |
751 |     # Resolve defaults at call time so repeated calls don't share one job name
752 |     bucket = bucket or sm_session.default_bucket()
753 |     job_name = job_name or uuid.uuid4()
754 |     working_folder = f"s3://{bucket}/{job_name}"
755 |     batch_client = boto3.client("batch")
756 |     output_pdb_uri = f"{working_folder}/{job_name}.e2e.pdb"
757 |
758 |     container_overrides = {
759 |         "command": [
760 |             "/bin/bash",
761 |             "run_aws_predict_ver.sh",
762 |             "-i",
763 |             working_folder,
764 |             "-o",
765 |             working_folder,
766 |             "-p",
767 |             str(job_name),
768 |             "-w",
769 |             "/work",
770 |             "-d",
771 |             db_path,
772 |             "-x",
773 |             weights_path,
774 |             "-c",
775 |             str(cpu),
776 |             "-m",
777 |             str(mem),
778 |         ],
779 |         "resourceRequirements": [
780 |             {"value": str(cpu), "type": "VCPU"},
781 |             {"value": str(mem * 1000), "type": "MEMORY"},  # MiB
782 |         ],
783 |     }
784 |
785 |     if gpu:
786 |         container_overrides["resourceRequirements"].append(
787 |             {"value": "1", "type": "GPU"}
788 |         )
789 |
790 |     # Skip dependsOn when no parent job ID is given (an empty jobId fails validation)
791 |     submit_kwargs = {}
792 |     if depends_on:
793 |         submit_kwargs["dependsOn"] = [{"jobId": depends_on, "type": "SEQUENTIAL"}]
794 |     response = batch_client.submit_job(
795 |         jobDefinition=job_definition,
796 |         jobName=str(job_name),
    |         jobQueue=job_queue,
    |         containerOverrides=container_overrides,
    |         tags={"output_pdb_uri": output_pdb_uri},
    |         **submit_kwargs,
    |     )
    |     print(f"Job ID {response['jobId']} submitted")
    |     return response
797 |
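    | # Example (sketch): chain the GPU prediction job onto a data prep job by
    | # passing its job ID; Batch holds this job until the parent finishes.
    | #
    | #     predict = submit_rf_predict_job(
    | #         job_name="T1078-demo",
    | #         depends_on=prep["jobId"],
    | #     )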
798 |
799 | def upload_fasta_to_s3(
800 |     record, bucket=None, job_name=None
801 | ):
802 |
803 |     """
804 |     Write a Biopython SeqRecord to a FASTA file and upload it to S3.
805 |     """
806 |
807 |     # Resolve defaults at call time so each upload gets a fresh job name
808 |     bucket = bucket or sm_session.default_bucket()
809 |     job_name = job_name or uuid.uuid4()
810 |     s3 = boto3.client("s3", region_name=region)
811 |     file_out = "_tmp.fasta"
812 |     with open(file_out, "w") as f_out:
813 |         SeqIO.write(record, f_out, "fasta")
814 |     object_name = f"{job_name}/input.fa"
815 |     s3.upload_file(file_out, bucket, object_name)  # returns None on success
816 |     os.remove(file_out)
    |     s3_uri = f"s3://{bucket}/{object_name}"
    |     print(f"Sequence file uploaded to {s3_uri}")
    |     return s3_uri
817 |
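    | # Example (sketch): build a Biopython SeqRecord and stage it for a job; the
    | # sequence and job name are placeholder values.
    | #
    | #     from Bio.Seq import Seq
    | #     from Bio.SeqRecord import SeqRecord
    | #
    | #     record = SeqRecord(Seq("MKTAYIAKQR"), id="demo", description="")
    | #     s3_uri = upload_fasta_to_s3(record, job_name="T1078-demo")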
818 |
819 | def wait_for_job_start(jobId, pause=30):
820 |
821 |     """
822 |     Block until a job leaves the pre-running states, polling every `pause` seconds.
823 |     """
824 |
825 |     status = get_batch_job_info(jobId)["status"]
826 |     print(status)
827 |     while get_batch_job_info(jobId)["status"] in [
828 |         "SUBMITTED",
829 |         "PENDING",
830 |         "RUNNABLE",
831 |         "STARTING",
832 |     ]:
833 |         sleep(pause)
834 |         new_status = get_batch_job_info(jobId)["status"]
835 |         if new_status != status:
836 |             print("\n" + new_status)
837 |         else:
838 |             print(".", end="")
839 |         status = new_status
840 |
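    | # Example (sketch): submit a job, then block until it reaches RUNNING.
    | #
    | #     prep = submit_rf_data_prep_job(job_name="T1078-demo")
    | #     wait_for_job_start(prep["jobId"], pause=15)
    |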
--------------------------------------------------------------------------------