├── cdk.json ├── requirements-dev.txt ├── requirements.txt ├── src │ ├── lambda_producer.py │ └── lambda_consumer.py ├── layer │ └── python │ └── dask_processing.py ├── README.md ├── .gitignore └── app.py /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python app.py" 3 | } -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib>=2.0.0 2 | constructs>=10.0.0 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | coiled==0.6.2 2 | boto3==1.24.29 3 | distributed==2023.5.0 4 | -------------------------------------------------------------------------------- /src/lambda_producer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import json 4 | from datetime import datetime 5 | 6 | import boto3 7 | 8 | 9 | def producer(event, context): 10 | """ 11 | Scheduled CRON Lambda function which produces example files, 12 | simulating some external data producer. Files are placed on 13 | a bucket which has a listener to trigger the `consumer` 14 | function to process them. 15 | """ 16 | # We only need to demonstrate 'some file' landing on 17 | # S3 which triggers some downstream processing. To avoid convoluting 18 | # the example with added complexity, we deposit a file containing 19 | # a single number and let downstream processing decide how to interpret it. 20 | bucket = os.environ["S3_BUCKET"] 21 | 22 | d = datetime.utcnow() 23 | key = f"data/year={d.year}/month={d.month}/day={d.day}/hour={d.hour}/minute={d.minute}/second={d.second}/data.json" 24 | 25 | data = json.dumps({"count": random.randint(10, 1_000)}).encode() 26 | 27 | boto3.client("s3").put_object(Bucket=bucket, Key=key, Body=data) 28 | -------------------------------------------------------------------------------- /layer/python/dask_processing.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import json 3 | import boto3 4 | import pathlib 5 | 6 | 7 | def process_s3_file(bucket, key): 8 | """ 9 | Process an S3 file 10 | """ 11 | import dask 12 | import dask.dataframe as dd 13 | 14 | # **NOTE** Import any heavy external deps inside this function so the client, 15 | # which runs on Lambda and may not have these heavier libs installed, 16 | # doesn't need to import them. 17 | resp = boto3.client("s3").get_object(Bucket=bucket, Key=key) 18 | count = json.loads(resp["Body"].read().decode())["count"] 19 | 20 | # Example processing: 21 | # We just create some busy work by 22 | # making a timeseries spanning 'count' days and computing 23 | # some statistics, then returning; something that would be very 24 | # uncomfortable to do on Lambda. 25 | 26 | # In practice, reading the file, massaging the data, then writing 27 | # to S3, Redshift, etc. would be more useful. 28 | 29 | end = datetime.utcnow() 30 | start = end - timedelta(days=count) 31 | timeseries = dask.datasets.timeseries(start, end) 32 | result = timeseries.groupby("name").mean().y.std().compute() 33 | return result 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # What is this?
3 | 4 | An example AWS stack written in Python using [CDK](https://docs.aws.amazon.com/cdk/v2/guide/home.html) to 5 | demonstrate using AWS Lambda as a Dask client to coordinate and offload work to an existing cluster. 6 | 7 | --- 8 | 9 | # What is in the stack? 10 | 11 | The core example is files landing in S3, which then trigger a Lambda function. This function connects to an 12 | existing Dask cluster to submit the file for processing. 13 | 14 | To complete the example, the stack deploys the S3 bucket along with an example 'producer' to mimic a data 15 | vendor which deposits files every minute. From there, the example uses the input from each file to do 16 | some arbitrary computation on the cluster. 17 | 18 | --- 19 | 20 | # How to try? 21 | 22 | ## Build 23 | 24 | ```bash 25 | $ python -m venv .env 26 | $ source .env/bin/activate 27 | $ pip install -r requirements-dev.txt 28 | $ pip install -r requirements.txt 29 | ``` 30 | 31 | This installs the AWS CDK Python libraries and then this example's runtime dependencies. 32 | 33 | Install the latest version of the AWS CDK CLI: 34 | 35 | ```shell 36 | $ npm i -g aws-cdk 37 | ``` 38 | 39 | ### Run 40 | 41 | ```bash 42 | cdk deploy \ 43 | --toolkit-stack-name <> \ 44 | --parameters CoiledToken=<> \ 45 | --parameters CoiledAccount=<> \ 46 | --parameters CoiledUser=<> 47 | ``` 48 | 49 | From there, you can go to the Lambda console and run the `StartStop` Lambda function 50 | with an input of `{"action": "start"}` (or wait for the scheduled CRON event). Then 51 | go to `SecretsManager` or your Coiled account to view the new cluster; you should see 52 | new computations taking place every minute. 53 | 54 | You can also manually produce new files by going to the `Producer` Lambda function and 55 | running it (the input doesn't matter). 56 | -------------------------------------------------------------------------------- /src/lambda_consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import functools 4 | import hashlib 5 | from datetime import datetime 6 | 7 | import boto3 8 | import coiled 9 | from distributed import wait 10 | 11 | import dask_processing 12 | from dask_processing import process_s3_file 13 | 14 | 15 | def connect_to_cluster(error_no_cluster=False): 16 | """ 17 | A helper decorator which adds an `Optional[distributed.Client]` parameter to functions, 18 | passing a `distributed.Client` if a Dask cluster is already running, otherwise `None`. 19 | """ 20 | 21 | def inner(f): 22 | @functools.wraps(f) 23 | def wrapper(event, context): 24 | secrets = boto3.client("secretsmanager") 25 | resp = secrets.get_secret_value(SecretId=os.environ["SECRET_ARN"]) 26 | metadata = json.loads(resp["SecretString"]) 27 | 28 | cluster_name = metadata.get("CLUSTER_NAME") 29 | if cluster_name: 30 | cluster = coiled.Cluster( 31 | name=cluster_name, shutdown_on_close=False, credentials="local" 32 | ) 33 | client = cluster.get_client() 34 | client.upload_file(dask_processing.__file__) 35 | elif error_no_cluster: 36 | raise RuntimeError("No running cluster found.") 37 | else: 38 | client = None 39 | 40 | return f(event, context, client) 41 | 42 | return wrapper 43 | 44 | return inner 45 | 46 | 47 | @connect_to_cluster(error_no_cluster=True) 48 | def consumer(event, context, client): 49 | """ 50 | Lambda function triggered on new S3 files which need processing. 51 | 52 | It connects to an existing Dask cluster and offloads the processing work to it.
53 | This is _very_ helpful in ETL-type jobs where the Lambda resources can 54 | remain consistent across different processing jobs/files because the Lambda 55 | function itself doesn't perform any heavy movement/compute work. It only 56 | coordinates the work to be done on an existing cluster. 57 | """ 58 | print(event) 59 | 60 | # Get bucket and key of the file triggering this function 61 | bucket = event["Records"][0]["s3"]["bucket"]["name"] 62 | key = event["Records"][0]["s3"]["object"]["key"].replace("%3D", "=")  # S3 event keys are URL-encoded; restore '=' in the partitioned path 63 | 64 | value = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read() 65 | print(value) 66 | 67 | # Offload the processing to the cluster 68 | job = client.submit(process_s3_file, bucket, key) 69 | wait(job) 70 | 71 | return 72 | 73 | 74 | @connect_to_cluster(error_no_cluster=False) 75 | def start_stop_cluster(event, context, client): 76 | """ 77 | Scheduled CRON Lambda function which starts or stops a Dask cluster using 78 | Coiled, storing connection information in SecretsManager 79 | """ 80 | if event["action"] == "start": 81 | print(event) 82 | if client is not None: 83 | return # Cluster already running 84 | 85 | date = datetime.utcnow() 86 | cluster = coiled.Cluster( 87 | name=f"processing-cluster-{date.year}-{date.month}-{date.day}", 88 | software=_software_environment(), 89 | shutdown_on_close=False, 90 | n_workers=4, 91 | worker_cpu=2, 92 | ) 93 | client = cluster.get_client() 94 | _update_secret(client) 95 | elif event["action"] == "stop": 96 | if client is None: 97 | return # No cluster 98 | client.cluster.shutdown() 99 | _update_secret() 100 | else: 101 | raise ValueError(f"Unknown action '{event['action']}'") 102 | 103 | 104 | def _update_secret(client=None): 105 | boto3.client("secretsmanager").put_secret_value( 106 | SecretId=os.environ["SECRET_ARN"], 107 | SecretString=json.dumps( 108 | {} 109 | if client is None 110 | else { 111 | "CLUSTER_NAME": client.cluster.name, 112 | "SCHEDULER_ADDR": client.scheduler.address, 113 | "DASHBOARD_ADDR": client.dashboard_link, 114 | } 115 | ), 116 | ) 117 | 118 | 119 | def _current_environment(): 120 | # cmd = sys.executable + " -m pip freeze" 121 | # return subprocess.check_output(cmd.split()).decode().splitlines() 122 | # This would 'ideally' work, but packages provided via Lambda layers aren't 123 | # pip-installed packages, only on the PYTHONPATH, so they wouldn't be caught. 124 | # Instead (there is probably a better way) we store them in an env var since 125 | # there aren't many.
126 | return os.environ["INSTALLED_PKGS"].splitlines() 127 | 128 | 129 | def _software_environment(): 130 | # TODO: Software environment combined with package_sync 131 | # since we want a superset of current env 132 | deps = _current_environment() 133 | deps.extend(["dask[dataframe]", "s3fs", "bokeh==2.4.2"]) 134 | env_hash = hashlib.md5("".join(deps).encode()).hexdigest()[:5] 135 | name = f"milesg-processing-cluster-{env_hash}" 136 | coiled.create_software_environment(name=name, pip=deps) 137 | return name 138 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode,node 2 | # Edit at https://www.gitignore.io/?templates=osx,linux,python,windows,pycharm,visualstudiocode,node 3 | 4 | ### Linux ### 5 | *~ 6 | 7 | # temporary files which can be created if a process still has a handle open of a deleted file 8 | .fuse_hidden* 9 | 10 | # KDE directory preferences 11 | .directory 12 | 13 | # Linux trash folder which might appear on any partition or disk 14 | .Trash-* 15 | 16 | # .nfs files are created when an open file is removed but is still being accessed 17 | .nfs* 18 | 19 | ### Node ### 20 | # Logs 21 | logs 22 | *.log 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | lerna-debug.log* 27 | 28 | # Diagnostic reports (https://nodejs.org/api/report.html) 29 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 30 | 31 | # Runtime data 32 | pids 33 | *.pid 34 | *.seed 35 | *.pid.lock 36 | 37 | # Directory for instrumented libs generated by jscoverage/JSCover 38 | lib-cov 39 | 40 | # Coverage directory used by tools like istanbul 41 | coverage 42 | *.lcov 43 | 44 | # nyc test coverage 45 | .nyc_output 46 | 47 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 48 | .grunt 49 | 50 | # Bower dependency directory (https://bower.io/) 51 | bower_components 52 | 53 | # node-waf configuration 54 | .lock-wscript 55 | 56 | # Compiled binary addons (https://nodejs.org/api/addons.html) 57 | build/Release 58 | 59 | # Dependency directories 60 | node_modules/ 61 | jspm_packages/ 62 | 63 | # TypeScript v1 declaration files 64 | typings/ 65 | 66 | # TypeScript cache 67 | *.tsbuildinfo 68 | 69 | # Optional npm cache directory 70 | .npm 71 | 72 | # Optional eslint cache 73 | .eslintcache 74 | 75 | # Optional REPL history 76 | .node_repl_history 77 | 78 | # Output of 'npm pack' 79 | *.tgz 80 | 81 | # Yarn Integrity file 82 | .yarn-integrity 83 | 84 | # dotenv environment variables file 85 | .env 86 | .env.test 87 | 88 | # parcel-bundler cache (https://parceljs.org/) 89 | .cache 90 | 91 | # next.js build output 92 | .next 93 | 94 | # nuxt.js build output 95 | .nuxt 96 | 97 | # vuepress build output 98 | .vuepress/dist 99 | 100 | # Serverless directories 101 | .serverless/ 102 | 103 | # FuseBox cache 104 | .fusebox/ 105 | 106 | # DynamoDB Local files 107 | .dynamodb/ 108 | 109 | ### OSX ### 110 | # General 111 | .DS_Store 112 | .AppleDouble 113 | .LSOverride 114 | 115 | # Icon must end with two \r 116 | Icon 117 | 118 | # Thumbnails 119 | ._* 120 | 121 | # Files that might appear in the root of a volume 122 | .DocumentRevisions-V100 123 | .fseventsd 124 | .Spotlight-V100 125 | .TemporaryItems 126 | .Trashes 127 | .VolumeIcon.icns 128 | .com.apple.timemachine.donotpresent 129 | 130 | # Directories potentially created on remote AFP share 131 | .AppleDB 132 | 
.AppleDesktop 133 | Network Trash Folder 134 | Temporary Items 135 | .apdisk 136 | 137 | ### PyCharm ### 138 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 139 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 140 | 141 | # User-specific stuff 142 | .idea/**/workspace.xml 143 | .idea/**/tasks.xml 144 | .idea/**/usage.statistics.xml 145 | .idea/**/dictionaries 146 | .idea/**/shelf 147 | 148 | # Generated files 149 | .idea/**/contentModel.xml 150 | 151 | # Sensitive or high-churn files 152 | .idea/**/dataSources/ 153 | .idea/**/dataSources.ids 154 | .idea/**/dataSources.local.xml 155 | .idea/**/sqlDataSources.xml 156 | .idea/**/dynamic.xml 157 | .idea/**/uiDesigner.xml 158 | .idea/**/dbnavigator.xml 159 | 160 | # Gradle 161 | .idea/**/gradle.xml 162 | .idea/**/libraries 163 | 164 | # Gradle and Maven with auto-import 165 | # When using Gradle or Maven with auto-import, you should exclude module files, 166 | # since they will be recreated, and may cause churn. Uncomment if using 167 | # auto-import. 168 | .idea/*.xml 169 | .idea/*.iml 170 | .idea 171 | # .idea/modules 172 | # *.iml 173 | # *.ipr 174 | 175 | # CMake 176 | cmake-build-*/ 177 | 178 | # Mongo Explorer plugin 179 | .idea/**/mongoSettings.xml 180 | 181 | # File-based project format 182 | *.iws 183 | 184 | # IntelliJ 185 | out/ 186 | 187 | # mpeltonen/sbt-idea plugin 188 | .idea_modules/ 189 | 190 | # JIRA plugin 191 | atlassian-ide-plugin.xml 192 | 193 | # Cursive Clojure plugin 194 | .idea/replstate.xml 195 | 196 | # Crashlytics plugin (for Android Studio and IntelliJ) 197 | com_crashlytics_export_strings.xml 198 | crashlytics.properties 199 | crashlytics-build.properties 200 | fabric.properties 201 | 202 | # Editor-based Rest Client 203 | .idea/httpRequests 204 | 205 | # Android studio 3.1+ serialized cache file 206 | .idea/caches/build_file_checksums.ser 207 | 208 | ### PyCharm Patch ### 209 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 210 | 211 | # *.iml 212 | # modules.xml 213 | # .idea/misc.xml 214 | # *.ipr 215 | 216 | # Sonarlint plugin 217 | .idea/sonarlint 218 | 219 | ### Python ### 220 | # Byte-compiled / optimized / DLL files 221 | __pycache__/ 222 | *.py[cod] 223 | *$py.class 224 | 225 | # C extensions 226 | *.so 227 | 228 | # Distribution / packaging 229 | .Python 230 | build/ 231 | develop-eggs/ 232 | dist/ 233 | downloads/ 234 | eggs/ 235 | .eggs/ 236 | lib64/ 237 | parts/ 238 | sdist/ 239 | var/ 240 | wheels/ 241 | pip-wheel-metadata/ 242 | share/python-wheels/ 243 | *.egg-info/ 244 | .installed.cfg 245 | *.egg 246 | MANIFEST 247 | 248 | # PyInstaller 249 | # Usually these files are written by a python script from a template 250 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
251 | *.manifest 252 | *.spec 253 | 254 | # Installer logs 255 | pip-log.txt 256 | pip-delete-this-directory.txt 257 | 258 | # Unit test / coverage reports 259 | htmlcov/ 260 | .tox/ 261 | .nox/ 262 | .coverage 263 | .coverage.* 264 | nosetests.xml 265 | coverage.xml 266 | *.cover 267 | .hypothesis/ 268 | .pytest_cache/ 269 | 270 | # Translations 271 | *.mo 272 | *.pot 273 | 274 | # Django stuff: 275 | local_settings.py 276 | db.sqlite3 277 | db.sqlite3-journal 278 | 279 | # Flask stuff: 280 | instance/ 281 | .webassets-cache 282 | 283 | # Scrapy stuff: 284 | .scrapy 285 | 286 | # Sphinx documentation 287 | docs/_build/ 288 | 289 | # PyBuilder 290 | target/ 291 | 292 | # Jupyter Notebook 293 | .ipynb_checkpoints 294 | 295 | # IPython 296 | profile_default/ 297 | ipython_config.py 298 | 299 | # pyenv 300 | .python-version 301 | 302 | # pipenv 303 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 304 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 305 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 306 | # install all needed dependencies. 307 | #Pipfile.lock 308 | 309 | # celery beat schedule file 310 | celerybeat-schedule 311 | 312 | # SageMath parsed files 313 | *.sage.py 314 | 315 | # Environments 316 | .venv 317 | env/ 318 | venv/ 319 | ENV/ 320 | env.bak/ 321 | venv.bak/ 322 | 323 | # Spyder project settings 324 | .spyderproject 325 | .spyproject 326 | 327 | # Rope project settings 328 | .ropeproject 329 | 330 | # mkdocs documentation 331 | /site 332 | 333 | # mypy 334 | .mypy_cache/ 335 | .dmypy.json 336 | dmypy.json 337 | 338 | # Pyre type checker 339 | .pyre/ 340 | 341 | ### VisualStudioCode ### 342 | .vscode 343 | 344 | ### VisualStudioCode Patch ### 345 | # Ignore all local history of files 346 | .history 347 | 348 | ### Windows ### 349 | # Windows thumbnail cache files 350 | Thumbs.db 351 | Thumbs.db:encryptable 352 | ehthumbs.db 353 | ehthumbs_vista.db 354 | 355 | # Dump file 356 | *.stackdump 357 | 358 | # Folder config file 359 | [Dd]esktop.ini 360 | 361 | # Recycle Bin used on file shares 362 | $RECYCLE.BIN/ 363 | 364 | # Windows Installer files 365 | *.cab 366 | *.msi 367 | *.msix 368 | *.msm 369 | *.msp 370 | 371 | # Windows shortcuts 372 | *.lnk 373 | 374 | # End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode,node 375 | 376 | ### CDK-specific ignores ### 377 | *.swp 378 | cdk.context.json 379 | package-lock.json 380 | yarn.lock 381 | .cdk.staging 382 | cdk.out 383 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import hashlib 3 | from aws_cdk import ( 4 | aws_events as events, 5 | aws_lambda as lambda_, 6 | aws_iam as iam, 7 | aws_s3 as s3, 8 | aws_s3_notifications as s3_notifications, 9 | aws_events_targets as targets, 10 | aws_secretsmanager as secretsmanager, 11 | CfnParameter, 12 | SecretValue, 13 | App, 14 | BundlingOptions, 15 | Duration, 16 | Stack, 17 | RemovalPolicy, 18 | ) 19 | 20 | 21 | class DaskLambdaExampleStack(Stack): 22 | # Bucket to store example data for processing 23 | bucket: s3.Bucket 24 | 25 | # Dummy producer of files for processing 26 | lambda_producer: lambda_.Function 27 | 28 | # Example connecting to existing cluster from AWS Lambda 29 | # and coordinating some work 30 | lambda_consumer: lambda_.Function 31 | 32 | # Function that 
starts/stops Dask clusters 33 | lambda_start_stop_cluster: lambda_.Function 34 | 35 | # Shared file between Dask workers and Lambda client 36 | dask_processing_layer: lambda_.LayerVersion 37 | 38 | # Dask dependencies for the client (dask, distributed, coiled) 39 | dask_dependencies_layer: lambda_.LayerVersion 40 | 41 | def __init__(self, app: App, id: str) -> None: 42 | super().__init__(app, id) 43 | self.coiled_token = CfnParameter( 44 | self, 45 | "CoiledToken", 46 | type="String", 47 | description="Coiled Token: realistically, already saved in and retrieved from SecretsManager", 48 | ) 49 | self.coiled_account = CfnParameter( 50 | self, 51 | "CoiledAccount", 52 | type="String", 53 | description="Coiled Account: realistically, already saved in and retrieved from SecretsManager", 54 | ) 55 | self.coiled_user = CfnParameter( 56 | self, 57 | "CoiledUser", 58 | type="String", 59 | description="Coiled User: realistically, already saved in and retrieved from SecretsManager", 60 | ) 61 | 62 | self.make_bucket() 63 | self.make_secret() 64 | 65 | self.make_dask_dependencies_layer() 66 | self.make_dask_processing_layer() 67 | 68 | self.make_lambda_producer() 69 | self.make_lambda_consumer() 70 | self.make_lambda_start_stop_cluster() 71 | 72 | def make_bucket(self): 73 | self.bucket = s3.Bucket( 74 | self, 75 | "example2-data-bucket", 76 | removal_policy=RemovalPolicy.DESTROY, 77 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 78 | ) 79 | self.bucket.add_to_resource_policy( 80 | iam.PolicyStatement( 81 | sid="AllowReadingFromThisAccount", 82 | effect=iam.Effect.ALLOW, 83 | principals=[iam.AccountRootPrincipal()], 84 | actions=["s3:List*", "s3:Get*"], 85 | resources=[ 86 | self.bucket.bucket_arn, 87 | f"{self.bucket.bucket_arn}/*", 88 | ], 89 | ) 90 | ) 91 | 92 | def make_secret(self): 93 | self.secret = secretsmanager.Secret( 94 | self, 95 | "Secret", 96 | description="Connection information for the running Dask cluster", 97 | secret_object_value={ 98 | "CLUSTER_NAME": SecretValue.unsafe_plain_text(""), 99 | "SCHEDULER_ADDR": SecretValue.unsafe_plain_text(""), 100 | "DASHBOARD_ADDR": SecretValue.unsafe_plain_text(""), 101 | }, 102 | ) 103 | 104 | def make_lambda_producer(self): 105 | src_file = pathlib.Path(__file__).parent.joinpath("src/lambda_producer.py") 106 | 107 | self.lambda_producer = lambda_.Function( 108 | self, 109 | "ExampleProducerFunction", 110 | description="Example producer function.
Generating example data for processing", 111 | code=lambda_.InlineCode(src_file.read_text()), 112 | handler="index.producer", 113 | timeout=Duration.seconds(5), 114 | runtime=lambda_.Runtime.PYTHON_3_10, 115 | environment={ 116 | "S3_BUCKET": self.bucket.bucket_name, 117 | "SECRET_ARN": self.secret.secret_arn, 118 | }, 119 | ) 120 | 121 | # Allow this function to write to s3 122 | self.lambda_producer.add_to_role_policy( 123 | iam.PolicyStatement( 124 | effect=iam.Effect.ALLOW, 125 | actions=["s3:PutObject"], 126 | resources=[self.bucket.bucket_arn, f"{self.bucket.bucket_arn}/*"], 127 | ) 128 | ) 129 | 130 | # Invoke at regular intervals to produce new files 131 | rule = events.Rule( 132 | self, "S3ProducerRule", schedule=events.Schedule.rate(Duration.minutes(1)) 133 | ) 134 | rule.add_target(targets.LambdaFunction(self.lambda_producer)) 135 | 136 | def make_lambda_consumer(self): 137 | src_file = pathlib.Path(__file__).parent.joinpath("src/lambda_consumer.py") 138 | 139 | self.lambda_consumer = lambda_.Function( 140 | self, 141 | "ExampleConsumerFunction", 142 | description="Example of Dask client from Lambda, coordinating work on remote cluster", 143 | code=lambda_.InlineCode(src_file.read_text()), 144 | handler="index.consumer", 145 | # time waiting for cluster to process task, can be lower with 'fire-and-forget' 146 | timeout=Duration.minutes(5), 147 | runtime=lambda_.Runtime.PYTHON_3_10, 148 | layers=[self.dask_processing_layer, self.dask_dependencies_layer], 149 | environment={ 150 | "SECRET_ARN": self.secret.secret_arn, 151 | "DASK_COILED__TOKEN": self.coiled_token.value_as_string, 152 | "DASK_COILED__ACCOUNT": self.coiled_account.value_as_string, 153 | "DASK_COILED__USER": self.coiled_user.value_as_string, 154 | }, 155 | ) 156 | 157 | # Get cluster connection info permission 158 | self.lambda_consumer.add_to_role_policy( 159 | iam.PolicyStatement( 160 | effect=iam.Effect.ALLOW, 161 | actions=["secretsmanager:GetSecretValue"], 162 | resources=[self.secret.secret_arn], 163 | ) 164 | ) 165 | self.lambda_consumer.add_to_role_policy( 166 | iam.PolicyStatement( 167 | effect=iam.Effect.ALLOW, 168 | actions=["s3:Get*", "s3:List*"], 169 | resources=[self.bucket.arn_for_objects("*"), self.bucket.bucket_arn], 170 | ) 171 | ) 172 | 173 | # Trigger consumer 174 | notification = s3_notifications.LambdaDestination(self.lambda_consumer) 175 | self.bucket.add_event_notification(s3.EventType.OBJECT_CREATED, notification) 176 | 177 | # Add bucket policy that this function, and thus cluster's inherited role 178 | # will have access to read the bucket 179 | self.bucket.add_to_resource_policy( 180 | iam.PolicyStatement( 181 | sid="AllowReadingFromForLambdaConsumerRole", 182 | effect=iam.Effect.ALLOW, 183 | principals=[iam.ArnPrincipal(self.lambda_consumer.role.role_arn)], 184 | actions=["s3:List*", "s3:Get*"], 185 | resources=[ 186 | self.bucket.bucket_arn, 187 | f"{self.bucket.bucket_arn}/*", 188 | ], 189 | ) 190 | ) 191 | 192 | def make_lambda_start_stop_cluster(self): 193 | src_file = pathlib.Path(__file__).parent.joinpath("src/lambda_consumer.py") 194 | 195 | self.lambda_start_stop_cluster = lambda_.Function( 196 | self, 197 | "StartStopClusterFunction", 198 | description="Example of starting and stopping a Dask cluster using coiled", 199 | code=lambda_.InlineCode(src_file.read_text()), 200 | handler="index.start_stop_cluster", 201 | timeout=Duration.minutes(10), # Time for creating software envs / cluster 202 | runtime=lambda_.Runtime.PYTHON_3_10, 203 | memory_size=1024, 204 | 
layers=[self.dask_processing_layer, self.dask_dependencies_layer], 205 | environment={ 206 | "SECRET_ARN": self.secret.secret_arn, 207 | "INSTALLED_PKGS": pathlib.Path("requirements.txt").read_text(), 208 | "DASK_COILED__TOKEN": self.coiled_token.value_as_string, 209 | "DASK_COILED__ACCOUNT": self.coiled_account.value_as_string, 210 | "DASK_COILED__USER": self.coiled_user.value_as_string, 211 | }, 212 | ) 213 | 214 | # Update cluster connection info permission 215 | self.lambda_start_stop_cluster.add_to_role_policy( 216 | iam.PolicyStatement( 217 | effect=iam.Effect.ALLOW, 218 | actions=[ 219 | "secretsmanager:PutSecretValue", 220 | "secretsmanager:GetSecretValue", 221 | ], 222 | resources=[self.secret.secret_arn], 223 | ) 224 | ) 225 | # Trigger start/stop cluster 226 | for action, schedule in ( 227 | ("start", "cron(0 7 ? * MON-FRI *)"), # Weekdays 7am UTC 228 | ("stop", "cron(0 17 ? * MON-FRI *)"), # Weekdays 5pm UTC 229 | ): 230 | events.Rule( 231 | self, 232 | f"{action.capitalize()}Cluster", 233 | description=f"Schedule {action} of Dask cluster", 234 | schedule=events.Schedule.expression(schedule), 235 | targets=[ 236 | targets.LambdaFunction( 237 | self.lambda_start_stop_cluster, 238 | event=events.RuleTargetInput.from_object({"action": action}), 239 | ) 240 | ], 241 | ) 242 | 243 | def make_dask_processing_layer(self): 244 | """ 245 | Layer which bridges the Lambda client with code running on the Dask workers 246 | """ 247 | self.dask_processing_layer = lambda_.LayerVersion( 248 | self, 249 | "dask_processing_layer", 250 | code=lambda_.Code.from_asset("layer"), 251 | description="Code for client and dask workers", 252 | compatible_runtimes=[lambda_.Runtime.PYTHON_3_10], 253 | removal_policy=RemovalPolicy.DESTROY, 254 | ) 255 | 256 | def make_dask_dependencies_layer(self): 257 | """ 258 | Layer providing the Dask client dependencies (coiled, dask, distributed) to the Lambda functions 259 | """ 260 | command = "pip install --no-cache -r requirements.txt -t /asset-output/python" 261 | asset_key = pathlib.Path("./requirements.txt").read_bytes() + command.encode() 262 | self.dask_dependencies_layer = lambda_.LayerVersion( 263 | self, 264 | "dask_dependencies_layer", 265 | code=lambda_.Code.from_asset( 266 | "./", 267 | asset_hash=hashlib.md5(asset_key).hexdigest(), 268 | bundling=BundlingOptions( 269 | image=lambda_.Runtime.PYTHON_3_10.bundling_image, 270 | command=["bash", "-c", command], 271 | ), 272 | ), 273 | description="Dask dependencies (coiled, dask, distributed, etc)", 274 | compatible_runtimes=[lambda_.Runtime.PYTHON_3_10], 275 | removal_policy=RemovalPolicy.DESTROY, 276 | ) 277 | 278 | 279 | app = App() 280 | DaskLambdaExampleStack(app, "DaskLambdaExampleStack") 281 | app.synth() 282 | --------------------------------------------------------------------------------
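Addendum: trying the consumer flow locally

The consumer Lambda is only a thin Dask client, so the same flow can be reproduced from a laptop when debugging the stack. The sketch below is illustrative rather than part of the deployed stack: the cluster name, bucket, and key are hypothetical placeholders (in the running stack the cluster name is read from SecretsManager and the bucket/key come from the S3 event), and it assumes local Coiled and AWS credentials are configured.

```python
import sys

import coiled
from distributed import wait

# Make the shared layer module importable locally, mirroring how the Lambda
# layer exposes it on the PYTHONPATH.
sys.path.insert(0, "layer/python")
from dask_processing import process_s3_file

# Placeholders: read the real values from SecretsManager and the S3 console.
CLUSTER_NAME = "processing-cluster-2023-1-1"
BUCKET = "example2-data-bucket-xxxxxxxx"
KEY = "data/year=2023/month=1/day=1/hour=0/minute=0/second=0/data.json"

# Connect to the already-running cluster by name, as the consumer Lambda does.
cluster = coiled.Cluster(name=CLUSTER_NAME, shutdown_on_close=False)
client = cluster.get_client()

# Ship the shared processing module to the workers, then submit the file.
client.upload_file("layer/python/dask_processing.py")
future = client.submit(process_s3_file, BUCKET, KEY)
wait(future)
print(future.result())
```

If the submission succeeds, the computation should also be visible on the cluster dashboard (the `DASHBOARD_ADDR` stored in the secret).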